[{"data":1,"prerenderedAt":445},["ShallowReactive",2],{"content-query-bXeYk7BSic":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":439,"_id":440,"_source":441,"_file":442,"_stem":443,"_extension":444},"/version-updates/zh/3077","zh",false,"","大模型首选AI框架——昇思MindSpore2.3.RC1版本上线开源社区","经过社区开发者们几个月的开发与贡献，现正式发布昇思MindSpore2.3.RC1版本","2024-04-24","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/04/25/079b2e6487bb4877a02468b96dc957b8.png","version-updates",{"type":14,"children":15,"toc":419},"root",[16,24,30,35,54,58,63,68,76,81,88,93,106,110,124,128,133,143,148,164,169,174,179,184,191,206,211,215,225,229,246,250,255,260,265,272,276,286,290,307,311,316,323,327,336,340,354,358,363,368,375,380,387,392,397,404,409],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"大模型首选ai框架昇思mindspore23rc1版本上线开源社区",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":23,"value":29},"经过社区开发者们几个月的开发与贡献，现正式发布昇思MindSpore2.3.RC1版本，通过多维混合并行以及确定性CKPT来实现超大集群的高性能训练，支持大模型训推一体架构，大模型开发训练推理更简、更稳、更高效，并在训推一体框架的基础上通过多样的大模型推理优化技术，进一步降低大模型推理成本；通过使能kernel by kernel调度执行，进一步提升静态图调试调优能力；持续升级昇思MindSpore TransFormers大模型套件和昇思MindSpore One生成式套件，全流程开箱即用，一周即可完成大模型全流程的开发、验证；创新AI+科学计算（科学智能）范式，孵化科学领域基础大模型；下面就带大家详细了解下昇思MindSpore2.3.RC1版本的关键特性。",{"type":17,"tag":25,"props":31,"children":32},{},[33],{"type":23,"value":34},"**",{"type":17,"tag":36,"props":37,"children":39},"h2",{"id":38},"_1大模型训练细粒度多副本并行有效提升计算通信并发度显著提升大模型训练性能",[40,42],{"type":23,"value":41},"**1.**",{"type":17,"tag":43,"props":44,"children":45},"strong",{},[46],{"type":17,"tag":43,"props":47,"children":48},{},[49],{"type":17,"tag":43,"props":50,"children":51},{},[52],{"type":23,"value":53},"大模型训练：细粒度多副本并行，有效提升计算通信并发度，显著提升大模型训练性能",{"type":17,"tag":25,"props":55,"children":56},{},[57],{"type":23,"value":34},{"type":17,"tag":25,"props":59,"children":60},{},[61],{"type":23,"value":62},"大模型训练下，为了降低显存开销，广泛的使用算子级并行技术，其中引入了大量的模型并行的通信，极大地影响了大模型的训练效率。模型并行的通信，从网络的结构上来看，其处于正反向计算过程中，阻塞正反向计算，无法与正反向的计算进行互相掩盖。为了解决模型并行通信的掩盖问题，昇思MindSpore提出了多副本并行技术。",{"type":17,"tag":25,"props":64,"children":65},{},[66],{"type":23,"value":67},"在旧版本的昇思MindSpore上，通过将网络从数据开始进行拆分，如下图所示，在单张卡内，通过Slice算子将Batch维度进行拆分，进而产生多个分支，这多个分支的计算与通信互相之间没有依赖，存在并发的空间，通过执行序调度算法，控制多个分支的计算与通信进行并发。",{"type":17,"tag":25,"props":69,"children":70},{},[71],{"type":17,"tag":72,"props":73,"children":75},"img",{"alt":7,"src":74},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/04/25/99051fda52484eacb8ec102c6a0b39e9.png",[],{"type":17,"tag":25,"props":77,"children":78},{},[79],{"type":23,"value":80},"随着网络规模的增大，受限于显存限制，当一张卡内的BatchSize仅支持为1时，上述对整网进行Batch拆分的方案不再可行。因此，考虑到模型并行通信的位置，昇思MindSpore2.3.RC1版本将TransFormer模型中的AttentionProjection层以及FFN层进行拆分，产生多个分支，通过执行序调度算法控制细粒度的多分支的并行，其中拆分从AttentionProjection开始，到下一个Layer的QKV计算前结束。",{"type":17,"tag":25,"props":82,"children":83},{},[84],{"type":17,"tag":72,"props":85,"children":87},{"alt":7,"src":86},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/04/25/254ba23215d74e6b8f08cb51c64bc39c.png",[],{"type":17,"tag":25,"props":89,"children":90},{},[91],{"type":23,"value":92},"上图描述了序列并行场景下的细粒度多副本拆分与掩盖基本思路，拆分为两个副本，在正向可以达成50%+的通信掩盖；而在反向，结合计算梯度的分支的计算与TP通信的掩盖，可达成90%的通信的掩盖。当前细粒度多副本并行仅在昇思MindSpore TransFormers的LLAMA网络进行了实现，需要对模型结构进行手动改造为多个副本。后续版本昇思MindSpore将集成自动拆分副本的逻辑，达成更易用的细粒度多副本并行。",{"type":17,"tag":25,"props":94,"children":95},{},[96,98],{"type":23,"value":97},"参考链接：",{"type":17,"tag":99,"props":100,"children":104},"a",{"href":101,"rel":102},"https://www.mindspore.cn/docs/zh-CN/master/model_train/parallel/multiple_copy.html",[103],"nofollow",[105],{"type":23,"value":101},{"type":17,"tag":25,"props":107,"children":108},{},[109],{"type":23,"value":34},{"type":17,"tag":36,"props":111,"children":113},{"id":112},"_2大模型推理全栈升级",[114,116],{"type":23,"value":115},"**2.**",{"type":17,"tag":43,"props":117,"children":118},{},[119],{"type":17,"tag":43,"props":120,"children":121},{},[122],{"type":23,"value":123},"大模型推理全栈升级",{"type":17,"tag":25,"props":125,"children":126},{},[127],{"type":23,"value":34},{"type":17,"tag":25,"props":129,"children":130},{},[131],{"type":23,"value":132},"大模型大规模商用之后，推理消耗的算力规模将十分庞大，相应地带来高昂的成本，商业闭环依赖推理规模突破。在降低大模型推理的成本的同时，要兼顾模型精度和计算时延，不能影响用户的体验。昇思MindSpore 2.3.RC1版本，从最上层推理服务到模型脚本优化到推理引擎LLM Serving，为用户提供端到端的高效推理解决方案。",{"type":17,"tag":134,"props":135,"children":137},"h4",{"id":136},"_21-训推一体大模型训推统一脚本大幅简化部署流程提高效率",[138],{"type":17,"tag":43,"props":139,"children":140},{},[141],{"type":23,"value":142},"2.1 训推一体：大模型训/推统一脚本，大幅简化部署流程，提高效率",{"type":17,"tag":25,"props":144,"children":145},{},[146],{"type":23,"value":147},"模型脚本默认使能了增量推理、FlashAttention/PagedAttention等推理加速技术，避免了模型导出、切分、推理脚本开发等一系列工作，训练到推理加速平滑迁移，部署周期下降到天级。",{"type":17,"tag":134,"props":149,"children":151},{"id":150},"_22-极致性能持续提升融合大算子并行推理模型小型化的关键能力",[152,157,159],{"type":17,"tag":43,"props":153,"children":154},{},[155],{"type":23,"value":156},"2.2",{"type":23,"value":158}," ",{"type":17,"tag":43,"props":160,"children":161},{},[162],{"type":23,"value":163},"极致性能：持续提升融合大算子、并行推理、模型小型化的关键能力",{"type":17,"tag":25,"props":165,"children":166},{},[167],{"type":23,"value":168},"**融合大算子：**新增10+业界最新的推理融合大算子接口，模型开发人员可以快速使能推理融合算子实现加速。",{"type":17,"tag":25,"props":170,"children":171},{},[172],{"type":23,"value":173},"**并行推理：**训练推理并行策略接口一致，提供训练并行到推理并行ckpt重切分接口，支持动态shape模型切分。",{"type":17,"tag":25,"props":175,"children":176},{},[177],{"type":23,"value":178},"**模型压缩：**昇思MindSpore金箍棒升级到2.0版本，提供了针对大模型的业界SOTA以及华为诺亚自研的量化、减枝等算法，实现千亿大模型10倍+压缩。",{"type":17,"tag":25,"props":180,"children":181},{},[182],{"type":23,"value":183},"以上技术均可泛化的应用于TransFormer结构的大模型中，经过验证，在盘古、Llama 2的8卡模型推理中，首token时延做到百ms级，平均token时延小于50ms，保持业界领先水平。",{"type":17,"tag":25,"props":185,"children":186},{},[187],{"type":17,"tag":72,"props":188,"children":190},{"alt":7,"src":189},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/04/25/966c3e79dfe64777926a955d3a353d57.png",[],{"type":17,"tag":134,"props":192,"children":194},{"id":193},"_23-服务化高吞吐",[195,200,201],{"type":17,"tag":43,"props":196,"children":197},{},[198],{"type":23,"value":199},"2.3",{"type":23,"value":158},{"type":17,"tag":43,"props":202,"children":203},{},[204],{"type":23,"value":205},"服务化高吞吐",{"type":17,"tag":25,"props":207,"children":208},{},[209],{"type":23,"value":210},"通过连续批调度、Prefill/Decoding混合部署等手段，尽可能的消除掉冗余计算，确保算力不闲置，实现大模型推理吞吐提升2倍+。",{"type":17,"tag":25,"props":212,"children":213},{},[214],{"type":23,"value":97},{"type":17,"tag":25,"props":216,"children":217},{},[218],{"type":17,"tag":99,"props":219,"children":222},{"href":220,"rel":221},"https://www.mindspore.cn/lite/docs/zh-CN/r2.3.0rc1/use/cloud%5C_infer/runtime%5C_distributed%5C_python.html",[103],[223],{"type":23,"value":224},"https://www.mindspore.cn/lite/docs/zh-CN/r2.3.0rc1/use/cloud\\_infer/runtime\\_distributed\\_python.html",{"type":17,"tag":25,"props":226,"children":227},{},[228],{"type":23,"value":34},{"type":17,"tag":36,"props":230,"children":232},{"id":231},"_3静态图优化支持on多级编译使能kernel-by-kernel调度执行提升静态图调试调优能力",[233,235],{"type":23,"value":234},"**3.**",{"type":17,"tag":43,"props":236,"children":237},{},[238],{"type":17,"tag":43,"props":239,"children":240},{},[241],{"type":17,"tag":43,"props":242,"children":243},{},[244],{"type":23,"value":245},"静态图优化：支持O(n)多级编译，使能kernel by kernel调度执行，提升静态图调试调优能力",{"type":17,"tag":25,"props":247,"children":248},{},[249],{"type":23,"value":34},{"type":17,"tag":25,"props":251,"children":252},{},[253],{"type":23,"value":254},"整图下沉执行性能最优，但大模型的规模和参数量发展得更为庞大，整图下沉执行方式在整图编译过程中耗时较长，一个千亿级别的大模型的编译时间为30分钟-60分钟，调试调优效率低下。为解决上述问题，昇思MindSpore2.3.RC1版本中，提供了多级编译技术，O0原生构图不优化、O1增加自动算子融合优化、O2整图下沉执行优化。",{"type":17,"tag":25,"props":256,"children":257},{},[258],{"type":23,"value":259},"在O0的编译选项下，通过原生图编译和kernel by kernel（KBK）的执行技术，可以将编译时间提升到15分钟以内，同时我们在新版本中还开发了DryRun技术，用户可以直接在离线的情况进行内存瓶颈分析和并行策略调优，结合这两大技术可以使得大模型调试效率倍增。",{"type":17,"tag":25,"props":261,"children":262},{},[263],{"type":23,"value":264},"在O0这种编译条件下，我们使能了SOMAS/LazyInline/控制流Inline来提升内存复用率，使能了多流并行/流水异步调度，可以提升执行性能；在O1这种编译条件下，通过使能算子融合技术，KBK执行模式下可以有更好的执行性能。",{"type":17,"tag":25,"props":266,"children":267},{},[268],{"type":17,"tag":72,"props":269,"children":271},{"alt":7,"src":270},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/04/25/0c266b88972341728a57a10edee58f30.png",[],{"type":17,"tag":25,"props":273,"children":274},{},[275],{"type":23,"value":97},{"type":17,"tag":25,"props":277,"children":278},{},[279],{"type":17,"tag":99,"props":280,"children":283},{"href":281,"rel":282},"https://www.mindspore.cn/docs/zh-CN/r2.3.0rc1/api%5C_python/mindspore/mindspore.JitConfig.html?highlight=jitconfig",[103],[284],{"type":23,"value":285},"https://www.mindspore.cn/docs/zh-CN/r2.3.0rc1/api\\_python/mindspore/mindspore.JitConfig.html?highlight=jitconfig",{"type":17,"tag":25,"props":287,"children":288},{},[289],{"type":23,"value":34},{"type":17,"tag":36,"props":291,"children":293},{"id":292},"_4jit兼具易用性和性能动静统一提供灵活高效开发",[294,296],{"type":23,"value":295},"**4.**",{"type":17,"tag":43,"props":297,"children":298},{},[299],{"type":17,"tag":43,"props":300,"children":301},{},[302],{"type":17,"tag":43,"props":303,"children":304},{},[305],{"type":23,"value":306},"JIT兼具易用性和性能，动静统一，提供灵活高效开发",{"type":17,"tag":25,"props":308,"children":309},{},[310],{"type":23,"value":34},{"type":17,"tag":25,"props":312,"children":313},{},[314],{"type":23,"value":315},"昇思MindSpore支持图模式（静态图）和PyNative模式（动态图）两种运行方法。动态图易于调试，开发灵活，易用性好；静态图语法支持有限，但执行性能好。JIT兼顾性能和易用性，通过对Python字节码进行分析&调整、执行流进行图捕获&图优化，支持入图的Python代码做静态图方式执行，不支持的进行子图切分以动态图方式执行，自动地做到动静统一，实现方法如下图所示。",{"type":17,"tag":25,"props":317,"children":318},{},[319],{"type":17,"tag":72,"props":320,"children":322},{"alt":7,"src":321},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/04/25/58d327062eea49a9a49ee8a5c08bba60.png",[],{"type":17,"tag":25,"props":324,"children":325},{},[326],{"type":23,"value":97},{"type":17,"tag":25,"props":328,"children":329},{},[330],{"type":17,"tag":99,"props":331,"children":334},{"href":332,"rel":333},"https://www.mindspore.cn/docs/zh-CN/master/model_train/program_form/pynative.html#%E5%8A%A8%E9%9D%99%E7%BB%93%E5%90%88",[103],[335],{"type":23,"value":332},{"type":17,"tag":25,"props":337,"children":338},{},[339],{"type":23,"value":34},{"type":17,"tag":36,"props":341,"children":343},{"id":342},"_5mindspore-elec新增大地电磁智能反演模型",[344,346],{"type":23,"value":345},"**5.**",{"type":17,"tag":43,"props":347,"children":348},{},[349],{"type":17,"tag":43,"props":350,"children":351},{},[352],{"type":23,"value":353},"MindSpore Elec：新增大地电磁智能反演模型",{"type":17,"tag":25,"props":355,"children":356},{},[357],{"type":23,"value":34},{"type":17,"tag":25,"props":359,"children":360},{},[361],{"type":23,"value":362},"MindSpore Elec电磁仿真套件升级至0.3版本，联合清华大学李懋坤教授团队、华为先进计算与存储实验室共同打造了基于昇思MindSpore的大地电磁（Magnetotelluric，MT）智能反演模型。该模型通过变分自编码器（VAE）灵活嵌入了多物理先验知识，达到了业界SOTA。该成果已被国际顶级勘探地球物理期刊《Geophysics》收录，同时也在昇思人工智能框架峰会2024上发布亮相。",{"type":17,"tag":25,"props":364,"children":365},{},[366],{"type":23,"value":367},"（1）基础MT反演：反演区域水平长度为10km，深度为1km。下图1中目标电阻率分布（第一列）与传统大地电磁反演（第二列）、大地电磁智能反演（第三列），可以看出大地电磁智能反演相比传统反演精度显著提升（前者残差为0.0056和0.0054；后者为0.023和0.024 ）；下图2中，大地电磁智能反演性能也优于传统反演方法（前者收敛步数为4和4；后者为6和4）。",{"type":17,"tag":25,"props":369,"children":370},{},[371],{"type":17,"tag":72,"props":372,"children":374},{"alt":7,"src":373},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/04/25/5753b0e3933b43ce9421e17d0a9de236.png",[],{"type":17,"tag":25,"props":376,"children":377},{},[378],{"type":23,"value":379},"图1 大地电磁反演精度对比",{"type":17,"tag":25,"props":381,"children":382},{},[383],{"type":17,"tag":72,"props":384,"children":386},{"alt":7,"src":385},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/04/25/5fc7c9d2656647a88cce307698918561.png",[],{"type":17,"tag":25,"props":388,"children":389},{},[390],{"type":23,"value":391},"图2 大地电磁反演收敛速度对比（Pixel-based：传统反演；Feature-based：我们的工作）",{"type":17,"tag":25,"props":393,"children":394},{},[395],{"type":23,"value":396},"（2）南部非洲MT反演：大地电磁智能反演模型也在南部非洲开源数据集（SAMTEX）上做了验证。该反演区域位于南部非洲西海岸附近，长度约为750km，深度选定为80km。该测区显著特征为在水平方向100km至400km之间，深度20km以浅的区域存在的高导结构。由于低频电磁波在导体结构中的衰减，MT方法对高导结构下部区域的敏感度很低, 因此无先验知识约束的传统MT反演难以准确重建高导地层的下边界位置。大地电磁智能反演对高导地层的下边界重建较为清晰准确，较好地将地层厚度的先验知识融入了反演。",{"type":17,"tag":25,"props":398,"children":399},{},[400],{"type":17,"tag":72,"props":401,"children":403},{"alt":7,"src":402},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/04/25/4602fc909839440bb4220b251aba81a7.png",[],{"type":17,"tag":25,"props":405,"children":406},{},[407],{"type":23,"value":408},"南部非洲MT反演示例图（上图：传统反演；下图：我们的工作）",{"type":17,"tag":25,"props":410,"children":411},{},[412,413],{"type":23,"value":97},{"type":17,"tag":99,"props":414,"children":417},{"href":415,"rel":416},"https://gitee.com/mindspore/mindscience/tree/master/MindElec",[103],[418],{"type":23,"value":415},{"title":7,"searchDepth":420,"depth":420,"links":421},4,[422,425,433,435,437],{"id":38,"depth":423,"text":424},2,"**1.**大模型训练：细粒度多副本并行，有效提升计算通信并发度，显著提升大模型训练性能",{"id":112,"depth":423,"text":426,"children":427},"**2.**大模型推理全栈升级",[428,429,431],{"id":136,"depth":420,"text":142},{"id":150,"depth":420,"text":430},"2.2 极致性能：持续提升融合大算子、并行推理、模型小型化的关键能力",{"id":193,"depth":420,"text":432},"2.3 服务化高吞吐",{"id":231,"depth":423,"text":434},"**3.**静态图优化：支持O(n)多级编译，使能kernel by kernel调度执行，提升静态图调试调优能力",{"id":292,"depth":423,"text":436},"**4.**JIT兼具易用性和性能，动静统一，提供灵活高效开发",{"id":342,"depth":423,"text":438},"**5.**MindSpore Elec：新增大地电磁智能反演模型","markdown","content:version-updates:zh:3077.md","content","version-updates/zh/3077.md","version-updates/zh/3077","md",1776506145296]