[{"data":1,"prerenderedAt":293},["ShallowReactive",2],{"content-query-wi35f9uXxU":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":287,"_id":288,"_source":289,"_file":290,"_stem":291,"_extension":292},"/news/zh/2025-12-8-2","zh",false,"","昇思人工智能框架峰会 | 昇思MindSpore优化器并行、Zero还是FSDP？","就AI框架共有的优化器并行相关技术进行探讨","2025-12-8","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/25/199b735845bf4106b44b2035dc97bd39.png","news",{"type":14,"children":15,"toc":284},"root",[16,24,30,35,40,49,57,62,73,78,83,90,95,100,107,112,117,124,129,134,141,146,151,158,163,171,179,184,191,196,201,208,213,220,225,230,237,242,247,254,259,264,279],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"昇思人工智能框架峰会-昇思mindspore优化器并行zero还是fsdp",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":23,"value":29},"据悉，昇思MindSpore开源社区将于 2025 年 12 月 25日在杭州举办昇思人工智能框架峰会。本次峰会将展示昇思亲和超节点技术创新点。面向本次大会的昇思人工智能框架技术发展与行业实践论坛，本文基于昇思AI框架技术发展实践，就AI框架共有的优化器并行相关技术进行探讨。",{"type":17,"tag":25,"props":31,"children":32},{},[33],{"type":23,"value":34},"大模型训练中，随着模型参数向万亿级迈进，显存压力呈指数级增长，因此高效地利用显存，是训练能否有效进行的关键因素。为此，业界发展出多种“以通信换显存”的并行策略——其中最主流的是DeepSpeed Zero(ZeRO)、Megatron FSDP（Fully Sharded Data Parallel）。这些技术看似不同，实则都是将权重在dp(data parallel)维度进一步切分，以通信换显存。",{"type":17,"tag":25,"props":36,"children":37},{},[38],{"type":23,"value":39},"昇思MindSpore的优化器并行方案于2020年提出并将其实现开源，早于DeepSpeed Zero和Megatron FSDP，且PyTorch的原生并行库Titan提出的FSDP方案采用了与昇思MindSpore的优化器并行完全相同的方案。 本文将深入剖析这些技术，并揭示一个常被忽略的关键问题：“权重副本”到底需不需要？",{"type":17,"tag":25,"props":41,"children":42},{},[43],{"type":17,"tag":44,"props":45,"children":46},"strong",{},[47],{"type":23,"value":48},"# 
01",{"type":17,"tag":25,"props":50,"children":51},{},[52],{"type":17,"tag":44,"props":53,"children":54},{},[55],{"type":23,"value":56},"Zero的深入剖析",{"type":17,"tag":25,"props":58,"children":59},{},[60],{"type":23,"value":61},"DeepSpeed提出Zero概念时，就对权重、梯度、优化器状态的分级切分给出了严格的定义，如下表所示：",{"type":17,"tag":63,"props":64,"children":66},"div",{"style":65},"text-align: center;",[67],{"type":17,"tag":68,"props":69,"children":72},"img",{"src":70,"style":71,"alt":7},"/category/information/news/banner/2025-12-8-2-1.jpg","display: block;margin: 0 auto;max-width:60%",[],{"type":17,"tag":25,"props":74,"children":75},{},[76],{"type":23,"value":77},"表1. Zero切分的定义",{"type":17,"tag":25,"props":79,"children":80},{},[81],{"type":23,"value":82},"根据Zero分级切分定义，其形成的切分效果如下图所示：",{"type":17,"tag":63,"props":84,"children":85},{"style":65},[86],{"type":17,"tag":68,"props":87,"children":89},{"src":88,"style":71,"alt":7},"/category/information/news/banner/2025-12-8-2-2.jpg",[],{"type":17,"tag":25,"props":91,"children":92},{},[93],{"type":23,"value":94},"图1. Zero的切分效果",{"type":17,"tag":25,"props":96,"children":97},{},[98],{"type":23,"value":99},"Zero的切分是针对模型的优化器状态、梯度、模型参数这3份静态显存进行切分，但这里存在一个隐藏问题：如果只切分优化器状态而不切权重，那更新时用哪个权重？实际上，还有一个“权重副本”（parameter copy） 需要参与计算。因此模型静态显存应为优化器状态、梯度、模型参数和权重副本4份，进而更新Zero的定义如下表：",{"type":17,"tag":63,"props":101,"children":102},{"style":65},[103],{"type":17,"tag":68,"props":104,"children":106},{"src":105,"style":71,"alt":7},"/category/information/news/banner/2025-12-8-2-3.jpg",[],{"type":17,"tag":25,"props":108,"children":109},{},[110],{"type":23,"value":111},"表2. 
Zero切分的更新定义",{"type":17,"tag":25,"props":113,"children":114},{},[115],{"type":23,"value":116},"Zero-1的实现切分了优化器状态和权重副本，其模型的前向与反向在梯度累加场景下会执行N次(N为梯度累加次数)，但AllGather与ReduceScatter只执行一次，其流程如图2所示：",{"type":17,"tag":63,"props":118,"children":119},{"style":65},[120],{"type":17,"tag":68,"props":121,"children":123},{"src":122,"style":71,"alt":7},"/category/information/news/banner/2025-12-8-2-4.jpg",[],{"type":17,"tag":25,"props":125,"children":126},{},[127],{"type":23,"value":128},"图2. Zero-1的流程",{"type":17,"tag":25,"props":130,"children":131},{},[132],{"type":23,"value":133},"Zero-2对权重副本、优化器状态、梯度都进行了切分，其流程图如图3所示。为了能够切分梯度，在梯度累加前，先对反向出来的梯度执行一次ReduceScatter，那么这个ReduceScatter要执行N次，这样为了节省梯度参数的显存，引入了过量的ReduceScatter开销，因此在实际大模型训练中含梯度累加的场景，一般仅使用Zero-1。",{"type":17,"tag":63,"props":135,"children":136},{"style":65},[137],{"type":17,"tag":68,"props":138,"children":140},{"src":139,"style":71,"alt":7},"/category/information/news/banner/2025-12-8-2-5.jpg",[],{"type":17,"tag":25,"props":142,"children":143},{},[144],{"type":23,"value":145},"图3. Zero-2的流程",{"type":17,"tag":25,"props":147,"children":148},{},[149],{"type":23,"value":150},"Zero-3对所有静态显存都进行了切分，其流程图如图4所示。为了降低显存，在前反向开始计算前都会进行AllGather操作，引入了巨大的通信开销，同样在大模型训练的梯度累加场景一般是不可接受的。",{"type":17,"tag":63,"props":152,"children":153},{"style":65},[154],{"type":17,"tag":68,"props":155,"children":157},{"src":156,"style":71,"alt":7},"/category/information/news/banner/2025-12-8-2-6.jpg",[],{"type":17,"tag":25,"props":159,"children":160},{},[161],{"type":23,"value":162},"图4. 
Zero-3的流程",{"type":17,"tag":25,"props":164,"children":165},{},[166],{"type":17,"tag":44,"props":167,"children":168},{},[169],{"type":23,"value":170},"# 02",{"type":17,"tag":25,"props":172,"children":173},{},[174],{"type":17,"tag":44,"props":175,"children":176},{},[177],{"type":23,"value":178},"昇思MindSpore的优化器并行方案：去权重副本，更简洁",{"type":17,"tag":25,"props":180,"children":181},{},[182],{"type":23,"value":183},"昇思MindSpore的优化器并行方案自2020年起就采用了一种无需权重副本的方案，其核心思路是“要切分优化器状态，就必须要切权重本身”。模型的静态显存可分为三部分——优化器状态：直接切分到dp域；权重：随优化器状态切分；梯度：仅在梯度累加时才分配静态显存。昇思MindSpore优化器并行Zero的方案通过表3给出。",{"type":17,"tag":63,"props":185,"children":186},{"style":65},[187],{"type":17,"tag":68,"props":188,"children":190},{"src":189,"style":71,"alt":7},"/category/information/news/banner/2025-12-8-2-7.jpg",[],{"type":17,"tag":25,"props":192,"children":193},{},[194],{"type":23,"value":195},"表3. 昇思MindSpore优化器并行Zero切分的定义",{"type":17,"tag":25,"props":197,"children":198},{},[199],{"type":23,"value":200},"昇思MindSpore优化器并行Zero-1的流程只切分优化器状态与权重，分为非梯度累加与梯度累加2个场景，其中非梯度累加流程如图5所示，梯度累加流程如图6所示。在非梯度累加场景，去掉了梯度的静态显存，因此显存利用是明显优于DeepSpeed原版的。",{"type":17,"tag":63,"props":202,"children":203},{"style":65},[204],{"type":17,"tag":68,"props":205,"children":207},{"src":206,"style":71,"alt":7},"/category/information/news/banner/2025-12-8-2-8.jpg",[],{"type":17,"tag":25,"props":209,"children":210},{},[211],{"type":23,"value":212},"图5. 昇思MindSpore优化器并行Zero-1非梯度累加场景的流程",{"type":17,"tag":63,"props":214,"children":215},{"style":65},[216],{"type":17,"tag":68,"props":217,"children":219},{"src":218,"style":71,"alt":7},"/category/information/news/banner/2025-12-8-2-9.jpg",[],{"type":17,"tag":25,"props":221,"children":222},{},[223],{"type":23,"value":224},"图6. 
昇思MindSpore优化器并行Zero-1梯度累加场景的流程",{"type":17,"tag":25,"props":226,"children":227},{},[228],{"type":23,"value":229},"昇思MindSpore优化器并行Zero-2的流程与DeepSpeed/Megatron的实现没有本质区别，如图7所示。",{"type":17,"tag":63,"props":231,"children":232},{"style":65},[233],{"type":17,"tag":68,"props":234,"children":236},{"src":235,"style":71,"alt":7},"/category/information/news/banner/2025-12-8-2-10.jpg",[],{"type":17,"tag":25,"props":238,"children":239},{},[240],{"type":23,"value":241},"图7. 昇思MindSpore优化器并行Zero-2的流程",{"type":17,"tag":25,"props":243,"children":244},{},[245],{"type":23,"value":246},"昇思MindSpore优化器并行Zero-3的流程与DeepSpeed/Megatron的实现也没有本质区别，反向所需的AllGather通过对前向AllGather的重计算来实现，如图8所示。",{"type":17,"tag":63,"props":248,"children":249},{"style":65},[250],{"type":17,"tag":68,"props":251,"children":253},{"src":252,"style":71,"alt":7},"/category/information/news/banner/2025-12-8-2-11.jpg",[],{"type":17,"tag":25,"props":255,"children":256},{},[257],{"type":23,"value":258},"图8. 昇思MindSpore优化器并行Zero-3的流程",{"type":17,"tag":25,"props":260,"children":261},{},[262],{"type":23,"value":263},"名字只是表象，思想才是核心。昇思MindSpore的优化器并行方案相较于DeepSpeed和Megatron方案有以下2大优势：",{"type":17,"tag":265,"props":266,"children":267},"ol",{},[268,274],{"type":17,"tag":269,"props":270,"children":271},"li",{},[272],{"type":23,"value":273},"避免冗余静态显存：无需维护权重副本，显存更优。",{"type":17,"tag":269,"props":275,"children":276},{},[277],{"type":23,"value":278},"流程更简洁：前向开始前执行AllGather获取完整权重，反向结束后释放。",{"type":17,"tag":25,"props":280,"children":281},{},[282],{"type":23,"value":283},"PyTorch 最新原生并行库Titan的FSDP方案也采用了类似设计，说明该思路已成趋势。",{"title":7,"searchDepth":285,"depth":285,"links":286},4,[],"markdown","content:news:zh:2025-12-8-2.md","content","news/zh/2025-12-8-2.md","news/zh/2025-12-8-2","md",1776506060529]