[{"data":1,"prerenderedAt":275},["ShallowReactive",2],{"content-query-MyeiezpzYi":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":269,"_id":270,"_source":271,"_file":272,"_stem":273,"_extension":274},"/news/zh/3147","zh",false,"","多维度混合并行自动搜索优化策略 | 课程回顾","昇思MindSpore技术公开课大模型专题精彩继续！","2024-06-05","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/07/8791877f592d45d9887d20f0671a69c2.png","news",{"type":14,"children":15,"toc":266},"root",[16,24,30,39,47,55,60,67,72,79,87,95,100,107,112,119,124,131,139,147,152,159,164,171,179,187,194,201,209,217,222,227,232,247,254,259],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"多维度混合并行自动搜索优化策略-课程回顾",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":23,"value":29},"昇思MindSpore技术公开课大模型专题精彩继续！上期课程我们带来了从深度学习并行模式到优化方案求解的全面讲解。下面我们对第十三讲的课程多维度混合并行自动搜索优化策略知识点进行总结：",{"type":17,"tag":25,"props":31,"children":32},{},[33],{"type":17,"tag":34,"props":35,"children":36},"strong",{},[37],{"type":23,"value":38},"01",{"type":17,"tag":25,"props":40,"children":41},{},[42],{"type":17,"tag":34,"props":43,"children":44},{},[45],{"type":23,"value":46},"并行模式回顾",{"type":17,"tag":25,"props":48,"children":49},{},[50],{"type":17,"tag":51,"props":52,"children":54},"img",{"alt":7,"src":53},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/07/79c30027e55d496784481655a47d7638.png",[],{"type":17,"tag":25,"props":56,"children":57},{},[58],{"type":23,"value":59},"并行模式主要包括数据并行和模型并行。",{"type":17,"tag":25,"props":61,"children":62},{},[63],{"type":17,"tag":51,"props":64,"children":66},{"alt":7,"src":65},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/07/eb61d4986a054fa0a35207d6fe065763.png",[],{"type":17,"tag":25,"props":68,"children":69},{},[70],{"type":23,"value":71},"一些经典的并行架构包括了：Megatron-LM、GPipe、TeraPipe等。",{"type":17,"tag":25,"props":73,"children":74},{},[75],{"type":17,"tag":51,"props":76,"children":78},{"alt":7,"src":77},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/07/fe596d08bbbe45e789935055c4df3fad.png",[],{"type":17,"tag":25,"props":80,"children":81},{},[82],{"type":17,"tag":34,"props":83,"children":84},{},[85],{"type":23,"value":86},"02",{"type":17,"tag":25,"props":88,"children":89},{},[90],{"type":17,"tag":34,"props":91,"children":92},{},[93],{"type":23,"value":94},"时间损失模型建模回顾",{"type":17,"tag":25,"props":96,"children":97},{},[98],{"type":23,"value":99},"现有一些损失模型存在三个方案的不足。",{"type":17,"tag":25,"props":101,"children":102},{},[103],{"type":17,"tag":51,"props":104,"children":106},{"alt":7,"src":105},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/07/3dae7c39505b4622a22b5c5e6242acb7.png",[],{"type":17,"tag":25,"props":108,"children":109},{},[110],{"type":23,"value":111},"根据理论推导，建立了考虑计算和通信的GPipe并行架构的时间损失模型：",{"type":17,"tag":25,"props":113,"children":114},{},[115],{"type":17,"tag":51,"props":116,"children":118},{"alt":7,"src":117},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/07/e613ef357fe449a29e788cf99ed56e85.png",[],{"type":17,"tag":25,"props":120,"children":121},{},[122],{"type":23,"value":123},"考虑流水线模型划分，进一步推导了时间损失模型的包含可优化项的公式：",{"type":17,"tag":25,"props":125,"children":126},{},[127],{"type":17,"tag":51,"props":128,"children":130},{"alt":7,"src":129},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/07/2c94fae8c2f24d77be8727d9b08c540b.png",[],{"type":17,"tag":25,"props":132,"children":133},{},[134],{"type":17,"tag":34,"props":135,"children":136},{},[137],{"type":23,"value":138},"03",{"type":17,"tag":25,"props":140,"children":141},{},[142],{"type":17,"tag":34,"props":143,"children":144},{},[145],{"type":23,"value":146},"改进多维度二分法回顾",{"type":17,"tag":25,"props":148,"children":149},{},[150],{"type":23,"value":151},"针对GPipe的流水线并行方案划分，介绍了改进多维度二分法：",{"type":17,"tag":25,"props":153,"children":154},{},[155],{"type":17,"tag":51,"props":156,"children":158},{"alt":7,"src":157},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/07/0f0b7f7c1854436cbacd0921ed0e559f.png",[],{"type":17,"tag":25,"props":160,"children":161},{},[162],{"type":23,"value":163},"并对其进行了示例说明：",{"type":17,"tag":25,"props":165,"children":166},{},[167],{"type":17,"tag":51,"props":168,"children":170},{"alt":7,"src":169},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/07/80cb0ee4c5d04e129da6eff98f07e24e.png",[],{"type":17,"tag":25,"props":172,"children":173},{},[174],{"type":17,"tag":34,"props":175,"children":176},{},[177],{"type":23,"value":178},"04",{"type":17,"tag":25,"props":180,"children":181},{},[182],{"type":17,"tag":34,"props":183,"children":184},{},[185],{"type":23,"value":186},"实验现象回顾",{"type":17,"tag":25,"props":188,"children":189},{},[190],{"type":17,"tag":51,"props":191,"children":193},{"alt":7,"src":192},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/07/3a2e72396c10451c93d452b2052c04b7.png",[],{"type":17,"tag":25,"props":195,"children":196},{},[197],{"type":17,"tag":51,"props":198,"children":200},{"alt":7,"src":199},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/07/b495ac72e5d7470d86d3ace42e49e693.png",[],{"type":17,"tag":25,"props":202,"children":203},{},[204],{"type":17,"tag":34,"props":205,"children":206},{},[207],{"type":23,"value":208},"05",{"type":17,"tag":25,"props":210,"children":211},{},[212],{"type":17,"tag":34,"props":213,"children":214},{},[215],{"type":23,"value":216},"Automatic 3D Parallelism Strategies Search (APaSS/APSS)算法",{"type":17,"tag":25,"props":218,"children":219},{},[220],{"type":23,"value":221},"1）因为随着目前深度学习领域相关研究的推进，模型的参数越来越多超出了单个GPU的承受能力，特别是当下大语言模型的火热加剧了这一现状，需要从不同维度配合进行分布式训练。",{"type":17,"tag":25,"props":223,"children":224},{},[225],{"type":23,"value":226},"2）现有的分布式训练策略有哪些：数据并行、张量并行、流水线并行以及序列并行",{"type":17,"tag":25,"props":228,"children":229},{},[230],{"type":23,"value":231},"3）APaSS/APSS算法:",{"type":17,"tag":233,"props":234,"children":235},"ul",{},[236,242],{"type":17,"tag":237,"props":238,"children":239},"li",{},[240],{"type":23,"value":241},"基于对待训练模型的计算开销以及结合集群的候选通信开销的采集，利用启发式策略结合神经网络算法对3D并行策略进行求解。",{"type":17,"tag":237,"props":243,"children":244},{},[245],{"type":23,"value":246},"利用对比强化学习策略离线对神经网络求解器进行训练，随后在迁移应用场景使用的时候无需微调和重新训练。",{"type":17,"tag":25,"props":248,"children":249},{},[250],{"type":17,"tag":51,"props":251,"children":253},{"alt":7,"src":252},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/07/cac2c39f91ac42dbafb2ca0545d041ea.png",[],{"type":17,"tag":25,"props":255,"children":256},{},[257],{"type":23,"value":258},"4）APaSS/APSS算法的使用:",{"type":17,"tag":25,"props":260,"children":261},{},[262],{"type":17,"tag":51,"props":263,"children":265},{"alt":7,"src":264},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/07/81fbee46117d41c79189594606e973f5.png",[],{"title":7,"searchDepth":267,"depth":267,"links":268},4,[],"markdown","content:news:zh:3147.md","content","news/zh/3147.md","news/zh/3147","md",1776506080808]