[{"data":1,"prerenderedAt":777},["ShallowReactive",2],{"content-query-pVkfYYTx7z":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":771,"_id":772,"_source":773,"_file":774,"_stem":775,"_extension":776},"/news/zh/3017","zh",false,"","【昇思MindSpore技术公开课】第十讲 MoE 课程回顾","昇思MindSpore公开课大模型专题第二季课程火爆来袭！","2024-03-01","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/bd5d3b3a20d543e5bbc1462860ce1405.png","news",{"type":14,"children":15,"toc":768},"root",[16,24,36,44,49,64,79,86,91,105,112,117,129,143,148,155,170,175,182,187,192,210,220,233,238,246,251,256,270,284,291,296,301,306,311,316,323,337,344,349,354,361,371,378,385,390,408,413,420,428,448,462,467,472,477,482,487,492,497,502,507,512,517,522,527,532,546,550,554,559,563,567,571,576,580,585,589,593,597,601,606,619,624,642,656,661,668,686,691,698,706,719,724,738,743,754,759],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"昇思mindspore技术公开课第十讲-moe-课程回顾",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28,30],{"type":23,"value":29},"昇思MindSpore公开课大模型专题第二季课程火爆来袭！未报名的小伙伴抓紧时间扫描下方二维码参与课程，并同步加入课程群，有免费丰富的课程资源在等着你。",{"type":17,"tag":31,"props":32,"children":33},"strong",{},[34],{"type":23,"value":35},"课程同步赋能华为ICT大赛2023-2024，助力各位选手取得理想成绩！",{"type":17,"tag":25,"props":37,"children":38},{},[39],{"type":17,"tag":40,"props":41,"children":43},"img",{"alt":7,"src":42},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/bfd5189028204dd4b4ab83728c9afe45.png",[],{"type":17,"tag":25,"props":45,"children":46},{},[47],{"type":23,"value":48},"大模型发展即将进入下一阶段但目前仍面临众多难题。为满足与日俱增的实际需求，大模型参数会越来越大，数据集类型越来越多，从而导致训练难度大增，同时也提高了推理成本。为了实现大模型的高效训练和推理，混合专家模型MoE便横空出世。",{"type":17,"tag":25,"props":50,"children":51},{},[52,57,59],{"type":17,"tag":31,"props":53,"children":54},{},[55],{"type":23,"value":56},"1",{"type":23,"value":58},"**、**",{"type":17,"tag":31,"props":60,"children":61},{},[62],{"type":23,"value":63},"MoE结构的发展",{"type":17,"tag":25,"props":65,"children":66},{},[67,72,74],{"type":17,"tag":31,"props":68,"children":69},{},[70],{"type":23,"value":71},"1.1",{"type":23,"value":73}," ",{"type":17,"tag":31,"props":75,"children":76},{},[77],{"type":23,"value":78},"Vanilla MoE",{"type":17,"tag":25,"props":80,"children":81},{},[82],{"type":17,"tag":40,"props":83,"children":85},{"alt":7,"src":84},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/635ddc4abab74464a3ab779041d4b170.png",[],{"type":17,"tag":25,"props":87,"children":88},{},[89],{"type":23,"value":90},"Export Network，用于学习不同数据，一个Gating Network用于分配每个Expert的输出权重。",{"type":17,"tag":25,"props":92,"children":93},{},[94,99,100],{"type":17,"tag":31,"props":95,"children":96},{},[97],{"type":23,"value":98},"1.2",{"type":23,"value":73},{"type":17,"tag":31,"props":101,"children":102},{},[103],{"type":23,"value":104},"Sparse MoE",{"type":17,"tag":25,"props":106,"children":107},{},[108],{"type":17,"tag":40,"props":109,"children":111},{"alt":7,"src":110},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/155d02b2e9654915a18c535fd54bc807.png",[],{"type":17,"tag":25,"props":113,"children":114},{},[115],{"type":23,"value":116},"Experts的输出是稀疏的，只有部分的Experts的权重> 0，其余=0的Expert直接不参与计算。",{"type":17,"tag":25,"props":118,"children":119},{},[120,122,127],{"type":23,"value":121},"**Expert 
**The Expert Balancing problem: as the Experts compete, a "winner-takes-all" effect appears.** Experts that perform well early on are more likely to be chosen by the Gating Network, so in the end only a handful of Experts actually do any work.

**1.3 Transformer MoE**

1) GShard

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/5e2442bd4f9f4ee78e478e9d181de51d.png)

- In the Transformer Encoder and Decoder, every other FFN layer is replaced with a position-wise MoE layer.
- Top-2 Gating Network.

2) Switch Transformer

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/f3649599c4db4e899f4e1796f1dd1c27.png)

Simplifies the MoE routing algorithm: the Gating Network routes each token to just one Expert.

3) GLaM

- GShard architecture
- Scaled-up parameter count
- Reduced training and inference cost

**2. Distributed communication in MoE and the 昇思MindSpore optimizations**

**2.1 Distributed communication in MoE**

What distinguishes MoE from an ordinary dense model is the extra AllToAll communication it needs to route data to Experts (gating) and to collect the results. AllToAll traffic crosses nodes (servers) and pods (routers), which causes heavy communication congestion.
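The dispatch step that AllToAll implements can be pictured without any real communication: before the collective exchange, each device groups its local tokens into one send bucket per destination device. The sketch below simulates that grouping in plain Python; the device count, expert placement, and routing results are assumptions chosen for illustration.

```python
# Illustrative assumptions: 4 devices, 2 Experts per device (8 Experts total).
num_devices, experts_per_device = 4, 2

# Each local token has already been routed (by the gate) to one global Expert id.
token_expert_ids = [5, 0, 3, 3, 7, 1]   # 6 local tokens

# Build one send bucket per destination device; AllToAll then swaps bucket i
# with device i on every device in a single collective operation.
send_buckets = [[] for _ in range(num_devices)]
for token, expert in enumerate(token_expert_ids):
    dest_device = expert // experts_per_device   # which device hosts this Expert
    send_buckets[dest_device].append(token)

for device, bucket in enumerate(send_buckets):
    print(f"tokens sent to device {device}: {bucket}")
```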
**2.2 昇思MindSpore's MoE optimizations**

The main bottlenecks in large-model training are on-chip memory and inter-device communication. Common memory optimizations are MoE parallelism and optimizer heterogeneity; a common communication optimization is multi-copy parallelism.

1) MoE parallelism: shard the Experts across devices. Because of MoE routing, AllToAll communication is needed to deliver each token to the right device. AllToAll itself can be optimized with hierarchical AllToAll, group-wise AllToAll, and so on.

2) Optimizer heterogeneity: the Adam-family optimizers commonly used for large-model training take up two or more times the memory of the model parameters themselves, so the optimizer states can be offloaded to host memory.

3) Multi-copy parallelism: split the serial communication and computation into several groups and pipeline them so that communication time is hidden.

昇思MindSpore has enabled all of the optimizations above, greatly improving training throughput for trillion-parameter sparse models.

**3. The Mixtral 8x7B MoE model**

**3.1 Mistral, the base model of Mixtral**

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/81feac2623f647c495aec50faa86ff60.png)

1) RoPE

2) RMSNorm

3) Transformer decoder

4) Grouped Multi-Query Attention

5) Sliding Window Attention: reduces the memory footprint and compute cost that would otherwise grow with sequence length

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/a928398a1c3841499e72602cc0e83cf9.png)

**3.2 Mixtral**

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/9c40b1f1c29b48129a030d2e8901f2d6.png)

1) 8 Experts (GPT-4-style)

2) Top-2 Gating

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/e60b9b155f4d41aeb1e3dcd800601284.png)

```python
from dataclasses import dataclass
from typing import List

import mindspore
from mindspore import nn, ops

@dataclass
class MoeArgs:
    num_experts: int
    num_experts_per_tok: int

class MoeLayer(nn.Cell):
    def __init__(self, experts: List[nn.Cell], gate: nn.Cell, moe_args: MoeArgs):
        super().__init__()
        assert len(experts) > 0
        self.experts = nn.CellList(experts)
        self.gate = gate
        self.args = moe_args

    def construct(self, inputs: mindspore.Tensor):
        # Routing logits for every token; keep only the top-k Experts.
        gate_logits = self.gate(inputs)
        weights, selected_experts = ops.topk(gate_logits, self.args.num_experts_per_tok)
        weights = ops.softmax(weights, axis=1, dtype=mindspore.float32).to(inputs.dtype)
        results = ops.zeros_like(inputs)
        for i, expert in enumerate(self.experts):
            # Select the tokens that were routed to Expert i.
            non_zero = ops.nonzero(selected_experts == i)
            if 0 not in non_zero.shape:
                batch_idx, nth_expert = non_zero.tensor_split(2, 1)
                # Accumulate this Expert's weighted contribution.
                results[batch_idx] = results[batch_idx] + weights[batch_idx, nth_expert, None] * expert(
                    inputs[batch_idx]
                )
        return results
```
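A minimal sketch of how this layer might be instantiated, reusing `MoeLayer` and `MoeArgs` from the block above and matching the Mixtral configuration (8 Experts, Top-2 Gating). The hidden sizes and the simple SiLU FFN are toy assumptions; Mixtral's real Experts are much larger SwiGLU FFNs.

```python
from mindspore import nn, ops

hidden, ffn_hidden = 16, 64   # toy sizes for illustration
args = MoeArgs(num_experts=8, num_experts_per_tok=2)

experts = [
    nn.SequentialCell(nn.Dense(hidden, ffn_hidden), nn.SiLU(),
                      nn.Dense(ffn_hidden, hidden))
    for _ in range(args.num_experts)
]
gate = nn.Dense(hidden, args.num_experts)   # the Gating Network
moe = MoeLayer(experts, gate, args)

tokens = ops.randn(4, hidden)               # 4 toy tokens
print(moe(tokens).shape)                    # output keeps the input shape: (4, 16)
```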
![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/cca87a86d9d042b481e5a45b58c7fcfb.png)

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/6cc16cfb4baa4b29bc5bd43206805e29.png)

1) Mixtral-8x7B MoE model implemented on MindFormers.

- Key components: GQA, RoPE, RMSNorm, SiLU.
- MoE configuration: 8 Experts, TopK=2, capacity factor c=1.1 (see the worked example after this list).
- Loads the open-source Mixtral weights and tokenizer; inference results match HuggingFace.
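As a worked example of what the capacity factor controls: each Expert reserves a fixed token buffer per batch, commonly computed as capacity = ceil(c · k · T / E) for T tokens, E Experts, and top-k routing. This is the standard GShard-style formula, and the token count below is an assumption for illustration.

```python
import math

tokens_per_batch = 4096   # illustrative assumption
num_experts = 8           # Mixtral's Expert count
top_k = 2                 # each token goes to 2 Experts
c = 1.1                   # capacity factor from the configuration above

# Each Expert reserves slightly more than its fair share of token slots;
# tokens overflowing an Expert's capacity are dropped or re-routed.
capacity = math.ceil(c * top_k * tokens_per_batch / num_experts)
print(capacity)           # 1127 slots per Expert instead of the "fair" 1024
```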
2) Multi-dimensional hybrid parallelism (EP, PP, and more) across 4 machines and 32 cards; experimental training on an in-house dataset converged as expected, with the loss falling from 10 to 0.02 over 200 epochs.

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/b7ff3f5e6a4f4fd48117a05cb3305fce.png)

**Performance is best at EP=8, MP=1: about 1147 tokens/s/p.**

**4. MoE and lifelong learning**

**4.1 Properties of lifelong/continual learning**

| Property | Definition |
| --- | --- |
| Knowledge retention | The model is not prone to catastrophic forgetting |
| Forward transfer | Old knowledge is used to learn new tasks |
| Backward transfer | Learning a new task improves old tasks |
| Online learning | Learning from a continuous data stream |
| No task boundaries | No explicit task or data definitions are required |
| Fixed model capacity | The model size does not change with tasks or data |

**4.2 MoE models + lifelong learning**

| Property | MoE |
| --- | --- |
| Knowledge retention | √ |
| Forward transfer | √ |
| Backward transfer | - |
| Online learning | × |
| No task boundaries | √ |
| Fixed model capacity | √ |

Characteristics of MoE:

- Each of the multiple Experts handles data from a different distribution (domain/topic)
- Inference needs only some of the Experts

Lifelong learning for LLMs:

- Continual learning on top of a world-knowledge foundation.
- Experts can be plugged in and removed.
- The Gating Network can be extended or pruned.

**4.3 Representative work on MoE + lifelong learning**

1) Lifelong-MoE

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/34b79dbf4d334024b58f09c8932387bc.png)

- Expand the Expert and Gating Network dimensions (sketched after this list)
- Freeze the old Experts and the old Gating Network dimensions
- Use regularization to overcome catastrophic forgetting
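A minimal sketch of the expand-and-freeze idea, assuming a simple Dense Expert and ignoring the regularization term; this illustrates the mechanism only, not the Lifelong-MoE implementation.

```python
from mindspore import nn

hidden = 16  # illustrative size

# Existing Experts trained on earlier data.
experts = nn.CellList([nn.Dense(hidden, hidden) for _ in range(4)])

# Freeze the old Experts so that training on new data cannot overwrite them.
for expert in experts:
    for param in expert.get_parameters():
        param.requires_grad = False

# Expand: append a fresh, trainable Expert for the new task/domain; the
# gating layer is widened to match (its old columns would be frozen too).
experts.append(nn.Dense(hidden, hidden))
gate = nn.Dense(hidden, len(experts))
print(len(experts))  # 5 Experts, only the newest one trainable
```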
2) Pangu-sigma

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/08/a08350ef80f946729e5562f1d84fd32b.png)

**Random Routed Experts:**

- Layer 1: tokens are assigned by task to different Expert groups (several Experts form one group, serving a single task/domain).
- Layer 2: random gating within the group keeps the group's Experts load-balanced.

This guarantees that the Experts for a given domain can be extracted directly and used as a standalone model.

**5. Mixtral 8x7B Demo**

Mistral-MindSpore:

https://github.com/lvyufeng/mistral-mindspore

MindFormers (MoE pretraining):

https://gitee.com/mindspore/mindformers/