[{"data":1,"prerenderedAt":167},["ShallowReactive",2],{"content-query-yVdeQEdpPv":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":161,"_id":162,"_source":163,"_file":164,"_stem":165,"_extension":166},"/technology-blogs/zh/435","zh",false,"","MindSpore大V博文系列：TeraPipe论文分析","TeraPipe（token-level的pipeline并行进行超大模型训练）论文分析。","2021-04-09","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/04/09/87a040d999fb48cfad883b3f81e0eae8.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":158},"root",[17,25,31,44,55,68,79,89,97,102,109,114,124,131,136,146,151],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore大v博文系列terapipe论文分析",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"作者：金雪锋",{"type":18,"tag":26,"props":32,"children":33},{},[34,36],{"type":24,"value":35},"作者主页：",{"type":18,"tag":37,"props":38,"children":42},"a",{"href":39,"rel":40},"https://www.zhihu.com/people/jin-xue-feng",[41],"nofollow",[43],{"type":24,"value":39},{"type":18,"tag":26,"props":45,"children":46},{},[47,49],{"type":24,"value":48},"文章来源：",{"type":18,"tag":37,"props":50,"children":53},{"href":51,"rel":52},"https://zhuanlan.zhihu.com/p/363309190",[41],[54],{"type":24,"value":51},{"type":18,"tag":26,"props":56,"children":57},{},[58,60,66],{"type":24,"value":59},"最近Google出了一篇关于超大模型pipeline并行训练的论文《",{"type":18,"tag":61,"props":62,"children":63},"em",{},[64],{"type":24,"value":65},"TeraPipe: Token-Level Pipeline Parallelism for Training Large-Scale Language Models",{"type":24,"value":67},"》，小伙伴们分析了一下，分享出来，供大家参考。",{"type":18,"tag":26,"props":69,"children":70},{},[71,77],{"type":18,"tag":72,"props":73,"children":74},"strong",{},[75],{"type":24,"value":76},"背景",{"type":24,"value":78},"：大语言模型进行训练时，通常需要使用大sequence length来保持住语言序列中的长依赖情况，然而大sequence 
length的内存开销比较大，就需要用更小的minibatch来训练，才能保证切分后的模型内存开销足够小，能够放得进设备中去。Gpipe并行，minibatch越小，pipeline stage中间的空闲bubble就越大，导致并行加速比降低。论文提出了一种在sequence维度进行更细粒度pipeline并行的计算方法，可以显著提高训练性能。",{"type":18,"tag":26,"props":80,"children":81},{},[82,87],{"type":18,"tag":72,"props":83,"children":84},{},[85],{"type":24,"value":86},"Opportunity",{"type":24,"value":88},"：Transformer是多层Layer堆叠而成，每层layer包含SelfAttention和FeedForward操作。",{"type":18,"tag":26,"props":90,"children":91},{},[92],{"type":18,"tag":93,"props":94,"children":96},"img",{"alt":7,"src":95},"https://pic2.zhimg.com/80/v2-b456e3fa1644d240cd81f4a3c3317649_720w.jpg",[],{"type":18,"tag":26,"props":98,"children":99},{},[100],{"type":24,"value":101},"h_i是hidden state, 对应着input sequence中的每个position。SelfAttention的计算只依赖于t之前的hidden state，而FeedForward只依赖于h_t自己。它们都不依赖于未来的hidden state，这样的结构使得把input sequence切开并行成为可能。也就是在Transformer结构中，当前layer处理当前token时，下一个layer处理上个token。如下图中(c)和(d)对比。",{"type":18,"tag":26,"props":103,"children":104},{},[105],{"type":18,"tag":93,"props":106,"children":108},{"alt":7,"src":107},"https://pic4.zhimg.com/80/v2-9284c2b0f947184179ff46af5ac00b9b_720w.jpg",[],{"type":18,"tag":26,"props":110,"children":111},{},[112],{"type":24,"value":113},"切分input sequence (token dimension)可以和其他模型并行方式组合使用，如pipeline并行和拆分算子并行。在给定input sequence [x1, x2, …, xL]，如何找到合适切分点使得切分后[s1, s2, …, sM]，其中si包含[xl, …, xr]，使得端到端的训练效率最高。",{"type":18,"tag":26,"props":115,"children":116},{},[117,122],{"type":18,"tag":72,"props":118,"children":119},{},[120],{"type":24,"value":121},"解决方法",{"type":24,"value":123},"：选择合适的切分点很重要。若切分后的sequence太小，会使得设备利用率低；若太大，会使得bubble变大。同时，input sequence不能均分，h_t的计算依赖于之前的h1, …, h_t，处于模型后端的layer的computation load更大。下图是使用input 
sequence均分和运行时间均分的对比图。",{"type":18,"tag":26,"props":125,"children":126},{},[127],{"type":18,"tag":93,"props":128,"children":130},{"alt":7,"src":129},"https://pic3.zhimg.com/80/v2-5dc1d7e73081413c1961a2354919df92_720w.jpg",[],{"type":18,"tag":26,"props":132,"children":133},{},[134],{"type":24,"value":135},"在给定pipeline切分stage数的前提下，文章提出了用动态规划的算法找到合适的input sequence切分点。该算法可扩展到同时切分batch维和input sequence。",{"type":18,"tag":26,"props":137,"children":138},{},[139,144],{"type":18,"tag":72,"props":140,"children":141},{},[142],{"type":24,"value":143},"效果",{"type":24,"value":145},"：训练GPT-3 175B模型时，利用384 GPU环境，相比于不切分input sequence的Gpipe，此工作的切分input sequence的方法会有5-6倍的性能提升。",{"type":18,"tag":26,"props":147,"children":148},{},[149],{"type":24,"value":150},"评论：论文中提到的这种逐字计算的方式，最初的驱动力是由于Gpipe这种方式内存开销大，不过，解决Gpipe并行方式内存开销大的问题，业界还有其他的方式，例如DeepSpeed、Dapple都有对Gpipe的改进方案，一样可以做到很高的并行加速比和低内存开销。笔者认为TeraPipe算是很好的一种技术路径探索，不过从训练的角度上看，实现上可能还是DeepSpeed、Dapple那种更加高效。",{"type":18,"tag":26,"props":152,"children":153},{},[154],{"type":18,"tag":37,"props":155,"children":157},{"href":51,"rel":156},[41],[],{"title":7,"searchDepth":159,"depth":159,"links":160},4,[],"markdown","content:technology-blogs:zh:435.md","content","technology-blogs/zh/435.md","technology-blogs/zh/435","md",1776506137436]