[{"data":1,"prerenderedAt":872},["ShallowReactive",2],{"content-query-zzcPoPnHFq":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"state":13,"body":14,"_type":866,"_id":867,"_source":868,"_file":869,"_stem":870,"_extension":871},"/activities/zh/652","zh",false,"","AI漫分享 | 欢迎收听MindSpore全新播客栏目","本期为您分享：分布式并行计算的前沿发展（上）","2021-07-16","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/16/276dbe3fb13442d3a56131c658a5800b.png","activities",1,{"type":15,"children":16,"toc":863},"root",[17,25,34,39,55,60,65,82,87,94,99,104,111,118,123,128,194,201,206,210,233,240,245,255,265,277,289,301,313,325,337,349,361,373,385,397,409,421,433,445,457,469,481,493,505,517,529,538,550,562,574,586,598,610,622,634,646,658,670,682,694,706,718,730,742,754,763,772,779,784,789,794,799,804,809,814,819,824,829,834,839,844,851,856],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"ai漫分享-欢迎收听mindspore全新播客栏目",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":18,"tag":30,"props":31,"children":33},"img",{"alt":7,"src":32},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/16/9d651ba9db14449db569dab66de80fd7.gif",[],{"type":18,"tag":26,"props":35,"children":36},{},[37],{"type":24,"value":38},"欢迎大家收听MindSpore新栏目",{"type":18,"tag":26,"props":40,"children":41},{},[42,48,50],{"type":18,"tag":43,"props":44,"children":45},"strong",{},[46],{"type":24,"value":47},"AI漫分享",{"type":24,"value":49}," ",{"type":18,"tag":43,"props":51,"children":52},{},[53],{"type":24,"value":54},"vol.1",{"type":18,"tag":26,"props":56,"children":57},{},[58],{"type":24,"value":59},"听人工智能领域的专家",{"type":18,"tag":26,"props":61,"children":62},{},[63],{"type":24,"value":64},"为您讲述生动有趣的AI应用案例",{"type":18,"tag":26,"props":66,"children":67},{},[68,72,73],{"type":18,"tag":30,"props":69,"children":71},{"alt":7,"src":70},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/16/948271933c8b4751a8cd6b2aa1ddf0e1.png",[],{"type":24,"value":49},{"type":18,"tag":74,"props":75,"children":79},"a",{"href":76,"rel":77},"https://mp.weixin.qq.com/s/v6rj6TioI4dblvx443JZtg",[78],"nofollow",[80],{"type":24,"value":81},"请打开微信进行观看",{"type":18,"tag":26,"props":83,"children":84},{},[85],{"type":24,"value":86},"MindSpore社区全新上线的播客**『AI漫分享』**，这里有大家熟悉的主持晓曼小姐姐搭档柳成龙小哥哥，每期将邀请人工智能领域专家，围绕AI领域有趣的应用案例展开分享。",{"type":18,"tag":26,"props":88,"children":89},{},[90],{"type":18,"tag":30,"props":91,"children":93},{"alt":7,"src":92},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/16/e6a88ff012dd43a5aa41fd7e0a2fc7a3.gif",[],{"type":18,"tag":26,"props":95,"children":96},{},[97],{"type":24,"value":98},"本期主题：分布式并行计算的前沿发展（上）",{"type":18,"tag":26,"props":100,"children":101},{},[102],{"type":24,"value":103},"邀请嘉宾 
![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/16/2d96faa425c64c4387f994c12c9c6893.jpg)

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/16/6bf5cc483bbf4e2bb21a79eb90a4535f.png)

We discuss the main modes of distributed training (Mesh, Megatron, ZeRO, XLA).

The timeline is as follows:

- 00:25–02:30 Distributed parallelism: a background introduction from a beginner's perspective
- 02:40–07:50 When you started working on distributed parallelism, was Mesh your biggest inspiration?
- 08:30–09:06 A matrix-multiplication question from the MindSpore forum
- 11:50–13:21 Thoughts on Megatron-LM
- 13:59–16:30 Is the method proposed in ZeRO better than Mesh and Megatron?
- 16:40–19:40 Google's recent XLA work uses fused operators to reduce a lot of the update traffic; is that a viable direction?

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/16/7737d21fa62c4f8fa6177531dcaa7acd.png)

We discuss graph partitioning (GPipe, XPipe, PipeDream).

The timeline is as follows:

- 19:50–24:58 Is GPipe related to Mesh and TensorFlow?
- 24:59–25:30 XPipe predicts weights; do you think that helps with staleness?

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/16/3312a153ea4b4ebdb1625c8be40fae52.png)

Hello everyone, and welcome to AI漫分享, a brand-new podcast organized by the MindSpore open source community. We invite experts from the field of artificial intelligence to share interesting AI application cases, vent about the pains of putting AI into production, and comment on cutting-edge AI developments. I'm your host Xiaoman, and I'm Liu Chenglong.
**Liu Chenglong**: Great. Today we've invited ST, an expert from MindSpore, to talk about distributed parallelism. ST sent over at least ten papers beforehand, so we can dig in properly today. Let's start with the main modes of distributed training. Looking at the papers in the outline yesterday, Mesh-TensorFlow [0] seems to have come out quite early.

**ST**: Right, it came out fairly early.

**Liu Chenglong**: And then Megatron-LM [1] came later, and then ZeRO [2]. Reading them, I felt the ZeRO paper is very well written, really accessible.

**ST**: It is. That paper is very concise and to the point, though it still takes some effort to digest.

**Liu Chenglong**: Let me start from a complete beginner's perspective and lay out my rough understanding, and you can tell me whether it's right. In AI, distributed training is generally aimed at large-scale scenarios, and we usually distinguish data parallelism, model parallelism, and pipeline parallelism. Data parallelism is the most common: as long as the model fits on a single device, as long as it fits in memory, data parallelism is applicable. Essentially each node processes its own mini-batch, and all nodes then compute an averaged gradient. Model parallelism came later, because some models are simply too big to fit no matter what; model parallelism and pipeline parallelism both exist to solve that problem. Model parallelism mainly means cutting vertically, along the layers, while pipeline parallelism cuts horizontally, with each layer split at operator granularity.

**ST**: Whether pipeline parallelism is a horizontal cut or a vertical one really depends on how you draw the graph; if the graph is drawn horizontally, then it's a vertical cut.
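To make the data-parallel part of that picture concrete, here is a minimal, framework-free sketch (not from the episode) of "every node computes a gradient on its own mini-batch and the gradients are averaged"; the toy model, loss, worker count, and learning rate are invented for illustration:

```python
import numpy as np

rng = np.random.default_rng(0)

# A toy linear model y = X @ w trained with squared loss, replicated on 4 "workers".
n_workers, n_features = 4, 8
w = rng.normal(size=n_features)            # identical replica on every worker

def local_gradient(w, X, y):
    """Gradient of 0.5 * ||X @ w - y||^2 / len(y) on one worker's mini-batch."""
    return X.T @ (X @ w - y) / len(y)

for step in range(100):
    # Each worker draws its own mini-batch (here: synthetic data).
    grads = []
    for _ in range(n_workers):
        X = rng.normal(size=(16, n_features))
        y = X @ np.ones(n_features)        # "true" weights are all ones
        grads.append(local_gradient(w, X, y))

    # The all-reduce step: average the per-worker gradients so every
    # replica applies the same update and stays in sync.
    avg_grad = np.mean(grads, axis=0)
    w -= 0.1 * avg_grad

print(np.round(w, 3))                      # ~1.0 everywhere after training
```

In a real framework the inner loop over workers is replaced by one collective all-reduce over the devices, but the arithmetic is the same.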
**Liu Chenglong**: There are also techniques that aren't strictly parallelism but still help, like offloading a lot of the activations to the CPU. That's roughly the background from a beginner's point of view. So, where shall we start? When you began working on distributed parallelism, what inspired you the most? Was it Mesh?

**ST**: Mesh is the best known. Before Mesh there were already some smaller papers, not as famous as Mesh, which was published at NIPS, that had started exploring the same line of thinking, but they never proposed a complete system like this. After reading the Mesh paper, it really was the biggest inspiration, and from it we went back and dug up a great deal of other material. It turns out that before Mesh, a PhD student from Stanford had already proposed many ideas in this area. Because the Mesh paper is written at a fairly high level, we also had to read a number of other papers to sort out the landscape of model parallelism. Mesh itself is essentially a framework built on top of TensorFlow, and when it was released it mainly targeted large Transformer networks. For Transformer-style networks, the typical example nowadays is GPT-3: the layers stack very deep, the model can be scaled horizontally and vertically, and it can grow extremely large. Mesh also made the point that scaling the model up actually improves accuracy, which is a trend the industry is watching. Fundamentally, a neural network structure scales either by getting wider or by getting deeper, stacking more of the same blocks, and each structure probably has some limit to its expressive power: colloquially, how "fat" or how "long" it can usefully grow. Transformers are particularly strong in this respect; you can stack them to dozens of layers. ResNet is similar: ResNet-34, 50, 101 are the same structure stacked over and over. For this family of models, Transformer-style architectures in particular, Mesh supports them especially well.

**Liu Chenglong**: I've noticed that Google seems to love building very comprehensive systems. When I read their differential-privacy work, it was also fully specified, almost like a complete type system. GPipe [3] and Mesh-TensorFlow look similar: they like to define an entire system. Mesh-TensorFlow is a language of its own, right?

**ST**: It is.

**Liu Chenglong**: They define everything for you, from the language level down to compilation; for example, which tensor dimensions you are allowed to declare, and then execution goes through that. Do you think that is a good approach, or does it go a bit too far?

**ST**: Indeed, Mesh calls itself a DSL, but in essence it is still Python with a set of APIs abstracted on top, so I wouldn't say it goes that deep. The core idea is to give each axis a name; today TensorFlow and newer versions of PyTorch both have this feature. In the original TensorFlow, axes have no names: we can only say axis 0, axis 1. Once axes are named, every dimension of a tensor can be tracked through the whole network. For a convolution, say, among the four NCHW axes, the N axis of the first convolution sits in position 0, the N axis of the second convolution also sits in position 0, and H and W likewise map to, say, position 2 or 3; but by sharing a name, the axes that mean the same thing in two operators are linked together. The benefit is that in Mesh-style tensor-slicing model parallelism, parallelizing really just means slicing some axis open. With named axes, if you say "slice the N axis" or "slice the H axis", then every axis named N, or every axis named H, across the entire network gets sliced. So one name declaration lets you cut the whole network open along that axis, which is very convenient for the user. Otherwise the first convolution might be sliced along one axis and the second along another; with names, a single scheme slices the whole model consistently. That is the most interesting part of Mesh.
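A minimal sketch of the idea ST describes, and not the real Mesh-TensorFlow API: axes carry global names, a "computation layout" maps tensor-axis names onto mesh dimensions, and the per-device shard shape falls out of that mapping. The axis names, mesh sizes, and tensor shapes below are invented for illustration:

```python
# Toy illustration of named axes + a computation layout (not the Mesh-TensorFlow API).
mesh_shape = {"data": 8, "model": 4}            # 32 devices arranged as an 8 x 4 mesh
layout = {"batch": "data", "hidden": "model"}   # tensor-axis name -> mesh-dimension name

def shard_shape(named_shape, layout, mesh_shape):
    """Per-device shape of a tensor whose axes are given as {axis_name: size}."""
    out = {}
    for axis, size in named_shape.items():
        mesh_dim = layout.get(axis)             # unmapped axes stay replicated
        parts = mesh_shape[mesh_dim] if mesh_dim else 1
        assert size % parts == 0, f"axis {axis} not divisible across mesh dim {mesh_dim}"
        out[axis] = size // parts
    return out

# The same names appear in every layer, so one layout slices the whole network:
embedding = {"vocab": 32000, "hidden": 4096}
activation = {"batch": 512, "seq": 1024, "hidden": 4096}

print(shard_shape(embedding, layout, mesh_shape))   # {'vocab': 32000, 'hidden': 1024}
print(shard_shape(activation, layout, mesh_shape))  # {'batch': 64, 'seq': 1024, 'hidden': 1024}
```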
**Liu Chenglong**: And at execution time, does having a reasonably strong type system also help execution? Instead of declaring each axis arbitrarily, everything follows one unified set of primitives, right? That does seem like it would make things cleaner.

**ST**: It does. Once axes have names, many mistakes in writing the model can be reported as early as the compilation stage, or even while you are writing it. Some axes really are coupled: in a matrix multiplication, the columns of the first matrix and the rows of the second are related; those two axes should carry the same name, because their sizes must be identical. That is the constraint matrix multiplication imposes. A few days ago a user on the MindSpore forum modified one of our samples and asked why it wouldn't run; they had hit exactly this matrix-multiplication issue. They had given the first matrix's column count and the second matrix's row count two different lengths, which violates the mathematical definition of matrix multiplication, so of course it couldn't run. With a named tensor, where all the names are fixed and the coupled axes are pinned down once, that kind of error simply can't happen: you define it once, and it is guaranteed to satisfy the rules of the math.

**Liu Chenglong**: Have we seen anything similar to Mesh-TensorFlow in other frameworks?

**ST**: Yes. The latest PyTorch, 1.3 I believe.

**Liu Chenglong**: Also a kind of DSL?

**ST**: Right, though not at the distributed layer, but at the layer that describes tensor dimensions. Dimensions used to just be 0, 1, 2, 3, 4; now each dimension gets a name, which can express the relationships between different dimensions through a single global name. PyTorch is introducing this too; it is still at the experimental stage, but the functionality is already there.

**Liu Chenglong**: That should also be a big benefit for compilation.

**ST**: Yes. For the distributed expression there is this notion of a named tensor, which was proposed by a lab at Harvard. It is exactly what you said: introduce a stronger type system to avoid mistakes, which is one of its goals, and they laid out a fairly complete way of defining named tensors. PyTorch now has a version of this, but because it is essentially a language-level type system, making it complete is quite painful. When we first built the distributed layer of MindSpore we also considered introducing named tensors, but we found that a named tensor really amounts to introducing a type system, and making that type system reasonably self-consistent and complete is very hard. In the end we didn't do it, precisely because the work needed to make it complete is substantial. It is something we keep considering, but it hasn't actually made it into our system yet.
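For readers who want to poke at the PyTorch feature ST mentions, a minimal sketch, assuming a PyTorch build with the experimental named-tensor support (the exact error message varies across versions):

```python
import torch

# Experimental named tensors (available since roughly PyTorch 1.3):
# every axis carries a name, and mismatched names are caught at op time.
x = torch.randn(3, 4, names=("batch", "hidden"))
y = torch.randn(3, 4, names=("batch", "hidden"))
z = x + y                      # names agree -> ok, result keeps ("batch", "hidden")
print(z.names)

w = torch.randn(3, 4, names=("batch", "features"))
try:
    _ = x + w                  # "hidden" vs "features" -> RuntimeError at op time
except RuntimeError as err:
    print("caught:", err)
```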
**Liu Chenglong**: If implementing a complete system on my own is too complex, could this become a shared standard instead, something several frameworks agree on: at the coarse level, named tensors should follow these few rules, and each framework provides its own implementation?

**ST**: I think that is possible, that a standard could form. The definition of named-tensor names is really tied to the concrete operators. Take our matrix-multiplication example: the columns of the first matrix and the rows of the second look like two axes but are really one; convolutions have the same kind of related dimensions. I think this can be standardized, because ultimately it all comes from the mathematical formulas; you are just giving the different dimensions in each formula a unified name.

**Liu Chenglong**: OK, let's look at Megatron next. When I read the Megatron paper I didn't feel it made a strong impression on me. What's your view of Megatron-LM?

**ST**: Megatron essentially follows the GPT-2 model; it implements distributed parallel training of the GPT-2 model. Put simply, I think you can see it as a hand-written version of Mesh: the parallel patterns it uses are the same as what Mesh-TensorFlow does for Transformers, except that Mesh turned the parallelism into a framework, so when you call it through the external user interface you don't perceive the parallelism inside, which the framework unfolds for you. Megatron is built on PyTorch and hand-writes one specific parallel pattern directly.

**Liu Chenglong**: It's shipped as an added library, right?

**ST**: Right, it adds a library, and in that library the hand-written parallelism ultimately achieves the same effect as Mesh. It just means the user calls native Python APIs and writes the parallelism of this particular network by hand. The problem is that what they wrote can only be used inside Megatron; it can't be reused for other networks, because it is a hand-crafted thing. The big lesson Megatron gave us is that when you build these large models, a large model is a very good way to demonstrate a system's capability.

**Liu Chenglong**: OK, let's look at two more recent ones, ZeRO and the XLA work Google published. The ZeRO paper essentially looks at model parallelism as a whole and observes that the problem is the large amount of state you have to keep around. Roughly it splits that state into two classes: the model-related part, the parameters and so on, and then the residual states, activations and the like, and it handles the two classes differently. Do you think the method in ZeRO is better than Mesh and Megatron, or is it just another angle?

**ST**: ZeRO really has two parts. One part is essentially the same as Megatron and Mesh; the implementations differ, but from a parallelism point of view it is still model parallelism, still tensor slicing, and the slicing rules are similar, targeting the Transformer structure. ZeRO's biggest distinguishing feature is that it proposes doing model parallelism inside the optimizer. Today that optimizer computation is duplicated: under data parallelism, after the final gradient reduction every node holds identical gradients, and then the optimizer runs; on a one-machine, eight-card setup, all eight nodes run exactly the same optimizer computation. The ZeRO paper parallelizes the optimizer as well: the optimizer state, which behaves much like another set of parameters, is partitioned, and each node holds only one slice, so the optimizer is parallelized too. The payoff is large, because the amount of optimizer state is very big.
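A minimal sketch of the optimizer-state partitioning idea, not DeepSpeed or ZeRO code: parameters and gradients stay replicated on every rank, but each rank owns only its shard of the Adam moments, updates only its shard of the parameters, and the updated shards are then gathered back. The sizes and hyper-parameters are made up:

```python
import numpy as np

n_ranks, n_params, lr, b1, b2, eps = 4, 1_000_000, 1e-3, 0.9, 0.999, 1e-8

params = np.zeros(n_params, dtype=np.float32)          # full replica on every rank
shards = np.array_split(np.arange(n_params), n_ranks)  # who owns which slice
m = [np.zeros(len(s), dtype=np.float32) for s in shards]   # per-rank first moment
v = [np.zeros(len(s), dtype=np.float32) for s in shards]   # per-rank second moment

def step(full_grad, t):
    """One Adam step where each rank updates only its own parameter shard."""
    global params
    new_pieces = []
    for r, idx in enumerate(shards):
        g = full_grad[idx]                              # rank r looks at its slice only
        m[r] = b1 * m[r] + (1 - b1) * g
        v[r] = b2 * v[r] + (1 - b2) * g * g
        m_hat = m[r] / (1 - b1 ** t)
        v_hat = v[r] / (1 - b2 ** t)
        new_pieces.append(params[idx] - lr * m_hat / (np.sqrt(v_hat) + eps))
    params = np.concatenate(new_pieces)                 # the "all-gather" of shards

rng = np.random.default_rng(0)
for t in range(1, 4):
    step(rng.normal(size=n_params).astype(np.float32), t)

# Without partitioning, every rank would hold m and v for all n_params elements;
# here each rank stores only about n_params / n_ranks of each moment.
print(params[:4], sum(len(s) for s in shards) == n_params)
```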
**Liu Chenglong**: They give an example: the GPT-2 model itself only needs a gigabyte or so, yet a 32 GB GPU still can't hold it, because the other state is tens of times larger.

**ST**: Right. Today, when we run dense models, a 32 GB card can basically only fit a dense model of around 4 GB; sparse models can go bigger. For dense models that is roughly the ratio, and it depends heavily on memory management and memory reuse, which is why memory management and reuse are core parts of a framework. ZeRO brings the memory usage down by partitioning the optimizer, because that state is otherwise replicated across nodes; with lower memory usage, data parallelism can run much larger models. From our perspective, data parallelism, especially for Transformer structures, should in theory give the best results, the best computation-to-communication ratio. ZeRO pushes data parallelism to larger scale, and on top of that it adds model parallelism, which is similar to what Megatron and Mesh do.

**Liu Chenglong**: Google's recent XLA paper [4] is also quite interesting. My rather shallow reading is that the interesting part is that it uses fused operators, hoping to cut down a lot of the update traffic. Do you think that is a viable direction?

**ST**: It is. On the Ascend hardware platform we also write a large number of fused operators and use them to improve end-to-end performance. After fusing, a lot of overhead goes away: on the Da Vinci architecture, for example, there are multiple levels of cache, so fusing several operators reduces the data movement between cache and memory. Some fusions also expose parallelism in the math itself, and some terms can be reduced away, lowering the compute. So fusion wins in two ways: the computation can be cheaper, and the memory traffic, including the movement between cache levels, comes down. Many frameworks, ZeRO included, nominally do parallelism but internally also write some custom operators, because how the operators are optimized has a huge impact on final performance.

**Liu Chenglong**: In the XLA paper they use sharding very heavily. Coming from a traditional database background, sharding information across nodes, wouldn't the overhead be fairly large? They seem to do it quite thoroughly.

**ST**: Yes, there is overhead. For the sharding to pay off, you depend on the underlying communication bandwidth, and the sharding works at two levels. On something like the Ascend platform, or a GPU architecture, a chip is multi-core: a single die may have, say, 8 or 32 cores, and because the cores sit on the same die, the bandwidth between them is very high. Tensor sharding necessarily triggers a series of communications, which is extra overhead we introduce: the amount of computation stays the same, and in essence the goal is still to spread that computation across multiple cores or multiple cards, so the communication bandwidth underneath is critical. XLA also does a sharding optimization which, as far as I can tell, is essentially the same as ZeRO; the paper doesn't go into depth, but judging from the flow and the structure it should be the same: the optimizer state takes a lot of memory, so split it across multiple cores and run it there, which is the ZeRO idea. ZeRO was indeed the first to propose it. We had discussed related ideas much earlier, but at the time we looked at the problem more from a performance angle than from a memory angle, because with a ZeRO-style split some communication is introduced, and on some systems, if the communication bandwidth is not that high, performance actually drops a bit. The benefit is that for large models it brings memory down and lets you run bigger models, so for large models the effect is very good; for small models, doing it the ZeRO way may well hurt performance.

**Liu Chenglong**: OK, let's move to the next segment, graph partitioning: GPipe, XPipe [5], PipeDream [6]. Which came slightly earlier?

**ST**: GPipe is the earliest; XPipe is the most recent.

**Liu Chenglong**: Is GPipe related to Mesh and TensorFlow? The line of thinking feels very similar.

**ST**: They are both frameworks built on top of TensorFlow; GPipe is also an official TensorFlow library. GPipe is early, because when model parallelism first started it was usually done by partitioning the graph, that is, the pipeline approach. In the taxonomy, some papers and some people treat pipelining as a separate category of parallelism, and others lump it under model parallelism. But GPipe-style pipelining is really the earliest. When TensorFlow first came out it actually did model parallelism, but that model parallelism was just graph partitioning. That was tied to the hardware of the time: early GPUs might have only 8 GB of memory, and the first-generation TPU also had about 8 GB, so to run even a slightly larger model you had to use model parallelism; it simply wouldn't run otherwise. Unlike today, where our Ascend platform starts at 32 GB. When the hardware was weak, model parallelism was used a lot; as memory grew, many models that need more memory can now be run with data parallelism alone. When BERT first came out, why couldn't many companies run it? Because most of their existing machines had 8 GB of memory, and BERT arrived with a very large parameter count, so it wouldn't run. Around that time, model parallelism was hotly discussed online and in academic circles for quite a while, because such a big model had appeared, people found their machines couldn't run it, and they wanted to solve that. Coming back to GPipe, the question it addresses is how to make pipelining more efficient. With pipeline parallelism under fully synchronous computation, the bubble it creates is fairly large, the idle time is fairly large: after the forward pass we have to run the backward pass, and until the backward pass finishes, the forward work of the next iteration has to sit and wait, so the waiting time is long; in plain words, our compute sits idle for a long time. GPipe's contribution is to split each mini-batch into what it calls micro-batches and pipeline those, which brings the bubble, the idle waiting, down. Even reduced, the idle time is still sizeable, so it is an improvement rather than a cure. When we started MindSpore we tried the GPipe approach, and we also used PyTorch to do some early validation of parallelism; with PyTorch we tried GPipe as well, and it does work, but the waiting time in the computation is still fairly long. Then came the PipeDream paper, which builds on GPipe and introduces asynchrony. Introducing asynchrony into the pipeline solves the bubble problem, but asynchrony affects the accuracy of the network; for a small number of networks, for example OCR-type networks, the impact is somewhat smaller.
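ST's point about micro-batches and bubbles can be put in one commonly cited approximation: with K pipeline stages and M micro-batches per mini-batch, the idle fraction of a GPipe-style synchronous pipeline is roughly (K − 1) / (M + K − 1). A tiny sketch:

```python
def bubble_fraction(stages: int, micro_batches: int) -> float:
    """Approximate idle fraction of a GPipe-style synchronous pipeline."""
    return (stages - 1) / (micro_batches + stages - 1)

# Splitting the mini-batch into more micro-batches shrinks the bubble:
for m in (1, 4, 8, 32):
    print(f"K=4 stages, M={m:>2} micro-batches -> bubble ≈ {bubble_fraction(4, m):.2%}")
```

With a single micro-batch the pipeline is idle about 75% of the time in this 4-stage example; with 32 micro-batches the bubble falls below 10%, which is exactly why GPipe slices the mini-batch.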
**Liu Chenglong**: The PipeDream paper mentions that its main drawback is staleness, that the information becomes out of date. Is that a problem common to all asynchronous approaches?

**ST**: Yes, it is common to all asynchronous approaches.

**Liu Chenglong**: What mainly causes it? Is it that asynchrony effectively adds another dimension of distance that creates the problem, or are other factors behind the staleness?

**ST**: Looked at from the synchronous baseline: with SGD, each step moves forward based on the result of the previous step; that is how we know which direction, along which component of the gradient, this step should move. Once asynchrony is introduced, the current step may not be based on the previous step's result but on the step before that, or on a result from 100 steps ago. If the asynchrony spans a lot and the computation is fast, it may well be some result from 100 steps back, and then the gradient direction is not found as accurately. That is the situation. I do have a personal observation about asynchrony: we used to do asynchrony with parameter servers, and now PipeDream does asynchrony through the pipeline. Both are asynchronous, but because the mode of asynchrony changes, there is an effect on the optimizer itself. In many asynchrony papers, the authors not only build the asynchronous scheme but also design an optimizer, and use that optimizer to improve the training efficiency of the asynchronous scheme, or the final convergence accuracy. If the asynchrony mode changes, the optimizer design probably has to change with it, so an asynchronous approach really has to be a complete, systematic package.

**Liu Chenglong**: The more novel point XPipe makes is that it predicts the weights. Do you think that helps with staleness? Their paper says it works quite well; have we tried it in practice?

**ST**: We actually haven't tried it yet, since the paper is quite new and we don't have a team working on it.

**Liu Chenglong**: So it essentially stacks up historical information to preserve the correlation. Great, thank you, ST!

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/16/871f4305c7c64e8ea0acc32d748cd405.png)
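Before the references, a toy illustration of the staleness gap and the weight-prediction idea discussed at the end; this is a simplified momentum-based lookahead invented here for illustration, not the XPipe algorithm, and the learning rate, delay, and loss are arbitrary:

```python
import numpy as np

# An asynchronous pipeline stage computes gradients on a weight copy that is
# `delay` updates old; extrapolating that copy forward with the momentum buffer
# usually keeps it closer to the weights the update will actually be applied to.
lr, beta, delay, steps = 0.05, 0.9, 4, 200
w, v = np.full(8, 10.0), np.zeros(8)
snapshots = [(w.copy(), v.copy())]
gap_stale, gap_pred = [], []

for t in range(steps):
    grad = w                                    # exact gradient of 0.5 * ||w||^2
    v = beta * v + grad
    w = w - lr * v
    snapshots.append((w.copy(), v.copy()))

    if t >= delay:
        old_w, old_v = snapshots[t + 1 - delay]  # the stale copy a stage would see
        pred_w = old_w - lr * delay * old_v      # momentum-based lookahead guess
        gap_stale.append(np.linalg.norm(old_w - w))
        gap_pred.append(np.linalg.norm(pred_w - w))

print("mean gap, raw stale copy :", np.mean(gap_stale))
print("mean gap, predicted copy :", np.mean(gap_pred))
```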
[0] Mesh-TensorFlow: Deep Learning for Supercomputers. "Where data-parallelism can be viewed as splitting tensors and operations along the 'batch' dimension, in Mesh-TensorFlow, the user can specify any tensor-dimensions to be split across any dimensions of a multi-dimensional mesh of processors. (1) Tensors have named dimensions. (2) A global 'computation layout' is a partial map from tensor-dimension to mesh-dimension specifying which tensor-dimensions are split across which dimensions of the processor mesh."

[1] Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.

[2] ZeRO: Memory Optimizations Toward Training Trillion Parameter Models.

[3] GPipe: Easy Scaling with Micro-Batch Pipeline Parallelism. "GPipe and Mesh-TensorFlow provide frameworks for model parallelism of different kinds. However, they require rewriting the model, and rely on custom compilers and frameworks that are still under development. With GPipe, each model can be specified as a sequence of layers, and consecutive groups of layers can be partitioned into cells. Each cell is then placed on a separate accelerator." Interface: (i) the number of model partitions K; (ii) the number of micro-batches M; (iii) the sequence and definitions of L layers that define the model.

[4] Automatic Cross-Replica Sharding of Weight Update in Data-Parallel Training. "Weight update is not sharded in data parallelism because the weights and gradients do not have a batch dimension to be partitioned. Our goal is to enable sharded weight update across the replicated devices as an optimization, without using more devices."

[5] XPipe: Efficient Pipeline Model Parallelism for Multi-GPU DNN Training. "Most importantly, the novel weight prediction strategy adopted by XPipe enables it to effectively address the weight inconsistency and staleness issues incurred by the asynchronous pipeline parallelism."

[6] PipeDream: Generalized Pipeline Parallelism for DNN Training. "PipeDream suffers from the staleness problem because it uses different versions of weights in the whole feedforward/back-propagation round. PipeDream divides the model among available workers, assigning a group of consecutive operators (called layers in DNN terminology) in the operator graph to each of them, and then overlaps the computation and communication of different inputs in a pipelined fashion."

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/16/1579673aa35e4450bcb75cd697f6106d.gif)

That's it for part one of "Recent advances in distributed parallel computing". If you're left wanting more, part two continues soon; follow MindSpore for the latest updates!

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/16/ab5805b865694f96b0193ddabe6b5b6e.jpg)