[{"data":1,"prerenderedAt":359},["ShallowReactive",2],{"content-query-x9zSL43SGe":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":353,"_id":354,"_source":355,"_file":356,"_stem":357,"_extension":358},"/technology-blogs/zh/667","zh",false,"","大V博文系列：盘古大模型的推理解决方案：增量推理+分布式推理","增量推理+分布式推理","2021-07-28","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/28/3adf011afb1a4355a01682183916a27c.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":347},"root",[17,25,31,44,55,61,66,75,80,85,90,95,103,108,113,121,126,146,151,160,165,170,175,183,188,196,201,206,211,216,221,226,231,236,244,249,260,268,273,278,283,326,334,342],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"大v博文系列盘古大模型的推理解决方案增量推理分布式推理",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"作者：金雪锋",{"type":18,"tag":26,"props":32,"children":33},{},[34,36],{"type":24,"value":35},"作者主页：",{"type":18,"tag":37,"props":38,"children":42},"a",{"href":39,"rel":40},"https://www.zhihu.com/people/jin-xue-feng",[41],"nofollow",[43],{"type":24,"value":39},{"type":18,"tag":26,"props":45,"children":46},{},[47,49],{"type":24,"value":48},"文章来源：",{"type":18,"tag":37,"props":50,"children":53},{"href":51,"rel":52},"https://zhuanlan.zhihu.com/p/393812996",[41],[54],{"type":24,"value":51},{"type":18,"tag":56,"props":57,"children":59},"h2",{"id":58},"增量推理",[60],{"type":24,"value":58},{"type":18,"tag":26,"props":62,"children":63},{},[64],{"type":24,"value":65},"鹏程.盘古的基础结构是Transformer的Decoder模块，这种自回归（Auto-regressive）的语言模型会根据上文预测下一个字，因此在推理时会根据输入的文本逐字（词）生成。显然这种方式会导致每一步推理的网络输入shape在不断变大。",{"type":18,"tag":26,"props":67,"children":68},{},[69],{"type":18,"tag":70,"props":71,"children":74},"img",{"alt":72,"src":73},"1.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202107/28/110018skpdxeyzngqhn6h0.jpg",[],{"type":18,"
tag":26,"props":76,"children":77},{},[78],{"type":24,"value":79},"静态图执行时，要求图中每个算子的shape不能改变，否则执行会报错；动态图执行时，在不同迭代间，图中每个算子的shape可以改变，不过改变了算子shape，就无法利用之前缓存的算子编译信息，每次都需重新编译，会影响性能。因此对于这种自回归模型，通用做法一般为将输入padding到固定长度（如1024），以此保证每一步推理各个算子的shape保持一致。",{"type":18,"tag":26,"props":81,"children":82},{},[83],{"type":24,"value":84},"这种padding的方式易于实现，无需做算法修改，但是明显会引入冗余计算，会极大地降低推理的性能。",{"type":18,"tag":26,"props":86,"children":87},{},[88],{"type":24,"value":89},"结合Transformer的attention结构特点，业界有一种状态复用（state reuse）的改进算法，以下称增量推理。",{"type":18,"tag":26,"props":91,"children":92},{},[93],{"type":24,"value":94},"Attention机制可以由下式表示",{"type":18,"tag":26,"props":96,"children":97},{},[98],{"type":18,"tag":70,"props":99,"children":102},{"alt":100,"src":101},"2.png","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202107/28/110038z0t4r93ihgsfhz5y.png",[],{"type":18,"tag":26,"props":104,"children":105},{},[106],{"type":24,"value":107},"其中Q和V的shape为（batch_size, num_heads, seq_length, size_per_head）;K的shape为（batch_size, num_heads, size_per_head, 
seq_length）。",{"type":18,"tag":26,"props":109,"children":110},{},[111],{"type":24,"value":112},"对应维度为seq_length这一维，每一个位置分别对应输入的相应位置，回顾图1的形式，不同步的输入，其前面的部分完全相同，当计算seq-index为i的位置时，前面0~i-1位置对应的state在上一步推理中已经计算过，因此在整个推理过程中存在很多的重复计算，如果能够通过某种方式保存下当前步计算出的state供给下一步使用，即可省掉这些重复计算，这便是增量计算的思想，下图展示了增量推理的计算逻辑。",{"type":18,"tag":26,"props":114,"children":115},{},[116],{"type":18,"tag":70,"props":117,"children":120},{"alt":118,"src":119},"3.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202107/28/110101myuapuyyshx7g9be.jpg",[],{"type":18,"tag":26,"props":122,"children":123},{},[124],{"type":24,"value":125},"当使用增量推理的时候，",{"type":18,"tag":127,"props":128,"children":129},"ol",{},[130,136,141],{"type":18,"tag":131,"props":132,"children":133},"li",{},[134],{"type":24,"value":135},"首先将使用完整输入推理一步，此时通过某种方式保存下来输入对应的state，",{"type":18,"tag":131,"props":137,"children":138},{},[139],{"type":24,"value":140},"在第二步推理时，输入仅为上一步推理得到的字（词），然后将本步推理得到的state与保存下来的前序state拼接，作为本步推理的完整state，同时再次保存state，得到本步的输出字（词）",{"type":18,"tag":131,"props":142,"children":143},{},[144],{"type":24,"value":145},"重复步骤2，直到推理结束。",{"type":18,"tag":26,"props":147,"children":148},{},[149],{"type":24,"value":150},"通过这种方式，在需要多次运行的步骤2中，可以保证最小的输入shape（seq_length=1），这样可以极大提升推理性能。",{"type":18,"tag":26,"props":152,"children":153},{},[154],{"type":18,"tag":155,"props":156,"children":157},"strong",{},[158],{"type":24,"value":159},"MindSpore1.3实现增量推理",{"type":18,"tag":26,"props":161,"children":162},{},[163],{"type":24,"value":164},"详细分析上述增量推理的步骤，有两个问题：一是第一步推理的输入是不定长的，后续推理步骤的输入是固定长度（seq_length=1）；二是如何通过“某种方式”保存下中间的state。",{"type":18,"tag":26,"props":166,"children":167},{},[168],{"type":24,"value":169},"对于第一个问题，当使用动态图模式时，每次推理的第一步都会遇到一个不定长的输入，在后续步中，将本次推理的state与前序state拼接时也会遇到长度不断增加的情况，不过这种不定长的情况对于动态图来说，不会出现执行错误的问题，只会有些许性能损失。而当使用静态图时，这两个不定长的情况则会直接导致执行错误。",{"type":18,"tag":26,"props":171,"children":172},{},[173],{"type":24,"value":174},"在使用MindSpore实现时，对于第一步推理，我们将输入padding到max_length（1024长度），这样应对不同输入语句时不会遇到shape改变的情况。对于state
拼接时，我们并没有使用concat来进行拼接，而是使用加法来进行“拼接”，将所有的state存储到max_length长度的向量中，只更新其有效对应位置的值，其余位置置零，最后使用加法进行“拼接”，具体流程如下图所示。",{"type":18,"tag":26,"props":176,"children":177},{},[178],{"type":18,"tag":70,"props":179,"children":182},{"alt":180,"src":181},"4.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202107/28/110127txtogryiaqtinrlv.jpg",[],{"type":18,"tag":26,"props":184,"children":185},{},[186],{"type":24,"value":187},"由于使用增量推理的方式，我们将推理过程分为两个阶段，两阶段共享参数。第一阶段（seq_length=max_length）执行一步，然后执行第二阶段（seq_length=1）若干步。当对下一个样本进行推理时再重复上述步骤，具体流程如下。",{"type":18,"tag":26,"props":189,"children":190},{},[191],{"type":18,"tag":70,"props":192,"children":195},{"alt":193,"src":194},"5.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202107/28/110142cynillobqowhfzzd.jpg",[],{"type":18,"tag":26,"props":197,"children":198},{},[199],{"type":24,"value":200},"对于问题二，该如何通过“某种方式”保存中间的state。最直接的想法是将state作为网络的输出返回到host侧，在下一步推理时，再将state作为输入传递给网络。这种方式明显的问题在于每一步的state的传入与传出，很遗憾的是，state的维度过大，经我们实验发现对于鹏程.盘古13B的模型，当输入seq_length=1时，推理耗时基本与state的传出耗时接近。因此很自然地，我们将state保存在了device上，作为一个网络的parameter，从而避免了state的传入传出。",{"type":18,"tag":26,"props":202,"children":203},{},[204],{"type":24,"value":205},"我们通过鹏程.盘古在Ascend910上进行了一系列实验，结果如下图所示：",{"type":18,"tag":26,"props":207,"children":208},{},[209],{"type":24,"value":210},"可以看出使用了这种增量推理的方式，第二阶段（输入长度=1）的执行速度可以达到第一阶段（输入长度=1024）的5倍，随着bs的增大，提升愈发明显。而且在增量推理过程中，第一阶段只执行一次，而第二阶段会执行多次，整体性能提升比较明显。",{"type":18,"tag":26,"props":212,"children":213},{},[214],{"type":24,"value":215},"需要指出的是，由于网络中LayerNorm算子的存在，增量推理与常规推理在数学原理上并不完全等价，不过在我们的下游任务实验中发现，增量推理与常规推理的精度基本一致。",{"type":18,"tag":56,"props":217,"children":219},{"id":218},"分布式推理",[220],{"type":24,"value":218},{"type":18,"tag":26,"props":222,"children":223},{},[224],{"type":24,"value":225},"盘古alpha是最大的稠密形式的中文预训练语言模型，拥有2000亿参数。如此庞大的模型在推理时，无法简单地部署在单卡上，需要使用分布式推理。",{"type":18,"tag":26,"props":227,"children":228},{},[229],{"type":24,"value":230},"分布式推理是指推理阶段采用多卡进行推理，分布式推理与单卡推理相比，大部分流程相似，其中并行策略的给定和分布式训练情况下脚本一致，
即通过设置并行策略配置模型并行、通过设置pipeline_stage配置pipeline并行，相应的HCCL集合通信算子会由自动并行模块自动插入。",{"type":18,"tag":26,"props":232,"children":233},{},[234],{"type":24,"value":235},"对于鹏程.盘古模型的分布式推理，我们使用了如下图所示的两种并行策略，分别是OP-Level和PipeLine模型并行。",{"type":18,"tag":26,"props":237,"children":238},{},[239],{"type":18,"tag":70,"props":240,"children":243},{"alt":241,"src":242},"6.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202107/28/1101582xihy32gxvdygrxb.jpg",[],{"type":18,"tag":26,"props":245,"children":246},{},[247],{"type":24,"value":248},"OP-Level模型并行是算子级别的并行，会将单个tensor在不同的维度进行切分，每块卡只保存tensor的一部分；pipeline模型并行是将整个模型切分为几张子图，每张子图放置在若干节点上。跨子图的通信需要使用Send/Recv算子，而不同分布的tensor之间需要使用AllReduce/AllGather等算子进行tensor的重新排布。需要正确的插入通信算子才能保证执行结果的正确性。",{"type":18,"tag":26,"props":250,"children":251},{},[252,254],{"type":24,"value":253},"具体可参考往期文章：",{"type":18,"tag":37,"props":255,"children":258},{"href":256,"rel":257},"https://www.zhihu.com/question/456443707/answer/1856014437",[41],[259],{"type":24,"value":256},{"type":18,"tag":26,"props":261,"children":262},{},[263],{"type":18,"tag":155,"props":264,"children":265},{},[266],{"type":24,"value":267},"MindSpore1.3 Serving部署盘古推理",{"type":18,"tag":26,"props":269,"children":270},{},[271],{"type":24,"value":272},"一次完整的推理会话包括多个tokens的生成，增量推理的两个阶段的每次推理将生成一个token，需要多次推理，且过程中需要保持和共享权重数据（包括state数据）。",{"type":18,"tag":26,"props":274,"children":275},{},[276],{"type":24,"value":277},"在MindSpore 
Serving中实现上述增量推理部署，将遇到以下两个问题：两个阶段的推理输入长度不同，存在两个推理入口；由于推理的state在每单次推理后更新，以用于下一次增量推理，所以增量推理是有状态的模型，在一次请求执行结束前不能有其他请求中间干扰。",{"type":18,"tag":26,"props":279,"children":280},{},[281],{"type":24,"value":282},"为解决上述问题，通过子图间编排串接两个阶段（体现为两个子图）的执行，一次子图编排脚本的执行作为一次推理会话，仅当上次推理会话执行结束后，才会执行下一次会话，避免多个会话同时执行相互干扰。具体的推理过程如下：",{"type":18,"tag":127,"props":284,"children":285},{},[286,291,296,301,306,311,316,321],{"type":18,"tag":131,"props":287,"children":288},{},[289],{"type":24,"value":290},"在模型配置Python脚本中定义串接两个阶段的编排脚本。",{"type":18,"tag":131,"props":292,"children":293},{},[294],{"type":24,"value":295},"启动Serving服务器，加载模型。",{"type":18,"tag":131,"props":297,"children":298},{},[299],{"type":24,"value":300},"客户端通过gRPC或者RESTful将请求文本语句发送给Serving服务器；",{"type":18,"tag":131,"props":302,"children":303},{},[304],{"type":24,"value":305},"Serving服务器执行编排脚本。",{"type":18,"tag":131,"props":307,"children":308},{},[309],{"type":24,"value":310},"在编排脚本中，输入文本语句转换为一组tokens，传递给子图0（输入长度为1024），初始化state数据，产生一个新的token。",{"type":18,"tag":131,"props":312,"children":313},{},[314],{"type":24,"value":315},"新增的token数据将传给子图1（输入长度为1），子图1每次接受上一次新增token，更新state数据，继续产生下一个token。",{"type":18,"tag":131,"props":317,"children":318},{},[319],{"type":24,"value":320},"持续步骤6，直到满足条件退出生成。",{"type":18,"tag":131,"props":322,"children":323},{},[324],{"type":24,"value":325},"将所有新增的tokens转换为文本语句返回给客户端。",{"type":18,"tag":26,"props":327,"children":328},{},[329],{"type":18,"tag":70,"props":330,"children":333},{"alt":331,"src":332},"7.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202107/28/110218k1emtelfpqz48fxy.jpg",[],{"type":18,"tag":26,"props":335,"children":336},{},[337],{"type":18,"tag":70,"props":338,"children":341},{"alt":339,"src":340},"8.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202107/28/110231rb5qqhxnzitg8zkc.jpg",[],{"type":18,"tag":26,"props":343,"children":344},{},[345],{"type":24,"value":346},"注：分布式推理目前还只支持昇腾，其他芯片的支持正在进行中。",{"title":7,"searchDepth":348,"depth":348,"links":349},4,[350
,352],{"id":58,"depth":351,"text":58},2,{"id":218,"depth":351,"text":218},"markdown","content:technology-blogs:zh:667.md","content","technology-blogs/zh/667.md","technology-blogs/zh/667","md",1776506139516]