[{"data":1,"prerenderedAt":403},["ShallowReactive",2],{"content-query-hh1jhgSQB0":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":397,"_id":398,"_source":399,"_file":400,"_stem":401,"_extension":402},"/technology-blogs/zh/565","zh",false,"","大V博文系列：MLSys 2021论文分析5—《Value Learning for Throughput Optimization of Deep Neutal Netwoorks》","在深度学习领域算子Tuning的重要性","2021-05-21","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/21/f2bb1230006e46a0a9fb9066f10fad56.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":394},"root",[17,25,28,34,47,58,69,74,79,84,89,94,99,104,109,117,122,129,134,139,144,149,154,159,164,169,174,181,186,191,196,210,215,222,227,232,237,242,249,254,259,266,271,277,282,289,294,305,310,315,320,327,332,339,344,349,354,359,364,369,374,379,384,389],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"大v博文系列mlsys-2021论文分析5value-learning-for-throughput-optimization-of-deep-neutal-netwoorks",[23],{"type":24,"value":8},"text",{"type":18,"tag":19,"props":26,"children":27},{"id":7},[],{"type":18,"tag":29,"props":30,"children":31},"p",{},[32],{"type":24,"value":33},"作者：金雪锋",{"type":18,"tag":29,"props":35,"children":36},{},[37,39],{"type":24,"value":38},"作者主页：",{"type":18,"tag":40,"props":41,"children":45},"a",{"href":42,"rel":43},"https://www.zhihu.com/people/jin-xue-feng",[44],"nofollow",[46],{"type":24,"value":42},{"type":18,"tag":29,"props":48,"children":49},{},[50,52],{"type":24,"value":51},"文章来源：",{"type":18,"tag":40,"props":53,"children":56},{"href":54,"rel":55},"https://zhuanlan.zhihu.com/p/374114567",[44],[57],{"type":24,"value":54},{"type":18,"tag":29,"props":59,"children":60},{},[61,63],{"type":24,"value":62},"大家好，今天给大家带来MLSys 2021的一篇论文，来自Facebook AI Research的《VALUE LEARNING FOR THROUGHPUT OPTIMIZATION OF DEEP NEURAL 
NETWORKS》。",{"type":18,"tag":64,"props":65,"children":66},"strong",{},[67],{"type":24,"value":68},"这是本系列分享关于Tuning的第三篇论文，足以见得在深度学习领域算子Tuning的重要性。",{"type":18,"tag":29,"props":70,"children":71},{},[72],{"type":24,"value":73},"如今机器学习的使用无处不在，神经网络的执行效率在很多场景下变得尤为重要。AI算子框架如Halide、TVM等利用算子调度（Schedule）将深度学习模型拆分成众多的算子。然而为算子寻找出优秀的调度，是一件困难的事情。自动调整（Auto-tuning）任务，希望在合法的算子空间中找出最佳的算子调度并完成实测。这篇论文通过强化学习（Value Iteration）+ CostModel（BiLSTM）来解决auto-tuning问题。",{"type":18,"tag":29,"props":75,"children":76},{},[77],{"type":24,"value":78},"论文的主要的贡献点在于：",{"type":18,"tag":29,"props":80,"children":81},{},[82],{"type":24,"value":83},"1.提出一种可以生成合法候选调度（schedule）的方式；",{"type":18,"tag":29,"props":85,"children":86},{},[87],{"type":24,"value":88},"2.提出一种高精度的预测调度运行时间的Cost Model；",{"type":18,"tag":29,"props":90,"children":91},{},[92],{"type":24,"value":93},"3.提出一种基于Cost Model的迭代式Value function，用于贪心地从空间中寻找最佳候选调度，比以往的方法速度提升2到3个数量级；",{"type":18,"tag":29,"props":95,"children":96},{},[97],{"type":24,"value":98},"4.在这一系列的技术帮助下，优化后的神经网络比Halide提升1.5x，比TVM提升2.6x。",{"type":18,"tag":19,"props":100,"children":102},{"id":101},"背景与基础",[103],{"type":24,"value":101},{"type":18,"tag":29,"props":105,"children":106},{},[107],{"type":24,"value":108},"论文以Halide为基础，支持schedule的split、reorder、vectorize、parallel、compute_at、store_at等操作。原始的DSL可以通过这一系列的操作转化成目标高性能schedule。与Halide相仿，论文同样将一个算子生成任务称作pipeline，而同一个pipeline的不同阶段称为stage。",{"type":18,"tag":29,"props":110,"children":111},{},[112],{"type":18,"tag":113,"props":114,"children":116},"img",{"alt":7,"src":115},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/21/ae6fcf4dad414324a9e065208172bdb0.jpg",[],{"type":18,"tag":29,"props":118,"children":119},{},[120],{"type":24,"value":121},"图1 
支持的基本原语",{"type":18,"tag":29,"props":123,"children":124},{},[125],{"type":18,"tag":113,"props":126,"children":128},{"alt":7,"src":127},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/21/95346406177141c2abaaeffbb71dfbb3.jpg",[],{"type":18,"tag":29,"props":130,"children":131},{},[132],{"type":24,"value":133},"图2 由原始DSL到简单的schedule，再变换成高性能的schedule",{"type":18,"tag":19,"props":135,"children":137},{"id":136},"空间生成",[138],{"type":24,"value":136},{"type":18,"tag":29,"props":140,"children":141},{},[142],{"type":24,"value":143},"论文将空间生成分成5个部分：",{"type":18,"tag":29,"props":145,"children":146},{},[147],{"type":24,"value":148},"1.在每一个stage中，将每一个变量v拆分成3重子循环（split）；",{"type":18,"tag":29,"props":150,"children":151},{},[152],{"type":24,"value":153},"2.在1 的基础上增加重排（reorder）；",{"type":18,"tag":29,"props":155,"children":156},{},[157],{"type":24,"value":158},"3.尝试对每个loop添加向量化选项（vectorize）；",{"type":18,"tag":29,"props":160,"children":161},{},[162],{"type":24,"value":163},"4.在满足生产者-消费者模型的条件下，增加临时buffer和inline（compute_at、store_at）；",{"type":18,"tag":29,"props":165,"children":166},{},[167],{"type":24,"value":168},"5.选择loop进行并行化（parallelize）。",{"type":18,"tag":29,"props":170,"children":171},{},[172],{"type":24,"value":173},"经过这5个阶段后，空间将会变得庞大起来。论文通过一定的方式去减少空间大小，如：向量化时只考虑因子是向量本身的整数倍的情况。在常见网络中，平均每个stage有大约30万个选择。",{"type":18,"tag":29,"props":175,"children":176},{},[177],{"type":18,"tag":113,"props":178,"children":180},{"alt":7,"src":179},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/21/94e1ae0831364eeaa4116903b15e04b2.jpg",[],{"type":18,"tag":29,"props":182,"children":183},{},[184],{"type":24,"value":185},"图3 常见网络的空间大小",{"type":18,"tag":19,"props":187,"children":189},{"id":188},"寻找最佳调度",[190],{"type":24,"value":188},{"type":18,"tag":29,"props":192,"children":193},{},[194],{"type":24,"value":195},"论文采取基于Value 
function搜索最佳调度，采用了马尔科夫决策过程，MDP。",{"type":18,"tag":29,"props":197,"children":198},{},[199,201,208],{"type":24,"value":200},"【注】关于马尔可夫决策过程、强化学习的相关学习可以参考",{"type":18,"tag":40,"props":202,"children":205},{"href":203,"rel":204},"https://link.zhihu.com/?target=https%3A//www.cnblogs.com/pinard/p/9426283.html",[44],[206],{"type":24,"value":207},"https://www.cnblogs.com/pinard/p/9426283.html",{"type":24,"value":209},"。",{"type":18,"tag":29,"props":211,"children":212},{},[213],{"type":24,"value":214},"通过一个Value Function V(s)来评估当前pipeline的“最低运行时间”。整体搜索流程如下：（1）输入一个pipeline（包含n个stages，按照拓扑序排列）、输入估价函数V(s)、设置初始化状态state0；（2）按照stage的顺序寻找当前stage最佳的调度决策，首先生成一系列的候选调度决策；（3）然后对于每一个调度决策s，计算其V(s)，保留最大值vi以及对应决策si；（4）返回n个stages的决策s1...sn。N-stage的pipeline，每个stage平均调度决策为M的情况下，需要计算的总次数是M*N，而不是M^N。这样可以使得我们能以很快的速度计算完全部的结果。",{"type":18,"tag":29,"props":216,"children":217},{},[218],{"type":18,"tag":113,"props":219,"children":221},{"alt":7,"src":220},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/21/974767f36715454386d53364d5cf6be9.jpg",[],{"type":18,"tag":29,"props":223,"children":224},{},[225],{"type":24,"value":226},"图4 基于Value function搜索最佳调度",{"type":18,"tag":29,"props":228,"children":229},{},[230],{"type":24,"value":231},"这种设计下，auto-tune的效果很大程度上取决于Cost Model的准确程度。论文提供了一种基于BiLSTM+特征工程的思路来解决估价问题。",{"type":18,"tag":29,"props":233,"children":234},{},[235],{"type":24,"value":236},"论文中包含了三类特征：（1）与调度无关的指令特征，例如浮点数加减法、index下标计算、内存访问特征等等；（2）调度相关特征，例如向量化的数量、循环轴相关特征、inline相关特征、CPU利用率、影响性能的辅助消耗特征等等；（3）派生类型特征，如算术强度等。",{"type":18,"tag":29,"props":238,"children":239},{},[240],{"type":24,"value":241},"论文构建了一个双向LSTM用作预测运行时间，结构如图所示。N个stages对应N个LSTM cell，最后汇总N个结果，得到整个pipeline的总结果。Cost 
Model的训练使用了13352个pipelines，每个pipeline平均包含1184个调度。Loss的设计包含了预测值P、平均实际运行值M、重要程度C、减重程度D。",{"type":18,"tag":29,"props":243,"children":244},{},[245],{"type":18,"tag":113,"props":246,"children":248},{"alt":7,"src":247},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/21/004356bea0984daba9537a83ebead6c6.jpg",[],{"type":18,"tag":29,"props":250,"children":251},{},[252],{"type":24,"value":253},"图5 Cost Model的Loss设计",{"type":18,"tag":29,"props":255,"children":256},{},[257],{"type":24,"value":258},"【注】文章中的Cost Model是指用于预测经过完整调度的pipeline时间模型（BiLSTM），Value Function是已经应用部分调度后的pipeline可能达到的最佳时间预估函数。",{"type":18,"tag":29,"props":260,"children":261},{},[262],{"type":18,"tag":113,"props":263,"children":265},{"alt":7,"src":264},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/21/f196beba3af448108cd2568e186e3571.jpg",[],{"type":18,"tag":29,"props":267,"children":268},{},[269],{"type":24,"value":270},"图6 BiLSTM结构图",{"type":18,"tag":19,"props":272,"children":274},{"id":273},"迭代式value-function训练",[275],{"type":24,"value":276},"迭代式Value Function训练",{"type":18,"tag":29,"props":278,"children":279},{},[280],{"type":24,"value":281},"为了训练出更加精确的模型V(s)，论文采用迭代式优化Value Function：（1）对于一系列的pipelines以及上一个Value Function Vi-1，通过Algorithm1获得对应的最佳调度序列s0…sn；（2）对于任意一个调度sj，通过BeamSearch的方式（即Beam形式的Algorithm1），获得后面的调度tj+1…tn，同时获得s0..sj,tj+1..tn调度序列的BenchMark结果r，更新Vi（sj）。",{"type":18,"tag":29,"props":283,"children":284},{},[285],{"type":18,"tag":113,"props":286,"children":288},{"alt":7,"src":287},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/21/5b1253a0222c4d78ac6938f3d4592cbc.jpg",[],{"type":18,"tag":29,"props":290,"children":291},{},[292],{"type":24,"value":293},"图7 迭代优化Value Function流程",{"type":18,"tag":29,"props":295,"children":296},{},[297,299],{"type":24,"value":298},"【注】关于Value 
Iteration过程，可以参考",{"type":18,"tag":40,"props":300,"children":303},{"href":301,"rel":302},"https://zhuanlan.zhihu.com/p/33229439",[44],[304],{"type":24,"value":301},{"type":18,"tag":19,"props":306,"children":308},{"id":307},"结果对比",[309],{"type":24,"value":307},{"type":18,"tag":29,"props":311,"children":312},{},[313],{"type":24,"value":314},"1.Cost Model。Cost Model的对比项主要是平均预测误差、最大预测误差、决定系数。由实验结果可以看出，论文的结果均远超于Halide、TVM。",{"type":18,"tag":29,"props":316,"children":317},{},[318],{"type":24,"value":319},"2.整体的结果对比：加速效果与搜索时间对比。",{"type":18,"tag":29,"props":321,"children":322},{},[323],{"type":18,"tag":113,"props":324,"children":326},{"alt":7,"src":325},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/21/3fd4f221c2ce43fe8e300a568714cfea.jpg",[],{"type":18,"tag":29,"props":328,"children":329},{},[330],{"type":24,"value":331},"图8 Cost Model实验对比",{"type":18,"tag":29,"props":333,"children":334},{},[335],{"type":18,"tag":113,"props":336,"children":338},{"alt":7,"src":337},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/21/e9809d2d190a41e2b8909e79786e3505.jpg",[],{"type":18,"tag":29,"props":340,"children":341},{},[342],{"type":24,"value":343},"图9 加速实验、搜索时间实验",{"type":18,"tag":19,"props":345,"children":347},{"id":346},"总结",[348],{"type":24,"value":346},{"type":18,"tag":29,"props":350,"children":351},{},[352],{"type":24,"value":353},"下面总结一下已经分析的三篇论文对tuning问题的思考与处理方法（TVM-Ansor[3]、Google-TPU[2]、MIT-TIRAMISU[4]、Facebook-本篇[1]）差异。在tuning的基础流程上，各个研究机构是相似的，都是遵循空间生成、Cost Model训练与预测、空间中搜索寻找最优解。而在关键问题上（Cost Model和搜索），各家又各有不同。",{"type":18,"tag":29,"props":355,"children":356},{},[357],{"type":24,"value":358},"首先在Cost Model上，从模型选择上来说，基于XLA的TPU-tune选择了GNN、TIRAMISU和Facebook使用LSTM、Ansor则采用一种特殊的packed-XGBoost。不同的机构的AI框架与设计都大有不同，各个论文给出的speedup很难从数值上直接断定“哪个模型更好”。具体模型落在具体场景的性能优劣，可能是一个工程问题。但是从设计思路上来说，它们不约而同的采取“子模块建模+求和”的方式，将一个子图转化成若干子模块来建模：TPU-tune的opcode、TIRAMISU基于loop的computation、Facebook的stages和Ansor的stages。这些子模块都可以被单独建模（如一个LSTM 
cell或者单个xgboost的预测值等），汇总之后就可以用来表达整个子图的最终结果。想要更精确的Cost model，则重点关注建模子模块（features）即可。从这点上来看，是跟子图整体式建模有区别的。",{"type":18,"tag":29,"props":360,"children":361},{},[362],{"type":24,"value":363},"在搜索算法上，TPU-tune的论文侧重点在于Cost Model，理论上来说任意搜索算法都适用；TIRAMISU采用的是树搜索Beam search或MCTS，Beam search属于一种贪心但是有一定的容忍度，而MCTS则为搜索加入了反馈，以希望修正误差；Facebook采用的是另一种强化学习的思路Value Iteration；而Ansor则采用的是Guided random search中的Evolutionary algorithm。从搜索算法而言，越来越多的机构采取各种强化学习（RL）的方式处理搜索，RL确实是一种更理想的方式，毕竟“调option、调切分”这件事某种程度上来说跟“agent在environment上探索然后根据reward去调整”实在太像。但是就目前而言，tuning框架上似乎还是没有像AlphaGo一样的模型能直接证明RL比传统的搜索方式有优势。传统的搜索方式没有模型引导[3]，但是更方便研究人员加入各式各样的策略去解决搜索方向问题；RL则通过模型去学习策略[1]，而能不能学到复杂有效的策略又成一个难题。",{"type":18,"tag":29,"props":365,"children":366},{},[367],{"type":24,"value":368},"总体而言，AI框架处于一个百花齐放的时代，而其下的算子（子图）框架、tuning框架也多种多样。从学术研究上来说，很多方面都可以有进一步的尝试：优化空间生成、更精确的建模、搜索的速度与方向引导等等；从工程上来说，让tuning优美的落地不同层级的AI场景，实现更快的搜索生成更高性能的算子，同样需要工程师们做出更多的努力。",{"type":18,"tag":29,"props":370,"children":371},{},[372],{"type":24,"value":373},"参考文献",{"type":18,"tag":29,"props":375,"children":376},{},[377],{"type":24,"value":378},"[1]Steiner, B., Cummins, C., He, H., and Leather, H., “Value Function Based Performance Optimization of Deep Learning Workloads”, arXiv e-prints, 2020.",{"type":18,"tag":29,"props":380,"children":381},{},[382],{"type":24,"value":383},"[2] Kaufman S J, Phothilimthana P M, Zhou Y, et al. A Learned Performance Model for Tensor Processing Units[J]. arXiv preprint arXiv:2008.01040, 2020.",{"type":18,"tag":29,"props":385,"children":386},{},[387],{"type":24,"value":388},"[3] Zheng L, Jia C, Sun M, et al. Ansor: Generating high-performance tensor programs for deep learning[C]//14th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 20). 2020: 863-879.",{"type":18,"tag":29,"props":390,"children":391},{},[392],{"type":24,"value":393},"[4] Merouani M, Leghettas M H, Pr R B D T A, et al. A Deep Learning Based Cost Model for Automatic Code Optimization in Tiramisu[D]. 
Master’s thesis, École nationale supérieure d’informatique, Algiers, Algeria, 2020.",{"title":7,"searchDepth":395,"depth":395,"links":396},4,[],"markdown","content:technology-blogs:zh:565.md","content","technology-blogs/zh/565.md","technology-blogs/zh/565","md",1776506137868]