[{"data":1,"prerenderedAt":510},["ShallowReactive",2],{"content-query-0ogveVr7PN":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":504,"_id":505,"_source":506,"_file":507,"_stem":508,"_extension":509},"/technology-blogs/zh/628","zh",false,"","大V博文系列：PLDI 2021论文AKG-NPU上算子自动生成技术探索","解决面向领域特定硬件，自动生成运算密集型算子的挑战","2021-06-26","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/06/26/084c6a0b061b45ecb16ad1e59a403a0f.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":493},"root",[17,25,28,34,47,58,87,92,102,115,124,129,136,144,149,160,168,173,178,185,193,198,203,208,233,240,245,250,257,262,267,272,277,284,289,294,299,304,309,314,321,326,333,341,346,351,356,361,366,374,386,391,398,403,410,415,422,427,434,439,447,452,460,465,475,480,488],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"大v博文系列pldi-2021论文akg-npu上算子自动生成技术探索",[23],{"type":24,"value":8},"text",{"type":18,"tag":19,"props":26,"children":27},{"id":7},[],{"type":18,"tag":29,"props":30,"children":31},"p",{},[32],{"type":24,"value":33},"作者：金雪锋",{"type":18,"tag":29,"props":35,"children":36},{},[37,39],{"type":24,"value":38},"作者主页：",{"type":18,"tag":40,"props":41,"children":45},"a",{"href":42,"rel":43},"https://www.zhihu.com/people/jin-xue-feng",[44],"nofollow",[46],{"type":24,"value":42},{"type":18,"tag":29,"props":48,"children":49},{},[50,52],{"type":24,"value":51},"文章来源：",{"type":18,"tag":40,"props":53,"children":56},{"href":54,"rel":55},"https://zhuanlan.zhihu.com/p/384191216",[44],[57],{"type":24,"value":54},{"type":18,"tag":29,"props":59,"children":60},{},[61,63,70,72,78,80],{"type":24,"value":62},"本次给大家分享的是MindSpore团队和 赵捷老师（",{"type":18,"tag":40,"props":64,"children":67},{"href":65,"rel":66},"https://www.zhihu.com/people/f3edc0b3c8221a4fa224169746ae9ed0",[44],[68],{"type":24,"value":69},"@要术甲杰",{"type":24,"value":71}," ）合作的中了PLDI 
2021的一篇论文《",{"type":18,"tag":73,"props":74,"children":75},"strong",{},[76],{"type":24,"value":77},"AKG: Automatic Kernel Generation for Neural Processing Units using Polyhedral Transformations",{"type":24,"value":79},"》:",{"type":18,"tag":40,"props":81,"children":84},{"href":82,"rel":83},"https://link.zhihu.com/?target=https%3A//dl.acm.org/doi/pdf/10.1145/3453483.3454106",[44],[85],{"type":24,"value":86},"https://dl.acm.org/doi/pdf/10.1145/3453483.3454106dl.acm.org",{"type":18,"tag":29,"props":88,"children":89},{},[90],{"type":24,"value":91},"这也是我们和赵捷老师合作发出的第二篇顶会论文，上一篇论文发布在micro：",{"type":18,"tag":29,"props":93,"children":94},{},[95],{"type":18,"tag":40,"props":96,"children":99},{"href":97,"rel":98},"https://zhuanlan.zhihu.com/p/333394142",[44],[100],{"type":24,"value":101},"量子位：53年来国内唯三，MindSpore加速昇腾芯片论文获国际顶会MICRO最佳论文提名zhuanlan.zhihu.com",{"type":18,"tag":29,"props":103,"children":104},{},[105],{"type":18,"tag":40,"props":106,"children":108},{"href":97,"rel":107},[44],[109],{"type":18,"tag":110,"props":111,"children":114},"img",{"alt":112,"src":113},"图标","https://pic2.zhimg.com/v2-f77a789e3ef3f39e5e4e0148c4d79c85_180x120.jpg",[],{"type":18,"tag":116,"props":117,"children":119},"h2",{"id":118},"背景与动机",[120],{"type":18,"tag":73,"props":121,"children":122},{},[123],{"type":24,"value":118},{"type":18,"tag":29,"props":125,"children":126},{},[127],{"type":24,"value":128},"深度学习技术和深度学习编译技术目前一直是业界研究的热点，AI编译技术也可以从两个层次来看，偏前端的是基于图的编译器技术，偏后端的是基于张量的编译器技术。现有的张量编译器已经证明其在通用硬件上(CPU，GPU)部署深度神经网络的有效性，但部署在基于AI应用的NPU上，仍然具有非常大的挑战。",{"type":18,"tag":29,"props":130,"children":131},{},[132],{"type":18,"tag":110,"props":133,"children":135},{"alt":7,"src":134},"https://pic3.zhimg.com/80/v2-258fc88b26e48779c576492deca5ec8e_720w.jpg",[],{"type":18,"tag":116,"props":137,"children":139},{"id":138},"挑战",[140],{"type":18,"tag":73,"props":141,"children":142},{},[143],{"type":24,"value":138},{"type":18,"tag":29,"props":145,"children":146},{},[147],{"type":24,"value":148},"开发一个面向NPU架构的编译器仍然面临着诸多挑战，因为N
PU上通常需要更复杂融合策略设计，以期充分利用NPU上的快速存储部件。编译器需要解决不同程序对并行性和局部性的需求不同的挑战，并找到最优的schedule结果。另一方面，领域特定芯片的内存层级都是多层次、多方向设计的，编译器需要解决如何在软件层面上实现自动内存管理的挑战。解决面向领域特定硬件，自动生成运算密集型算子的挑战，例如卷积，矩阵乘等。",{"type":18,"tag":29,"props":150,"children":151},{},[152,156],{"type":18,"tag":110,"props":153,"children":155},{"alt":7,"src":154},"https://pic2.zhimg.com/80/v2-b64c1d7a9427280bf4994eecb2d50071_720w.jpg",[],{"type":18,"tag":110,"props":157,"children":159},{"alt":7,"src":158},"https://pic4.zhimg.com/80/v2-3bc7e4a57085d1d198c99b86e3401b83_720w.jpg",[],{"type":18,"tag":116,"props":161,"children":163},{"id":162},"解决方案",[164],{"type":18,"tag":73,"props":165,"children":166},{},[167],{"type":24,"value":162},{"type":18,"tag":29,"props":169,"children":170},{},[171],{"type":24,"value":172},"为了更友好的和前端的图编译器配合，AKG支持Graph Kernel Fusion算子json和TVM的DSL作为输入，来表征张量计算。同时为了弥补当前解决方案，在面向NPU后端时，调度原语中对自动transformation和自动内存管理的表征不足，AKG引入一个PASS将HalideIR转换为Polyhedral IR，便于后续进行基于多面体的IR变换。将多面体模型和TVM相结合，不仅可以重用后者现有的一些功能(减少重复造轮子的工作)，还可以平衡两者的不足。",{"type":18,"tag":29,"props":174,"children":175},{},[176],{"type":24,"value":177},"AKG利用多面体模型的调度算法，更好的同时挖掘程序并行性和局部性。利用多面体模型，AKG对loop fusion和loop tiling进行了很好的建模，并很好地实现了数据排布和内存层级之间的解耦。同时AKG将外部调度树作为输入来实现img2col卷积的转换。AKG还实现了基于NPU指令集的完备向量化方案，多级流水线的自动同步插入，以及auto-tuning来优化生成代码的性能。",{"type":18,"tag":29,"props":179,"children":180},{},[181],{"type":18,"tag":110,"props":182,"children":184},{"alt":7,"src":183},"https://pic3.zhimg.com/80/v2-a487b4a9d73d7f86cb3aeed452769bfe_720w.jpg",[],{"type":18,"tag":29,"props":186,"children":187},{},[188],{"type":18,"tag":73,"props":189,"children":190},{},[191],{"type":24,"value":192},"多面体变换",{"type":18,"tag":29,"props":194,"children":195},{},[196],{"type":24,"value":197},"多面体模型是对程序的一个数学抽象，在该抽象上更准确的进行程序分析和编译优化。",{"type":18,"tag":29,"props":199,"children":200},{},[201],{"type":24,"value":202},"Abstraction 
Lowering",{"type":18,"tag":29,"props":204,"children":205},{},[206],{"type":24,"value":207},"AKG可以将用TVM的DSL编写的张量程序lowering到多面体模型表征的中间表达，调度树上。调度树的表达非常丰富，包括了各种不同类型的节点，其可以根据程序需要来表示程序中不同类型的信息：",{"type":18,"tag":209,"props":210,"children":211},"ul",{},[212,218,223,228],{"type":18,"tag":213,"props":214,"children":215},"li",{},[216],{"type":24,"value":217},"Domain节点，filter节点",{"type":18,"tag":213,"props":219,"children":220},{},[221],{"type":24,"value":222},"Band节点，sequence节点和set节点",{"type":18,"tag":213,"props":224,"children":225},{},[226],{"type":24,"value":227},"Extension节点",{"type":18,"tag":213,"props":229,"children":230},{},[231],{"type":24,"value":232},"Mark节点，等",{"type":18,"tag":29,"props":234,"children":235},{},[236],{"type":18,"tag":110,"props":237,"children":239},{"alt":7,"src":238},"https://pic2.zhimg.com/80/v2-b1582058653e95b1a5cfb73406645095_720w.jpg",[],{"type":18,"tag":29,"props":241,"children":242},{},[243],{"type":24,"value":244},"多面体调度",{"type":18,"tag":29,"props":246,"children":247},{},[248],{"type":24,"value":249},"AKG利用基于整数线性规划的ISL调度器来对输入程序进行新的调度变换。ISL调度器以Pluto算法为主，并辅以Feautrier算法，在程序的并行性和局部性之间寻求最优。下图是一个循环融合的示例，该示例是一个2D卷积算子，后跟了两个abs和Relu的张量计算。在经过融合后，可以看到语句1，语句2，语句3合并到同一个Band节点之下。",{"type":18,"tag":29,"props":251,"children":252},{},[253],{"type":18,"tag":110,"props":254,"children":256},{"alt":7,"src":255},"https://pic2.zhimg.com/80/v2-daf155034425f48856da9071e6a96d31_720w.jpg",[],{"type":18,"tag":29,"props":258,"children":259},{},[260],{"type":24,"value":261},"Loop Tiling",{"type":18,"tag":29,"props":263,"children":264},{},[265],{"type":24,"value":266},"Loop tiling在多面体的循环变换中是非常重要的一个环节。其对于程序的局部性和并行性，以及内存层级管理都非常重要。Loop tiling的实现可以归结为两个基本问题：一个是tile shape的构建，另一个是tile size的选择。",{"type":18,"tag":29,"props":268,"children":269},{},[270],{"type":24,"value":271},"Tile shape的构建方法非常灵活，可以基于正向的策略，也可以基于反向的策略(更多内容可以参考MICRO/ Optimizing the memory hierarchy by compositing automatic transformations on computations and data)，开发者还可以根据自己的需求进一步的丰富构建策略。但基本上在Loop 
Fusion之后，就可以实施了。",{"type":18,"tag":29,"props":273,"children":274},{},[275],{"type":24,"value":276},"Tile size的选择也是一个非常有意思的topic，目前的compiler有几种常用的方法：基于类型使用程序默认的tile size；或者将这一个过程委托给用户，让用户在开发算子schedule模板时来设置tile size的大小。为了更好的适配多面体模型，AKG为tile size专门提出了一个specification language。",{"type":18,"tag":29,"props":278,"children":279},{},[280],{"type":18,"tag":110,"props":281,"children":283},{"alt":7,"src":282},"https://pic3.zhimg.com/80/v2-2b86709f330a70c92e48e24baa35b722_720w.jpg",[],{"type":18,"tag":29,"props":285,"children":286},{},[287],{"type":24,"value":288},"Tile size的spec由下述描述组成：语句id，每个循环维度的tile size大小，指示字符串，表示该语句应该放置的位置。该spec可以简化AKG对于tile size的自动设置，便于AKG实现Tiling的自动化。",{"type":18,"tag":29,"props":290,"children":291},{},[292],{"type":24,"value":293},"Loop Fusion",{"type":18,"tag":29,"props":295,"children":296},{},[297],{"type":24,"value":298},"Loop fusion是一种最小化生产者消费者距离的循环变换，便于优化程序的局部性。Loop fusion已经集成到多面体编译中，以便于挖掘与其他循环变换的组合。现有的启发式融合算法，并没有考虑面向NPU的多向的内存层级结构。AKG完善了面向NPU多内存层级的融合策略。",{"type":18,"tag":29,"props":300,"children":301},{},[302],{"type":24,"value":303},"AKG中实现了多种融合策略，包括：offloading数据融合, forking数据融合，以及intra-tiling的rescheduling。Rescheduling的过程中，对于适用于Vector Unit进行向量化加速的计算进行loop distribution，对于适用于Cube Unit的计算进行更aggressive的融合策略。",{"type":18,"tag":29,"props":305,"children":306},{},[307],{"type":24,"value":308},"Convolution优化",{"type":18,"tag":29,"props":310,"children":311},{},[312],{"type":24,"value":313},"由于卷积运算是CV类网络中计算量占比最高的算子，高效的卷积实现更有助于DL模型运行加速。我们实现了基于多面体的img2col转换，并将卷积转换为矩阵乘操作，以加速卷积算子。",{"type":18,"tag":29,"props":315,"children":316},{},[317],{"type":18,"tag":110,"props":318,"children":320},{"alt":7,"src":319},"https://pic3.zhimg.com/80/v2-54dfe26599f7889888a22ade16611772_720w.jpg",[],{"type":18,"tag":29,"props":322,"children":323},{},[324],{"type":24,"value":325},"矩阵乘法可以充分挖掘NPU芯片中Cube计算单元的算力。AKG构建了基于外部多面体IR的分型结构的矩阵乘法，在实现方案中同时考虑了band 
node的切分，以及内层维度的分型对齐，如下图所示。",{"type":18,"tag":29,"props":327,"children":328},{},[329],{"type":18,"tag":110,"props":330,"children":332},{"alt":7,"src":331},"https://pic4.zhimg.com/80/v2-1541fbd17e061be98035550ab03e1f67_720w.jpg",[],{"type":18,"tag":29,"props":334,"children":335},{},[336],{"type":18,"tag":73,"props":337,"children":338},{},[339],{"type":24,"value":340},"Other Optimizations in AKG",{"type":18,"tag":29,"props":342,"children":343},{},[344],{"type":24,"value":345},"AKG的目标程序语言是类C风格的CCE代码，其充分考虑了昇腾910架构以及Cube算力，SIMD的向量化等。多面体优化是AKG关于自动调度非常重要的一个环节，但是并不是整个AKG的优化flow。在多面体编译优化之外，还有基于HalideIR的优化pass来自动生成高效的CCE代码，主要包括程序规范化相关的优化，以及硬件相关优化等。",{"type":18,"tag":29,"props":347,"children":348},{},[349],{"type":24,"value":350},"程序规范化优化，包括运算符inline、公共子表达式删除、程序三地址化等。",{"type":18,"tag":29,"props":352,"children":353},{},[354],{"type":24,"value":355},"由于在DL网络中还有除去计算密集的卷积运算之外的其他计算，所以有效的利用SIMD对于NPU的高效使能也至关重要。向量化pass会将多面体技术优化后的Halide IR作为输入，并从中分析出有效的信息，包括alignment, strides, source和destination等，可以自动生成高效的SIMD代码，并能够自动考虑代码中未对齐的场景，进行自动对齐。",{"type":18,"tag":29,"props":357,"children":358},{},[359],{"type":24,"value":360},"另外，现代NPU架构都会考虑decoupled access-execute (DAE)的架构，其中每个计算单元和数据搬移单元都有一个独立的指令pipeline，这些pipeline之间是相互独立的，可以同步执行。这一相关优化，也超出了多面体的编译优化，AKG采用DP的方法，实现了面向NPU的优化pass来实现同步优化。",{"type":18,"tag":29,"props":362,"children":363},{},[364],{"type":24,"value":365},"最后，多面体变换通过对硬件架构抽象来优化程序调度，得到的程序性能并非实际中的最佳。AKG基于auto-tuner工具来解决这个问题，通过统计不同tiling策略下算子的性能。通常张量计算的tuning space是非常巨大的，因此我们使用一种基于机器学习的样本抽样方法，对tuning 
space进行裁剪，在较短的时间内，获取最优性能。",{"type":18,"tag":116,"props":367,"children":369},{"id":368},"实验",[370],{"type":18,"tag":73,"props":371,"children":372},{},[373],{"type":24,"value":368},{"type":18,"tag":29,"props":375,"children":376},{},[377,379],{"type":24,"value":378},"AKG是全栈全场景AI计算框架MindSpore的算子自动生成工具，可以自动生成华为昇腾910芯片上的执行代码，用于深度神经网络训练和推理任务。更多的算法实现可以参考：",{"type":18,"tag":40,"props":380,"children":383},{"href":381,"rel":382},"https://link.zhihu.com/?target=https%3A//gitee.com/mindspore/akg",[44],[384],{"type":24,"value":385},"MindSpore/akggitee.com",{"type":18,"tag":29,"props":387,"children":388},{},[389],{"type":24,"value":390},"实验从单算子、子图以及整网的角度来对比不同工具与AKG的性能 。",{"type":18,"tag":29,"props":392,"children":393},{},[394],{"type":18,"tag":110,"props":395,"children":397},{"alt":7,"src":396},"https://pic4.zhimg.com/80/v2-fad71b82a766b262f07447ebd593358b_720w.jpg",[],{"type":18,"tag":29,"props":399,"children":400},{},[401],{"type":24,"value":402},"在单算子的对比实验中，我们可以看到手工优化后的CCE代码比优化前能提升2.8倍。AKG自动生成的代码性能，能够和优化后的CCE代码持平；并且平均超越TVM有1.6倍。",{"type":18,"tag":29,"props":404,"children":405},{},[406],{"type":18,"tag":110,"props":407,"children":409},{"alt":7,"src":408},"https://pic3.zhimg.com/80/v2-747545b07996edde0006d4d8f3e6fffe_720w.jpg",[],{"type":18,"tag":29,"props":411,"children":412},{},[413],{"type":24,"value":414},"本实验，选取了41个shape大小从(64,64)到(4608,4608)不同大小的矩阵乘法来评估性能的稳定性。在41个不同的case中，AKG有29个case的性能优于TVM的结果。",{"type":18,"tag":29,"props":416,"children":417},{},[418],{"type":18,"tag":110,"props":419,"children":421},{"alt":7,"src":420},"https://pic2.zhimg.com/80/v2-8b0f56bcc7b5cef48f34a754f2d9cf99_720w.jpg",[],{"type":18,"tag":29,"props":423,"children":424},{},[425],{"type":24,"value":426},"在子图融合的对比实验中，AKG平均超越adapted TVM1.3倍，平均超越CCE 
opt有5.6倍。",{"type":18,"tag":29,"props":428,"children":429},{},[430],{"type":18,"tag":110,"props":431,"children":433},{"alt":7,"src":432},"https://pic2.zhimg.com/80/v2-3b85aa447b410121c02cabd8edbc58b9_720w.jpg",[],{"type":18,"tag":29,"props":435,"children":436},{},[437],{"type":24,"value":438},"在整网的对比实验中，AKG在ResNet-50, MobileNet，AlexNet和TVM基本持平。但是在Bert和SSD上，超出TVM平均20.2%。此外，虽然在ResNet-50上CCE opt的整网性能最佳，但是手动优化需要大量的人力，而采用自动化工具，可以大大节省人力成本。",{"type":18,"tag":116,"props":440,"children":442},{"id":441},"总结",[443],{"type":18,"tag":73,"props":444,"children":445},{},[446],{"type":24,"value":441},{"type":18,"tag":29,"props":448,"children":449},{},[450],{"type":24,"value":451},"首先，AKG采用反向融合策略合理地平衡了tiling和fusion的相互影响，并且实现了平台中立的变换实现。此外，AKG采用hierarchical fusion的方式，可以很好地应用在多内存层级的NPU架构上。还有就是AKG中自动执行卷积算子的领域特定的transformation以及fractal tiling都是较为通用的实现方案。最后，AKG扩展了schedule tree的表达，这一做法其实和MLIR有着异曲同工的效果，都希望在多面体表征上加入领域特定的知识信息。",{"type":18,"tag":116,"props":453,"children":455},{"id":454},"扩展工作",[456],{"type":18,"tag":73,"props":457,"children":458},{},[459],{"type":24,"value":454},{"type":18,"tag":29,"props":461,"children":462},{},[463],{"type":24,"value":464},"目前，AKG还增强了面向GPU硬件的自动生成高性能的cuda算子的能力。与图算融合技术相结合，在MindSpore全场景AI框架上，对于许多业界benchmark网络，性能已经超越了主流AI框架(TensorFlow/MXNet等)，取得了非常不错的成绩。更多关于实验的评测报告，大家可以在下述链接中翻阅：",{"type":18,"tag":29,"props":466,"children":467},{},[468],{"type":18,"tag":40,"props":469,"children":472},{"href":470,"rel":471},"https://zhuanlan.zhihu.com/p/350343159",[44],[473],{"type":24,"value":474},"Orange Lee：国产深度学习框架MindSpore训练性能评测 —— by 
中科大ADSL实验室（zhuanlan.zhihu.com）",{"type":18,"tag":29,"props":476,"children":477},{},[478],{"type":24,"value":479},"基于AKG算子自动生成技术和图算融合技术，目前还在紧锣密鼓地进行更多NN网络的泛化验证工作，大家很快也可看到详细的工作进展。",{"type":18,"tag":116,"props":481,"children":483},{"id":482},"展望",[484],{"type":18,"tag":73,"props":485,"children":486},{},[487],{"type":24,"value":482},{"type":18,"tag":29,"props":489,"children":490},{},[491],{"type":24,"value":492},"在面向NPU后端，引入多面体模型，实现调度自动化，AKG工具做了非常多新颖并且有意思的工作。虽然AKG的论文解读到此就结束了，但是多样化算力异构计算的挑战和软件2.0下AI应用的浪潮远未结束。相信在不久的将来，随着DSA硬件架构的不断迭代，以及AI编译技术的持续创新，终究会迎来软硬紧密协同的黄金年代。",{"title":7,"searchDepth":494,"depth":494,"links":495},4,[496,498,499,500,501,502,503],{"id":118,"depth":497,"text":118},2,{"id":138,"depth":497,"text":138},{"id":162,"depth":497,"text":162},{"id":368,"depth":497,"text":368},{"id":441,"depth":497,"text":441},{"id":454,"depth":497,"text":454},{"id":482,"depth":497,"text":482},"markdown","content:technology-blogs:zh:628.md","content","technology-blogs/zh/628.md","technology-blogs/zh/628","md",1776506138914]