[{"data":1,"prerenderedAt":1374},["ShallowReactive",2],{"content-query-e1ymBIEbnT":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":1368,"_id":1369,"_source":1370,"_file":1371,"_stem":1372,"_extension":1373},"/technology-blogs/zh/2203","zh",false,"","技术干货 | 一文了解AI编译器的前世今生（赠书）","作者：金雪锋 ｜来源：知乎  随着人工智能时代的来临，AI领域应用的大量出现也促进着领域编译的发展，最突出的表现就是多种AI编译器的普及和应用。本期将结合AI编译器的发展概述及昇思MindSpore的实践梳理出相对完整的AI编译器全视图，希望对大家有所启发。","2023-03-20","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/03/23/67bf580edd32414b84380ee776a18196.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":1353},"root",[17,25,42,47,52,57,62,67,72,80,90,96,104,112,120,125,130,138,143,148,153,158,166,171,176,181,189,194,203,208,215,223,228,233,238,243,251,256,261,266,270,275,280,285,293,298,303,308,313,321,329,334,339,344,349,357,362,367,372,377,384,389,394,399,404,409,417,422,427,432,437,442,451,456,463,468,473,478,483,491,496,501,509,514,519,524,529,534,541,546,551,556,561,570,575,583,588,593,598,603,611,616,621,626,634,643,650,658,663,668,673,678,683,688,697,702,707,714,719,726,735,740,745,750,764,769,774,779,787,792,799,807,812,819,827,832,844,853,858,866,871,876,881,889,894,899,907,912,917,922,930,935,940,945,950,955,967,976,981,989,994,999,1007,1012,1017,1025,1032,1037,1042,1047,1052,1059,1067,1072,1079,1084,1091,1096,1101,1109,1116,1121,1126,1134,1139,1144,1149,1157,1162,1167,1176,1181,1188,1193,1200,1208,1216,1221,1226,1231,1236,1241,1249,1254,1262,1267,1272,1277,1285,1295,1305,1313,1318,1326,1331,1341,1346],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"技术干货-一文了解ai编译器的前世今生赠书",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29,35,37],{"type":18,"tag":30,"props":31,"children":32},"strong",{},[33],{"type":24,"value":34},"作者：金雪锋",{"type":24,"value":36}," 
｜",{"type":18,"tag":30,"props":38,"children":39},{},[40],{"type":24,"value":41},"来源：知乎",{"type":18,"tag":26,"props":43,"children":44},{},[45],{"type":24,"value":46},"随着人工智能时代的来临，AI领域应用的大量出现也促进着领域编译的发展，最突出的表现就是多种AI编译器的普及和应用。本期将结合AI编译器的发展概述及昇思MindSpore的实践梳理出相对完整的AI编译器全视图，希望对大家有所启发。",{"type":18,"tag":26,"props":48,"children":49},{},[50],{"type":24,"value":51},"文章主要分为四个部分：",{"type":18,"tag":26,"props":53,"children":54},{},[55],{"type":24,"value":56},"1、AI编译器的概览",{"type":18,"tag":26,"props":58,"children":59},{},[60],{"type":24,"value":61},"2、AI编译器的挑战",{"type":18,"tag":26,"props":63,"children":64},{},[65],{"type":24,"value":66},"3、昇思MindSpore的实践",{"type":18,"tag":26,"props":68,"children":69},{},[70],{"type":24,"value":71},"4、未来的展望及赠书活动",{"type":18,"tag":26,"props":73,"children":74},{},[75],{"type":18,"tag":30,"props":76,"children":77},{},[78],{"type":24,"value":79},"AI编译器的概览",{"type":18,"tag":81,"props":82,"children":84},"h3",{"id":83},"ai编译器的定义",[85],{"type":18,"tag":30,"props":86,"children":87},{},[88],{"type":24,"value":89},"AI编译器的定义",{"type":18,"tag":81,"props":91,"children":93},{"id":92},"首先介绍一下个人对ai编译器的一个理解",[94],{"type":24,"value":95},"首先介绍一下，个人对AI编译器的一个理解：",{"type":18,"tag":26,"props":97,"children":98},{},[99],{"type":18,"tag":100,"props":101,"children":103},"img",{"alt":7,"src":102},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/df0b8d5d6283481eb040ed50b2aa706b.png",[],{"type":18,"tag":26,"props":105,"children":106},{},[107],{"type":18,"tag":30,"props":108,"children":109},{},[110],{"type":24,"value":111},"与传统编译器相比，AI编译器是一个领域特定的编译器，有四个明显的特征：",{"type":18,"tag":26,"props":113,"children":114},{},[115],{"type":18,"tag":30,"props":116,"children":117},{},[118],{"type":24,"value":119},"1）Python为主的动态解释器语言前端",{"type":18,"tag":26,"props":121,"children":122},{},[123],{"type":24,"value":124},"与传统编译器不同，AI编译器通常不需要Lexer/Parser，而是基于前端语言（主要是Python）的AST将模型解析并构造为计算图IR，侧重于保留shape、layout等Tensor计算特征信息，当然部分编译器还能保留控制流的信息。",{"type":18,"tag":26,"props":126,"ch
ildren":127},{},[128],{"type":24,"value":129},"这里的难点在于，Python是一种灵活度极高的解释执行的语言，AI编译器需要把它转到静态的IR上。",{"type":18,"tag":26,"props":131,"children":132},{},[133],{"type":18,"tag":30,"props":134,"children":135},{},[136],{"type":24,"value":137},"2）多层IR设计：",{"type":18,"tag":26,"props":139,"children":140},{},[141],{"type":24,"value":142},"为什么需要多层IR设计，主要是为了同时满足易用性与高性能这两类需求。为了让开发者使用方便，框架前端(图层)会尽量对Tensor计算进行抽象封装，开发者只要关注逻辑意义上的模型和算子；而在后端算子性能优化时，又可以打破算子的边界，从更细粒度的循环调度等维度，结合不同的硬件特点完成优化。因此，多层IR设计无疑是较好的选择。",{"type":18,"tag":26,"props":144,"children":145},{},[146],{"type":24,"value":147},"1、图编译器：如昇思MindSpore的MindCompiler（MindIR）、TF的XLA（HLO），TVM的Relay等，重点关注非循环相关的优化。除了传统编译器中常见的常量折叠、代数化简、公共子表达式等优化外，还会完成Layout转换，算子融合等优化，通过分析和优化现有网络计算图逻辑，对原有计算逻辑进行拆分、重组、融合等操作，以减少算子执行间隙的开销并且提升设备计算资源利用率，从而实现网络整体执行时间的优化。",{"type":18,"tag":26,"props":149,"children":150},{},[151],{"type":24,"value":152},"2、算子编译器：如MindSpore AKG、CANN TBE、TVM(HalideIR)等。针对Low-level IR主要有循环变换、循环切分等调度相关的优化，与硬件intrinsic映射、内存分配等后端pass优化。其中，当前的自动调度优化主要包含了基于搜索的自动调度优化（如ansor[8]）和基于polyhedral编译技术的自动调度优化（如TC和MindAKG[9]）",{"type":18,"tag":26,"props":154,"children":155},{},[156],{"type":24,"value":157},"3、Codegen：当前基本上收敛在LLVM上。最后还有一个是MLIR，它实际上是一种编译的基础设施。",{"type":18,"tag":26,"props":159,"children":160},{},[161],{"type":18,"tag":30,"props":162,"children":163},{},[164],{"type":24,"value":165},"3）面向神经网络的特定优化",{"type":18,"tag":26,"props":167,"children":168},{},[169],{"type":24,"value":170},"1、数据类型-Tensor：AI领域，计算被抽象成张量的计算，这就意味着AI编译器中主要处理的数据类型也是张量，这个是非常重要的前提。",{"type":18,"tag":26,"props":172,"children":173},{},[174],{"type":24,"value":175},"2、自动微分：BP是深度学习/神经网络最有代表的部分，目前相对已经比较成熟，基于计算图的自动微分、基于Tape和运算符重载的自动微分方案、基于source2source的自动微分都是现在主流的方案。",{"type":18,"tag":26,"props":177,"children":178},{},[179],{"type":24,"value":180},"3、自动并行：随着深度学习的模型规模越来越大，模型的并行优化也成为编译优化的一部分，包括：数据并行、算子级模型并行、Pipeline模型并行、优化器模型并行和重计算等。",{"type":18,"tag":26,"props":182,"children":183},{},[184],{"type":18,"tag":30,"props":185,"children":186},{},[187],{"type":24,"value":188},"4）DSA芯片架构的
支持",{"type":18,"tag":26,"props":190,"children":191},{},[192],{"type":24,"value":193},"SIMT、SIMD、Dataflow：AI的训练和推理对性能和时延都非常敏感，所以大量使用加速器进行计算，所以AI编译器其实是以加速器为中心的编译器，这个也是区别于通用编译器的一个特征。",{"type":18,"tag":81,"props":195,"children":197},{"id":196},"ai编译器的发展历程",[198],{"type":18,"tag":30,"props":199,"children":200},{},[201],{"type":24,"value":202},"AI编译器的发展历程",{"type":18,"tag":26,"props":204,"children":205},{},[206],{"type":24,"value":207},"我个人把它分为三个阶段：",{"type":18,"tag":26,"props":209,"children":210},{},[211],{"type":18,"tag":100,"props":212,"children":214},{"alt":7,"src":213},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/ea90db24e4dd45a9a2627d82d9e4fd45.png",[],{"type":18,"tag":26,"props":216,"children":217},{},[218],{"type":18,"tag":30,"props":219,"children":220},{},[221],{"type":24,"value":222},"第1阶段-朴素的AI编译器：",{"type":18,"tag":26,"props":224,"children":225},{},[226],{"type":24,"value":227},"AI框架的早期主要是两个抽象，一个是基于张量的计算图，分为图和算子；另外一个是动态图和静态图。动态图基本上和AI编译器没有太多关联，静态图采用了部分编译器的思想，比如图优化的时候会使用一些硬件无关的优化（表达式化简/常量折叠等）、硬件相关的优化（手工的算子融合）。",{"type":18,"tag":26,"props":229,"children":230},{},[231],{"type":24,"value":232},"问题：",{"type":18,"tag":26,"props":234,"children":235},{},[236],{"type":24,"value":237},"1、表达上：静态图的表达式非Python原生的，开发者主要通过框架提供的API进行显式构图，易用性上不好；",{"type":18,"tag":26,"props":239,"children":240},{},[241],{"type":24,"value":242},"2、性能上：开发者定义的算子粒度未必最能发挥硬件的性能；硬件厂商提供的缺省的算子库也未必是最优的，在模型和shape确定的情况下，可能还有更优的算子实现；DSA芯片出现加剧了性能上的挑战。",{"type":18,"tag":26,"props":244,"children":245},{},[246],{"type":18,"tag":30,"props":247,"children":248},{},[249],{"type":24,"value":250},"第2阶段-专用的AI编译器：",{"type":18,"tag":26,"props":252,"children":253},{},[254],{"type":24,"value":255},"主要的两个特征：",{"type":18,"tag":26,"props":257,"children":258},{},[259],{"type":24,"value":260},"1、表达上，动态图和静态图趋于一致，意味着AI编译的入口更加与Python原生的表达接近；",{"type":18,"tag":26,"props":262,"children":263},{},[264],{"type":24,"value":265},"2、出现相对独立的AI编译器，聚焦在打开图和算子边界进行融合优化，发挥芯片的算力。",{"type":18,"tag":26,"props":267,"
children":268},{},[269],{"type":24,"value":232},{"type":18,"tag":26,"props":271,"children":272},{},[273],{"type":24,"value":274},"1、表达上，图层和算子层的表达还是分开的，算法工程师主要关注图层的表达，算子的表达和实现主要是框架开发者和芯片开发者来提供。",{"type":18,"tag":26,"props":276,"children":277},{},[278],{"type":24,"value":279},"2、功能泛化的问题：动静转换的成功率、动态shape、稀疏、分布式并行优化等更多的需求无法满足",{"type":18,"tag":26,"props":281,"children":282},{},[283],{"type":24,"value":284},"3、效率和性能的平衡：算子实现上在schedule、tiling、codegen上缺乏自动化手段，门槛高。",{"type":18,"tag":26,"props":286,"children":287},{},[288],{"type":18,"tag":30,"props":289,"children":290},{},[291],{"type":24,"value":292},"第3阶段-通用的AI编译器：",{"type":18,"tag":26,"props":294,"children":295},{},[296],{"type":24,"value":297},"主要的特征：",{"type":18,"tag":26,"props":299,"children":300},{},[301],{"type":24,"value":302},"1、图算统一表达；",{"type":18,"tag":26,"props":304,"children":305},{},[306],{"type":24,"value":307},"2、更泛化的优化能力：动静统一、动态shape、稀疏、复数、自动并行等；",{"type":18,"tag":26,"props":309,"children":310},{},[311],{"type":24,"value":312},"3、图算融合优化、算子自动生成。",{"type":18,"tag":26,"props":314,"children":315},{},[316],{"type":18,"tag":30,"props":317,"children":318},{},[319],{"type":24,"value":320},"总的来说，个人感觉当前的阶段还是处于2.0~2.3阶段，大家想尽快构建通用AI编译器的能力，但是还有许多关键的问题还没有完全解决。",{"type":18,"tag":26,"props":322,"children":323},{},[324],{"type":18,"tag":30,"props":325,"children":326},{},[327],{"type":24,"value":328},"AI编译器发展的驱动力和挑战",{"type":18,"tag":26,"props":330,"children":331},{},[332],{"type":24,"value":333},"个人认为，AI编译器发展的驱动力和挑战主要还是三个：",{"type":18,"tag":26,"props":335,"children":336},{},[337],{"type":24,"value":338},"1、Python的静态化",{"type":18,"tag":26,"props":340,"children":341},{},[342],{"type":24,"value":343},"2、怎么发挥硬件的性能，特别是DSA类的芯片",{"type":18,"tag":26,"props":345,"children":346},{},[347],{"type":24,"value":348},"3、如何处理NN的特定优化：自动微分、自动并行等等",{"type":18,"tag":26,"props":350,"children":351},{},[352],{"type":18,"tag":30,"props":353,"children":354},{},[355],{"type":24,"value":356},"挑战1：Python的静态化",{"type":18,"tag":26,"props":358,"children":
359},{},[360],{"type":24,"value":361},"Python静态化是指通过JIT等技术，让Python程序进行静态的编译优化，提升性能、方便部署，Python静态化是AI编译器开始工作的一个起点。",{"type":18,"tag":26,"props":363,"children":364},{},[365],{"type":24,"value":366},"业界Python静态化的两种方式：",{"type":18,"tag":26,"props":368,"children":369},{},[370],{"type":24,"value":371},"1、通用Python JIT虚拟机：主要是期望在Python解释执行的基础上增加JIT编译加速的能力，典型的如PyPy；不过由于前期CPython暴露了太多内部接口，导致Python JIT虚拟机兼容的困难。",{"type":18,"tag":26,"props":373,"children":374},{},[375],{"type":24,"value":376},"2、修饰符方式的Python编译加速方案：典型的如Numba，Python JIT虚拟机的一种妥协实现方式，通过修饰符，进行部分Python语句加速。",{"type":18,"tag":26,"props":378,"children":379},{},[380],{"type":18,"tag":100,"props":381,"children":383},{"alt":7,"src":382},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/277afdda81934590a977a11fa2c8e731.png",[],{"type":18,"tag":26,"props":385,"children":386},{},[387],{"type":24,"value":388},"AI框架静态化的方案普遍采用修饰符这套方案，这套方案细分下来也有三种不同的方法：",{"type":18,"tag":26,"props":390,"children":391},{},[392],{"type":24,"value":393},"1、Tracing Based",{"type":18,"tag":26,"props":395,"children":396},{},[397],{"type":24,"value":398},"2、ByteCode Based",{"type":18,"tag":26,"props":400,"children":401},{},[402],{"type":24,"value":403},"3、AST Based",{"type":18,"tag":26,"props":405,"children":406},{},[407],{"type":24,"value":408},"其中tracing和bytecode的方式接近于JIT的方式，而AST 
Based方式接近于AOT的方式。",{"type":18,"tag":26,"props":410,"children":411},{},[412],{"type":18,"tag":30,"props":413,"children":414},{},[415],{"type":24,"value":416},"AI编译器在Python静态化方面的挑战：",{"type":18,"tag":26,"props":418,"children":419},{},[420],{"type":24,"value":421},"1、类型推导：从Python动态类型到编译器IR的静态类型",{"type":18,"tag":26,"props":423,"children":424},{},[425],{"type":24,"value":426},"2、灵活的语法和数据类型转换：slice、dict等",{"type":18,"tag":26,"props":428,"children":429},{},[430],{"type":24,"value":431},"3、控制流的处理",{"type":18,"tag":26,"props":433,"children":434},{},[435],{"type":24,"value":436},"4、JIT的编译性能",{"type":18,"tag":26,"props":438,"children":439},{},[440],{"type":24,"value":441},"5、.....",{"type":18,"tag":81,"props":443,"children":445},{"id":444},"挑战2ai编译器如何使能多样性算力特别是如何充分发挥dsa芯片的算力",[446],{"type":18,"tag":30,"props":447,"children":448},{},[449],{"type":24,"value":450},"挑战2：AI编译器如何使能多样性算力，特别是如何充分发挥DSA芯片的算力",{"type":18,"tag":26,"props":452,"children":453},{},[454],{"type":24,"value":455},"前面提到AI的训练和推理都是对性能非常敏感的，所以在AI的场景中大量用到加速器，包括CPU的SIMD单元、GPU的SIMT架构、NPU这样的专用架构等；AI编译器逐步成为发挥这些多样性算力的关键，特别是近期Dataflow+SIMD这样的DSA芯片占比逐步提升的情况下：",{"type":18,"tag":26,"props":457,"children":458},{},[459],{"type":18,"tag":100,"props":460,"children":462},{"alt":7,"src":461},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/b155c1eb9ba54eb6bc739859b8362f36.png",[],{"type":18,"tag":26,"props":464,"children":465},{},[466],{"type":24,"value":467},"统计数据表明，最近出现的AI芯片中，Dataflow架构占比大于50%，这类架构的特点：",{"type":18,"tag":26,"props":469,"children":470},{},[471],{"type":24,"value":472},"1、数据流图的执行调度更加能发挥芯片的性能，即芯片进行整图或者子图的调度，而不是像GPU那样，主流是 kernel by kernel的调度",{"type":18,"tag":26,"props":474,"children":475},{},[476],{"type":24,"value":477},"2、较强大的Cube处理单元（类SIMD），较为复杂的内存管理机制",{"type":18,"tag":26,"props":479,"children":480},{},[481],{"type":24,"value":482},"同时我们也看到最新NV 
GPU的H100架构的DSA特征也逐步明显。",{"type":18,"tag":26,"props":484,"children":485},{},[486],{"type":18,"tag":30,"props":487,"children":488},{},[489],{"type":24,"value":490},"AI编译器在性能优化的难度和复杂度挑战变大：",{"type":18,"tag":26,"props":492,"children":493},{},[494],{"type":24,"value":495},"1、性能优化更加依赖图算融合优化，图层和算子层独立优化无法充分发挥芯片性能，需要图算融合优化；子图切分、子图内垂直融合优化和水平并行优化；",{"type":18,"tag":26,"props":497,"children":498},{},[499],{"type":24,"value":500},"2、优化的复杂度提升，标量+向量+张量+加速指令、多级的存储结构，导致Schedule、Tilling、Vectorization/Tensorization复杂。",{"type":18,"tag":26,"props":502,"children":503},{},[504],{"type":18,"tag":30,"props":505,"children":506},{},[507],{"type":24,"value":508},"当前的AI编译器在这一块还没有一个完善的方案：",{"type":18,"tag":26,"props":510,"children":511},{},[512],{"type":24,"value":513},"首先我们看一下AI编译器的普遍需求：",{"type":18,"tag":26,"props":515,"children":516},{},[517],{"type":24,"value":518},"1、打开图和算子的边界，进行重新组合优化",{"type":18,"tag":26,"props":520,"children":521},{},[522],{"type":24,"value":523},"2、多种优化手段：垂直融合优化（buffer fusion等）和水平并行（msa rammer等）优化",{"type":18,"tag":26,"props":525,"children":526},{},[527],{"type":24,"value":528},"3、重新组合优化后的子图的代码自动生成（scheduling、tilling、vectorizing）",{"type":18,"tag":26,"props":530,"children":531},{},[532],{"type":24,"value":533},"其次，我们看一下业界已有的方案：",{"type":18,"tag":26,"props":535,"children":536},{},[537],{"type":18,"tag":100,"props":538,"children":540},{"alt":7,"src":539},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/be9e32b40bf042ebbe2c49cc457b48a0.png",[],{"type":18,"tag":26,"props":542,"children":543},{},[544],{"type":24,"value":545},"1、XLA：基本上的思路是把图层下发的子图中的算子全部打开成小算子，然后基于这张小算子组成的子图进行编译优化，整体设计主要通过HLO/LLO/LLVM层层lowering实现，算子打开/子图融合优化/融合算子生成的规则都是手工提前指定。",{"type":18,"tag":26,"props":547,"children":548},{},[549],{"type":24,"value":550},"2、TVM：分为Relay和TVM两层，Relay关注图层，TVM关注算子层，总体思路与XLA是类似的，Relay做子图的优化、TVM实现融合算子的生成，区别在于TVM是开放的架构，提供了compute和schedule分离的方案，方便定制算子生成的优化。",{"type":18,"tag":26,"props":552,"children":553},{},[554],{"type":24,"value":555},"另外，不得不提MLIR，不过
它的定位还是聚焦提供MetaIR，作为构建AI编译器的基础，如果从功能完善性的角度看，目前看还有比较大的差距。",{"type":18,"tag":26,"props":557,"children":558},{},[559],{"type":24,"value":560},"最后总结是：AI编译器目前还没有一个完善的解决方案，仍在持续演进",{"type":18,"tag":81,"props":562,"children":564},{"id":563},"挑战3面向nn领域的特定优化自动并行自动微分",[565],{"type":18,"tag":30,"props":566,"children":567},{},[568],{"type":24,"value":569},"挑战3：面向NN领域的特定优化—自动并行+自动微分",{"type":18,"tag":26,"props":571,"children":572},{},[573],{"type":24,"value":574},"个人认为自动并行和自动微分这两个最为关键",{"type":18,"tag":26,"props":576,"children":577},{},[578],{"type":18,"tag":30,"props":579,"children":580},{},[581],{"type":24,"value":582},"自动并行依然是大模型训练的一个难题：",{"type":18,"tag":26,"props":584,"children":585},{},[586],{"type":24,"value":587},"当前大模型训练碰到的内存墙、性能墙依赖复杂的切分策略来解决，包括：",{"type":18,"tag":26,"props":589,"children":590},{},[591],{"type":24,"value":592},"1、Scale out：多维混合并行能力，含：数据并行、算子级模型并行、流水线并行、优化器并行等",{"type":18,"tag":26,"props":594,"children":595},{},[596],{"type":24,"value":597},"2、Scale up：重计算、Host/Device并行等",{"type":18,"tag":26,"props":599,"children":600},{},[601],{"type":24,"value":602},"这种方式最大的挑战就是效率墙：如果依赖手工去配置切分策略，对算法工程师来说，门槛高，效率低；当前类似半自动并行的方式可以解决一部分效率的问题，但是真正要解放工程师还是依赖编译+寻优结合，自动化的找到并行策略。",{"type":18,"tag":26,"props":604,"children":605},{},[606],{"type":18,"tag":30,"props":607,"children":608},{},[609],{"type":24,"value":610},"面向未来AI+科学计算场景，自动微分的要求更高，是另外一个挑战",{"type":18,"tag":26,"props":612,"children":613},{},[614],{"type":24,"value":615},"这里有两个大的难题：",{"type":18,"tag":26,"props":617,"children":618},{},[619],{"type":24,"value":620},"控制流：传统的自动微分都是通过控制流展开方式来解决问题，动态图通过正向在Python侧执行进行控制流展开，一旦循环次数多的话，性能劣化；静态图的控制流自动微分目前还没有太完善的方案。",{"type":18,"tag":26,"props":622,"children":623},{},[624],{"type":24,"value":625},"高阶微分的性能：前向微分+后向微分；Jacobian matrix(雅克比)；高阶微分：Hessian 
matrix（海森矩阵）",{"type":18,"tag":26,"props":627,"children":628},{},[629],{"type":18,"tag":30,"props":630,"children":631},{},[632],{"type":24,"value":633},"昇思MindSpore的创新和实践",{"type":18,"tag":81,"props":635,"children":637},{"id":636},"昇思mindspore的ai编译器全景图",[638],{"type":18,"tag":30,"props":639,"children":640},{},[641],{"type":24,"value":642},"昇思MindSpore的AI编译器全景图",{"type":18,"tag":26,"props":644,"children":645},{},[646],{"type":18,"tag":100,"props":647,"children":649},{"alt":7,"src":648},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/ad02043e111c46e6b4d9f8e0fc01e7b4.png",[],{"type":18,"tag":26,"props":651,"children":652},{},[653],{"type":18,"tag":30,"props":654,"children":655},{},[656],{"type":24,"value":657},"昇思MindSpore的AI编译器总共有五个特点：",{"type":18,"tag":26,"props":659,"children":660},{},[661],{"type":24,"value":662},"1、完整的AI编译器解决方案",{"type":18,"tag":26,"props":664,"children":665},{},[666],{"type":24,"value":667},"2、完善的Python静态化方案",{"type":18,"tag":26,"props":669,"children":670},{},[671],{"type":24,"value":672},"3、为AI领域优化的编译器IR-MindSpore 
IR（函数式图形IR）",{"type":18,"tag":26,"props":674,"children":675},{},[676],{"type":24,"value":677},"4、充分发挥硬件算力的图算融合+算子自动生成方案",{"type":18,"tag":26,"props":679,"children":680},{},[681],{"type":24,"value":682},"5、针对大规模并行的完整编译优化方案",{"type":18,"tag":26,"props":684,"children":685},{},[686],{"type":24,"value":687},"下面基于这几个特点，展开进行描述。",{"type":18,"tag":81,"props":689,"children":691},{"id":690},"特点1完整的ai编译器解决方案",[692],{"type":18,"tag":30,"props":693,"children":694},{},[695],{"type":24,"value":696},"特点1：完整的AI编译器解决方案",{"type":18,"tag":26,"props":698,"children":699},{},[700],{"type":24,"value":701},"完整主要体现在两个维度：",{"type":18,"tag":26,"props":703,"children":704},{},[705],{"type":24,"value":706},"1、纵向提供图编译器（前端/中端/后端）以及算子编译器：",{"type":18,"tag":26,"props":708,"children":709},{},[710],{"type":18,"tag":100,"props":711,"children":713},{"alt":7,"src":712},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/b239e0510d0445ea9f7182b998f23b6d.png",[],{"type":18,"tag":26,"props":715,"children":716},{},[717],{"type":24,"value":718},"2、横向提供端云统一的编译架构：统一的IR(MindIR)、公共Pass共享",{"type":18,"tag":26,"props":720,"children":721},{},[722],{"type":18,"tag":100,"props":723,"children":725},{"alt":7,"src":724},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/d07e5bc201684a20b357a46e56d5397f.png",[],{"type":18,"tag":81,"props":727,"children":729},{"id":728},"特点2完善的python静态化方案",[730],{"type":18,"tag":30,"props":731,"children":732},{},[733],{"type":24,"value":734},"特点2：完善的Python静态化方案",{"type":18,"tag":26,"props":736,"children":737},{},[738],{"type":24,"value":739},"前面提到AI编译器的起点就是把Python表达的AI网络和模型转化为编译器IR，但是这本身是一个非常有挑战性的工作，原因在于：",{"type":18,"tag":26,"props":741,"children":742},{},[743],{"type":24,"value":744},"1、Python是动态类型，无法通过AST解析直接确定类型",{"type":18,"tag":26,"props":746,"children":747},{},[748],{"type":24,"value":749},"2、Python的语法非常灵活，全量转换工作量大",{"type":18,"tag":26,"props":751,"children":752},{},[753,755],{"type":24,"value":754},"当前Python静态化的主流方案有三种，如前面介绍的，Tracing
 based、Bytecode Based、AST Based（",{"type":18,"tag":756,"props":757,"children":761},"a",{"href":758,"rel":759},"https://zhuanlan.zhihu.com/p/393031067%EF%BC%89",[760],"nofollow",[762],{"type":24,"value":763},"https://zhuanlan.zhihu.com/p/393031067）",{"type":18,"tag":26,"props":765,"children":766},{},[767],{"type":24,"value":768},"个人判断，Tracing Based和Bytecode Based这两种JIT的方式最终会统一到Bytecode Based；而AST Based作为AOT的解决方案会与JIT一起共存。",{"type":18,"tag":26,"props":770,"children":771},{},[772],{"type":24,"value":773},"昇思MindSpore的静态化方案当前主要采用了AST Based的AOT方式，未来也规划支持Bytecode Based的JIT方式。",{"type":18,"tag":26,"props":775,"children":776},{},[777],{"type":24,"value":778},"昇思MindSpore静态化方案主要有三个关键点：",{"type":18,"tag":26,"props":780,"children":781},{},[782],{"type":18,"tag":30,"props":783,"children":784},{},[785],{"type":24,"value":786},"第1步—语法解析：从AST翻译到MindIR",{"type":18,"tag":26,"props":788,"children":789},{},[790],{"type":24,"value":791},"昇思MindSpore定义了一套相对完整的语法解析规则来进行AST到MindIR的转换：",{"type":18,"tag":26,"props":793,"children":794},{},[795],{"type":18,"tag":100,"props":796,"children":798},{"alt":7,"src":797},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/a0edc06aca274871874bd2d9431cfa12.png",[],{"type":18,"tag":26,"props":800,"children":801},{},[802],{"type":18,"tag":30,"props":803,"children":804},{},[805],{"type":24,"value":806},"第2步—基于抽象释义的静态分析：完成类型推导和特化",{"type":18,"tag":26,"props":808,"children":809},{},[810],{"type":24,"value":811},"完成了AST到MindIR的转换后，下一步的难点就是如何从Python这种解释性语言中推导出静态类型，并进行常量传播和特化，昇思MindSpore的做法比较类似Julia的JIT方式，从顶层函数图入口开始解释执行，将函数图中所有节点进行拓扑排序，根据节点的语义递归推导各节点的抽象值。当遇到函数子图时，递归进入函数子图进行解释执行，最后返回顶层函数输出节点的抽象值。",{"type":18,"tag":26,"props":813,"children":814},{},[815],{"type":18,"tag":100,"props":816,"children":818},{"alt":7,"src":817},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/3ea3911dbb6c42398113179950ee67a6.png",[],{"type":18,"tag":26,"props":820,"children":821},{},[822],{"type":18,"tag":30,"props":823,"children":8
24},{},[825],{"type":24,"value":826},"第3步—JIT Fallback：难以解析的Python语法返回Python解释器去处理",{"type":18,"tag":26,"props":828,"children":829},{},[830],{"type":24,"value":831},"Python是解释性语言，语法比较灵活，想全量从Python转到静态IR上工作量大，难度非常高，有许多语法很难通过AST转换进行全量转换，JIT Fallback机制允许编译期遇到原生不支持语法时，通过Fallback到Python解释器去支持此语法。",{"type":18,"tag":26,"props":833,"children":834},{},[835,837],{"type":24,"value":836},"（",{"type":18,"tag":756,"props":838,"children":841},{"href":839,"rel":840},"https://zhuanlan.zhihu.com/p/416643687%EF%BC%89",[760],[842],{"type":24,"value":843},"https://zhuanlan.zhihu.com/p/416643687）",{"type":18,"tag":81,"props":845,"children":847},{"id":846},"特点3为ai领域优化的编译器ir-mindspore-ir函数式图形ir",[848],{"type":18,"tag":30,"props":849,"children":850},{},[851],{"type":24,"value":852},"特点3：为AI领域优化的编译器IR-MindSpore IR（函数式图形IR）",{"type":18,"tag":26,"props":854,"children":855},{},[856],{"type":24,"value":857},"业界编译器的IR按照不同的分类方法，有多种类型",{"type":18,"tag":26,"props":859,"children":860},{},[861],{"type":18,"tag":30,"props":862,"children":863},{},[864],{"type":24,"value":865},"分类方法1-按照组织结构：",{"type":18,"tag":26,"props":867,"children":868},{},[869],{"type":24,"value":870},"1、线性IR：三地址代码",{"type":18,"tag":26,"props":872,"children":873},{},[874],{"type":24,"value":875},"2、图IR：V8/JVM，常用于虚拟机",{"type":18,"tag":26,"props":877,"children":878},{},[879],{"type":24,"value":880},"3、混合IR:LLVM",{"type":18,"tag":26,"props":882,"children":883},{},[884],{"type":18,"tag":30,"props":885,"children":886},{},[887],{"type":24,"value":888},"分类方法2-按照编程语言的角度：",{"type":18,"tag":26,"props":890,"children":891},{},[892],{"type":24,"value":893},"1、命令式风格：SSA",{"type":18,"tag":26,"props":895,"children":896},{},[897],{"type":24,"value":898},"2、函数式风格：CPS/ANF",{"type":18,"tag":26,"props":900,"children":901},{},[902],{"type":18,"tag":30,"props":903,"children":904},{},[905],{"type":24,"value":906},"AI领域IR有其特殊的需求，包括：",{"type":18,"tag":26,"props":908,"children":909},{},[910],{"type":24,"value":911},"1、自动微分：能够处理控制流、递归、高阶微分等复杂",{"type":18,"tag":
26,"props":913,"children":914},{},[915],{"type":24,"value":916},"2、场景隐式并行：程序能根据数据流依赖自动分析可以并行部分",{"type":18,"tag":26,"props":918,"children":919},{},[920],{"type":24,"value":921},"3、JIT能力：编译时间要短",{"type":18,"tag":26,"props":923,"children":924},{},[925],{"type":18,"tag":30,"props":926,"children":927},{},[928],{"type":24,"value":929},"昇思MindSpore的解决方案-MindSpore IR(函数式图形IR)：",{"type":18,"tag":26,"props":931,"children":932},{},[933],{"type":24,"value":934},"Functional(函数式)-更自然的自动微分实现方式和更方便的隐式并行分析能力：",{"type":18,"tag":26,"props":936,"children":937},{},[938],{"type":24,"value":939},"1、函数作为一等公民，支持高阶函数，控制流也是特殊的函数，以统一的形式来实现微分，容易处理控制流、递归、高阶微分等复杂场景。",{"type":18,"tag":26,"props":941,"children":942},{},[943],{"type":24,"value":944},"2、函数以无副作用的方式实现，与命令式语言相比，可基于数据依赖的偏序分析，方便的分析出程序的可并行部分，实现隐式并行的能力。",{"type":18,"tag":26,"props":946,"children":947},{},[948],{"type":24,"value":949},"Graph based(图形IR)-更适合JIT的快速优化能力：",{"type":18,"tag":26,"props":951,"children":952},{},[953],{"type":24,"value":954},"采用类似Sea of Nodes IR的只有一层的表示方式，控制流和数据流合一，直接表达used-def，适合JIT优化。",{"type":18,"tag":26,"props":956,"children":957},{},[958,960],{"type":24,"value":959},"详细的AI框架的IR的介绍以及MindSpore IR的设计，可参见（",{"type":18,"tag":756,"props":961,"children":964},{"href":962,"rel":963},"https://zhuanlan.zhihu.com/p/263420069%EF%BC%89",[760],[965],{"type":24,"value":966},"https://zhuanlan.zhihu.com/p/263420069）",{"type":18,"tag":81,"props":968,"children":970},{"id":969},"特点4充分发挥硬件算力的图算融合算子自动生成方案",[971],{"type":18,"tag":30,"props":972,"children":973},{},[974],{"type":24,"value":975},"特点4：充分发挥硬件算力的图算融合+算子自动生成方案",{"type":18,"tag":26,"props":977,"children":978},{},[979],{"type":24,"value":980},"AI芯片对AI编译器来说，主要带来两大挑战，性能和开发效率：",{"type":18,"tag":26,"props":982,"children":983},{},[984],{"type":18,"tag":30,"props":985,"children":986},{},[987],{"type":24,"value":988},"AI芯片的发展对AI框架的关键性能挑战：",{"type":18,"tag":26,"props":990,"children":991},{},[992],{"type":24,"value":993},"1、由于并行度及工艺的快速提升，AI芯片计算能力相比带宽能力提升更快。需要软件通过平衡带宽的能力不足（Buffer融合 – 
减少带宽）；",{"type":18,"tag":26,"props":995,"children":996},{},[997],{"type":24,"value":998},"2、随着芯片并行度的增加，如何增加融合算子计算并行度，以提升芯片资源利用率。也成为一个重要的性能优化方向（并行融合 – 提高并行度）。",{"type":18,"tag":26,"props":1000,"children":1001},{},[1002],{"type":18,"tag":30,"props":1003,"children":1004},{},[1005],{"type":24,"value":1006},"AI模型的规模和复杂度发展对算子融合的挑战：",{"type":18,"tag":26,"props":1008,"children":1009},{},[1010],{"type":24,"value":1011},"1、由于复杂度和规模增加，完全依赖手工算子融合并实现融合算子变得不再可能；",{"type":18,"tag":26,"props":1013,"children":1014},{},[1015],{"type":24,"value":1016},"2、业界传统采用图层和算子层严格分层独立的实现方法，为算子融合技术演进带来一定的障碍。",{"type":18,"tag":26,"props":1018,"children":1019},{},[1020],{"type":18,"tag":30,"props":1021,"children":1022},{},[1023],{"type":24,"value":1024},"昇思MindSpore的解决方案：",{"type":18,"tag":26,"props":1026,"children":1027},{},[1028],{"type":18,"tag":100,"props":1029,"children":1031},{"alt":7,"src":1030},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/b8a02c06d53e45f39ad628f2b86fce9b.png",[],{"type":18,"tag":26,"props":1033,"children":1034},{},[1035],{"type":24,"value":1036},"主要是两个关键点：基于多层规约的图算融合引擎、自动算子生成。",{"type":18,"tag":26,"props":1038,"children":1039},{},[1040],{"type":24,"value":1041},"整体的流程：基于昇思MindSpore的统一IR MindIR，通过expander的复合算子白盒化，实现对不同网络的无侵入使能和优化；在完成跨边界聚合优化后，实现多层次多维的算子融合重建；包括buffer融合，并行融合，buffer 
stitch等等；最后，交给算子编译器AKG，完成子图的codegen。",{"type":18,"tag":26,"props":1043,"children":1044},{},[1045],{"type":24,"value":1046},"整个方案的构建，实际上花了比较长的时间，三年三篇顶会，从侧面上反映了这个过程。",{"type":18,"tag":26,"props":1048,"children":1049},{},[1050],{"type":24,"value":1051},"与业界已有的AI编译器的对比：",{"type":18,"tag":26,"props":1053,"children":1054},{},[1055],{"type":18,"tag":100,"props":1056,"children":1058},{"alt":7,"src":1057},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/828a7c532983477a9dbc9d89cf73396c.png",[],{"type":18,"tag":26,"props":1060,"children":1061},{},[1062],{"type":18,"tag":30,"props":1063,"children":1064},{},[1065],{"type":24,"value":1066},"基于多层规约的图算融合引擎：",{"type":18,"tag":26,"props":1068,"children":1069},{},[1070],{"type":24,"value":1071},"昇思MindSpore的图算融合引擎充分吸收了当前多类算子融合的技术，并有效的把它们整合在一起。",{"type":18,"tag":26,"props":1073,"children":1074},{},[1075],{"type":18,"tag":100,"props":1076,"children":1078},{"alt":7,"src":1077},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/088f199f668046c78076a33ebdf2518a.png",[],{"type":18,"tag":26,"props":1080,"children":1081},{},[1082],{"type":24,"value":1083},"整体的架构如下：",{"type":18,"tag":26,"props":1085,"children":1086},{},[1087],{"type":18,"tag":100,"props":1088,"children":1090},{"alt":7,"src":1089},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/31d662990ac2437c9f43fb95d3d7056d.png",[],{"type":18,"tag":26,"props":1092,"children":1093},{},[1094],{"type":24,"value":1095},"流程主要分为partition和fusion两个阶段。对于输入的DNN模型，在partition阶段，会先完成黑盒白化以及白盒聚合，也就是把复合算子通过扩展器扩展为基本算子的白盒子图，打开原子图边界后，完成跨边界的聚合优化以及计算逻辑优化，包括代数化简常量折叠等等。最后，按照预定的pattern以及costmodel，对计算图拆分为适合codegen的子图；",{"type":18,"tag":26,"props":1097,"children":1098},{},[1099],{"type":24,"value":1100},"在fusion阶段，我们提供了多层级的融合方案。在layer1层，主要是完成buffer融合，将子图拆分后的融合子图交给算子编译器AKG，从loop粒度，完成循环变化、调度优化等编译优化。接着，对于AKG生成的子图kernel，通过bufferStitch按照依赖关系对其做buffer拼接，进一步减少访存次数（这部分工作其实跟阿里的Astitch理念上比较类似，实现上不同）；最后，对不存在依赖关系的子图kernel，寻求并行融合机会。",{"type":18,"t
ag":26,"props":1102,"children":1103},{},[1104],{"type":18,"tag":30,"props":1105,"children":1106},{},[1107],{"type":24,"value":1108},"算子自动生成（AKG）",{"type":18,"tag":26,"props":1110,"children":1111},{},[1112],{"type":18,"tag":100,"props":1113,"children":1115},{"alt":7,"src":1114},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/2453aa491fcf4d059a23789af4f7e507.png",[],{"type":18,"tag":26,"props":1117,"children":1118},{},[1119],{"type":24,"value":1120},"AKG基于TVM开发，但与TVM不同的是，我们是通过polyhedral实现调度自动优化。AKG的输入除了tvm中提供的dsl算子外，还支持图算融合后的子图以及MindSpore提供的python自定义算子。在通过一系列规范化的pass后，将halideIR转为poly模块中的schedule tree，并对schedule tree完成自动调度优化，自动切分，内存搬移等操作，随即转回HalideIR完成后端指令生成及后端优化。在切分策略上，提供了两种模式。对于训练场景，使用autotiling在较短时间给出相对较优的切分，对于性能极致优化场景，我们提供了tuning能力，在poly辅助计算的切分空间中，利用进化算法，costmodel等寻求最优切分。",{"type":18,"tag":26,"props":1122,"children":1123},{},[1124],{"type":24,"value":1125},"整套方案同其他编译器相比，主要有两个优势：由于调度是自动完成的，极大降低了开发门槛；对于异构硬件、以及融合算子都能够有较好的支持。",{"type":18,"tag":26,"props":1127,"children":1128},{},[1129],{"type":18,"tag":30,"props":1130,"children":1131},{},[1132],{"type":24,"value":1133},"GPU平台",{"type":18,"tag":26,"props":1135,"children":1136},{},[1137],{"type":24,"value":1138},"已经基本实现泛化支持",{"type":18,"tag":26,"props":1140,"children":1141},{},[1142],{"type":24,"value":1143},"1、NLP、推荐类网络收益明显：NLP类96.4%；推荐类136.6%;",{"type":18,"tag":26,"props":1145,"children":1146},{},[1147],{"type":24,"value":1148},"2、CV类由于以卷积为主，平均30.7%。",{"type":18,"tag":26,"props":1150,"children":1151},{},[1152],{"type":18,"tag":30,"props":1153,"children":1154},{},[1155],{"type":24,"value":1156},"CPU平台",{"type":18,"tag":26,"props":1158,"children":1159},{},[1160],{"type":24,"value":1161},"已打通CPU后端支持",{"type":18,"tag":26,"props":1163,"children":1164},{},[1165],{"type":24,"value":1166},"强化学习网络提升15~20%，典型NN网络提升17%~33%",{"type":18,"tag":81,"props":1168,"children":1170},{"id":1169},"特点5针对大规模并行的完整编译优化方案",[1171],{"type":18,"tag":30,"props":1172,"children":1173},{},[1174],{"type":24,"value":1175},"特点5
：针对大规模并行的完整编译优化方案",{"type":18,"tag":26,"props":1177,"children":1178},{},[1179],{"type":24,"value":1180},"昇思MindSpore另一个创新是把AI编译器从单芯片支持拓展到集群支持，在编译中实现通用分布式并行及内存优化。",{"type":18,"tag":26,"props":1182,"children":1183},{},[1184],{"type":18,"tag":100,"props":1185,"children":1187},{"alt":7,"src":1186},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/f02e143cec0d44acb38501e6af59f858.png",[],{"type":18,"tag":26,"props":1189,"children":1190},{},[1191],{"type":24,"value":1192},"整体的编译流程：",{"type":18,"tag":26,"props":1194,"children":1195},{},[1196],{"type":18,"tag":100,"props":1197,"children":1199},{"alt":7,"src":1198},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/a3f4e12f3a834ba0901a316a88c5af77.png",[],{"type":18,"tag":26,"props":1201,"children":1202},{},[1203],{"type":18,"tag":30,"props":1204,"children":1205},{},[1206],{"type":24,"value":1207},"未来一些开放性的问题",{"type":18,"tag":26,"props":1209,"children":1210},{},[1211],{"type":18,"tag":30,"props":1212,"children":1213},{},[1214],{"type":24,"value":1215},"问题1：AI的图层优化最终是一个什么问题？",{"type":18,"tag":26,"props":1217,"children":1218},{},[1219],{"type":24,"value":1220},"整体而言，AI图层的优化当前有三种思路：",{"type":18,"tag":26,"props":1222,"children":1223},{},[1224],{"type":24,"value":1225},"1、编译器的优化思路，基于Pattern Match，从搜索算法的本质讲是一种贪心的算法，是局部寻优的思路",{"type":18,"tag":26,"props":1227,"children":1228},{},[1229],{"type":24,"value":1230},"2、全局规划的思路，无论是动态规划、还是类似ILP的线性规划也好，都是采用全局寻优的思路，这里的关键是需要一个较好的cost model。",{"type":18,"tag":26,"props":1232,"children":1233},{},[1234],{"type":24,"value":1235},"3、Tuning的思路，还是全局寻优的思路，但是弱化cost 
model。",{"type":18,"tag":26,"props":1237,"children":1238},{},[1239],{"type":24,"value":1240},"未来图层的优化哪一种方式是主流，或者哪几种方式组合最优。",{"type":18,"tag":26,"props":1242,"children":1243},{},[1244],{"type":18,"tag":30,"props":1245,"children":1246},{},[1247],{"type":24,"value":1248},"问题2：图算能否统一表达，统一编译优化，成为一个通用编译器",{"type":18,"tag":26,"props":1250,"children":1251},{},[1252],{"type":24,"value":1253},"当前的AI框架下，图层和算子层是分开表达和优化的，算法工程师主要是接触图层的表达，AI框架或者芯片使能的工程师主要是接触算子的表达，但是未来在AI+科学计算的场景下，图层和算子层不再清晰，能否放在一起表达，统一优化？",{"type":18,"tag":26,"props":1255,"children":1256},{},[1257],{"type":18,"tag":30,"props":1258,"children":1259},{},[1260],{"type":24,"value":1261},"问题3：完全的自动并行是否可行",{"type":18,"tag":26,"props":1263,"children":1264},{},[1265],{"type":24,"value":1266},"在完全的自动并行方面，昇思MindSpore以及学术界都做了有益的探索，但是目前看泛化性还有一些差距，未来能否真正做到自动并行？",{"type":18,"tag":26,"props":1268,"children":1269},{},[1270],{"type":24,"value":1271},"如何借助编译优化理论和方法，将计算图描述的深度学习算法部署在具体硬件上并让算法高效运行，是学术界和工业界一个重要的研究课题。更多AI编译技术内容在清华大学出版社新书《多面体编译理论与深度学习实践》中。",{"type":18,"tag":26,"props":1273,"children":1274},{},[1275],{"type":24,"value":1276},"为感谢小孢子对昇思MindSpore的关注，昇思MindSpore准备了5本《多面体编译理论与深度学习实践》作为福利免费赠送。",{"type":18,"tag":26,"props":1278,"children":1279},{},[1280],{"type":18,"tag":30,"props":1281,"children":1282},{},[1283],{"type":24,"value":1284},"活动规则（以下三点均需同时满足）：",{"type":18,"tag":26,"props":1286,"children":1287},{},[1288,1290],{"type":24,"value":1289},"1、",{"type":18,"tag":30,"props":1291,"children":1292},{},[1293],{"type":24,"value":1294},"将本篇文章转发至朋友圈并带上#昇思MindSpore，发表你对AI编译器相关看法或建议；",{"type":18,"tag":26,"props":1296,"children":1297},{},[1298,1300],{"type":24,"value":1299},"2、该朋友圈保留24小时（",{"type":18,"tag":30,"props":1301,"children":1302},{},[1303],{"type":24,"value":1304},"不设分组）；",{"type":18,"tag":26,"props":1306,"children":1307},{},[1308],{"type":18,"tag":30,"props":1309,"children":1310},{},[1311],{"type":24,"value":1312},"3、集满88个赞。",{"type":18,"tag":26,"props":1314,"children":1315},{},[1316],{"type":24,"value":1317},"满足
以上三点即可获得《多面体编译理论与深度学习实践》一本，数量有限先到先得。",{"type":18,"tag":26,"props":1319,"children":1320},{},[1321],{"type":18,"tag":30,"props":1322,"children":1323},{},[1324],{"type":24,"value":1325},"领奖规则:",{"type":18,"tag":26,"props":1327,"children":1328},{},[1329],{"type":24,"value":1330},"1、活动时间：2023年3月20日-2023年3月24日12：00；",{"type":18,"tag":26,"props":1332,"children":1333},{},[1334,1339],{"type":18,"tag":30,"props":1335,"children":1336},{},[1337],{"type":24,"value":1338},"2、领取方式",{"type":24,"value":1340},"：将以上三点截图发送至昇思MindSpore公众号后台，等待工作人员审核即可。",{"type":18,"tag":26,"props":1342,"children":1343},{},[1344],{"type":24,"value":1345},"*活动解释权归昇思MindSpore开源社区所有。",{"type":18,"tag":26,"props":1347,"children":1348},{},[1349],{"type":18,"tag":100,"props":1350,"children":1352},{"alt":7,"src":1351},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/01/ad9f88e4b74a4a29bde739019af47fc6.png",[],{"title":7,"searchDepth":1354,"depth":1354,"links":1355},4,[1356,1358,1359,1360,1361,1362,1363,1364,1365,1366,1367],{"id":83,"depth":1357,"text":89},3,{"id":92,"depth":1357,"text":95},{"id":196,"depth":1357,"text":202},{"id":444,"depth":1357,"text":450},{"id":563,"depth":1357,"text":569},{"id":636,"depth":1357,"text":642},{"id":690,"depth":1357,"text":696},{"id":728,"depth":1357,"text":734},{"id":846,"depth":1357,"text":852},{"id":969,"depth":1357,"text":975},{"id":1169,"depth":1357,"text":1175},"markdown","content:technology-blogs:zh:2203.md","content","technology-blogs/zh/2203.md","technology-blogs/zh/2203","md",1776506120932]