[{"data":1,"prerenderedAt":461},["ShallowReactive",2],{"content-query-8IDLxlAQ1T":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":455,"_id":456,"_source":457,"_file":458,"_stem":459,"_extension":460},"/technology-blogs/zh/407","zh",false,"","MIT高性能自动微分框架Enzyme论文分析","麻省理工学院（MIT）提出的自动微分框架 Enzyme 的简单分析，供参考。","2021-03-03","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/03/03/ce7340e08c4948a98499abd7951eb226.png","technology-blogs",{"type":14,"children":15,"toc":446},"root",[16,24,30,43,63,68,74,79,84,99,104,113,118,126,131,139,144,152,157,165,173,178,186,191,198,203,208,215,224,229,236,241,247,252,259,264,272,282,287,295,300,308,313,321,326,331,337,342,347,357,362,385,392,397,403,422,434],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"mit高性能自动微分框架enzyme论文分析",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":23,"value":29},"作者：金雪锋",{"type":17,"tag":25,"props":31,"children":32},{},[33,35],{"type":23,"value":34},"文章链接：",{"type":17,"tag":36,"props":37,"children":41},"a",{"href":38,"rel":39},"https://zhuanlan.zhihu.com/p/353918898",[40],"nofollow",[42],{"type":23,"value":38},{"type":17,"tag":25,"props":44,"children":45},{},[46,48,55,57],{"type":23,"value":47},"作者主页：",{"type":17,"tag":36,"props":49,"children":52},{"href":50,"rel":51},"https://www.zhihu.com/people/jin-xue-feng",[40],[53],{"type":23,"value":54},"https",{"type":23,"value":56}," : ",{"type":17,"tag":36,"props":58,"children":60},{"href":50,"rel":59},[40],[61],{"type":23,"value":62},"//www.zhihu.com/people/jin-xue-feng",{"type":17,"tag":25,"props":64,"children":65},{},[66],{"type":23,"value":67},"麻省理工学院（MIT）提出的自动微分框架 Enzyme 在 NeurIPS 2020 
大会上引起过不少人的兴趣，最近也简单分析一下，供参考。",{"type":17,"tag":69,"props":70,"children":72},"h2",{"id":71},"概述",[73],{"type":23,"value":71},{"type":17,"tag":25,"props":75,"children":76},{},[77],{"type":23,"value":78},"当前对于将新领域，例如物理模拟、游戏引擎、气候模型等，引入到机器学习中来，存在一个普遍问题--求梯度时，需要将外来代码通过源码重写或者操作符重载，以融入现有AD（automatic differentiation自动微分）工具（Adept、Autograd等）或深度学习框架（TensorFlow、PyTorch、MindSpore等），这增加了在机器学习工作流中引入外来代码的工作量。而MIT的这篇论文提出了Enzyme，一个基于LLVM IR的自动微分编译器插件，来缓解该问题。Enzyme可以生成用LLVM IR表达的静态可分析程序的梯度。",{"type":17,"tag":25,"props":80,"children":81},{},[82],{"type":23,"value":83},"Enzyme的关键点：",{"type":17,"tag":85,"props":86,"children":87},"ol",{},[88,94],{"type":17,"tag":89,"props":90,"children":91},"li",{},[92],{"type":23,"value":93},"可以自动生成基于静态可分析的LLVM IR的梯度。",{"type":17,"tag":89,"props":95,"children":96},{},[97],{"type":23,"value":98},"在优化后的IR上做自动微分，以得到一个高性能的梯度计算。",{"type":17,"tag":69,"props":100,"children":102},{"id":101},"设计",[103],{"type":23,"value":101},{"type":17,"tag":25,"props":105,"children":106},{},[107],{"type":17,"tag":108,"props":109,"children":110},"strong",{},[111],{"type":23,"value":112},"1、类型分析",{"type":17,"tag":25,"props":114,"children":115},{},[116],{"type":23,"value":117},"LLVM 
IR没必要表示出所有数据的潜在类型，所以Enzyme做了一些自己的抽象解析。如引入类型树，实现了一个类型传播规则。",{"type":17,"tag":25,"props":119,"children":120},{},[121],{"type":17,"tag":108,"props":122,"children":123},{},[124],{"type":23,"value":125},"2、活动分析",{"type":17,"tag":25,"props":127,"children":128},{},[129],{"type":23,"value":130},"分析出哪些指令对梯度计算有影响以及哪些指令是不可微分的，从而减少不必要的计算。",{"type":17,"tag":25,"props":132,"children":133},{},[134],{"type":17,"tag":108,"props":135,"children":136},{},[137],{"type":23,"value":138},"3、影子内存",{"type":17,"tag":25,"props":140,"children":141},{},[142],{"type":23,"value":143},"对于正向部分需要计算的数据，Enzyme会复制一份影子内存，用于保存梯度计算结果，一直到梯度计算不需要时才会释放。",{"type":17,"tag":25,"props":145,"children":146},{},[147],{"type":17,"tag":108,"props":148,"children":149},{},[150],{"type":23,"value":151},"4、合成梯度",{"type":17,"tag":25,"props":153,"children":154},{},[155],{"type":23,"value":156},"根据类型和活动分析的结果，创建梯度函数，梯度函数包括正向部分和反向部分。",{"type":17,"tag":25,"props":158,"children":159},{},[160],{"type":17,"tag":161,"props":162,"children":164},"img",{"alt":7,"src":163},"https://pic3.zhimg.com/80/v2-ad6817815204b7714e6f1b07f3af96ae_720w.jpg",[],{"type":17,"tag":25,"props":166,"children":167},{},[168],{"type":17,"tag":108,"props":169,"children":170},{},[171],{"type":23,"value":172},"5、缓存",{"type":17,"tag":25,"props":174,"children":175},{},[176],{"type":23,"value":177},"Enzyme在计算反向部分时，如果需要正向部分的结果，默认情况下会重新计算正向。但是对于不可能重新计算的操作（如Read）或效率低的操作，Enzyme也提供了缓存机制保存正向结果用于计算反向部分。",{"type":17,"tag":25,"props":179,"children":180},{},[181],{"type":17,"tag":108,"props":182,"children":183},{},[184],{"type":23,"value":185},"6、生成高性能的梯度计算",{"type":17,"tag":25,"props":187,"children":188},{},[189],{"type":23,"value":190},"Enzyme是在优化过的LLVM IR上做AD的。论文给出了一个在LICM（loop-invariant code 
motion）优化后做AD的如下例子，相比于在AD之后做LICM，计算复杂度有一个O(N^2）到O(N)的提升。",{"type":17,"tag":25,"props":192,"children":193},{},[194],{"type":17,"tag":161,"props":195,"children":197},{"alt":7,"src":196},"https://pic2.zhimg.com/80/v2-1c0db5e363ad60b4f9ec56dd6b8c6345_720w.jpg",[],{"type":17,"tag":25,"props":199,"children":200},{},[201],{"type":23,"value":202},"Top：一个O(N^2)的norm函数，如果使用loop-invariant-codemotion (LICM)把mag函数提出来，能够将复杂度优化到O(N)。Left：执行LICM之后做AD，得到的一个O(N)的 ∇norm函数。Right：在AD之后执行LICM，得到的一个O(N^2)的 ∇norm函数，∇mag仍然在循环内部，因为它使用了循环内部的一个值，LICM并不生效。",{"type":17,"tag":25,"props":204,"children":205},{},[206],{"type":23,"value":207},"下图为论文给出的在不同的benchmark上，Enzyme与其他自动微分工具的性能对比。其中，Ref是Enzyme在优化前做AD的一个对比实验组，与Ref相比，每个benchmark平均下来，在优化后做AD，有4.5倍的性能提升。",{"type":17,"tag":25,"props":209,"children":210},{},[211],{"type":17,"tag":161,"props":212,"children":214},{"alt":7,"src":213},"https://pic4.zhimg.com/80/v2-60c69af6d74fec5aebcb6e7106b3c0e3_720w.jpg",[],{"type":17,"tag":25,"props":216,"children":217},{},[218],{"type":17,"tag":219,"props":220,"children":221},"em",{},[222],{"type":23,"value":223},"注：这个性能对比主要还是和同等LLVM 
AD工具比",{"type":17,"tag":25,"props":225,"children":226},{},[227],{"type":23,"value":228},"Left：不同的AD系统在benchmark上的相对加速比，柱状越高的越好。值1.0表示当前benchmark上最快的AD系统，值0.5表示该AD系统生成的梯度计算，花费的时间是最快的2倍。Right：以秒为单位的几何平均运行时间。",{"type":17,"tag":25,"props":230,"children":231},{},[232],{"type":17,"tag":161,"props":233,"children":235},{"alt":7,"src":234},"https://pic1.zhimg.com/80/v2-881c7554a0b821182cb3f0e4f8e82df8_720w.jpg",[],{"type":17,"tag":25,"props":237,"children":238},{},[239],{"type":23,"value":240},"Enzyme和Ref两条流水线，在AD前后执行优化的情况",{"type":17,"tag":69,"props":242,"children":244},{"id":243},"enzyme的使用",[245],{"type":23,"value":246},"Enzyme的使用",{"type":17,"tag":25,"props":248,"children":249},{},[250],{"type":23,"value":251},"要生成基于LLVM语言的代码梯度，需要调用一个外部函数__enzyme_autodiff。",{"type":17,"tag":25,"props":253,"children":254},{},[255],{"type":17,"tag":161,"props":256,"children":258},{"alt":7,"src":257},"https://pic4.zhimg.com/80/v2-e496a15bbd067f0c999095a39e2581e3_720w.jpg",[],{"type":17,"tag":25,"props":260,"children":261},{},[262],{"type":23,"value":263},"以C程序为例，求一个函数的梯度：",{"type":17,"tag":85,"props":265,"children":266},{},[267],{"type":17,"tag":89,"props":268,"children":269},{},[270],{"type":23,"value":271},"准备一个C程序",{"type":17,"tag":273,"props":274,"children":276},{"code":275},"// test.c\n#include \u003Cstdio.h>\nextern double __enzyme_autodiff(void*, double);//extern function\ndouble square(double x) {\n    return x * x;\n}\ndouble dsquare(double x) {\n    // This returns the derivative of square or 2 * x\n    return __enzyme_autodiff(square, x);\n}\nint main() {\n    for(double i=1; i\u003C5; i++)\n        printf(\"square(%f)=%f, dsquare(%f)=%f\", i, square(i), i, dsquare(i));\n}\n",[277],{"type":17,"tag":278,"props":279,"children":280},"code",{"__ignoreMap":7},[281],{"type":23,"value":275},{"type":17,"tag":25,"props":283,"children":284},{},[285],{"type":23,"value":286},"2. 
使用clang生成llvm ir",{"type":17,"tag":273,"props":288,"children":290},{"code":289},"clang test.c -S -emit-llvm -o input.ll -O2 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops\n",[291],{"type":17,"tag":278,"props":292,"children":293},{"__ignoreMap":7},[294],{"type":23,"value":289},{"type":17,"tag":25,"props":296,"children":297},{},[298],{"type":23,"value":299},"3. 使用Enzyme生成梯度",{"type":17,"tag":273,"props":301,"children":303},{"code":302},"opt input.ll -load=/path/to/Enzyme/enzyme/build/Enzyme/LLVMEnzyme-\u003CVERSION>.so -enzyme -o output.ll -S\n",[304],{"type":17,"tag":278,"props":305,"children":306},{"__ignoreMap":7},[307],{"type":23,"value":302},{"type":17,"tag":25,"props":309,"children":310},{},[311],{"type":23,"value":312},"4. 进行AD后的优化并生成可执行文件",{"type":17,"tag":273,"props":314,"children":316},{"code":315},"clang output.ll -O3 -o a.exe\n",[317],{"type":17,"tag":278,"props":318,"children":319},{"__ignoreMap":7},[320],{"type":23,"value":315},{"type":17,"tag":25,"props":322,"children":323},{},[324],{"type":23,"value":325},"另外，在做AD时，Enzyme需要获取所有需要微分函数的IR。如果只是一种源代码的IR很容易获得，对于多种源代码或者使用外部库的代码，Enzyme使用Link-Time Optimization 
(LTO)--一种用于整个程序优化的编译器技术，保留所有源文件的IR一直到链接阶段。为了在多种源代码的代码库中使用，使能LTO，在合并的IR上运行Enzyme，对于静态库可以在编译时加上-fembed-bitcode命令，使得在静态库里面包含中间代码。",{"type":17,"tag":25,"props":327,"children":328},{},[329],{"type":23,"value":330},"目前看Enzyme还支持CPU。",{"type":17,"tag":69,"props":332,"children":334},{"id":333},"enzyme在ml框架的应用",[335],{"type":23,"value":336},"Enzyme在ML框架的应用",{"type":17,"tag":25,"props":338,"children":339},{},[340],{"type":23,"value":341},"根据上述使用Enzyme的方法，借助AI框架的自定义算子能力，可以把Enzyme内嵌到各种框架中，以MindSpore为例：",{"type":17,"tag":25,"props":343,"children":344},{},[345],{"type":23,"value":346},"借助MindSpore自定义算子的能力把Enzyme嵌入到MindSpore中去使用：",{"type":17,"tag":25,"props":348,"children":349},{},[350],{"type":17,"tag":36,"props":351,"children":354},{"href":352,"rel":353},"https://www.mindspore.cn/tutorial/training/zh-CN/r1.1/advanced_use/custom_operator_cpu.html",[40],[355],{"type":23,"value":356},"自定义算子（CPU） - MindSpore r1.1 documentationwww.mindspore.cn",{"type":17,"tag":25,"props":358,"children":359},{},[360],{"type":23,"value":361},"方法如下：",{"type":17,"tag":85,"props":363,"children":364},{},[365,370,375,380],{"type":17,"tag":89,"props":366,"children":367},{},[368],{"type":23,"value":369},"自定义正向算子Enzyme，其中算子实现为根据外来源代码路径，调用clang去生成外来源代码的.so，并通过dlfcn库加载该.so，根据函数名去获取并调用正向函数。",{"type":17,"tag":89,"props":371,"children":372},{},[373],{"type":23,"value":374},"自定义反向算子EnzymeGrad，其中算子实现为根据外来源代码路径，调用clang和enzyme去生成外来源代码经过AD的.so，并通过dlfcn库加载该.so，根据函数名去获取并调用梯度函数。",{"type":17,"tag":89,"props":376,"children":377},{},[378],{"type":23,"value":379},"定义反向传播函数（bprop），函数的计算逻辑为调用自定义好的反向算子EnzymeGrad。",{"type":17,"tag":89,"props":381,"children":382},{},[383],{"type":23,"value":384},"定义正向网络，网络中调用自定义好的Enzyme算子。最后根据正向网络调用GradOperation函数求梯度，就可以使用Enzyme生成梯度了。",{"type":17,"tag":25,"props":386,"children":387},{},[388],{"type":17,"tag":161,"props":389,"children":391},{"alt":7,"src":390},"https://pic1.zhimg.com/80/v2-eee070f6d2c8c999236d6138f143350c_720w.jpg",[],{"type":17,"tag":25,"props":393,"children":394},{},[395],{"type"
:23,"value":396},"当然现在Enzyme也已经可以应用到Pytorch或TF中。",{"type":17,"tag":69,"props":398,"children":400},{"id":399},"主要结论",[401],{"type":23,"value":402},"主要结论：",{"type":17,"tag":85,"props":404,"children":405},{},[406,414],{"type":17,"tag":89,"props":407,"children":408},{},[409],{"type":17,"tag":108,"props":410,"children":411},{},[412],{"type":23,"value":413},"Enzyme提供了一种梯度计算的性能提升方法：在某些优化完成之后才进行AD。",{"type":17,"tag":89,"props":415,"children":416},{},[417],{"type":17,"tag":108,"props":418,"children":419},{},[420],{"type":23,"value":421},"Enzyme能在低级别的LLVM IR上进行AD，非常方便嵌入现有的框架。",{"type":17,"tag":25,"props":423,"children":424},{},[425,427],{"type":23,"value":426},"论文地址：",{"type":17,"tag":36,"props":428,"children":431},{"href":429,"rel":430},"https://link.zhihu.com/?target=https%3A//arxiv.org/pdf/2010.01709.pdf",[40],[432],{"type":23,"value":433},"https://arxiv.org/pdf/2010.01709.pdf",{"type":17,"tag":25,"props":435,"children":436},{},[437,439],{"type":23,"value":438},"github地址：",{"type":17,"tag":36,"props":440,"children":443},{"href":441,"rel":442},"https://link.zhihu.com/?target=https%3A//github.com/wsmoses/Enzyme",[40],[444],{"type":23,"value":445},"https://github.com/wsmoses/Enzyme",{"title":7,"searchDepth":447,"depth":447,"links":448},4,[449,451,452,453,454],{"id":71,"depth":450,"text":71},2,{"id":101,"depth":450,"text":101},{"id":243,"depth":450,"text":246},{"id":333,"depth":450,"text":336},{"id":399,"depth":450,"text":402},"markdown","content:technology-blogs:zh:407.md","content","technology-blogs/zh/407.md","technology-blogs/zh/407","md",1776506137107]