[{"data":1,"prerenderedAt":379},["ShallowReactive",2],{"content-query-SFnn2nZaVc":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":373,"_id":374,"_source":375,"_file":376,"_stem":377,"_extension":378},"/technology-blogs/zh/766","zh",false,"","AI框架动静态图统一的思考","动静图统一的挑战、趋势以及MindSpore的实践三个维度","2021-10-29","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/10/29/2dca412db7f142afb448b366eb7d9774.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":366},"root",[17,25,31,44,55,60,65,76,81,88,97,102,107,122,133,138,143,148,153,158,163,171,176,181,186,194,199,204,209,219,224,229,234,242,247,256,261,266,271,284,289,302,307,317,325,330,335,340,345,351,356,361],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"ai框架动静态图统一的思考",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"作者：金雪锋",{"type":18,"tag":26,"props":32,"children":33},{},[34,36],{"type":24,"value":35},"作者主页：",{"type":18,"tag":37,"props":38,"children":42},"a",{"href":39,"rel":40},"https://www.zhihu.com/people/jin-xue-feng",[41],"nofollow",[43],{"type":24,"value":39},{"type":18,"tag":26,"props":45,"children":46},{},[47,49],{"type":24,"value":48},"文章来源：",{"type":18,"tag":37,"props":50,"children":53},{"href":51,"rel":52},"https://zhuanlan.zhihu.com/p/416643687",[41],[54],{"type":24,"value":51},{"type":18,"tag":26,"props":56,"children":57},{},[58],{"type":24,"value":59},"发现好久没更新AI框架分析的专栏，确实有点对不住当初立的flag，赶紧先上一篇，内容准备的还有些匆忙，后面有时间再优化和细化。",{"type":18,"tag":26,"props":61,"children":62},{},[63],{"type":24,"value":64},"本文是AI框架分析专栏的第七篇，总体目录参见：",{"type":18,"tag":26,"props":66,"children":67},{},[68,70],{"type":24,"value":69},"AI框架的演进趋势和MindSpore的构想：",{"type":18,"tag":37,"props":71,"children":74},{"href":72,"rel":73},"https://zhuanlan.zhihu.com/p/225392622",[41],[75],{"type":24,"value":72},{"type":18,"tag":26,"props":77,"chi
ldren":78},{},[79],{"type":24,"value":80},"MindSpore在第一个版本发布的时候，就开始构建动静态图统一的特性，动静统一的目标非常美好，但是实现起来其实非常困难；本文试图从动静图统一的挑战、趋势以及MindSpore的实践三个维度出发，给读者一些参考。",{"type":18,"tag":82,"props":83,"children":85},"h2",{"id":84},"动静态图的发展趋势",[86],{"type":24,"value":87},"动静态图的发展趋势：",{"type":18,"tag":26,"props":89,"children":90},{},[91],{"type":18,"tag":92,"props":93,"children":94},"strong",{},[95],{"type":24,"value":96},"1、什么是动静态图",{"type":18,"tag":26,"props":98,"children":99},{},[100],{"type":24,"value":101},"**动态图：**AI框架的动态图基本上是指类似PyTorch的实现（非torchscript），原理上比较简单，使用Python的c extension机制注册算子（算子一般性能比较高，采用native的语言开发），正向利用Python进行解释执行，同时通过Tape机制（自动微分）生成反向图，然后基于反向图进行梯度更新，反向执行其实也是图的一种模式，不过不会做图的编译优化。",{"type":18,"tag":26,"props":103,"children":104},{},[105],{"type":24,"value":106},"**静态图：**与动态图相比，静态图在执行前有一个构图和编译优化的过程，即执行前先生成正反向图并完成编译优化，然后再执行。那如何从Python的神经网络表达翻译到图呢？目前主要有两种方式：",{"type":18,"tag":108,"props":109,"children":110},"ul",{},[111,117],{"type":18,"tag":112,"props":113,"children":114},"li",{},[115],{"type":24,"value":116},"Tracing模式：框架把Python代码假执行一遍，记住算子执行序列（Tensor相关操作），作为正向图，并以此进行自动微分生成反向图，并进行正反向图的编译优化，最后以图模式进行执行",{"type":18,"tag":112,"props":118,"children":119},{},[120],{"type":24,"value":121},"AST转换：框架获取Python代码的AST，然后通过编译的技术转换成正向图（这里面有Parser、Resolver、Specialize、Optimize等），并基于此生成反向图，同时再进行编译优化，最后以图模式进行执行。",{"type":18,"tag":26,"props":123,"children":124},{},[125,127],{"type":24,"value":126},"聊一聊AI框架前端：",{"type":18,"tag":37,"props":128,"children":131},{"href":129,"rel":130},"https://zhuanlan.zhihu.com/p/393031067",[41],[132],{"type":24,"value":129},{"type":18,"tag":26,"props":134,"children":135},{},[136],{"type":24,"value":137},"动静态图两者各有优缺点：",{"type":18,"tag":26,"props":139,"children":140},{},[141],{"type":24,"value":142},"动态图：灵活易用，基本上Python的语法都支持，对于动态模型，比如动态shape、有复杂控制流等，尤其有利；当然它的缺点也比较明显，就是性能和部署，动态图的执行性能基本上取决于单算子的性能，缺乏算子间的融合优化，无法充分发挥AI芯片的算力（这个对性能快速提升的AI芯片来说不是个好事情），同时AI在部署的时候，为了性能和功耗（比如端侧部署），一般需要生成一个部署模型，部署模型本质是静态图的表达，所以这里就存在一个动静态转换的问题，太灵活的动态图训练完后，想转换变成一个部署模型（静态图）同样存在很大的挑战。",{"type"
:18,"tag":26,"props":144,"children":145},{},[146],{"type":24,"value":147},"静态图：优缺点和动态图刚好相反，表达上有许多限制，不灵活；但是性能好（适合面向芯片的编译优化、适合分布式并行优化），部署能力强。",{"type":18,"tag":26,"props":149,"children":150},{},[151],{"type":24,"value":152},"另外，tracing based和ast based两种静态图模式在表达上/编译优化上也有少许差异，比如tracing based的方法，可以充分利用Python的执行完成推导功能（实现比较完整的Python复杂的数据结构和动态类型到Tensor的映射），问题是很难处理控制流并丢失了scope的信息；ast based的方法原理上功能可以做的比较强大，但是在python语法的翻译需要比较全量的翻译，工程量比较大/方案比较复杂。",{"type":18,"tag":26,"props":154,"children":155},{},[156],{"type":24,"value":157},"大家一个朴素的想法就是我们是否可以做到动静态图灵活转换，从而能同时拿到两者的优点。比如在网络调试或者网络研究的时候，使用动态图，工作效率比较高；在生产环境上，无缝切换到静态图，性能高，部署快。",{"type":18,"tag":26,"props":159,"children":160},{},[161],{"type":24,"value":162},"但是动静态图的无缝转换实际上是一件非常难的事情。",{"type":18,"tag":26,"props":164,"children":165},{},[166],{"type":18,"tag":92,"props":167,"children":168},{},[169],{"type":24,"value":170},"2、动静态图转换的挑战",{"type":18,"tag":26,"props":172,"children":173},{},[174],{"type":24,"value":175},"如果把静态图的表达看成是一种特殊的DSL，那这种DSL实际上是一种静态语言，而Python是一个解释性、动态类型的语言，从本质上讲它是不可能完全无损的转换到一种静态语言。不过AI框架是一种面向领域的框架，其业务特征是有比较固定的范式的，比如以tensor计算和自动微分为中心，最终的计算都会形成tensor流，这一定程度上可以减少动静图转换的难度，但是挑战依然存在，下面分情况来分析：",{"type":18,"tag":26,"props":177,"children":178},{},[179],{"type":24,"value":180},"tracing based的静态图：这种方式拿到的图实际上是平铺的一个执行流，所以很难处理控制流的情况，比如循环，对于tracing来说就是展开，但是有些情况下循环无法展开，如循环条件根据训练的收敛情况/算子的执行结果等。",{"type":18,"tag":26,"props":182,"children":183},{},[184],{"type":24,"value":185},"ast 
based的静态图：这种方式拿到的图是从python的ast转换而来，好处是控制流和scope信息都可以保留，但是挑战是那么多python的语法和数据结构都要转换到静态图的表达，更难的是python是动态类型的，所以这个ast到静态图的转换中需要一个复杂的类型/值推导过程。",{"type":18,"tag":26,"props":187,"children":188},{},[189],{"type":18,"tag":92,"props":190,"children":191},{},[192],{"type":24,"value":193},"3、动静态图发展的三个阶段",{"type":18,"tag":26,"props":195,"children":196},{},[197],{"type":24,"value":198},"总的来说，个人认为，动静图的发展分为三个阶段：",{"type":18,"tag":26,"props":200,"children":201},{},[202],{"type":24,"value":203},"动静态图分离——>动静图结合——>动静图统一；当前框架主要处于第二阶段。",{"type":18,"tag":26,"props":205,"children":206},{},[207],{"type":24,"value":208},"**动静图结合：**朴素的想法是，既然动态图不容易转换到静态图，能不能让开发者自己标识模型中的哪部分Layer或者代码需要加速并且可以实现动静转换。基本的实现方式是在需要静态图的代码块上加上装饰符，以MindSpore为例：",{"type":18,"tag":210,"props":211,"children":213},"pre",{"code":212},"@ms_function\ndef tensor_add_with_dec(x, y):\n     z = x + y\n     return z\n",[214],{"type":18,"tag":215,"props":216,"children":217},"code",{"__ignoreMap":7},[218],{"type":24,"value":212},{"type":18,"tag":26,"props":220,"children":221},{},[222],{"type":24,"value":223},"框架会将ms_function修饰的函数进行静态图处理。目前动静结合是当前框架的主流模式。但是动静图结合的方式对开发者并不友好，需要开发者自己去判断哪里可以转成静态图，意味着开发者能够清楚两个事情：一是哪些地方可以加速；二是哪些代码块可以转成静态图（代码块里面代码的语法符合静态图的约束）。",{"type":18,"tag":26,"props":225,"children":226},{},[227],{"type":24,"value":228},"**动静图统一：**理想的方式就是动静统一，开发者可以灵活的进行动静态图的切换，这里面可能有两条路径，一条路径是从动态图出发，框架可以在运行过程中自动的JIT，无需用户用修饰符指定，主要的关键技术就是Lazy Tensor；另外一条路径是从静态图出发，在编译过程中如果发现有不支持的语法，可以保留到运行时进行fallback到python，主要的关键技术就是jit 
fallback。",{"type":18,"tag":82,"props":230,"children":232},{"id":231},"动静统一的关键技术",[233],{"type":24,"value":231},{"type":18,"tag":26,"props":235,"children":236},{},[237],{"type":18,"tag":92,"props":238,"children":239},{},[240],{"type":24,"value":241},"1、LazyTensor",{"type":18,"tag":26,"props":243,"children":244},{},[245],{"type":24,"value":246},"LazyTensor思想也比较简单，该方法建立在动态图的异步执行基础上，遇到tensor的操作（主要是算子），如果不需要查看tensor的具体内容，框架可以把这算子缓存下来，不发到device执行，从深度学习的范式看，大概率会缓存一大串op的执行序列，缓存到一定程度，可以把这个序列进行JIT编译优化，然后通过图的方式执行。",{"type":18,"tag":26,"props":248,"children":249},{},[250],{"type":18,"tag":251,"props":252,"children":255},"img",{"alt":253,"src":254},"1.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202110/29/090757myv85yn364krrdsq.jpg",[],{"type":18,"tag":26,"props":257,"children":258},{},[259],{"type":24,"value":260},"当然这里tensor相关的api需要分为两类，一种是可以被表达成IR图的API以及另一种无法被表达成IR的API。任意一个返回一个或多个Tensor的API，都是可以被转换成IR图，而返回非Tensor类型的API则无法被转换成IR图。这样Lazy Tensor机制就可以根据这个标识来确定哪些op序列可以进行JIT优化。",{"type":18,"tag":26,"props":262,"children":263},{},[264],{"type":24,"value":265},"Lazy Tensor机制本质是tracing 
based的静态图的一个演进，相当于自动打装饰符。",{"type":18,"tag":26,"props":267,"children":268},{},[269],{"type":24,"value":270},"LazyTensor机制的好处：",{"type":18,"tag":108,"props":272,"children":273},{},[274,279],{"type":18,"tag":112,"props":275,"children":276},{},[277],{"type":24,"value":278},"用户无感，易用性好",{"type":18,"tag":112,"props":280,"children":281},{},[282],{"type":24,"value":283},"语法限制少，理论上只要找到合适的tensor操作序列都可以进行加速。",{"type":18,"tag":26,"props":285,"children":286},{},[287],{"type":24,"value":288},"LazyTensor机制的局限：",{"type":18,"tag":108,"props":290,"children":291},{},[292,297],{"type":18,"tag":112,"props":293,"children":294},{},[295],{"type":24,"value":296},"JIT编译的开销比较大，所以最好是首次的JIT编译结果能够缓存，方便下次重复使用，用在静态网络下比较合适。",{"type":18,"tag":112,"props":298,"children":299},{},[300],{"type":24,"value":301},"动态情况下，比如动态shape/控制流，每次变化都会触发新的编译，从性能上看得不偿失。",{"type":18,"tag":26,"props":303,"children":304},{},[305],{"type":24,"value":306},"LazyTensor的细节可以参考下面的论文分析，同时国内清华大学的计图框架在这一块已经做了非常有意思的尝试。",{"type":18,"tag":26,"props":308,"children":309},{},[310],{"type":18,"tag":37,"props":311,"children":314},{"href":312,"rel":313},"https://zhuanlan.zhihu.com/p/383547872",[41],[315],{"type":24,"value":316},"论文分析-动静态图结合的一种方案：LazyTensor-combining eager execution with domain-specific Compiler",{"type":18,"tag":26,"props":318,"children":319},{},[320],{"type":18,"tag":92,"props":321,"children":322},{},[323],{"type":24,"value":324},"2、JIT fallback",{"type":18,"tag":26,"props":326,"children":327},{},[328],{"type":24,"value":329},"JIT fallback是用静态图的角度出发来考虑动静图的统一，希望静态图能够尽量多的支持动态图的语法，其思路借鉴了传统JIT编译的fallback的思路，传统的JIT编译经常会通过profiling信息，对函数进行多态类型、value推导、分支调度等优化，同时设置guard条件，一旦guard条件发现情况有变，可以去JIT优化，回到原来未优化的函数进行解释执行。",{"type":18,"tag":26,"props":331,"children":332},{},[333],{"type":24,"value":334},"对于AI框架的JIT fallback没有那么复杂，在静态图编译的时候（一般JIT fallback是基于ast 
based的静态图），如果发现是编译器不支持的Python语法，可以把相关语句保留下来，生成解释节点，然后在后面的处理中，fallback到python去执行相关的语句，从而实现相关语法的支持，这里有几个难点：",{"type":18,"tag":26,"props":336,"children":337},{},[338],{"type":24,"value":339},"1、不支持的Python语法的识别",{"type":18,"tag":26,"props":341,"children":342},{},[343],{"type":24,"value":344},"2、解释节点的推导和执行，解释节点有两个运行时机：首先是编译期的推导，一般而言，解释节点尽量在这个时机实际执行；其次是运行期的执行。",{"type":18,"tag":82,"props":346,"children":348},{"id":347},"mindspore的实践",[349],{"type":24,"value":350},"MindSpore的实践",{"type":18,"tag":26,"props":352,"children":353},{},[354],{"type":24,"value":355},"MindSpore目前已经支持动静结合的方式，正在完善动静统一的方式。",{"type":18,"tag":26,"props":357,"children":358},{},[359],{"type":24,"value":360},"动静结合的方式：主要是通过ms_function这个装饰符来实现。",{"type":18,"tag":26,"props":362,"children":363},{},[364],{"type":24,"value":365},"动静统一：MindSpore可以通过set_context来实现动静态图的一键式切换，但是不可否认当前确实存在不少动态图无法转到静态图的语法，MindSpore正在做JIT fallback的工作，预计1.6版本先实现编译推导期的fallback。至于LazyTensor的方式，目前看还很难解决动态的问题，还需要进一步探索。",{"title":7,"searchDepth":367,"depth":367,"links":368},4,[369,371,372],{"id":84,"depth":370,"text":87},2,{"id":231,"depth":370,"text":231},{"id":347,"depth":370,"text":350},"markdown","content:technology-blogs:zh:766.md","content","technology-blogs/zh/766.md","technology-blogs/zh/766","md",1776506140832]