[{"data":1,"prerenderedAt":404},["ShallowReactive",2],{"content-query-6FReIMk7yg":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":398,"_id":399,"_source":400,"_file":401,"_stem":402,"_extension":403},"/technology-blogs/zh/571","zh",false,"","大V博文系列：MLSys 2021论文分析7-《Theoretic Framework for Automatic Parallelization in Multi-core Systems》","单台服务器上多处理器的并行","2021-05-26","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/26/3eed882f765d4be892d76680f18e323e.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":388},"root",[17,25,28,34,47,58,71,76,88,93,103,112,123,131,136,144,152,160,173,188,198,211,221,229,239,247,260,270,281,289,299,310,325,338,350,358,374],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"大v博文系列mlsys-2021论文分析7-theoretic-framework-for-automatic-parallelization-in-multi-core-systems",[23],{"type":24,"value":8},"text",{"type":18,"tag":19,"props":26,"children":27},{"id":7},[],{"type":18,"tag":29,"props":30,"children":31},"p",{},[32],{"type":24,"value":33},"作者：金雪锋",{"type":18,"tag":29,"props":35,"children":36},{},[37,39],{"type":24,"value":38},"作者主页：",{"type":18,"tag":40,"props":41,"children":45},"a",{"href":42,"rel":43},"https://www.zhihu.com/people/jin-xue-feng",[44],"nofollow",[46],{"type":24,"value":42},{"type":18,"tag":29,"props":48,"children":49},{},[50,52],{"type":24,"value":51},"文章来源：",{"type":18,"tag":40,"props":53,"children":56},{"href":54,"rel":55},"https://zhuanlan.zhihu.com/p/375387650",[44],[57],{"type":24,"value":54},{"type":18,"tag":29,"props":59,"children":60},{},[61,63,69],{"type":24,"value":62},"本次小伙伴带来《",{"type":18,"tag":64,"props":65,"children":66},"strong",{},[67],{"type":24,"value":68},"A Distributed Graph-Theoretic Framework for Automatic Parallelization in Multi-core 
Systems",{"type":24,"value":70},"》论文分析。",{"type":18,"tag":29,"props":72,"children":73},{},[74],{"type":24,"value":75},"论文作者：Guixiang Ma(Intel), Yao Xiao(USC), Theodore L. Willke(Intel), Nesreen K. Ahmed(Intel), Shahin Nazarian(USC), Paul Bogdan(USC)",{"type":18,"tag":29,"props":77,"children":78},{},[79,81],{"type":24,"value":80},"论文链接：",{"type":18,"tag":40,"props":82,"children":85},{"href":83,"rel":84},"https://link.zhihu.com/?target=https%3A//proceedings.mlsys.org/paper/2021/file/a5e00132373a7031000fd987a3c9f87b-Paper.pdf",[44],[86],{"type":24,"value":87},"https://proceedings.mlsys.org/paper/2021/file/a5e00132373a7031000fd987a3c9f87b-Paper.pdf",{"type":18,"tag":29,"props":89,"children":90},{},[91],{"type":24,"value":92},"论文talk：",{"type":18,"tag":29,"props":94,"children":95},{},[96],{"type":18,"tag":40,"props":97,"children":100},{"href":98,"rel":99},"https://link.zhihu.com/?target=https%3A//slideslive.com/38952733/oral-a-distributed-graphtheoretic-framework-for-automatic-parallelization-in-multicore-systems%3Fref%3Dsearch",[44],[101],{"type":24,"value":102},"https://slideslive.com/38952733/oral-a-distributed-graphtheoretic-framework-for-automatic-parallelization-in-multicore-systems?ref=search",{"type":18,"tag":104,"props":105,"children":107},{"id":106},"背景与动机",[108],{"type":18,"tag":64,"props":109,"children":110},{},[111],{"type":24,"value":106},{"type":18,"tag":29,"props":113,"children":114},{},[115,117,121],{"type":24,"value":116},"一方面越来越强的多核处理器被设计出来，另一方面机器学习和大数据分析类任务也越来越复杂。如何将复杂的应用自动化地部署到多核处理器上，进而发挥出硬件的能力成为了关键问题。本文关注于",{"type":18,"tag":64,"props":118,"children":119},{},[120],{"type":24,"value":9},{"type":24,"value":122},"，即如何将编译得到的计算图分割成多个clusters，进而将clusters映射到处理器上，以期得到更优的并行执行效率。",{"type":18,"tag":104,"props":124,"children":126},{"id":125},"目标",[127],{"type":18,"tag":64,"props":128,"children":129},{},[130],{"type":24,"value":125},{"type":1
8,"tag":29,"props":132,"children":133},{},[134],{"type":24,"value":135},"本工作考虑的是指令级别(instruction-level)的并行粒度，即将不同的指令分配到不同的处理器上执行。经过对多种类型应用的分析，发现编译后的LLVM IR图具有power-law degree distribution特性，也就是：计算图中只有少数几个点的度数较高，其余点的度数很低。因此，本工作的目标是针对这种power-law degree distribution graph切分图，使得切分后的各个cluster (instructions的集合)拥有balanced workloads，同时跨cluster之间的通信少。",{"type":18,"tag":104,"props":137,"children":139},{"id":138},"问题建模及解决方案",[140],{"type":18,"tag":64,"props":141,"children":142},{},[143],{"type":24,"value":138},{"type":18,"tag":29,"props":145,"children":146},{},[147],{"type":18,"tag":148,"props":149,"children":151},"img",{"alt":7,"src":150},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/26/644873e7a2124c68a5e8ff6abb193a2b.png",[],{"type":18,"tag":29,"props":153,"children":154},{},[155],{"type":18,"tag":64,"props":156,"children":157},{},[158],{"type":24,"value":159},"如上图所示，提出的方案大致分三个步骤：1) 构造 LLVM IR graph；2) 图切分；3) 切分后的clusters映射到多处理器上。",{"type":18,"tag":29,"props":161,"children":162},{},[163],{"type":18,"tag":64,"props":164,"children":165},{},[166,171],{"type":18,"tag":64,"props":167,"children":168},{},[169],{"type":24,"value":170},"构造LLVM IR graph",{"type":24,"value":172}," (V, E, W)，其中V表示点的集合，每个点表示一个指令(instruction)；E是边的集合，每条边表示两条指令的数据依赖关系；W是边的weight，表示的是每条边对应的数据传输所需的内存操作的时间。",{"type":18,"tag":29,"props":174,"children":175},{},[176,181,183],{"type":18,"tag":64,"props":177,"children":178},{},[179],{"type":24,"value":180},"如何从IR文件中构造图呢？本工作从前端编译器编译的IR图出发，首先在IR文件中增加一些输出打印信息，用于测量内存操作的时间，进而将IR文件放到",{"type":24,"value":182},"后端执行",{"type":18,"tag":64,"props":184,"children":185},{},[186],{"type":24,"value":187},"以得到每个内存操作的真实时间。图当中的边是通过分析IR文件中指令的源寄存器和目的寄存器地址得来的： 
如本指令从寄存器A中读数据，而上一条指令向寄存器A中写数据，那么就将这两条指令对应的点创建一条边。下图展现的是图构造的过程。",{"type":18,"tag":29,"props":189,"children":190},{},[191],{"type":18,"tag":64,"props":192,"children":193},{},[194],{"type":18,"tag":148,"props":195,"children":197},{"alt":7,"src":196},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/26/ef4ff6b7fb16432fa7d5e3065a59446a.jpg",[],{"type":18,"tag":29,"props":199,"children":200},{},[201],{"type":18,"tag":64,"props":202,"children":203},{},[204,209],{"type":18,"tag":64,"props":205,"children":206},{},[207],{"type":24,"value":208},"图切分",{"type":24,"value":210},"。在构造好graph后，选择合适的图切分方式是核心问题。这时，有两种基本的图切分方式，边切分(Edge-Cut)和点切分(Vertex-Cut)。如下图所示，边切分中跨cluster的边会产生通信，而点切分中被切分的点在不同cluster中都有replica，通信来源于replica之间的同步信息。由于本工作的目标是最小化跨cluster的通信，且保持各个cluster之间的workload balanced，点切分相比于边切分是更好的方式。",{"type":18,"tag":29,"props":212,"children":213},{},[214],{"type":18,"tag":64,"props":215,"children":216},{},[217],{"type":18,"tag":148,"props":218,"children":220},{"alt":7,"src":219},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/26/7434134cd9714e04aa9ce90c754c9849.jpg",[],{"type":18,"tag":29,"props":222,"children":223},{},[224],{"type":18,"tag":64,"props":225,"children":226},{},[227],{"type":24,"value":228},"选定点切分的方式后，问题的目标就变成了如下优化问题，其中A(v)表示的是每个点v被切分到的cluster的集合，M(e) = m表示的是边e被分配到了cluster m中。公式(2)表示的是跨cluster的通信最小化。公式(3)表示的是每个cluster分得的边的权重的和(workload)是balanced，其中lambda是imbalance factor。",{"type":18,"tag":29,"props":230,"children":231},{},[232],{"type":18,"tag":64,"props":233,"children":234},{},[235],{"type":18,"tag":148,"props":236,"children":238},{"alt":7,"src":237},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/26/ab77d77440b6403c940fc07c0cda4bcd.png",[],{"type":18,"tag":29,"props":240,"children":241},{},[242],{"type":18,"tag":64,"props":243,"children":244},{},[245],{"type":24,"value":246},"PowerGraph[1]算法将边分配到各个cluster中去，同时平衡各个cluster的边的数量；Libra[2]针对高度数的点附着的边优先处理。这两个工作都是针对非weighted 
graph的，本工作在PowerGraph和Libra的基础上给出了weighted graph的切分方式，分别称作Weighted PowerGraph和Weighted Libra；进一步，在分配边时，考虑到(3)的约束条件，又有Weighted-balanced PowerGraph和Weighted-balanced Libra。",{"type":18,"tag":29,"props":248,"children":249},{},[250],{"type":18,"tag":64,"props":251,"children":252},{},[253,258],{"type":18,"tag":64,"props":254,"children":255},{},[256],{"type":24,"value":257},"Cluster映射",{"type":24,"value":259},"。将切分好的cluster映射到处理器上，需要考虑多处理器的拓扑连接关系。本工作考虑的是NUMA架构：处理器对local memory有较快的访问速度，而对其他memory的访问速度较慢。为了获得较优的并行加速，同时考虑到跨cluster的通信和资源利用率，本工作提出了cluster到处理器映射的几个原则：多个cluster若操作于同一数据结构，那么它们应该被映射到同一核上；有通信的clusters应该被映射到相邻的处理器上；相互独立的clusters可以被映射到不同区域的处理器上；同时，为了避免很多clusters被映射到同一处理器上，给每个处理器设置了cluster数上限。下图是cluster映射的一个例子。",{"type":18,"tag":29,"props":261,"children":262},{},[263],{"type":18,"tag":64,"props":264,"children":265},{},[266],{"type":18,"tag":148,"props":267,"children":269},{"alt":7,"src":268},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/26/11e096e9399a4695a75169b48ff55bf6.jpg",[],{"type":18,"tag":104,"props":271,"children":273},{"id":272},"实验",[274],{"type":18,"tag":64,"props":275,"children":276},{},[277],{"type":18,"tag":64,"props":278,"children":279},{},[280],{"type":24,"value":272},{"type":18,"tag":29,"props":282,"children":283},{},[284],{"type":18,"tag":64,"props":285,"children":286},{},[287],{"type":24,"value":288},"本工作使用gem5模拟器实验，模拟了NUMA架构的不同配置。应用用例包括：图计算，傅里叶变换，CNN，矩阵计算等。衡量指标包括执行时间，通信代价等。提出的四种算法都优于SOTA；在大多数场景中，Weighted-balanced 
Libra是表现最好的算法。下表是节选的两个实验结果。",{"type":18,"tag":29,"props":290,"children":291},{},[292],{"type":18,"tag":64,"props":293,"children":294},{},[295],{"type":18,"tag":148,"props":296,"children":298},{"alt":7,"src":297},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/05/26/a881b6a10ed14a5096ca5f0f1c91bf51.jpg",[],{"type":18,"tag":104,"props":300,"children":302},{"id":301},"总结及comments",[303],{"type":18,"tag":64,"props":304,"children":305},{},[306],{"type":18,"tag":64,"props":307,"children":308},{},[309],{"type":24,"value":301},{"type":18,"tag":29,"props":311,"children":312},{},[313,318,320],{"type":18,"tag":64,"props":314,"children":315},{},[316],{"type":24,"value":317},"本文考虑了复杂应用在多处理器上自动并行的问题，提出了针对LLVM IR graph切分的指令级并行的方案。依赖于程序的",{"type":24,"value":319},"profiling",{"type":18,"tag":64,"props":321,"children":322},{},[323],{"type":24,"value":324},"获取数据操作的时间信息，进而对图进行点切分，在已有算法的基础上提出了针对weighted graph的切分方式，在模拟实验中验证了并行性能的提升。",{"type":18,"tag":29,"props":326,"children":327},{},[328],{"type":18,"tag":64,"props":329,"children":330},{},[331,336],{"type":18,"tag":64,"props":332,"children":333},{},[334],{"type":24,"value":335},"Comments",{"type":24,"value":337},"：1) 本工作并未在真实硬件环境上验证；2) 文章并未讨论关于多replica同步机制的问题，同步机制依赖于硬件的架构；3) 文章理论分析的(1-1/e)-approximation并不是提出的算法的近似度，而是naïve greedy algorithm针对submodular function with cardinality constraint，而本文要解决的问题与submodular function with cardinality constraint问题不同，导致(1-1/e)近似度在这里并不适用；4) 本文的基于profiling的方案在指令级别的图切分方式中有独特的优势，因为指令级别的数据操作涉及到的寄存器的读写时间相对稳定，而在非指令级图切分的方式中，profiling方案容易产生variance过大的问题，不稳定。同时，本文提到的点切分的方式，在非指令级的切分是否适用需视情况而定，因为replica的同步方式会很不一样。",{"type":18,"tag":104,"props":339,"children":341},{"id":340},"reference",[342],{"type":18,"tag":64,"props":343,"children":344},{},[345],{"type":18,"tag":64,"props":346,"children":347},{},[348],{"type":24,"value":349},"Reference",{"type":18,"tag":29,"props":351,"children":352},{},[353],{"type":18,"tag":64,"props":354,"children":355},{},[356],{"type":24,"value":357},"[1] Joseph E. 
Gonzalez, Yucheng Low, Haijie Gu, Danny Bickson, Carlos Guestrin:",{"type":18,"tag":29,"props":359,"children":360},{},[361],{"type":18,"tag":64,"props":362,"children":363},{},[364,366,372],{"type":24,"value":365},"PowerGraph: Distributed Graph-Parallel Computation on Natural Graphs. ",{"type":18,"tag":367,"props":368,"children":369},"em",{},[370],{"type":24,"value":371},"OSDI 2012",{"type":24,"value":373},".",{"type":18,"tag":29,"props":375,"children":376},{},[377],{"type":18,"tag":64,"props":378,"children":379},{},[380,382,387],{"type":24,"value":381},"[2] Cong Xie, Ling Yan, Wu-Jun Li, Zhihua Zhang: Distributed Power-law Graph Computing: Theoretical and Empirical Analysis. ",{"type":18,"tag":367,"props":383,"children":384},{},[385],{"type":24,"value":386},"NIPS 2014",{"type":24,"value":373},{"title":7,"searchDepth":389,"depth":389,"links":390},4,[391,393,394,395,396,397],{"id":106,"depth":392,"text":106},2,{"id":125,"depth":392,"text":125},{"id":138,"depth":392,"text":138},{"id":272,"depth":392,"text":272},{"id":301,"depth":392,"text":301},{"id":340,"depth":392,"text":349},"markdown","content:technology-blogs:zh:571.md","content","technology-blogs/zh/571.md","technology-blogs/zh/571","md",1776506137970]