[{"data":1,"prerenderedAt":352},["ShallowReactive",2],{"content-query-pvPlWnEcws":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":346,"_id":347,"_source":348,"_file":349,"_stem":350,"_extension":351},"/technology-blogs/zh/949","zh",false,"","MindSpore运行模式与PyNative内存调优分析","PyNative下内存全部是动态使用的","2022-01-13","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/01/17/9decb90a372d4064a3d80d9f51094382.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":343},"root",[17,25,35,40,55,60,68,73,123,131,136,188,196,208,232,240,249,258,263,271,285,294,309,319,333],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore运行模式与pynative内存调优分析",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":18,"tag":30,"props":31,"children":32},"strong",{},[33],{"type":24,"value":34},"MindSpore运行模式与内存",{"type":18,"tag":26,"props":36,"children":37},{},[38],{"type":24,"value":39},"MindSpore下有2种运行模式，一种是GRAPH模式（静态图），一种是PyNative模式（动态图）。 GRAPH下的内存使用分为2种：",{"type":18,"tag":41,"props":42,"children":43},"ol",{},[44,50],{"type":18,"tag":45,"props":46,"children":47},"li",{},[48],{"type":24,"value":49},"通过somas算法控制的内存，称为动态内存（Tensor使用的内存地址可以被其它Tensor使用），如算子的输出；",{"type":18,"tag":45,"props":51,"children":52},{},[53],{"type":24,"value":54},"从内存池申请的内存，称为静态内存（Tensor的内存地址不会被其它Tensor使用），如Weights Parameter， 
图的output等。",{"type":18,"tag":26,"props":56,"children":57},{},[58],{"type":24,"value":59},"PyNative模式下使用的内存均从内存池申请，算子运行完成后，如果输出不被算子反向图使用，那就可以释放。也就是说，PyNative下内存全部是动态使用的。",{"type":18,"tag":26,"props":61,"children":62},{},[63],{"type":18,"tag":30,"props":64,"children":65},{},[66],{"type":24,"value":67},"PyNative的内存问题分析",{"type":18,"tag":26,"props":69,"children":70},{},[71],{"type":24,"value":72},"如何提高PyNative下的内存使用率呢？可以从2个方面考虑。",{"type":18,"tag":41,"props":74,"children":75},{},[76,118],{"type":18,"tag":45,"props":77,"children":78},{},[79,81,86,88,93,97,99,104,106,109,113,116],{"type":24,"value":80},"内存池的设计。PyNative下的内存均是从内存池申请的，那么内存池的设计就直接关系到内存的利用率。MindSpore下内存池使用的是Best-Fit算法，优先按照大小去划分。具体是，先划分block（默认大小1GB），然后在block中划分buf块，Tensor使用的内存是buf块。那么，这里就存在2个问题。 – ",{"type":18,"tag":30,"props":82,"children":83},{},[84],{"type":24,"value":85},"block之间碎片",{"type":24,"value":87},"。默认按照1GB的大小去分配，如果是30GB的内存，一般情况下，肯定会有30个内存碎片。碎片与碎片的内存无法合并，也就无法被再利用，造成内存利用率低。如图1中，灰色的块表示碎片。 ",{"type":18,"tag":89,"props":90,"children":92},"img",{"alt":7,"src":91},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202201/07/190556xz3jwkh1czukghg8.png",[],{"type":18,"tag":94,"props":95,"children":96},"br",{},[],{"type":24,"value":98},"图1 block之间碎片 – ",{"type":18,"tag":30,"props":100,"children":101},{},[102],{"type":24,"value":103},"block内部碎片",{"type":24,"value":105},"。block内部各个buf所属的Tensor，它们的生命周期可能是不相同的，那么就有可能有的buf被回收是空闲的，有的buf还在使用中，那么buf之间的内存就不能被有效合并，也会出现很多碎片，造成利用率低。如图2中，灰色的块表示碎片。",{"type":18,"tag":94,"props":107,"children":108},{},[],{"type":18,"tag":89,"props":110,"children":112},{"alt":7,"src":111},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202201/07/190713yohkhfnkb1ff9zot.png",[],{"type":18,"tag":94,"props":114,"children":115},{},[],{"type":24,"value":117},"图2 
block内部碎片",{"type":18,"tag":45,"props":119,"children":120},{},[121],{"type":24,"value":122},"正向内存使用优化。算子正向过程的输出，如果不被反向图使用到，那么算子正向运行完成时，就可以释放；但是，如果正向输出被反向图使用到，那么就只能在反向算子运行完成后才能释放。如何确保在反向算子运行完成后及时释放？",{"type":18,"tag":26,"props":124,"children":125},{},[126],{"type":18,"tag":30,"props":127,"children":128},{},[129],{"type":24,"value":130},"PyNative内存优化手段",{"type":18,"tag":26,"props":132,"children":133},{},[134],{"type":24,"value":135},"通过以上分析，PyNative下的内存优化，具体可以从3个方面入手，",{"type":18,"tag":41,"props":137,"children":138},{},[139,155,178],{"type":18,"tag":45,"props":140,"children":141},{},[142,147,149,153],{"type":18,"tag":30,"props":143,"children":144},{},[145],{"type":24,"value":146},"优化block之间的碎片",{"type":24,"value":148},"。如果block只有一个，是不是就没有block之间的碎片了？通过在context中添加mempool_block_size，可以设置block的大小，例如，Device设备给内存池使用为29GB，PyNative下通过设置context.set_context(mempool_block_size=“29GB”)，那么内存池就只会有1个block了，也就不存在block之间的碎片了。如图3中，黄色块表示已经分配占用。 ",{"type":18,"tag":89,"props":150,"children":152},{"alt":7,"src":151},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202201/07/190635luw3ott79jcujc8x.png",[],{"type":24,"value":154},"图3 内存池只有1个block",{"type":18,"tag":45,"props":156,"children":157},{},[158,165,170,172,176],{"type":18,"tag":159,"props":160,"children":164},"a",{"href":161,"rel":162},"https://bbs.huaweicloud.com/forum/thread-177022-1-1.html#",[163],"nofollow",[],{"type":18,"tag":30,"props":166,"children":167},{},[168],{"type":24,"value":169},"优化block内部碎片",{"type":24,"value":171},"。将内存池划分为2类block。一类称为common block, 另一类称为persistent block。将图的输入，输出，Weights Parameter和ValueNode这些生命周期较长的Tensor划分到这个persistent block。common block被其它类型的内存申请使用，主要是各个算子的输出。如此一来，common block将大幅提高buf之间的合并成功率，图执行完一个step后，common block将完全有可能合并为一整块内存。如图4中，黄色块表示已经分配占用。",{"type":18,"tag":89,"props":173,"children":175},{"alt":7,"src":174},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202201/07/191422bjtbbdenj2kzconz.png",[],{"type":24,"value":177}," 图4 内存池区分为common block和persistent 
block",{"type":18,"tag":45,"props":179,"children":180},{},[181,186],{"type":18,"tag":30,"props":182,"children":183},{},[184],{"type":24,"value":185},"优化正向输出内存",{"type":24,"value":187},"。通过添加对正向输出的内存的所属Tensor的引用计数，当引用计数为0时，直接释放该内存。反向图在执行过程中，先遍历整图获取每个正向输出Tensor的引用计数，当反向算子执行完成后，检查Tensor的引用计数是否为0，如果是，就直接释放该Tensor的内存，不需要等到整个反向图执行完成后再释放。释放的内存被内存回收，可能会与其它空闲buf合并成大块内存，方便后续算子使用，这样内存利用率就提高了。",{"type":18,"tag":26,"props":189,"children":190},{},[191],{"type":18,"tag":30,"props":192,"children":193},{},[194],{"type":24,"value":195},"PyNative内存调试案例",{"type":18,"tag":26,"props":197,"children":198},{},[199,201,206],{"type":24,"value":200},"以上3个优化手段，第1个是需要用户设置的，后面2个已经在代码中添加优化，无需用户考虑。那么如何使用呢？ ",{"type":18,"tag":30,"props":202,"children":203},{},[204],{"type":24,"value":205},"注意：当前PyNative下的persistent block的默认大小是1GB，不需要设置。通过mempool_block_size接口设置的大小是common block的大小，该大小与Device设备给内存池的可用大小有关，结果取2个值的最小值。GPU和Ascend上调整可用内存大小接口为max_device_memory=“XXGB”",{"type":24,"value":207},"。如Ascend设备，总共大小为32GB，默认可以给内存池30GB，剩下的内存给HCCL组件或者算子运行时内存使用，如下示例：",{"type":18,"tag":209,"props":210,"children":211},"ul",{},[212,217,222,227],{"type":18,"tag":45,"props":213,"children":214},{},[215],{"type":24,"value":216},"context.set_context(mempool_block_size=“10GB”)，默认设备可用内存大小为30GB，去掉persistent block的1GB，取min(mempool_block_size, max_device_memory - 1GB)值，那么实际生效的common block大小就是10GB。",{"type":18,"tag":45,"props":218,"children":219},{},[220],{"type":24,"value":221},"context.set_context(mempool_block_size=“30GB”)，默认设备可用内存大小为30G，去掉persistent block的1GB，取min(mempool_block_size, max_device_memory - 1GB)值，那么实际生效的common block大小就是29GB。",{"type":18,"tag":45,"props":223,"children":224},{},[225],{"type":24,"value":226},"context.set_context(mempool_block_size=“30GB”, max_device_memory=“31GB”)，通过max_device_memory接口，提供给内存池的设备内存可用大小为31GB，去掉persistent block的1GB，取min(mempool_block_size, max_device_memory - 1GB)值，那么实际生效的common 
block大小就是30GB。",{"type":18,"tag":45,"props":228,"children":229},{},[230],{"type":24,"value":231},"context.set_context(mempool_block_size=“31GB”, max_device_memory=“31GB”)，通过max_device_memory接口，提供给内存池的设备内存可用大小为31GB，去掉persistent block的1GB，取min(mempool_block_size, max_device_memory - 1GB)值，那么实际生效的common block大小就是30GB。",{"type":18,"tag":26,"props":233,"children":234},{},[235],{"type":18,"tag":30,"props":236,"children":237},{},[238],{"type":24,"value":239},"通过shufflenetv2网络具体说明",{"type":18,"tag":26,"props":241,"children":242},{},[243,245],{"type":24,"value":244},"下载MindSpore models，进入official/cv/shufflenetv2目录。如下做了运行模式，batch size与epoch size修改： ",{"type":18,"tag":89,"props":246,"children":248},{"alt":7,"src":247},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202201/07/190908amjliqlwrqdv3pkm.png",[],{"type":18,"tag":26,"props":250,"children":251},{},[252,254],{"type":24,"value":253},"运行：bash run_standalone_train_for_gpu.sh /home/workspace/mindspore_dataset/ImageNet_Original/train; 目前该网络默认没有添加mempool_block_size的设置，是跑不通的，如下： ",{"type":18,"tag":89,"props":255,"children":257},{"alt":7,"src":256},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202201/07/190920rid1aoe45zo398tx.png",[],{"type":18,"tag":26,"props":259,"children":260},{},[261],{"type":24,"value":262},"可以看到，common block的大小是1GB，有19个；persistent block的大小是1GB，有1个。",{"type":18,"tag":26,"props":264,"children":265},{},[266],{"type":18,"tag":30,"props":267,"children":268},{},[269],{"type":24,"value":270},"解决步骤如下：",{"type":18,"tag":26,"props":272,"children":273},{},[274,279,281],{"type":18,"tag":30,"props":275,"children":276},{},[277],{"type":24,"value":278},"第一步",{"type":24,"value":280},"，先直接设置context.set_context(mempool_block_size=“29GB”)或者context.set_context(mempool_block_size=“30GB”)，通过上述分析，该2种设置方法生效的common block都是29GB，如下： 
",{"type":18,"tag":89,"props":282,"children":284},{"alt":7,"src":283},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202201/07/190933pr9l6jtrungt9qdu.png",[],{"type":18,"tag":26,"props":286,"children":287},{},[288,290],{"type":24,"value":289},"再次，运行测试结果如下： ",{"type":18,"tag":89,"props":291,"children":293},{"alt":7,"src":292},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202201/07/190947gghecdrkmvbqm4ay.png",[],{"type":18,"tag":26,"props":295,"children":296},{},[297],{"type":18,"tag":30,"props":298,"children":299},{},[300,302,307],{"type":24,"value":301},"PyNative能跑起来了，每一行“Run start cell id 140427040045680_”日志表示执行了一个step。",{"type":18,"tag":30,"props":303,"children":304},{},[305],{"type":24,"value":306},"第二步",{"type":24,"value":308},"，如果上述设置后，内存还是出现不足，那么就需要调整设备可用内存的大小，当前最大调整为31G，即设置：",{"type":18,"tag":26,"props":310,"children":311},{},[312],{"type":18,"tag":30,"props":313,"children":314},{},[315],{"type":18,"tag":89,"props":316,"children":318},{"alt":7,"src":317},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202201/07/191004dypxffdcilzylysa.png",[],{"type":18,"tag":26,"props":320,"children":321},{},[322],{"type":18,"tag":30,"props":323,"children":324},{},[325,327,331],{"type":24,"value":326},"这样生效的common block大小是30GB，运行结果会出现： ",{"type":18,"tag":89,"props":328,"children":330},{"alt":7,"src":329},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202201/07/191014n7qanqssbelg1qug.png",[],{"type":24,"value":332}," DataSetQueue报错内存不足，而且是在AllocDeviceMem时。但是，调用接口AllocDeviceMem时，先判断了current free memory 
size大小，不应该报错。那么这里报错，只能说明是该次内存的申请不是向内存池申请的，而是算子本身运行过程中需要的内存。既然这样，那就需要将内存池的大小改小，以便多些内存给其它场景使用。这里申请的内存大小是1849700352B，大小约为1.7GB，而当前只空闲1073741824B，大小是1GB，所以至少要多放出0.7GB的内存。那么，内存池的大小设置为29GB即可。",{"type":18,"tag":26,"props":334,"children":335},{},[336],{"type":18,"tag":30,"props":337,"children":338},{},[339],{"type":18,"tag":89,"props":340,"children":342},{"alt":7,"src":341},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202201/07/191053goj7jg19lfa4bldg.png",[],{"title":7,"searchDepth":344,"depth":344,"links":345},4,[],"markdown","content:technology-blogs:zh:949.md","content","technology-blogs/zh/949.md","technology-blogs/zh/949","md",1776506142291]