[{"data":1,"prerenderedAt":460},["ShallowReactive",2],{"content-query-JMmHIHxe2n":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":454,"_id":455,"_source":456,"_file":457,"_stem":458,"_extension":459},"/technology-blogs/zh/386","zh",false,"","MindSpore大V博文之高阶优化器系列（二）","这篇跟大家分享MindSpore自研优化器THOR，该优化器在ImageNet上训练ResNet50，使用MindSpore+8 Ascend 910 仅需66.7分钟，当使用256节点时仅需2.7分钟！该优化器已完成论文投稿，被AAAI2020接受，后续会把论文贴出来。","2021-01-26","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/01/26/b53340f674d34c51a4380b77fe91263b.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":451},"root",[17,25,31,44,55,60,70,75,106,117,124,147,152,157,192,199,222,229,236,241,258,263,274,279,286,302,325,330,337,342,347,354,359,364,371,376,381,393,405,410,422,434,439],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore大v博文之高阶优化器系列二",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"作者：于璠",{"type":18,"tag":26,"props":32,"children":33},{},[34,36],{"type":24,"value":35},"作者主页：",{"type":18,"tag":37,"props":38,"children":42},"a",{"href":39,"rel":40},"https://www.zhihu.com/people/yu-fan-42-9",[41],"nofollow",[43],{"type":24,"value":39},{"type":18,"tag":26,"props":45,"children":46},{},[47,49],{"type":24,"value":48},"原文链接：",{"type":18,"tag":37,"props":50,"children":53},{"href":51,"rel":52},"https://zhuanlan.zhihu.com/p/345045119",[41],[54],{"type":24,"value":51},{"type":18,"tag":26,"props":56,"children":57},{},[58],{"type":24,"value":59},"上篇文章对深度学习训练中的优化器做了背景介绍，链接戳这里。",{"type":18,"tag":26,"props":61,"children":62},{},[63],{"type":18,"tag":37,"props":64,"children":67},{"href":65,"rel":66},"https://zhuanlan.zhihu.com/p/337187451",[41],[68],{"type":24,"value":69},"于璠：MindSpore高阶优化器系列（1）zhuanlan.zhihu.com",{"type":18,"tag":26,"props":71,"children":72},{},[73],{"type":24,"value":74},"这篇跟大家分享Mind
Spore自研优化器THOR(Trace-based Hardware-driven layer-ORiented Natural Gradient Descent Computation)，该优化器在ImageNet上训练ResNet50，使用MindSpore+8 Ascend 910 仅需66.7分钟，当使用256节点时仅需2.7分钟！该优化器已完成论文投稿，被AAAI2020接受，后续会把论文贴出来。",{"type":18,"tag":26,"props":76,"children":77},{},[78,80,86,88,92,94,98,100,104],{"type":24,"value":79},"上一篇中我们已经介绍过一二阶优化器，其中二阶优化器与一阶优化器相比收敛速度更快，但缺点是二阶信息矩阵求逆复杂度高，为",{"type":18,"tag":81,"props":82,"children":85},"img",{"alt":83,"src":84},"image.png","https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225131x3a1hmzfx4byjuar.png",[],{"type":24,"value":87}," , 其中 n 为二阶信息矩阵维度，当模型参数量为",{"type":18,"tag":81,"props":89,"children":91},{"alt":83,"src":90},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225152bjjdnhqrkgblksae.png",[],{"type":24,"value":93},"时，对应的二阶信息矩阵的大小为",{"type":18,"tag":81,"props":95,"children":97},{"alt":83,"src":96},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225234erj8th2gkgwbymgq.png",[],{"type":24,"value":99}," 。在深度学习模型中,",{"type":18,"tag":81,"props":101,"children":103},{"alt":83,"src":102},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225253vlp0cxzg43wi8iyv.png",[],{"type":24,"value":105}," 常常在数百万的量级，此时二阶信息矩阵的逆无法计算。因此如何降低二阶信息矩阵求逆的计算复杂度成为关键问题。",{"type":18,"tag":26,"props":107,"children":108},{},[109,111,115],{"type":24,"value":110},"MindSpore针对该问题，提出了自研算法THOR，该算法是基于自然梯度法，对Fisher矩阵做了近似，自然梯度法中的",{"type":18,"tag":81,"props":112,"children":114},{"alt":83,"src":113},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225315sfperj5kg53jsicq.png",[],{"type":24,"value":116}," 
矩阵可以表示为：",{"type":18,"tag":26,"props":118,"children":119},{},[120],{"type":18,"tag":81,"props":121,"children":123},{"alt":83,"src":122},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/2253535k32c4edascadwyf.png",[],{"type":18,"tag":26,"props":125,"children":126},{},[127,129,133,135,139,141,145],{"type":24,"value":128},"其中",{"type":18,"tag":81,"props":130,"children":132},{"alt":83,"src":131},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225510cbzfvqefladqvthq.png",[],{"type":24,"value":134}," 是网络模型的预测分布，",{"type":18,"tag":81,"props":136,"children":138},{"alt":83,"src":137},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225533sutlbf5p5ygtmixv.png",[],{"type":24,"value":140}," 是其概率密度，",{"type":18,"tag":81,"props":142,"children":144},{"alt":83,"src":143},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225550funlbzux8qz9ylin.png",[],{"type":24,"value":146}," 是需要网络模型的参数。",{"type":18,"tag":26,"props":148,"children":149},{},[150],{"type":24,"value":151},"那THOR主要做了哪些改进呢，我们一起来看一下：",{"type":18,"tag":26,"props":153,"children":154},{},[155],{"type":24,"value":156},"1. 
降低二阶信息矩阵更新频率",{"type":18,"tag":26,"props":158,"children":159},{},[160,162,166,168,172,174,178,180,184,186,190],{"type":24,"value":161},"通过实验观察",{"type":18,"tag":81,"props":163,"children":165},{"alt":83,"src":164},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225612qzcx1hntpxfdohbg.png",[],{"type":24,"value":167},"矩阵的F范数（Frobenius norm）在前期变化剧烈，后期逐渐变稳定，从而假设",{"type":18,"tag":81,"props":169,"children":171},{"alt":83,"src":170},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225708wikbkfk15u2x0ibe.png",[],{"type":24,"value":173}," 是一个马尔可夫过程，可以收敛到一个稳态分布π，其中",{"type":18,"tag":81,"props":175,"children":177},{"alt":83,"src":176},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225725tlspu5tt78r3a0zs.png",[],{"type":24,"value":179}," 代表第k个迭代时的",{"type":18,"tag":81,"props":181,"children":183},{"alt":83,"src":182},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225740fihalqnqofuydpee.png",[],{"type":24,"value":185},"矩阵。因此，在训练过程中逐步增大",{"type":18,"tag":81,"props":187,"children":189},{"alt":83,"src":188},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225753iq7m2guy8rr2kr4s.png",[],{"type":24,"value":191},"矩阵的更新间隔，可以在不影响收敛速度的情况下，减少训练时间。例如在ResNet50中，更新间隔步数随着训练的进行越来越大，到后期每个epoch只需更新一次二阶信息矩阵，如下图所示。",{"type":18,"tag":26,"props":193,"children":194},{},[195],{"type":18,"tag":81,"props":196,"children":198},{"alt":83,"src":197},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225832cjsijkjwjqjmsyea.png",[],{"type":18,"tag":26,"props":200,"children":201},{},[202,204,208,210,214,216,220],{"type":24,"value":203},"THOR受KFAC启发，将",{"type":18,"tag":81,"props":205,"children":207},{"alt":83,"src":206},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225911nvgaindk22ztwxik.png",[],{"type":24,"value":209},"矩阵按层解耦来降低矩阵
复杂度，分别针对每一层的",{"type":18,"tag":81,"props":211,"children":213},{"alt":83,"src":212},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225919ai9eieipnydksinm.png",[],{"type":24,"value":215},"矩阵做实验，发现有些层的",{"type":18,"tag":81,"props":217,"children":219},{"alt":83,"src":218},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225924kpxirbtnw4znyi9u.png",[],{"type":24,"value":221},"矩阵趋于稳态的速度更快，因此在统一的更新间隔上，更加细粒度的去调整每一层的更新频率。THOR使用矩阵的迹作为判断条件，当迹的变化情况大于某一阈值 时更新该层的二阶信息矩阵，否则沿用上一个迭代的二阶信息矩阵，并且引入了停止更新机制，当迹的变化量小于某个阈值 时停止更新该层二阶信息矩阵，且 ，具体更新公式如下：",{"type":18,"tag":26,"props":223,"children":224},{},[225],{"type":18,"tag":81,"props":226,"children":228},{"alt":83,"src":227},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225942jqzcdrdasuiqtwyk.png",[],{"type":18,"tag":26,"props":230,"children":231},{},[232],{"type":18,"tag":81,"props":233,"children":235},{"alt":83,"src":234},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/225957nyxm8mpvfpknyfc2.png",[],{"type":18,"tag":26,"props":237,"children":238},{},[239],{"type":24,"value":240},"2. 
硬件感知矩阵切分",{"type":18,"tag":26,"props":242,"children":243},{},[244,246,250,252,256],{"type":24,"value":245},"THOR在将",{"type":18,"tag":81,"props":247,"children":249},{"alt":83,"src":248},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/230029zegwa0da5xxc4e0d.png",[],{"type":24,"value":251},"矩阵按层解耦的基础上，进一步假设每个网络层中的输入和输出块之间也是独立的，例如将每层网络的输入输出切分为n个块，这n个块之间即是独立的，根据该假设对二阶信息矩阵做进一步的切分，从而提高了计算效率。THOR结合矩阵信息损失数据和矩阵性能数据确定了矩阵分块维度，从而大大提升",{"type":18,"tag":81,"props":253,"children":255},{"alt":83,"src":254},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/230034ajnx6yix5diudkco.png",[],{"type":24,"value":257},"矩阵求逆时间。",{"type":18,"tag":26,"props":259,"children":260},{},[261],{"type":24,"value":262},"那么如何确定矩阵分块维度的呢。具体方法为：",{"type":18,"tag":26,"props":264,"children":265},{},[266,268,272],{"type":24,"value":267},"（1）根据",{"type":18,"tag":81,"props":269,"children":271},{"alt":83,"src":270},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/230055knluemrasvf95m6u.png",[],{"type":24,"value":273},"矩阵中维度最大的那一层，确定矩阵切分维度，拿ResNet-50举例，网络层中的最大维度为2048，确定矩阵切分维度为[1,16,32,64,128,256,512,1024,2048]；",{"type":18,"tag":26,"props":275,"children":276},{},[277],{"type":24,"value":278},"（2）根据确定的矩阵维度，根据谱范数计算每个维度下的矩阵损失，具体公式为",{"type":18,"tag":26,"props":280,"children":281},{},[282],{"type":18,"tag":81,"props":283,"children":285},{"alt":83,"src":284},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/230202yywvn5c64ldtfuld.png",[],{"type":18,"tag":26,"props":287,"children":288},{},[289,290,294,296,300],{"type":24,"value":128},{"type":18,"tag":81,"props":291,"children":293},{"alt":83,"src":292},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/230238ps02ngxyqempjpao.png",[],{"type":24,"value":295}," 表示矩阵 X 的最大特征值， A 
表示原始未分割矩阵，",{"type":18,"tag":81,"props":297,"children":299},{"alt":83,"src":298},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/230257mwlim6fhua2nj7cb.png",[],{"type":24,"value":301}," 表示分割后的矩阵。然后统计在该维度下损失小于1%的矩阵数量，最后通过除以总的矩阵数量得到标准化后的矩阵损失信息。",{"type":18,"tag":26,"props":303,"children":304},{},[305,307,311,313,317,319,323],{"type":24,"value":306},"（3）根据确定的矩阵维度，计算每个维度下的矩阵求逆时间，再通过公式",{"type":18,"tag":81,"props":308,"children":310},{"alt":83,"src":309},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/230316sbivzsirlyhptdc2.png",[],{"type":24,"value":312}," 得到每个维度下标准化后性能数据，其中",{"type":18,"tag":81,"props":314,"children":316},{"alt":83,"src":315},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/230332hi4nivefd09ltdqz.png",[],{"type":24,"value":318}," 表示维度最小的矩阵的性能数据，",{"type":18,"tag":81,"props":320,"children":322},{"alt":83,"src":321},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/230359mrvdkmvkwzschbaw.png",[],{"type":24,"value":324},"表示第n个维度下的性能数据。",{"type":18,"tag":26,"props":326,"children":327},{},[328],{"type":24,"value":329},"（4）根据标准化后的矩阵损失信息和标准化后的性能数据绘图，如以ResNet50为例，可得到下图，图中交叉点为106，与128最接近，最后确定矩阵切分维度为128。",{"type":18,"tag":26,"props":331,"children":332},{},[333],{"type":18,"tag":81,"props":334,"children":336},{"alt":83,"src":335},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/230424t8bbf2b3fogkcqfs.png",[],{"type":18,"tag":26,"props":338,"children":339},{},[340],{"type":24,"value":341},"3. 
实验结果",{"type":18,"tag":26,"props":343,"children":344},{},[345],{"type":24,"value":346},"下图展示了THOR在ResNet50+ImageNet，batchsize为256时一二阶上的训练曲线图。",{"type":18,"tag":26,"props":348,"children":349},{},[350],{"type":18,"tag":81,"props":351,"children":353},{"alt":83,"src":352},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/2304512mnf9du9ktzwgxwc.png",[],{"type":18,"tag":26,"props":355,"children":356},{},[357],{"type":24,"value":358},"图中的THOR，THOR_stop_，_THOR_NT分别表示 ，从图中可以看到THOR收敛所需迭代数大约是一阶的一半，且单step的时间与一阶相差也不大。相比一阶算法需要117min，二阶优化器端到端时间提速约40%。",{"type":18,"tag":26,"props":360,"children":361},{},[362],{"type":24,"value":363},"THOR还测试了在不同batchsize下ResNet50+ImageNet的收敛结果，结果见下表，当batchsize为8192，使用256块Ascend 910时，只需2.7分钟精度即可收敛到75.9%，该结果在业界也是非常有竞争力的。MindSpore团队还会将THOR进一步应用到NLP领域中，如Bert和GPT-3，到时候再跟大家分享THOR在NLP任务上的表现。",{"type":18,"tag":26,"props":365,"children":366},{},[367],{"type":18,"tag":81,"props":368,"children":370},{"alt":83,"src":369},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202101/18/230615ftzyslmerwnjonja.png",[],{"type":18,"tag":26,"props":372,"children":373},{},[374],{"type":24,"value":375},"最后做个预告，下一篇跟大家分享下实操经验：如何在MindSpore上使用THOR训练模型？",{"type":18,"tag":26,"props":377,"children":378},{},[379],{"type":24,"value":380},"例行广告时间：）",{"type":18,"tag":26,"props":382,"children":383},{},[384,386],{"type":24,"value":385},"MindSpore官网：",{"type":18,"tag":37,"props":387,"children":390},{"href":388,"rel":389},"https://link.zhihu.com/?target=https://www.mindspore.cn/",[41],[391],{"type":24,"value":392},"https://www.mindspore.cn/",{"type":18,"tag":26,"props":394,"children":395},{},[396,398],{"type":24,"value":397},"MindSpore论坛：",{"type":18,"tag":37,"props":399,"children":402},{"href":400,"rel":401},"https://link.zhihu.com/?target=https://bbs.huaweicloud.com/forum/forum-1076-1.html",[41],[403],{"type":24,"value":404},"https://bbs.huaweicloud.com/forum/forum-1076-1.html",{"type":18,"tag":26,"props":406,"children":407},{
},[408],{"type":24,"value":409},"代码仓地址：",{"type":18,"tag":26,"props":411,"children":412},{},[413,415],{"type":24,"value":414},"Gitee-",{"type":18,"tag":37,"props":416,"children":419},{"href":417,"rel":418},"https://link.zhihu.com/?target=https://gitee.com/mindspore/mindspore",[41],[420],{"type":24,"value":421},"https://gitee.com/mindspore/mindspore",{"type":18,"tag":26,"props":423,"children":424},{},[425,427],{"type":24,"value":426},"GitHub-",{"type":18,"tag":37,"props":428,"children":431},{"href":429,"rel":430},"https://link.zhihu.com/?target=https://github.com/mindspore-ai/mindspore",[41],[432],{"type":24,"value":433},"https://github.com/mindspore-ai/mindspore",{"type":18,"tag":26,"props":435,"children":436},{},[437],{"type":24,"value":438},"官方QQ群: 871543426",{"type":18,"tag":26,"props":440,"children":441},{},[442,449],{"type":18,"tag":37,"props":443,"children":446},{"href":444,"rel":445},"https://link.zhihu.com/?target=http://weixin.qq.com/r/-0hlfbPEDu5xrfce9x3t",[41],[447],{"type":24,"value":448},"http://weixin.qq.com/r/-0hlfbPEDu5xrfce9x3t",{"type":24,"value":450}," (二维码自动识别)",{"title":7,"searchDepth":452,"depth":452,"links":453},4,[],"markdown","content:technology-blogs:zh:386.md","content","technology-blogs/zh/386.md","technology-blogs/zh/386","md",1776506136244]