[{"data":1,"prerenderedAt":429},["ShallowReactive",2],{"content-query-7Bt3JMIEOH":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":423,"_id":424,"_source":425,"_file":426,"_stem":427,"_extension":428},"/technology-blogs/zh/1860","zh",false,"","【MindSpore易点通】精度问题定位经验总结","MindSpore中SGD优化器和Momentum优化器中的momentum参数与PyTorch中实现一致","2022-09-22","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/09/30/91b2c177f106479ba26fb1276bc6790a.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":411},"root",[17,25,31,37,42,47,52,58,63,73,78,86,91,107,113,118,132,137,142,148,153,158,172,177,191,196,201,206,212,217,229,234,245,250,255,261,266,271,276,281,286,291,296,308,313,318,324,337,343,348,353,366,371,376,387,393,406],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore易点通精度问题定位经验总结",[23],{"type":24,"value":8},"text",{"type":18,"tag":19,"props":26,"children":28},{"id":27},"训练准备阶段checklist",[29],{"type":24,"value":30},"训练准备阶段CheckList",{"type":18,"tag":32,"props":33,"children":34},"p",{},[35],{"type":24,"value":36},"首先需要",{"type":18,"tag":32,"props":38,"children":39},{},[40],{"type":24,"value":41},"1、了解MindSpore的基本信息",{"type":18,"tag":32,"props":43,"children":44},{},[45],{"type":24,"value":46},"2、准备用MindSpore进行脚本开发或者脚本已经开发完成",{"type":18,"tag":32,"props":48,"children":49},{},[50],{"type":24,"value":51},"为了保证网络训练过程正确，进行下一步训练之前，请做以下几点检查：",{"type":18,"tag":53,"props":54,"children":56},"h3",{"id":55},"网络定义",[57],{"type":24,"value":55},{"type":18,"tag":32,"props":59,"children":60},{},[61],{"type":24,"value":62},"1、注意MindSpore中SGD优化器和Momentum优化器中的momentum参数与PyTorch中实现一致。但是BN中的momentum参数与PyTorch不一致。具体关系描述如下： MindSpore中BatchNorm1d/BatchNorm2d的momentum参数(定义该参数的变量名称为momentum_ms)，该参数与PyTorch里BN的momentum参数(定义该参数的变量名称为momentum_py)的关系为：",{"type":18,"tag":64,"props":65,"children":67},"pre",{"code":66},"momentum_ms = 1 - 
momentum_py\n",[68],{"type":18,"tag":69,"props":70,"children":71},"code",{"__ignoreMap":7},[72],{"type":24,"value":66},{"type":18,"tag":32,"props":74,"children":75},{},[76],{"type":24,"value":77},"2、注意MindSpore中Dropout的keep_prob参数，与PyTorch里dropout的p参数不一致，其对应关系为：",{"type":18,"tag":64,"props":79,"children":81},{"code":80},"keep_prob = 1 - p\n",[82],{"type":18,"tag":69,"props":83,"children":84},{"__ignoreMap":7},[85],{"type":24,"value":80},{"type":18,"tag":53,"props":87,"children":89},{"id":88},"混合精度配置",[90],{"type":24,"value":88},{"type":18,"tag":32,"props":92,"children":93},{},[94,96,105],{"type":24,"value":95},"若您使用的是MindSpore+Ascend的组合进行训练，请注意Ascend为了加速训练，卷积只能使用FP16类型进行计算，因此您的训练过程默认就是混合精度。 为了保证训练过程正确，请在网络定义的时候配置混合精度级别，可参看",{"type":18,"tag":97,"props":98,"children":102},"a",{"href":99,"rel":100},"https://d5e26d2d-5f7b-43cf-8498-d2bebc602ba8.vscode-webview-test.com/vscode-resourcehttps://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file///c:/Users/hangtian/Downloads/MindBook%E7%AD%9B%E9%80%890803/MindBook_to_MS_tuomin/%E5%BC%80%E5%8F%91%E6%8C%87%E5%AF%BC%E6%89%8B%E5%86%8C/%E6%B7%B7%E5%90%88%E7%B2%BE%E5%BA%A6/%E6%B7%B7%E5%90%88%E7%B2%BE%E5%BA%A6.md",[101],"nofollow",[103],{"type":24,"value":104},"混合精度",{"type":24,"value":106},"中的使用方法。为了保证混合精度训练过程不溢出，需要配合使用Loss 
Scale功能。",{"type":18,"tag":19,"props":108,"children":110},{"id":109},"训练中精度checklist",[111],{"type":24,"value":112},"训练中精度CheckList",{"type":18,"tag":32,"props":114,"children":115},{},[116],{"type":24,"value":117},"训练中应该已经完成：",{"type":18,"tag":32,"props":119,"children":120},{},[121,123,130],{"type":24,"value":122},"1、",{"type":18,"tag":97,"props":124,"children":127},{"href":125,"rel":126},"https://6d8e22ed-185d-40dd-97ef-3a3b985934f7.vscode-webview-test.com/vscode-resourcehttps://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file///c:/Users/hangtian/Downloads/MindBook%E7%AD%9B%E9%80%890803/MindBook_to_MS_tuomin/%E5%BC%80%E5%8F%91%E6%8C%87%E5%AF%BC%E6%89%8B%E5%86%8C/%E7%B2%BE%E5%BA%A6%E8%B0%83%E4%BC%98/%E7%B2%BE%E5%BA%A6%E9%97%AE%E9%A2%98%E5%AE%9A%E4%BD%8D%E6%96%B9%E6%A1%88/%E8%AE%AD%E5%89%8D%E5%BF%85%E5%81%9A.md",[101],[128],{"type":24,"value":129},"训前必做",{"type":24,"value":131},"的检查事项",{"type":18,"tag":32,"props":133,"children":134},{},[135],{"type":24,"value":136},"2、开始进行MindSpore的训练，可以输出第一个Step的Loss值",{"type":18,"tag":32,"props":138,"children":139},{},[140],{"type":24,"value":141},"为了保证网络训练过程正确，进行多Epoch迭代之前，请按照顺序做以下几点检查。",{"type":18,"tag":53,"props":143,"children":145},{"id":144},"检查1首个step的前向loss值是否正确",[146],{"type":24,"value":147},"检查1：首个Step的前向Loss值是否正确",{"type":18,"tag":32,"props":149,"children":150},{},[151],{"type":24,"value":152},"1、先做变量对齐",{"type":18,"tag":32,"props":154,"children":155},{},[156],{"type":24,"value":157},"结果：若变量对齐之后Loss值一致，说明是初始化问题或者脚本迁移有错误，修改初始化或者对齐脚本后解决。若Loss值仍不同，则转到2。",{"type":18,"tag":32,"props":159,"children":160},{},[161,163,170],{"type":24,"value":162},"2、使用Print算子在网络结构中二分法打印算子的正向输出，找到正向精度出现问题的算子。MindSpore的",{"type":18,"tag":97,"props":164,"children":167},{"href":165,"rel":166},"https://6d8e22ed-185d-40dd-97ef-3a3b985934f7.vscode-webview-test.com/vscode-resourcehttps://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file///c:/Users/hangtian/Downloads/MindBook%E7%AD%9B%E9%80%890803/MindBook_to_MS_tuomin/%E5%BC%80%E5%8F%91%E6
%8C%87%E5%AF%BC%E6%89%8B%E5%86%8C/%E7%B2%BE%E5%BA%A6%E8%B0%83%E4%BC%98/%E5%B8%B8%E7%94%A8%E5%AE%9A%E4%BD%8D%E6%96%B9%E6%B3%95/print%E7%AE%97%E5%AD%90%E4%BD%BF%E7%94%A8%E8%AF%B4%E6%98%8E.md",[101],[168],{"type":24,"value":169},"Print算子使用说明",{"type":24,"value":171},"，原始迁移脚本可以根据框架特点加入打印，例如PyTorch框架可以直接print节点的正向输出。",{"type":18,"tag":32,"props":173,"children":174},{},[175],{"type":24,"value":176},"结果：找到第一个精度不对的算子，转到3；若所有正向算子精度都一致，那么转到检查2。",{"type":18,"tag":32,"props":178,"children":179},{},[180,182,189],{"type":24,"value":181},"3、思考是否能够用别的算子做替换精度有问题的算子，可以查看 ",{"type":18,"tag":97,"props":183,"children":186},{"href":184,"rel":185},"https://www.mindspore.cn/docs/zh-CN/r1.7/note/api_mapping.html",[101],[187],{"type":24,"value":188},"API 映射表",{"type":24,"value":190},"。",{"type":18,"tag":32,"props":192,"children":193},{},[194],{"type":24,"value":195},"结果：替换算子进行训练，观察第一个Loss值能否对应上，若不能则转到4。",{"type":18,"tag":32,"props":197,"children":198},{},[199],{"type":24,"value":200},"4、该算子不可替换，向华为反馈该算子正向精度问题",{"type":18,"tag":32,"props":202,"children":203},{},[204],{"type":24,"value":205},"结果：反馈完成后等待回复。",{"type":18,"tag":53,"props":207,"children":209},{"id":208},"检查2首个step的反向梯度值是否正确",[210],{"type":24,"value":211},"检查2：首个Step的反向梯度值是否正确",{"type":18,"tag":32,"props":213,"children":214},{},[215],{"type":24,"value":216},"1、先做检查1，保证第一个Step的Loss值能够一致，转到2。",{"type":18,"tag":32,"props":218,"children":219},{},[220,222,227],{"type":24,"value":221},"2、使用Print算子在网络结构中二分法打印算子的反向输出，找到反向精度出现问题的算子。MindSpore的",{"type":18,"tag":97,"props":223,"children":225},{"href":165,"rel":224},[101],[226],{"type":24,"value":169},{"type":24,"value":228},"，原始迁移脚本可以根据框架特点加入反向打印，例如PyTorch框架可以通过register_hook打印节点的反向梯度。",{"type":18,"tag":32,"props":230,"children":231},{},[232],{"type":24,"value":233},"结果：找到第一个反向梯度精度不对的算子，转到3；若所有算子反向梯度都能对应一致，那么转到检查3。",{"type":18,"tag":32,"props":235,"children":236},{},[237,239,244],{"type":24,"value":238},"3、思考是否能够用别的算子做替换精度有问题的算子，可以查看",{"type":18,"tag":97,"props":240,"children":242},{"href":184,
"rel":241},[101],[243],{"type":24,"value":188},{"type":24,"value":190},{"type":18,"tag":32,"props":246,"children":247},{},[248],{"type":24,"value":249},"结果：替换算子进行训练，看反向梯度能否对应，若不能转到4。",{"type":18,"tag":32,"props":251,"children":252},{},[253],{"type":24,"value":254},"4、该算子不可替换，向华为反馈该算子反向精度问题",{"type":18,"tag":53,"props":256,"children":258},{"id":257},"检查3经过第一个step后权重更新是否正确",[259],{"type":24,"value":260},"检查3：经过第一个Step后权重更新是否正确",{"type":18,"tag":32,"props":262,"children":263},{},[264],{"type":24,"value":265},"1、先做检查1和检查2，保证第一个Step的前向输出和反向梯度能够和原始迁移脚本一致，转到2。",{"type":18,"tag":32,"props":267,"children":268},{},[269],{"type":24,"value":270},"2、保存MindSpore的初始化权值文件，假设名字为ms_checkpoint1。跑一个训练Step后再保存一次更新过后的权值文件，假设名字为ms_checkpoint2。",{"type":18,"tag":32,"props":272,"children":273},{},[274],{"type":24,"value":275},"3、分别对ms_checkpoint1和ms_checkpoint2的相同层进行参数打印，并比较参数是否更新，若不更新转到5，正常更新转到4。",{"type":18,"tag":32,"props":277,"children":278},{},[279],{"type":24,"value":280},"4、保存原始迁移脚本在跑一个训练Step后的权值文件，例如PyTorch保存权值为pt_checkpoint，分别对pt_checkpoint和ms_checkpoint2的相同层进行参数打印，并比较参数更新后是否一致，若不一致则转到5，一致则可以进行多Epoch的收敛实验。",{"type":18,"tag":32,"props":282,"children":283},{},[284],{"type":24,"value":285},"5、若参数不能更新或者参数更新错误，思考是否能够用别的优化器。若不能，则向华为反馈该优化器更新问题。",{"type":18,"tag":19,"props":287,"children":289},{"id":288},"精度问题定位思路",[290],{"type":24,"value":288},{"type":18,"tag":32,"props":292,"children":293},{},[294],{"type":24,"value":295},"首先确定已经完成：",{"type":18,"tag":32,"props":297,"children":298},{},[299,300,306],{"type":24,"value":122},{"type":18,"tag":97,"props":301,"children":304},{"href":302,"rel":303},"https://fb80f0bc-b864-4056-a504-04357bbe7316.vscode-webview-test.com/vscode-resourcehttps://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file///c:/Users/hangtian/Downloads/MindBook%E7%AD%9B%E9%80%890803/MindBook_to_MS_tuomin/%E5%BC%80%E5%8F%91%E6%8C%87%E5%AF%BC%E6%89%8B%E5%86%8C/%E7%B2%BE%E5%BA%A6%E8%B0%83%E4%BC%98/%E7%B2%BE%E5%BA%A6%E9%97%AE%E9%A2%98%E5%AE%9A%E4%BD%8D%E6%96%B9%E6%A1%88/%E8%AE
%AD%E5%89%8D%E5%BF%85%E5%81%9A.md",[101],[305],{"type":24,"value":129},{"type":24,"value":307},"的检查事项；",{"type":18,"tag":32,"props":309,"children":310},{},[311],{"type":24,"value":312},"2、开始进行MindSpore的多轮训练，可以画出迭代的Loss变化曲线或者可以在验证集/测试集上测试精度",{"type":18,"tag":32,"props":314,"children":315},{},[316],{"type":24,"value":317},"精度问题可能分为以下几种场景：",{"type":18,"tag":53,"props":319,"children":321},{"id":320},"场景1loss不能收敛",[322],{"type":24,"value":323},"场景1：Loss不能收敛",{"type":18,"tag":32,"props":325,"children":326},{},[327,329,336],{"type":24,"value":328},"1、定位是否是算子前向或者反向的精度问题，或者是优化器更新问题，可以参考",{"type":18,"tag":97,"props":330,"children":333},{"href":331,"rel":332},"https://fb80f0bc-b864-4056-a504-04357bbe7316.vscode-webview-test.com/vscode-resourcehttps://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file///c:/Users/hangtian/Downloads/MindBook%E7%AD%9B%E9%80%890803/MindBook_to_MS_tuomin/%E5%BC%80%E5%8F%91%E6%8C%87%E5%AF%BC%E6%89%8B%E5%86%8C/%E7%B2%BE%E5%BA%A6%E8%B0%83%E4%BC%98/%E7%B2%BE%E5%BA%A6%E9%97%AE%E9%A2%98%E5%AE%9A%E4%BD%8D%E6%96%B9%E6%A1%88/%E8%AE%AD%E4%B8%AD%E6%8C%87%E5%AF%BC.md",[101],[334],{"type":24,"value":335},"训中指导",{"type":24,"value":190},{"type":18,"tag":53,"props":338,"children":340},{"id":339},"场景2脚本迁移场景下loss可以收敛收敛曲线和基线相比差异大或者不能收敛到相同位置精度不达标",[341],{"type":24,"value":342},"场景2：脚本迁移场景下Loss可以收敛，收敛曲线和基线相比差异大或者不能收敛到相同位置，精度不达标",{"type":18,"tag":32,"props":344,"children":345},{},[346],{"type":24,"value":347},"1、检查超参数的设置是否正确（例如优化器的weight 
decay、优化器的momentum、初始学习率、Dropout的keep_prob和BN的momentum），框架迁移的场景检查超参数设置是否和原始脚本一致，从头在MindSpore上开发的场景检查超参数设置是否符合预期。",{"type":18,"tag":32,"props":349,"children":350},{},[351],{"type":24,"value":352},"2、检查初始化方式是否正确，框架迁移的场景检查初始化方式和结果是否和原始脚本一致，从头在MindSpore上开发的场景检查初始化结果是否符合预期。",{"type":18,"tag":32,"props":354,"children":355},{},[356,358,364],{"type":24,"value":357},"3、检查",{"type":18,"tag":97,"props":359,"children":362},{"href":360,"rel":361},"https://fb80f0bc-b864-4056-a504-04357bbe7316.vscode-webview-test.com/vscode-resourcehttps://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file///c:/Users/hangtian/Downloads/MindBook%E7%AD%9B%E9%80%890803/MindBook_to_MS_tuomin/%E5%BC%80%E5%8F%91%E6%8C%87%E5%AF%BC%E6%89%8B%E5%86%8C/%E6%B7%B7%E5%90%88%E7%B2%BE%E5%BA%A6/%E6%B7%B7%E5%90%88%E7%B2%BE%E5%BA%A6.md",[101],[363],{"type":24,"value":104},{"type":24,"value":365},"是否正确。",{"type":18,"tag":32,"props":367,"children":368},{},[369],{"type":24,"value":370},"4、检查学习率调整的策略是否正确（例如lr schedule以及warmup策略）。",{"type":18,"tag":32,"props":372,"children":373},{},[374],{"type":24,"value":375},"5、多卡（多机）训练场景检查是否真正跑的是数据并行或者模型并行，参考并行检查。",{"type":18,"tag":32,"props":377,"children":378},{},[379,381,386],{"type":24,"value":380},"6、定位是否是算子前向或者反向的精度问题，或者是优化器更新问题，可以参考",{"type":18,"tag":97,"props":382,"children":384},{"href":331,"rel":383},[101],[385],{"type":24,"value":335},{"type":24,"value":190},{"type":18,"tag":53,"props":388,"children":390},{"id":389},"场景3loss可以收敛收敛曲线一致脚本迁移场景下但精度不达标",[391],{"type":24,"value":392},"场景3：Loss可以收敛，收敛曲线一致（脚本迁移场景下），但精度不达标",{"type":18,"tag":32,"props":394,"children":395},{},[396,398,405],{"type":24,"value":397},"1、检查测试脚本是否使用Dropout算子，需要在测试阶段手动把",{"type":18,"tag":97,"props":399,"children":402},{"href":400,"rel":401},"https://fb80f0bc-b864-4056-a504-04357bbe7316.vscode-webview-test.com/vscode-resourcehttps://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file///c:/Users/hangtian/Downloads/MindBook%E7%AD%9B%E9%80%890803/MindBook_to_MS_tuomin/%E7%BB%8F%E9%AA%8C%E6%80%BB%E7%BB%93
/%E7%AE%97%E5%AD%90%E4%BD%BF%E7%94%A8/%E5%9C%A8%E6%B5%8B%E8%AF%95%E7%9A%84%E6%97%B6%E5%80%99%E4%BD%BF%E7%94%A8mindspore.nn.Dropout/%E5%9C%A8%E6%B5%8B%E8%AF%95%E7%9A%84%E6%97%B6%E5%80%99%E4%BD%BF%E7%94%A8mindspore.nn.Dropout.md",[101],[403],{"type":24,"value":404},"Dropout算子去掉",{"type":24,"value":190},{"type":18,"tag":32,"props":407,"children":408},{},[409],{"type":24,"value":410},"2、检查测试脚本是否把网络Cell对象的set_train属性设置为False（目前只影响BN的mean和variance计算方式，训练模式下mean和variance是在训练期间被计算的，而推理模式下mean和variance是从checkpoint里面load出来的），使用Model接口的eval或者predict方法可以不用做这个检查（这些方法里会自动把Cell对象的set_train属性设置为False）。",{"title":7,"searchDepth":412,"depth":412,"links":413},4,[414,416,417,418,419,420,421,422],{"id":55,"depth":415,"text":55},3,{"id":88,"depth":415,"text":88},{"id":144,"depth":415,"text":147},{"id":208,"depth":415,"text":211},{"id":257,"depth":415,"text":260},{"id":320,"depth":415,"text":323},{"id":339,"depth":415,"text":342},{"id":389,"depth":415,"text":392},"markdown","content:technology-blogs:zh:1860.md","content","technology-blogs/zh/1860.md","technology-blogs/zh/1860","md",1776506116255]