[{"data":1,"prerenderedAt":623},["ShallowReactive",2],{"content-query-Ra2h0JAmBV":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":617,"_id":618,"_source":619,"_file":620,"_stem":621,"_extension":622},"/technology-blogs/zh/433","zh",false,"","MindSpore模型精度调优应用（三）：常见精度问题简介","在模型的开发过程中，精度达不到预期常常让人头疼。为了帮助用户解决模型调试调优的问题，我们为MindSpore量身定做了可视化调试调优组件：MindInsight。我们还梳理了针对常见精度问题的调试调优指南，将以“MindSpore模型精度调优应用”系列文章的形式分享出来，希望能帮助用户轻松定位精度问题，快速优化模型精度。","2021-04-08","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/04/08/a53a671562f04c42a7258d234496a9c5.png","technology-blogs","调试调优",{"type":15,"children":16,"toc":608},"root",[17,25,31,36,41,46,51,58,63,72,77,82,87,92,97,102,107,112,118,123,131,136,141,146,151,156,161,167,172,180,185,190,195,200,205,210,215,220,226,232,237,245,250,255,260,275,281,286,294,299,304,309,314,319,324,329,334,339,344,350,355,363,368,373,378,383,389,394,401,405,409,413,417,421,428,432,436,440,447,451,455,459,463,468,475,479,483,490,494,498,502,506,510,517,521,525,531,536,545,550,559,565,575,586,597],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore模型精度调优应用三常见精度问题简介",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"MindSpore团队",{"type":18,"tag":26,"props":32,"children":33},{},[34],{"type":24,"value":35},"引子：在模型的开发过程中，精度达不到预期常常让人头疼。为了帮助用户解决模型调试调优的问题，我们为MindSpore量身定做了可视化调试调优组件：MindInsight。我们还梳理了针对常见精度问题的调试调优指南，将以“MindSpore模型精度调优应用”系列文章的形式分享出来，希望能帮助用户轻松定位精度问题，快速优化模型精度。",{"type":18,"tag":26,"props":37,"children":38},{},[39],{"type":24,"value":40},"本文是系列分享的第三篇，将简单介绍常见精度问题，使得读者能够根据精度问题现象，判断出可能的原因。本系列分享假设您的脚本已经能够运行并算出loss值。如果脚本还不能运行，请先参考相关报错提示进行修改。",{"type":18,"tag":26,"props":42,"children":43},{},[44],{"type":24,"value":45},"在精度调优实践中，经常有发现异常现象，但是对异常现象不够敏感，不会解释，导致同精度问题根因失之交臂。本文对常见精度问题进行了解释，希望能帮你提高定位精度问题的能力。",{"type":18,"tag":26,"props":47,"children":48}
,{},[49],{"type":24,"value":50},"1 数据问题",{"type":18,"tag":52,"props":53,"children":55},"h2",{"id":54},"_11-数据集问题",[56],{"type":24,"value":57},"1.1 数据集问题",{"type":18,"tag":26,"props":59,"children":60},{},[61],{"type":24,"value":62},"数据集的质量决定了算法效果的上限，如果数据质量差，再好的算法也难以得到很好的效果。常见数据集问题如下：",{"type":18,"tag":26,"props":64,"children":65},{},[66],{"type":18,"tag":67,"props":68,"children":69},"strong",{},[70],{"type":24,"value":71},"常见数据集问题",{"type":18,"tag":26,"props":73,"children":74},{},[75],{"type":24,"value":76},"数据集中缺失值过多",{"type":18,"tag":26,"props":78,"children":79},{},[80],{"type":24,"value":81},"数据集中存在异常值",{"type":18,"tag":26,"props":83,"children":84},{},[85],{"type":24,"value":86},"数据的标签错误",{"type":18,"tag":26,"props":88,"children":89},{},[90],{"type":24,"value":91},"数据集每个类别的样本数目不均衡",{"type":18,"tag":26,"props":93,"children":94},{},[95],{"type":24,"value":96},"训练样本不足",{"type":18,"tag":26,"props":98,"children":99},{},[100],{"type":24,"value":101},"数据集中存在缺失值、异常值，会导致模型学习到错误的数据关系。一般来说，应该从训练集中删除存在缺失值或异常值的数据，或者设置合理的默认值。数据标签错误是异常值的一种特殊情况，但是这种情况对训练的破坏性较大，应通过抽查输入模型的数据等方式提前识别这类问题。",{"type":18,"tag":26,"props":103,"children":104},{},[105],{"type":24,"value":106},"数据集中每个类别的样本数目不均衡，是指数据集中每个类别中的样本数目有较大差距。例如，图像分类数据集（训练集）中，大部分类别都有1000个样本，但是“猫”这一类别只有100个样本，就可以认为出现了样本数目不均衡的情况。样本数目不均衡会导致模型在样本数目少的类别上预测效果差。如果出现了样本数目不均衡，应该酌情增加样本量小的类别的样本。作为参考，一般来说，有监督深度学习算法在每类5000个标注样本的情况下将达到可以接受的性能，当数据集中有1000万个以上的已标注样本时，模型的表现将会超过人类。",{"type":18,"tag":26,"props":108,"children":109},{},[110],{"type":24,"value":111},"训练样本不足则是指训练集相对于模型容量太小。训练样本不足会导致训练不稳定，且容易出现过拟合。如果模型的参数量同训练样本数量不成比例，应该考虑增加训练样本或者降低模型复杂度。",{"type":18,"tag":52,"props":113,"children":115},{"id":114},"_12-数据处理问题",[116],{"type":24,"value":117},"1.2 
数据处理问题",{"type":18,"tag":26,"props":119,"children":120},{},[121],{"type":24,"value":122},"常见数据处理问题如下：",{"type":18,"tag":26,"props":124,"children":125},{},[126],{"type":18,"tag":67,"props":127,"children":128},{},[129],{"type":24,"value":130},"常见数据处理算法问题",{"type":18,"tag":26,"props":132,"children":133},{},[134],{"type":24,"value":135},"未对数据进行归一化或标准化",{"type":18,"tag":26,"props":137,"children":138},{},[139],{"type":24,"value":140},"数据处理方式和训练集不一致",{"type":18,"tag":26,"props":142,"children":143},{},[144],{"type":24,"value":145},"没有对数据集进行shuffle",{"type":18,"tag":26,"props":147,"children":148},{},[149],{"type":24,"value":150},"未对数据进行归一化或标准化，是指输入模型的数据，各个维度不在一个尺度上。一般来说，模型要求各个维度的数据在-1到1之间，均值为0。如果某两个维度的尺度存在数量级的差异，可能会影响模型的训练效果，此时需要对数据进行归一化或标准化。",{"type":18,"tag":26,"props":152,"children":153},{},[154],{"type":24,"value":155},"数据处理方式和训练集不一致是指在使用模型进行推理时，处理方式和训练集不一致。例如对图片的缩放、裁切、归一化参数和训练集不同，会导致推理时的数据分布和训练时的数据分布产生差异，可能会降低模型的推理精度。备注：一些数据增强操作（如随机旋转，随机裁切等）一般只应用在训练集，推理时无需进行数据增强。",{"type":18,"tag":26,"props":157,"children":158},{},[159],{"type":24,"value":160},"没有对数据集进行shuffle，是指训练时未对数据集进行混洗。未进行shuffle，或者混洗不充分，会导致总是以相同的数据顺序更新模型，严重限制了梯度优化方向的可选择性，导致收敛点的选择空间变少，容易过拟合。",{"type":18,"tag":19,"props":162,"children":164},{"id":163},"_2-超参问题",[165],{"type":24,"value":166},"2 超参问题",{"type":18,"tag":26,"props":168,"children":169},{},[170],{"type":24,"value":171},"超参是模型和数据之间的润滑剂，超参的选择直接影响了模型对数据拟合效果的优劣。超参方面常见的问题如下：",{"type":18,"tag":26,"props":173,"children":174},{},[175],{"type":18,"tag":67,"props":176,"children":177},{},[178],{"type":24,"value":179},"常见超参问题",{"type":18,"tag":26,"props":181,"children":182},{},[183],{"type":24,"value":184},"学习率过大",{"type":18,"tag":26,"props":186,"children":187},{},[188],{"type":24,"value":189},"学习率过小",{"type":18,"tag":26,"props":191,"children":192},{},[193],{"type":24,"value":194},"epoch过大",{"type":18,"tag":26,"props":196,"children":197},{},[198],{"type":24,"value":199},"epoch过小",{"type":18,"tag":26,"props":201,"children":202},{},[203],{"type":24,"value":204},"batch 
size过大",{"type":18,"tag":26,"props":206,"children":207},{},[208],{"type":24,"value":209},"学习率过大或过小。学习率可以说是模型训练中最重要的超参了。学习率过大，会导致loss震荡，不能收敛到预期值。学习率过小，会导致loss收敛慢。应根据理论和经验合理选择学习率策略。",{"type":18,"tag":26,"props":211,"children":212},{},[213],{"type":24,"value":214},"epoch过大或过小。epoch数目直接影响模型是欠拟合还是过拟合。epoch过小，模型未训练到最优解就停止了训练，容易欠拟合；epoch过大，模型训练时间过长，容易在训练集上过拟合，在测试集上达不到最优的效果。应根据训练过程中验证集上模型效果的变化情况，合理选择epoch数目。",{"type":18,"tag":26,"props":216,"children":217},{},[218],{"type":24,"value":219},"batch size过大。batch size过大时，模型可能不能收敛到较优的极小值上，从而降低模型的泛化能力。",{"type":18,"tag":19,"props":221,"children":223},{"id":222},"_3-算法问题",[224],{"type":24,"value":225},"3 算法问题",{"type":18,"tag":52,"props":227,"children":229},{"id":228},"_31-api使用问题",[230],{"type":24,"value":231},"3.1 API使用问题",{"type":18,"tag":26,"props":233,"children":234},{},[235],{"type":24,"value":236},"常见API使用问题如下：",{"type":18,"tag":26,"props":238,"children":239},{},[240],{"type":18,"tag":67,"props":241,"children":242},{},[243],{"type":24,"value":244},"常见API使用问题",{"type":18,"tag":26,"props":246,"children":247},{},[248],{"type":24,"value":249},"使用API没有遵循MindSpore约束",{"type":18,"tag":26,"props":251,"children":252},{},[253],{"type":24,"value":254},"构图时未遵循MindSpore construct约束",{"type":18,"tag":26,"props":256,"children":257},{},[258],{"type":24,"value":259},"使用API未遵循MindSpore约束，是指使用的API和真实应用的场景不匹配。例如，在除数中可能含有零的场景，应该考虑使用DivNoNan而非Div以避免产生除零问题。又例如，MindSpore中，DropOut第一个参数为保留的概率，和其它框架正好相反（其它框架为丢掉的概率），使用时需要注意。",{"type":18,"tag":26,"props":261,"children":262},{},[263,265,273],{"type":24,"value":264},"构图未遵循MindSpore construct约束，是指图模式下的网络未遵循MindSpore静态图语法支持中声明的约束。例如，MindSpore目前不支持对带键值对参数的函数求反向。完整约束请见官网： ",{"type":18,"tag":266,"props":267,"children":271},"a",{"href":268,"rel":269},"https://www.mindspore.cn/",[270],"nofollow",[272],{"type":24,"value":268},{"type":24,"value":274}," 。",{"type":18,"tag":52,"props":276,"children":278},{"id":277},"_32-计算图结构问题",[279],{"type":24,"value":280},"3.2 
计算图结构问题",{"type":18,"tag":26,"props":282,"children":283},{},[284],{"type":24,"value":285},"计算图结构是模型计算的载体，计算图结构错误一般是实现算法时代码写错了。计算图结构方面常见的问题有：",{"type":18,"tag":26,"props":287,"children":288},{},[289],{"type":18,"tag":67,"props":290,"children":291},{},[292],{"type":24,"value":293},"常见计算图结构问题",{"type":18,"tag":26,"props":295,"children":296},{},[297],{"type":24,"value":298},"权重共享错误",{"type":18,"tag":26,"props":300,"children":301},{},[302],{"type":24,"value":303},"权重冻结错误",{"type":18,"tag":26,"props":305,"children":306},{},[307],{"type":24,"value":308},"节点连接错误",{"type":18,"tag":26,"props":310,"children":311},{},[312],{"type":24,"value":313},"节点模式不正确",{"type":18,"tag":26,"props":315,"children":316},{},[317],{"type":24,"value":318},"loss函数有误",{"type":18,"tag":26,"props":320,"children":321},{},[322],{"type":24,"value":323},"权重共享错误，是指应该共享的权重未共享，或者不应该共享的权重共享了。通过MindInsight的计算图可视化功能，可以检查这一类问题。",{"type":18,"tag":26,"props":325,"children":326},{},[327],{"type":24,"value":328},"权重冻结错误，是指应该冻结的权重未冻结，或者不应该冻结的权重冻结了。在MindSpore中，冻结权重可以通过控制传入优化器的params参数来实现。未传入优化器的Parameter将不会被更新。可以通过检查脚本，或者查看MindInsight中的参数分布图确认权重冻结情况。",{"type":18,"tag":26,"props":330,"children":331},{},[332],{"type":24,"value":333},"节点连接错误，是指计算图中各block的连接和设计不一致。如果发现节点连接错误，应该仔细检查脚本是否编写出错。",{"type":18,"tag":26,"props":335,"children":336},{},[337],{"type":24,"value":338},"节点模式不正确，是指部分区分训练、推理模式的算子，需要按照实际情况设置模式。典型的包括：（1）BatchNorm算子，训练时应打开BatchNorm的训练模式，此开关在调用 net.set_train(True)的时候会自动打开；（2）DropOut算子，推理时不应使用DropOut算子。",{"type":18,"tag":26,"props":340,"children":341},{},[342],{"type":24,"value":343},"loss函数有误，是指loss函数算法实现错误，或者未选择合理的loss函数。例如，BCELoss和BCEWithLogitsLoss是不同的，应根据是否需要sigmoid函数合理选择。",{"type":18,"tag":52,"props":345,"children":347},{"id":346},"_33-权重初始化问题",[348],{"type":24,"value":349},"3.3 
权重初始化问题",{"type":18,"tag":26,"props":351,"children":352},{},[353],{"type":24,"value":354},"权重初始值是模型训练的起点，不合理的初始值将会影响模型训练的速度和效果。权重初始化方面常见问题如下：",{"type":18,"tag":26,"props":356,"children":357},{},[358],{"type":18,"tag":67,"props":359,"children":360},{},[361],{"type":24,"value":362},"常见权重初始化问题",{"type":18,"tag":26,"props":364,"children":365},{},[366],{"type":24,"value":367},"权重初始值全部为0",{"type":18,"tag":26,"props":369,"children":370},{},[371],{"type":24,"value":372},"分布式场景不同节点的权重初始值不同",{"type":18,"tag":26,"props":374,"children":375},{},[376],{"type":24,"value":377},"权重初始值全为0，是指初始化后，权重值全部为0。这一般会导致权重更新问题，应使用随机值初始化权重。",{"type":18,"tag":26,"props":379,"children":380},{},[381],{"type":24,"value":382},"分布式场景不同节点的权重初始值不同，是指初始化后，不同节点上的同名权重初始值不同。正常来说，MindSpore会对梯度做全局AllReduce，确保每个step结尾时权重更新量是相同的，从而保证每个step中，各个节点上的权重一致。如果初始化时各节点的权重不同，就会导致不同节点的权重在接下来的训练中处于不同的状态，会直接影响模型精度。分布式场景应通过固定相同的随机数种子等方式，确保权重的初始值一致。",{"type":18,"tag":19,"props":384,"children":386},{"id":385},"_4-完整checklist",[387],{"type":24,"value":388},"4 
完整checklist",{"type":18,"tag":26,"props":390,"children":391},{},[392],{"type":24,"value":393},"最后，我们将常见的精度问题汇总到一起，以方便大家查阅：",{"type":18,"tag":26,"props":395,"children":396},{},[397],{"type":18,"tag":67,"props":398,"children":399},{},[400],{"type":24,"value":71},{"type":18,"tag":26,"props":402,"children":403},{},[404],{"type":24,"value":76},{"type":18,"tag":26,"props":406,"children":407},{},[408],{"type":24,"value":81},{"type":18,"tag":26,"props":410,"children":411},{},[412],{"type":24,"value":86},{"type":18,"tag":26,"props":414,"children":415},{},[416],{"type":24,"value":91},{"type":18,"tag":26,"props":418,"children":419},{},[420],{"type":24,"value":96},{"type":18,"tag":26,"props":422,"children":423},{},[424],{"type":18,"tag":67,"props":425,"children":426},{},[427],{"type":24,"value":130},{"type":18,"tag":26,"props":429,"children":430},{},[431],{"type":24,"value":135},{"type":18,"tag":26,"props":433,"children":434},{},[435],{"type":24,"value":140},{"type":18,"tag":26,"props":437,"children":438},{},[439],{"type":24,"value":145},{"type":18,"tag":26,"props":441,"children":442},{},[443],{"type":18,"tag":67,"props":444,"children":445},{},[446],{"type":24,"value":179},{"type":18,"tag":26,"props":448,"children":449},{},[450],{"type":24,"value":184},{"type":18,"tag":26,"props":452,"children":453},{},[454],{"type":24,"value":189},{"type":18,"tag":26,"props":456,"children":457},{},[458],{"type":24,"value":194},{"type":18,"tag":26,"props":460,"children":461},{},[462],{"type":24,"value":199},{"type":18,"tag":26,"props":464,"children":465},{},[466],{"type":24,"value":467},"batch size过大",{"type":18,"tag":26,"props":469,"children":470},{},[471],{"type":18,"tag":67,"props":472,"children":473},{},[474],{"type":24,"value":244},{"type":18,"tag":26,"props":476,"children":477},{},[478],{"type":24,"value":249},{"type":18,"tag":26,"props":480,"children":481},{},[482],{"type":24,"value":254},{"type":18,"tag":26,"props":484,"children":485},{},[486],{"type":18,"tag":67,"props":487,"children":
488},{},[489],{"type":24,"value":293},{"type":18,"tag":26,"props":491,"children":492},{},[493],{"type":24,"value":298},{"type":18,"tag":26,"props":495,"children":496},{},[497],{"type":24,"value":303},{"type":18,"tag":26,"props":499,"children":500},{},[501],{"type":24,"value":308},{"type":18,"tag":26,"props":503,"children":504},{},[505],{"type":24,"value":313},{"type":18,"tag":26,"props":507,"children":508},{},[509],{"type":24,"value":318},{"type":18,"tag":26,"props":511,"children":512},{},[513],{"type":18,"tag":67,"props":514,"children":515},{},[516],{"type":24,"value":362},{"type":18,"tag":26,"props":518,"children":519},{},[520],{"type":24,"value":367},{"type":18,"tag":26,"props":522,"children":523},{},[524],{"type":24,"value":372},{"type":18,"tag":19,"props":526,"children":528},{"id":527},"_5-往期回顾",[529],{"type":24,"value":530},"5 往期回顾",{"type":18,"tag":26,"props":532,"children":533},{},[534],{"type":24,"value":535},"MindSpore模型精度调优应用（一）：精度问题的常见现象、原因和简要调优思路",{"type":18,"tag":26,"props":537,"children":538},{},[539],{"type":18,"tag":266,"props":540,"children":543},{"href":541,"rel":542},"https://www.mindspore.cn/news/newschildren?id=381",[270],[544],{"type":24,"value":541},{"type":18,"tag":26,"props":546,"children":547},{},[548],{"type":24,"value":549},"MindSpore模型精度调优应用（二）：精度调试调优思路",{"type":18,"tag":26,"props":551,"children":552},{},[553],{"type":18,"tag":266,"props":554,"children":557},{"href":555,"rel":556},"https://www.mindspore.cn/news/newschildren?id=394",[270],[558],{"type":24,"value":555},{"type":18,"tag":19,"props":560,"children":562},{"id":561},"_6-欢迎关注我们",[563],{"type":24,"value":564},"6 欢迎关注我们",{"type":18,"tag":26,"props":566,"children":567},{},[568,570],{"type":24,"value":569},"MindSpore官网： ",{"type":18,"tag":266,"props":571,"children":573},{"href":268,"rel":572},[270],[574],{"type":24,"value":268},{"type":18,"tag":26,"props":576,"children":577},{},[578,580],{"type":24,"value":579},"MindSpore代码仓库： 
",{"type":18,"tag":266,"props":581,"children":584},{"href":582,"rel":583},"https://gitee.com/mindspore/mindspore",[270],[585],{"type":24,"value":582},{"type":18,"tag":26,"props":587,"children":588},{},[589,591],{"type":24,"value":590},"MindInsight代码仓库： ",{"type":18,"tag":266,"props":592,"children":595},{"href":593,"rel":594},"https://gitee.com/mindspore/mindinsight",[270],[596],{"type":24,"value":593},{"type":18,"tag":26,"props":598,"children":599},{},[600,602],{"type":24,"value":601},"MindInsight使用教程： ",{"type":18,"tag":266,"props":603,"children":606},{"href":604,"rel":605},"https://www.mindspore.cn/mindinsight/docs/zh-CN/r1.8/index.html",[270],[607],{"type":24,"value":604},{"title":7,"searchDepth":609,"depth":609,"links":610},4,[611,613,614,615,616],{"id":54,"depth":612,"text":57},2,{"id":114,"depth":612,"text":117},{"id":228,"depth":612,"text":231},{"id":277,"depth":612,"text":280},{"id":346,"depth":612,"text":349},"markdown","content:technology-blogs:zh:433.md","content","technology-blogs/zh/433.md","technology-blogs/zh/433","md",1776506137357]