[{"data":1,"prerenderedAt":312},["ShallowReactive",2],{"content-query-S97Dr71nVp":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":306,"_id":307,"_source":308,"_file":309,"_stem":310,"_extension":311},"/technology-blogs/zh/1630","zh",false,"","【MindSpore易点通】深度学习系列-提升网络训练速度的方法","深度学习，让我们一起有温度、有态度、有深度地学习！","2022-07-15","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/07/18/e0578ad562bc445a835e41faa54fb7cf.png","technology-blogs","基础知识",{"type":15,"children":16,"toc":300},"root",[17,25,31,37,42,47,55,62,67,72,79,84,89,96,101,108,113,118,125,130,136,141,146,151,158,163,170,179,186,191,200,209,214,219,224,229,236,241,246,251,258,263,268,273,280,285,290,295],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore易点通深度学习系列-提升网络训练速度的方法",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"嗨咯，宝子们~天气炎热，小编想更新的心也变得更加火热，正好提供提升网络训练速度的方法给大家烦躁的内心降降温！",{"type":18,"tag":32,"props":33,"children":35},"h4",{"id":34},"归一化输入",[36],{"type":24,"value":34},{"type":18,"tag":26,"props":38,"children":39},{},[40],{"type":24,"value":41},"训练神经网络时，加速训练可以采用归一化输入的方法。假设训练集有两个特征，输入特征为2维，归一化一共只需两步：",{"type":18,"tag":26,"props":43,"children":44},{},[45],{"type":24,"value":46},"1.零均值",{"type":18,"tag":26,"props":48,"children":49},{},[50],{"type":18,"tag":51,"props":52,"children":54},"img",{"alt":7,"src":53},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657850567839339177.png",[],{"type":18,"tag":26,"props":56,"children":57},{},[58],{"type":18,"tag":51,"props":59,"children":61},{"alt":7,"src":60},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657850586183915691.png",[],{"type":18,"tag":26,"props":63,"children":64},{},[65],{"type":24,"value":66},"2.归一化方差",{"type":18,"tag":26,"props":68,"children":69},{},[70],{"type":24,"value":71},"特征x1的方差比特征x2的方差要大得多
，需要给σ赋值：",{"type":18,"tag":26,"props":73,"children":74},{},[75],{"type":18,"tag":51,"props":76,"children":78},{"alt":7,"src":77},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657850607748782873.png",[],{"type":18,"tag":26,"props":80,"children":81},{},[82],{"type":24,"value":83},"σ2是一个向量，它的每个特征都有方差。",{"type":18,"tag":26,"props":85,"children":86},{},[87],{"type":24,"value":88},"代价函数：",{"type":18,"tag":26,"props":90,"children":91},{},[92],{"type":18,"tag":51,"props":93,"children":95},{"alt":7,"src":94},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657850622136240834.png",[],{"type":18,"tag":26,"props":97,"children":98},{},[99],{"type":24,"value":100},"如果使用非归一化的输入特征，代价函数的图像如下所示：",{"type":18,"tag":26,"props":102,"children":103},{},[104],{"type":18,"tag":51,"props":105,"children":107},{"alt":7,"src":106},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657850637936863080.png",[],{"type":18,"tag":26,"props":109,"children":110},{},[111],{"type":24,"value":112},"假如x1取值范围从1到1000，特征x2的取值范围从0到1，那么参数w1和w2值的范围将会变化很大。在这样的代价函数上运行梯度下降法，只能使用非常小的学习率，多次迭代直到最后找到最小值。",{"type":18,"tag":26,"props":114,"children":115},{},[116],{"type":24,"value":117},"但如果你归一化特征，代价函数更对称，那么不论从哪个位置开始，梯度下降法都能够更直接地找到最小值，步长也可以适当增大。",{"type":18,"tag":26,"props":119,"children":120},{},[121],{"type":18,"tag":51,"props":122,"children":124},{"alt":7,"src":123},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657850675220424975.png",[],{"type":18,"tag":26,"props":126,"children":127},{},[128],{"type":24,"value":129},"因此，如果输入特征处于不同范围，有些特征值从0到1，有些从1到1000，那么归一化特征值就非常重要；如果特征x1在0-1之间，x2在-1到1之间，x3在1-2之间，这样的相似范围，那么归一化就不是很重要了。",{"type":18,"tag":32,"props":131,"children":133},{"id":132},"梯度爆炸消失",[134],{"type":24,"value":135},"梯度爆炸/消失",{"type":18,"tag":26,"props":137,"children":138},{},[139],{"type":24,"value":140},"训练深度神经网络经常会面临梯度消失（梯度爆炸）的问题，也就是说，训练时导数或坡度有时会变得非常大，或者非常小，甚至于以指数方式变小，这样会加大训练的难度。",{"type":18,"tag":26,"props":142,"childre
n":143},{},[144],{"type":24,"value":145},"那么如何避免这样的问题呢？",{"type":18,"tag":26,"props":147,"children":148},{},[149],{"type":24,"value":150},"假设训练一个神经网络，含有参数W[1]，W[2]，W[3]，...，W[l]，这里我们可以简化一下激活函数，忽略b，直接g(z)=z，输出：",{"type":18,"tag":26,"props":152,"children":153},{},[154],{"type":18,"tag":51,"props":155,"children":157},{"alt":7,"src":156},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657850781683685321.png",[],{"type":18,"tag":26,"props":159,"children":160},{},[161],{"type":24,"value":162},"W[1] x=z[1]，a[1]=g(z[1])，也就是说，第一项W[1] x=a[1]，W[2]W[1] x=a[2]。",{"type":18,"tag":26,"props":164,"children":165},{},[166],{"type":18,"tag":51,"props":167,"children":169},{"alt":7,"src":168},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657850852031257440.png",[],{"type":18,"tag":26,"props":171,"children":172},{},[173,175],{"type":24,"value":174},"假设每个权重矩阵",{"type":18,"tag":51,"props":176,"children":178},{"alt":7,"src":177},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657851019343173323.png",[],{"type":18,"tag":26,"props":180,"children":181},{},[182],{"type":18,"tag":51,"props":183,"children":185},{"alt":7,"src":184},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657851034579139135.png",[],{"type":18,"tag":26,"props":187,"children":188},{},[189],{"type":24,"value":190},"最后的计算结果^y也等于1.5(L-1) 
x。因此如果L值较大，那么^y的值也会非常大，呈指数级增长，也就是说^y的值将爆炸式增长。",{"type":18,"tag":26,"props":192,"children":193},{},[194,196],{"type":24,"value":195},"相反的，如果权重是0.5，",{"type":18,"tag":51,"props":197,"children":199},{"alt":7,"src":198},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657851061367237747.png",[],{"type":18,"tag":26,"props":201,"children":202},{},[203,205],{"type":24,"value":204},"矩阵",{"type":18,"tag":51,"props":206,"children":208},{"alt":7,"src":207},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657851077473303745.png",[],{"type":18,"tag":26,"props":210,"children":211},{},[212],{"type":24,"value":213},"假设x1和x2都是1，激活函数的值将依次变成1/2，1/2，1/4，1/4，1/8，1/8等，直到最后一项变成1/2^L，即激活函数的值将以指数级下降。",{"type":18,"tag":26,"props":215,"children":216},{},[217],{"type":24,"value":218},"因此，在深度神经网络中，如果激活函数的值或梯度以与L相关的指数方式增长或递减，它们的值将会变得极大或极小，从而导致训练难度上升。虽然这样的说明并不能彻底解决此问题，但也告诉我们在选择初始化权重问题上需要多多用心，平衡利弊。",{"type":18,"tag":32,"props":220,"children":222},{"id":221},"神经网络的权重初始化",[223],{"type":24,"value":221},{"type":18,"tag":26,"props":225,"children":226},{},[227],{"type":24,"value":228},"首先举一个简单的神经单元初始化的例子：",{"type":18,"tag":26,"props":230,"children":231},{},[232],{"type":18,"tag":51,"props":233,"children":235},{"alt":7,"src":234},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657851094794400635.png",[],{"type":18,"tag":26,"props":237,"children":238},{},[239],{"type":24,"value":240},"输入特征从x1到x4，经过a=g(z)处理，最终得到^y，输入可以表示为a[l]，这里暂用x代替。",{"type":18,"tag":26,"props":242,"children":243},{},[244],{"type":24,"value":245},"z=w1 x1+w2 x2+⋯+wn 
xn，b=0",{"type":18,"tag":26,"props":247,"children":248},{},[249],{"type":24,"value":250},"为了防止z值过大或过小，n越大，我们就希望wi越小，因此可以将wi的方差设置为1/n，其中n表示神经元的输入特征数量。",{"type":18,"tag":26,"props":252,"children":253},{},[254],{"type":18,"tag":51,"props":255,"children":257},{"alt":7,"src":256},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657851205821665221.png",[],{"type":18,"tag":26,"props":259,"children":260},{},[261],{"type":24,"value":262},"n[l-1]是输入到第l层每个神经单元的特征数量（即第l-1层神经元数量）。",{"type":18,"tag":26,"props":264,"children":265},{},[266],{"type":24,"value":267},"这时我们会发现如果使用Relu激活函数，方差设置为2/n，效果会更好。",{"type":18,"tag":26,"props":269,"children":270},{},[271],{"type":24,"value":272},"g[l] (z)=Relu(z)",{"type":18,"tag":26,"props":274,"children":275},{},[276],{"type":18,"tag":51,"props":277,"children":279},{"alt":7,"src":278},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657851262550936426.png",[],{"type":18,"tag":26,"props":281,"children":282},{},[283],{"type":24,"value":284},"逻辑回归的特征是不会变的。但一般情况下l层上的每个神经元都有n[l-1]个输入。",{"type":18,"tag":26,"props":286,"children":287},{},[288],{"type":24,"value":289},"如果激活函数的输入特征经过零均值化和方差归一化，方差是1，z也会调整到相似范围。这虽然没有彻底解决梯度消失和爆炸的问题，但确实能减轻这类问题，因为它给权重矩阵w设置了合理值。",{"type":18,"tag":26,"props":291,"children":292},{},[293],{"type":24,"value":294},"实际上，公式只是一方面，它们给出初始化权重矩阵的方差的默认值；如果想进一步调优，也可以把方差参数作为一个超参数来调整。",{"type":18,"tag":26,"props":296,"children":297},{},[298],{"type":24,"value":299},"希望通过今天的介绍，大家能够在解决梯度消失或爆炸以及如何为权重初始化合理值时，更加从容且有想法，我们下期再见！",{"title":7,"searchDepth":301,"depth":301,"links":302},4,[303,304,305],{"id":34,"depth":301,"text":34},{"id":132,"depth":301,"text":135},{"id":221,"depth":301,"text":221},"markdown","content:technology-blogs:zh:1630.md","content","technology-blogs/zh/1630.md","technology-blogs/zh/1630","md",1776506114606]