[{"data":1,"prerenderedAt":213},["ShallowReactive",2],{"content-query-acRkemYjr5":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":207,"_id":208,"_source":209,"_file":210,"_stem":211,"_extension":212},"/technology-blogs/zh/804","zh",false,"","MindSpore参数归一化实现方式","MindSpore实现WeightNorm参数归一化","2021-11-27","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/11/30/b8ca2b98b3ae4ee48a3d314460e17253.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":204},"root",[17,25,35,40,48,53,61,85,93,98,103,113,123,128,133,138,146,151,159,172,180,188,196],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore参数归一化实现方式",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":18,"tag":30,"props":31,"children":32},"strong",{},[33],{"type":24,"value":34},"1. 功能描述：",{"type":18,"tag":26,"props":36,"children":37},{},[38],{"type":24,"value":39},"MindSpore实现WeightNorm参数归一化。",{"type":18,"tag":26,"props":41,"children":42},{},[43],{"type":18,"tag":30,"props":44,"children":45},{},[46],{"type":24,"value":47},"2. 实现分析：",{"type":18,"tag":26,"props":49,"children":50},{},[51],{"type":24,"value":52},"在MindSpore实现高性能方案，建议采用图模式，同时也能保证动静统一。MindSpore图模式需要把归一化操作表达到整图里，可以采用自定义的方式在网络结构中实现。",{"type":18,"tag":26,"props":54,"children":55},{},[56],{"type":18,"tag":30,"props":57,"children":58},{},[59],{"type":24,"value":60},"3. 参数归一化功能简介（背景介绍）：",{"type":18,"tag":26,"props":62,"children":63},{},[64,66,72,74,83],{"type":24,"value":65},"在深度学习中通常对卷积层的权重进行参数归一化，参数归一化功能根据以下公式对传入的 layer 中的权重参数进行归一化: ",{"type":18,"tag":67,"props":68,"children":71},"img",{"alt":69,"src":70},"image.png","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202111/19/151225chg2ljlgfiacspv0.png",[],{"type":24,"value":73}," 公式中w是网络权重，g 代表长度变量 ，v代表方向变量。权重归一化可以将神经网络中权重w的向量长度g与其方向v解耦，将w用g和v两个变量表示。 (例如:详细可以参考论文: ",{"type":18,"tag":75,"props":76,"children":80},"a",{"href":77,"rel":78},"https://arxiv.org/pdf/1602.07868.pdf%E3%80%82",[79],"nofollow",[81],{"type":24,"value":82},"https://arxiv.org/pdf/1602.07868.pdf。",{"type":24,"value":84},")",{"type":18,"tag":26,"props":86,"children":87},{},[88],{"type":18,"tag":30,"props":89,"children":90},{},[91],{"type":24,"value":92},"4. 解决方案：",{"type":18,"tag":26,"props":94,"children":95},{},[96],{"type":24,"value":97},"实现MindSpore的WeightNorm需要注意：",{"type":18,"tag":26,"props":99,"children":100},{},[101],{"type":24,"value":102},"4.1 MindSpore实现时，需要封装一个Wrapper，将WeightNorm和需要进行参数归一化的网络结构(如卷积)封装为一个整体，这样每次在卷积执行之前，就会先执行WeightNorm。具体伪代码如下：",{"type":18,"tag":104,"props":105,"children":107},"pre",{"code":106},"class WeightNorm(nn.Cell):\n    def __init__(self):\n        ...\n        register_w_v_g()\n        self.layer = layer\n\n    def construct(self, inputs):\n        compute_weight_norm()\n        result = self.layer(inputs)\n        return result\n",[108],{"type":18,"tag":109,"props":110,"children":111},"code",{"__ignoreMap":7},[112],{"type":24,"value":106},{"type":18,"tag":114,"props":115,"children":116},"ul",{},[117],{"type":18,"tag":118,"props":119,"children":120},"li",{},[121],{"type":24,"value":122},"4.2 使用参数归一化需要能够添加和删除weight norm，但MindSpore静态图编译后无法删除Weight Norm",{"type":18,"tag":26,"props":124,"children":125},{},[126],{"type":24,"value":127},"remove_weight_norm的场景：",{"type":18,"tag":26,"props":129,"children":130},{},[131],{"type":24,"value":132},"4.2.1 inference，即推理阶段需要移除Weight Norm。",{"type":18,"tag":26,"props":134,"children":135},{},[136],{"type":24,"value":137},"4.2.2 进行一次Weight Norm计算，然后固定w（WeightNorm.remove()的执行逻辑）",{"type":18,"tag":114,"props":139,"children":140},{},[141],{"type":18,"tag":118,"props":142,"children":143},{},[144],{"type":24,"value":145},"remove_weight_norm的使用场景，即模型进行推理时，在加载Checkpoint后进行操作，此时未涉及到静态图的编译阶段，因此可以对实例化的模型进行任意修改。 PS: 静态图不支持在训练过程中移除weight norm。",{"type":18,"tag":26,"props":147,"children":148},{},[149],{"type":24,"value":150},"MindSpore WeightNorm示例：",{"type":18,"tag":104,"props":152,"children":154},{"code":153},"class WeightNorm (nn.Cell):\n\n      def __init__(self, module, dim:int=0):\n           super().__init__()\n\n           if dim is None:\n               dim = -1\n\n          self.dim = dim\n          self.module = module\n          self.assign = P.Assign()\n          # add g and v as new parameters and express w as g/||v|| * v\n          self.param_g = Parameter(Tensor(norm_except_dim(self.module.weight, 2, dim)))\n          self.param_v = Parameter(Tensor(self.module.weight.data))\n          self.module.weight.set_data(_weight_norm(self.param_v, self.param_g, self.dim))\n          self.use_weight_norm = True\n\n     def construct(self, *inputs, **kwargs):\n           if not self.use_weight_norm:\n               return self.module(*inputs, **kwargs)\n          self.assign(self.module.weight, _weight_norm(self.param_v, self.param_g, self.dim))\n              return self.module(*inputs, **kwargs)\n\n     def remove_weight_norm(self):\n          self.assign(self.module.weight, _weight_norm(self.param_v, self.param_g, self.dim))\n          self.use_weight_norm = False\n",[155],{"type":18,"tag":109,"props":156,"children":157},{"__ignoreMap":7},[158],{"type":24,"value":153},{"type":18,"tag":114,"props":160,"children":161},{},[162,167],{"type":18,"tag":118,"props":163,"children":164},{},[165],{"type":24,"value":166},"4.3 use_weight_norm可以达到移除WeightNorm的目的。即调用remove_weight_norm方法后，将self.use_weight_norm设置为False，当再次construct函数时，就会直接调用self.module，忽略Weight Norm计算。",{"type":18,"tag":118,"props":168,"children":169},{},[170],{"type":24,"value":171},"4.4 self.param_g = Parameter(Tensor(norm_except_dim(self.module.weight, 2, dim))) 实现 w和 ||v|| 的计算，静态图不支持getattr方法，考虑到MindSpore的nn层设计，就固定module的权重为module.weight。",{"type":18,"tag":104,"props":173,"children":175},{"code":174},"def norm_except_dim(v, pow, dim):\n    if dim == -1:\n       return mnp.norm(v, pow)\n    elif dim == 0:\n       output_size = (v.shape[0],) + (1,) * (v.ndim - 1)\n       return mnp.norm(v.view((v.shape[0], -1)), pow, 1).view(output_size)\n    elif dim == (v.ndim - 1):\n       output_size = (1,) * (v.ndim - 1) + (v.shape[v.ndim - 1])\n       return mnp.norm(v.view((-1, v.shape[v.ndim - 1])), pow, 0).view(output_size)\n    else:\n       return norm_except_dim(v.swapaxes(0, dim), pow, dim).swapaxes(0,dim)\n\ndef _weight_norm(v, g, dim):\n    return v * (g / norm_except_dim(v, 2, dim))\n",[176],{"type":18,"tag":109,"props":177,"children":178},{"__ignoreMap":7},[179],{"type":24,"value":174},{"type":18,"tag":114,"props":181,"children":182},{},[183],{"type":18,"tag":118,"props":184,"children":185},{},[186],{"type":24,"value":187},"4.5 上述代码WeightNorm中，self.module.weight是要进行归一化的网络权重，self.param_g是长度变量，self.param_v是方向变量， 其中norm_except_dim函数用于计算指定维度的长度。",{"type":18,"tag":26,"props":189,"children":190},{},[191],{"type":18,"tag":30,"props":192,"children":193},{},[194],{"type":24,"value":195},"5. MindSpore的WeightNorm简单使用方式",{"type":18,"tag":104,"props":197,"children":199},{"code":198},"   # assume we need apply weight norm on nn.Dense layer\n   m = WeightNorm(nn.Dense(20, 40))\n   # m.param_g.shape is (40, 1)\n   # m.param_v.shape is (40, 20)\n   # use m as normal nn.Dense\n\n   inputs = Tensor(np.random.randn(10, 20), mstype.float32)\n   outputs = m(inputs)\n\n   # if you want to remove weight norm, just call it\n   m.remove_weight_norm()\n   # m.use_weight_norm == False\n",[200],{"type":18,"tag":109,"props":201,"children":202},{"__ignoreMap":7},[203],{"type":24,"value":198},{"title":7,"searchDepth":205,"depth":205,"links":206},4,[],"markdown","content:technology-blogs:zh:804.md","content","technology-blogs/zh/804.md","technology-blogs/zh/804","md",1776506141444]