[{"data":1,"prerenderedAt":259},["ShallowReactive",2],{"content-query-5x316KSVCa":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":253,"_id":254,"_source":255,"_file":256,"_stem":257,"_extension":258},"/technology-blogs/zh/1830","zh",false,"","【MindSpore易点通】网络构建经验总结下篇","MindSpore实现梯度不回传以及梯度回传后不更新权重","2022-09-09","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/09/26/d60fd10c946140749ce536c7b250fee5.png","technology-blogs","基础知识",{"type":15,"children":16,"toc":232},"root",[17,25,35,44,50,58,63,73,78,86,95,103,108,116,121,129,138,146,151,159,164,172,181,189,203,211,219,224],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore易点通网络构建经验总结下篇",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":29},"h2",{"id":28},"mindspore实现梯度不回传以及梯度回传后不更新权重",[30],{"type":18,"tag":31,"props":32,"children":33},"strong",{},[34],{"type":24,"value":9},{"type":18,"tag":36,"props":37,"children":39},"h3",{"id":38},"背景信息",[40],{"type":18,"tag":31,"props":41,"children":42},{},[43],{"type":24,"value":38},{"type":18,"tag":45,"props":46,"children":47},"p",{},[48],{"type":24,"value":49},"训练中经常会用到某层的梯度不回传（比如互学习）或者梯度回传但是不更新权重（Fine-tuning）",{"type":18,"tag":36,"props":51,"children":53},{"id":52},"经验总结",[54],{"type":18,"tag":31,"props":55,"children":56},{},[57],{"type":24,"value":52},{"type":18,"tag":45,"props":59,"children":60},{},[61],{"type":24,"value":62},"1. 
梯度不回传使用stop_gradient接口实现，代码示例如下：",{"type":18,"tag":64,"props":65,"children":67},"pre",{"code":66},"import mindspore.nn as nn\nfrom mindspore.ops import operations as P\nfrom mindspore.ops import functional as F\nfrom mindspore.nn.loss.loss import _Loss\nfrom mindspore import Tensor, Parameter\nfrom mindspore.common import dtype as mstype\nfrom mindspore.ops.functional import stop_gradient\n\nclass Contrastive(_Loss):\n\n    def __init__(self, args):\n\n        super(Contrastive, self).__init__()\n\n        self.args = args\n\n        self.stride_slice = P.StridedSlice()\n\n        self.pow = P.Pow()\n\n        self.sum = P.CumSum()\n\n        self.dist_weight = Tensor(4, dtype=mstype.float32)\n\n        emb_list = list(range(args.per_batch_size))\n\n        emb1_list = emb_list[0::2]\n\n        emb2_list = emb_list[1::2]\n\n        self.emb1_param = Tensor(emb1_list, dtype=mstype.int32)\n\n        self.emb2_param = Tensor(emb2_list, dtype=mstype.int32)\n\n        self.add = P.TensorAdd()\n\n        self.div = P.RealDiv()\n\n        self.cast = P.Cast()\n\n        self.gatherv2 = P.GatherV2()\n\n\n\n    def construct(self, nembeddings):\n\n        nembeddings_shape = F.shape(nembeddings)\n\n        emb1 = self.gatherv2(nembeddings, self.emb1_param, 0)\n\n        emb2 = self.gatherv2(nembeddings, self.emb2_param, 0)\n\n        emb2_detach = stop_gradient(emb2)      # 阻止emb2的梯度回传\n\n        emb3 = emb1 - emb2_detach\n\n        pow_emb3 = emb3 * emb3\n\n        dist = self.sum(pow_emb3, 1)\n\n        return self.div(dist*self.dist_weight, self.cast(F.scalar_to_array(nembeddings_shape[0]), mstype.float32))\n",[68],{"type":18,"tag":69,"props":70,"children":71},"code",{"__ignoreMap":7},[72],{"type":24,"value":66},{"type":18,"tag":45,"props":74,"children":75},{},[76],{"type":24,"value":77},"2. 
梯度回传后不更新权重，使用requires_grad=False来实现，代码示例如下（假设要把名字为conv1的层权重冻结）：",{"type":18,"tag":64,"props":79,"children":81},{"code":80},"for param in net.trainable_params():\n\n    if 'conv1' in param.name:\n\n        param.requires_grad = False\n\n    else:\n\n        param.requires_grad = True\n",[82],{"type":18,"tag":69,"props":83,"children":84},{"__ignoreMap":7},[85],{"type":24,"value":80},{"type":18,"tag":26,"props":87,"children":89},{"id":88},"mindspore中使用loss-scalefeed模式下关于sens参数的配置",[90],{"type":18,"tag":31,"props":91,"children":92},{},[93],{"type":24,"value":94},"MindSpore中使用Loss Scale（Feed模式下）关于sens参数的配置",{"type":18,"tag":36,"props":96,"children":98},{"id":97},"背景信息-1",[99],{"type":18,"tag":31,"props":100,"children":101},{},[102],{"type":24,"value":38},{"type":18,"tag":45,"props":104,"children":105},{},[106],{"type":24,"value":107},"D芯片的卷积只有FP16精度，所以用D芯片训练一定是在跑混合精度。为避免梯度下溢，需要使用Loss Scale。",{"type":18,"tag":36,"props":109,"children":111},{"id":110},"经验总结-1",[112],{"type":18,"tag":31,"props":113,"children":114},{},[115],{"type":24,"value":52},{"type":18,"tag":45,"props":117,"children":118},{},[119],{"type":24,"value":120},"Feed模式流程下，接口中Optimizer和TrainOneStepCell的sens需要手动设置成同一数值",{"type":18,"tag":64,"props":122,"children":124},{"code":123},"opt = nn.Momentum(params=train_net.trainable_params(),\n\n                  learning_rate=lr_iter,\n\n                  momentum=0.9,\n\n                  weight_decay=0.0001,\n\n                  loss_scale=1000.0)\n\ntrain_net = TrainOneStepCell(train_net, opt, 
sens=1000.0)\n",[125],{"type":18,"tag":69,"props":126,"children":127},{"__ignoreMap":7},[128],{"type":24,"value":123},{"type":18,"tag":26,"props":130,"children":132},{"id":131},"mindspore中使用sequentialcell的输入必须为nncell组成的list",[133],{"type":18,"tag":31,"props":134,"children":135},{},[136],{"type":24,"value":137},"MindSpore中使用SequentialCell的输入必须为nn.Cell组成的List",{"type":18,"tag":36,"props":139,"children":141},{"id":140},"背景信息-2",[142],{"type":18,"tag":31,"props":143,"children":144},{},[145],{"type":24,"value":38},{"type":18,"tag":45,"props":147,"children":148},{},[149],{"type":24,"value":150},"PyTorch在网络定义中经常使用torch.nn.Sequential来构造算子的列表，在MindSpore中要使用mindspore.nn.SequentialCell来实现这个功能。",{"type":18,"tag":36,"props":152,"children":154},{"id":153},"经验总结-2",[155],{"type":18,"tag":31,"props":156,"children":157},{},[158],{"type":24,"value":52},{"type":18,"tag":45,"props":160,"children":161},{},[162],{"type":24,"value":163},"mindspore.nn.SequentialCell的输入和PyTorch的Sequential有所不同，输入必须为Cell组成的List，否则会有不符合预期的错误。 使用示例如下：",{"type":18,"tag":64,"props":165,"children":167},{"code":166},"class MyNet(nn.Cell):\n\n    def __init__(self):\n\n        super(MyNet, self).__init__()\n\n        self.conv = nn.Conv2d(16, 64, 3, pad_mode='pad', padding=0, dilation=2)\n\n        self.bn = nn.BatchNorm2d(64)\n\n        self.relu = nn.ReLU()\n\n        self.seq = nn.SequentialCell([self.conv, self.bn, self.relu])   #这里必须把nn.Cell的对象包装为List作为SequentialCell的输入\n\n\n\n    def construct(self, x):\n\n        x = self.seq(x)\n\n        return x\n",[168],{"type":18,"tag":69,"props":169,"children":170},{"__ignoreMap":7},[171],{"type":24,"value":166},{"type":18,"tag":26,"props":173,"children":175},{"id":174},"transformer中positional-encoding的mindspore简单实现",[176],{"type":18,"tag":31,"props":177,"children":178},{},[179],{"type":24,"value":180},"Transformer中Positional 
Encoding的MindSpore简单实现",{"type":18,"tag":36,"props":182,"children":184},{"id":183},"背景信息-3",[185],{"type":18,"tag":31,"props":186,"children":187},{},[188],{"type":24,"value":38},{"type":18,"tag":45,"props":190,"children":191},{},[192,201],{"type":18,"tag":193,"props":194,"children":198},"a",{"href":195,"rel":196},"https://arxiv.org/pdf/1706.03762.pdf",[197],"nofollow",[199],{"type":24,"value":200},"《Attention Is All You Need》",{"type":24,"value":202},"中的位置编码方法，Transformer中较为常用。公式如下：",{"type":18,"tag":45,"props":204,"children":205},{},[206],{"type":18,"tag":207,"props":208,"children":210},"img",{"alt":7,"src":209},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/09/26/bf3a3f77315e43efa0821d2ce410511a.png",[],{"type":18,"tag":36,"props":212,"children":214},{"id":213},"经验总结-3",[215],{"type":18,"tag":31,"props":216,"children":217},{},[218],{"type":24,"value":52},{"type":18,"tag":45,"props":220,"children":221},{},[222],{"type":24,"value":223},"为了适用于动态shape的输入，又由于mindspore.nn.Cell.construct中不便于进行numpy操作，采用先生成一个足够长的positional encoding向量再根据输入长度进行截取的方法。",{"type":18,"tag":64,"props":225,"children":227},{"code":226},"import mindspore.ops.operations as P\nimport mindspore.nn as nn\nfrom mindspore.common import dtype as mstype\nfrom mindspore import Tensor\nimport numpy as np\nimport math\n\nclass PositionalEncoding(nn.Cell):\n\n    \"\"\"Positional encoding as in Sec 3.5 https://arxiv.org/pdf/1706.03762.pdf\n\n\n\n    :param int dim: dimension of input\n\n    :param int maxlen: upper limit of sequence length\n\n    :param float dropout_rate: dropout rate\n\n\n\n    \"\"\"\n\n\n\n    def __init__(self, dim, maxlen=10000, dropout_rate=0.1):\n\n        \"\"\"Construct a PositionalEncoding object.\"\"\"\n\n        super(PositionalEncoding, self).__init__()\n\n\n\n        xscale = math.sqrt(dim)\n\n        self.dropout = nn.Dropout(1 - dropout_rate)\n\n        self.mul = P.Mul()\n\n        self.add = P.TensorAdd()\n\n        self.shape = P.Shape()\n\n\n\n        
self.pe = self.postion_encoding_table(maxlen, dim)\n\n        self.te = Tensor([xscale, ], mstype.float32)\n\n\n\n    def construct(self, x):\n\n        \"\"\"\n\n        Add positional encoding\n\n        :param mindspore.Tensor x: batches of inputs (B, len, dim)\n\n        :return: Encoded x (B, len, dim)\n\n        \"\"\"\n\n        (_, l, _) = self.shape(x)\n\n        pos = self.pe[:, :l, :]\n\n        x = self.mul(x, self.te)\n\n        x = self.add(x, pos)\n\n        x = self.dropout(x)\n\n        return x\n\n\n\n    def postion_encoding_table(self, max_length, dims):\n\n        pe = np.zeros((max_length, dims))\n\n        position = np.arange(0, max_length).reshape((max_length, 1))\n\n        div_term = np.exp(np.arange(0, dims, 2) * (-(math.log(10000.0) / dims)))\n\n        div_term = div_term.reshape((1, div_term.shape[0]))\n\n        pe[:, 0::2] = np.sin(np.matmul(position, div_term))\n\n        pe[:, 1::2] = np.cos(np.matmul(position, div_term))\n\n        pe = pe.reshape((1, max_length, dims))\n\n        pe = Tensor(pe, mstype.float32)\n\n        return pe\n",[228],{"type":18,"tag":69,"props":229,"children":230},{"__ignoreMap":7},[231],{"type":24,"value":226},{"title":7,"searchDepth":233,"depth":233,"links":234},4,[235,241,245,249],{"id":28,"depth":236,"text":9,"children":237},2,[238,240],{"id":38,"depth":239,"text":38},3,{"id":52,"depth":239,"text":52},{"id":88,"depth":236,"text":94,"children":242},[243,244],{"id":97,"depth":239,"text":38},{"id":110,"depth":239,"text":52},{"id":131,"depth":236,"text":137,"children":246},[247,248],{"id":140,"depth":239,"text":38},{"id":153,"depth":239,"text":52},{"id":174,"depth":236,"text":180,"children":250},[251,252],{"id":183,"depth":239,"text":38},{"id":213,"depth":239,"text":52},"markdown","content:technology-blogs:zh:1830.md","content","technology-blogs/zh/1830.md","technology-blogs/zh/1830","md",1776506116036]