[{"data":1,"prerenderedAt":325},["ShallowReactive",2],{"content-query-AaFqngyEx4":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":10,"date":11,"cover":12,"type":13,"category":14,"body":15,"_type":319,"_id":320,"_source":321,"_file":322,"_stem":323,"_extension":324},"/technology-blogs/en/1802","en",false,"",[9],"MindSpore Made Easy","Mixed precision accelerates computing and reduces the memory usage.","2022-08-12","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/09/15/1b4b42e60bac4c58bd09c16581750e85.png","technology-blogs","Basics",{"type":16,"children":17,"toc":306},"root",[18,32,38,49,54,59,64,69,81,86,97,102,111,116,121,126,135,140,145,150,159,169,208,232,242,251,256,261,269,274,282,291,296,301],{"type":19,"tag":20,"props":21,"children":23},"element","h1",{"id":22},"mindspore-made-easy-summary-of-training-with-mixed-precision",[24,30],{"type":19,"tag":25,"props":26,"children":27},"span",{},[28],{"type":29,"value":9},"text",{"type":29,"value":31}," Summary of Training with Mixed Precision",{"type":19,"tag":33,"props":34,"children":35},"p",{},[36],{"type":29,"value":37},"August 12, 2022",{"type":19,"tag":39,"props":40,"children":42},"h2",{"id":41},"_1-overview",[43],{"type":19,"tag":44,"props":45,"children":46},"strong",{},[47],{"type":29,"value":48},"1. Overview",{"type":19,"tag":33,"props":50,"children":51},{},[52],{"type":29,"value":53},"The training with mixed precision method accelerates the deep neural network training process by mixing the single-precision floating-point data format and the half-precision floating-point data format without compromising the network accuracy. It can also accelerate the computing process, reduce memory usage and retrieval, and enable a larger model or batch size to be trained on specific hardware. The computation process in MindSpore requires:",{"type":19,"tag":33,"props":55,"children":56},{},[57],{"type":29,"value":58},"1. 
Parameters are stored in FP32 format.",{"type":19,"tag":33,"props":60,"children":61},{},[62],{"type":29,"value":63},"2. During forward propagation, inputs and parameters of FP16 operators are cast from FP32 to FP16.",{"type":19,"tag":33,"props":65,"children":66},{},[67],{"type":29,"value":68},"3. The loss layer is computed in FP32.",{"type":19,"tag":33,"props":70,"children":71},{},[72,74,79],{"type":29,"value":73},"4. During backward propagation, the loss is first multiplied by the ",{"type":19,"tag":44,"props":75,"children":76},{},[77],{"type":29,"value":78},"Loss Scale",{"type":29,"value":80}," value to avoid underflow caused by small gradients.",{"type":19,"tag":33,"props":82,"children":83},{},[84],{"type":29,"value":85},"5. The FP16 parameters are used in gradient calculation, and the result is cast back to FP32.",{"type":19,"tag":33,"props":87,"children":88},{},[89,91,95],{"type":29,"value":90},"6. The result is divided by ",{"type":19,"tag":44,"props":92,"children":93},{},[94],{"type":29,"value":78},{"type":29,"value":96}," to restore the scaled gradient to its original magnitude.",{"type":19,"tag":33,"props":98,"children":99},{},[100],{"type":29,"value":101},"7. The optimizer checks whether the gradient overflows. If so, the optimizer skips the update. Otherwise, the optimizer updates the original FP32 parameters.",{"type":19,"tag":39,"props":103,"children":105},{"id":104},"_2-application-scenarios",[106],{"type":19,"tag":44,"props":107,"children":108},{},[109],{"type":29,"value":110},"2. Application Scenarios",{"type":19,"tag":33,"props":112,"children":113},{},[114],{"type":29,"value":115},"Mixed precision can accelerate computing and reduce memory usage. Therefore, you can use it in the following scenarios:",{"type":19,"tag":33,"props":117,"children":118},{},[119],{"type":29,"value":120},"1. Memory resources are insufficient.",{"type":19,"tag":33,"props":122,"children":123},{},[124],{"type":29,"value":125},"2. 
The training speed is low.",{"type":19,"tag":39,"props":127,"children":129},{"id":128},"_3-usage-rules",[130],{"type":19,"tag":44,"props":131,"children":132},{},[133],{"type":29,"value":134},"3. Usage Rules",{"type":19,"tag":33,"props":136,"children":137},{},[138],{"type":29,"value":139},"This blog is intended for users who:",{"type":19,"tag":33,"props":141,"children":142},{},[143],{"type":29,"value":144},"1. Have a basic understanding of MindSpore and are about to start MindSpore training code migration tasks.",{"type":19,"tag":33,"props":146,"children":147},{},[148],{"type":29,"value":149},"2. Have completed MindSpore training code migration tasks, that is, already have working MindSpore training code.",{"type":19,"tag":39,"props":151,"children":153},{"id":152},"_4-usage-samples",[154],{"type":19,"tag":44,"props":155,"children":156},{},[157],{"type":29,"value":158},"4. Usage Samples",{"type":19,"tag":160,"props":161,"children":163},"h3",{"id":162},"_1-mindspore-high-level-apis-with-mixed-precision",[164],{"type":19,"tag":44,"props":165,"children":166},{},[167],{"type":29,"value":168},"(1) MindSpore High-Level APIs with Mixed Precision",{"type":19,"tag":33,"props":170,"children":171},{},[172,174,179,181,185,187,192,194,199,201,206],{"type":29,"value":173},"MindSpore encapsulates mixed precision in the ",{"type":19,"tag":44,"props":175,"children":176},{},[177],{"type":29,"value":178},"mindspore.Model",{"type":29,"value":180}," interface for users to call. The specific implementation procedure is the same as that of writing common training code. 
You only need to set parameters related to mixed precision in ",{"type":19,"tag":44,"props":182,"children":183},{},[184],{"type":29,"value":178},{"type":29,"value":186},", for example, ",{"type":19,"tag":44,"props":188,"children":189},{},[190],{"type":29,"value":191},"amp_level",{"type":29,"value":193},", ",{"type":19,"tag":44,"props":195,"children":196},{},[197],{"type":29,"value":198},"loss_scale_manager",{"type":29,"value":200}," and ",{"type":19,"tag":44,"props":202,"children":203},{},[204],{"type":29,"value":205},"keep_batchnorm_fp32",{"type":29,"value":207},".",{"type":19,"tag":33,"props":209,"children":210},{},[211,213,217,219,223,225,230],{"type":29,"value":212},"Modify the ",{"type":19,"tag":44,"props":214,"children":215},{},[216],{"type":29,"value":178},{"type":29,"value":218}," interface in the high-level API code to set ",{"type":19,"tag":44,"props":220,"children":221},{},[222],{"type":29,"value":191},{"type":29,"value":224}," to ",{"type":19,"tag":44,"props":226,"children":227},{},[228],{"type":29,"value":229},"O3",{"type":29,"value":231},". Then, the network uses mixed precision for training.",{"type":19,"tag":233,"props":234,"children":236},"pre",{"code":235},"net = Model(net, loss, opt, metrics=metrics, amp_level=\"O3\")\n",[237],{"type":19,"tag":238,"props":239,"children":240},"code",{"__ignoreMap":7},[241],{"type":29,"value":235},{"type":19,"tag":160,"props":243,"children":245},{"id":244},"_2-mindspore-low-level-apis-with-mixed-precision",[246],{"type":19,"tag":44,"props":247,"children":248},{},[249],{"type":29,"value":250},"(2) MindSpore Low-Level APIs with Mixed Precision",{"type":19,"tag":33,"props":252,"children":253},{},[254],{"type":29,"value":255},"To enable MindSpore low-level APIs to use mixed precision, you only need to enable the mixed precision training of the network in the step of constructing a model using MindSpore low-level API code. 
The following compares the two model construction modes.",{"type":19,"tag":33,"props":257,"children":258},{},[259],{"type":29,"value":260},"Construct a model using the MindSpore low-level API code:",{"type":19,"tag":233,"props":262,"children":264},{"code":263},"class BuildTrainNetwork(nn.Cell):\n\n    '''Build train network.'''\n\n    def __init__(self, my_network, my_criterion, train_batch_size, class_num):\n\n        super(BuildTrainNetwork, self).__init__()\n\n        self.network = my_network\n\n        self.criterion = my_criterion\n\n        self.print = P.Print()\n\n        # Initialize self.output\n\n        self.output = mindspore.Parameter(Tensor(np.ones((train_batch_size,\n\n                        class_num)), mindspore.float32), requires_grad=False)\n\n\n\n    def construct(self, input_data, label):\n\n        output = self.network(input_data)\n\n        # Get the network output and assign it to self.output\n\n        self.output = output\n\n        loss0 = self.criterion(output, label)\n\n        return loss0\n\nclass TrainOneStepCellV2(TrainOneStepCell):\n\n    def __init__(self, network, optimizer, sens=1.0):\n\n        super(TrainOneStepCellV2, self).__init__(network, optimizer, sens=1.0)\n\n\n\n    def construct(self, *inputs):\n\n        weights = self.weights\n\n        loss = self.network(*inputs)\n\n        # Obtain self.network from BuildTrainNetwork\n\n        output = self.network.output\n\n        sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)\n\n        # Get the gradient of the network parameters\n\n        grads = self.grad(self.network, weights)(*inputs, sens)\n\n        grads = self.grad_reducer(grads)\n\n        # Optimize model parameters\n\n        loss = F.depend(loss, self.optimizer(grads))\n\n        return loss, output\n\n        \n\nmodel_constructed = BuildTrainNetwork(net, loss_function,\n\n                                TRAIN_BATCH_SIZE, CLASS_NUM)\n\nmodel_constructed = TrainOneStepCellV2(model_constructed, 
opt)\n",[265],{"type":19,"tag":238,"props":266,"children":267},{"__ignoreMap":7},[268],{"type":29,"value":263},{"type":19,"tag":33,"props":270,"children":271},{},[272],{"type":29,"value":273},"Construct a model using the MindSpore low-level API code with mixed precision:",{"type":19,"tag":233,"props":275,"children":277},{"code":276},"class BuildTrainNetwork(nn.Cell):\n\n    '''Build train network.'''\n\n    def __init__(self, my_network, my_criterion, train_batch_size, class_num):\n\n        super(BuildTrainNetwork, self).__init__()\n\n        self.network = my_network\n\n        self.criterion = my_criterion\n\n        self.print = P.Print()\n\n        # Initialize self.output\n\n        self.output = mindspore.Parameter(Tensor(np.ones((train_batch_size,\n\n                        class_num)), mindspore.float32), requires_grad=False)\n\n\n\n    def construct(self, input_data, label):\n\n        output = self.network(input_data)\n\n        # Get the network output and assign it to self.output\n\n        self.output = output\n\n        loss0 = self.criterion(output, label)\n\n        return loss0\n\nclass TrainOneStepCellV2(TrainOneStepCell):\n\n    '''Build train network.'''\n\n    def __init__(self, network, optimizer, sens=1.0):\n\n        super(TrainOneStepCellV2, self).__init__(network, optimizer, sens=1.0)\n\n\n\n    def construct(self, *inputs):\n\n        weights = self.weights\n\n        loss = self.network(*inputs)\n\n        # Obtain self.network from BuildTrainNetwork\n\n        output = self.network.output\n\n        sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)\n\n        # Get the gradient of the network parameters\n\n        grads = self.grad(self.network, weights)(*inputs, sens)\n\n        grads = self.grad_reducer(grads)\n\n        # Optimize model parameters\n\n        loss = F.depend(loss, self.optimizer(grads))\n\n        return loss, output\n\n        def build_train_network_step2(network, optimizer,\n\n            loss_fn=None, 
level='O0', **kwargs):\n\n    \"\"\"\n\n    Build the mixed precision training cell automatically.\n\n    \"\"\"\n\n    amp.validator.check_value_type('network', network, nn.Cell)\n\n    amp.validator.check_value_type('optimizer', optimizer, nn.Optimizer)\n\n    amp.validator.check('level', level, \"\", ['O0', 'O2', 'O3', \"auto\"],\n\n                        amp.Rel.IN)\n\n\n\n    if level == \"auto\":\n\n        device_target = context.get_context('device_target')\n\n        if device_target == \"GPU\":\n\n            level = \"O2\"\n\n        elif device_target == \"Ascend\":\n\n            level = \"O3\"\n\n        else:\n\n            raise ValueError(\n\n        \"Level `auto` only support when `device_target` is GPU or Ascend.\")\n\n\n\n    amp._check_kwargs(kwargs)\n\n    config = dict(amp._config_level[level], **kwargs)\n\n    config = amp.edict(config)\n\n\n\n    if config.cast_model_type == mstype.float16:\n\n        network.to_float(mstype.float16)\n\n\n\n        if config.keep_batchnorm_fp32:\n\n            amp._do_keep_batchnorm_fp32(network)\n\n\n\n    if loss_fn:\n\n        network = amp._add_loss_network(network, loss_fn,\n\n                                    config.cast_model_type)\n\n\n\n    if amp._get_parallel_mode() in (amp.ParallelMode.SEMI_AUTO_PARALLEL,\n\n                                    amp.ParallelMode.AUTO_PARALLEL):\n\n        network = amp._VirtualDatasetCell(network)\n\n\n\n    loss_scale = 1.0\n\n    if config.loss_scale_manager is not None:\n\n        loss_scale_manager = config.loss_scale_manager\n\n        loss_scale = loss_scale_manager.get_loss_scale()\n\n        update_cell = loss_scale_manager.get_update_cell()\n\n        if update_cell is not None:\n\n            # only cpu not support `TrainOneStepWithLossScaleCell` for control flow.\n\n            if not context.get_context(\"enable_ge\")\n\n                and context.get_context(\"device_target\") == \"CPU\":\n\n                raise ValueError(\"Only 
`loss_scale_manager=None` and \"\n\n                \"`loss_scale_manager=FixedLossScaleManager`\"\n\n                \"are supported in current version. If you use `O2` option,\"\n\n                \"use `loss_scale_manager=None` or `FixedLossScaleManager`\")\n\n            network = TrainOneStepCellV2(network, optimizer)\n\n            return network\n\n    network = TrainOneStepCellV2(network, optimizer)\n\n    return network\n\n    \n\nmodel_constructed = BuildTrainNetwork(net, loss_function, TRAIN_BATCH_SIZE, CLASS_NUM)\n\nmodel_constructed = build_train_network_step2(model_constructed, opt, level=\"O3\")\n",[278],{"type":19,"tag":238,"props":279,"children":280},{"__ignoreMap":7},[281],{"type":29,"value":276},{"type":19,"tag":39,"props":283,"children":285},{"id":284},"_5-performance-comparison",[286],{"type":19,"tag":44,"props":287,"children":288},{},[289],{"type":29,"value":290},"5. Performance Comparison",{"type":19,"tag":33,"props":292,"children":293},{},[294],{"type":29,"value":295},"Compared with full-precision training, the performance is greatly improved after the mixed precision is used.",{"type":19,"tag":33,"props":297,"children":298},{},[299],{"type":29,"value":300},"Low-level APIs: 2000 imgs/sec; Low-level APIs with mixed precision: 3200 imgs/sec",{"type":19,"tag":33,"props":302,"children":303},{},[304],{"type":29,"value":305},"High-level APIs: 2200 imgs/sec; High-level APIs with mixed precision: 3300 imgs/sec",{"title":7,"searchDepth":307,"depth":307,"links":308},4,[309,311,312,313,318],{"id":41,"depth":310,"text":48},2,{"id":104,"depth":310,"text":110},{"id":128,"depth":310,"text":134},{"id":152,"depth":310,"text":158,"children":314},[315,317],{"id":162,"depth":316,"text":168},3,{"id":244,"depth":316,"text":250},{"id":284,"depth":310,"text":290},"markdown","content:technology-blogs:en:1802.md","content","technology-blogs/en/1802.md","technology-blogs/en/1802","md",1776506104675]