[{"data":1,"prerenderedAt":292},["ShallowReactive",2],{"content-query-3NObYPtQVo":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":286,"_id":287,"_source":288,"_file":289,"_stem":290,"_extension":291},"/technology-blogs/zh/2025-12-2","zh",false,"","MindSpore显存救星：手把手教你实现“梯度累积”与断点续训","跳过基础API，直接带你深入MindSpore的底层Cell机制，手动实现一个支持梯度累积的训练封装","2025-12-2","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/11/14/36ab0eb7d52a4d1280f8fe6595b188ea.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":280},"root",[17,25,35,46,58,70,75,83,94,99,104,124,132,140,144,150,160,169,177,185,190,202,210,222,227,235,243,248,275],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore显存救星手把手教你实现梯度累积与断点续训",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":18,"tag":30,"props":31,"children":32},"strong",{},[33],{"type":24,"value":34},"# 01",{"type":18,"tag":26,"props":36,"children":37},{},[38],{"type":18,"tag":30,"props":39,"children":40},{},[41],{"type":18,"tag":30,"props":42,"children":43},{},[44],{"type":24,"value":45},"背景介绍",{"type":18,"tag":26,"props":47,"children":48},{},[49,51,56],{"type":24,"value":50},"在昇腾（Ascend）NPU上训练大模型或高分辨率图像模型时，我们常会遇到一个尴尬的问题：",{"type":18,"tag":30,"props":52,"children":53},{},[54],{"type":24,"value":55},"想要增加Batch Size以稳定收敛，但NPU显存（HBM）却报警了（OOM）",{"type":24,"value":57},"。",{"type":18,"tag":26,"props":59,"children":60},{},[61,63,68],{"type":24,"value":62},"除了增加更大显存的硬件，软件层面最有效的解决方案就是",{"type":18,"tag":30,"props":64,"children":65},{},[66],{"type":24,"value":67},"梯度累积（Gradient Accumulation）",{"type":24,"value":69},"。它的核心思想是将一个大的Batch拆分成多个Micro-Batch依次计算，累积梯度后再更新参数。",{"type":18,"tag":26,"props":71,"children":72},{},[73],{"type":24,"value":74},"本文将跳过基础API，直接带你深入MindSpore的底层Cell机制，手动实现一个支持梯度累积的训练封装。",{"type":18,"tag":26,"props":76,"children":77},{},[78],{"type":18,"tag":30,"props":79,"children":80},{},[81],{"type":24,"value":82},"# 02",{"type":18,"tag":26,"props":84,"children":85},{},[86],{"type":18,"tag":30,"props":87,"children":88},{},[89],{"type":18,"tag":30,"props":90,"children":91},{},[92],{"type":24,"value":93},"原理与核心难点",{"type":18,"tag":26,"props":95,"children":96},{},[97],{"type":24,"value":98},"在MindSpore的Graph模式下，直接写Python循环累积梯度是行不通的（因为会被编译成静态图）。我们需要自定义 TrainOneStepCell。",{"type":18,"tag":26,"props":100,"children":101},{},[102],{"type":24,"value":103},"核心逻辑：",{"type":18,"tag":105,"props":106,"children":107},"ul",{},[108,114,119],{"type":18,"tag":109,"props":110,"children":111},"li",{},[112],{"type":24,"value":113},"**Forward & Backward：**计算当前Micro-Batch的Loss和梯度。",{"type":18,"tag":109,"props":115,"children":116},{},[117],{"type":24,"value":118},"**Accumulate：**将当前梯度加到累积变量（Parameter）中。",{"type":18,"tag":109,"props":120,"children":121},{},[122],{"type":24,"value":123},"**Update：**当达到累积步数（Accumulation Steps）时，应用优化器更新权重，并清零累积变量。",{"type":18,"tag":26,"props":125,"children":126},{},[127],{"type":18,"tag":30,"props":128,"children":129},{},[130],{"type":24,"value":131},"# 03",{"type":18,"tag":26,"props":133,"children":134},{},[135],{"type":18,"tag":30,"props":136,"children":137},{},[138],{"type":24,"value":139},"代码实战：自定义梯度累积Cell",{"type":18,"tag":141,"props":142,"children":143},"h3",{"id":7},[],{"type":18,"tag":141,"props":145,"children":147},{"id":146},"下面的代码演示了如何封装一个通用的梯度累积训练步",[148],{"type":24,"value":149},"下面的代码演示了如何封装一个通用的梯度累积训练步。",{"type":18,"tag":151,"props":152,"children":154},"pre",{"code":153},"import mindspore as ms\nfrom mindspore import nn, ops, Tensor, Parameter\nfrom mindspore.common import dtype as mstype\n\nclass TrainOneStepWithAccumulation(nn.Cell):\n    \"\"\"\n    支持梯度累积的自定义训练步封装\n    network: 前向网络\n    optimizer: 优化器\n    accumulate_step: 累积步数 (例如 4)\n    sens: Loss缩放系数 (用于混合精度)\n    \"\"\"\n    def __init__(self, network, optimizer, accumulate_step, sens=1.0):\n        super(TrainOneStepWithAccumulation, self).__init__()\n        self.network = network\n        self.optimizer = optimizer\n        self.accumulate_step = accumulate_step\n        self.weights = self.optimizer.parameters\n        \n        # 定义梯度计算函数\n        self.grad = ops.GradOperation(get_by_list=True, sens_param=True)\n        self.sens = sens\n        \n        # 创建用于存储累积梯度的Parameter\n        # 注意：必须初始化为0，且不参与优化器更新\n        self.accumulated_grads = self.weights.clone(prefix=\"acc_grad\", init='zeros')\n        \n        # 内部计数器\n        self.counter = Parameter(Tensor(0, mstype.int32), name=\"accumulate_counter\")\n        \n        # 算子定义\n        self.hyper_map = ops.HyperMap()\n        self.partial = ops.Partial()\n        self.assign_add = ops.AssignAdd()\n        self.reset_acc = ops.Assign()\n    \n    def construct(self, data, label):\n        # 1. 计算当前Micro-Batch的梯度\n        weights = self.weights\n        loss = self.network(data, label)\n        \n        # 构造sens tensor用于反向传播\n        sens = ops.Fill()(ops.DType()(loss), ops.Shape()(loss), self.sens)\n        grads = self.grad(self.network, weights)(data, label, sens)\n        \n        # 2. 累积梯度 (grads / accumulate_step)\n        # 我们通常在累积时平均，或者在Loss计算时平均，这里选择直接累积\n        success = self.hyper_map(ops.partial(self.assign_add), self.accumulated_grads, grads)\n        \n        # 3. 计数器 +1\n        loss = ops.depend(loss, success)\n        current_step = self.assign_add(self.counter, Tensor(1, mstype.int32))\n        \n        # 4. 判断是否达到累积步数\n        if current_step % self.accumulate_step == 0:\n            # 达到累积步数：\n            # a. 使用累积的梯度更新权重\n            self.optimizer(self.accumulated_grads)\n            \n            # b. 清零累积梯度\n            zeros = ops.ZerosLike()(self.accumulated_grads) # 这里需配合HyperMap使用，简化示意\n            # 实际清零逻辑：\n            self.hyper_map(ops.partial(self.reset_acc), self.accumulated_grads, self.weights.clone(init='zeros'))\n            \n            # c. 重置计数器(可选，防止溢出)\n            # self.reset_acc(self.counter, Tensor(0, mstype.int32))\n        return loss\n",[155],{"type":18,"tag":156,"props":157,"children":158},"code",{"__ignoreMap":7},[159],{"type":24,"value":153},{"type":18,"tag":161,"props":162,"children":163},"blockquote",{},[164],{"type":18,"tag":26,"props":165,"children":166},{},[167],{"type":24,"value":168},"注意：上述代码为了通过静态图编译，需要严格遵守MindSpore的语法规范。在实际工程中，还需要处理sens的动态调整（Loss Scale），这在AMP（混合精度）模式下尤为重要。MindSpore高阶API boost模块中也提供了相关实验性特性，但在定制化场景下，手动实现Cell是最可控的。",{"type":18,"tag":26,"props":170,"children":171},{},[172],{"type":18,"tag":30,"props":173,"children":174},{},[175],{"type":24,"value":176},"# 04",{"type":18,"tag":26,"props":178,"children":179},{},[180],{"type":18,"tag":30,"props":181,"children":182},{},[183],{"type":24,"value":184},"避坑：关于Ckpt的保存与加载",{"type":18,"tag":26,"props":186,"children":187},{},[188],{"type":24,"value":189},"在使用了梯度累积后，训练过程中的global_step概念会发生变化。在保存Checkpoint时，需要注意以下两点：",{"type":18,"tag":26,"props":191,"children":192},{},[193,195,200],{"type":24,"value":194},"1、",{"type":18,"tag":30,"props":196,"children":197},{},[198],{"type":24,"value":199},"异步保存：",{"type":24,"value":201}," 在Ascend上，IO操作（写磁盘）如果不异步进行，会严重阻塞计算流水线。",{"type":18,"tag":151,"props":203,"children":205},{"code":204},"# 必须配置 async_save=True\nconfig_ck = ms.CheckpointConfig(save_checkpoint_steps=1000, \n                                keep_checkpoint_max=5, \n                                async_save=True)\n",[206],{"type":18,"tag":156,"props":207,"children":208},{"__ignoreMap":7},[209],{"type":24,"value":204},{"type":18,"tag":26,"props":211,"children":212},{},[213,215,220],{"type":24,"value":214},"2、",{"type":18,"tag":30,"props":216,"children":217},{},[218],{"type":24,"value":219},"断点续训的陷阱：",{"type":24,"value":221}," 加载模型时，如果使用了梯度累积，必须保证加载的优化器状态（Optimizer State）与当前累积步的状态一致。简单的 load_checkpoint可能只加载了权重。",{"type":18,"tag":26,"props":223,"children":224},{},[225],{"type":24,"value":226},"**建议：**在生产环境中，始终将 epoch、cur_step等元数据作为独立的Parameter保存到ckpt中，以便恢复训练时能精准对齐。",{"type":18,"tag":26,"props":228,"children":229},{},[230],{"type":18,"tag":30,"props":231,"children":232},{},[233],{"type":24,"value":234},"# 05",{"type":18,"tag":26,"props":236,"children":237},{},[238],{"type":18,"tag":30,"props":239,"children":240},{},[241],{"type":24,"value":242},"总结",{"type":18,"tag":26,"props":244,"children":245},{},[246],{"type":24,"value":247},"在昇腾算力平台上，显存不应成为制约模型深度的瓶颈。",{"type":18,"tag":105,"props":249,"children":250},{},[251,263],{"type":18,"tag":109,"props":252,"children":253},{},[254,256,261],{"type":24,"value":255},"如果你的模型因为Batch Size太小而无法收敛（BN层震荡），",{"type":18,"tag":30,"props":257,"children":258},{},[259],{"type":24,"value":260},"梯度累积",{"type":24,"value":262},"是必选方案。",{"type":18,"tag":109,"props":264,"children":265},{},[266,268,273],{"type":24,"value":267},"通过继承 nn.Cell自定义训练步，虽然代码量稍大，但能让你完全掌控 NPU 的计算逻辑，实现如 Gradient Clipping（",{"type":18,"tag":30,"props":269,"children":270},{},[271],{"type":24,"value":272},"梯度裁剪",{"type":24,"value":274},"）等更高级的操作。",{"type":18,"tag":26,"props":276,"children":277},{},[278],{"type":24,"value":279},"希望这个硬核技巧能帮大家在昇腾上跑起更大的模型！",{"title":7,"searchDepth":281,"depth":281,"links":282},4,[283,285],{"id":7,"depth":284,"text":7},3,{"id":146,"depth":284,"text":149},"markdown","content:technology-blogs:zh:2025-12-2.md","content","technology-blogs/zh/2025-12-2.md","technology-blogs/zh/2025-12-2","md",1776506118178]