[{"data":1,"prerenderedAt":286},["ShallowReactive",2],{"content-query-etyk6wyIM1":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":280,"_id":281,"_source":282,"_file":283,"_stem":284,"_extension":285},"/technology-blogs/zh/2026-2-2","zh",false,"","HyperOffload：设备内存优化技术深度解析","采用\"数据换时间\"的优化思路","2026-2-2","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/28/8e0e0150508a4c5ba4287fa3bec8ea3f.png","technology-blogs","技术解读",{"type":15,"children":16,"toc":277},"root",[17,25,31,37,42,47,52,57,62,68,73,78,83,88,93,98,103,113,118,126,131,139,144,152,160,171,176,181,186,192,197,202,207,212,217,223,228,236,241,247,252,257,268],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"hyperoffload设备内存优化技术深度解析",[23],{"type":24,"value":8},"text",{"type":18,"tag":19,"props":26,"children":28},{"id":27},"_01-核心问题与解决思路",[29],{"type":24,"value":30},"01 核心问题与解决思路",{"type":18,"tag":32,"props":33,"children":34},"p",{},[35],{"type":24,"value":36},"1.1 显存受限的本质问题",{"type":18,"tag":32,"props":38,"children":39},{},[40],{"type":24,"value":41},"在模型训练与推理过程中，模型参数与中间 Activation 的显存占用往往超出设备显存容量限制。传统方案要求用户手动管理数据 placement 或使用模型并行分割策略，开发门槛较高且灵活性不足。",{"type":18,"tag":32,"props":43,"children":44},{},[45],{"type":24,"value":46},"1.2 数据 Offload 优化策略",{"type":18,"tag":32,"props":48,"children":49},{},[50],{"type":24,"value":51},"HyperOffload 采用\"数据换时间\"的优化思路：将不常用的数据暂时移到主机端，需要时再移回来。通过这种方式，虽然增加了数据传输开销，但显著降低了峰值显存占用，使原本无法在有限显存环境下执行的大模型得以运行。",{"type":18,"tag":32,"props":53,"children":54},{},[55],{"type":24,"value":56},"1.3 图优化层三大任务",{"type":18,"tag":32,"props":58,"children":59},{},[60],{"type":24,"value":61},"图优化层承担三项核心职责：首先是分析计算图，确定哪些数据适合 Offload；其次是决定 Offload 时机，包括何时将数据从设备移到主机、何时从主机移回设备；最后是修改计算图，插入必要的 Offload 操作节点，并调整执行顺序以实现计算与传输的重叠执行。",{"type":18,"tag":19,"props":63,"children":65},{"id":64},"_02-图优化层代码逻辑详解",[66],{"type":24,"value":67},"02 
图优化层代码逻辑详解",{"type":18,"tag":32,"props":69,"children":70},{},[71],{"type":24,"value":72},"2.1 优化器主流程",{"type":18,"tag":32,"props":74,"children":75},{},[76],{"type":24,"value":77},"HyperOffloadOptimizer::Run() 是图优化层的核心入口，其执行逻辑遵循清晰的流水线式设计。",{"type":18,"tag":32,"props":79,"children":80},{},[81],{"type":24,"value":82},"执行 SkipHyperOffloadOptimizer() 完成配置检查后，operations_.Init() 初始化操作管理队列，为每个执行节点预留操作存储空间。随后根据使能状态分别处理 Parameter Offload 和 Activation Offload 两类场景。",{"type":18,"tag":32,"props":84,"children":85},{},[86],{"type":24,"value":87},"GenerateParameterOperations() 负责为远程 Parameter 在恰当位置插入加载与释放操作，GenerateActivationOperations() 则通过分析节点依赖关系，识别适合 Offload 的中间结果并插入对应的传输节点。",{"type":18,"tag":32,"props":89,"children":90},{},[91],{"type":24,"value":92},"在完成节点插入后，AdjustHyperOffloadNodePosition() 调整 H2D 节点的物理位置，根据 prefetch_distance 配置让数据加载操作适当提前，使后续计算能够与数据传输并行执行。接下来 AddEventNodes() 注入 Send/Recv 同步事件，确保数据传输与计算执行之间的正确依赖关系。",{"type":18,"tag":32,"props":94,"children":95},{},[96],{"type":24,"value":97},"最后，BuildExecutionOrder() 将所有 Offload 操作节点按照依赖关系整合进原始执行顺序，形成包含新增节点的完整执行序列；AssignHyperOffloadIds() 为所有 Offload 相关节点分配唯一标识符，用于运行时追踪数据流转状态。",{"type":18,"tag":32,"props":99,"children":100},{},[101],{"type":24,"value":102},"2.2  GenerateParameterOperations 伪代码",{"type":18,"tag":104,"props":105,"children":107},"pre",{"code":106},"# 为远程 Parameter 插入 H2D/D2H 操作节点\nfunction GenerateParameterOperations():\n    for each node in exec_order:\n        # 遍历节点的所有输入\n        for i from 0 to node.inputs().size() - 1:\n            input_node = node.input(i)\n            \n            # 处理 Load 节点包裹的 Parameter\n            if IsPrimitiveCNode(input_node, \"Load\"):\n                real_input = GetKernelWithReturnType(input_node, 0, false, \"Load\").first\n                if IsPrimitiveCNode(real_input, \"Load\"):\n                    data_node = real_input.input(1)  # Load 节点的第二个输入\n                    if IsRemoteParameter(data_node):  # 检查是否是远程 Parameter\n                        # 插入 H2D 节点\n             
           to_device = BuildToDeviceNode(data_node)\n                        UpdateNodeInput(node, i, to_device)\n                        operations_.AddOperationBefore(node_index, to_device)\n                        \n                        # 插入 D2H 节点（原地更新）\n                        to_remote = BuildInplaceToHostNode(to_device, data_node, node)\n                        operations_.AddOperationAfter(node_index, to_remote)\n                continue\n            \n            # 直接处理 Parameter 引用\n            if IsRemoteParameter(input_node):\n                # 插入 H2D 节点\n                to_device = BuildToDeviceNode(input_node)\n                UpdateNodeInput(node, i, to_device)\n                operations_.AddOperationBefore(node_index, to_device)\n               \n                # 插入 D2H 节点（原地更新）\n                param_node = GetRemoteParameter(graph, input_node)\n                to_remote = BuildInplaceToHostNode(to_device, param_node, node)\n                operations_.AddOperationAfter(node_index, to_remote)\n",[108],{"type":18,"tag":109,"props":110,"children":111},"code",{"__ignoreMap":7},[112],{"type":24,"value":106},{"type":18,"tag":32,"props":114,"children":115},{},[116],{"type":24,"value":117},"2.3 GenerateActivationOperations 伪代码",{"type":18,"tag":104,"props":119,"children":121},{"code":120},"# 识别需 Offload 的 Activation 并插入 D2H/H2D 节点\nfunction GenerateActivationOperations():\n    # 1. 建立数据 → 使用者映射关系\n    user_info_list = CollectAllNodeUsers(exec_order)\n    \n    # 2. 基于距离策略筛选 Offload 目标\n    strategy = DistanceBaseHyperOffloadStrategy()\n    offload_info_list = strategy.Run(exec_order, user_info_list)\n    \n    # 3. 数量过滤\n    filter = OffloadInfoFilterByNumber()\n    offload_info_list = filter.Filter(offload_info_list)\n    \n    # 4. 获取图输出节点\n    outputs = GetAllOutputWithIndex(graph.output())\n    \n    # 5. 
为每个目标插入 Offload 节点\n    for each offload_info in offload_info_list:\n        AddSingleActivationOperations(offload_info, outputs)\n\n# 单个 Activation 的 Offload 节点插入\nfunction AddSingleActivationOperations(offload_info, outputs):\n    data_node = offload_info.data_node\n    \n    # 1. 处理 Tuple 类型的输出\n    if data_node.abstract is Tuple:\n        index_node = NewValueNode(offload_info.data_node.index)\n        data_node = graph.NewCNode([TupleGetItem, data_node, index_node])\n    \n    # 2. 在数据产生位置之后插入 D2H 节点\n    to_host = BuildToHostNode(data_node)\n    data_idx = FindIndexInExecOrder(data_node)\n    operations_.AddOperationAfter(data_idx, to_host)\n    \n    # 3. 为每个使用位置插入 H2D 节点\n    for each replace_info in offload_info.replace_info_list:\n        to_device = BuildToDeviceNode(to_host)\n        \n        # 替换原节点输入\n        for each change_node in replace_info.replace_rest_nodes:\n            UpdateNodeInput(change_node, change_node.index, to_device)\n        \n        # 在使用位置之前插入 H2D 节点\n        change_idx = FindIndexInExecOrder(change_node)\n        operations_.AddOperationBefore(change_idx, to_device)\n    \n    # 4. 处理输出节点\n    if data_node in outputs:\n        to_device = BuildToDeviceNode(to_host)\n        UpdateNodeInput(output, output.index, to_device)\n        operations_.AddOperationAfter(last_index, to_device)\n",[122],{"type":18,"tag":109,"props":123,"children":124},{"__ignoreMap":7},[125],{"type":24,"value":120},{"type":18,"tag":32,"props":127,"children":128},{},[129],{"type":24,"value":130},"2.4 AdjustHyperOffloadNodePosition 伪代码",{"type":18,"tag":104,"props":132,"children":134},{"code":133},"# 根据 prefetch_distance 调整 H2D 节点位置，实现数据预取\nfunction AdjustHyperOffloadNodePosition():\n    # 1. 获取所有已插入的 H2D 节点\n    h2d_nodes = operations_.GetAllH2DNodes()\n \n    for each h2d_node in h2d_nodes:\n        # 2. 
获取 H2D 节点关联的原始数据节点\n        data_index = 1  # 输入参数位置\n        input_node = h2d_node.input(data_index)\n \n        if IsD2HNode(input_node):\n            # H2D 节点的输入是 D2H 节点，取 D2H 的输入作为原始数据\n            data_node = input_node.input(data_index)\n        else:\n            data_node = input_node\n \n        # 3. 获取 prefetch_distance（优先使用节点属性，其次使用全局配置）\n        prefetch_distance = GetNodePrefetchDistance(data_node) or GLOBAL_CONFIG.prefetch_distance\n \n        # 4. 将 H2D 节点向前移动指定距离，实现预取\n        operations_.MoveAhead(h2d_node, prefetch_distance)\n",[135],{"type":18,"tag":109,"props":136,"children":137},{"__ignoreMap":7},[138],{"type":24,"value":133},{"type":18,"tag":32,"props":140,"children":141},{},[142],{"type":24,"value":143},"2.5 BuildExecutionOrder 伪代码",{"type":18,"tag":104,"props":145,"children":147},{"code":146},"# 将 Offload 操作节点插入到执行顺序中，构建新执行序列\nfunction BuildExecutionOrder():\n    new_execution_order = []\n    \n    # 1. 添加前置操作（独立于任何执行节点的操作）\n    for op in operations_.GetPreOperations():\n        new_execution_order.append(op)\n    \n    # 2. 
遍历原始执行顺序，交错插入 Offload 操作\n    for i from 0 to len(exec_order) - 1:      \n        # 2.1 添加原始执行节点\n        new_execution_order.append(exec_order[i])\n        \n        # 2.2 添加该位置对应的 Offload 操作（在 exec_order[i] 之后执行）\n        for op in operations_.GetOperations()[i]:\n            new_execution_order.append(op)\n    \n    return new_execution_order\n",[148],{"type":18,"tag":109,"props":149,"children":150},{"__ignoreMap":7},[151],{"type":24,"value":146},{"type":18,"tag":104,"props":153,"children":155},{"code":154},"# 示例：Activation Offload 的节点插入\n# 原始 exec_order: [A, B, C, D, E]\n# 假设 A 的输出 X 被 E 使用，且 A 与 E 之间距离超过阈值，需要 Offload\n#\n# 节点关系：\n#   A(data: X) → E 使用 X\n#   D2H 插入在 A 之后，H2D 插入在 E 之前\n#\n# operations_ 队列：\n#   pre_operations: []  // 无前置操作\n#   operations[0]: [D2H1]     // A 之后：D2H1(input: A)\n#   operations[1]: []         // B 之后\n#   operations[2]: []         // C 之后\n#   operations[3]: [H2D1]     // D 之后（即 E 之前）：H2D1(input: D2H1)\n#   operations[4]: []         // E 之后\n#\n# 构建结果: [A, D2H1(A), B, C, D, H2D1(D2H1), E]\n#\n# 数据流向：\n#   A(X) → D2H1(输入 A 的输出 X，输出 X 到主机) → H2D1(输入 D2H1 的输出 X，将 X 加载回设备) → E(使用重新加载的 X)\n",[156],{"type":18,"tag":109,"props":157,"children":158},{"__ignoreMap":7},[159],{"type":24,"value":154},{"type":18,"tag":161,"props":162,"children":164},"div",{"style":163},"text-align: center;",[165],{"type":18,"tag":166,"props":167,"children":170},"img",{"src":168,"style":169,"alt":7},"/category/information/technology-blogs/banner/2026-2-2/1.jpg","display: block;margin: 0 auto;max-width:70%",[],{"type":18,"tag":32,"props":172,"children":173},{},[174],{"type":24,"value":175},"2.6 关键函数简述",{"type":18,"tag":32,"props":177,"children":178},{},[179],{"type":24,"value":180},"Activation Offload 的处理流程包含四个关键环节。首先，CollectAllNodeUsers() 遍历计算图建立数据与使用者的映射关系，明确每个数据节点被哪些操作消费以及消费的先后顺序。然后 DistanceBaseHyperOffloadStrategy::Run() 基于使用距离判定 Offload 必要性：当数据产生位置与后续使用位置之间的间隔超过 select_distance 阈值时，判定该数据适合 Offload。判定结果经 
OffloadInfoFilterByNumber::Filter() 数量过滤后，最终 AddSingleActivationOperations() 执行实际的节点插入操作：在数据产生处插入 D2H 节点将数据送回主机，在各使用处插入 H2D 节点重新加载数据。",{"type":18,"tag":32,"props":182,"children":183},{},[184],{"type":24,"value":185},"事件注入环节，AddEventNodes() 为每个 D2H 和 H2D 节点配对 Send/Recv 事件，通过在不同执行流上调度发送与接收操作，实现计算与传输的时间重叠。",{"type":18,"tag":19,"props":187,"children":189},{"id":188},"_03-关键数据结构汇总",[190],{"type":24,"value":191},"03 关键数据结构汇总",{"type":18,"tag":32,"props":193,"children":194},{},[195],{"type":24,"value":196},"HyperOffloadInput — 优化器输入结构，包含待优化的计算图、原始执行顺序、Parameter Offload 开关、Activation Offload 开关、以及新节点回调函数。",{"type":18,"tag":32,"props":198,"children":199},{},[200],{"type":24,"value":201},"HyperOffloadPlan — 优化器输出结构，包含优化后的计算图、包含 Offload 节点的新执行顺序、以及 HyperOffloadOperations 操作管理队列。",{"type":18,"tag":32,"props":203,"children":204},{},[205],{"type":24,"value":206},"UserInfo — 封装数据节点与其全部消费者的映射关系，node 字段标识数据生产者，users 字段存储按执行顺序排列的消费者列表。",{"type":18,"tag":32,"props":208,"children":209},{},[210],{"type":24,"value":211},"OffloadInfo — 描述单个数据的 Offload 方案，data_node 标识目标数据，replace_info_list 包含所有需要重写数据引用的节点信息。",{"type":18,"tag":32,"props":213,"children":214},{},[215],{"type":24,"value":216},"HyperOffloadOperations — Offload 操作管理类，维护前置操作队列和每个执行位置对应的操作列表，支持 AddOperationBefore、AddOperationAfter、MoveAhead 等操作。",{"type":18,"tag":19,"props":218,"children":220},{"id":219},"_04-python-使用示例",[221],{"type":24,"value":222},"04 Python 使用示例",{"type":18,"tag":32,"props":224,"children":225},{},[226],{"type":24,"value":227},"Python 层通过 @jit 装饰器的 auto_offload 参数暴露使用接口。Activation Offload 使用 @jit(auto_offload=\"activation\") 启用，适用于网络中存在跨长距离复用的中间激活值场景。",{"type":18,"tag":104,"props":229,"children":231},{"code":230},"from mindspore import jit\n\n@jit(auto_offload=\"activation\")\ndef forward(x):\n    m1 = x / 2\n    m3 = m1 * 2\n    m4 = m3 * 2\n    m5 = m3 + m4\n    return (m5 * 2) - 
m1\n",[232],{"type":18,"tag":109,"props":233,"children":234},{"__ignoreMap":7},[235],{"type":24,"value":230},{"type":18,"tag":32,"props":237,"children":238},{},[239],{"type":24,"value":240},"运行时参数可通过 mindspore.graph.compile_config 模块调整，包括 SELECT_DISTANCE（距离阈值）、SELECT_NUM（Offload 上限）、PREFETCH_DISTANCE（预取距离）、RELEASE_DISTANCE（释放距离）等配置项。",{"type":18,"tag":19,"props":242,"children":244},{"id":243},"_05-总结与适用场景",[245],{"type":24,"value":246},"05 总结与适用场景",{"type":18,"tag":32,"props":248,"children":249},{},[250],{"type":24,"value":251},"HyperOffload 聚焦于 Ascend 设备显存受限场景下的模型高效执行，其基于数据流分析的编译期优化与基于异步事件的运行时协同机制，为大模型训练与内存受限推理提供了切实可行的解决方案。该技术适用于参数量庞大的 Transformer 类模型、长序列处理任务以及显存容量有限的边缘推理场景，核心价值体现在透明集成性、灵活配置性以及异步并行带来的性能收益。后续可探索基于机器学习的自适应策略选择、跨设备分布式 Offload 协同等优化方向。",{"type":18,"tag":32,"props":253,"children":254},{},[255],{"type":24,"value":256},"关于 HyperOffload 的更多信息，请参考：",{"type":18,"tag":32,"props":258,"children":259},{},[260],{"type":18,"tag":261,"props":262,"children":266},"a",{"href":263,"rel":264},"https://atomgit.com/mindspore/mindspore/tree/master/mindspore/ccsrc/utils/hyper_offload",[265],"nofollow",[267],{"type":24,"value":263},{"type":18,"tag":32,"props":269,"children":270},{},[271],{"type":18,"tag":261,"props":272,"children":275},{"href":273,"rel":274},"https://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247634387&idx=1&sn=4f92b52ce05f25ad4604a56aef9e7484&scene=21&poc_token=HDIOhGmjOQivkVwCNZdvzEy_hINwjwgoJuTr1uka",[265],[276],{"type":24,"value":273},{"title":7,"searchDepth":278,"depth":278,"links":279},4,[],"markdown","content:technology-blogs:zh:2026-2-2.md","content","technology-blogs/zh/2026-2-2.md","technology-blogs/zh/2026-2-2","md",1776506119565]