[{"data":1,"prerenderedAt":966},["ShallowReactive",2],{"content-query-dZDb7IihJ7":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":960,"_id":961,"_source":962,"_file":963,"_stem":964,"_extension":965},"/technology-blogs/zh/3795","zh",false,"","MindSpore权重转换全解析：基于Safetensors格式的高效实现","作者：yide12   来源：昇思论坛","2025-07-15","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/18/cae65d3d8b144a849071587eae52c887.png","technology-blogs","开发者说",{"type":15,"children":16,"toc":924},"root",[17,25,31,36,41,50,58,66,71,81,86,111,119,127,136,141,150,160,178,187,200,209,222,231,239,248,257,262,272,281,286,294,302,313,322,348,357,366,379,388,401,410,423,432,441,446,465,474,479,497,505,513,522,527,545,554,577,586,604,612,620,628,634,652,660,668,677,682,700,709,714,732,741,746,764,773,783,796,805,815,833,842,852,870,878,886,891,919],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore权重转换全解析基于safetensors格式的高效实现",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"作者：yide12",{"type":18,"tag":26,"props":32,"children":33},{},[34],{"type":24,"value":35},"来源：昇思论坛",{"type":18,"tag":26,"props":37,"children":38},{},[39],{"type":24,"value":40},"昇思MindSpore2024年技术帖分享大会圆满结束！全年收获80+高质量技术帖， 2025年全新升级，推出“2025年昇思干货小卖部，你投我就收！”，活动继续每月征集技术帖。本期技术文章由社区开发者yide12输出并投稿。如果您对活动感兴趣，欢迎在昇思论坛投稿。",{"type":18,"tag":26,"props":42,"children":43},{},[44],{"type":18,"tag":45,"props":46,"children":47},"strong",{},[48],{"type":24,"value":49},"# 01",{"type":18,"tag":26,"props":51,"children":52},{},[53],{"type":18,"tag":45,"props":54,"children":55},{},[56],{"type":24,"value":57},"MindSpore权重转换基础概念",{"type":18,"tag":26,"props":59,"children":60},{},[61],{"type":18,"tag":45,"props":62,"children":63},{},[64],{"type":24,"value":65},"1.1 
为什么需要权重转换？",{"type":18,"tag":26,"props":67,"children":68},{},[69],{"type":24,"value":70},"在深度学习模型的开发与部署过程中，权重转换是一个关键环节。尤其在分布式训练场景下，模型权重会被切分到多个设备（如GPU或NPU）上进行并行计算。训练完成后，我们通常需要将这些分布式的权重合并为一个完整的模型，或者根据特定的部署需求对权重进行重新组织。",{"type":18,"tag":72,"props":73,"children":75},"h3",{"id":74},"_12-safetensors格式简介",[76],{"type":18,"tag":45,"props":77,"children":78},{},[79],{"type":24,"value":80},"1.2 Safetensors格式简介",{"type":18,"tag":26,"props":82,"children":83},{},[84],{"type":24,"value":85},"Safetensors是一种专为深度学习设计的高性能张量存储格式，具有以下特点：",{"type":18,"tag":87,"props":88,"children":89},"ul",{},[90,96,101,106],{"type":18,"tag":91,"props":92,"children":93},"li",{},[94],{"type":24,"value":95},"安全高效：避免了传统格式（如pickle）可能存在的安全风险，同时提供了高效的读写性能",{"type":18,"tag":91,"props":97,"children":98},{},[99],{"type":24,"value":100},"跨框架兼容：支持在不同深度学习框架间无缝交换张量数据",{"type":18,"tag":91,"props":102,"children":103},{},[104],{"type":24,"value":105},"零拷贝加载：可直接映射到内存，无需额外拷贝操作",{"type":18,"tag":91,"props":107,"children":108},{},[109],{"type":24,"value":110},"元数据支持：能够存储张量的元数据（如形状、数据类型等）",{"type":18,"tag":26,"props":112,"children":113},{},[114],{"type":18,"tag":45,"props":115,"children":116},{},[117],{"type":24,"value":118},"# 02",{"type":18,"tag":26,"props":120,"children":121},{},[122],{"type":18,"tag":45,"props":123,"children":124},{},[125],{"type":24,"value":126},"unified_safetensors接口详解",{"type":18,"tag":72,"props":128,"children":130},{"id":129},"_21-接口功能概述",[131],{"type":18,"tag":45,"props":132,"children":133},{},[134],{"type":24,"value":135},"2.1 接口功能概述",{"type":18,"tag":26,"props":137,"children":138},{},[139],{"type":24,"value":140},"unified_safetensors接口用于将多个分布式保存的safetensors权重文件合并为一个或多个统一的safetensors文件。这个过程通常在分布式训练完成后执行，以便得到一个完整的模型权重。",{"type":18,"tag":72,"props":142,"children":144},{"id":143},"_22-核心参数说明",[145],{"type":18,"tag":45,"props":146,"children":147},{},[148],{"type":24,"value":149},"2.2 
核心参数说明",{"type":18,"tag":151,"props":152,"children":154},"h4",{"id":153},"_221-路径参数",[155],{"type":18,"tag":45,"props":156,"children":157},{},[158],{"type":24,"value":159},"2.2.1 路径参数",{"type":18,"tag":87,"props":161,"children":162},{},[163,168,173],{"type":18,"tag":91,"props":164,"children":165},{},[166],{"type":24,"value":167},"src_dir：源权重保存目录，包含了所有需要合并的safetensors文件",{"type":18,"tag":91,"props":169,"children":170},{},[171],{"type":24,"value":172},"src_strategy_file：源权重切分策略文件，记录了权重在分布式训练时的切分方式",{"type":18,"tag":91,"props":174,"children":175},{},[176],{"type":24,"value":177},"dst_dir：合并后的目标保存目录",{"type":18,"tag":151,"props":179,"children":181},{"id":180},"_222-合并策略参数",[182],{"type":18,"tag":45,"props":183,"children":184},{},[185],{"type":24,"value":186},"2.2.2 合并策略参数",{"type":18,"tag":87,"props":188,"children":189},{},[190,195],{"type":18,"tag":91,"props":191,"children":192},{},[193],{"type":24,"value":194},"merge_with_redundancy：控制合并时是否保留冗余数据。当设置为True时，合并的源权重文件是完整的；设置为False时，会去除冗余信息",{"type":18,"tag":91,"props":196,"children":197},{},[198],{"type":24,"value":199},"file_suffix：指定待合并的源safetensors文件的文件名后缀。如果不指定，将合并源目录下的所有safetensors文件",{"type":18,"tag":151,"props":201,"children":203},{"id":202},"_223-性能优化参数",[204],{"type":18,"tag":45,"props":205,"children":206},{},[207],{"type":24,"value":208},"2.2.3 性能优化参数",{"type":18,"tag":87,"props":210,"children":211},{},[212,217],{"type":18,"tag":91,"props":213,"children":214},{},[215],{"type":24,"value":216},"max_process_num：最大并行进程数，可根据硬件资源调整以提高合并效率",{"type":18,"tag":91,"props":218,"children":219},{},[220],{"type":24,"value":221},"split_dst_file：允许将合并任务切分为多个子任务，支持单机多任务或多机并行处理",{"type":18,"tag":151,"props":223,"children":225},{"id":224},"_224-高级筛选参数",[226],{"type":18,"tag":45,"props":227,"children":228},{},[229],{"type":24,"value":230},"2.2.4 
高级筛选参数",{"type":18,"tag":87,"props":232,"children":233},{},[234],{"type":18,"tag":91,"props":235,"children":236},{},[237],{"type":24,"value":238},"choice_func：一个可调用函数，用于筛选需要合并的参数或修改参数名称。这个函数非常灵活，可以根据自定义规则对权重进行处理",{"type":18,"tag":72,"props":240,"children":242},{"id":241},"_23-使用场景举例",[243],{"type":18,"tag":45,"props":244,"children":245},{},[246],{"type":24,"value":247},"2.3 使用场景举例",{"type":18,"tag":151,"props":249,"children":251},{"id":250},"_231-常规合并场景",[252],{"type":18,"tag":45,"props":253,"children":254},{},[255],{"type":24,"value":256},"2.3.1 常规合并场景",{"type":18,"tag":26,"props":258,"children":259},{},[260],{"type":24,"value":261},"假设你在8卡GPU上完成了分布式训练，每个卡保存了一部分权重。现在需要将这些权重合并为一个完整的模型：",{"type":18,"tag":263,"props":264,"children":266},"pre",{"code":265},"# 合并分布式训练产生的权重\nunified_safetensors(\n    src_dir=\"path/to/distributed_weights\",\n    src_strategy_file=\"path/to/strategy_file.ckpt\",\n    dst_dir=\"path/to/merged_weights\"\n)\n",[267],{"type":18,"tag":268,"props":269,"children":270},"code",{"__ignoreMap":7},[271],{"type":24,"value":265},{"type":18,"tag":151,"props":273,"children":275},{"id":274},"_232-自定义合并场景",[276],{"type":18,"tag":45,"props":277,"children":278},{},[279],{"type":24,"value":280},"2.3.2 自定义合并场景",{"type":18,"tag":26,"props":282,"children":283},{},[284],{"type":24,"value":285},"如果你只需要合并部分权重参数，或者需要修改某些参数的名称，可以使用choice_func参数：",{"type":18,"tag":263,"props":287,"children":289},{"code":288},"# 定义一个筛选函数，只合并名称中包含\"encoder\"的参数\ndef filter_encoder_params(param_name):\n    return \"encoder\" in param_name\nunified_safetensors(\n    src_dir=\"path/to/distributed_weights\",\n    src_strategy_file=\"path/to/strategy_file.ckpt\",\n    dst_dir=\"path/to/merged_weights\",\n    choice_func=filter_encoder_params\n)\n",[290],{"type":18,"tag":268,"props":291,"children":292},{"__ignoreMap":7},[293],{"type":24,"value":288},{"type":18,"tag":26,"props":295,"children":296},{},[297],{"type":18,"tag":45,"props":298,"children":299},{},[300],{"type":24,"value":301},"# 
03",{"type":18,"tag":26,"props":303,"children":304},{},[305],{"type":18,"tag":45,"props":306,"children":307},{},[308],{"type":18,"tag":45,"props":309,"children":310},{},[311],{"type":24,"value":312},"load_distributed_checkpoint接口详解",{"type":18,"tag":72,"props":314,"children":316},{"id":315},"_31-接口功能概述",[317],{"type":18,"tag":45,"props":318,"children":319},{},[320],{"type":24,"value":321},"3.1 接口功能概述",{"type":18,"tag":26,"props":323,"children":324},{},[325,327,339,341,346],{"type":24,"value":326},"load_distributed_checkpoint接口是MindSpore中实现分布式权重加载的核心工具，",{"type":18,"tag":45,"props":328,"children":329},{},[330,332,337],{"type":24,"value":331},"既可以用于分布式推理场景，也能在分布式训练中发挥关键作用。",{"type":18,"tag":45,"props":333,"children":334},{},[335],{"type":24,"value":336},"在训练场景下，该接口主要用于",{"type":24,"value":338},"恢复中断的训练任务",{"type":24,"value":340},"或",{"type":18,"tag":45,"props":342,"children":343},{},[344],{"type":24,"value":345},"在多机多卡环境中同步权重",{"type":24,"value":347},"，而推理场景则侧重根据部署策略加载对应权重分片。接口通过智能解析训练/推理策略，自动完成权重的切分、映射与加载，大幅降低分布式场景下的权重管理复杂度。",{"type":18,"tag":72,"props":349,"children":351},{"id":350},"_32-核心参数在训练场景中的应用",[352],{"type":18,"tag":45,"props":353,"children":354},{},[355],{"type":24,"value":356},"3.2 核心参数在训练场景中的应用",{"type":18,"tag":151,"props":358,"children":360},{"id":359},"_321-训练策略相关参数",[361],{"type":18,"tag":45,"props":362,"children":363},{},[364],{"type":24,"value":365},"3.2.1 训练策略相关参数",{"type":18,"tag":87,"props":367,"children":368},{},[369,374],{"type":18,"tag":91,"props":370,"children":371},{},[372],{"type":24,"value":373},"train_strategy_filename：训练场景的核心参数，指向记录训练时并行策略的proto文件。该文件包含了模型在训练阶段的张量切分方式、设备映射关系等关键信息，加载时接口会根据此策略自动匹配权重分片。",{"type":18,"tag":91,"props":375,"children":376},{},[377],{"type":24,"value":378},"predict_strategy：在训练场景中也可使用，当需要调整训练策略（如改变卡数、并行模式）时，可通过此参数指定新策略，接口会自动完成权重的重分布。",{"type":18,"tag":151,"props":380,"children":382},{"id":381},"_322-训练恢复场景参数",[383],{"type":18,"tag":45,"props":384,"children":385},{},[386],{"type":24,"value":387},"3.2.2 
训练恢复场景参数",{"type":18,"tag":87,"props":389,"children":390},{},[391,396],{"type":18,"tag":91,"props":392,"children":393},{},[394],{"type":24,"value":395},"checkpoint_filenames：在训练恢复时，需按rank顺序传入各卡的检查点文件，接口会根据当前设备角色加载对应的权重分片。",{"type":18,"tag":91,"props":397,"children":398},{},[399],{"type":24,"value":400},"strict_load：建议设置为False，允许训练过程中网络结构微调（如添加正则化层），接口会智能匹配可加载的参数。",{"type":18,"tag":151,"props":402,"children":404},{"id":403},"_323-分布式训练同步参数",[405],{"type":18,"tag":45,"props":406,"children":407},{},[408],{"type":24,"value":409},"3.2.3 分布式训练同步参数",{"type":18,"tag":87,"props":411,"children":412},{},[413,418],{"type":18,"tag":91,"props":414,"children":415},{},[416],{"type":24,"value":417},"rank_id：在多机训练场景中，指定当前设备的逻辑序号，确保各机加载对应分片的权重，避免数据混乱。",{"type":18,"tag":91,"props":419,"children":420},{},[421],{"type":24,"value":422},"max_process_num：训练场景下可适当调大此参数（如根据CPU核数调整），提升权重加载的并行效率，减少训练恢复的等待时间。",{"type":18,"tag":72,"props":424,"children":426},{"id":425},"_33-训练场景使用示例",[427],{"type":18,"tag":45,"props":428,"children":429},{},[430],{"type":24,"value":431},"3.3 训练场景使用示例",{"type":18,"tag":151,"props":433,"children":435},{"id":434},"_331-分布式训练恢复",[436],{"type":18,"tag":45,"props":437,"children":438},{},[439],{"type":24,"value":440},"3.3.1 
分布式训练恢复",{"type":18,"tag":26,"props":442,"children":443},{},[444],{"type":24,"value":445},"假设在8卡训练过程中任务中断，需从检查点恢复训练：",{"type":18,"tag":447,"props":448,"children":449},"ol",{},[450,455,460],{"type":18,"tag":91,"props":451,"children":452},{},[453],{"type":24,"value":454},"准备各卡的检查点文件（如rank0.ckpt~rank7.ckpt）和训练策略文件（train_strategy.ckpt）。",{"type":18,"tag":91,"props":456,"children":457},{},[458],{"type":24,"value":459},"调用接口时指定train_strategy_filename，接口会根据训练策略自动加载对应权重分片。",{"type":18,"tag":91,"props":461,"children":462},{},[463],{"type":24,"value":464},"恢复训练后，优化器状态、训练轮次等信息也会同步加载，确保训练过程连续。",{"type":18,"tag":151,"props":466,"children":468},{"id":467},"_332-训练策略调整",[469],{"type":18,"tag":45,"props":470,"children":471},{},[472],{"type":24,"value":473},"3.3.2 训练策略调整",{"type":18,"tag":26,"props":475,"children":476},{},[477],{"type":24,"value":478},"若需要从8卡训练调整为4卡继续训练：",{"type":18,"tag":447,"props":480,"children":481},{},[482,487,492],{"type":18,"tag":91,"props":483,"children":484},{},[485],{"type":24,"value":486},"生成新的推理策略文件（predict_strategy.ckpt），定义4卡环境下的权重分布。",{"type":18,"tag":91,"props":488,"children":489},{},[490],{"type":24,"value":491},"通过predict_strategy参数传入新策略，接口会自动将8卡的权重切分重新映射到4卡。",{"type":18,"tag":91,"props":493,"children":494},{},[495],{"type":24,"value":496},"此过程无需手动处理权重分片，接口会根据策略智能完成数据重分布，保证训练连续性。",{"type":18,"tag":26,"props":498,"children":499},{},[500],{"type":18,"tag":45,"props":501,"children":502},{},[503],{"type":24,"value":504},"# 04",{"type":18,"tag":26,"props":506,"children":507},{},[508],{"type":18,"tag":45,"props":509,"children":510},{},[511],{"type":24,"value":512},"完整工作流程示例",{"type":18,"tag":72,"props":514,"children":516},{"id":515},"_41-分布式训练后合并权重",[517],{"type":18,"tag":45,"props":518,"children":519},{},[520],{"type":24,"value":521},"4.1 
分布式训练后合并权重",{"type":18,"tag":26,"props":523,"children":524},{},[525],{"type":24,"value":526},"假设你已经完成了分布式训练，现在需要合并权重：",{"type":18,"tag":447,"props":528,"children":529},{},[530,535,540],{"type":18,"tag":91,"props":531,"children":532},{},[533],{"type":24,"value":534},"准备好分布式训练保存的权重文件和策略文件",{"type":18,"tag":91,"props":536,"children":537},{},[538],{"type":24,"value":539},"调用unified_safetensors接口合并权重",{"type":18,"tag":91,"props":541,"children":542},{},[543],{"type":24,"value":544},"检查合并结果",{"type":18,"tag":72,"props":546,"children":548},{"id":547},"_42-加载权重进行分布式推理",[549],{"type":18,"tag":45,"props":550,"children":551},{},[552],{"type":24,"value":553},"4.2 加载权重进行分布式推理",{"type":18,"tag":447,"props":555,"children":556},{},[557,562,567,572],{"type":18,"tag":91,"props":558,"children":559},{},[560],{"type":24,"value":561},"定义推理网络结构",{"type":18,"tag":91,"props":563,"children":564},{},[565],{"type":24,"value":566},"准备预测策略文件",{"type":18,"tag":91,"props":568,"children":569},{},[570],{"type":24,"value":571},"调用load_distributed_checkpoint接口加载权重",{"type":18,"tag":91,"props":573,"children":574},{},[575],{"type":24,"value":576},"执行分布式推理",{"type":18,"tag":72,"props":578,"children":580},{"id":579},"_43-权重格式转换与安全加载",[581],{"type":18,"tag":45,"props":582,"children":583},{},[584],{"type":24,"value":585},"4.3 权重格式转换与安全加载",{"type":18,"tag":447,"props":587,"children":588},{},[589,594,599],{"type":18,"tag":91,"props":590,"children":591},{},[592],{"type":24,"value":593},"将分布式ckpt权重转换为safetensors格式",{"type":18,"tag":91,"props":595,"children":596},{},[597],{"type":24,"value":598},"对敏感模型权重进行加密保存",{"type":18,"tag":91,"props":600,"children":601},{},[602],{"type":24,"value":603},"在部署环境中解密并加载权重",{"type":18,"tag":26,"props":605,"children":606},{},[607],{"type":18,"tag":45,"props":608,"children":609},{},[610],{"type":24,"value":611},"# 
05",{"type":18,"tag":26,"props":613,"children":614},{},[615],{"type":18,"tag":45,"props":616,"children":617},{},[618],{"type":24,"value":619},"训练与推理场景的核心区别",{"type":18,"tag":26,"props":621,"children":622},{},[623],{"type":18,"tag":624,"props":625,"children":627},"img",{"alt":7,"src":626},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/18/7a98b76ed1f54381aa4e162ec59c3adc.png",[],{"type":18,"tag":72,"props":629,"children":631},{"id":630},"关键差异说明",[632],{"type":24,"value":633},"关键差异说明：",{"type":18,"tag":87,"props":635,"children":636},{},[637,642,647],{"type":18,"tag":91,"props":638,"children":639},{},[640],{"type":24,"value":641},"策略文件：训练使用train_strategy（含优化器状态、训练超参），推理使用predict_strategy（侧重模型结构与并行部署）",{"type":18,"tag":91,"props":643,"children":644},{},[645],{"type":24,"value":646},"权重完整性：训练需加载优化器、调度器等完整状态，推理仅需模型权重",{"type":18,"tag":91,"props":648,"children":649},{},[650],{"type":24,"value":651},"设备同步：训练场景需确保各卡权重分片与策略严格一致，推理更侧重单卡/多卡的高效执行",{"type":18,"tag":26,"props":653,"children":654},{},[655],{"type":18,"tag":45,"props":656,"children":657},{},[658],{"type":24,"value":659},"# 06",{"type":18,"tag":26,"props":661,"children":662},{},[663],{"type":18,"tag":45,"props":664,"children":665},{},[666],{"type":24,"value":667},"常见问题与解决方案",{"type":18,"tag":72,"props":669,"children":671},{"id":670},"_61-参数名称不匹配",[672],{"type":18,"tag":45,"props":673,"children":674},{},[675],{"type":24,"value":676},"6.1 
参数名称不匹配",{"type":18,"tag":26,"props":678,"children":679},{},[680],{"type":24,"value":681},"当遇到参数名称不匹配的问题时，可以：",{"type":18,"tag":87,"props":683,"children":684},{},[685,690,695],{"type":18,"tag":91,"props":686,"children":687},{},[688],{"type":24,"value":689},"使用strict_load=False允许非严格匹配",{"type":18,"tag":91,"props":691,"children":692},{},[693],{"type":24,"value":694},"通过name_map参数提供名称映射关系",{"type":18,"tag":91,"props":696,"children":697},{},[698],{"type":24,"value":699},"使用choice_func在合并时修改参数名称",{"type":18,"tag":72,"props":701,"children":703},{"id":702},"_62-内存不足",[704],{"type":18,"tag":45,"props":705,"children":706},{},[707],{"type":24,"value":708},"6.2 内存不足",{"type":18,"tag":26,"props":710,"children":711},{},[712],{"type":24,"value":713},"处理超大规模模型时，可能会遇到内存不足的问题：",{"type":18,"tag":87,"props":715,"children":716},{},[717,722,727],{"type":18,"tag":91,"props":718,"children":719},{},[720],{"type":24,"value":721},"使用split_dst_file参数将任务切分为多个子任务",{"type":18,"tag":91,"props":723,"children":724},{},[725],{"type":24,"value":726},"调整max_process_num控制并行度",{"type":18,"tag":91,"props":728,"children":729},{},[730],{"type":24,"value":731},"考虑使用内存映射技术或分批处理",{"type":18,"tag":72,"props":733,"children":735},{"id":734},"_63-性能优化",[736],{"type":18,"tag":45,"props":737,"children":738},{},[739],{"type":24,"value":740},"6.3 性能优化",{"type":18,"tag":26,"props":742,"children":743},{},[744],{"type":24,"value":745},"为提高权重转换和加载的效率，可以：",{"type":18,"tag":87,"props":747,"children":748},{},[749,754,759],{"type":18,"tag":91,"props":750,"children":751},{},[752],{"type":24,"value":753},"根据硬件资源调整max_process_num",{"type":18,"tag":91,"props":755,"children":756},{},[757],{"type":24,"value":758},"使用高速存储设备（如SSD）存放权重文件",{"type":18,"tag":91,"props":760,"children":761},{},[762],{"type":24,"value":763},"利用多机并行处理大规模任务",{"type":18,"tag":72,"props":765,"children":767},{"id":766},"_64-训练策略与当前环境不匹配",[768],{"type":18,"tag":45,"props":769,"children":770},{},[771],{"type":24,"value":772},"6.4 
训练策略与当前环境不匹配",{"type":18,"tag":26,"props":774,"children":775},{},[776,778],{"type":24,"value":777},"问题现象：加载时提示策略中的设备数与当前环境不一致 ",{"type":18,"tag":45,"props":779,"children":780},{},[781],{"type":24,"value":782},"解决方案：",{"type":18,"tag":87,"props":784,"children":785},{},[786,791],{"type":18,"tag":91,"props":787,"children":788},{},[789],{"type":24,"value":790},"若设备数减少：通过predict_strategy指定新策略，接口自动合并权重分片",{"type":18,"tag":91,"props":792,"children":793},{},[794],{"type":24,"value":795},"若设备数增加：需重新训练或使用模型并行策略拆分权重，可结合unified_safetensors重新合并",{"type":18,"tag":72,"props":797,"children":799},{"id":798},"_65-优化器状态加载失败",[800],{"type":18,"tag":45,"props":801,"children":802},{},[803],{"type":24,"value":804},"6.5 优化器状态加载失败",{"type":18,"tag":26,"props":806,"children":807},{},[808,810],{"type":24,"value":809},"问题描述：训练恢复时优化器参数加载报错 ",{"type":18,"tag":45,"props":811,"children":812},{},[813],{"type":24,"value":814},"解决方法：",{"type":18,"tag":87,"props":816,"children":817},{},[818,823,828],{"type":18,"tag":91,"props":819,"children":820},{},[821],{"type":24,"value":822},"确保检查点文件包含优化器状态（如使用mindspore.save_checkpoint保存完整状态）",{"type":18,"tag":91,"props":824,"children":825},{},[826],{"type":24,"value":827},"检查优化器定义是否与训练时一致（如学习率调度器、权重衰减等参数）",{"type":18,"tag":91,"props":829,"children":830},{},[831],{"type":24,"value":832},"设置strict_load=False允许优化器参数的兼容加载",{"type":18,"tag":72,"props":834,"children":836},{"id":835},"_66-多机训练权重不一致",[837],{"type":18,"tag":45,"props":838,"children":839},{},[840],{"type":24,"value":841},"6.6 多机训练权重不一致",{"type":18,"tag":26,"props":843,"children":844},{},[845,847],{"type":24,"value":846},"问题原因：各机加载的权重分片错误或策略不同步 
",{"type":18,"tag":45,"props":848,"children":849},{},[850],{"type":24,"value":851},"预防措施：",{"type":18,"tag":87,"props":853,"children":854},{},[855,860,865],{"type":18,"tag":91,"props":856,"children":857},{},[858],{"type":24,"value":859},"统一使用相同的train_strategy_filename和检查点文件列表",{"type":18,"tag":91,"props":861,"children":862},{},[863],{"type":24,"value":864},"确保各机的rank_id与检查点文件的rank顺序严格对应",{"type":18,"tag":91,"props":866,"children":867},{},[868],{"type":24,"value":869},"加载完成后可通过简单前向传播验证各机输出一致性",{"type":18,"tag":26,"props":871,"children":872},{},[873],{"type":18,"tag":45,"props":874,"children":875},{},[876],{"type":24,"value":877},"# 07",{"type":18,"tag":26,"props":879,"children":880},{},[881],{"type":18,"tag":45,"props":882,"children":883},{},[884],{"type":24,"value":885},"总结",{"type":18,"tag":26,"props":887,"children":888},{},[889],{"type":24,"value":890},"MindSpore提供的unified_safetensors和load_distributed_checkpoint接口为分布式训练和推理提供了强大的权重管理能力。通过合理使用这两个接口，你可以：",{"type":18,"tag":87,"props":892,"children":893},{},[894,899,904,909,914],{"type":18,"tag":91,"props":895,"children":896},{},[897],{"type":24,"value":898},"高效合并分布式训练产生的权重",{"type":18,"tag":91,"props":900,"children":901},{},[902],{"type":24,"value":903},"灵活加载权重用于分布式推理和训练",{"type":18,"tag":91,"props":905,"children":906},{},[907],{"type":24,"value":908},"实现不同格式间的权重转换",{"type":18,"tag":91,"props":910,"children":911},{},[912],{"type":24,"value":913},"保障敏感模型的安全性",{"type":18,"tag":91,"props":915,"children":916},{},[917],{"type":24,"value":918},"无缝恢复训练任务，灵活调整并行策略",{"type":18,"tag":26,"props":920,"children":921},{},[922],{"type":24,"value":923},"掌握这些技术，将帮助你更轻松地应对深度学习模型开发和部署中的各种挑战。",{"title":7,"searchDepth":925,"depth":925,"links":926},4,[927,929,930,936,940,941,946,950,951,952,953,954,955,956,957,958,959],{"id":74,"depth":928,"text":80},3,{"id":129,"depth":928,"text":135},{"id":143,"depth":928,"text":149,"children":931},[932,933,934,935],{"id":153,"depth":925,"text":159},{"id":180,"depth":925,"text":186},{"id":202,"depth":925
,"text":208},{"id":224,"depth":925,"text":230},{"id":241,"depth":928,"text":247,"children":937},[938,939],{"id":250,"depth":925,"text":256},{"id":274,"depth":925,"text":280},{"id":315,"depth":928,"text":321},{"id":350,"depth":928,"text":356,"children":942},[943,944,945],{"id":359,"depth":925,"text":365},{"id":381,"depth":925,"text":387},{"id":403,"depth":925,"text":409},{"id":425,"depth":928,"text":431,"children":947},[948,949],{"id":434,"depth":925,"text":440},{"id":467,"depth":925,"text":473},{"id":515,"depth":928,"text":521},{"id":547,"depth":928,"text":553},{"id":579,"depth":928,"text":585},{"id":630,"depth":928,"text":633},{"id":670,"depth":928,"text":676},{"id":702,"depth":928,"text":708},{"id":734,"depth":928,"text":740},{"id":766,"depth":928,"text":772},{"id":798,"depth":928,"text":804},{"id":835,"depth":928,"text":841},"markdown","content:technology-blogs:zh:3795.md","content","technology-blogs/zh/3795.md","technology-blogs/zh/3795","md",1776506135411]