[{"data":1,"prerenderedAt":1340},["ShallowReactive",2],{"content-query-eHKg6EmoDt":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":1334,"_id":1335,"_source":1336,"_file":1337,"_stem":1338,"_extension":1339},"/technology-blogs/zh/3778","zh",false,"","MindSpore权重转换全解析：基于Safetensors格式的高效实现","一、MindSpore权重转换基础概念","2025-07-01","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/04/6e3468e1444e4ab88a8dd957a0d1a833.png","technology-blogs","开发者说",{"type":15,"children":16,"toc":1283},"root",[17,25,30,43,55,61,72,77,123,134,145,157,168,180,213,224,247,258,281,292,305,316,327,332,341,352,357,366,377,388,420,431,442,472,483,506,517,539,550,561,566,592,603,608,633,644,655,660,685,696,725,736,754,765,770,775,780,785,790,798,803,807,812,817,825,830,835,840,845,856,903,914,925,930,969,980,985,1016,1027,1032,1055,1066,1082,1109,1120,1136,1161,1172,1188,1220,1231,1250,1278],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore权重转换全解析基于safetensors格式的高效实现",[23],{"type":24,"value":8},"text",{"type":18,"tag":19,"props":26,"children":28},{"id":27},"mindspore权重转换全解析基于safetensors格式的高效实现-1",[29],{"type":24,"value":8},{"type":18,"tag":31,"props":32,"children":34},"h2",{"id":33},"一mindspore权重转换基础概念",[35,42],{"type":18,"tag":36,"props":37,"children":41},"a",{"href":38,"rel":39},"https://discuss.mindspore.cn/t/topic/146#p-205-mindspore-2",[40],"nofollow",[],{"type":24,"value":9},{"type":18,"tag":44,"props":45,"children":47},"h3",{"id":46},"_11-为什么需要权重转换",[48,53],{"type":18,"tag":36,"props":49,"children":52},{"href":50,"rel":51},"https://discuss.mindspore.cn/t/topic/146#p-205-h-11-3",[40],[],{"type":24,"value":54},"1.1 为什么需要权重转换？",{"type":18,"tag":56,"props":57,"children":58},"p",{},[59],{"type":24,"value":60},"在深度学习模型的开发与部署过程中，权重转换是一个关键环节。尤其在分布式训练场景下，模型权重会被切分到多个设备（如GPU或NPU）上进行并行计算。训练完成后，我们通常需要将这些分布式的权重合并为一个完整的模型，或者根据特定的部署需求对权重进行重新组织。",{"type":18,"tag":44,"props":62,"children":64},{"id":63},"_12-safetensors格式简介",[65,70],{"type":18,"tag":36,"props":66,"children":69},{"href":67,"rel":68},"https://discuss.mindspore.cn/t/topic/146#p-205-h-12-safetensors-4",[40],[],{"type":24,"value":71},"1.2 Safetensors格式简介",{"type":18,"tag":56,"props":73,"children":74},{},[75],{"type":24,"value":76},"Safetensors是一种专为深度学习设计的高性能张量存储格式，具有以下特点：",{"type":18,"tag":78,"props":79,"children":80},"ul",{},[81,93,103,113],{"type":18,"tag":82,"props":83,"children":84},"li",{},[85,91],{"type":18,"tag":86,"props":87,"children":88},"strong",{},[89],{"type":24,"value":90},"安全高效",{"type":24,"value":92},"：避免了传统格式（如pickle）可能存在的安全风险，同时提供了高效的读写性能",{"type":18,"tag":82,"props":94,"children":95},{},[96,101],{"type":18,"tag":86,"props":97,"children":98},{},[99],{"type":24,"value":100},"跨框架兼容",{"type":24,"value":102},"：支持在不同深度学习框架间无缝交换张量数据",{"type":18,"tag":82,"props":104,"children":105},{},[106,111],{"type":18,"tag":86,"props":107,"children":108},{},[109],{"type":24,"value":110},"零拷贝加载",{"type":24,"value":112},"：可直接映射到内存，无需额外拷贝操作",{"type":18,"tag":82,"props":114,"children":115},{},[116,121],{"type":18,"tag":86,"props":117,"children":118},{},[119],{"type":24,"value":120},"元数据支持",{"type":24,"value":122},"：能够存储张量的元数据（如形状、数据类型等）",{"type":18,"tag":31,"props":124,"children":126},{"id":125},"二unified_safetensors接口详解",[127,132],{"type":18,"tag":36,"props":128,"children":131},{"href":129,"rel":130},"https://discuss.mindspore.cn/t/topic/146#p-205-unified_safetensors-5",[40],[],{"type":24,"value":133},"二、unified_safetensors接口详解",{"type":18,"tag":44,"props":135,"children":137},{"id":136},"_21-接口功能概述",[138,143],{"type":18,"tag":36,"props":139,"children":142},{"href":140,"rel":141},"https://discuss.mindspore.cn/t/topic/146#p-205-h-21-6",[40],[],{"type":24,"value":144},"2.1 接口功能概述",{"type":18,"tag":56,"props":146,"children":147},{},[148,155],{"type":18,"tag":149,"props":150,"children":152},"code",{"className":151},[],[153],{"type":24,"value":154},"unified_safetensors",{"type":24,"value":156},"接口用于将多个分布式保存的safetensors权重文件合并为一个或多个统一的safetensors文件。这个过程通常在分布式训练完成后执行，以便得到一个完整的模型权重。",{"type":18,"tag":44,"props":158,"children":160},{"id":159},"_22-核心参数说明",[161,166],{"type":18,"tag":36,"props":162,"children":165},{"href":163,"rel":164},"https://discuss.mindspore.cn/t/topic/146#p-205-h-22-7",[40],[],{"type":24,"value":167},"2.2 核心参数说明",{"type":18,"tag":169,"props":170,"children":172},"h4",{"id":171},"_221-路径参数",[173,178],{"type":18,"tag":36,"props":174,"children":177},{"href":175,"rel":176},"https://discuss.mindspore.cn/t/topic/146#p-205-h-221-8",[40],[],{"type":24,"value":179},"2.2.1 路径参数",{"type":18,"tag":78,"props":181,"children":182},{},[183,193,203],{"type":18,"tag":82,"props":184,"children":185},{},[186,191],{"type":18,"tag":86,"props":187,"children":188},{},[189],{"type":24,"value":190},"src_dir",{"type":24,"value":192},"：源权重保存目录，包含了所有需要合并的safetensors文件",{"type":18,"tag":82,"props":194,"children":195},{},[196,201],{"type":18,"tag":86,"props":197,"children":198},{},[199],{"type":24,"value":200},"src_strategy_file",{"type":24,"value":202},"：源权重切分策略文件，记录了权重在分布式训练时的切分方式",{"type":18,"tag":82,"props":204,"children":205},{},[206,211],{"type":18,"tag":86,"props":207,"children":208},{},[209],{"type":24,"value":210},"dst_dir",{"type":24,"value":212},"：合并后的目标保存目录",{"type":18,"tag":169,"props":214,"children":216},{"id":215},"_222-合并策略参数",[217,222],{"type":18,"tag":36,"props":218,"children":221},{"href":219,"rel":220},"https://discuss.mindspore.cn/t/topic/146#p-205-h-222-9",[40],[],{"type":24,"value":223},"2.2.2 合并策略参数",{"type":18,"tag":78,"props":225,"children":226},{},[227,237],{"type":18,"tag":82,"props":228,"children":229},{},[230,235],{"type":18,"tag":86,"props":231,"children":232},{},[233],{"type":24,"value":234},"merge_with_redundancy",{"type":24,"value":236},"：控制合并时是否保留冗余数据。当设置为True时，合并的源权重文件是完整的；设置为False时，会去除冗余信息",{"type":18,"tag":82,"props":238,"children":239},{},[240,245],{"type":18,"tag":86,"props":241,"children":242},{},[243],{"type":24,"value":244},"file_suffix",{"type":24,"value":246},"：指定合并后safetensors文件的后缀名。如果不指定，将合并源目录下的所有safetensors文件",{"type":18,"tag":169,"props":248,"children":250},{"id":249},"_223-性能优化参数",[251,256],{"type":18,"tag":36,"props":252,"children":255},{"href":253,"rel":254},"https://discuss.mindspore.cn/t/topic/146#p-205-h-223-10",[40],[],{"type":24,"value":257},"2.2.3 性能优化参数",{"type":18,"tag":78,"props":259,"children":260},{},[261,271],{"type":18,"tag":82,"props":262,"children":263},{},[264,269],{"type":18,"tag":86,"props":265,"children":266},{},[267],{"type":24,"value":268},"max_process_num",{"type":24,"value":270},"：最大并行进程数，可根据硬件资源调整以提高合并效率",{"type":18,"tag":82,"props":272,"children":273},{},[274,279],{"type":18,"tag":86,"props":275,"children":276},{},[277],{"type":24,"value":278},"split_dst_file",{"type":24,"value":280},"：允许将合并任务切分为多个子任务，支持单机多任务或多机并行处理",{"type":18,"tag":169,"props":282,"children":284},{"id":283},"_224-高级筛选参数",[285,290],{"type":18,"tag":36,"props":286,"children":289},{"href":287,"rel":288},"https://discuss.mindspore.cn/t/topic/146#p-205-h-224-11",[40],[],{"type":24,"value":291},"2.2.4 高级筛选参数",{"type":18,"tag":78,"props":293,"children":294},{},[295],{"type":18,"tag":82,"props":296,"children":297},{},[298,303],{"type":18,"tag":86,"props":299,"children":300},{},[301],{"type":24,"value":302},"choice_func",{"type":24,"value":304},"：一个可调用函数，用于筛选需要合并的参数或修改参数名称。这个函数非常灵活，可以根据自定义规则对权重进行处理",{"type":18,"tag":44,"props":306,"children":308},{"id":307},"_23-使用场景举例",[309,314],{"type":18,"tag":36,"props":310,"children":313},{"href":311,"rel":312},"https://discuss.mindspore.cn/t/topic/146#p-205-h-23-12",[40],[],{"type":24,"value":315},"2.3 使用场景举例",{"type":18,"tag":169,"props":317,"children":319},{"id":318},"_231-常规合并场景",[320,325],{"type":18,"tag":36,"props":321,"children":324},{"href":322,"rel":323},"https://discuss.mindspore.cn/t/topic/146#p-205-h-231-13",[40],[],{"type":24,"value":326},"2.3.1 常规合并场景",{"type":18,"tag":56,"props":328,"children":329},{},[330],{"type":24,"value":331},"假设你在8卡GPU上完成了分布式训练，每个卡保存了一部分权重。现在需要将这些权重合并为一个完整的模型：",{"type":18,"tag":56,"props":333,"children":334},{},[335],{"type":18,"tag":149,"props":336,"children":338},{"className":337},[],[339],{"type":24,"value":340},"# 合并分布式训练产生的权重 unified_safetensors(     src_dir=\"path/to/distributed_weights\",     src_strategy_file=\"path/to/strategy_file.ckpt\",     dst_dir=\"path/to/merged_weights\" )",{"type":18,"tag":169,"props":342,"children":344},{"id":343},"_232-自定义合并场景",[345,350],{"type":18,"tag":36,"props":346,"children":349},{"href":347,"rel":348},"https://discuss.mindspore.cn/t/topic/146#p-205-h-232-14",[40],[],{"type":24,"value":351},"2.3.2 自定义合并场景",{"type":18,"tag":56,"props":353,"children":354},{},[355],{"type":24,"value":356},"如果你只需要合并部分权重参数，或者需要修改某些参数的名称，可以使用choice_func参数：",{"type":18,"tag":56,"props":358,"children":359},{},[360],{"type":18,"tag":149,"props":361,"children":363},{"className":362},[],[364],{"type":24,"value":365},"# 定义一个筛选函数，只合并名称中包含\"encoder\"的参数 def filter_encoder_params(param_name):     return \"encoder\" in param_name  unified_safetensors(     src_dir=\"path/to/distributed_weights\",     src_strategy_file=\"path/to/strategy_file.ckpt\",     dst_dir=\"path/to/merged_weights\",     choice_func=filter_encoder_params )",{"type":18,"tag":31,"props":367,"children":369},{"id":368},"三load_distributed_checkpoint接口详解",[370,375],{"type":18,"tag":36,"props":371,"children":374},{"href":372,"rel":373},"https://discuss.mindspore.cn/t/topic/146#p-205-load_distributed_checkpoint-15",[40],[],{"type":24,"value":376},"三、load_distributed_checkpoint接口详解",{"type":18,"tag":44,"props":378,"children":380},{"id":379},"_31-接口功能概述",[381,386],{"type":18,"tag":36,"props":382,"children":385},{"href":383,"rel":384},"https://discuss.mindspore.cn/t/topic/146#p-205-h-31-16",[40],[],{"type":24,"value":387},"3.1 接口功能概述",{"type":18,"tag":56,"props":389,"children":390},{},[391,397,399,404,406,411,413,418],{"type":18,"tag":149,"props":392,"children":394},{"className":393},[],[395],{"type":24,"value":396},"load_distributed_checkpoint",{"type":24,"value":398},"接口是MindSpore中实现分布式权重加载的核心工具，",{"type":18,"tag":86,"props":400,"children":401},{},[402],{"type":24,"value":403},"既可以用于分布式推理场景，也能在分布式训练中发挥关键作用",{"type":24,"value":405},"。在训练场景下，该接口主要用于",{"type":18,"tag":86,"props":407,"children":408},{},[409],{"type":24,"value":410},"恢复中断的训练任务",{"type":24,"value":412},"或",{"type":18,"tag":86,"props":414,"children":415},{},[416],{"type":24,"value":417},"在多机多卡环境中同步权重",{"type":24,"value":419},"，而推理场景则侧重根据部署策略加载对应权重分片。接口通过智能解析训练/推理策略，自动完成权重的切分、映射与加载，大幅降低分布式场景下的权重管理复杂度。",{"type":18,"tag":44,"props":421,"children":423},{"id":422},"_32-核心参数在训练场景中的应用",[424,429],{"type":18,"tag":36,"props":425,"children":428},{"href":426,"rel":427},"https://discuss.mindspore.cn/t/topic/146#p-205-h-32-17",[40],[],{"type":24,"value":430},"3.2 核心参数在训练场景中的应用",{"type":18,"tag":169,"props":432,"children":434},{"id":433},"_321-训练策略相关参数",[435,440],{"type":18,"tag":36,"props":436,"children":439},{"href":437,"rel":438},"https://discuss.mindspore.cn/t/topic/146#p-205-h-321-18",[40],[],{"type":24,"value":441},"3.2.1 训练策略相关参数",{"type":18,"tag":78,"props":443,"children":444},{},[445,462],{"type":18,"tag":82,"props":446,"children":447},{},[448,453,455,460],{"type":18,"tag":86,"props":449,"children":450},{},[451],{"type":24,"value":452},"train_strategy_filename",{"type":24,"value":454},"：",{"type":18,"tag":86,"props":456,"children":457},{},[458],{"type":24,"value":459},"训练场景的核心参数",{"type":24,"value":461},"，指向记录训练时并行策略的proto文件。该文件包含了模型在训练阶段的张量切分方式、设备映射关系等关键信息，加载时接口会根据此策略自动匹配权重分片。",{"type":18,"tag":82,"props":463,"children":464},{},[465,470],{"type":18,"tag":86,"props":466,"children":467},{},[468],{"type":24,"value":469},"predict_strategy",{"type":24,"value":471},"：在训练场景中也可使用，当需要调整训练策略（如改变卡数、并行模式）时，可通过此参数指定新策略，接口会自动完成权重的重分布。",{"type":18,"tag":169,"props":473,"children":475},{"id":474},"_322-训练恢复场景参数",[476,481],{"type":18,"tag":36,"props":477,"children":480},{"href":478,"rel":479},"https://discuss.mindspore.cn/t/topic/146#p-205-h-322-19",[40],[],{"type":24,"value":482},"3.2.2 训练恢复场景参数",{"type":18,"tag":78,"props":484,"children":485},{},[486,496],{"type":18,"tag":82,"props":487,"children":488},{},[489,494],{"type":18,"tag":86,"props":490,"children":491},{},[492],{"type":24,"value":493},"checkpoint_filenames",{"type":24,"value":495},"：在训练恢复时，需按rank顺序传入各卡的检查点文件，接口会根据当前设备角色加载对应的权重分片。",{"type":18,"tag":82,"props":497,"children":498},{},[499,504],{"type":18,"tag":86,"props":500,"children":501},{},[502],{"type":24,"value":503},"strict_load",{"type":24,"value":505},"：建议设置为False，允许训练过程中网络结构微调（如添加正则化层），接口会智能匹配可加载的参数。",{"type":18,"tag":169,"props":507,"children":509},{"id":508},"_323-分布式训练同步参数",[510,515],{"type":18,"tag":36,"props":511,"children":514},{"href":512,"rel":513},"https://discuss.mindspore.cn/t/topic/146#p-205-h-323-20",[40],[],{"type":24,"value":516},"3.2.3 分布式训练同步参数",{"type":18,"tag":78,"props":518,"children":519},{},[520,530],{"type":18,"tag":82,"props":521,"children":522},{},[523,528],{"type":18,"tag":86,"props":524,"children":525},{},[526],{"type":24,"value":527},"rank_id",{"type":24,"value":529},"：在多机训练场景中，指定当前设备的逻辑序号，确保各机加载对应分片的权重，避免数据混乱。",{"type":18,"tag":82,"props":531,"children":532},{},[533,537],{"type":18,"tag":86,"props":534,"children":535},{},[536],{"type":24,"value":268},{"type":24,"value":538},"：训练场景下可适当调大此参数（如根据CPU核数调整），提升权重加载的并行效率，减少训练恢复的等待时间。",{"type":18,"tag":44,"props":540,"children":542},{"id":541},"_33-训练场景使用示例",[543,548],{"type":18,"tag":36,"props":544,"children":547},{"href":545,"rel":546},"https://discuss.mindspore.cn/t/topic/146#p-205-h-33-21",[40],[],{"type":24,"value":549},"3.3 训练场景使用示例",{"type":18,"tag":169,"props":551,"children":553},{"id":552},"_331-分布式训练恢复",[554,559],{"type":18,"tag":36,"props":555,"children":558},{"href":556,"rel":557},"https://discuss.mindspore.cn/t/topic/146#p-205-h-331-22",[40],[],{"type":24,"value":560},"3.3.1 分布式训练恢复",{"type":18,"tag":56,"props":562,"children":563},{},[564],{"type":24,"value":565},"假设在8卡训练过程中任务中断，需从检查点恢复训练：",{"type":18,"tag":567,"props":568,"children":569},"ol",{},[570,575,587],{"type":18,"tag":82,"props":571,"children":572},{},[573],{"type":24,"value":574},"准备各卡的检查点文件（如rank0.ckpt~rank7.ckpt）和训练策略文件（train_strategy.ckpt）",{"type":18,"tag":82,"props":576,"children":577},{},[578,580,585],{"type":24,"value":579},"调用接口时指定",{"type":18,"tag":149,"props":581,"children":583},{"className":582},[],[584],{"type":24,"value":452},{"type":24,"value":586},"，接口会根据训练策略自动加载对应权重分片",{"type":18,"tag":82,"props":588,"children":589},{},[590],{"type":24,"value":591},"恢复训练后，优化器状态、训练轮次等信息也会同步加载，确保训练过程连续",{"type":18,"tag":169,"props":593,"children":595},{"id":594},"_332-训练策略调整",[596,601],{"type":18,"tag":36,"props":597,"children":600},{"href":598,"rel":599},"https://discuss.mindspore.cn/t/topic/146#p-205-h-332-23",[40],[],{"type":24,"value":602},"3.3.2 训练策略调整",{"type":18,"tag":56,"props":604,"children":605},{},[606],{"type":24,"value":607},"若需要从8卡训练调整为4卡继续训练：",{"type":18,"tag":567,"props":609,"children":610},{},[611,616,628],{"type":18,"tag":82,"props":612,"children":613},{},[614],{"type":24,"value":615},"生成新的推理策略文件（predict_strategy.ckpt），定义4卡环境下的权重分布",{"type":18,"tag":82,"props":617,"children":618},{},[619,621,626],{"type":24,"value":620},"通过",{"type":18,"tag":149,"props":622,"children":624},{"className":623},[],[625],{"type":24,"value":469},{"type":24,"value":627},"参数传入新策略，接口会自动将8卡的权重切分重新映射到4卡",{"type":18,"tag":82,"props":629,"children":630},{},[631],{"type":24,"value":632},"此过程无需手动处理权重分片，接口会根据策略智能完成数据重分布，保证训练连续性",{"type":18,"tag":31,"props":634,"children":636},{"id":635},"四完整工作流程示例",[637,642],{"type":18,"tag":36,"props":638,"children":641},{"href":639,"rel":640},"https://discuss.mindspore.cn/t/topic/146#p-205-h-24",[40],[],{"type":24,"value":643},"四、完整工作流程示例",{"type":18,"tag":44,"props":645,"children":647},{"id":646},"_41-分布式训练后合并权重",[648,653],{"type":18,"tag":36,"props":649,"children":652},{"href":650,"rel":651},"https://discuss.mindspore.cn/t/topic/146#p-205-h-41-25",[40],[],{"type":24,"value":654},"4.1 分布式训练后合并权重",{"type":18,"tag":56,"props":656,"children":657},{},[658],{"type":24,"value":659},"假设你已经完成了分布式训练，现在需要合并权重：",{"type":18,"tag":567,"props":661,"children":662},{},[663,668,680],{"type":18,"tag":82,"props":664,"children":665},{},[666],{"type":24,"value":667},"准备好分布式训练保存的权重文件和策略文件",{"type":18,"tag":82,"props":669,"children":670},{},[671,673,678],{"type":24,"value":672},"调用",{"type":18,"tag":149,"props":674,"children":676},{"className":675},[],[677],{"type":24,"value":154},{"type":24,"value":679},"接口合并权重",{"type":18,"tag":82,"props":681,"children":682},{},[683],{"type":24,"value":684},"检查合并结果",{"type":18,"tag":44,"props":686,"children":688},{"id":687},"_42-加载权重进行分布式推理",[689,694],{"type":18,"tag":36,"props":690,"children":693},{"href":691,"rel":692},"https://discuss.mindspore.cn/t/topic/146#p-205-h-42-26",[40],[],{"type":24,"value":695},"4.2 加载权重进行分布式推理",{"type":18,"tag":567,"props":697,"children":698},{},[699,704,709,720],{"type":18,"tag":82,"props":700,"children":701},{},[702],{"type":24,"value":703},"定义推理网络结构",{"type":18,"tag":82,"props":705,"children":706},{},[707],{"type":24,"value":708},"准备预测策略文件",{"type":18,"tag":82,"props":710,"children":711},{},[712,713,718],{"type":24,"value":672},{"type":18,"tag":149,"props":714,"children":716},{"className":715},[],[717],{"type":24,"value":396},{"type":24,"value":719},"接口加载权重",{"type":18,"tag":82,"props":721,"children":722},{},[723],{"type":24,"value":724},"执行分布式推理",{"type":18,"tag":44,"props":726,"children":728},{"id":727},"_43-权重格式转换与安全加载",[729,734],{"type":18,"tag":36,"props":730,"children":733},{"href":731,"rel":732},"https://discuss.mindspore.cn/t/topic/146#p-205-h-43-27",[40],[],{"type":24,"value":735},"4.3 权重格式转换与安全加载",{"type":18,"tag":567,"props":737,"children":738},{},[739,744,749],{"type":18,"tag":82,"props":740,"children":741},{},[742],{"type":24,"value":743},"将分布式ckpt权重转换为safetensors格式",{"type":18,"tag":82,"props":745,"children":746},{},[747],{"type":24,"value":748},"对敏感模型权重进行加密保存",{"type":18,"tag":82,"props":750,"children":751},{},[752],{"type":24,"value":753},"在部署环境中解密并加载权重",{"type":18,"tag":31,"props":755,"children":757},{"id":756},"五训练与推理场景的核心区别",[758,763],{"type":18,"tag":36,"props":759,"children":762},{"href":760,"rel":761},"https://discuss.mindspore.cn/t/topic/146#p-205-h-28",[40],[],{"type":24,"value":764},"五、训练与推理场景的核心区别",{"type":18,"tag":56,"props":766,"children":767},{},[768],{"type":24,"value":769},"应用场景",{"type":18,"tag":56,"props":771,"children":772},{},[773],{"type":24,"value":774},"核心目标",{"type":18,"tag":56,"props":776,"children":777},{},[778],{"type":24,"value":779},"策略文件类型",{"type":18,"tag":56,"props":781,"children":782},{},[783],{"type":24,"value":784},"权重处理方式",{"type":18,"tag":56,"props":786,"children":787},{},[788],{"type":24,"value":789},"典型参数配置",{"type":18,"tag":56,"props":791,"children":792},{},[793],{"type":18,"tag":86,"props":794,"children":795},{},[796],{"type":24,"value":797},"分布式推理",{"type":18,"tag":56,"props":799,"children":800},{},[801],{"type":24,"value":802},"高效部署模型",{"type":18,"tag":56,"props":804,"children":805},{},[806],{"type":24,"value":469},{"type":18,"tag":56,"props":808,"children":809},{},[810],{"type":24,"value":811},"按推理并行策略加载对应分片",{"type":18,"tag":56,"props":813,"children":814},{},[815],{"type":24,"value":816},"format=“safetensors”, network=None",{"type":18,"tag":56,"props":818,"children":819},{},[820],{"type":18,"tag":86,"props":821,"children":822},{},[823],{"type":24,"value":824},"分布式训练",{"type":18,"tag":56,"props":826,"children":827},{},[828],{"type":24,"value":829},"恢复训练或调整并行策略",{"type":18,"tag":56,"props":831,"children":832},{},[833],{"type":24,"value":834},"train_strategy",{"type":18,"tag":56,"props":836,"children":837},{},[838],{"type":24,"value":839},"按训练策略加载并同步状态",{"type":18,"tag":56,"props":841,"children":842},{},[843],{"type":24,"value":844},"train_strategy_filename=xxx",{"type":18,"tag":44,"props":846,"children":848},{"id":847},"关键差异说明",[849,854],{"type":18,"tag":36,"props":850,"children":853},{"href":851,"rel":852},"https://discuss.mindspore.cn/t/topic/146#p-205-h-29",[40],[],{"type":24,"value":855},"关键差异说明：",{"type":18,"tag":78,"props":857,"children":858},{},[859,883,893],{"type":18,"tag":82,"props":860,"children":861},{},[862,867,869,874,876,881],{"type":18,"tag":86,"props":863,"children":864},{},[865],{"type":24,"value":866},"策略文件",{"type":24,"value":868},"：训练使用",{"type":18,"tag":149,"props":870,"children":872},{"className":871},[],[873],{"type":24,"value":834},{"type":24,"value":875},"（含优化器状态、训练超参），推理使用",{"type":18,"tag":149,"props":877,"children":879},{"className":878},[],[880],{"type":24,"value":469},{"type":24,"value":882},"（侧重模型结构与并行部署）",{"type":18,"tag":82,"props":884,"children":885},{},[886,891],{"type":18,"tag":86,"props":887,"children":888},{},[889],{"type":24,"value":890},"权重完整性",{"type":24,"value":892},"：训练需加载优化器、调度器等完整状态，推理仅需模型权重",{"type":18,"tag":82,"props":894,"children":895},{},[896,901],{"type":18,"tag":86,"props":897,"children":898},{},[899],{"type":24,"value":900},"设备同步",{"type":24,"value":902},"：训练场景需确保各卡权重分片与策略严格一致，推理更侧重单卡/多卡的高效执行",{"type":18,"tag":31,"props":904,"children":906},{"id":905},"六常见问题与解决方案",[907,912],{"type":18,"tag":36,"props":908,"children":911},{"href":909,"rel":910},"https://discuss.mindspore.cn/t/topic/146#p-205-h-30",[40],[],{"type":24,"value":913},"六、常见问题与解决方案",{"type":18,"tag":44,"props":915,"children":917},{"id":916},"_61-参数名称不匹配",[918,923],{"type":18,"tag":36,"props":919,"children":922},{"href":920,"rel":921},"https://discuss.mindspore.cn/t/topic/146#p-205-h-61-31",[40],[],{"type":24,"value":924},"6.1 参数名称不匹配",{"type":18,"tag":56,"props":926,"children":927},{},[928],{"type":24,"value":929},"当遇到参数名称不匹配的问题时，可以：",{"type":18,"tag":78,"props":931,"children":932},{},[933,946,958],{"type":18,"tag":82,"props":934,"children":935},{},[936,938,944],{"type":24,"value":937},"使用",{"type":18,"tag":149,"props":939,"children":941},{"className":940},[],[942],{"type":24,"value":943},"strict_load=False",{"type":24,"value":945},"允许非严格匹配",{"type":18,"tag":82,"props":947,"children":948},{},[949,950,956],{"type":24,"value":620},{"type":18,"tag":149,"props":951,"children":953},{"className":952},[],[954],{"type":24,"value":955},"name_map",{"type":24,"value":957},"参数提供名称映射关系",{"type":18,"tag":82,"props":959,"children":960},{},[961,962,967],{"type":24,"value":937},{"type":18,"tag":149,"props":963,"children":965},{"className":964},[],[966],{"type":24,"value":302},{"type":24,"value":968},"在合并时修改参数名称",{"type":18,"tag":44,"props":970,"children":972},{"id":971},"_62-内存不足",[973,978],{"type":18,"tag":36,"props":974,"children":977},{"href":975,"rel":976},"https://discuss.mindspore.cn/t/topic/146#p-205-h-62-32",[40],[],{"type":24,"value":979},"6.2 内存不足",{"type":18,"tag":56,"props":981,"children":982},{},[983],{"type":24,"value":984},"处理超大规模模型时，可能会遇到内存不足的问题：",{"type":18,"tag":78,"props":986,"children":987},{},[988,999,1011],{"type":18,"tag":82,"props":989,"children":990},{},[991,992,997],{"type":24,"value":937},{"type":18,"tag":149,"props":993,"children":995},{"className":994},[],[996],{"type":24,"value":278},{"type":24,"value":998},"参数将任务切分为多个子任务",{"type":18,"tag":82,"props":1000,"children":1001},{},[1002,1004,1009],{"type":24,"value":1003},"调整",{"type":18,"tag":149,"props":1005,"children":1007},{"className":1006},[],[1008],{"type":24,"value":268},{"type":24,"value":1010},"控制并行度",{"type":18,"tag":82,"props":1012,"children":1013},{},[1014],{"type":24,"value":1015},"考虑使用内存映射技术或分批处理",{"type":18,"tag":44,"props":1017,"children":1019},{"id":1018},"_63-性能优化",[1020,1025],{"type":18,"tag":36,"props":1021,"children":1024},{"href":1022,"rel":1023},"https://discuss.mindspore.cn/t/topic/146#p-205-h-63-33",[40],[],{"type":24,"value":1026},"6.3 性能优化",{"type":18,"tag":56,"props":1028,"children":1029},{},[1030],{"type":24,"value":1031},"为提高权重转换和加载的效率，可以：",{"type":18,"tag":78,"props":1033,"children":1034},{},[1035,1045,1050],{"type":18,"tag":82,"props":1036,"children":1037},{},[1038,1040],{"type":24,"value":1039},"根据硬件资源调整",{"type":18,"tag":149,"props":1041,"children":1043},{"className":1042},[],[1044],{"type":24,"value":268},{"type":18,"tag":82,"props":1046,"children":1047},{},[1048],{"type":24,"value":1049},"使用高速存储设备（如SSD）存放权重文件",{"type":18,"tag":82,"props":1051,"children":1052},{},[1053],{"type":24,"value":1054},"利用多机并行处理大规模任务",{"type":18,"tag":44,"props":1056,"children":1058},{"id":1057},"_64-训练策略与当前环境不匹配",[1059,1064],{"type":18,"tag":36,"props":1060,"children":1063},{"href":1061,"rel":1062},"https://discuss.mindspore.cn/t/topic/146#p-205-h-64-34",[40],[],{"type":24,"value":1065},"6.4 训练策略与当前环境不匹配",{"type":18,"tag":56,"props":1067,"children":1068},{},[1069,1074,1076,1081],{"type":18,"tag":86,"props":1070,"children":1071},{},[1072],{"type":24,"value":1073},"问题现象",{"type":24,"value":1075},"：加载时提示策略中的设备数与当前环境不一致 ",{"type":18,"tag":86,"props":1077,"children":1078},{},[1079],{"type":24,"value":1080},"解决方案",{"type":24,"value":454},{"type":18,"tag":78,"props":1083,"children":1084},{},[1085,1097],{"type":18,"tag":82,"props":1086,"children":1087},{},[1088,1090,1095],{"type":24,"value":1089},"若设备数减少：通过",{"type":18,"tag":149,"props":1091,"children":1093},{"className":1092},[],[1094],{"type":24,"value":469},{"type":24,"value":1096},"指定新策略，接口自动合并权重分片",{"type":18,"tag":82,"props":1098,"children":1099},{},[1100,1102,1107],{"type":24,"value":1101},"若设备数增加：需重新训练或使用模型并行策略拆分权重，可结合",{"type":18,"tag":149,"props":1103,"children":1105},{"className":1104},[],[1106],{"type":24,"value":154},{"type":24,"value":1108},"重新合并",{"type":18,"tag":44,"props":1110,"children":1112},{"id":1111},"_65-优化器状态加载失败",[1113,1118],{"type":18,"tag":36,"props":1114,"children":1117},{"href":1115,"rel":1116},"https://discuss.mindspore.cn/t/topic/146#p-205-h-65-35",[40],[],{"type":24,"value":1119},"6.5 优化器状态加载失败",{"type":18,"tag":56,"props":1121,"children":1122},{},[1123,1128,1130,1135],{"type":18,"tag":86,"props":1124,"children":1125},{},[1126],{"type":24,"value":1127},"问题描述",{"type":24,"value":1129},"：训练恢复时优化器参数加载报错 ",{"type":18,"tag":86,"props":1131,"children":1132},{},[1133],{"type":24,"value":1134},"解决方法",{"type":24,"value":454},{"type":18,"tag":78,"props":1137,"children":1138},{},[1139,1144,1149],{"type":18,"tag":82,"props":1140,"children":1141},{},[1142],{"type":24,"value":1143},"确保检查点文件包含优化器状态（如使用Model.save_checkpoint保存完整状态）",{"type":18,"tag":82,"props":1145,"children":1146},{},[1147],{"type":24,"value":1148},"检查优化器定义是否与训练时一致（如学习率调度器、权重衰减等参数）",{"type":18,"tag":82,"props":1150,"children":1151},{},[1152,1154,1159],{"type":24,"value":1153},"设置",{"type":18,"tag":149,"props":1155,"children":1157},{"className":1156},[],[1158],{"type":24,"value":943},{"type":24,"value":1160},"允许优化器参数的兼容加载",{"type":18,"tag":44,"props":1162,"children":1164},{"id":1163},"_66-多机训练权重不一致",[1165,1170],{"type":18,"tag":36,"props":1166,"children":1169},{"href":1167,"rel":1168},"https://discuss.mindspore.cn/t/topic/146#p-205-h-66-36",[40],[],{"type":24,"value":1171},"6.6 多机训练权重不一致",{"type":18,"tag":56,"props":1173,"children":1174},{},[1175,1180,1182,1187],{"type":18,"tag":86,"props":1176,"children":1177},{},[1178],{"type":24,"value":1179},"问题原因",{"type":24,"value":1181},"：各机加载的权重分片错误或策略不同步 ",{"type":18,"tag":86,"props":1183,"children":1184},{},[1185],{"type":24,"value":1186},"预防措施",{"type":24,"value":454},{"type":18,"tag":78,"props":1189,"children":1190},{},[1191,1203,1215],{"type":18,"tag":82,"props":1192,"children":1193},{},[1194,1196,1201],{"type":24,"value":1195},"统一使用相同的",{"type":18,"tag":149,"props":1197,"children":1199},{"className":1198},[],[1200],{"type":24,"value":452},{"type":24,"value":1202},"和检查点文件列表",{"type":18,"tag":82,"props":1204,"children":1205},{},[1206,1208,1213],{"type":24,"value":1207},"确保各机的",{"type":18,"tag":149,"props":1209,"children":1211},{"className":1210},[],[1212],{"type":24,"value":527},{"type":24,"value":1214},"与检查点文件的rank顺序严格对应",{"type":18,"tag":82,"props":1216,"children":1217},{},[1218],{"type":24,"value":1219},"加载完成后可通过简单前向传播验证各机输出一致性",{"type":18,"tag":31,"props":1221,"children":1223},{"id":1222},"七总结",[1224,1229],{"type":18,"tag":36,"props":1225,"children":1228},{"href":1226,"rel":1227},"https://discuss.mindspore.cn/t/topic/146#p-205-h-37",[40],[],{"type":24,"value":1230},"七、总结",{"type":18,"tag":56,"props":1232,"children":1233},{},[1234,1236,1241,1243,1248],{"type":24,"value":1235},"MindSpore提供的",{"type":18,"tag":149,"props":1237,"children":1239},{"className":1238},[],[1240],{"type":24,"value":154},{"type":24,"value":1242},"和",{"type":18,"tag":149,"props":1244,"children":1246},{"className":1245},[],[1247],{"type":24,"value":396},{"type":24,"value":1249},"接口为分布式训练和推理提供了强大的权重管理能力。通过合理使用这两个接口，你可以：",{"type":18,"tag":78,"props":1251,"children":1252},{},[1253,1258,1263,1268,1273],{"type":18,"tag":82,"props":1254,"children":1255},{},[1256],{"type":24,"value":1257},"高效合并分布式训练产生的权重",{"type":18,"tag":82,"props":1259,"children":1260},{},[1261],{"type":24,"value":1262},"灵活加载权重用于分布式推理和训练",{"type":18,"tag":82,"props":1264,"children":1265},{},[1266],{"type":24,"value":1267},"实现不同格式间的权重转换",{"type":18,"tag":82,"props":1269,"children":1270},{},[1271],{"type":24,"value":1272},"保障敏感模型的安全性",{"type":18,"tag":82,"props":1274,"children":1275},{},[1276],{"type":24,"value":1277},"无缝恢复训练任务，灵活调整并行策略",{"type":18,"tag":56,"props":1279,"children":1280},{},[1281],{"type":24,"value":1282},"掌握这些技术，将帮助你更轻松地应对深度学习模型开发和部署中的各种挑战。",{"title":7,"searchDepth":1284,"depth":1284,"links":1285},4,[1286,1292,1305,1317,1322,1325,1333],{"id":33,"depth":1287,"text":9,"children":1288},2,[1289,1291],{"id":46,"depth":1290,"text":54},3,{"id":63,"depth":1290,"text":71},{"id":125,"depth":1287,"text":133,"children":1293},[1294,1295,1301],{"id":136,"depth":1290,"text":144},{"id":159,"depth":1290,"text":167,"children":1296},[1297,1298,1299,1300],{"id":171,"depth":1284,"text":179},{"id":215,"depth":1284,"text":223},{"id":249,"depth":1284,"text":257},{"id":283,"depth":1284,"text":291},{"id":307,"depth":1290,"text":315,"children":1302},[1303,1304],{"id":318,"depth":1284,"text":326},{"id":343,"depth":1284,"text":351},{"id":368,"depth":1287,"text":376,"children":1306},[1307,1308,1313],{"id":379,"depth":1290,"text":387},{"id":422,"depth":1290,"text":430,"children":1309},[1310,1311,1312],{"id":433,"depth":1284,"text":441},{"id":474,"depth":1284,"text":482},{"id":508,"depth":1284,"text":516},{"id":541,"depth":1290,"text":549,"children":1314},[1315,1316],{"id":552,"depth":1284,"text":560},{"id":594,"depth":1284,"text":602},{"id":635,"depth":1287,"text":643,"children":1318},[1319,1320,1321],{"id":646,"depth":1290,"text":654},{"id":687,"depth":1290,"text":695},{"id":727,"depth":1290,"text":735},{"id":756,"depth":1287,"text":764,"children":1323},[1324],{"id":847,"depth":1290,"text":855},{"id":905,"depth":1287,"text":913,"children":1326},[1327,1328,1329,1330,1331,1332],{"id":916,"depth":1290,"text":924},{"id":971,"depth":1290,"text":979},{"id":1018,"depth":1290,"text":1026},{"id":1057,"depth":1290,"text":1065},{"id":1111,"depth":1290,"text":1119},{"id":1163,"depth":1290,"text":1171},{"id":1222,"depth":1287,"text":1230},"markdown","content:technology-blogs:zh:3778.md","content","technology-blogs/zh/3778.md","technology-blogs/zh/3778","md",1776506135103]