[{"data":1,"prerenderedAt":385},["ShallowReactive",2],{"content-query-z9aWvrXeLT":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":379,"_id":380,"_source":381,"_file":382,"_stem":383,"_extension":384},"/news/zh/2024-4-6","zh",false,"","AI Infra训练好搭档——MindSpore Model Agent v0.1正式发布，专治模型训练“疑难杂症”","面向昇腾，围绕模型训练过程中的痛点问题（训练报错、模型精度、性能瓶颈），以 skills 的形式沉淀，致力于提升算法工程师的开发体验。","2026-4-6","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/25/199b735845bf4106b44b2035dc97bd39.png","news",{"type":14,"children":15,"toc":368},"root",[16,24,30,37,42,53,58,63,68,73,78,84,89,94,99,104,109,114,121,127,132,137,142,147,153,167,172,177,182,187,192,197,202,207,213,218,223,228,233,238,243,249,254,267,272,281,286,293,299,304,309,314,319,324,329,346,363],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"ai-infra训练好搭档mindspore-model-agent-v01正式发布专治模型训练疑难杂症",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":23,"value":29},"你是否也正在经历着训练报错、精度跑偏、性能卡顿…等问题？不妨试试MindSpore Model Agent！面向昇腾，围绕模型训练过程中的痛点问题（训练报错、模型精度、性能瓶颈），以 skills 的形式沉淀，致力于提升算法工程师的开发体验。",{"type":17,"tag":31,"props":32,"children":34},"h2",{"id":33},"_01-模型训练痛点",[35],{"type":23,"value":36},"01 模型训练痛点",{"type":17,"tag":25,"props":38,"children":39},{},[40],{"type":23,"value":41},"模型训练中的问题，高频、复杂、跨层，单一工具往往搞不定。从安装部署、模型迁移、功能开发、训练调试到问题定位，整个过程高度依赖开发者自身经验。这不仅让问题处理效率变低，也影响开发者的使用体验。",{"type":17,"tag":43,"props":44,"children":46},"div",{"style":45},"text-align: center;",[47],{"type":17,"tag":48,"props":49,"children":52},"img",{"src":50,"style":51,"alt":7},"/category/information/news/banner/2026-4-6/1.jpg","display: block;margin: 0 auto;max-width:70%",[],{"type":17,"tag":25,"props":54,"children":55},{},[56],{"type":23,"value":57},"具体来看，常见痛点主要集中在以下几个方面：",{"type":17,"tag":25,"props":59,"children":60},{},[61],{"type":23,"value":62},"环境与依赖：训练启动前即可能受阻于 
MindSpore、CANN、驱动、模型库、recipe、checkpoint、数据预处理等组件之间的版本与依赖关系。",{"type":17,"tag":25,"props":64,"children":65},{},[66],{"type":23,"value":67},"报错定位：问题不止于日志表面，涉及用户脚本、模型代码、框架行为、底层算子、编译图及运行时环境等多个层次，根因未必出现在首条报错信息中。",{"type":17,"tag":25,"props":69,"children":70},{},[71],{"type":23,"value":72},"精度偏差：排查链路通常很长，从数据 shuffle、API 行为差异，到底层算子实现差异、反向误差累积、融合算子带来的误差放大，往往需要追溯整网实现细节。",{"type":17,"tag":25,"props":74,"children":75},{},[76],{"type":23,"value":77},"性能瓶颈：profiling、dump、host/device 分析、算子耗时、数据处理瓶颈等分析工作，需在多工具间切换，信息分散，门槛较高。",{"type":17,"tag":31,"props":79,"children":81},{"id":80},"_02-为什么做mindspore-model-agent",[82],{"type":23,"value":83},"02 为什么做MindSpore Model Agent",{"type":17,"tag":25,"props":85,"children":86},{},[87],{"type":23,"value":88},"针对上述痛点，我们打造了一套基于 Agent 的问题处理机制，把模型训练中的“脏活累活”都交由Agent完成，以此来帮助开发者快速推进训练任务，提升开发效率。",{"type":17,"tag":25,"props":90,"children":91},{},[92],{"type":23,"value":93},"环境准备：自动检查环境、依赖、数据、checkpoint，把问题暴露在启动之前。",{"type":17,"tag":25,"props":95,"children":96},{},[97],{"type":23,"value":98},"增加新特性：把论文实现集成进现有模型仓、调通训练脚本、验证新特性是否影响现有精度等，帮你完成快速实验。",{"type":17,"tag":25,"props":100,"children":101},{},[102],{"type":23,"value":103},"训练起不来：不止看日志表面，帮你快速定位根因在脚本、框架、算子还是运行时。",{"type":17,"tag":25,"props":105,"children":106},{},[107],{"type":23,"value":108},"训练精度不达标：从数据 shuffle 到算子行为，逐层排查精度偏差的真实源头。",{"type":17,"tag":25,"props":110,"children":111},{},[112],{"type":23,"value":113},"性能不佳：围绕吞吐、时延、利用率、数据链路，帮你找到最值得优化的瓶颈。",{"type":17,"tag":43,"props":115,"children":116},{"style":45},[117],{"type":17,"tag":48,"props":118,"children":120},{"src":119,"style":51,"alt":7},"/category/information/news/banner/2026-4-6/2.jpg",[],{"type":17,"tag":31,"props":122,"children":124},{"id":123},"_03-与业界coding-agent-区别",[125],{"type":23,"value":126},"03 与业界Coding Agent 区别",{"type":17,"tag":25,"props":128,"children":129},{},[130],{"type":23,"value":131},"MindSpore Model Agent 沉淀多年大模型领域经验，打造面向AI 
Infra和模型训练场景的专用Agent。",{"type":17,"tag":25,"props":133,"children":134},{},[135],{"type":23,"value":136},"业界主流的 CLI Agent 围绕“代码”展开：代码生成、代码修改、测试执行、仓库内协作。其核心问题是“代码怎么写出来”。",{"type":17,"tag":25,"props":138,"children":139},{},[140],{"type":23,"value":141},"MindSpore Model Agent 则围绕“模型训练”展开：环境依赖、框架行为、脚本调试、日志分析、profiling、精度与性能定位。其核心问题是“训练怎么跑起来、跑对、跑快”。",{"type":17,"tag":25,"props":143,"children":144},{},[145],{"type":23,"value":146},"两者的区别在于：主流 Coding Agent围绕“代码库理解”和“写代码”，MindSpore Model Agent目标是解决模型训练中的框架、算子、数据、运行时等多类问题，协助算法工程师把“训练任务真正推进下去”。",{"type":17,"tag":31,"props":148,"children":150},{"id":149},"_04-mindspore-model-agent特性",[151],{"type":23,"value":152},"04 MindSpore Model Agent特性",{"type":17,"tag":25,"props":154,"children":155},{},[156,158,165],{"type":23,"value":157},"MindSpore Model Agent 围绕模型训练场景，优先落地一组最有工程价值、最贴近真实问题的关键能力，并以 ",{"type":17,"tag":159,"props":160,"children":162},"code",{"className":161},[],[163],{"type":23,"value":164},"mindspore-skills",{"type":23,"value":166}," 的形式呈现。",{"type":17,"tag":25,"props":168,"children":169},{},[170],{"type":23,"value":171},"当前Agent组件包括：",{"type":17,"tag":25,"props":173,"children":174},{},[175],{"type":23,"value":176},"环境分析Agent (readiness-agent)",{"type":17,"tag":25,"props":178,"children":179},{},[180],{"type":23,"value":181},"失败分析Agent (failure-agent)",{"type":17,"tag":25,"props":183,"children":184},{},[185],{"type":23,"value":186},"精度分析Agent (accuracy-agent)",{"type":17,"tag":25,"props":188,"children":189},{},[190],{"type":23,"value":191},"性能分析Agent (performance-agent)",{"type":17,"tag":25,"props":193,"children":194},{},[195],{"type":23,"value":196},"模型迁移Agent (migrate-agent)",{"type":17,"tag":25,"props":198,"children":199},{},[200],{"type":23,"value":201},"mindspore-skills 是模型训练场景的专业能力沉淀：将环境检查、失败分析、精度定位、性能分析等专家经验，逐步整理为可调用、可组合、可演进的 skills、workflow、example 和 diagnose pattern。开发者可使用业界 CLI Agent加载这些领域 skills。",{"type":17,"tag":25,"props":203,"children":204},{},[205],{"type":23,"value":206},"同时，我们推出 MindSpore 
CLI，端到端串联上述能力，形成统一工作流。开发者可在同一交互面中完成训练前检查、训练中问题分析、训练后精度与性能定位，将原本分散在命令、脚本、日志、工具间的动作组织起来。后续将针对 AI Infra 场景问题持续深度优化，提升算法工程师的开发体验。",{"type":17,"tag":31,"props":208,"children":210},{"id":209},"_05-演进方向",[211],{"type":23,"value":212},"05 演进方向",{"type":17,"tag":25,"props":214,"children":215},{},[216],{"type":23,"value":217},"我们将“模型算法从 idea 到部署的全流程”作为场景驱动，覆盖微调、预训练、后训练、强化学习、部署等流程。",{"type":17,"tag":25,"props":219,"children":220},{},[221],{"type":23,"value":222},"当前重点：优先做扎实单机训练场景，再逐步支持更为复杂的集群训练、后训练与强化学习等场景。",{"type":17,"tag":25,"props":224,"children":225},{},[226],{"type":23,"value":227},"版本优化方向：",{"type":17,"tag":25,"props":229,"children":230},{},[231],{"type":23,"value":232},"持续完善failure agent 的报错类型，包括算子/runtime等；",{"type":17,"tag":25,"props":234,"children":235},{},[236],{"type":23,"value":237},"持续完善accuracy agent的数据处理和 API 累积误差修复；",{"type":17,"tag":25,"props":239,"children":240},{},[241],{"type":23,"value":242},"进一步增加performance agent的常用昇腾亲和算子实现种类。",{"type":17,"tag":31,"props":244,"children":246},{"id":245},"_06-安装命令",[247],{"type":23,"value":248},"06 安装命令",{"type":17,"tag":25,"props":250,"children":251},{},[252],{"type":23,"value":253},"看到这里，你是否也想亲自体验一下：让 Agent 帮你搞定环境检查、报错定位、精度排查、性能优化这些“脏活累活”？",{"type":17,"tag":25,"props":255,"children":256},{},[257,259],{"type":23,"value":258},"1️⃣ 安装mindspore-skills，支持opencode等工具，详见：\n",{"type":17,"tag":260,"props":261,"children":265},"a",{"href":262,"rel":263},"https://gitcode.com/mindspore-lab/mindspore-skills/blob/main/README.md",[264],"nofollow",[266],{"type":23,"value":262},{"type":17,"tag":25,"props":268,"children":269},{},[270],{"type":23,"value":271},"2️⃣ 安装mindspore-cli，一行命令：",{"type":17,"tag":273,"props":274,"children":276},"pre",{"code":275},"curl -fsSL https://raw.githubusercontent.com/mindspore-lab/mindspore-cli/main/scripts/install.sh | 
bash\n",[277],{"type":17,"tag":159,"props":278,"children":279},{"__ignoreMap":7},[280],{"type":23,"value":275},{"type":17,"tag":25,"props":282,"children":283},{},[284],{"type":23,"value":285},"装完你就多了一个训练小帮手，遇到问题可以在群里随时喊专家答疑～",{"type":17,"tag":43,"props":287,"children":288},{"style":45},[289],{"type":17,"tag":48,"props":290,"children":292},{"src":291,"style":51,"alt":7},"/category/information/news/banner/2026-4-6/3.jpg",[],{"type":17,"tag":31,"props":294,"children":296},{"id":295},"_07-社区贡献",[297],{"type":23,"value":298},"07 社区贡献",{"type":17,"tag":25,"props":300,"children":301},{},[302],{"type":23,"value":303},"围绕 mindspore-skills，我们将持续沉淀 skills、workflow、examples、docs 及 diagnose patterns。欢迎每一位开发者加入，一起把模型训练的经验变成可复用的能力。",{"type":17,"tag":25,"props":305,"children":306},{},[307],{"type":23,"value":308},"贡献者成长阶梯：",{"type":17,"tag":25,"props":310,"children":311},{},[312],{"type":23,"value":313},"L1 Content Contributor：补充 skill issue 与场景，清晰描述经验，完善用法。",{"type":17,"tag":25,"props":315,"children":316},{},[317],{"type":23,"value":318},"L2 Skill Contributor：新增或改进 skill，将零散经验整理为可复用能力。",{"type":17,"tag":25,"props":320,"children":321},{},[322],{"type":23,"value":323},"L3 Skill Owner：维护并 review 某类 skill，成为领域骨干。",{"type":17,"tag":25,"props":325,"children":326},{},[327],{"type":23,"value":328},"无论你处于哪个阶段，我们都期待你的参与。通过 issue 提出反馈或贡献内容，我们会持续迭代优化。",{"type":17,"tag":25,"props":330,"children":331},{},[332,334,340],{"type":23,"value":333},"MindSpore Skills ：\n",{"type":17,"tag":260,"props":335,"children":338},{"href":336,"rel":337},"https://github.com/mindspore-lab/mindspore-skills",[264],[339],{"type":23,"value":336},{"type":17,"tag":260,"props":341,"children":344},{"href":342,"rel":343},"https://gitcode.com/mindspore-lab/mindspore-skills",[264],[345],{"type":23,"value":342},{"type":17,"tag":25,"props":347,"children":348},{},[349,351,357],{"type":23,"value":350},"MindSpore CLI 
：\n",{"type":17,"tag":260,"props":352,"children":355},{"href":353,"rel":354},"https://github.com/mindspore-lab/mindspore-cli",[264],[356],{"type":23,"value":353},{"type":17,"tag":260,"props":358,"children":361},{"href":359,"rel":360},"https://gitcode.com/mindspore-lab/mindspore-cli",[264],[362],{"type":23,"value":359},{"type":17,"tag":25,"props":364,"children":365},{},[366],{"type":23,"value":367},"💡 一起，让训练更简单。",{"title":7,"searchDepth":369,"depth":369,"links":370},4,[371,373,374,375,376,377,378],{"id":33,"depth":372,"text":36},2,{"id":80,"depth":372,"text":83},{"id":123,"depth":372,"text":126},{"id":149,"depth":372,"text":152},{"id":209,"depth":372,"text":212},{"id":245,"depth":372,"text":248},{"id":295,"depth":372,"text":298},"markdown","content:news:zh:2024-4-6.md","content","news/zh/2024-4-6.md","news/zh/2024-4-6","md",1776506059358]