[{"data":1,"prerenderedAt":295},["ShallowReactive",2],{"content-query-r9n3dmjucX":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":289,"_id":290,"_source":291,"_file":292,"_stem":293,"_extension":294},"/technology-blogs/zh/2026-2-3","zh",false,"","昇思MindSpore 2.8框架自定义能力技术解读","MindSpore 2.8在自定义算子方面实现显著优化","2026-2-3","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/28/8e0e0150508a4c5ba4287fa3bec8ea3f.png","technology-blogs","技术解读",{"type":15,"children":16,"toc":286},"root",[17,25,31,37,42,47,52,57,62,72,77,82,93,98,103,108,113,121,126,130,139,144,149,154,159,167,172,180,184,193,199,204,209,214,219,224,229,234,239,245,250,255,260,265,270,275],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"昇思mindspore-28框架自定义能力技术解读",[23],{"type":24,"value":8},"text",{"type":18,"tag":19,"props":26,"children":28},{"id":27},"_01-核心技术创新解读",[29],{"type":24,"value":30},"01 核心技术创新解读",{"type":18,"tag":32,"props":33,"children":34},"p",{},[35],{"type":24,"value":36},"1.1 自定义算子：高性能计算单元的无缝嵌入",{"type":18,"tag":32,"props":38,"children":39},{},[40],{"type":24,"value":41},"算子是模型核心计算单元，前沿算法与垂直领域对其需求日趋个性化，但主流框架预置算子库更新滞后、难以适配特定场景，且传统自定义算子开发流程繁琐、兼容性差，开发与落地成本居高不下。",{"type":18,"tag":32,"props":43,"children":44},{},[45],{"type":24,"value":46},"MindSpore 2.8在自定义算子方面实现显著优化，精准破解上述痛点：",{"type":18,"tag":32,"props":48,"children":49},{},[50],{"type":24,"value":51},"极简C++开发接口：提供简洁API，大幅降低开发门槛，开发者无需深入了解框架底层架构，即可快速编写自定义算子逻辑。",{"type":18,"tag":32,"props":53,"children":54},{},[55],{"type":24,"value":56},"内置四级流水机制：创新性引入PyTask、FrontendTask、DeviceTask和LaunchTask四级流水架构，将算子执行流程分解为多个并行阶段，有效规避单环节阻塞问题，显著提升执行效率，尤其适配大模型多算子并行计算场景。",{"type":18,"tag":32,"props":58,"children":59},{},[60],{"type":24,"value":61},"一站式工具链：CustomOpBuilder工具实现编译与加载一步完成，开发者可像调用内置算子一样直接使用，大幅简化开发流程，提升开发效率：",{"type":18,"tag":63,"props":64,"children":66},"pre",{"code":65},"  python\n  my_ops = 
CustomOpBuilder(\"cpu_add\", ['./pyboost_cpu_add.cpp'],\n  backend=\"CPU\").load()\n  out = my_ops.add3(Tensor(x), Tensor(y), Tensor(z))\n",[67],{"type":18,"tag":68,"props":69,"children":70},"code",{"__ignoreMap":7},[71],{"type":24,"value":65},{"type":18,"tag":32,"props":73,"children":74},{},[75],{"type":24,"value":76},"加速库深度集成：提供AclnnOpRunner、AtbOpRunner和AsdSipFFTOpRunner等专用运行器，为ACLNN、ATB、ASDSIP三大加速库量身定制对接接口，实现\"零成本\"接入高性能算子，充分发挥硬件加速潜力。",{"type":18,"tag":32,"props":78,"children":79},{},[80],{"type":24,"value":81},"参考链接：",{"type":18,"tag":32,"props":83,"children":84},{},[85],{"type":18,"tag":86,"props":87,"children":91},"a",{"href":88,"rel":89},"https://www.mindspore.cn/tutorials/zh-CN/master/custom_program/operation/op_customopbuilder.html",[90],"nofollow",[92],{"type":24,"value":88},{"type":18,"tag":32,"props":94,"children":95},{},[96],{"type":24,"value":97},"1.2 自定义PASS：计算图优化的精准控制",{"type":18,"tag":32,"props":99,"children":100},{},[101],{"type":24,"value":102},"计算图优化直接影响模型性能，但主流框架PASS优化采用“一刀切”策略，无法精准匹配不同模型、硬件的个性化需求，且多数框架不开放PASS自定义接口，限制了优化的灵活性与深度。",{"type":18,"tag":32,"props":104,"children":105},{},[106],{"type":24,"value":107},"MindSpore 2.8开放框架PASS编写及注册接口，打破优化壁垒，让开发者能在编译期对计算图进行精细控制，实现\"按需优化\"：",{"type":18,"tag":32,"props":109,"children":110},{},[111],{"type":24,"value":112},"模式化图变换：通过继承PatternToPatternPass基类，开发者可清晰定义源模式、目标模式和匹配条件，实现结构化图优化，无需手动遍历计算图节点，大幅降低PASS开发难度。例如，可快速实现\"Add+Neg\"算子向\"Sub\"算子的融合，减少计算节点数量，提升执行效率：",{"type":18,"tag":63,"props":114,"children":116},{"code":115},"cpp\nvoid AddNegFusionPass::DefineSrcPattern(SrcPattern *src_pattern) {\n  (*src_pattern)\n    .AddVar(\"x\")\n    .AddVar(\"y\")\n    .AddCNode(\"neg\", {std::make_shared\u003CPrimitive>(\"Neg\"), \"y\"})\n    .AddCNode(\"add\", {std::make_shared\u003CPrimitive>(\"Add\"), \"x\", \"neg\"});\n}\n  \nvoid AddNegFusionPass::DefineDstPattern(DstPattern *dst_pattern) {\n  (*dst_pattern).AddCNode(\"sub\", {std::make_shared\u003CPrimitive>(\"Sub\"), \"x\", \"y\"}, 
BuildSub);\n}\n",[117],{"type":18,"tag":68,"props":118,"children":119},{"__ignoreMap":7},[120],{"type":24,"value":115},{"type":18,"tag":32,"props":122,"children":123},{},[124],{"type":24,"value":125},"灵活的注册机制：提供register_custom_pass接口，支持按后端类型和优化阶段精准注册自定义PASS，使优化策略与硬件特性、模型需求深度匹配。例如，可针对昇腾NPU注册专属融合PASS，针对CPU注册内存复用PASS，实现计算图优化的精准落地。",{"type":18,"tag":32,"props":127,"children":128},{},[129],{"type":24,"value":81},{"type":18,"tag":32,"props":131,"children":132},{},[133],{"type":18,"tag":86,"props":134,"children":137},{"href":135,"rel":136},"https://www.mindspore.cn/tutorials/zh-CN/master/custom_program/custom_pass.html",[90],[138],{"type":24,"value":135},{"type":18,"tag":32,"props":140,"children":141},{},[142],{"type":24,"value":143},"1.3 自定义后端：运行时环境的全面掌控",{"type":18,"tag":32,"props":145,"children":146},{},[147],{"type":24,"value":148},"AI硬件生态日趋多元，但主流框架后端与特定硬件绑定较深，通用后端无法充分发挥新型硬件潜力、适配成本高，且传统后端切换繁琐，难以满足多场景部署需求。",{"type":18,"tag":32,"props":150,"children":151},{},[152],{"type":24,"value":153},"为满足异构计算需求，破解硬件适配难题，MindSpore 2.8提供完整后端扩展机制，让开发者可全面掌控运行时环境，实现框架与硬件的高效适配：",{"type":18,"tag":32,"props":155,"children":156},{},[157],{"type":24,"value":158},"标准化抽象接口：\n通过继承BackendBase基类，开发者只需实现Build和Run两个核心接口，即可快速开发自定义后端，无需关注框架底层的调度逻辑，大幅降低后端开发门槛，缩短硬件适配周期：",{"type":18,"tag":63,"props":160,"children":162},{"code":161},"cpp\nclass MSCustomBackendBase : public BackendBase {\npublic:\n  BackendGraphId Build(const FuncGraphPtr &func_graph, const BackendJitConfig &backend_jit_config);\n  RunningStatus Run(BackendGraphId graph_id, const VectorRef &inputs, VectorRef *outputs);\n};\n",[163],{"type":18,"tag":68,"props":164,"children":165},{"__ignoreMap":7},[166],{"type":24,"value":161},{"type":18,"tag":32,"props":168,"children":169},{},[170],{"type":24,"value":171},"无缝集成体验：通过MS_REGISTER_BACKEND宏注册自定义后端，结合Python端register_custom_backend接口，实现在同一程序中自由切换不同后端，无需重新编译模型，适配多场景部署需求，提升开发与部署效率：",{"type":18,"tag":63,"props":173,"children":175},{"code":174},"python\n@jit(backend=\"my_custom_backend\")\ndef 
net1(x):\n    return mint.sin(x)\n      \n@jit(backend=\"ms_backend\")\ndef net2(x):\n    return mint.cos(x)\n",[176],{"type":18,"tag":68,"props":177,"children":178},{"__ignoreMap":7},[179],{"type":24,"value":174},{"type":18,"tag":32,"props":181,"children":182},{},[183],{"type":24,"value":81},{"type":18,"tag":32,"props":185,"children":186},{},[187],{"type":18,"tag":86,"props":188,"children":191},{"href":189,"rel":190},"https://www.mindspore.cn/tutorials/zh-CN/master/custom_program/custom_backend.html",[90],[192],{"type":24,"value":189},{"type":18,"tag":19,"props":194,"children":196},{"id":195},"_02-技术价值与应用场景",[197],{"type":24,"value":198},"02 技术价值与应用场景",{"type":18,"tag":32,"props":200,"children":201},{},[202],{"type":24,"value":203},"2.1 科学计算领域",{"type":18,"tag":32,"props":205,"children":206},{},[207],{"type":24,"value":208},"在物理模拟、分子动力学等场景中，研究者可将领域知识直接编码为高性能算子，避免在框架限制与科学需求间妥协，加速科研成果转化。",{"type":18,"tag":32,"props":210,"children":211},{},[212],{"type":24,"value":213},"2.2 专用硬件加速",{"type":18,"tag":32,"props":215,"children":216},{},[217],{"type":24,"value":218},"硬件厂商可在保持MindSpore统一编程模型的同时，充分发挥自主创新AI芯片或特定架构加速器的特性，通过自定义后端与算子，加速AI模型在边缘设备、数据中心等场景的落地，推动硬件生态繁荣。",{"type":18,"tag":32,"props":220,"children":221},{},[222],{"type":24,"value":223},"2.3 垂直行业优化",{"type":18,"tag":32,"props":225,"children":226},{},[227],{"type":24,"value":228},"企业可针对金融风控、医疗影像等垂直领域的特定模型结构，通过自定义PASS实施深度优化，如算子融合、内存复用策略调整，显著提升推理性能与资源利用率，降低部署成本。",{"type":18,"tag":32,"props":230,"children":231},{},[232],{"type":24,"value":233},"2.4 前沿算法快速验证",{"type":18,"tag":32,"props":235,"children":236},{},[237],{"type":24,"value":238},"研究人员能够快速实现并验证创新想法（如新算子、新架构），通过三大自定义能力快速搭建实验环境，大幅缩短从理论到实践的转化周期，助力算法创新突破。",{"type":18,"tag":19,"props":240,"children":242},{"id":241},"_03-总结与展望",[243],{"type":24,"value":244},"03 总结与展望",{"type":18,"tag":32,"props":246,"children":247},{},[248],{"type":24,"value":249},"昇思MindSpore 
2.8通过增强三大自定义能力，构建了开放、灵活且高效的技术生态：",{"type":18,"tag":32,"props":251,"children":252},{},[253],{"type":24,"value":254},"开放性：提供标准化扩展接口，打破框架黑盒，使开发者深度参与框架演进，形成\"框架-开发者-硬件厂商\"的协同生态；",{"type":18,"tag":32,"props":256,"children":257},{},[258],{"type":24,"value":259},"灵活性：支持从算子、图优化到运行时的全栈定制，精准适配前沿算法、垂直行业与新型硬件的多样化需求；",{"type":18,"tag":32,"props":261,"children":262},{},[263],{"type":24,"value":264},"高效性：通过多级流水等创新机制，在提供开放能力的同时不牺牲性能，实现\"灵活定制\"与\"高效运行\"的双重目标。",{"type":18,"tag":32,"props":266,"children":267},{},[268],{"type":24,"value":269},"随着AI与各行业深度融合，框架的自定义能力将成为核心竞争力。昇思MindSpore的技术路线不仅解决当前开发者痛点，更为构建\"框架-算法-硬件\"协同优化的新一代AI基础设施奠定基础。在\"框架通用性\"与\"场景特殊性\"的长期博弈中，昇思MindSpore 2.8选择平衡之路：保持核心架构统一性，同时通过精心设计的扩展点赋予开发者充分定制自由，代表了深度学习框架从封闭工具箱向开放创新平台演进的重要趋势。",{"type":18,"tag":32,"props":271,"children":272},{},[273],{"type":24,"value":274},"关于昇思MindSpore 2.8版本的更多信息，请参考：",{"type":18,"tag":32,"props":276,"children":277},{},[278,280],{"type":24,"value":279},"昇思MindSpore 2.8版本正式发布，为超节点而生的HyperParallel架构，训推更灵活、更高效：",{"type":18,"tag":86,"props":281,"children":284},{"href":282,"rel":283},"https://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247634387&idx=1&sn=4f92b52ce05f25ad4604a56aef9e7484&scene=21&poc_token=HD0PhGmjCo40u25LCL3l5b87eZNs-YyT16KcfPp4",[90],[285],{"type":24,"value":282},{"title":7,"searchDepth":287,"depth":287,"links":288},4,[],"markdown","content:technology-blogs:zh:2026-2-3.md","content","technology-blogs/zh/2026-2-3.md","technology-blogs/zh/2026-2-3","md",1776506119608]