[{"data":1,"prerenderedAt":528},["ShallowReactive",2],{"content-query-UWy3ZZkXYQ":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":522,"_id":523,"_source":524,"_file":525,"_stem":526,"_extension":527},"/technology-blogs/zh/2026-4-21","zh",false,"","MindSpore绑核能力持续优化：让关键线程更稳，让大模型训练更快","围绕 Host Bound 瓶颈，MindSpore 持续推进绑核能力演进","2026-4-21","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/28/8e0e0150508a4c5ba4287fa3bec8ea3f.png","technology-blogs","技术解读",{"type":15,"children":16,"toc":514},"root",[17,25,31,36,41,66,71,82,89,94,99,104,109,119,124,129,134,139,144,149,172,177,182,187,192,197,202,207,212,217,222,229,234,239,244,249,254,259,264,269,287,292,298,303,308,315,320,338,343,348,356,361,366,374,379,386,391,397,402,407,412,417,422,427,432,437,442,447,452,458,463,468,481,493,498,509],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore绑核能力持续优化让关键线程更稳让大模型训练更快",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"在大模型训练与推理持续演进的今天，性能优化的重点早已不只在 Device 侧。随着模型规模不断增大、执行链路愈发复杂，Host 侧逐渐成为影响整机效率的关键环节。",{"type":18,"tag":26,"props":32,"children":33},{},[34],{"type":24,"value":35},"围绕 Host Bound 瓶颈，MindSpore 持续推进绑核能力演进：分布式进程级绑核 → 线程级精细化亲和 → 进程与线程协同隔离 → NUMA 级绑定 → 统一 JSON 配置；",{"type":18,"tag":26,"props":37,"children":38},{},[39],{"type":24,"value":40},"且这套能力已在客户现场得到验证：",{"type":18,"tag":42,"props":43,"children":44},"ul",{},[45,51,56,61],{"type":18,"tag":46,"props":47,"children":48},"li",{},[49],{"type":24,"value":50},"1024 卡线性度达到 99%",{"type":18,"tag":46,"props":52,"children":53},{},[54],{"type":24,"value":55},"多组大规模配置线性度超过 97%",{"type":18,"tag":46,"props":57,"children":58},{},[59],{"type":24,"value":60},"典型场景性能提升 10% ~ 18%",{"type":18,"tag":46,"props":62,"children":63},{},[64],{"type":24,"value":65},"真实生产模型 4K 规模性能提升超过 
20%",{"type":18,"tag":26,"props":67,"children":68},{},[69],{"type":24,"value":70},"这意味着，MindSpore 绑核能力带来的不只是“更快”，更是“更稳”。",{"type":18,"tag":72,"props":73,"children":75},"div",{"style":74},"text-align: center;",[76],{"type":18,"tag":77,"props":78,"children":81},"img",{"src":79,"style":80,"alt":7},"/category/information/technology-blogs/banner/2026-4-21/1.jpg","display: block;margin: 0 auto;max-width:60%",[],{"type":18,"tag":83,"props":84,"children":86},"h2",{"id":85},"_01-演进主线从进程级到-numa-级的五个阶段",[87],{"type":24,"value":88},"01 演进主线：从进程级到 NUMA 级的五个阶段",{"type":18,"tag":26,"props":90,"children":91},{},[92],{"type":24,"value":93},"阶段1：分布式进程级绑核 —— 划定 CPU 资源边界",{"type":18,"tag":26,"props":95,"children":96},{},[97],{"type":24,"value":98},"MindSpore 绑核能力的起点，是面向分布式场景的进程级绑核。",{"type":18,"tag":26,"props":100,"children":101},{},[102],{"type":24,"value":103},"在多卡分布式训练中，多个训练进程会同时拉起。如果完全依赖操作系统自由调度，不同进程很容易落在重叠的 CPU 核心上运行，进而带来资源争抢、调度抖动和性能波动。",{"type":18,"tag":26,"props":105,"children":106},{},[107],{"type":24,"value":108},"为此，MindSpore 首先提供了基于 msrun 的进程级绑核能力：",{"type":18,"tag":110,"props":111,"children":113},"pre",{"code":112},"msrun --bind_core=True\n",[114],{"type":18,"tag":115,"props":116,"children":117},"code",{"__ignoreMap":7},[118],{"type":24,"value":112},{"type":18,"tag":26,"props":120,"children":121},{},[122],{"type":24,"value":123},"这项能力的关键，不只是“支持绑核”，更在于它发生在分布式任务启动的最早阶段。",{"type":18,"tag":26,"props":125,"children":126},{},[127],{"type":24,"value":128},"通常来说，绑核越早做越好。越早完成 CPU 亲和性设置，后续线程创建、调度继承和资源分布就越稳定。msrun 正处在分布式进程拉起的起点，在这一阶段完成绑核，相当于在任务启动之初就先划定了 CPU 资源边界。",{"type":18,"tag":26,"props":130,"children":131},{},[132],{"type":24,"value":133},"这一步解决的是最基础的问题：先让每个分布式进程拥有相对稳定的 CPU 运行空间。",{"type":18,"tag":26,"props":135,"children":136},{},[137],{"type":24,"value":138},"阶段2：线程级绑核 —— 从“给进程分 CPU”到“给关键线程保 CPU”",{"type":18,"tag":26,"props":140,"children":141},{},[142],{"type":24,"value":143},"进程级绑核不够细，真正决定 Host 侧关键路径效率的是少数核心工作线程。它们直接影响编译、下发、数据处理等环节，能否持续稳定地向 Device 
供给工作负载。若这些线程频繁迁移或与普通线程混跑，即使做了进程级绑核，系统仍会抖动。",{"type":18,"tag":26,"props":145,"children":146},{},[147],{"type":24,"value":148},"因此，MindSpore 进一步演进出线程级绑核能力，通过 mindspore.runtime.set_cpu_affinity 对关键线程进行细粒度的CPU绑定；\n覆盖的线程包括：",{"type":18,"tag":42,"props":150,"children":151},{},[152,157,162,167],{"type":18,"tag":46,"props":153,"children":154},{},[155],{"type":24,"value":156},"main 主线程：编译和流程控制",{"type":18,"tag":46,"props":158,"children":159},{},[160],{"type":24,"value":161},"runtime 线程：静态图下发",{"type":18,"tag":46,"props":163,"children":164},{},[165],{"type":24,"value":166},"pynative 线程：动态图算子下发",{"type":18,"tag":46,"props":168,"children":169},{},[170],{"type":24,"value":171},"minddata 线程：数据处理",{"type":18,"tag":26,"props":173,"children":174},{},[175],{"type":24,"value":176},"这一步的本质，是从“给进程分 CPU”，走向“给关键线程保 CPU”。",{"type":18,"tag":26,"props":178,"children":179},{},[180],{"type":24,"value":181},"不是所有线程都需要最稳定的 CPU 环境，但关键线程一定需要。只有让这些线程尽可能少迁移、少受干扰，Host 侧关键路径才能真正稳下来。",{"type":18,"tag":26,"props":183,"children":184},{},[185],{"type":24,"value":186},"阶段3：从绑定走向隔离 —— 关键线程与普通线程分区运行",{"type":18,"tag":26,"props":188,"children":189},{},[190],{"type":24,"value":191},"在线程级绑核基础上，MindSpore 进一步把资源管理思路从“绑定”推进到“隔离”。仅保证关键线程不迁移还不够，还需避免它们被其他线程抢占 CPU。",{"type":18,"tag":26,"props":193,"children":194},{},[195],{"type":24,"value":196},"为此，MindSpore 进一步演进出进程级与线程级协同绑核能力，核心思路是：",{"type":18,"tag":26,"props":198,"children":199},{},[200],{"type":24,"value":201},"先通过进程级绑核，为分布式进程划定 CPU 范围",{"type":18,"tag":26,"props":203,"children":204},{},[205],{"type":24,"value":206},"再在进程内部识别关键工作线程",{"type":18,"tag":26,"props":208,"children":209},{},[210],{"type":24,"value":211},"将runtime 等重要线程从主线程的混合执行环境中分离出来",{"type":18,"tag":26,"props":213,"children":214},{},[215],{"type":24,"value":216},"为这些关键线程分配更独立、更干净的 CPU 核心",{"type":18,"tag":26,"props":218,"children":219},{},[220],{"type":24,"value":221},"这背后遵循的是一个非常明确的原则：关键路径上的线程，应尽可能运行在低干扰核心上。关键线程运行在更独立的核心上时，Host 侧关键路径会更稳定，Device 侧因等待 Host 
而产生的空闲也随之减少。",{"type":18,"tag":72,"props":223,"children":224},{"style":74},[225],{"type":18,"tag":77,"props":226,"children":228},{"src":227,"style":80,"alt":7},"/category/information/technology-blogs/banner/2026-4-21/2.jpg",[],{"type":18,"tag":26,"props":230,"children":231},{},[232],{"type":24,"value":233},"阶段4：NUMA 级绑定 —— CPU 与内存协同优化",{"type":18,"tag":26,"props":235,"children":236},{},[237],{"type":24,"value":238},"在多路 CPU、NUMA 架构下，CPU 访问本地内存与远端内存成本不同。即使线程绑在合适的CPU核心，若频繁访问另一个 NUMA 节点上的内存，仍会有额外访存时延和带宽损耗，尤其在 HyperOffload 等场景中代价更大。",{"type":18,"tag":26,"props":240,"children":241},{},[242],{"type":24,"value":243},"因此，MindSpore 的绑核能力继续扩展到了 NUMA 级别。它关注点不再只是“线程跑在哪个核上”，还包括“相关内存操作是否尽量在本地 NUMA 节点完成”。",{"type":18,"tag":26,"props":245,"children":246},{},[247],{"type":24,"value":248},"这一步的意义，在于把 CPU 亲和优化扩展为 CPU 与内存协同优化，让MindSpore 在复杂服务器拓扑下，更系统地减少远端访存带来的额外损耗。",{"type":18,"tag":26,"props":250,"children":251},{},[252],{"type":24,"value":253},"阶段5：统一 JSON 配置 —— 让复杂能力可落地",{"type":18,"tag":26,"props":255,"children":256},{},[257],{"type":24,"value":258},"随着绑核能力从进程级演进到线程级、再到NUMA级，优化维度增加，配置复杂度也随之上升。若能力分散在命令行、接口和零散配置中，将难以被大规模使用。",{"type":18,"tag":26,"props":260,"children":261},{},[262],{"type":24,"value":263},"因此，MindSpore 在能力增强的同时，也同步推进了易用性升级，通过 JSON 文件统一配置 CPU/NUMA 亲和关系，将原本相对分散的能力组织为更清晰、更可复用的工程化方案。",{"type":18,"tag":26,"props":265,"children":266},{},[267],{"type":24,"value":268},"这种方式带来的价值非常直接：",{"type":18,"tag":42,"props":270,"children":271},{},[272,277,282],{"type":18,"tag":46,"props":273,"children":274},{},[275],{"type":24,"value":276},"配置入口统一，降低理解成本",{"type":18,"tag":46,"props":278,"children":279},{},[280],{"type":24,"value":281},"表达能力更强，适合描述复杂线程与资源映射关系",{"type":18,"tag":46,"props":283,"children":284},{},[285],{"type":24,"value":286},"便于纳入部署流程、环境复现和版本管理",{"type":18,"tag":26,"props":288,"children":289},{},[290],{"type":24,"value":291},"MindSpore 
在这一步解决的，是从“有没有高级绑核能力”到“能不能让用户顺畅、稳定地用到实际业务里”。",{"type":18,"tag":83,"props":293,"children":295},{"id":294},"_02-效果验证客户现场的真实数据",[296],{"type":24,"value":297},"02 效果验证：客户现场的真实数据",{"type":18,"tag":26,"props":299,"children":300},{},[301],{"type":24,"value":302},"绑核能力的价值，从来不只体现在某一个局部指标上，而是体现在大规模训练任务中的整体性能释放与稳定性提升。",{"type":18,"tag":26,"props":304,"children":305},{},[306],{"type":24,"value":307},"场景1：万亿级 MoE 模型（客户现场 A）",{"type":18,"tag":72,"props":309,"children":310},{"style":74},[311],{"type":18,"tag":77,"props":312,"children":314},{"src":313,"style":80,"alt":7},"/category/information/technology-blogs/banner/2026-4-21/3.jpg",[],{"type":18,"tag":26,"props":316,"children":317},{},[318],{"type":24,"value":319},"场景2：多组模型验证",{"type":18,"tag":42,"props":321,"children":322},{},[323,328,333],{"type":18,"tag":46,"props":324,"children":325},{},[326],{"type":24,"value":327},"256 卡 → 1024 卡，多组配置线性度均超过 97%",{"type":18,"tag":46,"props":329,"children":330},{},[331],{"type":24,"value":332},"整体性能进一步提升 10% ~ 15%",{"type":18,"tag":46,"props":334,"children":335},{},[336],{"type":24,"value":337},"性能抖动控制在 200ms 以内",{"type":18,"tag":26,"props":339,"children":340},{},[341],{"type":24,"value":342},"对于大规模训练任务而言，这不仅意味着更高吞吐，也意味着更稳定的运行表现。",{"type":18,"tag":26,"props":344,"children":345},{},[346],{"type":24,"value":347},"场景3：真实生产模型 4K 规模",{"type":18,"tag":42,"props":349,"children":350},{},[351],{"type":18,"tag":46,"props":352,"children":353},{},[354],{"type":24,"value":355},"性能提升 超过 20%",{"type":18,"tag":26,"props":357,"children":358},{},[359],{"type":24,"value":360},"进一步体现出该能力在 Host Bound 明显场景下的实际价值。",{"type":18,"tag":26,"props":362,"children":363},{},[364],{"type":24,"value":365},"场景4：另一客户现场 2K 规模",{"type":18,"tag":42,"props":367,"children":368},{},[369],{"type":18,"tag":46,"props":370,"children":371},{},[372],{"type":24,"value":373},"实施绑核后，性能收益 10% ~ 15%",{"type":18,"tag":26,"props":375,"children":376},{},[377],{"type":24,"value":378},"这也说明，Host Bound 
问题在不同规模集群中具有一定普遍性，而绑核正是缓解这一问题的有效手段之一。",{"type":18,"tag":72,"props":380,"children":381},{"style":74},[382],{"type":18,"tag":77,"props":383,"children":385},{"src":384,"style":80,"alt":7},"/category/information/technology-blogs/banner/2026-4-21/4.jpg",[],{"type":18,"tag":26,"props":387,"children":388},{},[389],{"type":24,"value":390},"整体来看，MindSpore 绑核能力已经不只是“让任务跑得更快”，而是在大规模训推场景下，帮助用户获得更高的扩展效率、更稳定的 Host 侧供给能力，以及更可控的整机性能表现。",{"type":18,"tag":83,"props":392,"children":394},{"id":393},"_03-结语",[395],{"type":24,"value":396},"03 结语",{"type":18,"tag":26,"props":398,"children":399},{},[400],{"type":24,"value":401},"演进主线回顾",{"type":18,"tag":26,"props":403,"children":404},{},[405],{"type":24,"value":406},"1、msrun --bind_core=True：在分布式进程拉起阶段尽早建立稳定 CPU 环境；",{"type":18,"tag":26,"props":408,"children":409},{},[410],{"type":24,"value":411},"2、mindspore.runtime.set_cpu_affinity：为 Host 侧关键线程提供细粒度亲和控制；",{"type":18,"tag":26,"props":413,"children":414},{},[415],{"type":24,"value":416},"3、进程级与线程级协同隔离：让关键线程运行在低干扰核心上；",{"type":18,"tag":26,"props":418,"children":419},{},[420],{"type":24,"value":421},"4、NUMA 级绑定：保证 CPU 与内存访问的本地性；",{"type":18,"tag":26,"props":423,"children":424},{},[425],{"type":24,"value":426},"5、JSON 统一配置：把复杂优化沉淀为可工程化、可复用的能力。",{"type":18,"tag":26,"props":428,"children":429},{},[430],{"type":24,"value":431},"这不再是一堆零散功能的堆砌，而是一套面向 Host Bound 问题的系统化运行时优化能力。在 MindSpore 2.8 版本中，已支持用户灵活运用进程绑核、线程绑核、NUMA 本地内存等多层次亲和性设置。",{"type":18,"tag":26,"props":433,"children":434},{},[435],{"type":24,"value":436},"自 2.9 版本起，将进一步支持通过 JSON 文件统一配置 CPU/NUMA 亲和关系，从分散配置迈向统一、可复用的工程化方案，让性能调优更系统、更便捷。欢迎共同期待即将到来的 2.9 版本，体验更高效的亲和性配置与更优的运行时性能。",{"type":18,"tag":26,"props":438,"children":439},{},[440],{"type":24,"value":441},"核心价值",{"type":18,"tag":26,"props":443,"children":444},{},[445],{"type":24,"value":446},"对于框架：形成面向 Host Bound 问题的系统化运行时优化能力；",{"type":18,"tag":26,"props":448,"children":449},{},[450],{"type":24,"value":451},"对于用户：更稳定的 Host 侧、更少的 Device 
空闲等待、更可控的整机性能释放。",{"type":18,"tag":83,"props":453,"children":455},{"id":454},"_04-互动与参考资料",[456],{"type":24,"value":457},"04 互动与参考资料",{"type":18,"tag":26,"props":459,"children":460},{},[461],{"type":24,"value":462},"如果你也被 Host 侧性能卡顿、CPU 调度混乱折腾过，不妨试试 MindSpore 的绑核能力~",{"type":18,"tag":26,"props":464,"children":465},{},[466],{"type":24,"value":467},"想优化？跟着下面几步，几分钟就能配起来——",{"type":18,"tag":26,"props":469,"children":470},{},[471,473],{"type":24,"value":472},"1️⃣ 查看 mindspore.runtime.set_cpu_affinity 接口说明，了解线程级绑核用法：\n",{"type":18,"tag":474,"props":475,"children":479},"a",{"href":476,"rel":477},"https://www.mindspore.cn/docs/zh-CN/master/api_python/runtime/mindspore.runtime.set_cpu_affinity.html",[478],"nofollow",[480],{"type":24,"value":476},{"type":18,"tag":26,"props":482,"children":483},{},[484,486],{"type":24,"value":485},"2️⃣ 参考 JSON 统一配置指南，一键搞定 CPU/NUMA 亲和设置：\n",{"type":18,"tag":474,"props":487,"children":490},{"href":488,"rel":489},"https://www.mindspore.cn/tutorials/zh-CN/master/parallel/msrun_launcher.html#%E4%BD%BF%E7%94%A8-json-%E7%BB%9F%E4%B8%80%E9%85%8D%E7%BD%AE-cpunuma-%E4%BA%B2%E5%92%8C--bind-numa-mindsporeruntimeset-cpu-affinity",[478],[491],{"type":24,"value":492},"https://www.mindspore.cn/tutorials/zh-CN/master/parallel/msrun_launcher.html#使用-json-统一配置-cpunuma-亲和--bind-numa-mindsporeruntimeset-cpu-affinity",{"type":18,"tag":26,"props":494,"children":495},{},[496],{"type":24,"value":497},"🌐 欢迎加入昇思社区 
:",{"type":18,"tag":26,"props":499,"children":500},{},[501,507],{"type":18,"tag":474,"props":502,"children":505},{"href":503,"rel":504},"https://discuss.mindspore.cn/",[478],[506],{"type":24,"value":503},{"type":24,"value":508},"\n与5.2万+开发者一起，让AI框架越来越好用。",{"type":18,"tag":26,"props":510,"children":511},{},[512],{"type":24,"value":513},"也欢迎来评论区聊聊你的经验或困惑~",{"title":7,"searchDepth":515,"depth":515,"links":516},4,[517,519,520,521],{"id":85,"depth":518,"text":88},2,{"id":294,"depth":518,"text":297},{"id":393,"depth":518,"text":396},{"id":454,"depth":518,"text":457},"markdown","content:technology-blogs:zh:2026-4-21.md","content","technology-blogs/zh/2026-4-21.md","technology-blogs/zh/2026-4-21","md",1777066121693]