[{"data":1,"prerenderedAt":633},["ShallowReactive",2],{"content-query-8TH5O9hXdM":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":627,"_id":628,"_source":629,"_file":630,"_stem":631,"_extension":632},"/news/zh/2025-12.9","zh",false,"","昇思人工智能框架峰会| 昇思MindSpore AKG大模型驱动算子生成","尝试构建一套更为通用的基于Agent的算子生成解决方案","2025-12-9","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/25/199b735845bf4106b44b2035dc97bd39.png","news",{"type":14,"children":15,"toc":615},"root",[16,24,34,42,47,62,67,78,83,88,96,104,109,117,122,127,150,155,160,165,170,175,180,187,192,200,208,218,223,236,243,248,251,259,264,271,276,281,286,290,298,303,308,321,329,334,341,346,351,355,363,368,372,380,385,413,420,425,429,437,442,447,452,486,493,498,506,514,523,528,535,540,545,549,557,562,569,576,584,592,597,602],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"昇思人工智能框架峰会-昇思mindspore-akg大模型驱动算子生成",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":17,"tag":29,"props":30,"children":31},"strong",{},[32],{"type":23,"value":33},"# 01",{"type":17,"tag":25,"props":35,"children":36},{},[37],{"type":17,"tag":29,"props":38,"children":39},{},[40],{"type":23,"value":41},"导言",{"type":17,"tag":25,"props":43,"children":44},{},[45],{"type":23,"value":46},"随着AI模型快速迭代与AI软硬件不断演进，AI产业对高质量算子的需求愈发强烈：",{"type":17,"tag":48,"props":49,"children":50},"ul",{},[51,57],{"type":17,"tag":52,"props":53,"children":54},"li",{},[55],{"type":23,"value":56},"模型侧：大模型（LLM）领域，稠密模型、MOE、MLA、多模态场景众多；稀疏、量化、KVCache压缩等技术又为算子生成带来更多复杂性与多样性。推荐类、 
CV类模型同样在持续演进，与LLM截然不同的算子场景同样存在强烈的优化落地需求；",{"type":17,"tag":52,"props":58,"children":59},{},[60],{"type":23,"value":61},"硬件侧：随着国内外AI芯片的不断升级和演进，硬件架构、特性、参数各有不同；为提高AI芯片可用性，构建性能优势，定制化算子优化方案需求强烈；",{"type":17,"tag":25,"props":63,"children":64},{},[65],{"type":23,"value":66},"与此同时，基于LLM的算子生成技术逐渐成为业界研究热点。自2025年始，随着大模型的代码生成能力日益完善，用LLM来编写代码已在各类CodeAgent、AI IDE中落地部署；在算子生成领域，各大知名高校、厂商纷纷开始投身到LLM生成算子的探索工作当中。",{"type":17,"tag":68,"props":69,"children":71},"div",{"style":70},"text-align: center;",[72],{"type":17,"tag":73,"props":74,"children":77},"img",{"src":75,"style":76,"alt":7},"/category/information/news/banner/2025-12-9-1.jpg","display: block;margin: 0 auto;max-width:60%",[],{"type":17,"tag":25,"props":79,"children":80},{},[81],{"type":23,"value":82},"KernelBench：提供了一套易用的LLM生成算子验证流程基准",{"type":17,"tag":25,"props":84,"children":85},{},[86],{"type":23,"value":87},"昇思MindSpore AKG 团队于2025年初开始对相关问题进行探索，尝试构建一套更为通用的 基于 Agent 的算子生成解决方案，全自动实现多DSL算子的生成、搜索调优等。在大幅减少人参与算子开发的同时，保证了算子生成的正确率与性能。为昇腾平台提供更优的开发体验。",{"type":17,"tag":25,"props":89,"children":90},{},[91],{"type":17,"tag":29,"props":92,"children":93},{},[94],{"type":23,"value":95},"# 02",{"type":17,"tag":25,"props":97,"children":98},{},[99],{"type":17,"tag":29,"props":100,"children":101},{},[102],{"type":23,"value":103},"算子Agent 框架简介",{"type":17,"tag":25,"props":105,"children":106},{},[107],{"type":23,"value":108},"AKG提供了一款 AI 驱动的算子代码生成工具，利用大语言模型(LLM)的代码生成能力，通过LLM规划和控制多Agent 协同，完成多后端、多类型的AI算子生成和优化。",{"type":17,"tag":25,"props":110,"children":111},{},[112],{"type":17,"tag":29,"props":113,"children":114},{},[115],{"type":23,"value":116},"算子 Agent 框架的主要构成如下：",{"type":17,"tag":25,"props":118,"children":119},{},[120],{"type":23,"value":121},"1、Agents",{"type":17,"tag":25,"props":123,"children":124},{},[125],{"type":23,"value":126},"框架运行围绕着多个Agents展开，包括但不限于：",{"type":17,"tag":48,"props":128,"children":129},{},[130,135,140,145],{"type":17,"tag":52,"props":131,"children":132},{},[133],{"type":23,"value":134},"Designer：算子设计Agent，生成 Unified 
Sketch，为Coder提供优化指导；",{"type":17,"tag":52,"props":136,"children":137},{},[138],{"type":23,"value":139},"Coder：编码Agent，支持多前/后端的编码任务；",{"type":17,"tag":52,"props":141,"children":142},{},[143],{"type":23,"value":144},"Conductor：负责调度指挥职能，分析Task运行状态，决定执行流程，提出建议；",{"type":17,"tag":52,"props":146,"children":147},{},[148],{"type":23,"value":149},"Verifier：提供集成工具集，用于编译、运行、精度比对、性能分析等。",{"type":17,"tag":25,"props":151,"children":152},{},[153],{"type":23,"value":154},"2、自定义文档系统",{"type":17,"tag":25,"props":156,"children":157},{},[158],{"type":23,"value":159},"采取文档驱动式接入方案， 通过统一的文档规范与配置声明，使新的 DSL、前端框架与后端硬件在不修改 框架 本体的前提下完成接入，降低耦合与维护成本。",{"type":17,"tag":25,"props":161,"children":162},{},[163],{"type":23,"value":164},"3、搜索框架",{"type":17,"tag":25,"props":166,"children":167},{},[168],{"type":23,"value":169},"在持续性能优化场景中，可通过调用搜索框架，通过多轮迭代与 LLM 自主分析优化方向，持续优化算子代码性能表现。",{"type":17,"tag":25,"props":171,"children":172},{},[173],{"type":23,"value":174},"4、Database with RAG",{"type":17,"tag":25,"props":176,"children":177},{},[178],{"type":23,"value":179},"Database提供高性能算子数据存储、特征提取的功能；配合多层级混合算子检索方案，为各类场景提供更准确的算子示例，提升整体生成的正确性与性能。",{"type":17,"tag":68,"props":181,"children":182},{"style":70},[183],{"type":17,"tag":73,"props":184,"children":186},{"src":185,"style":76,"alt":7},"/category/information/news/banner/2025-12-9-2.jpg",[],{"type":17,"tag":25,"props":188,"children":189},{},[190],{"type":23,"value":191},"AIKG 框架图示",{"type":17,"tag":25,"props":193,"children":194},{},[195],{"type":17,"tag":29,"props":196,"children":197},{},[198],{"type":23,"value":199},"# 03",{"type":17,"tag":25,"props":201,"children":202},{},[203],{"type":17,"tag":29,"props":204,"children":205},{},[206],{"type":23,"value":207},"关键设计",{"type":17,"tag":209,"props":210,"children":212},"h3",{"id":211},"_31-基于-unified-sketch-的代码设计与生成",[213],{"type":17,"tag":29,"props":214,"children":215},{},[216],{"type":23,"value":217},"3.1 基于 Unified Sketch 
的代码设计与生成",{"type":17,"tag":25,"props":219,"children":220},{},[221],{"type":23,"value":222},"区别于传统的“文字 --> 算子代码”，算子Agent采用先设计Sketch，再生成代码的思路。让各Agent聚焦关注自身任务问题，进而提升各Agent执行效果。",{"type":17,"tag":48,"props":224,"children":225},{},[226,231],{"type":17,"tag":52,"props":227,"children":228},{},[229],{"type":23,"value":230},"Sketch设计: 整合算子特点信息、硬件架构信息、不同算子启发式设计规则等，融合更多优化方案设计，用于提升生成算子效率和性能。",{"type":17,"tag":52,"props":232,"children":233},{},[234],{"type":23,"value":235},"代码生成：严格按照API文档、示例代码，确保生成的代码符合规范要求。",{"type":17,"tag":68,"props":237,"children":238},{"style":70},[239],{"type":17,"tag":73,"props":240,"children":242},{"src":241,"style":76,"alt":7},"/category/information/news/banner/2025-12-9-3.jpg",[],{"type":17,"tag":25,"props":244,"children":245},{},[246],{"type":23,"value":247},"Sketch图示",{"type":17,"tag":209,"props":249,"children":250},{"id":7},[],{"type":17,"tag":25,"props":252,"children":253},{},[254],{"type":17,"tag":29,"props":255,"children":256},{},[257],{"type":23,"value":258},"3.2 基于Conductor的工作流框架",{"type":17,"tag":25,"props":260,"children":261},{},[262],{"type":23,"value":263},"预设的固定流程缺乏对决策的把握，尤其是涉及不同难度的算子生成任务、不同的生成阶段，使用同一套流程会极大干扰生成效果。",{"type":17,"tag":68,"props":265,"children":266},{"style":70},[267],{"type":17,"tag":73,"props":268,"children":270},{"src":269,"style":76,"alt":7},"/category/information/news/banner/2025-12-9-4.jpg",[],{"type":17,"tag":25,"props":272,"children":273},{},[274],{"type":23,"value":275},"两种流程设计对比",{"type":17,"tag":25,"props":277,"children":278},{},[279],{"type":23,"value":280},"为解决这类问题，工作流中添加Conductor Agent作为中间调度者，存储完整的生成历史记录。通过Conductor智能调控任务流向，有针对性地分析生成过程并给出下一步建议，结合人工经验和特定的历史记录，引导后续 Agent 
正确生成，推动整个生成过程高效运转。",{"type":17,"tag":25,"props":282,"children":283},{},[284],{"type":23,"value":285},"与通用CodeAgent对比，在AIKG框架下适配算子生成场景，针对性地设置任务调度和提示词生成，为算子生成提供规范的分析和进化方案，是正确生成高效率算子的最简洁的框架流程，在繁复的生成任务中指引高效的生成路线。",{"type":17,"tag":209,"props":287,"children":289},{"id":288},"_1",[],{"type":17,"tag":25,"props":291,"children":292},{},[293],{"type":17,"tag":29,"props":294,"children":295},{},[296],{"type":23,"value":297},"3.3自定义文档系统",{"type":17,"tag":25,"props":299,"children":300},{},[301],{"type":23,"value":302},"大部分LLM算子生成方案仅针对单一前端/后端组合（Torch + Cudac、Torch + Triton-cuda）进行算子优化；优化后的成功示例很难无缝迁移到其他前后端中去。",{"type":17,"tag":25,"props":304,"children":305},{},[306],{"type":23,"value":307},"而对 AI 算子的泛化支持又会导致以下问题：",{"type":17,"tag":48,"props":309,"children":310},{},[311,316],{"type":17,"tag":52,"props":312,"children":313},{},[314],{"type":23,"value":315},"多样化组合（前端框架/DSL/硬件）的定制化接入成本过高，常需在框架内部进行定制化改造，并存在组合规模爆炸的风险；",{"type":17,"tag":52,"props":317,"children":318},{},[319],{"type":23,"value":320},"规范与示例分散，缺少统一约定，难以沉淀和复用知识。",{"type":17,"tag":25,"props":322,"children":323},{},[324],{"type":17,"tag":29,"props":325,"children":326},{},[327],{"type":23,"value":328},"解决方案：文档驱动式接入（Doc-Driven Integration）",{"type":17,"tag":25,"props":330,"children":331},{},[332],{"type":23,"value":333},"本框架将前后端的定义从框架中剥离，通过统一的文档规范与配置声明，让前端框架、DSL、AI硬件在不修改框架的前提下完成接入，降低耦合与维护成本。",{"type":17,"tag":68,"props":335,"children":336},{"style":70},[337],{"type":17,"tag":73,"props":338,"children":340},{"src":339,"style":76,"alt":7},"/category/information/news/banner/2025-12-9-5.jpg",[],{"type":17,"tag":25,"props":342,"children":343},{},[344],{"type":23,"value":345},"以统一文档规范方式接入框架",{"type":17,"tag":25,"props":347,"children":348},{},[349],{"type":23,"value":350},"文档规范被划分为：（1）基础说明文档；（2）API文档；（3）优化建议文档；（4）示例参考等；AIKG 
会针对特定文档做关键信息检索、长文本压缩等优化以合理利用各类文档。",{"type":17,"tag":209,"props":352,"children":354},{"id":353},"_2",[],{"type":17,"tag":25,"props":356,"children":357},{},[358],{"type":17,"tag":29,"props":359,"children":360},{},[361],{"type":23,"value":362},"3.4精度对比与性能采集",{"type":17,"tag":25,"props":364,"children":365},{},[366],{"type":23,"value":367},"本框架提供 Torch/MindSpore/Numpy 等多个前端的精度、端到端性能验证、报错日志收集功能，新的 DSL 可快速接入 AIKG 框架以完成验证与反馈。对于特殊的性能调优功能（例如 Triton 的 autotune 等），AIKG 也会提供相应的能力支持。",{"type":17,"tag":209,"props":369,"children":371},{"id":370},"_3",[],{"type":17,"tag":25,"props":373,"children":374},{},[375],{"type":17,"tag":29,"props":376,"children":377},{},[378],{"type":23,"value":379},"3.5搜索框架",{"type":17,"tag":25,"props":381,"children":382},{},[383],{"type":23,"value":384},"在基础算子生成流程之上，AIKG搜索模块支持在多任务场景下持续迭代优化算子性能。通过AIKG搜索框架方案，结合生成流程、数据库数据、优化要点，迭代生成高性能算子实现。搜索框架基础设计如下：",{"type":17,"tag":48,"props":386,"children":387},{},[388,393,398,403,408],{"type":17,"tag":52,"props":389,"children":390},{},[391],{"type":23,"value":392},"通过Sketch抽取算法框架和优化重点，兼容不同DSL；",{"type":17,"tag":52,"props":394,"children":395},{},[396],{"type":23,"value":397},"检查待优化列表，让大模型重点关注优化侧重点；",{"type":17,"tag":52,"props":399,"children":400},{},[401],{"type":23,"value":402},"通过重新抽取Sketch的方式，将算法逻辑和性能数据存储至 Database，后续生成时依据数据库的历史记录提供更合理的优化方案；",{"type":17,"tag":52,"props":404,"children":405},{},[406],{"type":23,"value":407},"从本地 Database 中抽取生成历史，作为下一轮生成的启发信息，检查 checklist 
中未实现的部分，建议下一轮着重实现；",{"type":17,"tag":52,"props":409,"children":410},{},[411],{"type":23,"value":412},"由基础搜索框架扩展，AIKG进一步提供了Evolve类并行多轮搜索功能。",{"type":17,"tag":68,"props":414,"children":415},{"style":70},[416],{"type":17,"tag":73,"props":417,"children":419},{"src":418,"style":76,"alt":7},"/category/information/news/banner/2025-12-9-6.jpg",[],{"type":17,"tag":25,"props":421,"children":422},{},[423],{"type":23,"value":424},"多轮搜索流程图",{"type":17,"tag":209,"props":426,"children":428},{"id":427},"_4",[],{"type":17,"tag":25,"props":430,"children":431},{},[432],{"type":17,"tag":29,"props":433,"children":434},{},[435],{"type":23,"value":436},"3.6多层级混合算子检索方案",{"type":17,"tag":25,"props":438,"children":439},{},[440],{"type":23,"value":441},"随着算子数据的不断增加，算子数据库中的数据条目会越来越多，如何高效、高质量地利用这些已有信息是进一步提升AIKG效率、性能表现的关键。基于这个认识，AIKG集成了相对应的RAG能力以及解决方案。",{"type":17,"tag":25,"props":443,"children":444},{},[445],{"type":23,"value":446},"而在Kernel代码相似性检索这一任务中，我们识别到Kernel代码具有特殊性：不同Kernel存在大量的相似代码行，简单的Shape、Tiling变化会导致实际含义差别巨大，而这些信息很难被传统的相似性检索方案识别出来。",{"type":17,"tag":25,"props":448,"children":449},{},[450],{"type":23,"value":451},"为此我们提出一种多层级混合算子检索方案，通过这种更贴近算子逻辑的分析+检索，保证检索结果的大部分内容与当前任务相关。基本流程如下：",{"type":17,"tag":48,"props":453,"children":454},{},[455,471],{"type":17,"tag":52,"props":456,"children":457},{},[458,460,464,466,469],{"type":23,"value":459},"特征提取",{"type":17,"tag":461,"props":462,"children":463},"br",{},[],{"type":23,"value":465},"1）初始任务特征：算子的类型，计算逻辑，shape大小等；",{"type":17,"tag":461,"props":467,"children":468},{},[],{"type":23,"value":470},"2）算子代码特征：tiling, reordering, vectorization, layout, 
pipeline等；",{"type":17,"tag":52,"props":472,"children":473},{},[474,476,479,481,484],{"type":23,"value":475},"分层混合检索",{"type":17,"tag":461,"props":477,"children":478},{},[],{"type":23,"value":480},"1）基于后端架构、DSL、算子大类等基础要求在Database内进行粗筛；",{"type":17,"tag":461,"props":482,"children":483},{},[],{"type":23,"value":485},"2）将LLM生成的特征转换成Embedding，建立向量索引并逐层筛选。",{"type":17,"tag":68,"props":487,"children":488},{"style":70},[489],{"type":17,"tag":73,"props":490,"children":492},{"src":491,"style":76,"alt":7},"/category/information/news/banner/2025-12-9-7.jpg",[],{"type":17,"tag":25,"props":494,"children":495},{},[496],{"type":23,"value":497},"AIKG RAG 图示",{"type":17,"tag":25,"props":499,"children":500},{},[501],{"type":17,"tag":29,"props":502,"children":503},{},[504],{"type":23,"value":505},"# 04",{"type":17,"tag":25,"props":507,"children":508},{},[509],{"type":17,"tag":29,"props":510,"children":511},{},[512],{"type":23,"value":513},"当前效果展示",{"type":17,"tag":209,"props":515,"children":517},{"id":516},"_41-aikg支持dsl情况",[518],{"type":17,"tag":29,"props":519,"children":520},{},[521],{"type":23,"value":522},"4.1 AIKG支持DSL情况",{"type":17,"tag":25,"props":524,"children":525},{},[526],{"type":23,"value":527},"当前AIKG通过文档驱动式接入，已快速打通部分主流 DSL 的流程，部分结果如下：",{"type":17,"tag":68,"props":529,"children":530},{"style":70},[531],{"type":17,"tag":73,"props":532,"children":534},{"src":533,"style":76,"alt":7},"/category/information/news/banner/2025-12-9-8.jpg",[],{"type":17,"tag":25,"props":536,"children":537},{},[538],{"type":23,"value":539},"实验环境：ASCEND：Ascend Atlas 800 A2；CPP：ARM64。",{"type":17,"tag":25,"props":541,"children":542},{},[543],{"type":23,"value":544},"注：Triton-ascend当前对 img2col API 缺少支持，30+个卷积算子生成存在难度。除此之外大部分场景都能支持。",{"type":17,"tag":209,"props":546,"children":548},{"id":547},"_5",[],{"type":17,"tag":25,"props":550,"children":551},{},[552],{"type":17,"tag":29,"props":553,"children":554},{},[555],{"type":23,"value":556},"4.2 Triton-ascend 
性能验证",{"type":17,"tag":25,"props":558,"children":559},{},[560],{"type":23,"value":561},"Agent 在各类型算子中完成基于多轮搜索的Triton KernelBench Level 1性能优化，验证了AIKG 在性能调优方面的能力。同时在Inductor-Triton场景中，通过AIKG深度优化，普遍可达成更优的算子性能。",{"type":17,"tag":68,"props":563,"children":564},{"style":70},[565],{"type":17,"tag":73,"props":566,"children":568},{"src":567,"style":76,"alt":7},"/category/information/news/banner/2025-12-9-9.jpg",[],{"type":17,"tag":68,"props":570,"children":571},{"style":70},[572],{"type":17,"tag":73,"props":573,"children":575},{"src":574,"style":76,"alt":7},"/category/information/news/banner/2025-12-9-10.jpg",[],{"type":17,"tag":25,"props":577,"children":578},{},[579],{"type":17,"tag":29,"props":580,"children":581},{},[582],{"type":23,"value":583},"# 05",{"type":17,"tag":25,"props":585,"children":586},{},[587],{"type":17,"tag":29,"props":588,"children":589},{},[590],{"type":23,"value":591},"结语",{"type":17,"tag":25,"props":593,"children":594},{},[595],{"type":23,"value":596},"本项目是昇思MindSpore AKG团队在Agent时代的一次尝试，如何将AI编译、自动生成、算子优化等技术结合Agent产生更大的价值，是一件令人激动的事情；长久以来，非CUDA系的软硬件一直受困于编译、算子等底层能力生态支持度缺乏，易用性一直是各大厂商的老大难问题；而Agent+的路线目前看来有望缓解这一困境。",{"type":17,"tag":25,"props":598,"children":599},{},[600],{"type":23,"value":601},"AIKG会持续在这个方向上快速迭代演进，希望在AI民主化、算子民主化的过程中做出积极的贡献。",{"type":17,"tag":25,"props":603,"children":604},{},[605,607],{"type":23,"value":606},"当前 AIKG 已作为 AKG 
项目子仓于2025年6月开源（br_aikg分支），欢迎大家使用、评论及参与共建。代码地址：",{"type":17,"tag":608,"props":609,"children":613},"a",{"href":610,"rel":611},"https://gitee.com/mindspore/akg/tree/br_aikg/",[612],"nofollow",[614],{"type":23,"value":610},{"title":7,"searchDepth":616,"depth":616,"links":617},4,[618,620,621,622,623,624,625,626],{"id":211,"depth":619,"text":217},3,{"id":7,"depth":619,"text":7},{"id":288,"depth":619,"text":7},{"id":353,"depth":619,"text":7},{"id":370,"depth":619,"text":7},{"id":427,"depth":619,"text":7},{"id":516,"depth":619,"text":522},{"id":547,"depth":619,"text":7},"markdown","content:news:zh:2025-12.9.md","content","news/zh/2025-12.9.md","news/zh/2025-12.9","md",1776506060678]