[{"data":1,"prerenderedAt":329},["ShallowReactive",2],{"content-query-d69eHjVN5I":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":323,"_id":324,"_source":325,"_file":326,"_stem":327,"_extension":328},"/technology-blogs/zh/2026-5-13","zh",false,"","语音识别模型部署太难？昇思 MindSpore Lite 轻松搞定！","基于开源FireRedASR工业级语音识别模型，详述其在昇腾Atlas 800I A2单卡上实现时延最高降低55%的显著优化效果，识别精度在主流测试集与原始模型持平。","2026-5-13","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/28/8e0e0150508a4c5ba4287fa3bec8ea3f.png","technology-blogs","技术解读",{"type":15,"children":16,"toc":301},"root",[17,25,31,36,43,50,55,60,66,71,82,88,94,99,105,110,117,122,127,132,137,143,150,156,163,168,175,181,186,193,199,204,214,220,225,230,237,243,248,253,259,264,269,274,281,286,291,296],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"语音识别模型部署太难昇思-mindspore-lite-轻松搞定",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"语音识别作为人机交互的核心技术之一，其识别精度与响应时延直接决定了用户体验。随着智能汽车座舱、智能家居等场景对实时语音交互的需求不断提升，如何将高性能ASR模型高效部署到国产化平台上，已成为业界关注的重要课题。",{"type":18,"tag":26,"props":32,"children":33},{},[34],{"type":24,"value":35},"本文基于开源FireRedASR工业级语音识别模型，详述其在昇腾Atlas 800I A2单卡上实现时延最高降低55%的显著优化效果，识别精度在主流测试集与原始模型持平。",{"type":18,"tag":37,"props":38,"children":40},"h2",{"id":39},"_01-项目背景",[41],{"type":24,"value":42},"01 项目背景",{"type":18,"tag":44,"props":45,"children":47},"h3",{"id":46},"_11-fireredasr模型简介",[48],{"type":24,"value":49},"1.1 FireRedASR模型简介",{"type":18,"tag":26,"props":51,"children":52},{},[53],{"type":24,"value":54},"FireRedASR 是一系列开源的工业级自动语音识别（ASR）模型，支持普通话、多种中国方言及英语。该模型在公开普通话语音识别基准测试中刷新了业界最高水平（SOTA），并具备优异的流式识别能力。在架构设计上，FireRedASR 采用基于注意力机制的编码器–解码器（AED）框架，由 Conformer Encoder 与 Transformer Decoder 两部分组成。",{"type":18,"tag":26,"props":56,"children":57},{},[58],{"type":24,"value":59},"• Encoder：类似于LLM的Prefill阶段，根据整段音频特征产生Cross Attention用于后续解码\n• 
Decoder：类似于LLM的Decode阶段，通过多次Multi Head Attention计算，推理出预测Token分数",{"type":18,"tag":44,"props":61,"children":63},{"id":62},"_12-迁移挑战",[64],{"type":24,"value":65},"1.2 迁移挑战",{"type":18,"tag":26,"props":67,"children":68},{},[69],{"type":24,"value":70},"将FireRedASR模型迁移至昇腾平台面临多重挑战：",{"type":18,"tag":72,"props":73,"children":75},"div",{"style":74},"text-align: center;",[76],{"type":18,"tag":77,"props":78,"children":81},"img",{"src":79,"style":80,"alt":7},"/category/information/technology-blogs/banner/2026-5-13/1.jpg","display: block;margin: 0 auto;max-width:60%",[],{"type":18,"tag":37,"props":83,"children":85},{"id":84},"_02-技术方案",[86],{"type":24,"value":87},"02 技术方案",{"type":18,"tag":44,"props":89,"children":91},{"id":90},"_21-为什么选择mindspore-lite",[92],{"type":24,"value":93},"2.1 为什么选择MindSpore Lite",{"type":18,"tag":26,"props":95,"children":96},{},[97],{"type":24,"value":98},"MindSpore Lite是昇思面向推理场景推出的轻量化推理框架，具备以下核心优势：\n• 多框架兼容：兼容MindSpore训练框架导出的模型结构，以及ONNX、TFLite、Pb等多种格式模型\n• 极致性能：通过整图下沉方式有效降低算子下发时延，针对昇腾硬件深度优化\n• 语音专项优化：针对语音类算法模型支持IO数据免拷贝等关键特性，减少数据传输开销\n• 自定义算子支持：支持用户自定义算子接入，满足差异化业务需求",{"type":18,"tag":44,"props":100,"children":102},{"id":101},"_22-模型迁移路径",[103],{"type":24,"value":104},"2.2 模型迁移路径",{"type":18,"tag":26,"props":106,"children":107},{},[108],{"type":24,"value":109},"原有基于TensorRT的推理架构迁移至MindSpore Lite，只需将模型转换环节改为MindIR格式，业务流程架构无需大规模调整：",{"type":18,"tag":72,"props":111,"children":112},{"style":74},[113],{"type":18,"tag":77,"props":114,"children":116},{"src":115,"style":80,"alt":7},"/category/information/technology-blogs/banner/2026-5-13/2.jpg",[],{"type":18,"tag":26,"props":118,"children":119},{},[120],{"type":24,"value":121},"关键优势：",{"type":18,"tag":26,"props":123,"children":124},{},[125],{"type":24,"value":126},"• 一键转换：MindSpore Lite提供converter_lite工具，支持自动化模型转换",{"type":18,"tag":26,"props":128,"children":129},{},[130],{"type":24,"value":131},"• 
动态分档：通过配置支持动态shape，适配不同长度音频输入",{"type":18,"tag":26,"props":133,"children":134},{},[135],{"type":24,"value":136},"• AOE优化：启用昇腾专属优化（ascend_oriented），进一步提升推理性能",{"type":18,"tag":44,"props":138,"children":140},{"id":139},"_23-关键api",[141],{"type":24,"value":142},"2.3 关键API",{"type":18,"tag":72,"props":144,"children":145},{"style":74},[146],{"type":18,"tag":77,"props":147,"children":149},{"src":148,"style":80,"alt":7},"/category/information/technology-blogs/banner/2026-5-13/3.jpg",[],{"type":18,"tag":44,"props":151,"children":153},{"id":152},"_24-性能优化",[154],{"type":24,"value":155},"2.4 性能优化",{"type":18,"tag":157,"props":158,"children":160},"h4",{"id":159},"_241-融合算子",[161],{"type":24,"value":162},"2.4.1 融合算子",{"type":18,"tag":26,"props":164,"children":165},{},[166],{"type":24,"value":167},"针对FireRedASR这类基于注意力机制的模型，MindSpore Lite提供了FlashAttention、LayerNorm等算子的融合能力，将大量小算子融合成一个大算子，降低算子频繁调度带来的时延，并提升算子的计算性能，从而提升模型推理性能。",{"type":18,"tag":72,"props":169,"children":170},{"style":74},[171],{"type":18,"tag":77,"props":172,"children":174},{"src":173,"style":80,"alt":7},"/category/information/technology-blogs/banner/2026-5-13/4.jpg",[],{"type":18,"tag":44,"props":176,"children":178},{"id":177},"_242-h2dd2h免拷贝",[179],{"type":24,"value":180},"2.4.2 H2D/D2H免拷贝",{"type":18,"tag":26,"props":182,"children":183},{},[184],{"type":24,"value":185},"为提升业务吞吐量，我们将推理 Batch Size 由 1 提升至 256。然而，Batch 的增大导致 Host 与 Device 之间的数据拷贝量显著增加。为进一步优化性能，MindSpore Lite 引入了Device 侧内存预申请机制及后处理入图优化。这两项优化有效减少了数据拷贝开销，成功实现了推理性能的显著提升。",{"type":18,"tag":72,"props":187,"children":188},{"style":74},[189],{"type":18,"tag":77,"props":190,"children":192},{"src":191,"style":80,"alt":7},"/category/information/technology-blogs/banner/2026-5-13/5.jpg",[],{"type":18,"tag":44,"props":194,"children":196},{"id":195},"_243-aoe自动调优",[197],{"type":24,"value":198},"2.4.3 AOE自动调优",{"type":18,"tag":26,"props":200,"children":201},{},[202],{"type":24,"value":203},"MindSpore Lite集成AOE（Ascend Optimization 
Engine）后端自动调优工具，通过生成调优策略、编译和运行环境验证的闭环反馈机制，不断迭代出更优的调优策略。\n关键配置：",{"type":18,"tag":205,"props":206,"children":208},"pre",{"code":207},"aoe_mode=\"subgraph tuning, operator tuning\"\n",[209],{"type":18,"tag":210,"props":211,"children":212},"code",{"__ignoreMap":7},[213],{"type":24,"value":207},{"type":18,"tag":37,"props":215,"children":217},{"id":216},"_03-优化效果",[218],{"type":24,"value":219},"03 优化效果",{"type":18,"tag":26,"props":221,"children":222},{},[223],{"type":24,"value":224},"时延性能：",{"type":18,"tag":26,"props":226,"children":227},{},[228],{"type":24,"value":229},"基于昇腾Atlas 800I A2单卡部署，通过AOE自动调优和融合算子优化，encoder和decoder模块优化后的时延较开箱时延均有明显下降，shape越大优化效果越显著，最高降低55%。",{"type":18,"tag":72,"props":231,"children":232},{"style":74},[233],{"type":18,"tag":77,"props":234,"children":236},{"src":235,"style":80,"alt":7},"/category/information/technology-blogs/banner/2026-5-13/6.jpg",[],{"type":18,"tag":37,"props":238,"children":240},{"id":239},"_04-总结",[241],{"type":24,"value":242},"04 总结",{"type":18,"tag":26,"props":244,"children":245},{},[246],{"type":24,"value":247},"本次迁移项目充分验证了MindSpore Lite在语音识别模型部署场景的强大能力：\n• 一键迁移：从TensorRT到MindIR的平滑过渡，业务改动最小化\n• 极致性能：时延最高降低55%，满足实时交互的严苛要求\n• 高效开发：完整的C++集成案例和参考资料，降低开发门槛",{"type":18,"tag":26,"props":249,"children":250},{},[251],{"type":24,"value":252},"昇思MindSpore Lite将继续深耕语音识别、语音合成等领域，为智能汽车、智能家居等场景提供更优质的推理体验。",{"type":18,"tag":37,"props":254,"children":256},{"id":255},"_05-社区贡献",[257],{"type":24,"value":258},"05 社区贡献",{"type":18,"tag":26,"props":260,"children":261},{},[262],{"type":24,"value":263},"MindSpore Lite面向不同硬件设备提供轻量化AI推理加速能力，使能智能应用，为开发者提供端到端的解决方案，为算法工程师和数据科学家提供开发友好、运行高效、部署灵活的体验，帮助人工智能软硬件应用生态繁荣发展。昇思MindSpore Lite始终秉持开源开放的合作理念，欢迎广大开发者参与共建。",{"type":18,"tag":26,"props":265,"children":266},{},[267],{"type":24,"value":268},"参与方式：",{"type":18,"tag":26,"props":270,"children":271},{},[272],{"type":24,"value":273},"昇思MindSpore 
Lite社区持续欢迎开发者提交Issue、贡献代码或分享迁移经验。无论您是企业用户还是个人开发者，都可以通过以下方式参与社区共建：",{"type":18,"tag":72,"props":275,"children":276},{"style":74},[277],{"type":18,"tag":77,"props":278,"children":280},{"src":279,"style":80,"alt":7},"/category/information/technology-blogs/banner/2026-5-13/7.jpg",[],{"type":18,"tag":26,"props":282,"children":283},{},[284],{"type":24,"value":285},"欢迎您通过AtomGit Issues来提交问题、报告与建议。",{"type":18,"tag":26,"props":287,"children":288},{},[289],{"type":24,"value":290},"欢迎您通过社区论坛进行技术、问题交流。",{"type":18,"tag":26,"props":292,"children":293},{},[294],{"type":24,"value":295},"欢迎您通过Sig来管理和改善工作流程，参与讨论。",{"type":18,"tag":26,"props":297,"children":298},{},[299],{"type":24,"value":300},"让我们共同推动自主创新AI推理框架的生态繁荣，让更多开发者受益于高效、易用的昇腾部署方案。",{"title":7,"searchDepth":302,"depth":302,"links":303},4,[304,310,320,321,322],{"id":39,"depth":305,"text":42,"children":306},2,[307,309],{"id":46,"depth":308,"text":49},3,{"id":62,"depth":308,"text":65},{"id":84,"depth":305,"text":87,"children":311},[312,313,314,315,318,319],{"id":90,"depth":308,"text":93},{"id":101,"depth":308,"text":104},{"id":139,"depth":308,"text":142},{"id":152,"depth":308,"text":155,"children":316},[317],{"id":159,"depth":302,"text":162},{"id":177,"depth":308,"text":180},{"id":195,"depth":308,"text":198},{"id":216,"depth":305,"text":219},{"id":239,"depth":305,"text":242},{"id":255,"depth":305,"text":258},"markdown","content:technology-blogs:zh:2026-5-13.md","content","technology-blogs/zh/2026-5-13.md","technology-blogs/zh/2026-5-13","md",1778880532337]