[{"data":1,"prerenderedAt":264},["ShallowReactive",2],{"content-query-1RQg4qAZKy":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":258,"_id":259,"_source":260,"_file":261,"_stem":262,"_extension":263},"/news/zh/2025-12-18","zh",false,"","昇思人工智能框架峰会 | MindSpore Lite混合精度推理，实现内存节省30%，助力鸿蒙翻译模型轻量化部署","基于MindSpore Lite的CPU混合精度推理方案","2025-12-18","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/25/199b735845bf4106b44b2035dc97bd39.png","news",{"type":14,"children":15,"toc":255},"root",[16,24,30,39,47,52,57,67,72,80,88,99,107,112,117,122,130,138,143,151,156,164,172,185,193,200,213,218,227,232,241,246],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"昇思人工智能框架峰会-mindspore-lite混合精度推理实现内存节省30助力鸿蒙翻译模型轻量化部署",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":23,"value":29},"在语言翻译算法模型中，通常需依托Transformer算法模型完成文本特征的提取与转换。针对Transformer推理内存较高、难以满足端侧多语种翻译应用部署的内存要求，基于MindSpore Lite的CPU混合精度推理方案，综合运用混合精度子图调度、IO免拷贝等关键技术，成功将鸿蒙系统内置翻译模型的推理内存优化至66MB，相较于原始100MB以上的推理内存显著降低，支撑模型在鸿蒙6.0上线部署。",{"type":17,"tag":25,"props":31,"children":32},{},[33],{"type":17,"tag":34,"props":35,"children":36},"strong",{},[37],{"type":23,"value":38},"# 01",{"type":17,"tag":25,"props":40,"children":41},{},[42],{"type":17,"tag":34,"props":43,"children":44},{},[45],{"type":23,"value":46},"背景与挑战",{"type":17,"tag":25,"props":48,"children":49},{},[50],{"type":23,"value":51},"MindSpore Lite作为高性能推理框架，在当前AI产业化落地的背景下，为开源模型的商用部署提供了坚实的技术保障。",{"type":17,"tag":25,"props":53,"children":54},{},[55],{"type":23,"value":56},"在NLU（Natural Language Understanding，自然语言理解）场景中，特征提取通常依赖注意力机制实现。然而，注意力模块包含大量的大颗粒矩阵乘算子，并且涉及到Cache缓存，致使推理过程内存占用较高，对于鸿蒙系统内置的基础翻译模型，内存超限成为制约特性上线的关键瓶颈。",{"type":17,"tag":58,"props":59,"children":60},"ul",{},[61],{"type":17,"tag":62,"props":63,"children":64},"li",{},[65],{"type":23,"value":66},"  内存占用：翻译模型，使用整网Float16 
CPU推理，精度误差不可接受，整网Float32精度正常，但在Float32下推理内存占用较大，需要借助混合精度特性，在降低CPU内存占用的同时减少其带来的精度损失。\n   ",{"type":17,"tag":25,"props":68,"children":69},{},[70],{"type":23,"value":71},"为突破上述内存瓶颈，MindSpore Lite提供了基于CPU混合精度的推理模式，并融合IO免拷贝、图算融合等关键技术，形成系统性解决方案，有力保障了业务的商用化落地。",{"type":17,"tag":25,"props":73,"children":74},{},[75],{"type":17,"tag":34,"props":76,"children":77},{},[78],{"type":23,"value":79},"# 02",{"type":17,"tag":25,"props":81,"children":82},{},[83],{"type":17,"tag":34,"props":84,"children":85},{},[86],{"type":23,"value":87},"MindSpore Lite推理技术方案",{"type":17,"tag":89,"props":90,"children":92},"div",{"style":91},"text-align: center;",[93],{"type":17,"tag":94,"props":95,"children":98},"img",{"src":96,"style":97,"alt":7},"/category/information/news/banner/2025-12-18-1.jpg","display: block;margin: 0 auto;max-width:60%",[],{"type":17,"tag":25,"props":100,"children":101},{},[102],{"type":17,"tag":34,"props":103,"children":104},{},[105],{"type":23,"value":106},"1、方案介绍",{"type":17,"tag":25,"props":108,"children":109},{},[110],{"type":23,"value":111},"MindSpore Lite推理框架提供了功能完备的转换工具及简洁易用的推理API接口。",{"type":17,"tag":25,"props":113,"children":114},{},[115],{"type":23,"value":116},"模型转换阶段，MindSpore Lite转换工具可将MindSpore训练框架导出的MindIR模型或第三方框架导出的ONNX模型转换为MindSpore Lite格式的ms模型。离线转换过程中，工具首先将用户模型解析为标准MindSpore Lite IR格式，并在此基础上执行算子融合、子图切分、量化压缩等系列优化操作。同时，MindSpore Lite依据目标部署硬件特性，将优化后的IR对接至相应硬件后端，最终导出适用于MindSpore Lite推理部署的ms模型文件。",{"type":17,"tag":25,"props":118,"children":119},{},[120],{"type":23,"value":121},"在线推理阶段，MindSpore 
Lite提供简明高效的API调用接口，通过加载转换后的ms模型，基于CPU后端注册的170+高性能算子实现，调度选取最优执行计划，最终获取推理结果。推理过程中，框架支持子图切分、IO免拷贝等关键技术，有效提升模型推理性能并保障业务部署的稳定性与功能性。",{"type":17,"tag":25,"props":123,"children":124},{},[125],{"type":17,"tag":34,"props":126,"children":127},{},[128],{"type":23,"value":129},"2、关键技术",{"type":17,"tag":25,"props":131,"children":132},{},[133],{"type":17,"tag":34,"props":134,"children":135},{},[136],{"type":23,"value":137},"CPU混合精度推理：",{"type":17,"tag":25,"props":139,"children":140},{},[141],{"type":23,"value":142},"MindSpore Lite提供CPU混合精度推理机制，可针对单个模型内的不同算子，灵活配置Float32、Float16等不同精度计算策略，通过仅对精度敏感的算子保持高精度计算，可完成性能和精度的精细调优。以语种翻译模型为例，结合算法测试集，默认选用Float16推理，针对LayerNorm，SoftMax等数值敏感算子采用Float32推理，较整网Float32性能提升20%，内存降低30%，且精度误差\u003C1%。",{"type":17,"tag":25,"props":144,"children":145},{},[146],{"type":17,"tag":34,"props":147,"children":148},{},[149],{"type":23,"value":150},"输入/输出免拷贝：",{"type":17,"tag":25,"props":152,"children":153},{},[154],{"type":23,"value":155},"调用MindSpore Lite执行推理前需要设置输入数据，推理结束后也需要读取输出结果，当输入规模变大，如翻译模型输入KVCache，会引入较大的内存拷贝时延，且存在额外内存占用。MindSpore Lite利用CPU内存共享机制，实现了模型输入/输出数据免拷贝功能，可有效降低推理时延和内存占用。以翻译模型CPU推理为例，针对多达40个输入节点，开启免拷贝功能，可提升10%的推理性能，且内存优化10%。",{"type":17,"tag":25,"props":157,"children":158},{},[159],{"type":17,"tag":34,"props":160,"children":161},{},[162],{"type":23,"value":163},"# 03",{"type":17,"tag":25,"props":165,"children":166},{},[167],{"type":17,"tag":34,"props":168,"children":169},{},[170],{"type":23,"value":171},"性能测试与验证",{"type":17,"tag":25,"props":173,"children":174},{},[175,177],{"type":23,"value":176},"可以通过MindSpore Lite官网发布包中的converter_lite转换工具，将开源导出的onnx模型转换成ms的模型，然后通过MindSpore Lite官网发布包中的benchmark工具验证模型的功能与性能数据，详细的验证方法可以参考MindSpore 
Lite官网教程：",{"type":17,"tag":178,"props":179,"children":183},"a",{"href":180,"rel":181},"https://www.mindspore.cn/lite/docs/zh-CN/r2.7.1/tools/benchmark_tool.html",[182],"nofollow",[184],{"type":23,"value":180},{"type":17,"tag":25,"props":186,"children":187},{},[188],{"type":17,"tag":34,"props":189,"children":190},{},[191],{"type":23,"value":192},"# 04",{"type":17,"tag":25,"props":194,"children":195},{},[196],{"type":17,"tag":34,"props":197,"children":198},{},[199],{"type":23,"value":171},{"type":17,"tag":58,"props":201,"children":202},{},[203,208],{"type":17,"tag":62,"props":204,"children":205},{},[206],{"type":23,"value":207},"  针对翻译算法模型在端侧CPU硬件上的部署推理，MindSpore Lite会持续进行性能优化，降低部署内存，提升推理性能，提供更加易用的接口能力。\n   ",{"type":17,"tag":62,"props":209,"children":210},{},[211],{"type":23,"value":212},"  与开源社区共同适配更多更新的开源算法模型，提升推理框架的泛化能力与模型推理性能。\n   ",{"type":17,"tag":25,"props":214,"children":215},{},[216],{"type":23,"value":217},"开源代码仓库链接：",{"type":17,"tag":25,"props":219,"children":220},{},[221],{"type":17,"tag":178,"props":222,"children":225},{"href":223,"rel":224},"https://gitee.com/mindspore/mindspore-lite",[182],[226],{"type":23,"value":223},{"type":17,"tag":25,"props":228,"children":229},{},[230],{"type":23,"value":231},"昇思社区官网链接：",{"type":17,"tag":25,"props":233,"children":234},{},[235],{"type":17,"tag":178,"props":236,"children":239},{"href":237,"rel":238},"https://www.mindspore.cn/lite/",[182],[240],{"type":23,"value":237},{"type":17,"tag":25,"props":242,"children":243},{},[244],{"type":23,"value":245},"鸿蒙官网MindSpore Lite Kit主页：",{"type":17,"tag":25,"props":247,"children":248},{},[249],{"type":17,"tag":178,"props":250,"children":253},{"href":251,"rel":252},"https://developer.huawei.com/consumer/cn/sdk/mindspore-lite-kit",[182],[254],{"type":23,"value":251},{"title":7,"searchDepth":256,"depth":256,"links":257},4,[],"markdown","content:news:zh:2025-12-18.md","content","news/zh/2025-12-18.md","news/zh/2025-12-18","md",1776506060014]