[{"data":1,"prerenderedAt":295},["ShallowReactive",2],{"content-query-ZeZCKoE3hp":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":289,"_id":290,"_source":291,"_file":292,"_stem":293,"_extension":294},"/technology-blogs/zh/2025-12-4","zh",false,"","利用昇思MindSpore图模式融合特性优化模型推理","减少了内核启动次数和设备内存访问，极大地提升了计算效率","2025-12-4","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/06/13/ca51743e8e31470a9d8d989f4b463985.png","technology-blogs","实践",{"type":15,"children":16,"toc":283},"root",[17,25,35,46,51,63,71,82,90,95,115,120,128,133,145,155,160,171,176,184,189,197,205,209,215,233,244,252,260,265],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"利用昇思mindspore图模式融合特性优化模型推理",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":18,"tag":30,"props":31,"children":32},"strong",{},[33],{"type":24,"value":34},"# 01",{"type":18,"tag":26,"props":36,"children":37},{},[38],{"type":18,"tag":30,"props":39,"children":40},{},[41],{"type":18,"tag":30,"props":42,"children":43},{},[44],{"type":24,"value":45},"背景介绍",{"type":18,"tag":26,"props":47,"children":48},{},[49],{"type":24,"value":50},"一个智慧城市项目中，我们需要将一个DeepLabV3+语义分割模型部署到Atlas 200I DK A2开发板上，用于实时街景解析。该模型结构复杂，包含大量卷积、批归一化（BatchNorm）和激活函数层。在最初的PyTorch模型转MindSpore并直接使用Pynative模式推理时，发现单张图片的推理延时高达500ms以上，无法满足实时性要求。",{"type":18,"tag":26,"props":52,"children":53},{},[54,56,61],{"type":24,"value":55},"MindSpore提供了两种执行模式：动态图模式（Pynative Mode）和静态图模式（Graph Mode）。动态图模式便于调试，但性能并非最优。静态图模式在执行前会将整个模型编译成一幅计算图，从而有机会进行深度的图级优化，其中最关键的技术之一就是",{"type":18,"tag":30,"props":57,"children":58},{},[59],{"type":24,"value":60},"算子融合（Operator Fusion）",{"type":24,"value":62},"。算子融合将多个细粒度的算子合并成一个粗粒度的算子，从而减少了内核启动次数和设备内存访问，极大地提升了计算效率。",{"type":18,"tag":26,"props":64,"children":65},{},[66],{"type":18,"tag":30,"props":67,"children":68},{},[69],{"type":24,"value":70},"# 02",{"type":18,"tag":26,"props":72,"children":73},{},[74],{"type":18,"tag":30,"props":75,"children":76},{},[77],{"type":18,"tag":30,"props":78,"children":79},{},[80],{"type":24,"value":81},"算子融合的原理与在分割模型中的应用",{"type":18,"tag":26,"props":83,"children":84},{},[85],{"type":18,"tag":30,"props":86,"children":87},{},[88],{"type":24,"value":89},"1、融合原理：",{"type":18,"tag":26,"props":91,"children":92},{},[93],{"type":24,"value":94},"以经典的Conv2D-> BatchNorm-> ReLU序列为例。在未融合的情况下，前向推理需要依次执行三个算子的内核：",{"type":18,"tag":96,"props":97,"children":98},"ol",{},[99,105,110],{"type":18,"tag":100,"props":101,"children":102},"li",{},[103],{"type":24,"value":104},"卷积计算。",{"type":18,"tag":100,"props":106,"children":107},{},[108],{"type":24,"value":109},"应用批归一化的缩放和平移。",{"type":18,"tag":100,"props":111,"children":112},{},[113],{"type":24,"value":114},"进行ReLU非线性激活。",{"type":18,"tag":26,"props":116,"children":117},{},[118],{"type":24,"value":119},"这三个步骤需要三次内核启动和多次中间结果的读写。算子融合技术则可以在图编译阶段，将这三个算子的计算过程合并为一个复合算子的计算过程。它通过数学推导，将BatchNorm的参数（γ, β）与卷积层的权重和偏置进行融合，并提前计算好新的权重和偏置，然后将ReLU的截断操作内联。最终，在设备上只需要启动一个融合算子内核，一次性完成所有计算。这显著降低了内核启动开销和内存带宽压力。",{"type":18,"tag":26,"props":121,"children":122},{},[123],{"type":18,"tag":30,"props":124,"children":125},{},[126],{"type":24,"value":127},"2、在模型中的实现：",{"type":18,"tag":26,"props":129,"children":130},{},[131],{"type":24,"value":132},"MindSpore的Graph Mode在默认情况下会自动尝试进行算子融合。但模型的结构设计会影响融合的效果。为了最大化融合收益，我们应尽量使用MindSpore提供的标准Cell来构建模型。",{"type":18,"tag":134,"props":135,"children":136},"ul",{},[137],{"type":18,"tag":100,"props":138,"children":139},{},[140],{"type":18,"tag":30,"props":141,"children":142},{},[143],{"type":24,"value":144},"初始问题代码（融合不友好）：",{"type":18,"tag":146,"props":147,"children":149},"pre",{"code":148},"import mindspore.nn as nn\n\nclass InefficientBlock(nn.Cell):\n    def __init__(self, in_channels, out_channels):\n        super().__init__()\n        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, has_bias=True)\n        self.bn = nn.BatchNorm2d(out_channels)\n        self.relu = nn.ReLU()\n    \n    def construct(self, x):\n        x = self.conv(x)\n        x = self.bn(x)\n        x = self.relu(x)\n        return x\n",[150],{"type":18,"tag":151,"props":152,"children":153},"code",{"__ignoreMap":7},[154],{"type":24,"value":148},{"type":18,"tag":26,"props":156,"children":157},{},[158],{"type":24,"value":159},"这种写法虽然清晰，但三个算子是独立的，给图优化器提供了明确的融合边界，效果已不错。但还有更优写法。",{"type":18,"tag":134,"props":161,"children":162},{},[163],{"type":18,"tag":100,"props":164,"children":165},{},[166],{"type":18,"tag":30,"props":167,"children":168},{},[169],{"type":24,"value":170},"优化后代码（推荐，最大化融合机会）：",{"type":18,"tag":26,"props":172,"children":173},{},[174],{"type":24,"value":175},"MindSpore提供了nn.Conv2dBnAct这样的预融合Cell，它是为融合而设计的。",{"type":18,"tag":146,"props":177,"children":179},{"code":178},"class EfficientBlock(nn.Cell):\n    def __init__(self, in_channels, out_channels):\n        super().__init__()\n        # 使用Conv2dBnAct，一个Cell封装了Conv、BN和Activation\n        self.conv_bn_relu = nn.Conv2dBnAct(in_channels, out_channels, kernel_size=3,\n                                          has_bn=True,  # 开启BN\n                                          activation='relu')  # 指定激活函数为ReLU\n   \n    def construct(self, x):\n        return self.conv_bn_relu(x)\n",[180],{"type":18,"tag":151,"props":181,"children":182},{"__ignoreMap":7},[183],{"type":24,"value":178},{"type":18,"tag":26,"props":185,"children":186},{},[187],{"type":24,"value":188},"使用nn.Conv2dBnAct等高级Cell，从源码层面就表达了“这是一个融合单元”的语义，使得MindSpore的图优化器能更安全、更彻底地应用融合优化。",{"type":18,"tag":26,"props":190,"children":191},{},[192],{"type":18,"tag":30,"props":193,"children":194},{},[195],{"type":24,"value":196},"# 03",{"type":18,"tag":26,"props":198,"children":199},{},[200],{"type":18,"tag":30,"props":201,"children":202},{},[203],{"type":24,"value":204},"效果验证与性能对比",{"type":18,"tag":206,"props":207,"children":208},"h3",{"id":7},[],{"type":18,"tag":206,"props":210,"children":212},{"id":211},"我们对使用inefficientblock构建的原始模型和使用efficientblock重构的优化模型进行了对比实验",[213],{"type":24,"value":214},"我们对使用InefficientBlock构建的原始模型和使用EfficientBlock重构的优化模型进行了对比实验。",{"type":18,"tag":134,"props":216,"children":217},{},[218,228],{"type":18,"tag":100,"props":219,"children":220},{},[221,226],{"type":18,"tag":30,"props":222,"children":223},{},[224],{"type":24,"value":225},"图结构对比：",{"type":24,"value":227}," 使用MindSpore的graphviz工具导出计算图。可以清晰地看到，优化后的模型图中，原先连续的Conv2D、BatchNorm、ReLU节点被替换为了一个名为FusedConv2dBnAct的单个节点。",{"type":18,"tag":100,"props":229,"children":230},{},[231],{"type":24,"value":232},"**性能数据对比：**在Atlas 200I DK A2上，使用相同的测试图片和推理循环（100次取平均），结果如下：",{"type":18,"tag":234,"props":235,"children":237},"div",{"style":236},"text-align: center;",[238],{"type":18,"tag":239,"props":240,"children":243},"img",{"src":241,"style":242,"alt":7},"/category/information/technology-blogs/banner/2025-12-4.jpg","display: block;margin: 0 auto;max-width:70%",[],{"type":18,"tag":26,"props":245,"children":246},{},[247],{"type":18,"tag":30,"props":248,"children":249},{},[250],{"type":24,"value":251},"# 04",{"type":18,"tag":26,"props":253,"children":254},{},[255],{"type":18,"tag":30,"props":256,"children":257},{},[258],{"type":24,"value":259},"总结",{"type":18,"tag":26,"props":261,"children":262},{},[263],{"type":24,"value":264},"在昇腾算力平台上，通过优化后的效果如下：",{"type":18,"tag":134,"props":266,"children":267},{},[268,273,278],{"type":18,"tag":100,"props":269,"children":270},{},[271],{"type":24,"value":272},"从Pynative模式切换到Graph模式，性能提升了约65%，这主要得益于图级别的整体优化和算子融合的初步应用。",{"type":18,"tag":100,"props":274,"children":275},{},[276],{"type":24,"value":277},"在Graph模式基础上，使用为融合优化的Cell（Conv2dBnAct）构建模型，性能进一步提升了约30%。这证明了积极的模型构建方式能更好地释放硬件潜力。",{"type":18,"tag":100,"props":279,"children":280},{},[281],{"type":24,"value":282},"内存占用的下降也符合预期，因为融合减少了许多中间结果的存储。",{"title":7,"searchDepth":284,"depth":284,"links":285},4,[286,288],{"id":7,"depth":287,"text":7},3,{"id":211,"depth":287,"text":214},"markdown","content:technology-blogs:zh:2025-12-4.md","content","technology-blogs/zh/2025-12-4.md","technology-blogs/zh/2025-12-4","md",1776506118216]