[{"data":1,"prerenderedAt":417},["ShallowReactive",2],{"content-query-mnmkDKLX5T":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":411,"_id":412,"_source":413,"_file":414,"_stem":415,"_extension":416},"/technology-blogs/zh/3894","zh",false,"","PTQ4SAM模型论文解读，并基于MindSpore NLP推理复现","该论文框架在各种视觉任务（实例分割、语义分割和目标检测）、数据集和模型变体上的广泛实验结果表明了PTQ4SAM的优越性。","2025-10-31","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/11/07/1593b742066f40658d35b77a0c0bb4c4.png","technology-blogs",{"type":14,"children":15,"toc":408},"root",[16,24,30,35,40,57,65,70,75,85,90,95,103,118,126,131,136,141,149,154,159,164,171,176,191,199,204,209,214,222,227,234,239,246,251,258,266,271,276,281,286,301,306,311,321,326,334,339,347,352,359,364,371,376,383,398,403],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"ptq4sam模型论文解读并基于mindspore-nlp推理复现",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":23,"value":29},"作者：linyiining",{"type":17,"tag":25,"props":31,"children":32},{},[33],{"type":23,"value":34},"来源：开源实习",{"type":17,"tag":25,"props":36,"children":37},{},[38],{"type":23,"value":39},"Segment Anything Model（SAM）在许多计算机视觉任务中取得了令人印象深刻的性能。然而，作为一种大规模模型，其巨大的内存和计算成本限制了其实际部署。该论文针对SAM量化中固有的瓶颈问题，提出了一种针对SAM的后训练量化（PTQ）框架，命名为PTQ4SAM。该框架在各种视觉任务（实例分割、语义分割和目标检测）、数据集和模型变体上的广泛实验结果表明了PTQ4SAM的优越性。",{"type":17,"tag":18,"props":41,"children":43},{"id":42},"_01-论文创新点sam模型量化的固有挑战",[44,50,52],{"type":17,"tag":45,"props":46,"children":47},"strong",{},[48],{"type":23,"value":49},"# 01",{"type":23,"value":51}," ",{"type":17,"tag":45,"props":53,"children":54},{},[55],{"type":23,"value":56},"论文创新点（SAM模型量化的固有挑战）",{"type":17,"tag":25,"props":58,"children":59},{},[60],{"type":17,"tag":45,"props":61,"children":62},{},[63],{"type":23,"value":64},"1、post-key-linear的双峰分布现象影响了模型量化后的表现",{"type":17,"tag":25,"props":66,"children":67},{},[68],{"type":23,"value":69},"在 SAM（Segment Anything Model） 这样的视觉大模型中，Post-Key-Linear 这一术语通常与 Transformer 架构中的注意力机制（Attention Mechanism） 相关。具体来说，它指的是在计算注意力机制中的 Key 矩阵 后，经过一个线性变换（Linear Layer）的输出。SAM模型的Post-Key-Linear输出呈现双峰分布（如下图1(a)所示），这极大影响了量化的效果。",{"type":17,"tag":25,"props":71,"children":72},{},[73],{"type":23,"value":74},"解决方法：双峰集成策略。文章通过per-tensor和per-channel两个角度分析了双峰分布，并认为这是量化的主要障碍，并将双峰分布转换为正态分布来解决这一瓶颈。",{"type":17,"tag":25,"props":76,"children":77},{},[78,80],{"type":23,"value":79},"**2、**",{"type":17,"tag":45,"props":81,"children":82},{},[83],{"type":23,"value":84},"复杂的 post Softmax 分布",{"type":17,"tag":25,"props":86,"children":87},{},[88],{"type":23,"value":89},"由于注意力机制的种类多种多样，与 VIT 相比，SAM 表现出更复杂的 post Softmax 分布。文章中举了image-to-token和token-to-image两个位置的post-Softmax的不同分布说明这一点，并且指出先前的工作并未意识到这一点，导致了固有信息的潜在丢失。",{"type":17,"tag":25,"props":91,"children":92},{},[93],{"type":23,"value":94},"解决方法：自适应粒度量化（AGQ）",{"type":17,"tag":25,"props":96,"children":97},{},[98],{"type":17,"tag":99,"props":100,"children":102},"img",{"alt":7,"src":101},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/11/07/46c75dade8924bf5b18d82b9b25ddd1b.png",[],{"type":17,"tag":18,"props":104,"children":106},{"id":105},"_02-ptq4sam架构",[107,112,113],{"type":17,"tag":45,"props":108,"children":109},{},[110],{"type":23,"value":111},"# 
02",{"type":23,"value":51},{"type":17,"tag":45,"props":114,"children":115},{},[116],{"type":23,"value":117},"PTQ4SAM架构",{"type":17,"tag":25,"props":119,"children":120},{},[121],{"type":17,"tag":45,"props":122,"children":123},{},[124],{"type":23,"value":125},"1、双峰集成策略",{"type":17,"tag":25,"props":127,"children":128},{},[129],{"type":23,"value":130},"从per-tensor角度来看激活值双峰分布，如图1(a)所示。从per-channel角度来看，每个通道的激活值在一个固定值附近，每个通道之间有很大区别，如图2所示。",{"type":17,"tag":25,"props":132,"children":133},{},[134],{"type":23,"value":135},"由于每一个channel中所有tensor的激活值都分布在一个固定值附近，而这一系列固定值呈现以0为中心的双峰分布。因此，论文中采用一个参数γ来将双峰分布转换为正常分布，其取值为-1或1。对于channel的激活值平均值为正的，γ为1，channel激活值不发生变化；对于平均值为负的，γ为-1，将负值映射为正值，同时为query linear也乘上-1，保持输出值不变。由于query linear是关于0对称的标准正态分布，乘以-1后分布不变。这样，q和k的linear激活值都保持了标准分布。",{"type":17,"tag":25,"props":137,"children":138},{},[139],{"type":23,"value":140},"代码的实现主要分为三步骤：①判断tensor是否属于双峰分布②计算γ③进行等效转换（bimodal discovery, γ computation and equivalent transformation）。",{"type":17,"tag":25,"props":142,"children":143},{},[144],{"type":17,"tag":45,"props":145,"children":146},{},[147],{"type":23,"value":148},"2、自适应粒度量化（AGQ）",{"type":17,"tag":25,"props":150,"children":151},{},[152],{"type":23,"value":153},"对post softmax的量化，用的是改进的log2量化，一种自适应粒度量化方法。通过搜索最优的2的幂次基底τ，为不同的Softmax后分布提供合适的量化粒度。通过建立一个LUT（对不同的τ和移位的乘积进行查找），就可以通过非常小的空间代价（4KB）实现对于不同post softmax的不同粒度的量化。这个空间代价是用所有的τ的取值乘以所有的移位可能。对于8bit的精度来说，τ的取值有2^2种可能性，aq有2^n（n取8）种可能，-aq/τ总共需要4byte来存（包含第一部分的aq%τ和第二部分的[aq/τ]，所以一共是4KB。",{"type":17,"tag":25,"props":155,"children":156},{},[157],{"type":23,"value":158},"移位操作：通过移位来快速实现2的幂次方的乘除法计算。",{"type":17,"tag":25,"props":160,"children":161},{},[162],{"type":23,"value":163},"选定τ的方法：通常来说是直接衡量attention矩阵A的损失，文中采用了AV的乘积损失作为目标函数，这样能够直接衡量在模型整体性能上的量化损失，单独衡量A的损失和整个注意力块的损失并不一致。通过在Calibration Set上实验，对不同的softmax层选择不同的τ数值。同时论文发现，τ取值较小的时候能够对低注意力分数更好的量化，而τ增大后，对更大的注意力分数的量化性能更好。",{"type":17,"tag":25,"props":165,"children":166},{},[167],{"type":17,"tag":99,"props":168,"children":170},{"alt":7,"src":169},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/11/07/6262648168164fe3ad956141112f1eb0.png",[],{"type":17,"tag":25,"props":172,"children":173},{},[174],{"type":23,"value":175},"3、预训练编码器和解码器权重共享的，**这种方法不仅能提高模型性能，还能显著降低模型的内存占用。共享权重的方式在大多数任务中都表现出了良好的效果，尤其在计算资源有限的情况下尤为重要。",{"type":17,"tag":18,"props":177,"children":179},{"id":178},"_03-实验",[180,185,186],{"type":17,"tag":45,"props":181,"children":182},{},[183],{"type":23,"value":184},"# 03",{"type":23,"value":51},{"type":17,"tag":45,"props":187,"children":188},{},[189],{"type":23,"value":190},"实验",{"type":17,"tag":25,"props":192,"children":193},{},[194],{"type":17,"tag":45,"props":195,"children":196},{},[197],{"type":23,"value":198},"1、实验设置",{"type":17,"tag":25,"props":200,"children":201},{},[202],{"type":23,"value":203},"任务和数据集：实验涵盖了实例分割（MS-COCO数据集）、语义分割（ADE20K数据集）和目标检测（DOTA-v1.0数据集）等任务。",{"type":17,"tag":25,"props":205,"children":206},{},[207],{"type":23,"value":208},"模型：使用了SAM-B、SAM-L和SAM-H三种模型变体。",{"type":17,"tag":25,"props":210,"children":211},{},[212],{"type":23,"value":213},"量化设置：实验中使用了不同的量化比特宽度（W6A6和W4A4），分别表示权重和激活的比特数。",{"type":17,"tag":25,"props":215,"children":216},{},[217],{"type":17,"tag":45,"props":218,"children":219},{},[220],{"type":23,"value":221},"2、实验结果",{"type":17,"tag":25,"props":223,"children":224},{},[225],{"type":23,"value":226},"实例分割：在MS-COCO数据集上，PTQ4SAM方法在不同的检测器中始终优于其他方法。同时，我们的 PTQ4SAML 令人鼓舞地实现了无损精度，如下图3，在 W6A6 设置下，我们的 PTQ4SAM-L 在 SAM-L 上应用 YOLOX 和 H-Deformable-DETR 时分别达到 40.3% 和 41.2%，与全精度型号相比，性能仅下降 0.1% 和 
0.3%。",{"type":17,"tag":25,"props":228,"children":229},{},[230],{"type":17,"tag":99,"props":231,"children":233},{"alt":7,"src":232},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/11/07/7b51952b813c449b9e6a40c0276434e2.png",[],{"type":17,"tag":25,"props":235,"children":236},{},[237],{"type":23,"value":238},"语义分割：在ADE20K数据集上，PTQ4SAM在W6A6量化时甚至超过了全精度模型的性能。如下图4，SAM-L模型在W6A6量化时达到了33.66%的mIOU，比全精度模型高出0.05%。",{"type":17,"tag":25,"props":240,"children":241},{},[242],{"type":17,"tag":99,"props":243,"children":245},{"alt":7,"src":244},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/11/07/3d1cf29a7c664d02b94888bca3cb6689.png",[],{"type":17,"tag":25,"props":247,"children":248},{},[249],{"type":23,"value":250},"目标检测：在DOTA-v1.0数据集上D 定向目标检测中，我们的方法始终优于其他基于学习的 PTQ 方法。PTQ4SAM在SAM-L和SAM-H上进行W6A6量化时仅比全精度模型下降了0.3%，而在W4A4时，仍然能够保持较高的性能。",{"type":17,"tag":25,"props":252,"children":253},{},[254],{"type":17,"tag":99,"props":255,"children":257},{"alt":7,"src":256},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/11/07/9f1d609cc40a4406a64bda2140ccf736.png",[],{"type":17,"tag":25,"props":259,"children":260},{},[261],{"type":17,"tag":45,"props":262,"children":263},{},[264],{"type":23,"value":265},"3、主****要贡献",{"type":17,"tag":25,"props":267,"children":268},{},[269],{"type":23,"value":270},"1）第一个针对SAM的PTQ策略。",{"type":17,"tag":25,"props":272,"children":273},{},[274],{"type":23,"value":275},"2)提出了针对双峰分布的双峰集成策略。",{"type":17,"tag":25,"props":277,"children":278},{},[279],{"type":23,"value":280},"3)提出了AGQ方法。",{"type":17,"tag":25,"props":282,"children":283},{},[284],{"type":23,"value":285},"4)通过实验证明，我们的方法是一个即插即用的PTQ方法，大大超过了之前最先进的方法。",{"type":17,"tag":18,"props":287,"children":289},{"id":288},"_04-mindspore推理验证",[290,295,296],{"type":17,"tag":45,"props":291,"children":292},{},[293],{"type":23,"value":294},"# 04",{"type":23,"value":51},{"type":17,"tag":45,"props":297,"children":298},{},[299],{"type":23,"value":300},"MindSpore推理验证",{"type":17,"tag":25,"props":302,"children":303},{},[304],{"type":23,"value":305},"首先，复现论文代码得到sam_b的量化模型。",{"type":17,"tag":25,"props":307,"children":308},{},[309],{"type":23,"value":310},"然后，通过mindspore加载COCO数据集，下面是实例分割数据集的定义，每次迭代会返回\"img_id\", \"image_path\", \"image\", \"bboxes\", \"labels\", \"masks\"。文件命名dataset.py。（此处仅展示部分代码，点击阅读原文可查看完整代码）",{"type":17,"tag":312,"props":313,"children":315},"pre",{"code":314},"import os\n",[316],{"type":17,"tag":317,"props":318,"children":319},"code",{"__ignoreMap":7},[320],{"type":23,"value":314},{"type":17,"tag":25,"props":322,"children":323},{},[324],{"type":23,"value":325},"定义推理代码。为了实现实例分割，需要加载yolox作为检测头，然后把yolox检测出的boxes作为SAM模型的提示，输出目标掩码。首先，借助mindyolo库实现yolox目标检测mindyolo_test.py：",{"type":17,"tag":312,"props":327,"children":329},{"code":328},"import math\n",[330],{"type":17,"tag":317,"props":331,"children":332},{"__ignoreMap":7},[333],{"type":23,"value":328},{"type":17,"tag":25,"props":335,"children":336},{},[337],{"type":23,"value":338},"然后实现推理代码。为了评估，输出需要规范化为COCO格式，存入json中。",{"type":17,"tag":312,"props":340,"children":342},{"code":341},"import 
torch\n",[343],{"type":17,"tag":317,"props":344,"children":345},{"__ignoreMap":7},[346],{"type":23,"value":341},{"type":17,"tag":25,"props":348,"children":349},{},[350],{"type":23,"value":351},"调用COCO工具评估准确率：",{"type":17,"tag":312,"props":353,"children":354},{"code":314},[355],{"type":17,"tag":317,"props":356,"children":357},{"__ignoreMap":7},[358],{"type":23,"value":314},{"type":17,"tag":25,"props":360,"children":361},{},[362],{"type":23,"value":363},"测试结果：",{"type":17,"tag":25,"props":365,"children":366},{},[367],{"type":17,"tag":99,"props":368,"children":370},{"alt":7,"src":369},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/11/07/9a1c373ab1e548719beae693e06c1d8a.png",[],{"type":17,"tag":25,"props":372,"children":373},{},[374],{"type":23,"value":375},"可视化展示",{"type":17,"tag":25,"props":377,"children":378},{},[379],{"type":17,"tag":99,"props":380,"children":382},{"alt":7,"src":381},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/11/07/02b563ce8e924c2ea9f3818b10dd5ba8.png",[],{"type":17,"tag":18,"props":384,"children":386},{"id":385},"_05-总结",[387,392,393],{"type":17,"tag":45,"props":388,"children":389},{},[390],{"type":23,"value":391},"# 05",{"type":23,"value":51},{"type":17,"tag":45,"props":394,"children":395},{},[396],{"type":23,"value":397},"总结",{"type":17,"tag":25,"props":399,"children":400},{},[401],{"type":23,"value":402},"PTQ4SAM是首个专门针对SAM模型的后训练量化解决方案，通过BIG和AGQ策略，有效解决了SAM模型量化中的双峰分布和多样的Softmax后分布问题。在多种视觉任务和不同模型变体上，PTQ4SAM均表现出优越的性能，能够在低比特量化下保持较高的精度，并实现显著的计算和存储节省。",{"type":17,"tag":25,"props":404,"children":405},{},[406],{"type":23,"value":407},"尽管PTQ4SAM在量化SAM模型方面取得了显著成果，但论文也指出，SAM模型中双峰分布的成因尚不清楚，这将是未来研究的一个潜在方向。",{"title":7,"searchDepth":409,"depth":409,"links":410},4,[],"markdown","content:technology-blogs:zh:3894.md","content","technology-blogs/zh/3894.md","technology-blogs/zh/3894","md",1776506136596]