[{"data":1,"prerenderedAt":257},["ShallowReactive",2],{"content-query-GqSC8wUzMi":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":251,"_id":252,"_source":253,"_file":254,"_stem":255,"_extension":256},"/technology-blogs/zh/3038","zh",false,"","卷积结构的反击，纯卷积Query-Based检测器DECO超越DETR","作者：王云鹤 ｜来源：知乎","2024-03-18","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/25/b074222f62a848608f4fe1efae165011.png","technology-blogs",{"type":14,"children":15,"toc":246},"root",[16,24,43,48,59,64,73,81,86,91,99,104,112,117,125,130,138,143,150,155,160,165,170,177,185,190,195,207,212,219,224,229,236,241],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"卷积结构的反击纯卷积query-based检测器deco超越detr",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28,30,36,38],{"type":23,"value":29},"**作者：**",{"type":17,"tag":31,"props":32,"children":33},"strong",{},[34],{"type":23,"value":35},"王云鹤",{"type":23,"value":37}," ｜",{"type":17,"tag":31,"props":39,"children":40},{},[41],{"type":23,"value":42},"来源：知乎",{"type":17,"tag":25,"props":44,"children":45},{},[46],{"type":23,"value":47},"论文见：",{"type":17,"tag":25,"props":49,"children":50},{},[51],{"type":17,"tag":52,"props":53,"children":57},"a",{"href":54,"rel":55},"https://arxiv.org/abs/2312.13735",[56],"nofollow",[58],{"type":23,"value":54},{"type":17,"tag":25,"props":60,"children":61},{},[62],{"type":23,"value":63},"昇思MindSpore代码：",{"type":17,"tag":25,"props":65,"children":66},{},[67],{"type":17,"tag":52,"props":68,"children":71},{"href":69,"rel":70},"https://github.com/mindspore-lab/models/tree/master/research/huawei-noah/DECO",[56],[72],{"type":23,"value":69},{"type":17,"tag":25,"props":74,"children":75},{},[76],{"type":17,"tag":31,"props":77,"children":78},{},[79],{"type":23,"value":80},"引言",{"type":17,"tag":25,"props":82,"children":83},{},[84],{"type":23,"value":85},"Detection Transformer（DETR）推出之后，迅速引发了目标检测领域的一股热潮，很多的后续工作也从精度和速度方面对原始的DETR进行了改进。然而，Transformer是否真的大一统视觉领域呢？至少从ConvNeXt和RepLKNet等工作表明，CNN结构在视觉领域还是有很大的潜力的。我们这个工作探究的就是如何利用纯卷积的架构，来得到一个性能能打的类DETR框架的检测器。",{"type":17,"tag":25,"props":87,"children":88},{},[89],{"type":23,"value":90},"致敬DETR，我们称我们的方法为DECO (Detection ConvNets)。采用DETR类似的结构设定，搭配不同的Backbone，DECO在COCO上取得了38.6%和40.8%的AP，在V100上取得了35 FPS和28 FPS的速度，取得比DETR更好的性能。搭配类似RT-DETR的多尺度特征等模块，DECO取得了47.8% AP和34 FPS的速度，总体性能跟很多DETR改进方法比都有不错的优势。",{"type":17,"tag":25,"props":92,"children":93},{},[94],{"type":17,"tag":31,"props":95,"children":96},{},[97],{"type":23,"value":98},"DECO整体框架",{"type":17,"tag":25,"props":100,"children":101},{},[102],{"type":23,"value":103},"DETR的主要特点是利用Transformer Encoder-Decoder的结构，对一张输入图像，利用一组Query跟图像特征进行交互，可以直接输出指定数量的检测框，从而可以摆脱对NMS等后处理操作的依赖。我们提出的DECO总体架构上跟DETR类似，也包括了Backbone来进行图像特征提取，一个Encoder-Decoder的结构跟Query进行交互，最后输出特定数量的检测结果。唯一的不同在于，DECO的Encoder和Decoder是纯卷积的结构，因此DECO是一个由纯卷积构成的Query-Based端对端检测器。",{"type":17,"tag":25,"props":105,"children":106},{},[107],{"type":17,"tag":108,"props":109,"children":111},"img",{"alt":7,"src":110},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/25/356482e47267455d84c4436b5550e78f.png",[],{"type":17,"tag":25,"props":113,"children":114},{},[115],{"type":23,"value":116},"图：DECO和DETR框架的对比",{"type":17,"tag":25,"props":118,"children":119},{},[120],{"type":17,"tag":31,"props":121,"children":122},{},[123],{"type":23,"value":124},"DECO Encoder结构",{"type":17,"tag":25,"props":126,"children":127},{},[128],{"type":23,"value":129},"Encoder的结构替换相对比较直接，我们选择使用4个ConvNeXt Block来构成Encoder结构。具体来说，Encoder的每一层都是通过叠加一个7x7的深度卷积、一个LayerNorm层、一个1x1的卷积、一个GELU激活函数以及另一个1x1卷积来实现的。此外，在DETR中，因为Transformer架构对输入具有排列不变性，所以每层编码器的输入都需要添加位置编码，但是对于卷积组成的Encoder来说，则无需添加任何位置编码。",{"type":17,"tag":25,"props":131,"children":132},{},[133],{"type":17,"tag":31,"props":134,"children":135},{},[136],{"type":23,"value":137},"DECO Decoder结构",{"type":17,"tag":25,"props":139,"children":140},{},[141],{"type":23,"value":142},"相比而言，Decoder的替换则复杂得多。Decoder的主要作用为对图像特征和Query进行充分的交互，使得Query可以充分感知到图像特征信息，从而对图像中的目标进行坐标和类别的预测。Decoder主要包括两个输入：Encoder的特征输出和一组可学的查询向量（Query）。我们把Decoder的主要结构分为两个模块：自交互模块（Self-Interaction Module, SIM）和交叉交互模块（Cross-Interaction Module, CIM）。",{"type":17,"tag":25,"props":144,"children":145},{},[146],{"type":17,"tag":108,"props":147,"children":149},{"alt":7,"src":148},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/25/a0d5e0d2ccd44b5d8fd92ad134c68b5b.png",[],{"type":17,"tag":25,"props":151,"children":152},{},[153],{"type":23,"value":154},"图：DECO的Decoder结构",{"type":17,"tag":25,"props":156,"children":157},{},[158],{"type":23,"value":159},"SIM模块主要融合Query和上层Decoder层的输出，这部分的结构，可以利用若干个卷积层来组成，使用9x9 depthwise卷积和1x1卷积分别在空间维度和通道维度进行信息交互，充分获取所需的目标信息以送到后面的CIM模块进行进一步的目标检测特征提取。Query为一组随机初始化的向量，该数量决定了检测器最终输出的检测框数量，其具体的值可以随实际需要进行调节。对DECO来说，因为所有的结构都是由卷积构成的，因此我们把Query变成二维，比如100个Query，则可以变成10x10的维度。",{"type":17,"tag":25,"props":161,"children":162},{},[163],{"type":23,"value":164},"CIM模块的主要作用是让图像特征和Query进行充分的交互，使得Query可以充分感知到图像特征信息，从而对图像中的目标进行坐标和类别的预测。对于Transformer结构来说，利用Cross Attention机制可以很方便实现这一目的，但对于卷积结构来说，如何让两个特征进行充分交互，则是一个最大的难点。",{"type":17,"tag":25,"props":166,"children":167},{},[168],{"type":23,"value":169},"要把大小不同的SIM输出和Encoder输出全局特征进行融合，必须先把两者进行空间对齐然后进行融合，首先我们对SIM的输出进行最近邻上采样：",{"type":17,"tag":25,"props":171,"children":172},{},[173],{"type":17,"tag":108,"props":174,"children":176},{"alt":7,"src":175},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/25/20d84cda0f2b4c7fb0ade648001eef22.png",[],{"type":17,"tag":25,"props":178,"children":179},{},[180],{"type":17,"tag":31,"props":181,"children":182},{},[183],{"type":23,"value":184},"多尺度特征的利用",{"type":17,"tag":25,"props":186,"children":187},{},[188],{"type":23,"value":189},"跟原始的DETR一样，上述框架得到的DECO有个共同的短板，即缺少多尺度特征，而这对于高精度目标检测来说是影响很大的。Deformable DETR通过使用一个多尺度的可变形注意力模块来整合不同尺度的特征，但这个方法是跟Attention算子强耦合的，因此没法直接用在我们的DECO上。",{"type":17,"tag":25,"props":191,"children":192},{},[193],{"type":23,"value":194},"为了让DECO也能处理多尺度特征，我们在Decoder输出的特征之后，采用了RT-DETR提出的一个跨尺度特征融合模块。实际上，DETR诞生之后衍生了一系列的改进方法，我们相信很多策略对于DECO来说同样是适用的，这也希望感兴趣的人共同来探讨。",{"type":17,"tag":196,"props":197,"children":199},"h3",{"id":198},"实验结果",[200],{"type":17,"tag":31,"props":201,"children":202},{},[203],{"type":17,"tag":31,"props":204,"children":205},{},[206],{"type":23,"value":198},{"type":17,"tag":25,"props":208,"children":209},{},[210],{"type":23,"value":211},"我们在COCO上进行了实验，在保持主要架构不变的情况下将DECO和DETR进行了比较，比如保持Query数量一致，保持Decoder层数不变等，仅将DETR中的Transformer结构按上文所述换成我们的卷积结构。可以看出，DECO取得了比DETR更好的精度和速度的Tradeoff。",{"type":17,"tag":25,"props":213,"children":214},{},[215],{"type":17,"tag":108,"props":216,"children":218},{"alt":7,"src":217},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/25/e71fadc8d3c44b38bd90409b63233f05.png",[],{"type":17,"tag":25,"props":220,"children":221},{},[222],{"type":23,"value":223},"图：DECO和DETR的性能比较",{"type":17,"tag":25,"props":225,"children":226},{},[227],{"type":23,"value":228},"我们也把搭配了多尺度特征后的DECO跟更多目标检测方法进行了对比，其中包括了很多DETR的变体，从下图中可以看到，DECO取得了很不错的效果，比很多以前的检测器都取得了更好的性能。",{"type":17,"tag":25,"props":230,"children":231},{},[232],{"type":17,"tag":108,"props":233,"children":235},{"alt":7,"src":234},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/25/16f777f6e2c949fa8c7ea227ba32d174.png",[],{"type":17,"tag":25,"props":237,"children":238},{},[239],{"type":23,"value":240},"图：DECO和不同检测器的性能比较",{"type":17,"tag":25,"props":242,"children":243},{},[244],{"type":23,"value":245},"文章中DECO的结构进行了很多的消融实验及可视化，包括在Decoder中选用的具体融合策略（相加、点乘、Concat），以及Query的维度怎么设置才有最优的效果等，也有一些比较有趣的发现，更详细的结果和讨论请参看原文。",{"title":7,"searchDepth":247,"depth":247,"links":248},4,[249],{"id":198,"depth":250,"text":198},3,"markdown","content:technology-blogs:zh:3038.md","content","technology-blogs/zh/3038.md","technology-blogs/zh/3038","md",1776506125599]