[{"data":1,"prerenderedAt":325},["ShallowReactive",2],{"content-query-y0VUrlXHJD":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":319,"_id":320,"_source":321,"_file":322,"_stem":323,"_extension":324},"/technology-blogs/zh/876","zh",false,"","MindSpore Vision Transformer系列（1）：ViT","An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","2021-12-22","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/12/22/ea67e959549b475cacca1ea5beb2ae37.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":316},"root",[17,25,31,44,55,60,72,80,97,108,123,134,145,162,173,187,198,209,223,234,251,262,266,283,294,305],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore-vision-transformer系列1vit",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"作者：于璠",{"type":18,"tag":26,"props":32,"children":33},{},[34,36],{"type":24,"value":35},"作者主页：",{"type":18,"tag":37,"props":38,"children":42},"a",{"href":39,"rel":40},"https://www.zhihu.com/people/yu-fan-42-9",[41],"nofollow",[43],{"type":24,"value":39},{"type":18,"tag":26,"props":45,"children":46},{},[47,49],{"type":24,"value":48},"文章来源：",{"type":18,"tag":37,"props":50,"children":53},{"href":51,"rel":52},"https://zhuanlan.zhihu.com/p/447955652",[41],[54],{"type":24,"value":51},{"type":18,"tag":26,"props":56,"children":57},{},[58],{"type":24,"value":59},"目前在视觉领域已经有众多的Transformer工作，涵盖图像分类[1]、检测[2]和分割[3]，以及视频等领域。本篇分享去年来自Google的Vision Transformer经典工作：An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale[1]（ViT）。",{"type":18,"tag":26,"props":61,"children":62},{},[63],{"type":18,"tag":64,"props":65,"children":66},"strong",{},[67],{"type":18,"tag":64,"props":68,"children":69},{},[70],{"type":24,"value":71},"背景介绍",{"type":18,"tag":26,"props":73,"children":74},{},[75],{"type":18,"tag":64,"props":76,"children":77},{},[78],{"type":24,"value":79},"Transformer自出现以来，引领了NLP领域的技术革新，在各大NLP榜单任务中“独霸天下”。但在计算机视觉领域，CNN结构在主流的backbone中仍占据主导地位，如ResNet等。相比于CNN，Transformer在空间等价性和局部连接方面缺少优势，但是凭借自身的self-attention结构，Transformer更适合在长范围内建模。",{"type":18,"tag":26,"props":81,"children":82},{},[83],{"type":18,"tag":64,"props":84,"children":85},{},[86],{"type":18,"tag":64,"props":87,"children":88},{},[89],{"type":18,"tag":64,"props":90,"children":91},{},[92],{"type":18,"tag":64,"props":93,"children":94},{},[95],{"type":24,"value":96},"Pipeline",{"type":18,"tag":26,"props":98,"children":99},{},[100],{"type":18,"tag":64,"props":101,"children":102},{},[103],{"type":18,"tag":64,"props":104,"children":105},{},[106],{"type":24,"value":107},"ViT参照了CNN局部特征提取方式，先把图像在空间维度等间隔分成若干相同尺寸的patch；对于生成的patch通过线性映射成patch embedding（与Transformer中token embedding类似）。参照NLP中的处理方式，patch embedding加上了标志patch相对位置的position embedding。同时在ViT的输入端，除了图像原始的patches，也增加了额外可学习的class embedding。该过程如下图所示。",{"type":18,"tag":26,"props":109,"children":110},{},[111],{"type":18,"tag":64,"props":112,"children":113},{},[114],{"type":18,"tag":64,"props":115,"children":116},{},[117],{"type":18,"tag":118,"props":119,"children":122},"img",{"alt":120,"src":121},"1.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202112/22/1043338l2dnwvvg3ul4dmd.jpg",[],{"type":18,"tag":26,"props":124,"children":125},{},[126],{"type":18,"tag":64,"props":127,"children":128},{},[129],{"type":18,"tag":64,"props":130,"children":131},{},[132],{"type":24,"value":133},"图1 ViT Pipeline",{"type":18,"tag":26,"props":135,"children":136},{},[137],{"type":18,"tag":64,"props":138,"children":139},{},[140],{"type":18,"tag":64,"props":141,"children":142},{},[143],{"type":24,"value":144},"对于得到的完整Embedded Patches结果，送入标准的Transformer Encoder模块。其中Transformer Encoder由若干个Transformer block堆叠而成，每个block由Norm、Multi-Head Attention和MLP层组成，层与层之间存在short cut连接。最后取class embedding位置的Transformer block输出结果，再通过MLP Head得到最终的网络输出。网络通过在ImageNet数据集上的分类任务进行预训练。",{"type":18,"tag":26,"props":146,"children":147},{},[148],{"type":18,"tag":64,"props":149,"children":150},{},[151],{"type":18,"tag":64,"props":152,"children":153},{},[154],{"type":18,"tag":64,"props":155,"children":156},{},[157],{"type":18,"tag":64,"props":158,"children":159},{},[160],{"type":24,"value":161},"实验结果",{"type":18,"tag":26,"props":163,"children":164},{},[165],{"type":18,"tag":64,"props":166,"children":167},{},[168],{"type":18,"tag":64,"props":169,"children":170},{},[171],{"type":24,"value":172},"在实验环节ViT尝试了在JFT和ImageNet-21K大型数据集上预训练后，在ImageNet，CIFAR-10，CIFAR-100等数据集上微调。其中JFT是Google私有的300M数据集，数据量远大于ImageNet。原始模型和数据集的对比如下表所示。",{"type":18,"tag":26,"props":174,"children":175},{},[176],{"type":18,"tag":64,"props":177,"children":178},{},[179],{"type":18,"tag":64,"props":180,"children":181},{},[182],{"type":18,"tag":118,"props":183,"children":186},{"alt":184,"src":185},"2.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202112/22/104402udpwg95sd8s9hbtm.jpg",[],{"type":18,"tag":26,"props":188,"children":189},{},[190],{"type":18,"tag":64,"props":191,"children":192},{},[193],{"type":18,"tag":64,"props":194,"children":195},{},[196],{"type":24,"value":197},"从上表可以看出在JFT数据集上预训练的ViT，在ImageNet的Top-1准确率最高能够达到88.55%，这基本是ImageNet的SOTA模型水平，足以证明ViT+大数据集的强大。",{"type":18,"tag":26,"props":199,"children":200},{},[201],{"type":18,"tag":64,"props":202,"children":203},{},[204],{"type":18,"tag":64,"props":205,"children":206},{},[207],{"type":24,"value":208},"那多大的数据集对ViT是有效的呢，文章继续补充了相应的实验，如下图所示。可以看到，随着数据集规模达到ImageNet-21K这种量级，ViT的威力才逐渐发挥出来，并超越了CNN为backbone的ResNet结构。这样的结果反映了Transformer网络需要大量数据集才能发挥出能力的特点：",{"type":18,"tag":26,"props":210,"children":211},{},[212],{"type":18,"tag":64,"props":213,"children":214},{},[215],{"type":18,"tag":64,"props":216,"children":217},{},[218],{"type":18,"tag":118,"props":219,"children":222},{"alt":220,"src":221},"3.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202112/22/1044217on4791xnpfgfjx9.jpg",[],{"type":18,"tag":26,"props":224,"children":225},{},[226],{"type":18,"tag":64,"props":227,"children":228},{},[229],{"type":18,"tag":64,"props":230,"children":231},{},[232],{"type":24,"value":233},"图2 不同数据集对比",{"type":18,"tag":26,"props":235,"children":236},{},[237],{"type":18,"tag":64,"props":238,"children":239},{},[240],{"type":18,"tag":64,"props":241,"children":242},{},[243],{"type":18,"tag":64,"props":244,"children":245},{},[246],{"type":18,"tag":64,"props":247,"children":248},{},[249],{"type":24,"value":250},"思考与总结",{"type":18,"tag":26,"props":252,"children":253},{},[254],{"type":18,"tag":64,"props":255,"children":256},{},[257],{"type":18,"tag":64,"props":258,"children":259},{},[260],{"type":24,"value":261},"ViT证明了Transformer也能简单有效地使用在CV领域，在众多CNN为主的backbone中异军突起。虽然ViT的效果不错，但是也留下了很多能够再进一步的方向，包括预训练的方式，在其他领域的应用等。",{"type":18,"tag":263,"props":264,"children":265},"hr",{},[],{"type":18,"tag":26,"props":267,"children":268},{},[269],{"type":18,"tag":64,"props":270,"children":271},{},[272],{"type":18,"tag":64,"props":273,"children":274},{},[275],{"type":18,"tag":64,"props":276,"children":277},{},[278],{"type":18,"tag":64,"props":279,"children":280},{},[281],{"type":24,"value":282},"Reference",{"type":18,"tag":26,"props":284,"children":285},{},[286],{"type":18,"tag":64,"props":287,"children":288},{},[289],{"type":18,"tag":64,"props":290,"children":291},{},[292],{"type":24,"value":293},"[1] Dosovitskiy A, Beyer L, Kolesnikov A, et al. An image is worth 16x16 words: Transformers for image recognition at scale[J]. arXiv preprint arXiv:2010.11929, 2020.",{"type":18,"tag":26,"props":295,"children":296},{},[297],{"type":18,"tag":64,"props":298,"children":299},{},[300],{"type":18,"tag":64,"props":301,"children":302},{},[303],{"type":24,"value":304},"[2] Carion N, Massa F, Synnaeve G, et al. End-to-end object detection with transformers[C]//European Conference on Computer Vision. Springer, Cham, 2020: 213-229.",{"type":18,"tag":26,"props":306,"children":307},{},[308],{"type":18,"tag":64,"props":309,"children":310},{},[311],{"type":18,"tag":64,"props":312,"children":313},{},[314],{"type":24,"value":315},"[3] Zheng S, Lu J, Zhao H, et al. Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers[C]//Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. 2021: 6881-6890.",{"title":7,"searchDepth":317,"depth":317,"links":318},4,[],"markdown","content:technology-blogs:zh:876.md","content","technology-blogs/zh/876.md","technology-blogs/zh/876","md",1776506141869]