[{"data":1,"prerenderedAt":312},["ShallowReactive",2],{"content-query-gtiQf5kJE4":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":306,"_id":307,"_source":308,"_file":309,"_stem":310,"_extension":311},"/technology-blogs/zh/3482","zh",false,"","基于MindSpore NLP实现Blip2模型论文解读","作者：未平 来源：知乎","2024-11-15","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/01/15/f8b57a5d40004f929c7a2378f492e5c3.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":301},"root",[17,25,44,63,71,78,83,91,98,105,110,115,123,128,145,152,169,192,199,207,212,217,222,227,235,240,245,252,260,265,278,283,291,296],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"基于mindspore-nlp实现blip2模型论文解读",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29,31,37,39],{"type":24,"value":30},"**作者：**",{"type":18,"tag":32,"props":33,"children":34},"strong",{},[35],{"type":24,"value":36},"未平",{"type":24,"value":38}," ",{"type":18,"tag":32,"props":40,"children":41},{},[42],{"type":24,"value":43},"来源：知乎",{"type":18,"tag":26,"props":45,"children":46},{},[47,49,54,56,61],{"type":24,"value":48},"BLIP-2 提出了一个",{"type":18,"tag":32,"props":50,"children":51},{},[52],{"type":24,"value":53},"新的高效预训练策略",{"type":24,"value":55},"，用于解决视觉和语言联合学习任务中的计算成本问题。相比其他模型，BLIP-2 的最大创新点在于，它",{"type":18,"tag":32,"props":57,"children":58},{},[59],{"type":24,"value":60},"引入了冻结的预训练图像编码器和冻结的大型语言模型（LLM）",{"type":24,"value":62},"，通过一个**轻量级的查询转换器（Querying Transformer, 
Q-Former）**来弥合视觉和语言的跨模态差距。",{"type":18,"tag":26,"props":64,"children":65},{},[66],{"type":18,"tag":32,"props":67,"children":68},{},[69],{"type":24,"value":70},"模型创新点",{"type":18,"tag":72,"props":73,"children":75},"h2",{"id":74},"该模型的主要创新包括",[76],{"type":24,"value":77},"该模型的主要创新包括：",{"type":18,"tag":26,"props":79,"children":80},{},[81],{"type":24,"value":82},"**1、两阶段的预训练策略：**第一阶段学习视觉-语言表征，第二阶段进行视觉到语言的生成学习。这种分阶段策略有效利用了预训练的图像和语言模型，减少了计算成本。",{"type":18,"tag":26,"props":84,"children":85},{},[86],{"type":18,"tag":87,"props":88,"children":90},"img",{"alt":7,"src":89},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/21/fe5d2867602a4e218d1428f0d347034c.png",[],{"type":18,"tag":26,"props":92,"children":93},{},[94],{"type":18,"tag":87,"props":95,"children":97},{"alt":7,"src":96},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/21/16f7146db8284c85ada1582b2ffcb939.png",[],{"type":18,"tag":26,"props":99,"children":100},{},[101],{"type":18,"tag":87,"props":102,"children":104},{"alt":7,"src":103},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/21/560c0da22be04d5e86a1c12565a3a4b2.png",[],{"type":18,"tag":26,"props":106,"children":107},{},[108],{"type":24,"value":109},"**2、轻量化设计：**BLIP-2 相比于其他最新的模型，训练参数显著减少，尤其是与 Flamingo80B 相比，BLIP-2 在零样本 VQAv2 上的表现高出 8.7%，而其训练参数量仅为后者的 1/54。",{"type":18,"tag":26,"props":111,"children":112},{},[113],{"type":24,"value":114},"**3、通用性：**BLIP-2 支持多种视觉-语言任务，如视觉问答（VQA）、图像字幕生成、图文检索等，并表现出良好的零样本推理能力。",{"type":18,"tag":26,"props":116,"children":117},{},[118],{"type":18,"tag":32,"props":119,"children":120},{},[121],{"type":24,"value":122},"数据集上的评价指标得分",{"type":18,"tag":26,"props":124,"children":125},{},[126],{"type":24,"value":127},"BLIP-2 在多个视觉-语言任务的数据集上表现出色，主要得分指标如下：",{"type":18,"tag":26,"props":129,"children":130},{},[131,143],{"type":18,"tag":32,"props":132,"children":133},{},[134,136,141],{"type":24,"value":135},"1、视觉问答 
(VQA)：",{"type":18,"tag":32,"props":137,"children":138},{},[139],{"type":24,"value":140},"在 VQAv2 数据集上，BLIP-2 在零样本情况下取得了",{"type":24,"value":142},"65.0%",{"type":24,"value":144},"的准确率，远高于 Flamingo80B 的 56.3%。",{"type":18,"tag":26,"props":146,"children":147},{},[148],{"type":18,"tag":87,"props":149,"children":151},{"alt":7,"src":150},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/21/54ec7204a94247688f6a2700f10658bd.png",[],{"type":18,"tag":26,"props":153,"children":154},{},[155,167],{"type":18,"tag":32,"props":156,"children":157},{},[158,160,165],{"type":24,"value":159},"2、图像字幕生成：",{"type":18,"tag":32,"props":161,"children":162},{},[163],{"type":24,"value":164},"在 NoCaps 数据集的验证集中，BLIP-2 获得了",{"type":24,"value":166},"121.6 CIDEr和15.8 SPICE",{"type":24,"value":168},"的得分，同样高于现有的 SOTA 模型。",{"type":18,"tag":26,"props":170,"children":171},{},[172,176,178,183,185,190],{"type":18,"tag":87,"props":173,"children":175},{"alt":7,"src":174},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/21/7fc1ebf3ea2e442d8e6b943dcc412327.png",[],{"type":24,"value":177}," 3、图文检索 (Flickr30K)：",{"type":18,"tag":32,"props":179,"children":180},{},[181],{"type":24,"value":182},"BLIP-2 在 Image → Text 检索任务上达到了",{"type":24,"value":184},"97.6%",{"type":18,"tag":32,"props":186,"children":187},{},[188],{"type":24,"value":189},"的 R@1（检索准确率），在 Text → Image 任务上达到了",{"type":24,"value":191},"89.7%的 R@1，这些结果均超过了之前的顶级模型。",{"type":18,"tag":26,"props":193,"children":194},{},[195],{"type":18,"tag":87,"props":196,"children":198},{"alt":7,"src":197},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/21/3bf5fe53761d4617b49b20a2f46dd3c2.png",[],{"type":18,"tag":26,"props":200,"children":201},{},[202],{"type":18,"tag":32,"props":203,"children":204},{},[205],{"type":24,"value":206},"创新点优势分析",{"type":18,"tag":26,"props":208,"children":209},{},[210],{"type":24,"value":211},"BLIP-2 
的创新相较于其他工作具有以下优势：",{"type":18,"tag":26,"props":213,"children":214},{},[215],{"type":24,"value":216},"**1、计算效率显著提高：**传统的视觉-语言预训练方法需要对大规模模型进行端到端训练，计算成本高昂。BLIP-2 通过冻结预训练的图像编码器和语言模型，显著降低了计算成本，仅需微调轻量级的查询转换器，这使得它比 Flamingo 等模型更高效。",{"type":18,"tag":26,"props":218,"children":219},{},[220],{"type":24,"value":221},"**2、性能卓越，参数更少：**在多个任务上，BLIP-2 展现了更高的性能，如在 VQA、图像字幕生成和图文检索任务中的表现均优于现有的模型，并且其参数量更小。例如，在 VQAv2 上，它比 Flamingo80B 高出 8.7%，而训练参数量仅为后者的 1/54。",{"type":18,"tag":26,"props":223,"children":224},{},[225],{"type":24,"value":226},"**3、通用性和扩展性：**BLIP-2 的设计能够兼容更多的预训练图像和语言模型，这意味着随着更多高级模型的出现，它可以轻松进行适配以获得更好的性能。",{"type":18,"tag":26,"props":228,"children":229},{},[230],{"type":18,"tag":32,"props":231,"children":232},{},[233],{"type":24,"value":234},"使用 MindSpore NLP 进行模型评估",{"type":18,"tag":26,"props":236,"children":237},{},[238],{"type":24,"value":239},"图文检索评估过程参照官方代码和论文实现，首先对每张待评估图像进行预处理并提取特征，然后通过计算图像特征与预存文本特征的相似度进行初步检索，选出最相关的前K=128个文本。接着，利用BLIP-2模型的图文匹配（ITM）和图文对比（ITC）对这K个文本进行更精细的相似度计算，综合ITM和ITC分数得到最终排序。最后，根据排序结果记录相应的图像ID，并通过检查真实图像ID是否出现在检索结果的前N位来计算R@1、R@5和R@10等评估指标，从而评估模型的图文检索性能。",{"type":18,"tag":26,"props":241,"children":242},{},[243],{"type":24,"value":244},"为了验证该模型的效果，我们使用 MindSpore NLP 来加载 blip2-itm-vit-g 模型，并在 COCO Caption 2014 数据集上进行评估。由于资源限制，只选取了 COCO 2014 中的全部文本和 2500 张图片进行测试，所以评分偏低，但 MindSpore NLP 实现与 Transformers 实现的误差小于 2%。",{"type":18,"tag":26,"props":246,"children":247},{},[248],{"type":18,"tag":87,"props":249,"children":251},{"alt":7,"src":250},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/21/6fa02de24b604ae59415f4e04b5e129e.png",[],{"type":18,"tag":26,"props":253,"children":254},{},[255],{"type":18,"tag":32,"props":256,"children":257},{},[258],{"type":24,"value":259},"完整代码",{"type":18,"tag":26,"props":261,"children":262},{},[263],{"type":24,"value":264},"完整代码已经上传到 GitHub 仓库中，您可以通过以下链接访问并运行：",{"type":18,"tag":26,"props":266,"children":267},{},[268,270],{"type":24,"value":269},"BLIP-2 MindNLP 
图文检索评估代码：",{"type":18,"tag":271,"props":272,"children":276},"a",{"href":273,"rel":274},"https://github.com/fanxing-6/MindNLPBlip2ImageTextRetrievalEval",[275],"nofollow",[277],{"type":24,"value":273},{"type":18,"tag":26,"props":279,"children":280},{},[281],{"type":24,"value":282},"我们使用 MindSpore NLP 加载 BLIP-2 模型（blip2-itm-vit-g），并在 COCO 2014 数据集上进行推理。",{"type":18,"tag":26,"props":284,"children":285},{},[286],{"type":18,"tag":32,"props":287,"children":288},{},[289],{"type":24,"value":290},"总结",{"type":18,"tag":26,"props":292,"children":293},{},[294],{"type":24,"value":295},"BLIP-2通过使用冻结的预训练图像编码器和大型语言模型，实现了一种计算高效的视觉-语言预训练策略，有效减少了训练成本，同时在多种视觉-语言任务上展现出卓越的性能。",{"type":18,"tag":26,"props":297,"children":298},{},[299],{"type":24,"value":300},"建议各位开发者利用MindSpore NLP等工具来加载并复现该模型的实验成果。MindSpore NLP提供了一套与PyTorch风格一致的简洁接口，加载和评估预训练模型非常直接和高效。",{"title":7,"searchDepth":302,"depth":302,"links":303},4,[304],{"id":74,"depth":305,"text":77},2,"markdown","content:technology-blogs:zh:3482.md","content","technology-blogs/zh/3482.md","technology-blogs/zh/3482","md",1776506130246]