[{"data":1,"prerenderedAt":610},["ShallowReactive",2],{"content-query-vdsJwxnKDM":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":604,"_id":605,"_source":606,"_file":607,"_stem":608,"_extension":609},"/technology-blogs/zh/3809","zh",false,"","BERT模型论文解读，并基于MindSpore NLP推理复现","欢迎开发者积极参与昇思MindSpore开源实习活动","2025-08-05","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/08/08/9be55f85b878469a86fc29f464be5099.png","technology-blogs",{"type":14,"children":15,"toc":599},"root",[16,24,30,35,50,55,62,78,83,88,103,113,118,132,142,147,166,174,184,194,204,223,233,238,243,253,261,266,285,295,309,316,330,338,343,348,356,364,369,388,398,406,411,429,443,456,470,478,488,493,501,513,523,527,535,545,550,555,560,565,572,580,587,592],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"bert模型论文解读并基于mindspore-nlp推理复现",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":23,"value":29},"作者：liaominwen",{"type":17,"tag":25,"props":31,"children":32},{},[33],{"type":23,"value":34},"来源：开源实习",{"type":17,"tag":25,"props":36,"children":37},{},[38,40,48],{"type":23,"value":39},"BERT（Bidirectional Encoder Representations from Transformers）是一种预训练的自然语言处理（NLP）模型，由 Google 于2018年提出论文《BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding》（",{"type":17,"tag":41,"props":42,"children":46},"a",{"href":43,"rel":44},"https://arxiv.org/abs/1810.04805",[45],"nofollow",[47],{"type":23,"value":43},{"type":23,"value":49},"）",{"type":17,"tag":25,"props":51,"children":52},{},[53],{"type":23,"value":54},"在NLP领域应用预训练和微调前计算机视觉方向已经广泛采用这种方式BERT的提出 
将预训练与微调相结合的范式在NLP领域带到了一个新的高度。",{"type":17,"tag":56,"props":57,"children":59},"h2",{"id":58},"下面开始对该模型进行解读",[60],{"type":23,"value":61},"下面开始对该模型进行解读。",{"type":17,"tag":25,"props":63,"children":64},{},[65,71,73],{"type":17,"tag":66,"props":67,"children":68},"strong",{},[69],{"type":23,"value":70},"# 01",{"type":23,"value":72}," ",{"type":17,"tag":66,"props":74,"children":75},{},[76],{"type":23,"value":77},"架构",{"type":17,"tag":25,"props":79,"children":80},{},[81],{"type":23,"value":82},"BERT 是由多层 Transformer 编码器（Encoder）堆叠而成。",{"type":17,"tag":25,"props":84,"children":85},{},[86],{"type":23,"value":87},"它分为以下两种：",{"type":17,"tag":89,"props":90,"children":91},"ul",{},[92,98],{"type":17,"tag":93,"props":94,"children":95},"li",{},[96],{"type":23,"value":97},"BERT-base：12 层 Encoder，隐藏层维度为 768，注意力头数为 12。",{"type":17,"tag":93,"props":99,"children":100},{},[101],{"type":23,"value":102},"BERT-large：24 层 Encoder，隐藏层维度为 1024，注意力头数为 16。",{"type":17,"tag":25,"props":104,"children":105},{},[106,111],{"type":17,"tag":66,"props":107,"children":108},{},[109],{"type":23,"value":110},"I am eating a banana",{"type":23,"value":112}," 经过 BERT 编码器编码的句子类似 [CLS]I am eating a banana[SEP] [PAD] [PAD]",{"type":17,"tag":25,"props":114,"children":115},{},[116],{"type":23,"value":117},"句首的[CLS]可用于下游任务，句尾的[SEP]表示句子的结束，[PAD]用于填充，以统一不同句子的长度。",{"type":17,"tag":25,"props":119,"children":120},{},[121,126,127],{"type":17,"tag":66,"props":122,"children":123},{},[124],{"type":23,"value":125},"# 02",{"type":23,"value":72},{"type":17,"tag":66,"props":128,"children":129},{},[130],{"type":23,"value":131},"模型的创新点：",{"type":17,"tag":25,"props":133,"children":134},{},[135,137],{"type":23,"value":136},"1、",{"type":17,"tag":66,"props":138,"children":139},{},[140],{"type":23,"value":141},"BERT 通过 遮蔽语言模型（MLM） 
任务",{"type":17,"tag":25,"props":143,"children":144},{},[145],{"type":23,"value":146},"利用双向上下文来训练模型。在训练时，它随机遮蔽输入中的一些单词（利用[MASK]遮蔽句子中的部分信息），然后让模型同时从左边和右边的上下文信息预测这些单词。",{"type":17,"tag":25,"props":148,"children":149},{},[150,152,157,159,164],{"type":23,"value":151},"在被选中预测的 token（约占输入的 15%）中，",{"type":17,"tag":66,"props":153,"children":154},{},[155],{"type":23,"value":156},"80%",{"type":23,"value":158}," 替换为 ",{"type":17,"tag":66,"props":160,"children":161},{},[162],{"type":23,"value":163},"[MASK]",{"type":23,"value":165},"；10% 替换为随机 token；10% 保持不变。",{"type":17,"tag":25,"props":167,"children":168},{},[169],{"type":17,"tag":170,"props":171,"children":173},"img",{"alt":7,"src":172},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/08/08/90f86b1aa7d1443f8e0c9a86c0c3baf3.png",[],{"type":17,"tag":25,"props":175,"children":176},{},[177,179],{"type":23,"value":178},"Token Embeddings--->",{"type":17,"tag":66,"props":180,"children":181},{},[182],{"type":23,"value":183},"词嵌入",{"type":17,"tag":25,"props":185,"children":186},{},[187,189],{"type":23,"value":188},"Segment Embeddings--->",{"type":17,"tag":66,"props":190,"children":191},{},[192],{"type":23,"value":193},"句子嵌入",{"type":17,"tag":25,"props":195,"children":196},{},[197,199],{"type":23,"value":198},"Position Embeddings--->",{"type":17,"tag":66,"props":200,"children":201},{},[202],{"type":23,"value":203},"位置嵌入(可学习的位置向量，区别于原始Transformer的正余弦编码)",{"type":17,"tag":25,"props":205,"children":206},{},[207,209,214,216,221],{"type":23,"value":208},"输入 BERT 之前，",{"type":17,"tag":66,"props":210,"children":211},{},[212],{"type":23,"value":213},"每个 token 由 词嵌入、句子嵌入和位置嵌入",{"type":23,"value":215}," 组成。它们的 ",{"type":17,"tag":66,"props":217,"children":218},{},[219],{"type":23,"value":220},"向量相加",{"type":23,"value":222}," 后送入 BERT 进行训练或推理。",{"type":17,"tag":25,"props":224,"children":225},{},[226,228],{"type":23,"value":227},"2、",{"type":17,"tag":66,"props":229,"children":230},{},[231],{"type":23,"value":232},"下一句预测（Next Sentence Prediction, 
NSP）",{"type":17,"tag":25,"props":234,"children":235},{},[236],{"type":23,"value":237},"传统方法的局限性：忽视了句子之间的关系。",{"type":17,"tag":25,"props":239,"children":240},{},[241],{"type":23,"value":242},"BERT 的创新：BERT 在预训练时引入了 下一句预测（NSP） 任务，该任务帮助模型理解句子之间的逻辑关系。模型被给定两句文本，任务是判断第二句是否为第一句的下文。",{"type":17,"tag":25,"props":244,"children":245},{},[246,248],{"type":23,"value":247},"3、",{"type":17,"tag":66,"props":249,"children":250},{},[251],{"type":23,"value":252},"BERT 使用了预训练-微调（Pre-training and Fine-tuning）",{"type":17,"tag":25,"props":254,"children":255},{},[256],{"type":17,"tag":66,"props":257,"children":258},{},[259],{"type":23,"value":260},"预训练: 在大规模无标签的语料上进行预训练。",{"type":17,"tag":25,"props":262,"children":263},{},[264],{"type":23,"value":265},"语料：BooksCorpus(Zhu et al., 2015) 和英文维基百科（25亿词，只提取文本段落，忽略列表、表格和标题）。",{"type":17,"tag":25,"props":267,"children":268},{},[269,271,276,278,283],{"type":23,"value":270},"通过",{"type":17,"tag":66,"props":272,"children":273},{},[274],{"type":23,"value":275},"MLM",{"type":23,"value":277},"和",{"type":17,"tag":66,"props":279,"children":280},{},[281],{"type":23,"value":282},"NSP",{"type":23,"value":284},"任务学习语言的通用表示--->好处：能够大大减少对标注数据的依赖。",{"type":17,"tag":25,"props":286,"children":287},{},[288,293],{"type":17,"tag":66,"props":289,"children":290},{},[291],{"type":23,"value":292},"微调:",{"type":23,"value":294}," 模型适应下游任务（如文本分类等）。--->好处：提高了模型在多个任务上的性能。",{"type":17,"tag":25,"props":296,"children":297},{},[298,303,304],{"type":17,"tag":66,"props":299,"children":300},{},[301],{"type":23,"value":302},"# 
03",{"type":23,"value":72},{"type":17,"tag":66,"props":305,"children":306},{},[307],{"type":23,"value":308},"BERT区别于其他模型",{"type":17,"tag":25,"props":310,"children":311},{},[312],{"type":17,"tag":170,"props":313,"children":315},{"alt":7,"src":314},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/08/08/50dcfd44ae014d899eccd7719676383f.png",[],{"type":17,"tag":25,"props":317,"children":318},{},[319,324,325],{"type":17,"tag":66,"props":320,"children":321},{},[322],{"type":23,"value":323},"# 04",{"type":23,"value":72},{"type":17,"tag":66,"props":326,"children":327},{},[328],{"type":23,"value":329},"效果",{"type":17,"tag":25,"props":331,"children":332},{},[333],{"type":17,"tag":66,"props":334,"children":335},{},[336],{"type":23,"value":337},"1、GLUE 测试",{"type":17,"tag":25,"props":339,"children":340},{},[341],{"type":23,"value":342},"BERT 通过 [CLS] 位置的向量 C 进行分类，添加全连接层 W 并使用交叉熵损失进行微调。",{"type":17,"tag":25,"props":344,"children":345},{},[346],{"type":23,"value":347},"效果：BERTLARGE 比 SOTA 提高 7.0% 平均准确率。MNLI 任务提升 4.6% 绝对准确率。BERTLARGE 在小数据集表现更优。",{"type":17,"tag":89,"props":349,"children":350},{},[351],{"type":17,"tag":93,"props":352,"children":353},{},[354],{"type":23,"value":355},"超参数：Batch size = 32，Fine-tune 3 epochs，学习率从 5e-5、4e-5、3e-5、2e-5 中选取最优。",{"type":17,"tag":25,"props":357,"children":358},{},[359],{"type":17,"tag":66,"props":360,"children":361},{},[362],{"type":23,"value":363},"2、SQuAD 任务",{"type":17,"tag":25,"props":365,"children":366},{},[367],{"type":23,"value":368},"采用 [CLS] 预测无答案，两个向量 S 和 E 预测答案起止位置。",{"type":17,"tag":25,"props":370,"children":371},{},[372,374,379,381,386],{"type":23,"value":373},"SQuAD v1.1：BERTLARGE (单模型) ",{"type":17,"tag":66,"props":375,"children":376},{},[377],{"type":23,"value":378},"F1 = 90.9%",{"type":23,"value":380},"，超越 SOTA。集成模型提升至 ",{"type":17,"tag":66,"props":382,"children":383},{},[384],{"type":23,"value":385},"F1 = 
92.2%",{"type":23,"value":387},"。",{"type":17,"tag":25,"props":389,"children":390},{},[391,396],{"type":17,"tag":66,"props":392,"children":393},{},[394],{"type":23,"value":395},"SQuAD v2.0：F1 提高 5.1",{"type":23,"value":397},"，无答案处理能力增强。",{"type":17,"tag":25,"props":399,"children":400},{},[401],{"type":17,"tag":66,"props":402,"children":403},{},[404],{"type":23,"value":405},"3、SWAG 任务",{"type":17,"tag":25,"props":407,"children":408},{},[409],{"type":23,"value":410},"4 个候选句，每个与 sentence A 组成输入，[CLS] 位置表示选择最佳答案。",{"type":17,"tag":25,"props":412,"children":413},{},[414,416,421,423,428],{"type":23,"value":415},"BERTLARGE比",{"type":17,"tag":66,"props":417,"children":418},{},[419],{"type":23,"value":420},"OpenAIGPT 高 8.3%",{"type":23,"value":422},"，比",{"type":17,"tag":66,"props":424,"children":425},{},[426],{"type":23,"value":427},"ELMo提升 27.1%",{"type":23,"value":387},{"type":17,"tag":25,"props":430,"children":431},{},[432,437,438],{"type":17,"tag":66,"props":433,"children":434},{},[435],{"type":23,"value":436},"# 05",{"type":23,"value":72},{"type":17,"tag":66,"props":439,"children":440},{},[441],{"type":23,"value":442},"总结",{"type":17,"tag":89,"props":444,"children":445},{},[446,451],{"type":17,"tag":93,"props":447,"children":448},{},[449],{"type":23,"value":450},"去掉 NSP-->对问答任务影响大。",{"type":17,"tag":93,"props":452,"children":453},{},[454],{"type":23,"value":455},"采用左到右语言模型（LTR）表现下降，说明 BERT 的 双向性 关键。",{"type":17,"tag":25,"props":457,"children":458},{},[459,464,465],{"type":17,"tag":66,"props":460,"children":461},{},[462],{"type":23,"value":463},"# 
06",{"type":23,"value":72},{"type":17,"tag":66,"props":466,"children":467},{},[468],{"type":23,"value":469},"实操",{"type":17,"tag":25,"props":471,"children":472},{},[473],{"type":17,"tag":66,"props":474,"children":475},{},[476],{"type":23,"value":477},"1、Mindspore仓进行基于bert的情感分类预测步骤",{"type":17,"tag":25,"props":479,"children":480},{},[481,483],{"type":23,"value":482},"相关代码已上传到昇思代码仓：",{"type":17,"tag":41,"props":484,"children":487},{"href":485,"rel":486},"https://github.com/mindspore-lab/mindnlp/tree/master/applications/bert",[45],[],{"type":17,"tag":25,"props":489,"children":490},{},[491],{"type":23,"value":492},"[",{"type":17,"tag":25,"props":494,"children":495},{},[496],{"type":17,"tag":41,"props":497,"children":499},{"href":485,"rel":498},[45],[500],{"type":23,"value":485},{"type":17,"tag":25,"props":502,"children":503},{},[504,506,511],{"type":23,"value":505},"](",{"type":17,"tag":41,"props":507,"children":509},{"href":485,"rel":508},[45],[510],{"type":23,"value":485},{"type":23,"value":512},")",{"type":17,"tag":25,"props":514,"children":515},{},[516,518],{"type":23,"value":517},"参考：基于MindSpore的bert模型实验指导书——",{"type":17,"tag":41,"props":519,"children":522},{"href":520,"rel":521},"https://developer.huaweicloud.com/develop/aigallery/notebook/detail?id=ed8761c1-a0d8-4ea7-ab10-5d72cd5467b2",[45],[],{"type":17,"tag":25,"props":524,"children":525},{},[526],{"type":23,"value":492},{"type":17,"tag":25,"props":528,"children":529},{},[530],{"type":17,"tag":41,"props":531,"children":533},{"href":520,"rel":532},[45],[534],{"type":23,"value":520},{"type":17,"tag":25,"props":536,"children":537},{},[538,539,544],{"type":23,"value":505},{"type":17,"tag":41,"props":540,"children":542},{"href":520,"rel":541},[45],[543],{"type":23,"value":520},{"type":23,"value":512},{"type":17,"tag":25,"props":546,"children":547},{},[548],{"type":23,"value":549},"步骤：",{"type":17,"tag":25,"props":551,"children":552},{},[553],{"type":23,"value":554},"1.安装MindSpore框架和MindSpore 
NLP套件；",{"type":17,"tag":25,"props":556,"children":557},{},[558],{"type":23,"value":559},"2.用BERT分词器进行分词；",{"type":17,"tag":25,"props":561,"children":562},{},[563],{"type":23,"value":564},"3.进行模型的训练和推理。",{"type":17,"tag":25,"props":566,"children":567},{},[568],{"type":17,"tag":170,"props":569,"children":571},{"alt":7,"src":570},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/08/08/5c94944cc58044969ec4db182c933571.png",[],{"type":17,"tag":25,"props":573,"children":574},{},[575],{"type":17,"tag":66,"props":576,"children":577},{},[578],{"type":23,"value":579},"2、PyTorch中用于文本分类的示例",{"type":17,"tag":25,"props":581,"children":582},{},[583],{"type":17,"tag":170,"props":584,"children":586},{"alt":7,"src":585},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/08/08/fc12b4232f0a48f996feb4f1e8d268ab.png",[],{"type":17,"tag":25,"props":588,"children":589},{},[590],{"type":23,"value":591},"情感分类验证集性能",{"type":17,"tag":25,"props":593,"children":594},{},[595],{"type":17,"tag":170,"props":596,"children":598},{"alt":7,"src":597},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/08/08/01fe7ffc5aea4bfb87d0dd957979dfce.png",[],{"title":7,"searchDepth":600,"depth":600,"links":601},4,[602],{"id":58,"depth":603,"text":61},2,"markdown","content:technology-blogs:zh:3809.md","content","technology-blogs/zh/3809.md","technology-blogs/zh/3809","md",1776506135642]