[{"data":1,"prerenderedAt":491},["ShallowReactive",2],{"content-query-fVFhsQKJEI":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":485,"_id":486,"_source":487,"_file":488,"_stem":489,"_extension":490},"/technology-blogs/zh/3608","zh",false,"","bertweet模型论文解读，并基于MindSpore NLP推理复现","作者：YoursLLL         来源：知乎","2025-02-11","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/02/14/f494148f1d374815bcb9c5ba623141f7.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":479},"root",[17,25,42,50,54,66,74,79,84,89,94,99,104,109,114,118,122,127,131,135,140,144,148,153,157,161,166,170,174,179,183,188,192,197,201,206,210,215,219,224,229,233,238,246,254,258,262,266,271,276,283,288,295,300,307,312,319,324,332,337,342,347,352,360,368,376,383,388,396,409,420,427,432,440,447,455,463,475],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"bertweet模型论文解读并基于mindspore-nlp推理复现",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29,35,37],{"type":18,"tag":30,"props":31,"children":32},"strong",{},[33],{"type":24,"value":34},"作者：YoursLLL",{"type":24,"value":36}," ",{"type":18,"tag":30,"props":38,"children":39},{},[40],{"type":24,"value":41},"来源：知乎",{"type":18,"tag":26,"props":43,"children":44},{},[45],{"type":18,"tag":30,"props":46,"children":47},{},[48],{"type":24,"value":49},"论文创新点",{"type":18,"tag":51,"props":52,"children":53},"h2",{"id":7},[],{"type":18,"tag":26,"props":55,"children":56},{},[57,59,64],{"type":24,"value":58},"BERTweet是首个大规模预训练的英语推文语言模型，基于BERTbase架构并使用RoBERTa的预训练方法。论文通过实验验证了BERTweet在以下推文自然语言处理任务上的性能优越性：",{"type":18,"tag":30,"props":60,"children":61},{},[62],{"type":24,"value":63},"词性标注（POS 
tagging）、命名实体识别（NER）和文本分类",{"type":24,"value":65},"。通过850M条推文的大规模英语推文数据进行训练，BERTweet在所有任务上均优于现有的强基线模型，如RoBERTa和XLM-R。",{"type":18,"tag":26,"props":67,"children":68},{},[69],{"type":18,"tag":30,"props":70,"children":71},{},[72],{"type":24,"value":73},"Roberta概述",{"type":18,"tag":26,"props":75,"children":76},{},[77],{"type":24,"value":78},"RoBERTa本质上是BERT，它只是在预训练中有以下变化：",{"type":18,"tag":26,"props":80,"children":81},{},[82],{"type":24,"value":83},"1、在掩码语言模型构建任务中使用动态掩码而不是静态掩码。",{"type":18,"tag":26,"props":85,"children":86},{},[87],{"type":24,"value":88},"2、不执行下句预测任务，只用掩码语言模型构建任务进行训练。",{"type":18,"tag":26,"props":90,"children":91},{},[92],{"type":24,"value":93},"3、以大批量的方式进行训练。",{"type":18,"tag":26,"props":95,"children":96},{},[97],{"type":24,"value":98},"4、使用字节级字节对编码作为子词词元化算法。",{"type":18,"tag":26,"props":100,"children":101},{},[102],{"type":24,"value":103},"具体表现在什么地方呢，我们举一些例子来说明：",{"type":18,"tag":26,"props":105,"children":106},{},[107],{"type":24,"value":108},"**",{"type":18,"tag":26,"props":110,"children":111},{},[112],{"type":24,"value":113},"1. 动态掩码 vs. 
静态掩码",{"type":18,"tag":26,"props":115,"children":116},{},[117],{"type":24,"value":108},{"type":18,"tag":26,"props":119,"children":120},{},[121],{"type":24,"value":108},{"type":18,"tag":26,"props":123,"children":124},{},[125],{"type":24,"value":126},"BERT：在训练时，每个句子的掩码是预先固定的，比如对句子“猫在玩球”应用掩码，可能会得到“[MASK]在玩球”。在整个训练过程中，这个掩码方式保持不变（即在整个训练过程中，被掩盖掉的词总是“猫”）。",{"type":18,"tag":26,"props":128,"children":129},{},[130],{"type":24,"value":108},{"type":18,"tag":26,"props":132,"children":133},{},[134],{"type":24,"value":108},{"type":18,"tag":26,"props":136,"children":137},{},[138],{"type":24,"value":139},"RoBERTa：采用动态掩码，每次训练时，句子的掩码位置会有所变化，例如句子“猫在玩球”在一次训练中可能变成“[MASK]在玩球”，而在另一轮训练中可能变成“猫在[MASK]球”，使模型能在不同上下文中学习更广泛的词语语义。",{"type":18,"tag":26,"props":141,"children":142},{},[143],{"type":24,"value":108},{"type":18,"tag":26,"props":145,"children":146},{},[147],{"type":24,"value":108},{"type":18,"tag":26,"props":149,"children":150},{},[151],{"type":24,"value":152},"2. 使用字节级字节对编码（Byte-Pair Encoding, BPE）作为子词词元化算法",{"type":18,"tag":26,"props":154,"children":155},{},[156],{"type":24,"value":108},{"type":18,"tag":26,"props":158,"children":159},{},[160],{"type":24,"value":108},{"type":18,"tag":26,"props":162,"children":163},{},[164],{"type":24,"value":165},"BERT：使用 WordPiece 词元化算法，将句子拆分为词元（token），例如“unhappiness”可能被分为“un-”， “happi-”和“-ness”。",{"type":18,"tag":26,"props":167,"children":168},{},[169],{"type":24,"value":108},{"type":18,"tag":26,"props":171,"children":172},{},[173],{"type":24,"value":108},{"type":18,"tag":26,"props":175,"children":176},{},[177],{"type":24,"value":178},"RoBERTa：采用 BPE 子词词元化算法，以更细粒度的字节级别进行分词。像Emoji这样的特殊字符也会被分为多个字节级子词。这样，RoBERTa 可以处理更广泛的文本格式，包括非常见字符和表情符号。在Bertweet的数据训练主要来源于推文，在我的理解中，BPE对于Bertweet相当于关键的心脏部分。",{"type":18,"tag":26,"props":180,"children":181},{},[182],{"type":24,"value":108},{"type":18,"tag":26,"props":184,"children":185},{},[186],{"type":24,"value":187},"假设词语为 
internationalization：",{"type":18,"tag":26,"props":189,"children":190},{},[191],{"type":24,"value":108},{"type":18,"tag":26,"props":193,"children":194},{},[195],{"type":24,"value":196},"BERT（WordPiece）：将长词分解为更小的词根或词缀，比如：",{"type":18,"tag":26,"props":198,"children":199},{},[200],{"type":24,"value":108},{"type":18,"tag":26,"props":202,"children":203},{},[204],{"type":24,"value":205},"\"internationalization\" -> [\"inter\", \"##national\", \"##ization\"]",{"type":18,"tag":26,"props":207,"children":208},{},[209],{"type":24,"value":108},{"type":18,"tag":26,"props":211,"children":212},{},[213],{"type":24,"value":214},"RoBERTa（BPE）：使用字节级分词算法，根据子词频率，按字节进行拆分：",{"type":18,"tag":26,"props":216,"children":217},{},[218],{"type":24,"value":108},{"type":18,"tag":26,"props":220,"children":221},{},[222],{"type":24,"value":223},"\"internationalization\" -> [\"international\", \"ization\"]",{"type":18,"tag":26,"props":225,"children":226},{},[227],{"type":24,"value":228},"在这种情况下，如果词干和后缀的组合频繁出现，RoBERTa 更倾向于保持这些高频的子词完整。",{"type":18,"tag":26,"props":230,"children":231},{},[232],{"type":24,"value":108},{"type":18,"tag":26,"props":234,"children":235},{},[236],{"type":24,"value":237},"包含特殊字符Emoji的句子示例：",{"type":18,"tag":26,"props":239,"children":240},{},[241],{"type":18,"tag":242,"props":243,"children":245},"img",{"alt":7,"src":244},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/02/14/b153063784f547dbaf18ac95b967276e.png",[],{"type":18,"tag":26,"props":247,"children":248},{},[249],{"type":18,"tag":30,"props":250,"children":251},{},[252],{"type":24,"value":253},"数据集上的指标评价得分",{"type":18,"tag":26,"props":255,"children":256},{},[257],{"type":24,"value":108},{"type":18,"tag":51,"props":259,"children":261},{"id":260},"_1",[],{"type":18,"tag":26,"props":263,"children":264},{},[265],{"type":24,"value":108},{"type":18,"tag":26,"props":267,"children":268},{},[269],{"type":24,"value":270},"BERTweet在不同推文数据集上的主要任务评价指标表现如下：",{"type":18,"tag":26,"props":272,"children":273},{},[274],
{"type":24,"value":275},"词性标注（POS Tagging）：在Ritter11、ARK-Twitter和TWEEBANK-V2数据集上，BERTweet的准确率（Accuracy）分别为90.1%、94.1%、95.2%。",{"type":18,"tag":26,"props":277,"children":278},{},[279],{"type":18,"tag":242,"props":280,"children":282},{"alt":7,"src":281},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/02/14/7f6e72f77fdf49da8fc25d7c322dd7ca.png",[],{"type":18,"tag":26,"props":284,"children":285},{},[286],{"type":24,"value":287},"**命名实体识别（NER）：**在WNUT16和WNUT17数据集上，BERTweet的F1分数为52.1%和56.5%。",{"type":18,"tag":26,"props":289,"children":290},{},[291],{"type":18,"tag":242,"props":292,"children":294},{"alt":7,"src":293},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/02/14/29fac720fea24868abe4487affc3ce07.png",[],{"type":18,"tag":26,"props":296,"children":297},{},[298],{"type":24,"value":299},"**文本分类：**在SemEval2017-Task4A和SemEval2018-Task3A数据集上的F1得分分别为72.8%和74.6%，均刷新了现有的最优结果。",{"type":18,"tag":26,"props":301,"children":302},{},[303],{"type":18,"tag":242,"props":304,"children":306},{"alt":7,"src":305},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/02/14/60f98a109cb04f1d99a61d92b8d03efc.png",[],{"type":18,"tag":26,"props":308,"children":309},{},[310],{"type":24,"value":311},"在 SemEval2017-Task4A 数据集上的得分表现",{"type":18,"tag":26,"props":313,"children":314},{},[315],{"type":18,"tag":242,"props":316,"children":318},{"alt":7,"src":317},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/02/14/cb84a51d441048f7b70a117d1489090c.png",[],{"type":18,"tag":26,"props":320,"children":321},{},[322],{"type":24,"value":323},"在 SemEval2018-Task3A 
数据集上的得分表现",{"type":18,"tag":26,"props":325,"children":326},{},[327],{"type":18,"tag":30,"props":328,"children":329},{},[330],{"type":24,"value":331},"创新点优势",{"type":18,"tag":26,"props":333,"children":334},{},[335],{"type":24,"value":336},"相较于以往的模型采用正规的、书面化的语言来做数据源（维基百科等）训练，BERTweet的主要创新点在于其专门针对推文数据进行预训练，从而有效处理推文特有的非正式表达方式（如缩写、拼写错误和表情符号）。相较于RoBERTa和XLM-R等模型，BERTweet采用了基于领域的预训练数据，使得其在处理推文时更具表现力。具体优势包括：",{"type":18,"tag":26,"props":338,"children":339},{},[340],{"type":24,"value":341},"1. 专注于推文领域：BERTweet利用850M条推文数据进行训练，而RoBERTa等模型则基于常规的书籍和新闻数据，这使得BERTweet在推文的特定任务上表现更佳。",{"type":18,"tag":26,"props":343,"children":344},{},[345],{"type":24,"value":346},"2. 高效的预训练方法：尽管BERTweet的数据量小于XLM-R和RoBERTa的训练数据，但通过RoBERTa的优化预训练方法，模型在处理推文任务时能更好地提取领域特征。",{"type":18,"tag":26,"props":348,"children":349},{},[350],{"type":24,"value":351},"3. 丰富的词汇处理能力：BERTweet对推文中特有的词汇进行了处理，例如用户提及和URL等符号的规范化，这使其在处理推文内容时具备较好的泛化能力。",{"type":18,"tag":26,"props":353,"children":354},{},[355],{"type":18,"tag":30,"props":356,"children":357},{},[358],{"type":24,"value":359},"基于 BERTweet，本文使用两种框架分别对两个数据集进行了推理验证：",{"type":18,"tag":26,"props":361,"children":362},{},[363],{"type":18,"tag":30,"props":364,"children":365},{},[366],{"type":24,"value":367},"核心代码切片如下：",{"type":18,"tag":26,"props":369,"children":370},{},[371],{"type":18,"tag":30,"props":372,"children":373},{},[374],{"type":24,"value":375},"情感分类任务（IMDB）数据集：",{"type":18,"tag":26,"props":377,"children":378},{},[379],{"type":18,"tag":242,"props":380,"children":382},{"alt":7,"src":381},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/02/14/38c80b1121024bfc8b76e4eba185ac2a.png",[],{"type":18,"tag":26,"props":384,"children":385},{},[386],{"type":24,"value":387},"MindSpore 
NLP加载模型并以IMDB数据集训练",{"type":18,"tag":26,"props":389,"children":390},{},[391],{"type":18,"tag":30,"props":392,"children":393},{},[394],{"type":24,"value":395},"命名实体类任务(Tweebank数据集)：",{"type":18,"tag":26,"props":397,"children":398},{},[399,401],{"type":24,"value":400},"数据集地址: ",{"type":18,"tag":402,"props":403,"children":407},"a",{"href":404,"rel":405},"https://github.com/Oneplus/Tweebank",[406],"nofollow",[408],{"type":24,"value":404},{"type":18,"tag":26,"props":410,"children":411},{},[412,414],{"type":24,"value":413},"数据集处理与验证代码仓库：",{"type":18,"tag":402,"props":415,"children":418},{"href":416,"rel":417},"https://gitee.com/laizhenglin2024/bertweet-thesis",[406],[419],{"type":24,"value":416},{"type":18,"tag":26,"props":421,"children":422},{},[423],{"type":18,"tag":242,"props":424,"children":426},{"alt":7,"src":425},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/02/14/27f6f90dca784f6bad592c38ce1253d6.png",[],{"type":18,"tag":26,"props":428,"children":429},{},[430],{"type":24,"value":431},"MindSpore NLP加载模型并以Tweebank数据集训练",{"type":18,"tag":26,"props":433,"children":434},{},[435],{"type":18,"tag":30,"props":436,"children":437},{},[438],{"type":24,"value":439},"在Transformers和MindSpore NLP两个框架下加载相同数据集的得分对比：",{"type":18,"tag":26,"props":441,"children":442},{},[443],{"type":18,"tag":242,"props":444,"children":446},{"alt":7,"src":445},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/02/14/eae238acc16e49f98c4c3510ea597159.png",[],{"type":18,"tag":26,"props":448,"children":449},{},[450],{"type":18,"tag":30,"props":451,"children":452},{},[453],{"type":24,"value":454},"从得分上来看MindSpore NLP还是略微弱于Transformers，毕竟Transformers在自然语言处理社区中广泛应用，并经历了大量的优化和调优。它拥有丰富的模型库和大量的社区贡献，这使得它在标准任务上的表现更佳。而MindSpore NLP相对较新，尽管具有潜力，但在特定任务和预训练模型的优化上可能还不如Transformers成熟。但我相信经过国内社区生态的慢慢建设发展，MindSpore 
NLP一定能够表现得更好。",{"type":18,"tag":26,"props":456,"children":457},{},[458],{"type":18,"tag":30,"props":459,"children":460},{},[461],{"type":24,"value":462},"总结",{"type":18,"tag":26,"props":464,"children":465},{},[466,468,473],{"type":24,"value":467},"BERTweet是一款专门为英语推文设计的大规模预训练语言模型，通过RoBERTa方法在850M条推文数据上训练，使其在推文特定任务（POS、NER、文本分类）上达到最优表现。与现有的强基线模型相比，BERTweet在领域内表现出色，为推文自然语言处理任务提供了一个强大且有效的模型，能够有效捕获推文中非正式表达的特点。研究的结果验证了针对",{"type":18,"tag":30,"props":469,"children":470},{},[471],{"type":24,"value":472},"特定领域数据的预训练模型",{"type":24,"value":474},"在特定任务上的优势，未来可能会进一步扩展BERTweet模型至更大规模以提升任务表现。",{"type":18,"tag":26,"props":476,"children":477},{},[478],{"type":24,"value":108},{"title":7,"searchDepth":480,"depth":480,"links":481},4,[482,484],{"id":7,"depth":483,"text":7},2,{"id":260,"depth":483,"text":7},"markdown","content:technology-blogs:zh:3608.md","content","technology-blogs/zh/3608.md","technology-blogs/zh/3608","md",1776506132103]