[{"data":1,"prerenderedAt":557},["ShallowReactive",2],{"content-query-5lIncGWdm2":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":551,"_id":552,"_source":553,"_file":554,"_stem":555,"_extension":556},"/technology-blogs/zh/391","zh",false,"","让BERT瘦下来  MindSpore量化训练极低比特语言模型 TernaryBERT","在GLUE和SQuAD上进行的实验表明，我们提出的TernaryBERT量化方法优于其他的BERT量化方法，甚至可以达到与全精度模型相当的性能，同时将模型缩小了14.9倍。现在TernaryBERT的开源代码已经在MindSpore上首发了。","2021-02-05","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/02/09/6ae749f1ed704824be939a608bb96e86.png","technology-blogs",{"type":14,"children":15,"toc":545},"root",[16,24,33,41,46,53,58,63,76,81,89,94,103,110,115,120,125,130,135,140,145,150,155,160,167,172,177,182,187,192,197,204,209,214,221,226,235,240,245,252,257,264,269,278,283,288,293,298,305,310,319,326,333,338,345,350,355,365,371,376,383,388,397,404,409,417,422,429,434,439,446,451,456,463,468,473,480,485,490,497,502,507,512,523,528,533,538],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"让bert瘦下来-mindspore量化训练极低比特语言模型-ternarybert",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":17,"tag":29,"props":30,"children":32},"img",{"alt":7,"src":31},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/02/09/07d77e0c18a84198b87318fd4da1a6aa.png",[],{"type":17,"tag":25,"props":34,"children":35},{},[36],{"type":17,"tag":29,"props":37,"children":40},{"alt":38,"src":39},"image.png","https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/205241s3qsk1mouwsrd7jx.png",[],{"type":17,"tag":25,"props":42,"children":43},{},[44],{"type":23,"value":45},"基于Transformer的预训练模型如BERT在许多自然语言处理任务中都取得了显著的性能。然而，这些模型昂贵的计算和内存都阻碍了它们在资源受限设备上的部署。因此，我们提出了TernaryBERT，它将微调的BERT模型中权值三值化。此外，为了减少低比特导致的精度下降，我们在训练过程中采用了知识蒸馏技术。在GLUE和SQuAD上进行的实验表明，我们提出的TernaryBERT量化方法优于其他的BERT量化方法，甚至可以达到与全精度模型相当的性能，同时将模型缩小了14.9倍。现在Ternary
BERT的开源代码已经在MindSpore上首发了。",{"type":17,"tag":25,"props":47,"children":48},{},[49],{"type":17,"tag":29,"props":50,"children":52},{"alt":38,"src":51},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/205303wamwdkgnk0btwmvm.png",[],{"type":17,"tag":25,"props":54,"children":55},{},[56],{"type":23,"value":57},"图1：不同算法的模型尺寸与MNLI-m精度对比。",{"type":17,"tag":25,"props":59,"children":60},{},[61],{"type":23,"value":62},"我们提出的方法（红色方块）优于其他的BERT压缩方法。",{"type":17,"tag":25,"props":64,"children":65},{},[66,68],{"type":23,"value":67},"图片来源：",{"type":17,"tag":69,"props":70,"children":74},"a",{"href":71,"rel":72},"https://arxiv.org/abs/2009.12812",[73],"nofollow",[75],{"type":23,"value":71},{"type":17,"tag":25,"props":77,"children":78},{},[79],{"type":23,"value":80},"论文链接：",{"type":17,"tag":25,"props":82,"children":83},{},[84],{"type":17,"tag":69,"props":85,"children":87},{"href":71,"rel":86},[73],[88],{"type":23,"value":71},{"type":17,"tag":25,"props":90,"children":91},{},[92],{"type":23,"value":93},"开源地址：",{"type":17,"tag":25,"props":95,"children":96},{},[97],{"type":17,"tag":69,"props":98,"children":101},{"href":99,"rel":100},"https://gitee.com/mindspore/mindspore/tree/master",[73],[102],{"type":23,"value":99},{"type":17,"tag":25,"props":104,"children":105},{},[106],{"type":17,"tag":29,"props":107,"children":109},{"alt":38,"src":108},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/205336wq2aolugbcsgb8op.png",[],{"type":17,"tag":25,"props":111,"children":112},{},[113],{"type":23,"value":114},"BERT模型由Transformer层构成。标准的Transformer层包括两个主要的子层：多头注意力（MHA）模块和前馈网络（FFN）。",{"type":17,"tag":25,"props":116,"children":117},{},[118],{"type":23,"value":119},"对于第 个Transformer层，假设输入为 ，其中 和 分别是序列长度和隐藏状态大小。假设每层都有 个attention头，头部 由 参数化，其中 。通过query和key的点积计算attention 
score。",{"type":17,"tag":25,"props":121,"children":122},{},[123],{"type":23,"value":124},"将softmax函数应用于归一化的分数以得到",{"type":17,"tag":25,"props":126,"children":127},{},[128],{"type":23,"value":129},"。 ，",{"type":17,"tag":25,"props":131,"children":132},{},[133],{"type":23,"value":134},"其中 可以是 。多头注意力的输出是：",{"type":17,"tag":25,"props":136,"children":137},{},[138],{"type":23,"value":139},"FFN层由两个线性层组成，分别由 和 参数化，其中 是FFN的intermediate层的神经元数目。将FFN的输入表示为 ，然后将输出计算为：",{"type":17,"tag":25,"props":141,"children":142},{},[143],{"type":23,"value":144},"结合上面两式，第l个Transformer层的前向传播可以写成：",{"type":17,"tag":25,"props":146,"children":147},{},[148],{"type":23,"value":149},"其中 是层归一化。第一个Transformer层的输入是token embedding、segment embedding和position embedding的结合。这里 是输入序列, 、、分别是可学习的word embedding、segment embedding和position embedding。",{"type":17,"tag":25,"props":151,"children":152},{},[153],{"type":23,"value":154},"对于权重量化，我们量化来自所有Transformer层中的权重",{"type":17,"tag":25,"props":156,"children":157},{},[158],{"type":23,"value":159},"、、、、、 ,以及word embedding中的 。除了这些权重外，我们还量化了前向传播中所有线性层的输入和矩阵乘法算子。我们不量化 、 和线性层中的bias，因为它们所涉及的参数可以忽略不计。我们也不量化softmax算子、层归一化和最后一个任务特定层，因为这些算子中包含的参数可以忽略不计，并且量化它们会导致显著的精度下降。",{"type":17,"tag":25,"props":161,"children":162},{},[163],{"type":17,"tag":29,"props":164,"children":166},{"alt":7,"src":165},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/02/09/6ce2bc30fdd446efbf98d0c4fc45c205.png",[],{"type":17,"tag":25,"props":168,"children":169},{},[170],{"type":23,"value":171},"下面我们将讨论下图中权重三值化函数 的选择。",{"type":17,"tag":25,"props":173,"children":174},{},[175],{"type":23,"value":176},"权重三值化在ternary-connect（Z. Lin, M. Courbariaux, R. Memisevic, and Y. Bengio. 2016. Neural networks with few multiplications.）中首创，其中三值化可以取通过2-bit表示的 。通过三值化，将前向过程中的大部分浮点乘法转换为浮点加法，大大减少了计算量和内存。通过添加缩放参数可以获得更好的结果。因此，为了将BERT的权重三值化，我们使用了基于近似的三值化方法TWN（F. Li, B. Zhang, and B. Liu. 2016. 
Ternary weight networks.），其中三元权重 可由缩放参数 和三元向量 的乘积表示为 。这里 是 中的元素个数。",{"type":17,"tag":25,"props":178,"children":179},{},[180],{"type":23,"value":181},"在第t次训练迭代中，TWN通过最小化全精度权重 与三值化的权重 之间的距离来实现权重的三值化，我们将上述问题定义为如下的优化问题：",{"type":17,"tag":25,"props":183,"children":184},{},[185],{"type":23,"value":186},"设 是一个阈值函数，如果 ，则 ，若 ，则 ，其他情况下 ，其中 是一个正数阈值。设 为元素乘法，上式的最优解满足：",{"type":17,"tag":25,"props":188,"children":189},{},[190],{"type":23,"value":191},"的精确解需要昂贵的排序操作。因此，TWN给出了近似的阈值 。",{"type":17,"tag":25,"props":193,"children":194},{},[195],{"type":23,"value":196},"在TWN的原始论文中，每个卷积层或全连接层都使用一个缩放参数。本文将缩放参数扩展到以下两个粒度：（i）layer-wise三值化，对每个权重矩阵中的所有元素使用一个缩放参数；（ii）row-wise三值化，对权重矩阵中的每一行使用一个缩放参数。随着缩放参数的增加，row-wise三值化具有更细的粒度和更小的量化误差。",{"type":17,"tag":25,"props":198,"children":199},{},[200],{"type":17,"tag":29,"props":201,"children":203},{"alt":38,"src":202},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/210105qpnx3tlttxtdmut6.png",[],{"type":17,"tag":25,"props":205,"children":206},{},[207],{"type":23,"value":208},"为了使最昂贵的矩阵乘法运算更快，本文将激活（即所有线性层和矩阵乘法的输入）量化为8-bit。常用的8-bit量化方法有两种：对称和最小-最大8-bit量化。对称8-bit量化的量化值在0的两侧对称分布，而最小-最大8-bit量化的量化值均匀分布在由最小值和最大值确定的范围内。",{"type":17,"tag":25,"props":210,"children":211},{},[212],{"type":23,"value":213},"我们发现BERT中的Transformer层的隐藏层的分布趋于负值。这种偏差在前面的层中更为明显。因此，我们对激活值使用最小-最大8-bit量化，因为它更好地解决了非对称分布。",{"type":17,"tag":25,"props":215,"children":216},{},[217],{"type":17,"tag":29,"props":218,"children":220},{"alt":38,"src":219},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/210131j1vwmrymahnvbmes.png",[],{"type":17,"tag":25,"props":222,"children":223},{},[224],{"type":23,"value":225},"图2：在SQuAD 
v1.1上训练的全精度BERT的第1和第6层隐藏层的分布。",{"type":17,"tag":25,"props":227,"children":228},{},[229,230],{"type":23,"value":67},{"type":17,"tag":69,"props":231,"children":233},{"href":71,"rel":232},[73],[234],{"type":23,"value":71},{"type":17,"tag":25,"props":236,"children":237},{},[238],{"type":23,"value":239},"具体而言，对于激活值 中的一个元素 ，表示 和 ，最小-最大8-bit量化函数为",{"type":17,"tag":25,"props":241,"children":242},{},[243],{"type":23,"value":244},"其中 是缩放参数。我们使用直通估计器反向传播量化后的激活值的梯度。",{"type":17,"tag":25,"props":246,"children":247},{},[248],{"type":17,"tag":29,"props":249,"children":251},{"alt":7,"src":250},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/02/09/fb00ea035ebd45afa03a39e0e58ee775.png",[],{"type":17,"tag":25,"props":253,"children":254},{},[255],{"type":23,"value":256},"量化的BERT使用低比特数值来表示模型权重和激活值。因此，与全精度的对应模型相比，它的信息容量相对较低，性能更差。为了解决这一问题，我们结合了知识蒸馏技术来提高量化的BERT的性能。在这个知识蒸馏框架中，量化的BERT作为学生模型，学习去恢复Transformer层和预测层上的全精度的教师模型的行为。",{"type":17,"tag":25,"props":258,"children":259},{},[260],{"type":17,"tag":29,"props":261,"children":263},{"alt":38,"src":262},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/210227dkri9g1mqupgi6ff.png",[],{"type":17,"tag":25,"props":265,"children":266},{},[267],{"type":23,"value":268},"图3：BERT模型的蒸馏感知三值化描述。",{"type":17,"tag":25,"props":270,"children":271},{},[272,273],{"type":23,"value":67},{"type":17,"tag":69,"props":274,"children":276},{"href":71,"rel":275},[73],[277],{"type":23,"value":71},{"type":17,"tag":25,"props":279,"children":280},{},[281],{"type":23,"value":282},"具体来说，Transformer层 的蒸馏目标包括两部分。第一部分是蒸馏损失，它将全精度教师模型的embedding层和所有Transformer层的输出提取到量化学生模型中，通过均方误差损失（MSE）： 。第二部分是从教师模型的attention score中提取知识的蒸馏损失，从每个Transformer层的所有头部 到学生模型的attention score ，即 。因此，Transformer层 的蒸馏公式如下：",{"type":17,"tag":25,"props":284,"children":285},{},[286],{"type":23,"value":287},"除了Transformer层外，我们还在预测层提取知识，使学生模型的logits 通过soft cross-entropy（SCE）损失从教师模型中学习拟合 
：",{"type":17,"tag":25,"props":289,"children":290},{},[291],{"type":23,"value":292},"因此，在TernaryBERT训练过程中进行知识提炼的总体目标是：",{"type":17,"tag":25,"props":294,"children":295},{},[296],{"type":23,"value":297},"我们使用对下游任务进行微调的全精度BERT初始化我们的量化模型，并使用数据增广方法（X. Jiao, Y. Yin, L. Shang, X. Jiang, X. Chen, L. Li, F. Wang, and Q. Liu. 2019. Tinybert: Distilling bert for natural language understanding.）来提高性能。整个过程称为蒸馏感知三值化，如算法1所示。",{"type":17,"tag":25,"props":299,"children":300},{},[301],{"type":17,"tag":29,"props":302,"children":304},{"alt":38,"src":303},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/210325kak0vqidekbq4uz6.png",[],{"type":17,"tag":25,"props":306,"children":307},{},[308],{"type":23,"value":309},"图4：算法1。",{"type":17,"tag":25,"props":311,"children":312},{},[313,314],{"type":23,"value":67},{"type":17,"tag":69,"props":315,"children":317},{"href":71,"rel":316},[73],[318],{"type":23,"value":71},{"type":17,"tag":25,"props":320,"children":321},{},[322],{"type":17,"tag":29,"props":323,"children":325},{"alt":38,"src":324},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/210408rej6vipzky44bfyt.png",[],{"type":17,"tag":327,"props":328,"children":330},"h2",{"id":329},"和bert量化算法对比的结果",[331],{"type":23,"value":332},"和BERT量化算法对比的结果",{"type":17,"tag":25,"props":334,"children":335},{},[336],{"type":23,"value":337},"表1显示了GLUE基准的开发集结果。从表1中我们发现：1）对于2-bit权重，由于模型容量的急剧减少，Q-BERT（或Q2BERT）与全精度BERT之间存在很大的差距。TernaryBERT的性能明显优于Q-BERT和Q2BERT，即使word 
embedding的比特数更少。同时，TernaryBERT以14.9倍更小的尺寸实现了与全精度基线相当的性能。2）当权值的位数增加到8时，所有量化模型的性能都得到了极大的改善，甚至可以与全精度基线相媲美，这表明设置8-8-8对BERT来说并不具有挑战性。我们提出的方法在MNLI和SST-2上都优于Q-BERT，在8个任务中有7个优于Q8BERT。3）TWN和LAT在所有任务上都取得了相似的结果，表明两种三值化方法都具有竞争力。",{"type":17,"tag":25,"props":339,"children":340},{},[341],{"type":17,"tag":29,"props":342,"children":344},{"alt":38,"src":343},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/210436femkqgcfruuvaluk.png",[],{"type":17,"tag":25,"props":346,"children":347},{},[348],{"type":23,"value":349},"表1：GLUE基准上量化的BERT和TinyBERT的开发集结果。",{"type":17,"tag":25,"props":351,"children":352},{},[353],{"type":23,"value":354},"我们将Transformer层权重、word embedding和激活的位数缩写为“W-E-A（#位）”。",{"type":17,"tag":25,"props":356,"children":357},{},[358,360],{"type":23,"value":359},"表格来源：",{"type":17,"tag":69,"props":361,"children":363},{"href":71,"rel":362},[73],[364],{"type":23,"value":71},{"type":17,"tag":327,"props":366,"children":368},{"id":367},"和其他bert压缩方法对比",[369],{"type":23,"value":370},"和其他BERT压缩方法对比",{"type":17,"tag":25,"props":372,"children":373},{},[374],{"type":23,"value":375},"从表2可以看出，与量化以外的其他常用的BERT压缩方法相比，本文提出的方法可以获得相似或更好的性能，但要小得多。",{"type":17,"tag":25,"props":377,"children":378},{},[379],{"type":17,"tag":29,"props":380,"children":382},{"alt":38,"src":381},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/210459bbcu6k7r5ziylqnn.png",[],{"type":17,"tag":25,"props":384,"children":385},{},[386],{"type":23,"value":387},"表2：在MNLI-m上，TernaryBERT与其他压缩方法的比较。",{"type":17,"tag":25,"props":389,"children":390},{},[391,392],{"type":23,"value":359},{"type":17,"tag":69,"props":393,"children":395},{"href":71,"rel":394},[73],[396],{"type":23,"value":71},{"type":17,"tag":25,"props":398,"children":399},{},[400],{"type":17,"tag":29,"props":401,"children":403},{"alt":38,"src":402},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/210534uerqnkvvoih3djrl.png",[],{"type":17,"tag":25,"prop
s":405,"children":406},{},[407],{"type":23,"value":408},"相关训练与推理代码，以及使用方法已经开源在：",{"type":17,"tag":25,"props":410,"children":411},{},[412],{"type":17,"tag":69,"props":413,"children":415},{"href":99,"rel":414},[73],[416],{"type":23,"value":99},{"type":17,"tag":25,"props":418,"children":419},{},[420],{"type":23,"value":421},"为了方便大家验证我们的结果以及创新，我们将模型的结构，以及超参数的设置汇总到了相关的代码仓的/script文件夹。src/config.py中存放了配置信息。参数设置以GPU训练脚本train.sh为例：",{"type":17,"tag":25,"props":423,"children":424},{},[425],{"type":17,"tag":29,"props":426,"children":428},{"alt":38,"src":427},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/210556jy5fy6rimqezygx5.png",[],{"type":17,"tag":25,"props":430,"children":431},{},[432],{"type":23,"value":433},"图5：训练脚本",{"type":17,"tag":25,"props":435,"children":436},{},[437],{"type":23,"value":438},"如果想切换其他的glue数据集，只需要在--task_name的位置将上图中的sts-b更改即可。若想使用自己的数据集，可以参考src/dataset.py中构造数据pipeline的代码。只需事先将文本数据转换成需要的输入格式，然后封装为tfrecord或者mindrecord格式，就可以使用pipeline进行读取。",{"type":17,"tag":25,"props":440,"children":441},{},[442],{"type":17,"tag":29,"props":443,"children":445},{"alt":38,"src":444},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/210621rqw5mkx05ziwoyxj.png",[],{"type":17,"tag":25,"props":447,"children":448},{},[449],{"type":23,"value":450},"图6：构造数据pipeline的代码",{"type":17,"tag":25,"props":452,"children":453},{},[454],{"type":23,"value":455},"TernaryBERT的模型结构的定义和激活伪量化操作放在/src/tinybert_model.py中。用户可以在这里手动插入激活的伪量化结点或更改网络结构。",{"type":17,"tag":25,"props":457,"children":458},{},[459],{"type":17,"tag":29,"props":460,"children":462},{"alt":38,"src":461},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/210642nf0fyex7bx9b0dez.png",[],{"type":17,"tag":25,"props":464,"children":465},{},[466],{"type":23,"value":467},"图7：可以灵活插入激活伪量化结点",{"type":17,"tag":25,"props":469,"children":470},{},[471],{"type":23,"value":472},"src/cell_wrapper.py封装了训练相关的类以及权重的伪量化操作。",{"type":17,
"tag":25,"props":474,"children":475},{},[476],{"type":17,"tag":29,"props":477,"children":479},{"alt":38,"src":478},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/210704nipkevlwl4i7c0ea.png",[],{"type":17,"tag":25,"props":481,"children":482},{},[483],{"type":23,"value":484},"图8：权重伪量化操作",{"type":17,"tag":25,"props":486,"children":487},{},[488],{"type":23,"value":489},"最后，MindSpore的model_zoo中存放有TernaryBERT针对MNLI-m、QNLI和STS-B对应的训练脚本。模型均可达到论文中所述精度。",{"type":17,"tag":25,"props":491,"children":492},{},[493],{"type":17,"tag":29,"props":494,"children":496},{"alt":38,"src":495},"https://bbs-img-cbc-cn.obs.cn-north-1.myhuaweicloud.com/data/attachment/forum/202102/08/210727dcmrjxy3exttvl5z.png",[],{"type":17,"tag":25,"props":498,"children":499},{},[500],{"type":23,"value":501},"表3：在MNLI-m、QNLI和STS-B上，",{"type":17,"tag":25,"props":503,"children":504},{},[505],{"type":23,"value":506},"通过mindspore实现的TernaryBERT的精度。",{"type":17,"tag":25,"props":508,"children":509},{},[510],{"type":23,"value":511},"MindSpore官方资料",{"type":17,"tag":25,"props":513,"children":514},{},[515,517],{"type":23,"value":516},"GitHub : ",{"type":17,"tag":69,"props":518,"children":521},{"href":519,"rel":520},"https://github.com/mindspore-ai/mindspore",[73],[522],{"type":23,"value":519},{"type":17,"tag":25,"props":524,"children":525},{},[526],{"type":23,"value":527},"Gitee:https : //gitee.com/mindspore/mindspore",{"type":17,"tag":25,"props":529,"children":530},{},[531],{"type":23,"value":532},"官方QQ群 : 
871543426",{"type":17,"tag":25,"props":534,"children":535},{},[536],{"type":23,"value":537},"扫描下方二维码加入MindSpore项目",{"type":17,"tag":25,"props":539,"children":540},{},[541],{"type":17,"tag":29,"props":542,"children":544},{"alt":7,"src":543},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/02/09/2eaa688291554bba8d96002cf0e8c0bb.jpg",[],{"title":7,"searchDepth":546,"depth":546,"links":547},4,[548,550],{"id":329,"depth":549,"text":332},2,{"id":367,"depth":549,"text":370},"markdown","content:technology-blogs:zh:391.md","content","technology-blogs/zh/391.md","technology-blogs/zh/391","md",1776506136753]