[{"data":1,"prerenderedAt":233},["ShallowReactive",2],{"content-query-f89eqxmDzz":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":227,"_id":228,"_source":229,"_file":230,"_stem":231,"_extension":232},"/technology-blogs/zh/1701","zh",false,"","MindSpore Transformer模型库，几行代码玩转Transformer大模型，性能超越Megatron 20%","Transformer模型和自监督预训练模式的提出，给NLP、CV等多个人工智能应用领域开辟了新的方向。通过增加模型参数量和数据规模，预训练模型在实际领域的表现还在持续地提升。","2022-08-12","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/08/15/8e47373968f346dcbac136706062a3eb.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":221},"root",[17,25,31,44,49,53,58,63,70,75,83,88,93,104,114,124,129,135,140,150,160,165,175,183,188,193,201,206,211],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore-transformer模型库几行代码玩转transformer大模型性能超越megatron-20",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"作者：金雪锋",{"type":18,"tag":26,"props":32,"children":33},{},[34,36],{"type":24,"value":35},"文章来源：",{"type":18,"tag":37,"props":38,"children":42},"a",{"href":39,"rel":40},"https://zhuanlan.zhihu.com/p/552995868",[41],"nofollow",[43],{"type":24,"value":39},{"type":18,"tag":26,"props":45,"children":46},{},[47],{"type":24,"value":48},"最近MindSpore将发布1.8.1版本，在那个版本我们会推出Transformer和推荐两个模型加速库，本文介绍一下Transformer这个库，提前预告一下。",{"type":18,"tag":26,"props":50,"children":51},{},[52],{"type":24,"value":9},{"type":18,"tag":26,"props":54,"children":55},{},[56],{"type":24,"value":57},"另一方面，参数量的提升给模型训练带来了新的挑战。GPT-3、T5等大规模Transformer模型通常至少需要上百张GPU卡进行长达数月的训练，耗费几百万美金的训练成本。如何更加高效地、分布式地训练这些“巨无霸”们，是目前整个业界都在思考的一个问题。",{"type":18,"tag":26,"props":59,"children":60},{},[61],{"type":24,"value":62},"目前，众多企业以及开源机构推出了专门的Transformer模型训练库，其中以NVIDIA基于Pytorch的Megatron-LM训练库在各方面性能上较为领先。我们推出了基于MindSpore的Transformer模型训练库，相比Megatron，具有更高的开发效率、内存使用效率以及更高的性能，并同时支持多种后端。",{"type":18,"tag"
:64,"props":65,"children":67},"h3",{"id":66},"_1性能结果",[68],{"type":24,"value":69},"1、性能结果",{"type":18,"tag":26,"props":71,"children":72},{},[73],{"type":24,"value":74},"我们分别在8p、16p和32p A100集群上测试了百亿规模GPT(hiddensize=5120, num_layers=35, num_heads=40)性能，模型并行路数设置为8，数据并行数分别为1、2、4，Global Batch为1024。Megatron配置Micro Batch Size=2（Megatron已达到上限），MindSpore配置Micro Batch Size=8，相比Megatron，昇思MindSpore的内存利用率更高，可以训练更大的Batch Size。",{"type":18,"tag":26,"props":76,"children":77},{},[78],{"type":18,"tag":79,"props":80,"children":82},"img",{"alt":7,"src":81},"https://pic2.zhimg.com/80/v2-d1c6151770714af0d25d7bbd14847965_720w.jpg",[],{"type":18,"tag":26,"props":84,"children":85},{},[86],{"type":24,"value":87},"吞吐率对比",{"type":18,"tag":26,"props":89,"children":90},{},[91],{"type":24,"value":92},"如上图所示：",{"type":18,"tag":26,"props":94,"children":95},{},[96,98],{"type":24,"value":97},"8P Megatron的最大吞吐率为7.4 k samples/s；MindSpore 最大吞吐率为9.3k samples/s， ",{"type":18,"tag":99,"props":100,"children":101},"strong",{},[102],{"type":24,"value":103},"超过Megatron 25%；",{"type":18,"tag":26,"props":105,"children":106},{},[107,109],{"type":24,"value":108},"16P Megatron的最大吞吐率为13.6k samples/s，MindSpore最大吞吐率为16.9k samples/s，",{"type":18,"tag":99,"props":110,"children":111},{},[112],{"type":24,"value":113},"超过Megatron 24%；",{"type":18,"tag":26,"props":115,"children":116},{},[117,119],{"type":24,"value":118},"32P Megatron的最大吞吐率为20.1k samples/s，MindSpore最大吞吐率为23.8k samples/s，",{"type":18,"tag":99,"props":120,"children":121},{},[122],{"type":24,"value":123},"超过Megatron 18%。",{"type":18,"tag":26,"props":125,"children":126},{},[127],{"type":24,"value":128},"这些性能提升主要得益于MindSpore强大的图算融合技术以及更加精细的自动并行调度能力。",{"type":18,"tag":64,"props":130,"children":132},{"id":131},"_2更高的开发效率",[133],{"type":24,"value":134},"2、更高的开发效率",{"type":18,"tag":26,"props":136,"children":137},{},[138],{"type":24,"value":139},"MindSpore 
Transformer利用MindSpore内置的并行技术，能够自动进行拓扑感知，高效地融合数据并行和模型并行策略；实现单卡到大规模集群的无缝切换。",{"type":18,"tag":26,"props":141,"children":142},{},[143,148],{"type":18,"tag":99,"props":144,"children":145},{},[146],{"type":24,"value":147},"低门槛的并行易用性",{"type":24,"value":149}," 受益于MindSpore的并行能力。MindSpore Transformer能够从单卡一键拓展到多卡训练。",{"type":18,"tag":151,"props":152,"children":154},"pre",{"code":153},"context.set_auto_parallel_context(parallel_mode=\"stand_alone\") # 单卡\ncontext.set_auto_parallel_context(parallel_mode=\"data_parallel\") # 数据并行\ncontext.set_auto_parallel_context(parallel_mode=\"semi_auto_parallel\") # 半自动并行\n",[155],{"type":18,"tag":156,"props":157,"children":158},"code",{"__ignoreMap":7},[159],{"type":24,"value":153},{"type":18,"tag":26,"props":161,"children":162},{},[163],{"type":24,"value":164},"用户可以在启动脚本中传入\"--parallel_model=data_parallel\"参数来使能上述功能。",{"type":18,"tag":26,"props":166,"children":167},{},[168,173],{"type":18,"tag":99,"props":169,"children":170},{},[171],{"type":24,"value":172},"丰富的并行特性，一键使能",{"type":24,"value":174}," 如下代码展示了MindSpore Transformer库中进行模型并行的设置。MindSpore预定义了一套最基础的并行配置，通过配置model_parallel模型并行数和data_parallel数据并行数，可以直接实现Transformer类网络的模型并行，实现大模型训练。",{"type":18,"tag":151,"props":176,"children":178},{"code":177},"parallel_config = TransformerOpParallelConfig(model_parallel=config.model_parallel, # 模型并行\n                                               data_parallel=config.data_parallel, # 数据并行\n                                               recompute=True, # 开启重计算\n                                               optimizer_shard=True) # 开启优化器并行\ntransformer = Transformer(hidden_size=config.hidden_size,\n                               batch_size=config.batch_size,\n                               ffn_hidden_size=config.hidden_size * 4,\n                               src_seq_length=config.seq_length,\n                               tgt_seq_length=config.seq_length,\n                               encoder_layers=config.num_layers,\n                               
attention_dropout_rate=config.dropout_rate,\n                               hidden_dropout_rate=config.dropout_rate,\n                               decoder_layers=0,\n                               num_heads=config.num_heads,\n                               parallel_config=config.parallel_config) \n",[179],{"type":18,"tag":156,"props":180,"children":181},{"__ignoreMap":7},[182],{"type":24,"value":177},{"type":18,"tag":26,"props":184,"children":185},{},[186],{"type":24,"value":187},"同时，由于MindSpore框架本身有丰富的并行基础能力，这使得MindSpore Transformer整体实现比较简单，代码量7000行即可实现Megatron几万行的代码量，同时MindSpore Transformer与其他框架的库相比，在代码的灵活度和通用性上更好，更加有利于大模型的自定义和泛化。",{"type":18,"tag":26,"props":189,"children":190},{},[191],{"type":24,"value":192},"此外，MindSpore Transformer库同时提供多种并行技术：流水线并行、优化器并行和专家并行。用户可以持续关注仓库的最新进展。",{"type":18,"tag":26,"props":194,"children":195},{},[196],{"type":18,"tag":99,"props":197,"children":198},{},[199],{"type":24,"value":200},"3、展望",{"type":18,"tag":26,"props":202,"children":203},{},[204],{"type":24,"value":205},"大模型训练一直是业界的热点之一，国内外的大模型也在不断地推陈出新，MindSpore Transformer 库也将会持续不断地更新和演进。未来我们计划增加更多的预训练语言模型，例如MoE、多模态等大模型，欢迎关注和使用。",{"type":18,"tag":26,"props":207,"children":208},{},[209],{"type":24,"value":210},"代码仓链接：",{"type":18,"tag":26,"props":212,"children":213},{},[214],{"type":18,"tag":37,"props":215,"children":218},{"href":216,"rel":217},"https://link.zhihu.com/?target=https%3A//gitee.com/mindspore/transformer",[41],[219],{"type":24,"value":220},"MindSpore/transformer（gitee.com/mindspore/transformer）",{"title":7,"searchDepth":222,"depth":222,"links":223},4,[224,226],{"id":66,"depth":225,"text":69},3,{"id":131,"depth":225,"text":134},"markdown","content:technology-blogs:zh:1701.md","content","technology-blogs/zh/1701.md","technology-blogs/zh/1701","md",1776506115255]