[{"data":1,"prerenderedAt":426},["ShallowReactive",2],{"content-query-fzcHg8dfMh":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":420,"_id":421,"_source":422,"_file":423,"_stem":424,"_extension":425},"/technology-blogs/zh/3027","zh",false,"","一天适配Gemma，MindNLP凭什么紧追开源SOTA？","作者：吕昱峰 ｜来源：知乎","2024-03-12","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/15/6a160495fc2245b588478742e15de4a5.png","technology-blogs",{"type":14,"children":15,"toc":414},"root",[16,24,41,46,54,59,66,71,79,87,92,97,104,109,114,119,139,144,149,156,161,173,192,197,205,210,215,238,252,259,272,277,282,315,327,332,350,362,369,374,385,390,402],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"一天适配gemmamindnlp凭什么紧追开源sota",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28,34,36],{"type":17,"tag":29,"props":30,"children":31},"strong",{},[32],{"type":23,"value":33},"作者：吕昱峰",{"type":23,"value":35}," 
｜",{"type":17,"tag":29,"props":37,"children":38},{},[39],{"type":23,"value":40},"来源：知乎",{"type":17,"tag":25,"props":42,"children":43},{},[44],{"type":23,"value":45},"2月21日，谷歌毫无预兆地发布号称“全球性能最强大、轻量级”的新一代开源系列模型Gemma。Gemma模型使用了和Gemini同源的技术，总共有2B和7B参数规格，每个规格又分预训练和指令微调两个版本。",{"type":17,"tag":25,"props":47,"children":48},{},[49],{"type":17,"tag":50,"props":51,"children":53},"img",{"alt":7,"src":52},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/15/6e6028c1359e4d7babfc95868aa7f283.png",[],{"type":17,"tag":25,"props":55,"children":56},{},[57],{"type":23,"value":58},"2月22日，在大家还在对谷歌发布的“深夜炸弹”转发评论时，昇思MindSpore社区官宣Gemma适配完成。此时，距离Gemma发布时间还不到24小时。",{"type":17,"tag":25,"props":60,"children":61},{},[62],{"type":17,"tag":50,"props":63,"children":65},{"alt":7,"src":64},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/15/ed490a53a40447e1a3cd26dbdf445b3a.png",[],{"type":17,"tag":25,"props":67,"children":68},{},[69],{"type":23,"value":70},"熟悉AI领域的同学都知道，现在主要的AI研发还是依托GPU+友商框架路线，国产框架和国产硬件对SOTA模型的快速适配，是大家是否会选择使用的核心痛点，能够紧追SOTA，才有机会弯道超车。",{"type":17,"tag":25,"props":72,"children":73},{},[74],{"type":17,"tag":29,"props":75,"children":76},{},[77],{"type":23,"value":78},"所以，回到正题，一天适配Gemma，昇思MindSpore凭什么紧追开源SOTA？",{"type":17,"tag":25,"props":80,"children":81},{},[82],{"type":17,"tag":29,"props":83,"children":84},{},[85],{"type":23,"value":86},"完备的昇思MindSpore动态图",{"type":17,"tag":25,"props":88,"children":89},{},[90],{"type":23,"value":91},"昇思MindSpore框架开源伊始，为了深度适配昇腾硬件，以及对于AI框架发展路线的选择（彼时并没有人能预料到易用性能够爆杀一切性能优势），因此2020年开源到2023年MindSpore 
2.0发布一直都没有对动态图过多重视，对于其定位也是“动态图调试后转静态图训练推理”。但是时代的车轮轧过去，静态图终究要成为历史。LLM的日新月异，又要不断追赶。所以，人生苦短，早用动态图。",{"type":17,"tag":25,"props":93,"children":94},{},[95],{"type":23,"value":96},"回到主题，昇思MindSpore的动态图经过内部的几次演进，形成了一个几乎完备的方案，下面展开讲讲。",{"type":17,"tag":25,"props":98,"children":99},{},[100],{"type":17,"tag":50,"props":101,"children":103},{"alt":7,"src":102},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/15/98df2ada02924f318032bb897c0bf487.png",[],{"type":17,"tag":25,"props":105,"children":106},{},[107],{"type":23,"value":108},"f(x, y) = log(x*y)的计算图",{"type":17,"tag":25,"props":110,"children":111},{},[112],{"type":23,"value":113},"以一个简单的函数f(x, y) = log(x*y)为例，正向执行需要乘法(mul)和log两个算子(operator)完成，而反向过程则是通过链式法则求微分。",{"type":17,"tag":25,"props":115,"children":116},{},[117],{"type":23,"value":118},"对于当下的深度学习框架使用者而言，绿色的反向传播部分均交由框架的自动微分(Autograd)功能完成，大家只需要关注正向逻辑，只要能够保证正向传播不断链即可（实际上诸多AI科研人员可能压根不care）。这给框架带来了几点要求：",{"type":17,"tag":120,"props":121,"children":122},"ul",{},[123,129,134],{"type":17,"tag":124,"props":125,"children":126},"li",{},[127],{"type":23,"value":128},"没有语法限制（相对的静态图总有语法限制）",{"type":17,"tag":124,"props":130,"children":131},{},[132],{"type":23,"value":133},"足够灵活，通常要使用各类Python库",{"type":17,"tag":124,"props":135,"children":136},{},[137],{"type":23,"value":138},"性能尚可（没人要求极致性能）",{"type":17,"tag":25,"props":140,"children":141},{},[142],{"type":23,"value":143},"这时候似乎静态图确实已经格格不入，而昇思MindSpore从静态图起家的技术栈是否需要完全摒弃也是一直有争议的话题（个人是动态图绝对拥趸）。事实上业界框架基本上都选择了完全转型动态图，但是任何一个大型系统的演进总会背着包袱，而包袱是负重还是补给便要考校研发人员的实力。",{"type":17,"tag":25,"props":145,"children":146},{},[147],{"type":23,"value":148},"昇思MindSpore则选择了尽可能复用静态图原有能力，保留性能优势。",{"type":17,"tag":25,"props":150,"children":151},{},[152],{"type":17,"tag":50,"props":153,"children":155},{"alt":7,"src":154},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/15/a8fe8cf4f7d14e8298bdf5f334ea18f3.png",[],{"type":17,"tag":25,"props":157,"children":158},{},[159],{"type":23,"value":160},"正向动态，反向成图示例图",{"typ
e":17,"tag":25,"props":162,"children":163},{},[164,166,171],{"type":23,"value":165},"从框架用户的角度出发，易用性和性能一直都是不能两全的trade-off，但是仔细分析可以发现，大家要的灵活性，全部都集中在蓝色部分，也就是正向执行，而反向传播由于不可见，可以选择不同的方案。因此，最终昇思MindSpore的动态图选择了——",{"type":17,"tag":29,"props":167,"children":168},{},[169],{"type":23,"value":170},"正向Eager执行，反向成图",{"type":23,"value":172},"。这样带来的好处显而易见：",{"type":17,"tag":120,"props":174,"children":175},{},[176,184],{"type":17,"tag":124,"props":177,"children":178},{},[179],{"type":17,"tag":29,"props":180,"children":181},{},[182],{"type":23,"value":183},"用户使用友好，调试方便",{"type":17,"tag":124,"props":185,"children":186},{},[187],{"type":17,"tag":29,"props":188,"children":189},{},[190],{"type":23,"value":191},"训练性能保持优势",{"type":17,"tag":25,"props":193,"children":194},{},[195],{"type":23,"value":196},"即便是由于早期自动并行设计导致昇思MindSpore的算子粒度非常碎而导致kernel by kernel执行并不快的情况下，反向成图也能使正反向执行速度和PyTorch持平甚至更优。而另一方面，事实上PyTorch也在朝静态图演进，最终殊途同归，大家都走向了动静融合的路。",{"type":17,"tag":25,"props":198,"children":199},{},[200],{"type":17,"tag":29,"props":201,"children":202},{},[203],{"type":23,"value":204},"更易用的API接口",{"type":17,"tag":25,"props":206,"children":207},{},[208],{"type":23,"value":209},"架构设计的合理是基础，接下来要做的，就是**“宠着用户”**了。既然开发者使用友商框架的习惯已经无法更改，那就加入。当然，这会引发诸如“真假自研”、“一模一样还用你干什么”之类的话题。又因为其他框架接口直接被使用，因此在接口设计上的取舍自是很难。",{"type":17,"tag":25,"props":211,"children":212},{},[213],{"type":23,"value":214},"AI框架的接口一般由几部分构成，这里就和设计策略一起直接列出来：",{"type":17,"tag":120,"props":216,"children":217},{},[218,223,228,233],{"type":17,"tag":124,"props":219,"children":220},{},[221],{"type":23,"value":222},"网络构造接口，包括nn、ops，全面对齐友商框架；",{"type":17,"tag":124,"props":224,"children":225},{},[226],{"type":23,"value":227},"自动微分接口：保持自研；",{"type":17,"tag":124,"props":229,"children":230},{},[231],{"type":23,"value":232},"高阶封装Trainer：借鉴业界主流Trainer；",{"type":17,"tag":124,"props":234,"children":235},{},[236],{"type":23,"value":237},"Dataset：保持自研。",{"type":17,"tag":25,"props":239,"children":240},{},[241,243],{"type":23,"value":242},"可以看到在技术核心仍旧选择了自研，而保证足够易用
的API接口则要和友商框架全面对标。这里我留个链接和截图，可以看到昇思MindSpore和友商框架的接口映射情况。而这也是能够做到模型快速迁移的基础。（",{"type":17,"tag":244,"props":245,"children":249},"a",{"href":246,"rel":247},"https://www.mindspore.cn/docs/zh-CN/r2.2/note/api_mapping/pytorch_api_mapping.html",[248],"nofollow",[250],{"type":23,"value":251},"https://www.mindspore.cn/docs/zh-CN/r2.2/note/api_mapping/pytorch_api_mapping.html）",{"type":17,"tag":25,"props":253,"children":254},{},[255],{"type":17,"tag":50,"props":256,"children":258},{"alt":7,"src":257},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/15/bf794efe41cb4156af1f356a1c33ed8d.png",[],{"type":17,"tag":260,"props":261,"children":263},"h3",{"id":262},"全面拥抱hugging-face的mindnlp套件",[264],{"type":17,"tag":29,"props":265,"children":266},{},[267],{"type":17,"tag":29,"props":268,"children":269},{},[270],{"type":23,"value":271},"全面拥抱Hugging Face的MindNLP套件",{"type":17,"tag":25,"props":273,"children":274},{},[275],{"type":23,"value":276},"有了几乎对齐的API接口固然可以快速进行模型的迁移适配，但是如何保证正确性、使用体验的一致性都是能够决定是否有人愿意真的用你的关键因素。这里到了我的主场，就展开讲一下MindNLP的设计。",{"type":17,"tag":25,"props":278,"children":279},{},[280],{"type":23,"value":281},"首先，我们选择全面拥抱Hugging Face。其实一度想要直接贡献给Hugging Face社区，但是由于某些客观原因（之前贡献给einops的PR最后也被close了）。但是我们不会放着最大的大模型社区而不去对接。所以策略是什么呢？",{"type":17,"tag":120,"props":283,"children":284},{},[285,293,298,310],{"type":17,"tag":124,"props":286,"children":287},{},[288],{"type":17,"tag":29,"props":289,"children":290},{},[291],{"type":23,"value":292},"All in 动态图",{"type":17,"tag":124,"props":294,"children":295},{},[296],{"type":23,"value":297},"全面适配Hugging Face主要开发库，如Transformers、Peft、Trl等。",{"type":17,"tag":124,"props":299,"children":300},{},[301,303],{"type":23,"value":302},"直接使用datasets库，配合MindSpore 
Dataset组件达成数据集的满足度。（这里附上上一篇文章",{"type":17,"tag":244,"props":304,"children":307},{"href":305,"rel":306},"https://zhuanlan.zhihu.com/p/659489670",[248],[308],{"type":23,"value":309},"https://zhuanlan.zhihu.com/p/659489670）",{"type":17,"tag":124,"props":311,"children":312},{},[313],{"type":23,"value":314},"直接使用Hugging Face测试用例进行昇思MindSpore版本测试。",{"type":17,"tag":25,"props":316,"children":317},{},[318,320,325],{"type":23,"value":319},"有了以上四条主要策略，可以和Hugging Face社区达成深度的绑定，借助社区的海量资源来促进MindNLP和昇思MindSpore的生态。也能够从",{"type":17,"tag":29,"props":321,"children":322},{},[323],{"type":23,"value":324},"易用性、数据、模型",{"type":23,"value":326},"角度尽最大可能满足真正的需求。",{"type":17,"tag":25,"props":328,"children":329},{},[330],{"type":23,"value":331},"当然，我们会有一些小trick，比如：",{"type":17,"tag":120,"props":333,"children":334},{},[335,340,345],{"type":17,"tag":124,"props":336,"children":337},{},[338],{"type":23,"value":339},"花了两天把checkpoint文件的直接加载搞定，再也不用先转换再加载了；",{"type":17,"tag":124,"props":341,"children":342},{},[343],{"type":23,"value":344},"结合hf-mirror提供国内下载Hugging Face社区模型的能力，AutoModel一键加载",{"type":17,"tag":124,"props":346,"children":347},{},[348],{"type":23,"value":349},"利用Arrow格式做memory map的数据加载",{"type":17,"tag":25,"props":351,"children":352},{},[353,355,360],{"type":23,"value":354},"因为选择了动态图+拥抱Hugging Face社区的路线，我们几十个高校的同学一起已经搞定了60+模型的快速迁移适配，",{"type":17,"tag":29,"props":356,"children":357},{},[358],{"type":23,"value":359},"最快单个模型1小时通关",{"type":23,"value":361},"（Pass Hugging 
Face所有ut）。",{"type":17,"tag":25,"props":363,"children":364},{},[365],{"type":17,"tag":50,"props":366,"children":368},{"alt":7,"src":367},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/03/15/2ac8e67c375847969c4f9679a6bc9ac5.png",[],{"type":17,"tag":25,"props":370,"children":371},{},[372],{"type":23,"value":373},"社区贡献者",{"type":17,"tag":260,"props":375,"children":377},{"id":376},"总结",[378],{"type":17,"tag":29,"props":379,"children":380},{},[381],{"type":17,"tag":29,"props":382,"children":383},{},[384],{"type":23,"value":376},{"type":17,"tag":25,"props":386,"children":387},{},[388],{"type":23,"value":389},"动态图的适应是真的舒适，但是只有动态图可能永远只能做小弟，这时候，自上而下的设计，还是要全面为真正的用户着想。",{"type":17,"tag":25,"props":391,"children":392},{},[393,395],{"type":23,"value":394},"最后关于MindNLP，需要用的功能尽管提issue，顺带star一下也可。（",{"type":17,"tag":244,"props":396,"children":399},{"href":397,"rel":398},"https://github.com/mindspore-lab/mindnlp",[248],[400],{"type":23,"value":401},"https://github.com/mindspore-lab/mindnlp）",{"type":17,"tag":25,"props":403,"children":404},{},[405,407],{"type":23,"value":406},"此外，我们还在持续号召社区贡献，参与大模型任务赢大奖，海量任务等你来！（",{"type":17,"tag":244,"props":408,"children":411},{"href":409,"rel":410},"https://gitee.com/mindspore/community/issues/I835ND?from=project-issue",[248],[412],{"type":23,"value":413},"https://gitee.com/mindspore/community/issues/I835ND?from=project-issue）",{"title":7,"searchDepth":415,"depth":415,"links":416},4,[417,419],{"id":262,"depth":418,"text":271},3,{"id":376,"depth":418,"text":376},"markdown","content:technology-blogs:zh:3027.md","content","technology-blogs/zh/3027.md","technology-blogs/zh/3027","md",1776506125532]