[{"data":1,"prerenderedAt":551},["ShallowReactive",2],{"content-query-7fZjoDd4dg":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":545,"_id":546,"_source":547,"_file":548,"_stem":549,"_extension":550},"/technology-blogs/zh/2882","zh",false,"","项目分享 | 基于昇思MindSpore与LoRA微调方法训练独属于你的AI模型","作者：钟源珂 来源：知乎","2023-11-17","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/11/25/6f2069d0e5ac4992a1c69e78bec784f3.png","technology-blogs",{"type":14,"children":15,"toc":539},"root",[16,24,41,49,54,62,70,75,84,89,94,99,119,128,138,146,154,162,167,172,183,188,198,206,216,221,231,239,244,252,257,265,273,278,453,458,465,473,481,489,494,499,509,519,529],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"项目分享-基于昇思mindspore与lora微调方法训练独属于你的ai模型",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28,34,36],{"type":17,"tag":29,"props":30,"children":31},"strong",{},[32],{"type":23,"value":33},"作者：钟源珂",{"type":23,"value":35}," ",{"type":17,"tag":29,"props":37,"children":38},{},[39],{"type":23,"value":40},"来源：知乎",{"type":17,"tag":25,"props":42,"children":43},{},[44],{"type":17,"tag":29,"props":45,"children":46},{},[47],{"type":23,"value":48},"摘要",{"type":17,"tag":25,"props":50,"children":51},{},[52],{"type":23,"value":53},"过去一年大模型发展迅速，除了已发布的熟知的一些预训练大模型（pretrained model）：Llama，Llama2，ChatGLM，falcon等外，Hugging Face Open LLM Leader board（Open LLM Leaderboard - a Hugging Face Space by HuggingFaceH4）上 ，大量排行靠前的模型都是微调模型（finetune model）。大模型训练对算力要求极高，所以很多AI开发者会选择微调方法（Finetune）来训练大模型（可以理解为站在巨人肩膀上）。主流微调方法有Freeze、P-Tuning、LoRA等，这篇文章会对LoRA方法的原理进行讲解。",{"type":17,"tag":25,"props":55,"children":56},{},[57],{"type":17,"tag":29,"props":58,"children":59},{},[60],{"type":23,"value":61},"01",{"type":17,"tag":25,"props":63,"children":64},{},[65],{"type":17,"tag":29,"props":66,"children":67},{},[68],{"type":23,"value":69},"LoRA解析",{"type":17,"tag":25,"props":71,"children":72},{},[73],{"type":23,"value":74},"LoRA微调方法是由论文《LoRA: Low-Rank Adaptation Of Large Language Models》提出的，原文是这样介绍LoRA的：“We propose Low-Rank Adaptation, or LoRA, which freezes the pretrained model weights and injects trainable rank decomposition matrices into each layer of the Transformer architecture, greatly reducing the number of trainable parameters for downstream tasks”。从原文描述可以看出LoRA的特点是冻结原模型参数，插入可训练的分解低秩矩阵模块，使得微调参数量较少。",{"type":17,"tag":25,"props":76,"children":77},{},[78],{"type":17,"tag":79,"props":80,"children":83},"img",{"alt":81,"src":82},"image.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231124034014.49456937832789786498313232125353:50541124055942:2400:C0044CA9952676FEA2271AC073C6CB418C348DCBEAC3D1FAE1652249B9D6ACB8.png",[],{"type":17,"tag":25,"props":85,"children":86},{},[87],{"type":23,"value":88},"图1. LoRA微调只对A，B进行调整",{"type":17,"tag":25,"props":90,"children":91},{},[92],{"type":23,"value":93},"上图为LoRA的示意图，左边蓝色部分是该方法会冻结的权重，右边是新增的微调权重模块。",{"type":17,"tag":25,"props":95,"children":96},{},[97],{"type":23,"value":98},"文章总结LoRA具有以下关键优势：",{"type":17,"tag":100,"props":101,"children":102},"ul",{},[103,109,114],{"type":17,"tag":104,"props":105,"children":106},"li",{},[107],{"type":23,"value":108},"参数量小，减少对算力和存储的需求；",{"type":17,"tag":104,"props":110,"children":111},{},[112],{"type":23,"value":113},"是一种小型微调，效率高；",{"type":17,"tag":104,"props":115,"children":116},{},[117],{"type":23,"value":118},"LoRA微调会生成单独的模块，可以和其他微调方法结合使用。",{"type":17,"tag":120,"props":121,"children":122},"h3",{"id":7},[123],{"type":17,"tag":79,"props":124,"children":127},{"alt":125,"src":126},"cke_1849.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231124034053.41611143897870481682114487206946:50541124055942:2400:7CA54C6718C7BF6048ADF26999BC364F2BD37ABAAC667105BE7FF96C263BA3B3.png",[],{"type":17,"tag":25,"props":129,"children":130},{},[131],{"type":17,"tag":29,"props":132,"children":133},{},[134],{"type":17,"tag":79,"props":135,"children":137},{"alt":81,"src":136},"https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231124034152.31262372711741926778613375579261:50541124055942:2400:18B9046DBBD5CEFB532329A8FFD6823024557519ACA9769318D8886098B65524.png",[],{"type":17,"tag":25,"props":139,"children":140},{},[141],{"type":17,"tag":79,"props":142,"children":145},{"alt":143,"src":144},"cke_2788.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231124034122.77729748167457307402484010351719:50541124055942:2400:56DDF0FBFEEDD9CF13ECC390B0FD1CE2B9E5D54C881CF4EDED37E5F446441118.png",[],{"type":17,"tag":25,"props":147,"children":148},{},[149],{"type":17,"tag":29,"props":150,"children":151},{},[152],{"type":23,"value":153},"04",{"type":17,"tag":25,"props":155,"children":156},{},[157],{"type":17,"tag":29,"props":158,"children":159},{},[160],{"type":23,"value":161},"代码讲解",{"type":17,"tag":25,"props":163,"children":164},{},[165],{"type":23,"value":166},"代码讲解将使用MindSpore的微调套件MindPet的源代码为例讲解LoRA的实现，同时将结合Wukong-Huahua模型微调讲解MindPet中LoRA的应用。",{"type":17,"tag":25,"props":168,"children":169},{},[170],{"type":23,"value":171},"MindPet项目链接：",{"type":17,"tag":25,"props":173,"children":174},{},[175],{"type":17,"tag":176,"props":177,"children":181},"a",{"href":178,"rel":179},"https://github.com/mindspore-lab/mindpet",[180],"nofollow",[182],{"type":23,"value":178},{"type":17,"tag":25,"props":184,"children":185},{},[186],{"type":23,"value":187},"Wukong-Huahua+LoRA微调代码：",{"type":17,"tag":25,"props":189,"children":190},{},[191],{"type":17,"tag":176,"props":192,"children":195},{"href":193,"rel":194},"https://github.com/mindspore-lab/minddiffusion/blob/main/vision/wukong-huahua/run%5C_train.py",[180],[196],{"type":23,"value":197},"https://github.com/mindspore-lab/minddiffusion/blob/main/vision/wukong-huahua/run\\_train.py",{"type":17,"tag":120,"props":199,"children":201},{"id":200},"核心代码讲解",[202],{"type":17,"tag":29,"props":203,"children":204},{},[205],{"type":23,"value":200},{"type":17,"tag":25,"props":207,"children":208},{},[209,211],{"type":23,"value":210},"LoRA微调的核心代码分为两个部分：",{"type":17,"tag":29,"props":212,"children":213},{},[214],{"type":23,"value":215},"1）冻结原有权重；2）添加LoRA权重部分。",{"type":17,"tag":25,"props":217,"children":218},{},[219],{"type":23,"value":220},"**冻结原有权重：**这里使用MindPet提供的API：freeze_delta。",{"type":17,"tag":222,"props":223,"children":225},"pre",{"code":224},"if opts.enable_lora:\n    from mindpet.graph import freeze_delta \n    freeze_delta(LatentDiffusionWithLoss, 'lora')\n",[226],{"type":17,"tag":227,"props":228,"children":229},"code",{"__ignoreMap":7},[230],{"type":23,"value":224},{"type":17,"tag":25,"props":232,"children":233},{},[234],{"type":17,"tag":29,"props":235,"children":236},{},[237],{"type":23,"value":238},"freeze_delta",{"type":17,"tag":25,"props":240,"children":241},{},[242],{"type":23,"value":243},"freeze_delta会使用到的_freeze_from_list，其核心代码如下，可以看出该模块会将指定的列表的参数的反向梯度更新设置为False，这样就起到了冻结梯度的效果：",{"type":17,"tag":222,"props":245,"children":247},{"code":246},"def _freeze_from_list(model, include, exclude):\n    \"\"\"\n    根据include/exclude列表冻结网络。\n    \"\"\"\n    for name, param in model.parameters_and_names():\n        if _match_str_and_list(name, include) and not _match_str_and_list(name, exclude):\n            param.requires_grad = False\n        elif not _match_str_and_list(name, include) and _match_str_and_list(name, exclude):\n            param.requires_grad = True\n",[248],{"type":17,"tag":227,"props":249,"children":250},{"__ignoreMap":7},[251],{"type":23,"value":246},{"type":17,"tag":25,"props":253,"children":254},{},[255],{"type":23,"value":256},"**添加LoRA权重：**这里将CrossAttention中的q、k、v分别看成一个模块，在每个模块注入可训练的秩分解矩阵，使用的是MindPet提供的API: LoRADense。将原始的q、k、v替换成了LoRA矩阵。",{"type":17,"tag":222,"props":258,"children":260},{"code":259},"from mindpet.delta import LoRADense\nself.to_q = LoRADense(query_dim, inner_dim, has_bias=False, lora_rank=lora_rank, lora_alpha=lora_alpha).to_float(dtype)\nself.to_v = LoRADense(context_dim, inner_dim, has_bias=False, lora_rank=lora_rank, lora_alpha=lora_alpha).to_float(dtype)\nself.to_k = LoRADense(context_dim, inner_dim, has_bias=False, lora_rank=lora_rank, lora_alpha=lora_alpha).to_float(dtype)\n\nself.to_out = nn.SequentialCell(\n    LoRADense(inner_dim, query_dim, lora_rank=lora_rank, lora_alpha=lora_alpha).to_float(dtype),\n                nn.Dropout(dropout)\n)\n",[261],{"type":17,"tag":227,"props":262,"children":263},{"__ignoreMap":7},[264],{"type":23,"value":259},{"type":17,"tag":25,"props":266,"children":267},{},[268],{"type":17,"tag":29,"props":269,"children":270},{},[271],{"type":23,"value":272},"LoRADense",{"type":17,"tag":25,"props":274,"children":275},{},[276],{"type":23,"value":277},"该API的主要参数及解释：",{"type":17,"tag":100,"props":279,"children":280},{},[281,299,315,341,351,361,394,417,427],{"type":17,"tag":104,"props":282,"children":283},{},[284,289,291,297],{"type":17,"tag":29,"props":285,"children":286},{},[287],{"type":23,"value":288},"in_channels",{"type":23,"value":290},"(int) - 原Dense层输入",{"type":17,"tag":227,"props":292,"children":294},{"className":293},[],[295],{"type":23,"value":296},"Tensor",{"type":23,"value":298},"的空间维度。",{"type":17,"tag":104,"props":300,"children":301},{},[302,307,309,314],{"type":17,"tag":29,"props":303,"children":304},{},[305],{"type":23,"value":306},"out_channels",{"type":23,"value":308},"(int）- 原Dense层输出",{"type":17,"tag":227,"props":310,"children":312},{"className":311},[],[313],{"type":23,"value":296},{"type":23,"value":298},{"type":17,"tag":104,"props":316,"children":317},{},[318,323,325,331,333,339],{"type":17,"tag":29,"props":319,"children":320},{},[321],{"type":23,"value":322},"lora_rank",{"type":23,"value":324},"(int) - LoRA算法中",{"type":17,"tag":227,"props":326,"children":328},{"className":327},[],[329],{"type":23,"value":330},"lora_a",{"type":23,"value":332},"矩阵的行数，",{"type":17,"tag":227,"props":334,"children":336},{"className":335},[],[337],{"type":23,"value":338},"lora_b",{"type":23,"value":340},"矩阵的列数。",{"type":17,"tag":104,"props":342,"children":343},{},[344,349],{"type":17,"tag":29,"props":345,"children":346},{},[347],{"type":23,"value":348},"lora_alpha",{"type":23,"value":350},"(Union[int, float]) - 常数超参，不为0。",{"type":17,"tag":104,"props":352,"children":353},{},[354,359],{"type":17,"tag":29,"props":355,"children":356},{},[357],{"type":23,"value":358},"lora_dropout",{"type":23,"value":360},"(float) - 丢弃率，取值范围[0.0,1.0)。",{"type":17,"tag":104,"props":362,"children":363},{},[364,369,371,376,378,384,386,392],{"type":17,"tag":29,"props":365,"children":366},{},[367],{"type":23,"value":368},"lora_a_init(Union",{"type":23,"value":370},"[Tensor, str, Initializer, numbers.Number]) - ",{"type":17,"tag":227,"props":372,"children":374},{"className":373},[],[375],{"type":23,"value":330},{"type":23,"value":377},"矩阵的初始化方法。数据类型与 x 相同。str的值引用自函数 ",{"type":17,"tag":227,"props":379,"children":381},{"className":380},[],[382],{"type":23,"value":383},"initializer",{"type":23,"value":385},"，默认值：",{"type":17,"tag":227,"props":387,"children":389},{"className":388},[],[390],{"type":23,"value":391},"HeUniform(negative_slope=math.sqrt(5))",{"type":23,"value":393},"。",{"type":17,"tag":104,"props":395,"children":396},{},[397,402,404,409,410,416],{"type":17,"tag":29,"props":398,"children":399},{},[400],{"type":23,"value":401},"lora_b_init",{"type":23,"value":403},"(Union[Tensor, str, Initializer, numbers.Number]) - lora_b矩阵的初始化方法。数据类型与 x 相同。str的值引用自函数 ",{"type":17,"tag":227,"props":405,"children":407},{"className":406},[],[408],{"type":23,"value":383},{"type":23,"value":385},{"type":17,"tag":227,"props":411,"children":413},{"className":412},[],[414],{"type":23,"value":415},"'zeros'",{"type":23,"value":393},{"type":17,"tag":104,"props":418,"children":419},{},[420,425],{"type":17,"tag":29,"props":421,"children":422},{},[423],{"type":23,"value":424},"has_bias",{"type":23,"value":426}," (bool) - 是否使用偏置向量 bias 。默认值：True。",{"type":17,"tag":104,"props":428,"children":429},{},[430,435,437,443,445,451],{"type":17,"tag":29,"props":431,"children":432},{},[433],{"type":23,"value":434},"activation",{"type":23,"value":436}," (Union[str, Cell, Primitive, None]) - 应用于全连接层输出的激活函数。可指定激活函数名，如",{"type":17,"tag":227,"props":438,"children":440},{"className":439},[],[441],{"type":23,"value":442},"'relu'",{"type":23,"value":444},"，或具体激活函数，如",{"type":17,"tag":227,"props":446,"children":448},{"className":447},[],[449],{"type":23,"value":450},"mindspore.nn.ReLU()",{"type":23,"value":452},"。默认值：None。",{"type":17,"tag":25,"props":454,"children":455},{},[456],{"type":23,"value":457},"该模块原理为以下公式：可以根据原始权重，计算出原始模型输出和LoRA模块A，B输出加和的结果h（图1）：",{"type":17,"tag":25,"props":459,"children":460},{},[461],{"type":17,"tag":79,"props":462,"children":464},{"alt":81,"src":463},"https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231124034359.63086377926749689606678056779553:50541124055942:2400:59A25121FD0674A6DF8D77176C94146AF629CADCEBDABCA076628EBB71CCA7DC.png",[],{"type":17,"tag":222,"props":466,"children":468},{"code":467},"# Dense result， 原始权重的结果，图1蓝色\ndense_result = self.matmul(input_tensor, weight)\nif self.has_bias:\n    bias = self.cast(self.bias, self.dtype)\n    dense_result = self.bias_add(dense_result, bias)\n\n# LoRA result， LoRA权重的结果，图1橘色\ninput_tensor = self.lora_dropout(input_tensor)\ninput_tensor = self.lora_a_matmul(input_tensor, lora_a)\ninput_tensor = self.lora_b_matmul(input_tensor, lora_b)\ninput_tensor = self.mul(input_tensor, scaling)\n\n# Result addition and activation，将两部分叠加\ndense_result = self.add(dense_result, input_tensor)\n",[469],{"type":17,"tag":227,"props":470,"children":471},{"__ignoreMap":7},[472],{"type":23,"value":467},{"type":17,"tag":25,"props":474,"children":475},{},[476],{"type":17,"tag":29,"props":477,"children":478},{},[479],{"type":23,"value":480},"05",{"type":17,"tag":25,"props":482,"children":483},{},[484],{"type":17,"tag":29,"props":485,"children":486},{},[487],{"type":23,"value":488},"总结",{"type":17,"tag":25,"props":490,"children":491},{},[492],{"type":23,"value":493},"LoRA微调主要原理就是在原始权重基础上叠加一个新的权重，来适应下游任务，新的权重采用低秩可训练矩阵，所以使得LoRA微调效率高。该微调方法仅需调用两个API即可实现：1）freeze_delta冻结原有权重；2）LoRADense添加LoRA权重部分。",{"type":17,"tag":25,"props":495,"children":496},{},[497],{"type":23,"value":498},"往期回顾",{"type":17,"tag":25,"props":500,"children":501},{},[502],{"type":17,"tag":176,"props":503,"children":506},{"href":504,"rel":505},"http://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247610240&idx=1&sn=7eedd9399fccd37203c5eabf8192901e&chksm=c11e3fcff669b6d9d5304369f79be682aad64c05c8b07ad5f2afc14fb99513299b721d992b3d&scene=21#wechat_redirect",[180],[507],{"type":23,"value":508},"项目分享 | 如何通过昇思MindSpore实现强化学习玩游戏",{"type":17,"tag":25,"props":510,"children":511},{},[512],{"type":17,"tag":176,"props":513,"children":516},{"href":514,"rel":515},"http://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247609430&idx=1&sn=ff3fe18b51a20849f737972d978d3222&chksm=c11e3c19f669b50f0128f508ebcd2b6373eb930b8b0a0e54f493fdfb7a5dc30b10266656e003&scene=21#wechat_redirect",[180],[517],{"type":23,"value":518},"项目分享 | 如何通过MindNLP将HuggingFace Datasets嫁接到昇思MindSpore",{"type":17,"tag":25,"props":520,"children":521},{},[522],{"type":17,"tag":176,"props":523,"children":526},{"href":524,"rel":525},"http://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247608163&idx=1&sn=6a07ea1ffa8e7276d22bb3d331a4affa&chksm=c11e372cf669be3ad60b6f6338c351daadf8265e87fca703d7cfe0e8cf89bf54fce84f19011c&scene=21#wechat_redirect",[180],[527],{"type":23,"value":528},"项目分享 | 昇思MindSpore接入强化学习的新环境和新算法",{"type":17,"tag":25,"props":530,"children":531},{},[532],{"type":17,"tag":176,"props":533,"children":536},{"href":534,"rel":535},"http://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247599998&idx=2&sn=5391bde94de0eee39b5ece32c7fab0a3&chksm=c11e5731f669de273eafad815cae90cdd22ef754ab568f84b1709949fa124fe125a899ca3282&scene=21#wechat_redirect",[180],[537],{"type":23,"value":538},"项目分享 | 腺形智消——新一代儿童腺样体肥大诊疗方案领航者",{"title":7,"searchDepth":540,"depth":540,"links":541},4,[542,544],{"id":7,"depth":543,"text":7},3,{"id":200,"depth":543,"text":200},"markdown","content:technology-blogs:zh:2882.md","content","technology-blogs/zh/2882.md","technology-blogs/zh/2882","md",1776506123663]