[{"data":1,"prerenderedAt":617},["ShallowReactive",2],{"content-query-VJY7OSpeXh":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":611,"_id":612,"_source":613,"_file":614,"_stem":615,"_extension":616},"/technology-blogs/zh/3902","zh",false,"","把 Llama 迁到 MindSpore：一份带坑的实战笔记","难点集中在键名映射、RoPE 位移、KV Cache 写法三件事","2025-11-07","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/11/14/36ab0eb7d52a4d1280f8fe6595b188ea.png","technology-blogs",{"type":14,"children":15,"toc":608},"root",[16,24,30,35,52,77,92,104,114,119,127,140,155,185,193,211,226,231,239,244,252,262,270,288,303,313,331,336,345,353,371,386,391,396,404,422,437,465,480,487,505,520,578,593,598,603],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"把-llama-迁到-mindspore一份带坑的实战笔记",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":23,"value":29},"来源：昇腾论坛",{"type":17,"tag":25,"props":31,"children":32},{},[33],{"type":23,"value":34},"这篇文章记录了把Llama 7B从PyTorch/HF生态迁到 MindSpore的过程。不是广告，不是评测，也不是哲学讨论，就是扎扎实实的技术活，踩过的坑都摊开说。",{"type":17,"tag":18,"props":36,"children":38},{"id":37},"_01-背景和目标",[39,45,47],{"type":17,"tag":40,"props":41,"children":42},"strong",{},[43],{"type":23,"value":44},"# 01",{"type":23,"value":46}," ",{"type":17,"tag":40,"props":48,"children":49},{},[50],{"type":23,"value":51},"背景和目标",{"type":17,"tag":53,"props":54,"children":55},"ul",{},[56,62,67],{"type":17,"tag":57,"props":58,"children":59},"li",{},[60],{"type":23,"value":61},"**目标：**在Ascend上用MindSpore跑通Llama（推理 + 微调），尽量少魔改，支持KV Cache、RoPE、混合精度和断点恢复。",{"type":17,"tag":57,"props":63,"children":64},{},[65],{"type":23,"value":66},"**限制：**不依赖奇怪分支；只用公开可得的接口（MindSpore 基座 + 
常见组件）。",{"type":17,"tag":57,"props":68,"children":69},{},[70,72],{"type":23,"value":71},"**策略：**能复用的就复用（Tokenizer、权重），不能复用的就写一个薄转换层。",{"type":17,"tag":40,"props":73,"children":74},{},[75],{"type":23,"value":76},"不追求一步到位，但要“能打”。",{"type":17,"tag":18,"props":78,"children":80},{"id":79},"_02-环境要点",[81,86,87],{"type":17,"tag":40,"props":82,"children":83},{},[84],{"type":23,"value":85},"# 02",{"type":23,"value":46},{"type":17,"tag":40,"props":88,"children":89},{},[90],{"type":23,"value":91},"环境要点",{"type":17,"tag":25,"props":93,"children":94},{},[95,97,102],{"type":23,"value":96},"MindSpore 两种模式：GRAPH_MODE（编译图）和 PYNATIVE_MODE（动态图）。",{"type":17,"tag":40,"props":98,"children":99},{},[100],{"type":23,"value":101},"在Ascend上尽量用 GRAPH",{"type":23,"value":103},"，性能差一大截不是开玩笑的。",{"type":17,"tag":105,"props":106,"children":108},"pre",{"code":107},"import mindspore as ms\n",[109],{"type":17,"tag":110,"props":111,"children":112},"code",{"__ignoreMap":7},[113],{"type":23,"value":107},{"type":17,"tag":25,"props":115,"children":116},{},[117],{"type":23,"value":118},"混合精度推荐O2，配合loss scale（训练阶段）：",{"type":17,"tag":105,"props":120,"children":122},{"code":121},"from mindspore.amp import auto_mixed_precision, StaticLossScaler\n",[123],{"type":17,"tag":110,"props":124,"children":125},{"__ignoreMap":7},[126],{"type":23,"value":121},{"type":17,"tag":128,"props":129,"children":130},"blockquote",{},[131],{"type":17,"tag":25,"props":132,"children":133},{},[134],{"type":17,"tag":135,"props":136,"children":137},"em",{},[138],{"type":23,"value":139},"踩坑 1：MindSpore对Ascend的算子融合比较激进，图模式下某些自定义Python控制流容易被“优化没了”。遇到莫名其妙的数值波动，先关掉你新加的“聪明”控制流。",{"type":17,"tag":18,"props":141,"children":143},{"id":142},"_03-tokenizer-与-rope别在细节上翻车",[144,149,150],{"type":17,"tag":40,"props":145,"children":146},{},[147],{"type":23,"value":148},"# 03",{"type":23,"value":46},{"type":17,"tag":40,"props":151,"children":152},{},[153],{"type":23,"value":154},"Tokenizer 与 
RoPE：别在细节上翻车",{"type":17,"tag":53,"props":156,"children":157},{},[158,173],{"type":17,"tag":57,"props":159,"children":160},{},[161],{"type":17,"tag":40,"props":162,"children":163},{},[164,166,171],{"type":23,"value":165},"Tokenizer：",{"type":17,"tag":40,"props":167,"children":168},{},[169],{"type":23,"value":170},"我直接复用 HF 的 tokenizer.json和 tokenizer.model，在数据前处理阶段完成编码解码。训练/推理时只给 MindSpore 喂 input_ids和 attention_mask",{"type":23,"value":172},"（注意 mask 的 dtype 和 shape）。",{"type":17,"tag":57,"props":174,"children":175},{},[176,178,183],{"type":23,"value":177},"**RoPE（Rotary Embedding）：**MindSpore 里实现 RoPE 时，**位置索引的广播维度和角度表（cos/sin）**缓存要提前考虑到 ",{"type":17,"tag":40,"props":179,"children":180},{},[181],{"type":23,"value":182},"prefill+decode",{"type":23,"value":184},"两阶段。 简化做法：预缓存最大max_seq_len的cos/sin；decode阶段按 pos_offset索引切片。",{"type":17,"tag":105,"props":186,"children":188},{"code":187},"def precompute_rope(theta_base, head_dim, max_len, dtype=ms.float16):\n",[189],{"type":17,"tag":110,"props":190,"children":191},{"__ignoreMap":7},[192],{"type":23,"value":187},{"type":17,"tag":128,"props":194,"children":195},{},[196],{"type":17,"tag":25,"props":197,"children":198},{},[199],{"type":17,"tag":135,"props":200,"children":201},{},[202,204,209],{"type":23,"value":203},"踩坑 2：有的实现把cos/sin的 layout 写反了；",{"type":17,"tag":40,"props":205,"children":206},{},[207],{"type":23,"value":208},"decode 阶段pos要累加",{"type":23,"value":210},"（pos_offset += 1），别反复从0开始。",{"type":17,"tag":18,"props":212,"children":214},{"id":213},"_04-权重转换从-huggingface-mindspore-ckpt",[215,220,221],{"type":17,"tag":40,"props":216,"children":217},{},[218],{"type":23,"value":219},"# 04",{"type":23,"value":46},{"type":17,"tag":40,"props":222,"children":223},{},[224],{"type":23,"value":225},"权重转换：从 HuggingFace → MindSpore .ckpt",{"type":17,"tag":25,"props":227,"children":228},{},[229],{"type":23,"value":230},"HuggingFace 的 Llama 权重是多个pytorch_model-*.bin。思路：用torch.load拿 state_dict，做键名映射，再 
mindspore.save_checkpoint。",{"type":17,"tag":25,"props":232,"children":233},{},[234],{"type":17,"tag":40,"props":235,"children":236},{},[237],{"type":23,"value":238},"1、键名映射表（示例）",{"type":17,"tag":25,"props":240,"children":241},{},[242],{"type":23,"value":243},"HuggingFace（常见） → MindSpore（示例命名）：",{"type":17,"tag":105,"props":245,"children":247},{"code":246},"model.embed_tokens.weight             → tok_embeddings.embedding_table\n",[248],{"type":17,"tag":110,"props":249,"children":250},{"__ignoreMap":7},[251],{"type":23,"value":246},{"type":17,"tag":25,"props":253,"children":254},{},[255,257],{"type":23,"value":256},"**2、**",{"type":17,"tag":40,"props":258,"children":259},{},[260],{"type":23,"value":261},"转换脚本（最小可用）",{"type":17,"tag":105,"props":263,"children":265},{"code":264},"import os, torch, mindspore as ms\n",[266],{"type":17,"tag":110,"props":267,"children":268},{"__ignoreMap":7},[269],{"type":23,"value":264},{"type":17,"tag":128,"props":271,"children":272},{},[273],{"type":17,"tag":25,"props":274,"children":275},{},[276],{"type":17,"tag":135,"props":277,"children":278},{},[279,281,286],{"type":23,"value":280},"踩坑 3：LayerNorm 在 Llama 是",{"type":17,"tag":40,"props":282,"children":283},{},[284],{"type":23,"value":285},"无 bias",{"type":23,"value":287},"，MindSpore 里如果你 LayerNorm 定义带 beta，要么删掉，要么初始为 0 并在图里不使用；否则数值会“飘”。",{"type":17,"tag":18,"props":289,"children":291},{"id":290},"_05-llama-前向与-kv-cacheprefill-decode",[292,297,298],{"type":17,"tag":40,"props":293,"children":294},{},[295],{"type":23,"value":296},"# 05",{"type":23,"value":46},{"type":17,"tag":40,"props":299,"children":300},{},[301],{"type":23,"value":302},"Llama 前向与 KV Cache（prefill + decode）",{"type":17,"tag":25,"props":304,"children":305},{},[306,308],{"type":23,"value":307},"**1、**",{"type":17,"tag":40,"props":309,"children":310},{},[311],{"type":23,"value":312},"Attention mask 
语义",{"type":17,"tag":53,"props":314,"children":315},{},[316,321],{"type":17,"tag":57,"props":317,"children":318},{},[319],{"type":23,"value":320},"训练：通常是 [bs, 1, seq, seq]或 [bs, seq]的下三角 + padding mask。",{"type":17,"tag":57,"props":322,"children":323},{},[324,326],{"type":23,"value":325},"推理：prefill 阶段 mask 仍按下三角；decode 阶段仅对新 token 做与历史的点积，",{"type":17,"tag":40,"props":327,"children":328},{},[329],{"type":23,"value":330},"mask 形状变小。",{"type":17,"tag":25,"props":332,"children":333},{},[334],{"type":23,"value":335},"建议统一为 float mask，填充不可见位置为 -1e4（或和你 softmax 实现一致的 -inf），避免 dtype 乱战。",{"type":17,"tag":25,"props":337,"children":338},{},[339,340],{"type":23,"value":256},{"type":17,"tag":40,"props":341,"children":342},{},[343],{"type":23,"value":344},"简化版 KV Cache",{"type":17,"tag":105,"props":346,"children":348},{"code":347},"class KvCache:\n",[349],{"type":17,"tag":110,"props":350,"children":351},{"__ignoreMap":7},[352],{"type":23,"value":347},{"type":17,"tag":128,"props":354,"children":355},{},[356],{"type":17,"tag":25,"props":357,"children":358},{},[359],{"type":17,"tag":135,"props":360,"children":361},{},[362,364,369],{"type":23,"value":363},"踩坑 4：别在 decode 阶段每步都 concat，",{"type":17,"tag":40,"props":365,"children":366},{},[367],{"type":23,"value":368},"就地写入slice，Ascend",{"type":23,"value":370}," 的内存移动不白嫖。",{"type":17,"tag":18,"props":372,"children":374},{"id":373},"_06-训练与微调lora全参",[375,380,381],{"type":17,"tag":40,"props":376,"children":377},{},[378],{"type":23,"value":379},"# 06",{"type":23,"value":46},{"type":17,"tag":40,"props":382,"children":383},{},[384],{"type":23,"value":385},"训练与微调（LoRA/全参）",{"type":17,"tag":25,"props":387,"children":388},{},[389],{"type":23,"value":390},"LoRA在 MindSpore 的一个常见实现：给线性层包一个 A/B 低秩旁路，前向时加上 x @ A @ B * alpha/r。",{"type":17,"tag":25,"props":392,"children":393},{},[394],{"type":23,"value":395},"建议把 LoRA 的参数单独分组，禁用 weight decay；并只在 target 模块（q_proj, v_proj, o_proj, 
w1/w3）上挂。",{"type":17,"tag":105,"props":397,"children":399},{"code":398},"def wrap_lora(linear, r=16, alpha=32):\n",[400],{"type":17,"tag":110,"props":401,"children":402},{"__ignoreMap":7},[403],{"type":23,"value":398},{"type":17,"tag":128,"props":405,"children":406},{},[407],{"type":17,"tag":25,"props":408,"children":409},{},[410],{"type":17,"tag":135,"props":411,"children":412},{},[413,415,420],{"type":23,"value":414},"踩坑 5：MindSpore Graph 下如果你“猴子补丁”forward，要确保图能稳定跟住；",{"type":17,"tag":40,"props":416,"children":417},{},[418],{"type":23,"value":419},"更稳的做法",{"type":23,"value":421},"是写一个 LoraLinear(Cell)包起来。",{"type":17,"tag":18,"props":423,"children":425},{"id":424},"_07-性能小记不玄学",[426,431,432],{"type":17,"tag":40,"props":427,"children":428},{},[429],{"type":23,"value":430},"# 07",{"type":23,"value":46},{"type":17,"tag":40,"props":433,"children":434},{},[435],{"type":23,"value":436},"性能小记（不玄学）",{"type":17,"tag":53,"props":438,"children":439},{},[440,445,450,455,460],{"type":17,"tag":57,"props":441,"children":442},{},[443],{"type":23,"value":444},"GRAPH_MODE + O2 混合精度：不解释。",{"type":17,"tag":57,"props":446,"children":447},{},[448],{"type":23,"value":449},"大 batch prefill：把多条输入拼长些，prefill 吞吐会好不少（当然别 OOM）。",{"type":17,"tag":57,"props":451,"children":452},{},[453],{"type":23,"value":454},"KV Cache 扁平化：把 [bs, head, t, dim]按设备最友好的内存布局摆放（这块我没深挖，简单就地 slice 已经够用）。",{"type":17,"tag":57,"props":456,"children":457},{},[458],{"type":23,"value":459},"避免 Python 回环：decode loop 尽量把张量操作留在图里，减少 host 参与。",{"type":17,"tag":57,"props":461,"children":462},{},[463],{"type":23,"value":464},"检查算子降级：图编译日志里搜 “fallback/host” 之类关键词，别让关键算子跑到 CPU 端。",{"type":17,"tag":18,"props":466,"children":468},{"id":467},"_08-端到端推理样例极简",[469,474,475],{"type":17,"tag":40,"props":470,"children":471},{},[472],{"type":23,"value":473},"# 
08",{"type":23,"value":46},{"type":17,"tag":40,"props":476,"children":477},{},[478],{"type":23,"value":479},"端到端推理样例（极简）",{"type":17,"tag":105,"props":481,"children":482},{"code":107},[483],{"type":17,"tag":110,"props":484,"children":485},{"__ignoreMap":7},[486],{"type":23,"value":107},{"type":17,"tag":128,"props":488,"children":489},{},[490],{"type":17,"tag":25,"props":491,"children":492},{},[493],{"type":17,"tag":135,"props":494,"children":495},{},[496,498,503],{"type":23,"value":497},"踩坑 6：很多人把 pos写死，导致 RoPE 永远用到第 0 行，性能和数值全飞。",{"type":17,"tag":40,"props":499,"children":500},{},[501],{"type":23,"value":502},"prefill 后 pos 应等于上下文长度",{"type":23,"value":504},"，decode 逐步 +1。",{"type":17,"tag":18,"props":506,"children":508},{"id":507},"_09-常见报错对照以防手忙脚乱",[509,514,515],{"type":17,"tag":40,"props":510,"children":511},{},[512],{"type":23,"value":513},"# 09",{"type":23,"value":46},{"type":17,"tag":40,"props":516,"children":517},{},[518],{"type":23,"value":519},"常见报错对照（以防手忙脚乱）",{"type":17,"tag":53,"props":521,"children":522},{},[523,548,558,568],{"type":17,"tag":57,"props":524,"children":525},{},[526,531,533,539,541,546],{"type":17,"tag":40,"props":527,"children":528},{},[529],{"type":23,"value":530},"Shape 不一致",{"type":23,"value":532},"：尤其 ",{"type":17,"tag":110,"props":534,"children":536},{"className":535},[],[537],{"type":23,"value":538},"attention_mask",{"type":23,"value":540},"，MindSpore 的广播规则和你在 PyTorch 的“侥幸成功”未必一致，",{"type":17,"tag":40,"props":542,"children":543},{},[544],{"type":23,"value":545},"显式 reshape",{"type":23,"value":547},"保命。",{"type":17,"tag":57,"props":549,"children":550},{},[551,556],{"type":17,"tag":40,"props":552,"children":553},{},[554],{"type":23,"value":555},"LayerNorm gamma/beta",{"type":23,"value":557},"：权重名映射遗漏，或 beta 多出来。",{"type":17,"tag":57,"props":559,"children":560},{},[561,566],{"type":17,"tag":40,"props":562,"children":563},{},[564],{"type":23,"value":565},"溢出",{"type":23,"value":567},"：fp16 的 matmul 穿了，loss scale 或者切到 
bf16。",{"type":17,"tag":57,"props":569,"children":570},{},[571,576],{"type":17,"tag":40,"props":572,"children":573},{},[574],{"type":23,"value":575},"图编译卡慢",{"type":23,"value":577},"：第一次长一些正常，第二次还慢，看看是否每次都在重建图（输入 shape 乱飘）。",{"type":17,"tag":18,"props":579,"children":581},{"id":580},"_10-小结",[582,587,588],{"type":17,"tag":40,"props":583,"children":584},{},[585],{"type":23,"value":586},"# 10",{"type":23,"value":46},{"type":17,"tag":40,"props":589,"children":590},{},[591],{"type":23,"value":592},"小结",{"type":17,"tag":25,"props":594,"children":595},{},[596],{"type":23,"value":597},"迁 Llama 到 MindSpore 没有想象中那么可怕，难点集中在键名映射、RoPE 位移、KV Cache 写法三件事。",{"type":17,"tag":25,"props":599,"children":600},{},[601],{"type":23,"value":602},"一旦跑通，Ascend 上的吞吐和能效都挺能打。别追求一步封神，先上一个“能打”的版本，再迭代优化。",{"type":17,"tag":25,"props":604,"children":605},{},[606],{"type":23,"value":607},"最后，再次提醒自己：少写骚代码，别给图编译添堵。有时候“朴素写法”反而更快更稳（这点我已经被现实教育过两次，脸疼）。",{"title":7,"searchDepth":609,"depth":609,"links":610},4,[],"markdown","content:technology-blogs:zh:3902.md","content","technology-blogs/zh/3902.md","technology-blogs/zh/3902","md",1776506136712]