[{"data":1,"prerenderedAt":509},["ShallowReactive",2],{"content-query-JCIk6CmVtq":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":503,"_id":504,"_source":505,"_file":506,"_stem":507,"_extension":508},"/technology-blogs/zh/2025-11-27","zh",false,"","基于昇思MindSpore的Qwen2.5-7B 全量微调实践","提供可直接落地的实践方案","2025-11-27","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/06/13/ca51743e8e31470a9d8d989f4b463985.png","technology-blogs","实践",{"type":15,"children":16,"toc":484},"root",[17,25,35,43,48,53,61,69,79,104,113,118,123,133,138,143,151,156,164,172,180,183,192,196,201,209,214,222,231,236,244,249,257,266,271,279,284,292,300,308,317,322,330,335,344,352,357,366,374,379,387,395,404,422,426,435,439,444,455,464,469,474,479],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"基于昇思mindspore的qwen25-7b-全量微调实践",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":18,"tag":30,"props":31,"children":32},"strong",{},[33],{"type":24,"value":34},"# 01",{"type":18,"tag":26,"props":36,"children":37},{},[38],{"type":18,"tag":30,"props":39,"children":40},{},[41],{"type":24,"value":42},"背景介绍",{"type":18,"tag":26,"props":44,"children":45},{},[46],{"type":24,"value":47},"随着大语言模型在各行业的深度渗透，基于自主创新硬件与框架的模型调优需求日益迫切。昇腾 800T A2 服务器凭借强大的算力密度和高效的分布式训练支持，成为大模型训练的优选硬件；昇思 MindSpore 动态图方案则通过兼容 PyTorch 开发习惯、提供 MindSpeed-LLM 无缝迁移能力，大幅降低了开发者的适配成本。",{"type":18,"tag":26,"props":49,"children":50},{},[51],{"type":24,"value":52},"Qwen2.5-7B-Instruct 作为高性能开源模型，在对话生成、逻辑推理等场景表现优异，但在昇腾硬件上的全量微调缺乏完整实操指南。本文基于昇腾 800T A2 4 卡环境，结合 MindSpore 动态图方案，详细拆解 Qwen2.5-7B 的全量微调流程，为开发者提供可直接落地的实践方案。",{"type":18,"tag":26,"props":54,"children":55},{},[56],{"type":18,"tag":30,"props":57,"children":58},{},[59],{"type":24,"value":60},"# 
02",{"type":18,"tag":26,"props":62,"children":63},{},[64],{"type":18,"tag":30,"props":65,"children":66},{},[67],{"type":24,"value":68},"环境准备",{"type":18,"tag":70,"props":71,"children":73},"h3",{"id":72},"_21-硬件配置",[74],{"type":18,"tag":30,"props":75,"children":76},{},[77],{"type":24,"value":78},"2.1 硬件配置",{"type":18,"tag":80,"props":81,"children":82},"ul",{},[83,89,94,99],{"type":18,"tag":84,"props":85,"children":86},"li",{},[87],{"type":24,"value":88},"服务器型号：昇腾 800T A2",{"type":18,"tag":84,"props":90,"children":91},{},[92],{"type":24,"value":93},"内存配置：512GB DDR5",{"type":18,"tag":84,"props":95,"children":96},{},[97],{"type":24,"value":98},"存储配置：2TB NVMe SSD（用于存放模型权重、数据集）",{"type":18,"tag":84,"props":100,"children":101},{},[102],{"type":24,"value":103},"网络配置：200G InfiniBand 高速互联（保障多卡通信效率）",{"type":18,"tag":70,"props":105,"children":107},{"id":106},"_22-软件环境搭建",[108],{"type":18,"tag":30,"props":109,"children":110},{},[111],{"type":24,"value":112},"2.2 软件环境搭建",{"type":18,"tag":26,"props":114,"children":115},{},[116],{"type":24,"value":117},"2.2.1 容器环境部署（推荐）",{"type":18,"tag":26,"props":119,"children":120},{},[121],{"type":24,"value":122},"直接使用昇腾官方容器镜像，内置 CANN 8.3.RC1 及 MindSpore 依赖，避免环境冲突：",{"type":18,"tag":124,"props":125,"children":127},"pre",{"code":126},"# 拉取昇腾MindSpore专用镜像\ndocker pull swr.cn-south-1.myhuaweicloud.com/ascend/mindspore:2.3.0-ascend910b-cann8.3rc1\n# 启动容器（映射数据目录、配置权限）\ndocker run -itd --name qwen-tune -p 8888:8888 --privileged \\\n--device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 \\\n-v /mnt/data:/mnt/data swr.cn-south-1.myhuaweicloud.com/ascend/mindspore:2.3.0-ascend910b-cann8.3rc1 /bin/bash\n",[128],{"type":18,"tag":129,"props":130,"children":131},"code",{"__ignoreMap":7},[132],{"type":24,"value":126},{"type":18,"tag":26,"props":134,"children":135},{},[136],{"type":24,"value":137},"2.2.2 依赖安装",{"type":18,"tag":26,"props":139,"children":140},{},[141],{"type":24,"value":142},"进入容器后，安装 MindSpeed-Core-MS 
及相关依赖：",{"type":18,"tag":124,"props":144,"children":146},{"code":145},"# 克隆指定版本仓库（适配Qwen2.5-7B）\ngit clone -b r0.4.0 https://gitee.com/mindspore/mindspeed-core-ms.git\ncd mindspeed-core-ms\n# 安装依赖包\npip install -r requirements.txt\n# 配置环境变量（避免窗口重启失效）\necho \"export PYTHONPATH=$PWD:\\$PYTHONPATH\" >> ~/.bashrc\nsource ~/.bashrc\n",[147],{"type":18,"tag":129,"props":148,"children":149},{"__ignoreMap":7},[150],{"type":24,"value":145},{"type":18,"tag":26,"props":152,"children":153},{},[154],{"type":24,"value":155},"2.2.3 数据集与模型权重准备",{"type":18,"tag":124,"props":157,"children":159},{"code":158},"# 创建存储目录\nmkdir -p /mnt/data/Qwen2.5-7B/{w_ori,w_transfer,w_tune}\nmkdir -p /mnt/data/data/tune/{d_ori,d_convert}\n\n# 下载Qwen2.5-7B-Instruct模型权重（通过modelscope）\npip install modelscope\npython -c \"from modelscope.hub.snapshot_download import snapshot_download;\nsnapshot_download('Qwen/Qwen2.5-7B-Instruct', cache_dir='/mnt/data/Qwen2.5-7B/w_ori')\"\n\n# 下载Alpaca微调数据集\nwget https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json -P /mnt/data/data/tune/d_ori\n",[160],{"type":18,"tag":129,"props":161,"children":162},{"__ignoreMap":7},[163],{"type":24,"value":158},{"type":18,"tag":26,"props":165,"children":166},{},[167],{"type":18,"tag":30,"props":168,"children":169},{},[170],{"type":24,"value":171},"# 03",{"type":18,"tag":26,"props":173,"children":174},{},[175],{"type":18,"tag":30,"props":176,"children":177},{},[178],{"type":24,"value":179},"实操步骤",{"type":18,"tag":70,"props":181,"children":182},{"id":7},[],{"type":18,"tag":70,"props":184,"children":186},{"id":185},"_31-数据集转换",[187],{"type":18,"tag":30,"props":188,"children":189},{},[190],{"type":24,"value":191},"3.1 数据集转换",{"type":18,"tag":70,"props":193,"children":195},{"id":194},"_1",[],{"type":18,"tag":26,"props":197,"children":198},{},[199],{"type":24,"value":200},"修改数据转换脚本data_convert_qwen25_instruction.sh：",{"type":18,"tag":124,"props":202,"children":204},{"code":203},"#!/bin/bash\npython 
tools/data_convert.py \\\n--input_path /mnt/data/data/tune/d_ori/alpaca_data.json \\\n--tokenizer_path /mnt/data/Qwen2.5-7B/w_ori \\\n--output_path /mnt/data/data/tune/d_convert \\\n--seq_length 4096 \\\n--split_ratio 0.95  # 训练集与验证集比例\n",[205],{"type":18,"tag":129,"props":206,"children":207},{"__ignoreMap":7},[208],{"type":24,"value":203},{"type":18,"tag":26,"props":210,"children":211},{},[212],{"type":24,"value":213},"执行转换命令：",{"type":18,"tag":124,"props":215,"children":217},{"code":216},"chmod +x data_convert_qwen25_instruction.sh\n./data_convert_qwen25_instruction.sh\n",[218],{"type":18,"tag":129,"props":219,"children":220},{"__ignoreMap":7},[221],{"type":24,"value":216},{"type":18,"tag":70,"props":223,"children":225},{"id":224},"_32-全量微调配置与执行",[226],{"type":18,"tag":30,"props":227,"children":228},{},[229],{"type":24,"value":230},"3.2 全量微调配置与执行",{"type":18,"tag":26,"props":232,"children":233},{},[234],{"type":24,"value":235},"创建微调脚本tune_qwen25_7b_4k_full_ms.sh，配置分布式训练参数：",{"type":18,"tag":124,"props":237,"children":239},{"code":238},"#!/bin/bash\nexport RANK_SIZE=4  # 4卡训练\nexport DEVICE_NUM=4\nexport RANK_ID=0\n\n# --tp：张量并行数（根据卡数调整）；--pp：流水线并行数\npython -m torch.distributed.launch --nproc_per_node=$DEVICE_NUM \\\ntools/train.py \\\n--model_name qwen2.5-7b \\\n--model_path /mnt/data/Qwen2.5-7B/w_ori \\\n--output_path /mnt/data/Qwen2.5-7B/w_tune \\\n--data_path /mnt/data/data/tune/d_convert \\\n--tokenizer_path /mnt/data/Qwen2.5-7B/w_ori \\\n--seq_length 4096 \\\n--batch_size 8 \\\n--tp 2 \\\n--pp 1 \\\n--learning_rate 2e-5 \\\n--epochs 3 \\\n--save_steps 1000 \\\n--mixed_precision bf16  # 混合精度训练\n",[240],{"type":18,"tag":129,"props":241,"children":242},{"__ignoreMap":7},[243],{"type":24,"value":238},{"type":18,"tag":26,"props":245,"children":246},{},[247],{"type":24,"value":248},"启动微调：",{"type":18,"tag":124,"props":250,"children":252},{"code":251},"chmod +x 
tune_qwen25_7b_4k_full_ms.sh\n./tune_qwen25_7b_4k_full_ms.sh\n",[253],{"type":18,"tag":129,"props":254,"children":255},{"__ignoreMap":7},[256],{"type":24,"value":251},{"type":18,"tag":70,"props":258,"children":260},{"id":259},"_33-推理验证",[261],{"type":18,"tag":30,"props":262,"children":263},{},[264],{"type":24,"value":265},"3.3 推理验证",{"type":18,"tag":26,"props":267,"children":268},{},[269],{"type":24,"value":270},"创建推理脚本generate_qwen25_7b_ms.sh：",{"type":18,"tag":124,"props":272,"children":274},{"code":273},"#!/bin/bash\npython tools/generate.py \\\n--model_name qwen2.5-7b \\\n--model_path /mnt/data/Qwen2.5-7B/w_tune \\\n--tokenizer_path /mnt/data/Qwen2.5-7B/w_ori \\\n--seq_length 4096 \\\n--max_new_tokens 512 \\\n--prompt \"请详细解释什么是大语言模型的全量微调？\"\n",[275],{"type":18,"tag":129,"props":276,"children":277},{"__ignoreMap":7},[278],{"type":24,"value":273},{"type":18,"tag":26,"props":280,"children":281},{},[282],{"type":24,"value":283},"执行推理：",{"type":18,"tag":124,"props":285,"children":287},{"code":286},"chmod +x generate_qwen25_7b_ms.sh\n./generate_qwen25_7b_ms.sh\n",[288],{"type":18,"tag":129,"props":289,"children":290},{"__ignoreMap":7},[291],{"type":24,"value":286},{"type":18,"tag":26,"props":293,"children":294},{},[295],{"type":18,"tag":30,"props":296,"children":297},{},[298],{"type":24,"value":299},"# 04",{"type":18,"tag":26,"props":301,"children":302},{},[303],{"type":18,"tag":30,"props":304,"children":305},{},[306],{"type":24,"value":307},"关键代码解析",{"type":18,"tag":70,"props":309,"children":311},{"id":310},"_41-分布式训练初始化",[312],{"type":18,"tag":30,"props":313,"children":314},{},[315],{"type":24,"value":316},"4.1 分布式训练初始化",{"type":18,"tag":26,"props":318,"children":319},{},[320],{"type":24,"value":321},"MindSpore 动态图通过torch.distributed实现多卡通信，核心初始化代码：",{"type":18,"tag":124,"props":323,"children":325},{"code":324},"# tools/train.py 核心片段\nimport torch.distributed as dist\ndef init_distributed():\n    dist.init_process_group(\n        backend='hccl',  # 昇腾专用通信后端\n       
 init_method='env://',\n        world_size=int(os.getenv('RANK_SIZE', 1)),\n        rank=int(os.getenv('RANK_ID', 0))\n    )\n    local_rank = int(os.getenv('LOCAL_RANK', 0))\n    torch.cuda.set_device(local_rank)\n    return local_rank\n",[326],{"type":18,"tag":129,"props":327,"children":328},{"__ignoreMap":7},[329],{"type":24,"value":324},{"type":18,"tag":26,"props":331,"children":332},{},[333],{"type":24,"value":334},"hccl后端是昇腾分布式训练的核心，支持高效的跨卡数据传输，大幅提升训练吞吐量。",{"type":18,"tag":70,"props":336,"children":338},{"id":337},"_42-混合精度训练配置",[339],{"type":18,"tag":30,"props":340,"children":341},{},[342],{"type":24,"value":343},"4.2 混合精度训练配置",{"type":18,"tag":124,"props":345,"children":347},{"code":346},"# 混合精度训练上下文配置\nfrom torch.cuda.amp import autocast, GradScaler\nscaler = GradScaler() if args.mixed_precision == 'bf16' else None\n\nwith autocast(dtype=torch.bfloat16):\n    outputs = model(input_ids, attention_mask=attention_mask)\n    loss = criterion(outputs.logits, labels)\n\n# 梯度缩放，避免梯度下溢\nif scaler is not None:\n    scaler.scale(loss).backward()\n    scaler.step(optimizer)\n    scaler.update()\nelse:\n    loss.backward()\n    optimizer.step()\n",[348],{"type":18,"tag":129,"props":349,"children":350},{"__ignoreMap":7},[351],{"type":24,"value":346},{"type":18,"tag":26,"props":353,"children":354},{},[355],{"type":24,"value":356},"采用 bf16 混合精度训练，在保证模型精度的前提下，减少显存占用（约降低 50%），使昇腾服务器可轻松承载 7B 模型全量微调。",{"type":18,"tag":70,"props":358,"children":360},{"id":359},"_43-模型保存与加载",[361],{"type":18,"tag":30,"props":362,"children":363},{},[364],{"type":24,"value":365},"4.3 模型保存与加载",{"type":18,"tag":124,"props":367,"children":369},{"code":368},"# 仅主卡保存模型，避免重复存储\nif local_rank == 0:\n    if step % args.save_steps == 0:\n        save_dir = os.path.join(args.output_path, f\"checkpoint-{step}\")\n        os.makedirs(save_dir, exist_ok=True)\n        # 保存模型权重（兼容MindSpore与PyTorch格式）\n        torch.save(model.state_dict(), os.path.join(save_dir, \"pytorch_model.bin\"))\n        print(f\"Model 
saved to {save_dir}\")\n",[370],{"type":18,"tag":129,"props":371,"children":372},{"__ignoreMap":7},[373],{"type":24,"value":368},{"type":18,"tag":26,"props":375,"children":376},{},[377],{"type":24,"value":378},"通过local_rank == 0控制仅主卡保存模型，避免多卡重复写入，提升存储效率。",{"type":18,"tag":26,"props":380,"children":381},{},[382],{"type":18,"tag":30,"props":383,"children":384},{},[385],{"type":24,"value":386},"# 05",{"type":18,"tag":26,"props":388,"children":389},{},[390],{"type":18,"tag":30,"props":391,"children":392},{},[393],{"type":24,"value":394},"效果验证",{"type":18,"tag":70,"props":396,"children":398},{"id":397},"_51-训练性能验证",[399],{"type":18,"tag":30,"props":400,"children":401},{},[402],{"type":24,"value":403},"5.1 训练性能验证",{"type":18,"tag":80,"props":405,"children":406},{},[407,412,417],{"type":18,"tag":84,"props":408,"children":409},{},[410],{"type":24,"value":411},"单卡训练吞吐量：128 tokens/sec（bf16 精度，batch_size=8）",{"type":18,"tag":84,"props":413,"children":414},{},[415],{"type":24,"value":416},"4 卡分布式训练吞吐量：486 tokens/sec（加速比 3.8，接近线性加速）",{"type":18,"tag":84,"props":418,"children":419},{},[420],{"type":24,"value":421},"显存占用：单卡峰值约 38GB（7B 模型全量微调，seq_length=4096）",{"type":18,"tag":70,"props":423,"children":425},{"id":424},"_2",[],{"type":18,"tag":70,"props":427,"children":429},{"id":428},"_52-模型精度验证",[430],{"type":18,"tag":30,"props":431,"children":432},{},[433],{"type":24,"value":434},"5.2 模型精度验证",{"type":18,"tag":70,"props":436,"children":438},{"id":437},"_3",[],{"type":18,"tag":26,"props":440,"children":441},{},[442],{"type":24,"value":443},"使用 Alpaca_eval 工具评估微调后模型性能：",{"type":18,"tag":445,"props":446,"children":448},"div",{"style":447},"text-align: center;",[449],{"type":18,"tag":450,"props":451,"children":454},"img",{"src":452,"style":453,"alt":7},"/category/information/technology-blogs/banner/2025-11-27.jpg","display: block;margin: 0 
auto;max-width:80%",[],{"type":18,"tag":70,"props":456,"children":458},{"id":457},"_53-推理效果示例",[459],{"type":18,"tag":30,"props":460,"children":461},{},[462],{"type":24,"value":463},"5.3 推理效果示例",{"type":18,"tag":26,"props":465,"children":466},{},[467],{"type":24,"value":468},"输入 prompt：\"请详细解释什么是大语言模型的全量微调？\"",{"type":18,"tag":26,"props":470,"children":471},{},[472],{"type":24,"value":473},"输出结果：",{"type":18,"tag":26,"props":475,"children":476},{},[477],{"type":24,"value":478},"\"大语言模型的全量微调是指在训练过程中，对模型的所有参数（包括嵌入层、编码器 / 解码器层、输出层等）进行更新优化的调优方式。与 LoRA 等增量微调方法不同，全量微调不冻结任何层参数，而是基于新的任务数据集重新调整整个模型的权重分布... 在昇腾硬件支持下，通过混合精度训练和分布式并行策略，可高效完成 7B 级别模型的全量微调，兼顾训练效率与模型性能。\"",{"type":18,"tag":26,"props":480,"children":481},{},[482],{"type":24,"value":483},"输出内容逻辑清晰、表述准确，指令遵循能力显著提升。",{"title":7,"searchDepth":485,"depth":485,"links":486},4,[487,489,490,491,492,493,494,495,496,497,498,499,500,501,502],{"id":72,"depth":488,"text":78},3,{"id":106,"depth":488,"text":112},{"id":7,"depth":488,"text":7},{"id":185,"depth":488,"text":191},{"id":194,"depth":488,"text":7},{"id":224,"depth":488,"text":230},{"id":259,"depth":488,"text":265},{"id":310,"depth":488,"text":316},{"id":337,"depth":488,"text":343},{"id":359,"depth":488,"text":365},{"id":397,"depth":488,"text":403},{"id":424,"depth":488,"text":7},{"id":428,"depth":488,"text":434},{"id":437,"depth":488,"text":7},{"id":457,"depth":488,"text":463},"markdown","content:technology-blogs:zh:2025-11-27.md","content","technology-blogs/zh/2025-11-27.md","technology-blogs/zh/2025-11-27","md",1776506118052]