[{"data":1,"prerenderedAt":531},["ShallowReactive",2],{"content-query-VTq7kvwA6S":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":525,"_id":526,"_source":527,"_file":528,"_stem":529,"_extension":530},"/news/zh/3630","zh",false,"","鹏城实验室发布基于昇思框架、昇腾硬件的强化学习训练框架GRPO-Training-Suite","当前代码已在启智社区和昇思社区开源，根据本教程，用户可以快速上手体验，探索强化学习的奥秘！","2025-02-28","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/03/03/3656c3af697a4c6eb54e11fcbc5651d3.png","news",{"type":14,"children":15,"toc":522},"root",[16,24,30,34,39,50,59,64,69,89,94,102,107,112,119,128,143,150,164,169,177,189,197,207,215,223,231,244,249,257,262,270,275,288,293,298,306,311,319,323,331,339,347,351,359,364,372,380,384,392,406,414,419,424,432,437,445,453,458,463,471,479,484,489,497,501,509,514],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"鹏城实验室发布基于昇思框架昇腾硬件的强化学习训练框架grpo-training-suite",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":23,"value":29},"本次发布基于Qwen2.5-7B、32B打通GRPO强化学习训练全流程，为强化学习开发者提供了训练接口，支持算法快速开发，提供多种训练优化技术，并内置GRPO强化学习训练流程。",{"type":17,"tag":25,"props":31,"children":32},{},[33],{"type":23,"value":9},{"type":17,"tag":25,"props":35,"children":36},{},[37],{"type":23,"value":38},"开源代码仓：",{"type":17,"tag":25,"props":40,"children":41},{},[42],{"type":17,"tag":43,"props":44,"children":48},"a",{"href":45,"rel":46},"https://openi.pcl.ac.cn/PCL-Reasoner/GRPO-Training-Suite",[47],"nofollow",[49],{"type":23,"value":45},{"type":17,"tag":25,"props":51,"children":52},{},[53],{"type":17,"tag":43,"props":54,"children":57},{"href":55,"rel":56},"https://gitee.com/mindspore/mindrlhf",[47],[58],{"type":23,"value":55},{"type":17,"tag":25,"props":60,"children":61},{},[62],{"type":23,"value":63},"鹏城实验室基于昇思MindSpore 
AI框架及昇腾AI硬件，率先突破大模型强化学习技术全栈壁垒。通过自主创新硬件、框架及超大规模集群的深度协同，成功构建从硬件算力、算法优化到集群调度的完整技术链条，实现GRPO强化学习训练在Qwen2.5（7B，32B）上的全流程部署，发布并开源GRPO强化学习训练流程和代码。",{"type":17,"tag":25,"props":65,"children":66},{},[67],{"type":23,"value":68},"与我们以往开发的单模型训练代码相比，GRPO强化学习训练流程涉及策略模型与参考模型，通过策略模型生成数据，利用参考模型和奖励函数计算Loss，然后进行策略模型的训练。这个过程需要模型在推理和训练状态间的频繁切换，并涉及参考模型和策略模型推理、训练三份权重，对训练性能和显存管理提出了更高的要求，对于强化学习开发者来说，快速完成算法开发和模型训练是个不小的挑战。昇思MindSpore框架为GRPO强化学习训练流程提供了优化技术：",{"type":17,"tag":70,"props":71,"children":72},"ul",{},[73,79,84],{"type":17,"tag":74,"props":75,"children":76},"li",{},[77],{"type":23,"value":78},"组件化解耦训练流程与模型定义，支持用户自定义修改模型结构、奖励函数、训练超参等。",{"type":17,"tag":74,"props":80,"children":81},{},[82],{"type":23,"value":83},"训推共部署，实现训练和推理权重在线快速自动重排，避免权重文件落盘操作，节省离线转换保存权重文件的时间开销。",{"type":17,"tag":74,"props":85,"children":86},{},[87],{"type":23,"value":88},"通过异构内存Swap技术，按需加载模型至显存，避免训练和推理的权重同时存在，支持更大规模模型的训练任务。",{"type":17,"tag":25,"props":90,"children":91},{},[92],{"type":23,"value":93},"下一步，鹏城实验室将基于昇思MindSpore及昇腾AI硬件，在代码自动生成优化、科学计算等下游任务上孵化自主创新大模型，并且提供对接多种模型的强化学习流程。",{"type":17,"tag":25,"props":95,"children":96},{},[97],{"type":17,"tag":98,"props":99,"children":101},"img",{"alt":7,"src":100},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/03/01/5fcfa9f8db0b4650937b6d6dca3b201d.png",[],{"type":17,"tag":25,"props":103,"children":104},{},[105],{"type":23,"value":106},"GRPO（Group Relative Policy Optimization，组相对策略优化）是针对数学等逻辑推理任务提出的强化学习训练的算法。强化学习的训练过程是学习一个策略模型，通过不断试错，策略模型与奖励函数的不断交互，策略模型会逐渐倾向于选择能获得更高奖励的行为，自主探索出最佳学习路径。通过GRPO算法的大规模后训练得到的DeepSeek R1模型在逻辑推理能力上得到了显著提升，涌现出了长思维链和反思等深度思考能力，其在数学和编程任务上的表现已超越或媲美OpenAI o1系列模型。",{"type":17,"tag":25,"props":108,"children":109},{},[110],{"type":23,"value":111},"GRPO创新性地引入“组内相对比较”机制，这样的设计直接利用同一批次样本中不同策略输出的相对优势评估，既降低了显存占用，又显著提升了训练稳定性‌。同时，GRPO强化学习训练流程采用‌基于规则的显式奖励函数‌，而非奖励模型‌。这种方法减少了奖励函数被对抗性样本“欺骗”（reward 
hacking）的风险，使奖励信号更透明、可解释且与任务目标强对齐。",{"type":17,"tag":25,"props":113,"children":114},{},[115],{"type":17,"tag":98,"props":116,"children":118},{"alt":7,"src":117},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/03/01/d71332bd394c44a7944b66cc152c25ee.png",[],{"type":17,"tag":25,"props":120,"children":121},{},[122],{"type":17,"tag":123,"props":124,"children":125},{},[126],{"type":23,"value":127},"一、环境搭建",{"type":17,"tag":25,"props":129,"children":130},{},[131,136,138],{"type":17,"tag":123,"props":132,"children":133},{},[134],{"type":23,"value":135},"01",{"type":23,"value":137}," ",{"type":17,"tag":123,"props":139,"children":140},{},[141],{"type":23,"value":142},"版本匹配关系",{"type":17,"tag":25,"props":144,"children":145},{},[146],{"type":17,"tag":98,"props":147,"children":149},{"alt":7,"src":148},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/03/01/9b2e1ef1554c489f9c2263ed565ea2bf.png",[],{"type":17,"tag":25,"props":151,"children":152},{},[153,158,159],{"type":17,"tag":123,"props":154,"children":155},{},[156],{"type":23,"value":157},"02",{"type":23,"value":137},{"type":17,"tag":123,"props":160,"children":161},{},[162],{"type":23,"value":163},"镜像安装",{"type":17,"tag":25,"props":165,"children":166},{},[167],{"type":23,"value":168},"为了方便复现，我们也提供了镜像。",{"type":17,"tag":25,"props":170,"children":171},{},[172],{"type":17,"tag":123,"props":173,"children":174},{},[175],{"type":23,"value":176},"2.1下载Docker镜像",{"type":17,"tag":25,"props":178,"children":179},{},[180,182],{"type":23,"value":181},"从GRPO-Training-Container仓中下载镜像文件（",{"type":17,"tag":43,"props":183,"children":186},{"href":184,"rel":185},"https://openi.pcl.ac.cn/PCL-Reasoner/GRPO-Training-Container.git",[47],[187],{"type":23,"value":188},"https://openi.pcl.ac.cn/PCL-Reasoner/GRPO-Training-Container.git）",{"type":17,"tag":25,"props":190,"children":191},{},[192],{"type":17,"tag":123,"props":193,"children":194},{},[195],{"type":23,"value":196},"2.2基于镜像创建容器",{"typ
e":17,"tag":198,"props":199,"children":201},"pre",{"code":200},"docker run -itd  --privileged  --network=host \\\n   --shm-size 500g \\\n   --device=/dev/davinci0 \\\n   --device=/dev/davinci1 \\\n   --device=/dev/davinci2 \\\n   --device=/dev/davinci3 \\\n   --device=/dev/davinci4 \\\n   --device=/dev/davinci5 \\\n   --device=/dev/davinci6 \\\n   --device=/dev/davinci7 \\\n   --device=/dev/davinci_manager \\\n   --device=/dev/hisi_hdc \\\n   --device /dev/devmm_svm \\\n   -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \\\n   -v /usr/local/Ascend/firmware:/usr/local/Ascend/firmware \\\n   -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \\\n   -v /usr/local/sbin:/usr/local/sbin \\\n   -v /etc/hccn.conf:/etc/hccn.conf \\\n\n   CONTAINER_NAME:TAG \\\n   bash\n",[202],{"type":17,"tag":203,"props":204,"children":205},"code",{"__ignoreMap":7},[206],{"type":23,"value":200},{"type":17,"tag":25,"props":208,"children":209},{},[210],{"type":17,"tag":123,"props":211,"children":212},{},[213],{"type":23,"value":214},"2.3进入容器",{"type":17,"tag":198,"props":216,"children":218},{"code":217},"docker exec -it CONTAINER_ID bash\n",[219],{"type":17,"tag":203,"props":220,"children":221},{"__ignoreMap":7},[222],{"type":23,"value":217},{"type":17,"tag":25,"props":224,"children":225},{},[226],{"type":17,"tag":123,"props":227,"children":228},{},[229],{"type":23,"value":230},"二、使用指南",{"type":17,"tag":25,"props":232,"children":233},{},[234,238,239],{"type":17,"tag":123,"props":235,"children":236},{},[237],{"type":23,"value":135},{"type":23,"value":137},{"type":17,"tag":123,"props":240,"children":241},{},[242],{"type":23,"value":243},"数据集及文件的获取",{"type":17,"tag":25,"props":245,"children":246},{},[247],{"type":23,"value":248},"使用examples/qwen_grpo_tutorial/rlhf_data.py将GSM8k.json转换成mindrecord的形式，此数据路径为mind_dataset_dir的取值。此数据路径在启动训推作为mind_dataset_dir 的值。",{"type":17,"tag":198,"props":250,"children":252},{"code":251},"python rlhf_data.py --vocab_path /path/to/vocab.json --merges_file_path 
/path/to/merges.txt --file_path /path/to/raw/data/ --output_path /path/to/mindrecord/\n",[253],{"type":17,"tag":203,"props":254,"children":255},{"__ignoreMap":7},[256],{"type":23,"value":251},{"type":17,"tag":25,"props":258,"children":259},{},[260],{"type":23,"value":261},"参数说明：",{"type":17,"tag":198,"props":263,"children":265},{"code":264},"vocab_path：vocab.json路径\nmerges_file_path：merges.txt路径\nfile_path：原始数据文件路径\noutput_path：输出文件路径\n",[266],{"type":17,"tag":203,"props":267,"children":268},{"__ignoreMap":7},[269],{"type":23,"value":264},{"type":17,"tag":25,"props":271,"children":272},{},[273],{"type":23,"value":274},"其中vocab.json和merges.txt 都可以从Huggingface社区对应模型页面获取。",{"type":17,"tag":25,"props":276,"children":277},{},[278,282,283],{"type":17,"tag":123,"props":279,"children":280},{},[281],{"type":23,"value":157},{"type":23,"value":137},{"type":17,"tag":123,"props":284,"children":285},{},[286],{"type":23,"value":287},"权重获取",{"type":17,"tag":25,"props":289,"children":290},{},[291],{"type":23,"value":292},"由于训练是计算密集型，生成是内存密集型，所以为了最大限度的优化性能，推理与训练阶段采用的并行策略往往不同。因此需要针对推理和训练模型分别进行切分。",{"type":17,"tag":25,"props":294,"children":295},{},[296],{"type":23,"value":297},"下面介绍了获得不同权重切分的方法。",{"type":17,"tag":25,"props":299,"children":300},{},[301],{"type":17,"tag":123,"props":302,"children":303},{},[304],{"type":23,"value":305},"2.1 MindSpore权重转换",{"type":17,"tag":25,"props":307,"children":308},{},[309],{"type":23,"value":310},"完整权重转为MindSpore用的ckpt，进入MindSpore TransFormers路径下。",{"type":17,"tag":198,"props":312,"children":314},{"code":313},"cd research/qwen2_5/\n\npython convert_weight.py --torch_ckpt_dir /path/to/your/torch/ckpt/  \\\n--mindspore_ckpt_path /path/to/save/ms/ckpt --dtype bf16 --config_path \\  
research/qwen2_5/finetune_qwen2_5_7b.yaml\n",[315],{"type":17,"tag":203,"props":316,"children":317},{"__ignoreMap":7},[318],{"type":23,"value":313},{"type":17,"tag":25,"props":320,"children":321},{},[322],{"type":23,"value":261},{"type":17,"tag":198,"props":324,"children":326},{"code":325},"torch_ckpt_dir：torch权重文件\nmindspore_ckpt_path：mindspore权重保存路径\nconfig_path：模型权重配置文件\n",[327],{"type":17,"tag":203,"props":328,"children":329},{"__ignoreMap":7},[330],{"type":23,"value":325},{"type":17,"tag":25,"props":332,"children":333},{},[334],{"type":17,"tag":123,"props":335,"children":336},{},[337],{"type":23,"value":338},"2.2 获得策略文件",{"type":17,"tag":198,"props":340,"children":342},{"code":341},"cd /path/to/your/mindformers/research/qwen2_5\n\nbash ../../scripts/msrun_launcher.sh \"run_qwen2_5.py \\\n--config /path/to/your/desired/model/yaml \\\n--run_mode finetune \\\n--train_data /path/to/mindrecord \" 8 PORT output/msrun_log False 2000\n",[343],{"type":17,"tag":203,"props":344,"children":345},{"__ignoreMap":7},[346],{"type":23,"value":341},{"type":17,"tag":25,"props":348,"children":349},{},[350],{"type":23,"value":261},{"type":17,"tag":198,"props":352,"children":354},{"code":353},"# run_qwen2_5.py 参数\nconfig: 模型的配置文件\nrun_mode: 运行模式选微调\ntrain_data: 训练用的数据文件\n# msrun_launcher.sh 参数\n单机上卡数8\nPORT为节点PORT\njoin=False\ntimeout=2000\n",[355],{"type":17,"tag":203,"props":356,"children":357},{"__ignoreMap":7},[358],{"type":23,"value":353},{"type":17,"tag":25,"props":360,"children":361},{},[362],{"type":23,"value":363},"生成的策略文件在strategy下，在下一步切分ckpt时作为dst_strategy的值。",{"type":17,"tag":25,"props":365,"children":366},{},[367],{"type":17,"tag":123,"props":368,"children":369},{},[370],{"type":23,"value":371},"2.3 获得特定切分的ckpt",{"type":17,"tag":198,"props":373,"children":375},{"code":374},"nohup python transform_checkpoint.py --src_checkpoint=/path/to/checkpoint.ckpt --dst_checkpoint=/path/to/desired/ckpt/ --dst_strategy=/path/to/strategy/ > output.log 2>&1 
&\n",[376],{"type":17,"tag":203,"props":377,"children":378},{"__ignoreMap":7},[379],{"type":23,"value":374},{"type":17,"tag":25,"props":381,"children":382},{},[383],{"type":23,"value":261},{"type":17,"tag":198,"props":385,"children":387},{"code":386},"src_checkpoint：原始权重路径\ndst_checkpoint：目标权重路径\ndst_strategy：目标权重策略文件路径\n",[388],{"type":17,"tag":203,"props":389,"children":390},{"__ignoreMap":7},[391],{"type":23,"value":386},{"type":17,"tag":25,"props":393,"children":394},{},[395,400,401],{"type":17,"tag":123,"props":396,"children":397},{},[398],{"type":23,"value":399},"03",{"type":23,"value":137},{"type":17,"tag":123,"props":402,"children":403},{},[404],{"type":23,"value":405},"训练/推理模型配置",{"type":17,"tag":25,"props":407,"children":408},{},[409],{"type":17,"tag":123,"props":410,"children":411},{},[412],{"type":23,"value":413},"3.1 训练模型配置",{"type":17,"tag":25,"props":415,"children":416},{},[417],{"type":23,"value":418},"训练的模型的配置finetune_qwen2_5_7b.yaml:",{"type":17,"tag":25,"props":420,"children":421},{},[422],{"type":23,"value":423},"并行配置:",{"type":17,"tag":198,"props":425,"children":427},{"code":426},"parallel_config:\n  data_parallel: 1 # 数据并行切分为 1\n  model_parallel: 4 # 模型并行切分为 4\n  pipeline_stage: 2 # 流水线并行切分为 2\n  use_seq_parallel: True\n  micro_batch_num: 2\n  vocab_emb_dp: False\n  gradient_aggregation_group: 4\n  micro_batch_interleave_num: 2 # mp大于1时，设为1可提升训练效率\n",[428],{"type":17,"tag":203,"props":429,"children":430},{"__ignoreMap":7},[431],{"type":23,"value":426},{"type":17,"tag":25,"props":433,"children":434},{},[435],{"type":23,"value":436},"训练相关配置在mindrlhf/configs/grpo_configs.py有学习率和GRPO相关的超参。",{"type":17,"tag":198,"props":438,"children":440},{"code":439},"optimizer: str = 'adamw' # 优化器类型\nbeta1: float = 0.9  # 优化器adamw超参，下同\nbeta2: float = 0.95 \neps: float = 1.0e-8 \nweight_decay: float = 0.01\n\nepochs: int = 100 # 
训练轮数\n",[441],{"type":17,"tag":203,"props":442,"children":443},{"__ignoreMap":7},[444],{"type":23,"value":439},{"type":17,"tag":25,"props":446,"children":447},{},[448],{"type":17,"tag":123,"props":449,"children":450},{},[451],{"type":23,"value":452},"3.2 推理模型配置",{"type":17,"tag":25,"props":454,"children":455},{},[456],{"type":23,"value":457},"推理的模型的配置predict_qwen2_5_7b_instruct.yaml。",{"type":17,"tag":25,"props":459,"children":460},{},[461],{"type":23,"value":462},"并行配置：",{"type":17,"tag":198,"props":464,"children":466},{"code":465},"\nparallel_config:\n  data_parallel: 2 # 数据并行切分为2\n  model_parallel: 4 # 模型并行切分为4\n  pipeline_stage: 1 # 流水线并行切分为1\n  micro_batch_num: 1\n  vocab_emb_dp: False\n  gradient_aggregation_group: 4\n  micro_batch_interleave_num: 1\n",[467],{"type":17,"tag":203,"props":468,"children":469},{"__ignoreMap":7},[470],{"type":23,"value":465},{"type":17,"tag":25,"props":472,"children":473},{},[474],{"type":17,"tag":123,"props":475,"children":476},{},[477],{"type":23,"value":478},"三、启动单机8卡训练脚本",{"type":17,"tag":25,"props":480,"children":481},{},[482],{"type":23,"value":483},"用bash run_grpo.sh启动GRPO强化学习流程。",{"type":17,"tag":25,"props":485,"children":486},{},[487],{"type":23,"value":488},"注意：用户需要确认将MindSpore Transformers和MindSpore RLHF的路径加入PYTHONPATH。",{"type":17,"tag":198,"props":490,"children":492},{"code":491},"msrun --worker_num=8 --local_worker_num=8 --master_addr=127.0.0.1 \\\n--master_port=9190 --join=False --log_dir=./qwen2_5_one_log \\\nexamples/qwen_grpo_tutorial/grpo_one_stage.py \\\n--sft_path_infer ./model_configs/qwen_grpo/predict_qwen2_5_7b_instruct.yaml \\\n--sft_path_train ./model_configs/qwen_grpo/finetune_qwen2_5_7b.yaml \\\n--vocab_path /path/to/your/vocab.json \\\n--merges_file_path /path/to/your/merges.txt \\\n--mind_dataset_dir /path/to/gsm8k.mindrecord \\\n--save_data_file /path/to/grpo.mindrecord \\\n--save_ckpt_dir /path/to/save/ckpt \\\n--use_parallel True \\\n--load_sft_checkpoint_infer /path/to/infer/ckpt 
\\\n--load_sft_checkpoint_train /path/to/train/ckpt \\\n--load_ref_checkpoint /path/to/ref/ckpt \\\n--enable_compile_cache False \\\n--only_save_strategy False\n",[493],{"type":17,"tag":203,"props":494,"children":495},{"__ignoreMap":7},[496],{"type":23,"value":491},{"type":17,"tag":25,"props":498,"children":499},{},[500],{"type":23,"value":261},{"type":17,"tag":198,"props":502,"children":504},{"code":503},"# msrun 参数\nworker_num： 总卡数\nlocal_worker_num： 单机的卡数\nmaster_addr：主节点地址\nmaster_port: 主节点端口\njoin：是否等待所有worker退出\nlog_dir: 日志路径\n# grpo_one_stage.py 参数\nsft_path_infer：推理用的模型配置\nsft_path_train：训练用的模型配置\nvocab_path: vocab.json的路径\nmerges_file_path：merges.txt的路径\nmind_dataset_dir：训练数据文件的路径\nsave_data_file：中间推理结果的保存路径（可选）\nsave_ckpt_dir：训练ckpt的保存路径\nuse_parallel：是否并行\nload_sft_checkpoint_infer: 推理ckpt路径\nload_sft_checkpoint_train: 训练ckpt路径\nload_ref_checkpoint： 参考模型ckpt路径\nenable_compile_cache：是否编译缓存\nonly_save_strategy：是否保存策略文件\n",[505],{"type":17,"tag":203,"props":506,"children":507},{"__ignoreMap":7},[508],{"type":23,"value":503},{"type":17,"tag":25,"props":510,"children":511},{},[512],{"type":23,"value":513},"拉起任务后，通过以下指令查看运行日志。",{"type":17,"tag":198,"props":515,"children":517},{"code":516},"tail -f qwen2_5_one_log/worker_0.log\n",[518],{"type":17,"tag":203,"props":519,"children":520},{"__ignoreMap":7},[521],{"type":23,"value":516},{"title":7,"searchDepth":523,"depth":523,"links":524},4,[],"markdown","content:news:zh:3630.md","content","news/zh/3630.md","news/zh/3630","md",1776506086148]