[{"data":1,"prerenderedAt":320},["ShallowReactive",2],{"content-query-xzXOeP11dL":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":314,"_id":315,"_source":316,"_file":317,"_stem":318,"_extension":319},"/news/zh/3701","zh",false,"","上海交通大学联合MindSpore与openEuler社区，实现DeepSeek全栈开源单机推理部署","2025年1月DeepSeek-R1发布以来，已触发了AI大模型产业的一系列进化与变革。","2025-04-17","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/04/18/4b561cdc4c1040e58fdf75a6bbea7e6c.png","news",{"type":14,"children":15,"toc":311},"root",[16,24,29,34,39,44,49,54,59,64,89,98,106,111,116,121,126,134,139,144,149,154,159,164,169,174,182,190,195,200,205,210,217,222,227,232,237,242,251,256,265,270,279,294,299,306],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"上海交通大学联合mindspore与openeuler社区实现deepseek全栈开源单机推理部署",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":23,"value":9},{"type":17,"tag":25,"props":30,"children":31},{},[32],{"type":23,"value":33},"一方面，DeepSeek-V3/R1以其卓越的交互体验，7天即揽获过亿用户。3月24日发布的DeepSeek-V3-0324更在代码生成、技术写作等多项任务能力上有了质的飞跃，使得AI大模型加速成为新一代生产力工具。",{"type":17,"tag":25,"props":35,"children":36},{},[37],{"type":23,"value":38},"另一方面，DeepSeek-V3/R1依托MoE网络结构、动态冗余专家等一系列巧妙设计，显著提升了训推性价比，使得中小企业有能力定制训练和部署专有模型，有望加速AI+产业赋能。",{"type":17,"tag":25,"props":40,"children":41},{},[42],{"type":23,"value":43},"然而，伴随着DeepSeek-V3/R1的持续火爆，其大规模产业落地，仍面临诸多挑战：",{"type":17,"tag":25,"props":45,"children":46},{},[47],{"type":23,"value":48},"1、部署门槛高：在裸金属服务器或云主机上部署DeepSeek-V3/R1推理服务，涉及AI框架、推理加速库、推理服务框架等10+个开源软件的安装调测，学习成本高，探索路径曲折；",{"type":17,"tag":25,"props":50,"children":51},{},[52],{"type":23,"value":53},"2、部署满血版成本高：满血版DeepSeek-V3/R1采用BFloat16浮点格式的权重文件部署，通常需4台Atlas 800T 
A2服务器（64GB），部署成本相对较高。",{"type":17,"tag":25,"props":55,"children":56},{},[57],{"type":23,"value":58},"3、性能优化难：以昇腾为代表的国产化算力平台，此前缺乏性能领先、全栈开源的DeepSeek-V3/R1推理解决方案，阻碍了行业化定制调优。",{"type":17,"tag":25,"props":60,"children":61},{},[62],{"type":23,"value":63},"3月26日，上海交通大学并行与分布式系统研究所联合昇思MindSpore社区、openEuler社区，联合开发验证了基于vLLM+openEuler+MindSpore、全栈开源的DeepSeek推理解决方案。双方研发人员协同完成DeepSeek-R1/V3-0324权重参数的Int4量化，在精度几乎无损的条件下，实现单台Atlas 800T A2服务器（64GB）部署运行DeepSeek大模型推理服务。同时，验证了DeepSeek-R1/V3-0324 Int8量化推理，其系统总吞吐性能较前序版本提升幅度达15%。",{"type":17,"tag":25,"props":65,"children":66},{},[67,69,78,80,87],{"type":23,"value":68},"值得关注的是，该解决方案的DeepSeek-V3/R1 Int8量化推理版本，此前已与北京大学联合进行了技术验证，并参展华为计算合作伙伴大会，支持RAG、智能运维、智能调优Agent等应用。相关技术细节可参阅《",{"type":17,"tag":70,"props":71,"children":75},"a",{"href":72,"rel":73},"https://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247630547&idx=1&sn=75c07ecde9740aac326b4a02fb54784c&scene=21#wechat_redirect",[74],"nofollow",[76],{"type":23,"value":77},"北京大学联合openEuler与MindSpore发布DeepSeek全栈开源解决方案",{"type":23,"value":79},"》、《",{"type":17,"tag":70,"props":81,"children":84},{"href":82,"rel":83},"https://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247630851&idx=3&sn=49203ab228fe623f26d113a9ff3e26e1&scene=21#wechat_redirect",[74],[85],{"type":23,"value":86},"openEuler全栈开源推理方案亮相华为伙伴大会",{"type":23,"value":88},"》。",{"type":17,"tag":25,"props":90,"children":91},{},[92],{"type":17,"tag":93,"props":94,"children":95},"strong",{},[96],{"type":23,"value":97},"# 01",{"type":17,"tag":25,"props":99,"children":100},{},[101],{"type":17,"tag":93,"props":102,"children":103},{},[104],{"type":23,"value":105},"技术亮点解析",{"type":17,"tag":25,"props":107,"children":108},{},[109],{"type":23,"value":110},"1.1 昇思MindSpore Int4权重量化推理",{"type":17,"tag":25,"props":112,"children":113},{},[114],{"type":23,"value":115},"DeepSeek是首个使用FP8浮点格式进行全流程训练和推理的大模型，但其千亿级的庞大参数量仍对硬件部署环境提出了极高的要求。借助昇思MindSpore金箍棒套件，成功实现了对DeepSeek-R1/V3-0324权重参数的W4A16 
Int4权重量化，将硬件部署成本进一步减半。",{"type":17,"tag":25,"props":117,"children":118},{},[119],{"type":23,"value":120},"昇思MindSpore金箍棒套件使用GPTQ算法，通过分层配置DeepSeek-R1/V3-0324网络的量化策略，有效平衡了量化模型精度和显存占用。面向DeepSeek MoE结构，使用Hessian矩阵的二阶信息约束权重量化前后的模型输出差异，实现高精度的Int4量化。同时，采用逐块量化策略，降低量化校准耗时和显存占用，在完成一批参数的量化后，使用逆Hessian矩阵信息对后一批参数的权重进行补偿。如表1所示，Int4权重量化后的DeepSeek-R1推理模型ceval评分，较BFloat16浮点格式几乎无退化，但权重文件和显存占用减少约75%。",{"type":17,"tag":25,"props":122,"children":123},{},[124],{"type":23,"value":125},"表1 DeepSeek-R1/V3-0324 Int4量化推理测试结果",{"type":17,"tag":25,"props":127,"children":128},{},[129],{"type":17,"tag":130,"props":131,"children":133},"img",{"alt":7,"src":132},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/04/18/6ce9283c9cef43ef884f4c5f1be09575.png",[],{"type":17,"tag":25,"props":135,"children":136},{},[137],{"type":23,"value":138},"注：表中为zero-shot CEval评测结果，限定模型输出最多5个token，匹配正确答案来计分。",{"type":17,"tag":25,"props":140,"children":141},{},[142],{"type":23,"value":143},"借助昇思MindSpore的图编译功能，DeepSeek量化模型推理过程中自动进行Vector-Vector、Cube-Vector算子融合优化，同时结合vLLM的Multi-Step Scheduling/Prefix Cache以及Chunked Prefill等功能，有效地提升了模型的推理吞吐率。",{"type":17,"tag":25,"props":145,"children":146},{},[147],{"type":23,"value":148},"1.2 openEuler异构融合内存按需预取&offload",{"type":17,"tag":25,"props":150,"children":151},{},[152],{"type":23,"value":153},"openEuler提供了操作系统层面细粒度的L2Cache按需控制和预取技术，支持计算任务和通信任务多线程并发执行，在通信任务流执行的同时按需预取MoE层的权重参数，实现通算的进一步融合和流水并行时延掩盖，提升推理吞吐。",{"type":17,"tag":25,"props":155,"children":156},{},[157],{"type":23,"value":158},"openEuler引入了KVCache Offload机制，大幅降低了DeepSeek模型的NPU显存开销，进一步提升显存利用率与整体吞吐性能：该机制可自动识别冷热的KV Cache数据，将不常用的KV Blocks动态交换到内存中；当用户请求激活或者匹配到共享前缀时，再将KV Blocks交换到NPU中继续推理。同时，使用异步传输进一步将KV Blocks的传输和计算交叠掩盖，降低了上述过程对于Prefill时延的影响。",{"type":17,"tag":25,"props":160,"children":161},{},[162],{"type":23,"value":163},"1.3 
毕昇编译优化",{"type":17,"tag":25,"props":165,"children":166},{},[167],{"type":23,"value":168},"毕昇编译器针对NPU后端使能融合编译技术，使能架构亲和指令，分析多级流水线之间数据依赖关系，自动插入最优同步，实现最优性能Vector-Vector融合算子与Cube-Vector融合算子，并优化PagedAttention算子等关键算子的执行速率，加速NPU设备运行速度，减少算子数量，降低下发执行压力。",{"type":17,"tag":25,"props":170,"children":171},{},[172],{"type":23,"value":173},"在openEuler、MindSpore与 DeepSeek全栈开源推理方案中，毕昇编译器针对Host CPU侧算子下发阶段的性能瓶颈，通过架构亲和优化、循环优化、多级并行优化、指令优化、智能编译选项和链接时优化等编译技术优化Python、MindSpore和Ray框架，使代码布局更优，有效提高程序IPC，降低访存开销，进而降低时延，提高吞吐率。",{"type":17,"tag":25,"props":175,"children":176},{},[177],{"type":17,"tag":93,"props":178,"children":179},{},[180],{"type":23,"value":181},"# 02",{"type":17,"tag":25,"props":183,"children":184},{},[185],{"type":17,"tag":93,"props":186,"children":187},{},[188],{"type":23,"value":189},"部署验证结果",{"type":17,"tag":25,"props":191,"children":192},{},[193],{"type":23,"value":194},"3月26日，上海交通大学并行与分布式系统研究所和openEuler社区的研发人员通过线上协同，在上海交通大学的Atlas 800T A2服务器集群上，使用MindSpore+openEuler全栈开源推理解决方案镜像，验证了DeepSeek-R1和DeepSeek-V3-0324的Int8/Int4量化推理。",{"type":17,"tag":25,"props":196,"children":197},{},[198],{"type":23,"value":199},"双方研发人员首先使用2台Atlas 800T A2服务器（64GB），部署了基于vLLM+MindSpore+openEuler的DeepSeek-R1 Int8量化推理，其单Batch吞吐率19token/s，192 Batch总吞吐率1400token/s，较该解决方案首版本提升了超过15%。",{"type":17,"tag":25,"props":201,"children":202},{},[203],{"type":23,"value":204},"双方研发人员然后使用昇思MindSpore金箍棒套件，对DeepSeek-R1和DeepSeek-V3-0324 1274GB的BFloat16浮点格式权重文件，进行了GPTQ权重量化，经历约6小时的量化寻优，获得了体积仅337GB的Int4格式权重文件，可在单台Atlas 800T A2服务器（64GB）上进行服务部署，192 
Batch总吞吐率420token/s。",{"type":17,"tag":25,"props":206,"children":207},{},[208],{"type":23,"value":209},"上述开发验证过程的容器镜像和权重文件，已分别上传至openEuler社区和天翼云魔乐社区（modelers.cn），可供开发者下载使用。",{"type":17,"tag":25,"props":211,"children":212},{},[213],{"type":17,"tag":130,"props":214,"children":216},{"alt":7,"src":215},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/04/18/6ac3fbdeb4724013909619b0dc5c8c5f.png",[],{"type":17,"tag":25,"props":218,"children":219},{},[220],{"type":23,"value":221},"图1 openEuler社区朱睿（左图）与上海交通大学并行与分布式系统研究所陈启炜（右图）远程协同完成DeepSeek Int8/Int4推理部署开发与验证",{"type":17,"tag":25,"props":223,"children":224},{},[225],{"type":23,"value":226},"相关资源链接：",{"type":17,"tag":25,"props":228,"children":229},{},[230],{"type":23,"value":231},"1. openEuler+MindSpore全栈开源解决方案镜像：",{"type":17,"tag":25,"props":233,"children":234},{},[235],{"type":23,"value":236},"hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore:20250326",{"type":17,"tag":25,"props":238,"children":239},{},[240],{"type":23,"value":241},"2. DeepSeek-R1 Int8量化模型权重：",{"type":17,"tag":25,"props":243,"children":244},{},[245],{"type":17,"tag":70,"props":246,"children":249},{"href":247,"rel":248},"https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-W8A8",[74],[250],{"type":23,"value":247},{"type":17,"tag":25,"props":252,"children":253},{},[254],{"type":23,"value":255},"3. DeepSeek-V3-0324 Int4量化模型权重：",{"type":17,"tag":25,"props":257,"children":258},{},[259],{"type":17,"tag":70,"props":260,"children":263},{"href":261,"rel":262},"https://modelers.cn/models/IPADS/DeepSeek-V3-0324-A16W4",[74],[264],{"type":23,"value":261},{"type":17,"tag":25,"props":266,"children":267},{},[268],{"type":23,"value":269},"4. 
DeepSeek-R1 Int4量化模型权重：",{"type":17,"tag":25,"props":271,"children":272},{},[273],{"type":17,"tag":70,"props":274,"children":277},{"href":275,"rel":276},"https://modelers.cn/models/IPADS/DeepSeek-R1-A16W4",[74],[278],{"type":23,"value":275},{"type":17,"tag":25,"props":280,"children":281},{},[282,287,289],{"type":17,"tag":93,"props":283,"children":284},{},[285],{"type":23,"value":286},"3",{"type":23,"value":288}," ",{"type":17,"tag":93,"props":290,"children":291},{},[292],{"type":23,"value":293},"演进路标披露",{"type":17,"tag":25,"props":295,"children":296},{},[297],{"type":23,"value":298},"结合AI产业发展趋势和开源软件客户应用需求，MindSpore社区和openEuler社区已联合多家机构与产业伙伴，规划出DeepSeek全栈开源推理解决方案的技术演进路标。如下图所示，通过垂直整合和持续演进MindSpore、openEuler、毕昇编译器在异构资源调度、图编译、异构编译优化等领域的技术长板，该解决方案将陆续支持多模态RAG、AI Agent等应用功能，并将所支持的系统形态由单机/双机逐步拓展至32卡/64卡PD分离集群，以期提供鲲鹏+昇腾智算平台的开源最优推理部署方案。",{"type":17,"tag":25,"props":300,"children":301},{},[302],{"type":17,"tag":130,"props":303,"children":305},{"alt":7,"src":304},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/04/18/cf95f7ab5302488f914db1fa51debc4d.png",[],{"type":17,"tag":25,"props":307,"children":308},{},[309],{"type":23,"value":310},"图2 MindSpore+openEuler开源推理解决方案路标",{"title":7,"searchDepth":312,"depth":312,"links":313},4,[],"markdown","content:news:zh:3701.md","content","news/zh/3701.md","news/zh/3701","md",1776506087984]