[{"data":1,"prerenderedAt":1337},["ShallowReactive",2],{"content-query-0Ro7FdKcHD":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":1331,"_id":1332,"_source":1333,"_file":1334,"_stem":1335,"_extension":1336},"/technology-blogs/zh/2026-1-8","zh",false,"","开源之夏｜贾阔源：基于vLLM-MindSpore，深入探索Beam Search解码优化实战","在 vLLM-MindSpore 中完整实现 Beam Search 解码算法","2026-1-8","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/06/06/1a18a46ef03442ea8f8d83ba33b0a7af.png","technology-blogs","开发者说",{"type":15,"children":16,"toc":1323},"root",[17,29,36,46,54,87,95,103,107,116,139,183,192,200,218,226,244,251,259,267,272,281,285,293,303,312,321,330,353,361,370,379,388,397,406,415,424,433,462,480,496,500,507,515,524,533,542,551,560,572,579,587,596,605,614,623,673,677,693,697,704,712,721,730,741,754,761,769,777,786,795,803,812,821,826,875,883,891,900,909,918,927,935,943,951,1013,1022,1063,1071,1079,1087,1092,1097,1107,1115,1126,1134,1157,1165,1199,1207,1218],{"type":18,"tag":19,"props":20,"children":22},"element","div",{"style":21},"text-align: center;",[23],{"type":18,"tag":24,"props":25,"children":28},"img",{"src":26,"style":27,"alt":7},"/category/information/technology-blogs/banner/2026-1-8/1.jpg","display: block;margin: 0 auto;max-width:70%",[],{"type":18,"tag":30,"props":31,"children":33},"h1",{"id":32},"开源之夏贾阔源基于vllm-mindspore深入探索beam-search解码优化实战",[34],{"type":35,"value":8},"text",{"type":18,"tag":37,"props":38,"children":39},"p",{},[40],{"type":18,"tag":41,"props":42,"children":43},"strong",{},[44],{"type":35,"value":45},"# 
01",{"type":18,"tag":37,"props":47,"children":48},{},[49],{"type":18,"tag":41,"props":50,"children":51},{},[52],{"type":35,"value":53},"项目介绍",{"type":18,"tag":55,"props":56,"children":57},"ul",{},[58,69,74],{"type":18,"tag":59,"props":60,"children":61},"li",{},[62,67],{"type":18,"tag":41,"props":63,"children":64},{},[65],{"type":35,"value":66},"项目名称：",{"type":35,"value":68},"  基于 vLLM-MindSpore 实现 Beam Search 算法功能",{"type":18,"tag":59,"props":70,"children":71},{},[72],{"type":35,"value":73},"**项目描述：**本项目旨在 vLLM-MindSpore 中完整实现 Beam Search 解码算法。项目解决了该框架在高质量文本生成任务（如机器翻译、摘要）中缺乏高效解码策略的痛点。通过重构采样逻辑和算子适配，显著提升了推理吞吐量和引擎初始化速度。",{"type":18,"tag":59,"props":75,"children":76},{},[77,79],{"type":35,"value":78},"**项目源码链接：**",{"type":18,"tag":80,"props":81,"children":85},"a",{"href":82,"rel":83},"https://gitee.com/mindspore/vllm-mindspore/pulls/1084/files",[84],"nofollow",[86],{"type":35,"value":82},{"type":18,"tag":37,"props":88,"children":89},{},[90],{"type":18,"tag":41,"props":91,"children":92},{},[93],{"type":35,"value":94},"# 02",{"type":18,"tag":37,"props":96,"children":97},{},[98],{"type":18,"tag":41,"props":99,"children":100},{},[101],{"type":35,"value":102},"技术实现：如何重新设计采样链路？",{"type":18,"tag":104,"props":105,"children":106},"h3",{"id":7},[],{"type":18,"tag":104,"props":108,"children":110},{"id":109},"_1架构选型",[111],{"type":18,"tag":41,"props":112,"children":113},{},[114],{"type":35,"value":115},"1、架构选型",{"type":18,"tag":55,"props":117,"children":118},{},[119,129],{"type":18,"tag":59,"props":120,"children":121},{},[122,127],{"type":18,"tag":41,"props":123,"children":124},{},[125],{"type":35,"value":126},"开发方式：",{"type":35,"value":128}," 深入 vLLM v0 架构的 model_executor 层进行二次开发，重点在于理解原版逻辑并结合 MindSpore 特性进行更优的等价迁移。",{"type":18,"tag":59,"props":130,"children":131},{},[132,137],{"type":18,"tag":41,"props":133,"children":134},{},[135],{"type":35,"value":136},"整体流程：",{"type":35,"value":138}," （参考 vLLM v0 Beam Search 
采样流程）",{"type":18,"tag":140,"props":141,"children":142},"ol",{},[143,153,163,173],{"type":18,"tag":59,"props":144,"children":145},{},[146,151],{"type":18,"tag":41,"props":147,"children":148},{},[149],{"type":35,"value":150},"初始化与收集：",{"type":35,"value":152}," 构建 BeamSearchInstance，在迭代开始时收集所有活跃 Beam。",{"type":18,"tag":59,"props":154,"children":155},{},[156,161],{"type":18,"tag":41,"props":157,"children":158},{},[159],{"type":35,"value":160},"前向推理：",{"type":35,"value":162}," 调用 LLMEngine.step() 和 ModelExecutor 获取 Logits。",{"type":18,"tag":59,"props":164,"children":165},{},[166,171],{"type":18,"tag":41,"props":167,"children":168},{},[169],{"type":35,"value":170},"采样与扩展：",{"type":35,"value":172}," 利用 Sampler 计算 Logprobs，将每个 Beam 扩展为 2 * beam_width 个候选。",{"type":18,"tag":59,"props":174,"children":175},{},[176,181],{"type":18,"tag":41,"props":177,"children":178},{},[179],{"type":35,"value":180},"管理与更新：",{"type":35,"value":182}," 执行 EOS 判断、排序裁剪，并更新 Block Table 状态。",{"type":18,"tag":104,"props":184,"children":186},{"id":185},"_2实现步骤",[187],{"type":18,"tag":41,"props":188,"children":189},{},[190],{"type":35,"value":191},"2、实现步骤",{"type":18,"tag":37,"props":193,"children":194},{},[195],{"type":18,"tag":41,"props":196,"children":197},{},[198],{"type":35,"value":199},"步骤一：环境搭建与避坑",{"type":18,"tag":55,"props":201,"children":202},{},[203,208],{"type":18,"tag":59,"props":204,"children":205},{},[206],{"type":35,"value":207},"参考官方 vllm_mindspore 安装指南。",{"type":18,"tag":59,"props":209,"children":210},{},[211,216],{"type":18,"tag":41,"props":212,"children":213},{},[214],{"type":35,"value":215},"避坑指南：",{"type":35,"value":217}," 务必注意 MindSpore 版本与 vllm_mindspore 的对应关系。我初期因文档版本滞后遇到大量环境报错，教训是**遇到一些长时间无法解决问题应及时加入 SIG 
组向专家求助，**避免消耗宝贵的开发时间。",{"type":18,"tag":37,"props":219,"children":220},{},[221],{"type":18,"tag":41,"props":222,"children":223},{},[224],{"type":35,"value":225},"步骤二：深入源码分析与路径定位",{"type":18,"tag":55,"props":227,"children":228},{},[229,234],{"type":18,"tag":59,"props":230,"children":231},{},[232],{"type":35,"value":233},"深入阅读vLLM源码: 系统梳理了vLLM的Beam Search采样链路, 彻底理清了从BeamSearchInstance到BeamSearchOutput的数据链路。",{"type":18,"tag":59,"props":235,"children":236},{},[237,242],{"type":18,"tag":41,"props":238,"children":239},{},[240],{"type":35,"value":241},"研读 vLLM-MindSpore 源码：",{"type":35,"value":243}," 理解其当前的工作原理和适配进度，通过对比分析，精确定位了 model_executor/layers/sampler.py 中缺失的关键逻辑，明确了下一步代码实现的方向。",{"type":18,"tag":19,"props":245,"children":246},{"style":21},[247],{"type":18,"tag":24,"props":248,"children":250},{"src":249,"style":27,"alt":7},"/category/information/technology-blogs/banner/2026-1-8/2.jpg",[],{"type":18,"tag":37,"props":252,"children":253},{},[254],{"type":18,"tag":41,"props":255,"children":256},{},[257],{"type":35,"value":258},"步骤三：核心采样逻辑重构",{"type":18,"tag":55,"props":260,"children":261},{},[262],{"type":18,"tag":59,"props":263,"children":264},{},[265],{"type":35,"value":266},"基于对源码的理解，重写 _greedy_sample 以支持多 Beam 并行采样，并优化了 _get_ranks 中的张量操作逻辑。",{"type":18,"tag":268,"props":269,"children":271},"h6",{"id":270},"_1",[],{"type":18,"tag":268,"props":273,"children":275},{"id":274},"_1-_greedy_sample-从单点阻塞到并行切片",[276],{"type":18,"tag":41,"props":277,"children":278},{},[279],{"type":35,"value":280},"1. 
_greedy_sample: 从“单点阻塞”到“并行切片”",{"type":18,"tag":268,"props":282,"children":284},{"id":283},"_2",[],{"type":18,"tag":37,"props":286,"children":287},{},[288],{"type":18,"tag":41,"props":289,"children":290},{},[291],{"type":35,"value":292},"vLLM 原生实现 (PyTorch):",{"type":18,"tag":37,"props":294,"children":295},{},[296],{"type":18,"tag":297,"props":298,"children":300},"code",{"className":299},[],[301],{"type":35,"value":302},"# 原始代码片段",{"type":18,"tag":37,"props":304,"children":305},{},[306],{"type":18,"tag":297,"props":307,"children":309},{"className":308},[],[310],{"type":35,"value":311},"num_parent_seqs = len(seq_ids)",{"type":18,"tag":37,"props":313,"children":314},{},[315],{"type":18,"tag":297,"props":316,"children":318},{"className":317},[],[319],{"type":35,"value":320},"assert num_parent_seqs == 1, (\"Greedy sampling should have only one seq.\") # \u003C--- 致命限制",{"type":18,"tag":37,"props":322,"children":323},{},[324],{"type":18,"tag":297,"props":325,"children":327},{"className":326},[],[328],{"type":35,"value":329},"parent_ids = list(range(num_parent_seqs))   next_token_ids = [samples_lst[sample_idx]] # 只能取 1 个",{"type":18,"tag":55,"props":331,"children":332},{},[333,343],{"type":18,"tag":59,"props":334,"children":335},{},[336,341],{"type":18,"tag":41,"props":337,"children":338},{},[339],{"type":35,"value":340},"问题点：",{"type":35,"value":342}," 原生代码显式通过 assert 禁止了 Greedy Sampling 处理多父序列的情况。它假设 Greedy 采样只用于简单的单序列生成。当 Beam Search (e.g., beam_width=4) 传入一个包含 4 个 seq 的 group 时，这里会直接报错中断。",{"type":18,"tag":59,"props":344,"children":345},{},[346,351],{"type":18,"tag":41,"props":347,"children":348},{},[349],{"type":35,"value":350},"局限性：",{"type":35,"value":352}," 无法支持 Beam Search 的“分叉”逻辑（即一个 Group 产生多个候选）。",{"type":18,"tag":37,"props":354,"children":355},{},[356],{"type":18,"tag":41,"props":357,"children":358},{},[359],{"type":35,"value":360},"优化实现 
(MindSpore):",{"type":18,"tag":37,"props":362,"children":363},{},[364],{"type":18,"tag":297,"props":365,"children":367},{"className":366},[],[368],{"type":35,"value":369},"# 优化后代码片段",{"type":18,"tag":37,"props":371,"children":372},{},[373],{"type":18,"tag":297,"props":374,"children":376},{"className":375},[],[377],{"type":35,"value":378},"# Beam search: can have multiple sequences per group",{"type":18,"tag":37,"props":380,"children":381},{},[382],{"type":18,"tag":297,"props":383,"children":385},{"className":384},[],[386],{"type":35,"value":387},"if num_parent_seqs == 1:",{"type":18,"tag":37,"props":389,"children":390},{},[391],{"type":18,"tag":297,"props":392,"children":394},{"className":393},[],[395],{"type":35,"value":396},"# ... 标准处理 ...",{"type":18,"tag":37,"props":398,"children":399},{},[400],{"type":18,"tag":297,"props":401,"children":403},{"className":402},[],[404],{"type":35,"value":405},"else:",{"type":18,"tag":37,"props":407,"children":408},{},[409],{"type":18,"tag":297,"props":410,"children":412},{"className":411},[],[413],{"type":35,"value":414},"# Beam search: multiple parents    parent_ids = list(range(num_parent_seqs))",{"type":18,"tag":37,"props":416,"children":417},{},[418],{"type":18,"tag":297,"props":419,"children":421},{"className":420},[],[422],{"type":35,"value":423},"# \u003C--- 核心优化：并行切片    next_token_ids =samples_lst[sample_idx:sample_idx +",{"type":18,"tag":37,"props":425,"children":426},{},[427],{"type":18,"tag":297,"props":428,"children":430},{"className":429},[],[431],{"type":35,"value":432},"num_parent_seqs]",{"type":18,"tag":55,"props":434,"children":435},{},[436,454],{"type":18,"tag":59,"props":437,"children":438},{},[439,444,446,452],{"type":18,"tag":41,"props":440,"children":441},{},[442],{"type":35,"value":443},"优化逻辑：",{"type":35,"value":445}," 移除了断言，并引入了切片操作 samples_lst",{"type":18,"tag":447,"props":448,"children":449},"span",{},[450],{"type":35,"value":451},"sample_idx:sample_idx + 
num_parent_seqs",{"type":35,"value":453},"。",{"type":18,"tag":59,"props":455,"children":456},{},[457],{"type":18,"tag":41,"props":458,"children":459},{},[460],{"type":35,"value":461},"技术价值：",{"type":18,"tag":140,"props":463,"children":464},{},[465,475],{"type":18,"tag":59,"props":466,"children":467},{},[468,473],{"type":18,"tag":41,"props":469,"children":470},{},[471],{"type":35,"value":472},"解锁功能：",{"type":35,"value":474}," 使得采样器能够一次性处理 Beam Search 产生的多个候选分支，填补了 vLLM 原生逻辑在 Greedy 模式下不支持多 Beam 的空白。",{"type":18,"tag":59,"props":476,"children":477},{},[478],{"type":35,"value":479},"**零循环开销：**通过切片而非 Python for 循环逐个获取 token，保持了 Python 层面的执行效率。",{"type":18,"tag":268,"props":481,"children":483},{"id":482},"_2-_get_ranks-消除中间张量适配-npu-内存模型",[484,489,491],{"type":18,"tag":41,"props":485,"children":486},{},[487],{"type":35,"value":488},"2.",{"type":35,"value":490}," _get_ranks: ",{"type":18,"tag":41,"props":492,"children":493},{},[494],{"type":35,"value":495},"消除中间张量，适配 NPU 内存模型",{"type":18,"tag":268,"props":497,"children":499},{"id":498},"_3",[],{"type":18,"tag":37,"props":501,"children":502},{},[503],{"type":18,"tag":41,"props":504,"children":505},{},[506],{"type":35,"value":292},{"type":18,"tag":37,"props":508,"children":509},{},[510],{"type":18,"tag":297,"props":511,"children":513},{"className":512},[],[514],{"type":35,"value":302},{"type":18,"tag":37,"props":516,"children":517},{},[518],{"type":18,"tag":297,"props":519,"children":521},{"className":520},[],[522],{"type":35,"value":523},"# 1. 高级索引：创建中间张量 vals，形状 [N]",{"type":18,"tag":37,"props":525,"children":526},{},[527],{"type":18,"tag":297,"props":528,"children":530},{"className":529},[],[531],{"type":35,"value":532},"vals = x[torch.arange(0, len(x), ...), indices]",{"type":18,"tag":37,"props":534,"children":535},{},[536],{"type":18,"tag":297,"props":537,"children":539},{"className":538},[],[540],{"type":35,"value":541},"# 2. 
广播比较：vals[:, None] 触发 unsqueeze 和广播result = (x > vals[:, None])",{"type":18,"tag":37,"props":543,"children":544},{},[545],{"type":18,"tag":297,"props":546,"children":548},{"className":547},[],[549],{"type":35,"value":550},"# 3. 手动删除：暗示了对显存的担忧",{"type":18,"tag":37,"props":552,"children":553},{},[554],{"type":18,"tag":297,"props":555,"children":557},{"className":556},[],[558],{"type":35,"value":559},"del vals",{"type":18,"tag":55,"props":561,"children":562},{},[563],{"type":18,"tag":59,"props":564,"children":565},{},[566,570],{"type":18,"tag":41,"props":567,"children":568},{},[569],{"type":35,"value":340},{"type":35,"value":571},"  vals 是一个显式创建的中间张量。在 GPU/CPU上，分配内存 -> 写入数据 -> 读取数据 -> 释放内存 这一连串动作会打断计算流水线。",{"type":18,"tag":37,"props":573,"children":574},{},[575],{"type":18,"tag":41,"props":576,"children":577},{},[578],{"type":35,"value":360},{"type":18,"tag":37,"props":580,"children":581},{},[582],{"type":18,"tag":297,"props":583,"children":585},{"className":584},[],[586],{"type":35,"value":369},{"type":18,"tag":37,"props":588,"children":589},{},[590],{"type":18,"tag":297,"props":591,"children":593},{"className":592},[],[594],{"type":35,"value":595},"# 1. 算子融合思路：直接使用 gather 配合 unsqueeze",{"type":18,"tag":37,"props":597,"children":598},{},[599],{"type":18,"tag":297,"props":600,"children":602},{"className":601},[],[603],{"type":35,"value":604},"chosen_values = x.gather(1, indices.unsqueeze(1)) # 直接生成 [N, 1]",{"type":18,"tag":37,"props":606,"children":607},{},[608],{"type":18,"tag":297,"props":609,"children":611},{"className":610},[],[612],{"type":35,"value":613},"# 2. 
广播比较",{"type":18,"tag":37,"props":615,"children":616},{},[617],{"type":18,"tag":297,"props":618,"children":620},{"className":619},[],[621],{"type":35,"value":622},"rank_counts = (x > chosen_values).sum(1)",{"type":18,"tag":55,"props":624,"children":625},{},[626,630,654,664],{"type":18,"tag":59,"props":627,"children":628},{},[629],{"type":35,"value":443},{"type":18,"tag":59,"props":631,"children":632},{},[633,638,640,645,647,652],{"type":18,"tag":41,"props":634,"children":635},{},[636],{"type":35,"value":637},"移除中间态：",{"type":35,"value":639}," 使用 gather 算子直接提取并保持维度，一步生成了 ",{"type":18,"tag":447,"props":641,"children":642},{},[643],{"type":35,"value":644},"N, 1",{"type":35,"value":646}," 的 chosen_values。这避免了先生成 ",{"type":18,"tag":447,"props":648,"children":649},{},[650],{"type":35,"value":651},"N",{"type":35,"value":653}," 再 reshape 的过程。",{"type":18,"tag":59,"props":655,"children":656},{},[657,662],{"type":18,"tag":41,"props":658,"children":659},{},[660],{"type":35,"value":661},"NPU 亲和性：",{"type":35,"value":663}," gather 是 MindSpore 和 Ascend NPU 高度优化的算子。相比于 PyTorch 风格的复杂索引，显式调用 gather 更容易命中底层的高性能算子核（Kernel），减少算子编译和调度的开销。",{"type":18,"tag":59,"props":665,"children":666},{},[667,671],{"type":18,"tag":41,"props":668,"children":669},{},[670],{"type":35,"value":461},{"type":35,"value":672}," 减少了显存申请和释放的频率，降低了内存碎片风险，提升了计算图的执行效率。",{"type":18,"tag":268,"props":674,"children":676},{"id":675},"_4",[],{"type":18,"tag":268,"props":678,"children":680},{"id":679},"_3-get_logprobs-保持批处理优势精准适配算子",[681,686,688],{"type":18,"tag":41,"props":682,"children":683},{},[684],{"type":35,"value":685},"3.",{"type":35,"value":687}," get_logprobs: 
",{"type":18,"tag":41,"props":689,"children":690},{},[691],{"type":35,"value":692},"保持批处理优势，精准适配算子",{"type":18,"tag":268,"props":694,"children":696},{"id":695},"_5",[],{"type":18,"tag":37,"props":698,"children":699},{},[700],{"type":18,"tag":41,"props":701,"children":702},{},[703],{"type":35,"value":292},{"type":18,"tag":37,"props":705,"children":706},{},[707],{"type":18,"tag":297,"props":708,"children":710},{"className":709},[],[711],{"type":35,"value":302},{"type":18,"tag":37,"props":713,"children":714},{},[715],{"type":18,"tag":297,"props":716,"children":718},{"className":717},[],[719],{"type":35,"value":720},"query_indices_gpu = torch.tensor(query_indices, device=logprobs.device)",{"type":18,"tag":37,"props":722,"children":723},{},[724],{"type":18,"tag":297,"props":725,"children":727},{"className":726},[],[728],{"type":35,"value":729},"# ...",{"type":18,"tag":37,"props":731,"children":732},{},[733,739],{"type":18,"tag":297,"props":734,"children":736},{"className":735},[],[737],{"type":35,"value":738},"top_logprobs, top_token_ids = torch.topk(...)   
top_logprobs = top_logprobs.to('cpu')",{"type":35,"value":740}," # 频繁的数据搬运",{"type":18,"tag":55,"props":742,"children":743},{},[744],{"type":18,"tag":59,"props":745,"children":746},{},[747,752],{"type":18,"tag":41,"props":748,"children":749},{},[750],{"type":35,"value":751},"现状：",{"type":35,"value":753}," 原生代码虽然逻辑也是批处理的，但它深度依赖 PyTorch 的 device 管理机制和 torch.topk。",{"type":18,"tag":37,"props":755,"children":756},{},[757],{"type":18,"tag":41,"props":758,"children":759},{},[760],{"type":35,"value":360},{"type":18,"tag":37,"props":762,"children":763},{},[764],{"type":18,"tag":297,"props":765,"children":767},{"className":766},[],[768],{"type":35,"value":369},{"type":18,"tag":37,"props":770,"children":771},{},[772],{"type":18,"tag":297,"props":773,"children":775},{"className":774},[],[776],{"type":35,"value":729},{"type":18,"tag":37,"props":778,"children":779},{},[780],{"type":18,"tag":297,"props":781,"children":783},{"className":782},[],[784],{"type":35,"value":785},"# 1. 索引扩展逻辑 (保持逻辑正确性)",{"type":18,"tag":37,"props":787,"children":788},{},[789],{"type":18,"tag":297,"props":790,"children":792},{"className":791},[],[793],{"type":35,"value":794},"query_indices.extend([query_idx + parent_id for parent_id in parent_seq_ids])",{"type":18,"tag":37,"props":796,"children":797},{},[798],{"type":18,"tag":297,"props":799,"children":801},{"className":800},[],[802],{"type":35,"value":729},{"type":18,"tag":37,"props":804,"children":805},{},[806],{"type":18,"tag":297,"props":807,"children":809},{"className":808},[],[810],{"type":35,"value":811},"# 2. 
使用 mint 接口适配 MindSpore",{"type":18,"tag":37,"props":813,"children":814},{},[815],{"type":18,"tag":297,"props":816,"children":818},{"className":817},[],[819],{"type":35,"value":820},"if largest_num_logprobs > 0:",{"type":18,"tag":37,"props":822,"children":823},{},[824],{"type":35,"value":825},"top_logprobs, top_token_ids = mint.topk(logprobs, largest_num_logprobs, dim=-1) # ...",{"type":18,"tag":55,"props":827,"children":828},{},[829,836,846,856,866],{"type":18,"tag":59,"props":830,"children":831},{},[832],{"type":18,"tag":41,"props":833,"children":834},{},[835],{"type":35,"value":443},{"type":18,"tag":59,"props":837,"children":838},{},[839,844],{"type":18,"tag":41,"props":840,"children":841},{},[842],{"type":35,"value":843},"逻辑复用与适配：",{"type":35,"value":845}," 保留了 query_indices.extend 这一高效的 Lazy Collection（延迟收集） 策略，确保所有 Beam 的 Logprob 计算合并为一次大 Batch 操作。",{"type":18,"tag":59,"props":847,"children":848},{},[849,854],{"type":18,"tag":41,"props":850,"children":851},{},[852],{"type":35,"value":853},"算子替换：",{"type":35,"value":855}," 将 torch.topk 替换为 mint.topk。mint 是 MindSpore 专门为对齐 PyTorch 接口设计的模块，它底层调用的是 MindSpore 的高性能算子。",{"type":18,"tag":59,"props":857,"children":858},{},[859,864],{"type":18,"tag":41,"props":860,"children":861},{},[862],{"type":35,"value":863},"索引修正：",{"type":35,"value":865}," 在移植过程中，重点解决了 Beam Search 下 parent_id 带来的索引偏移问题，确保在 MindSpore 的 Tensor 内存布局下，能够准确找到父 Beam 对应的 Logits。",{"type":18,"tag":59,"props":867,"children":868},{},[869,873],{"type":18,"tag":41,"props":870,"children":871},{},[872],{"type":35,"value":461},{"type":35,"value":874}," 这里的优化更多体现为 “架构适配”。没有因为框架切换而退化为逐个循环计算，而是坚持并正确实现了全 Batch 并行计算，确保了 Beam Search 在生成 Top-K 候选词时，Host（CPU）与 Device（NPU）的交互次数降到最低。",{"type":18,"tag":37,"props":876,"children":877},{},[878],{"type":18,"tag":41,"props":879,"children":880},{},[881],{"type":35,"value":882},"步骤四：组件注入与系统集成（Monkey Patch 
策略）",{"type":18,"tag":55,"props":884,"children":885},{},[886],{"type":18,"tag":59,"props":887,"children":888},{},[889],{"type":35,"value":890},"为了将上述优化代码无缝集成到 vLLM 架构中，采用Monkey Patch 策略。",{"type":18,"tag":37,"props":892,"children":893},{},[894],{"type":18,"tag":297,"props":895,"children":897},{"className":896},[],[898],{"type":35,"value":899},"# 用于 beam search 的增强实现",{"type":18,"tag":37,"props":901,"children":902},{},[903],{"type":18,"tag":297,"props":904,"children":906},{"className":905},[],[907],{"type":35,"value":908},"vllm.model_executor.layers.sampler._greedy_sample = _greedy_sample",{"type":18,"tag":37,"props":910,"children":911},{},[912],{"type":18,"tag":297,"props":913,"children":915},{"className":914},[],[916],{"type":35,"value":917},"vllm.model_executor.layers.sampler.get_logprobs = get_logprobs  # logprob / ranking / result 构造函数（Beam Search 依赖）",{"type":18,"tag":37,"props":919,"children":920},{},[921],{"type":18,"tag":297,"props":922,"children":924},{"className":923},[],[925],{"type":35,"value":926},"vllm.model_executor.layers.sampler._get_ranks = _get_ranks",{"type":18,"tag":37,"props":928,"children":929},{},[930],{"type":18,"tag":41,"props":931,"children":932},{},[933],{"type":35,"value":934},"实现方式：在vllm_mindspore的初始化阶段，通过 Python 的动态特性，将原生vllm.model_executor.layers.sampler中的关键函数（如_greedy_sample 、get_logprobs 、 _get_ranks ）替换为我们针对 MindSpore 增强后的实现。",{"type":18,"tag":37,"props":936,"children":937},{},[938],{"type":18,"tag":41,"props":939,"children":940},{},[941],{"type":35,"value":942},"优势：这种方式保证了 vLLM 上层调度逻辑无需修改，即可在底层自动调用优化后的MindSpore 算子，实现了架构的解耦与功能的平滑扩展。",{"type":18,"tag":37,"props":944,"children":945},{},[946],{"type":18,"tag":41,"props":947,"children":948},{},[949],{"type":35,"value":950},"步骤五：全方位的功能验证与测试",{"type":18,"tag":55,"props":952,"children":953},{},[954,964],{"type":18,"tag":59,"props":955,"children":956},{},[957,962],{"type":18,"tag":41,"props":958,"children":959},{},[960],{"type":35,"value":961},"测试脚本：",{"type":35,"value":963}," 参考 
tests/st/python/test_beam.py。",{"type":18,"tag":59,"props":965,"children":966},{},[967,972,976,981,983,986,991,993,996,1001,1003,1006,1011],{"type":18,"tag":41,"props":968,"children":969},{},[970],{"type":35,"value":971},"测试维度：",{"type":18,"tag":973,"props":974,"children":975},{},[],{"type":18,"tag":41,"props":977,"children":978},{},[979],{"type":35,"value":980},"功能一致性：",{"type":35,"value":982}," 验证 Beam Search 输出结果与预期一致。",{"type":18,"tag":973,"props":984,"children":985},{},[],{"type":18,"tag":41,"props":987,"children":988},{},[989],{"type":35,"value":990},"参数变化：",{"type":35,"value":992}," 覆盖不同温度（Temperature）、长度惩罚（Length Penalty）下的生成效果。",{"type":18,"tag":973,"props":994,"children":995},{},[],{"type":18,"tag":41,"props":997,"children":998},{},[999],{"type":35,"value":1000},"Batch 处理：",{"type":35,"value":1002}," 验证在 Batch Size 为 4/8 下的稳定性与正确性。",{"type":18,"tag":973,"props":1004,"children":1005},{},[],{"type":18,"tag":41,"props":1007,"children":1008},{},[1009],{"type":35,"value":1010},"Beam Width 对比：",{"type":35,"value":1012}," 测试 Beam Width 为 2、4、8 时的性能表现。",{"type":18,"tag":104,"props":1014,"children":1016},{"id":1015},"_3核心贡献",[1017],{"type":18,"tag":41,"props":1018,"children":1019},{},[1020],{"type":35,"value":1021},"3、核心贡献",{"type":18,"tag":55,"props":1023,"children":1024},{},[1025,1035,1043,1048,1053],{"type":18,"tag":59,"props":1026,"children":1027},{},[1028,1033],{"type":18,"tag":41,"props":1029,"children":1030},{},[1031],{"type":35,"value":1032},"功能实现：",{"type":35,"value":1034}," 在 vLLM-MindSpore 中完整实现了 Beam Search 解码算法。",{"type":18,"tag":59,"props":1036,"children":1037},{},[1038],{"type":18,"tag":41,"props":1039,"children":1040},{},[1041],{"type":35,"value":1042},"性能突破：",{"type":18,"tag":59,"props":1044,"children":1045},{},[1046],{"type":35,"value":1047},"推理吞吐量（Throughput）从 84.2 tokens/s 提升至 91.1 tokens/s（提升约 8.2%）。",{"type":18,"tag":59,"props":1049,"children":1050},{},[1051],{"type":35,"value":1052},"引擎初始化时间从 29.92秒缩短至 22.70秒（提升约 
24.1%）。",{"type":18,"tag":59,"props":1054,"children":1055},{},[1056,1061],{"type":18,"tag":41,"props":1057,"children":1058},{},[1059],{"type":35,"value":1060},"代码合入：",{"type":35,"value":1062}," 核心代码及测试用例已提交至社区 PR #1084 。",{"type":18,"tag":37,"props":1064,"children":1065},{},[1066],{"type":18,"tag":41,"props":1067,"children":1068},{},[1069],{"type":35,"value":1070},"# 03",{"type":18,"tag":37,"props":1072,"children":1073},{},[1074],{"type":18,"tag":41,"props":1075,"children":1076},{},[1077],{"type":35,"value":1078},"攻克技术难关：从环境配置到算法优化",{"type":18,"tag":37,"props":1080,"children":1081},{},[1082],{"type":18,"tag":41,"props":1083,"children":1084},{},[1085],{"type":35,"value":1086},"难题一：环境配置与版本依赖的“隐形坑”",{"type":18,"tag":37,"props":1088,"children":1089},{},[1090],{"type":35,"value":1091},"**问题描述：**项目初期严格按照文档搭建环境，却频繁报错，无法启动最基础的推理服务。这严重挤占了开发排期，让我一度陷入自我怀疑。",{"type":18,"tag":37,"props":1093,"children":1094},{},[1095],{"type":35,"value":1096},"**探索过程：**我尝试了手动源码编译、更换 Python 版本等多种常规方法，但均未奏效。",{"type":18,"tag":37,"props":1098,"children":1099},{},[1100,1102],{"type":35,"value":1101},"**最终方案：**没有继续“闭门造车”，而是选择加入 SIG 组向项目组的前辈咨询。经沟通得知，是当时文档中的 MindSpore 版本号未及时更新以适配 vllm_mindspore。这次经历让我深刻意识到：",{"type":18,"tag":41,"props":1103,"children":1104},{},[1105],{"type":35,"value":1106},"做开源项目，及时沟通比独自死磕更重要。",{"type":18,"tag":37,"props":1108,"children":1109},{},[1110],{"type":18,"tag":41,"props":1111,"children":1112},{},[1113],{"type":35,"value":1114},"难题二：vLLM 原生采样逻辑的深入分析与优化",{"type":18,"tag":37,"props":1116,"children":1117},{},[1118,1120,1124],{"type":35,"value":1119},"**问题描述：**在实现核心的 _greedy_sample 和 _get_ranks 函数时，我并没有简单地翻译 PyTorch 代码。通过深入阅读源码，我发现 vLLM 原生的 _get_ranks 实现存在内存冗余——它在计算 logprobs 时会创建一个与 vals 等大小的 ",{"type":18,"tag":447,"props":1121,"children":1122},{},[1123],{"type":35,"value":651},{"type":35,"value":1125}," 中间张量，这在 NPU 
上会带来不必要的显存开销。",{"type":18,"tag":37,"props":1127,"children":1128},{},[1129],{"type":18,"tag":41,"props":1130,"children":1131},{},[1132],{"type":35,"value":1133},"探索过程：",{"type":18,"tag":55,"props":1135,"children":1136},{},[1137,1147],{"type":18,"tag":59,"props":1138,"children":1139},{},[1140,1145],{"type":18,"tag":41,"props":1141,"children":1142},{},[1143],{"type":35,"value":1144},"源码定位：",{"type":35,"value":1146}," 我深入分析了 model_executor/layers/sampler.py，理清了数据在 gather 操作前后的维度变化。",{"type":18,"tag":59,"props":1148,"children":1149},{},[1150,1155],{"type":18,"tag":41,"props":1151,"children":1152},{},[1153],{"type":35,"value":1154},"逻辑推演：",{"type":35,"value":1156}," 我思考能否利用广播机制来替代显式的中间张量创建，从而减少内存占用。",{"type":18,"tag":37,"props":1158,"children":1159},{},[1160],{"type":18,"tag":41,"props":1161,"children":1162},{},[1163],{"type":35,"value":1164},"最终方案：",{"type":18,"tag":55,"props":1166,"children":1167},{},[1168,1184],{"type":18,"tag":59,"props":1169,"children":1170},{},[1171,1176,1178,1182],{"type":18,"tag":41,"props":1172,"children":1173},{},[1174],{"type":35,"value":1175},"代码重构：",{"type":35,"value":1177}," 我在 MindSpore 中重新实现了 _get_ranks，巧妙结合 gather 和 unsqueeze 操作，直接生成形状为 ",{"type":18,"tag":447,"props":1179,"children":1180},{},[1181],{"type":35,"value":644},{"type":35,"value":1183}," 的 chosen_values 张量用于广播比较。",{"type":18,"tag":59,"props":1185,"children":1186},{},[1187,1192,1194],{"type":18,"tag":41,"props":1188,"children":1189},{},[1190],{"type":35,"value":1191},"效果验证：",{"type":35,"value":1193}," 这一改动配合对多 Beam 并行采样的支持，不仅确保了功能正确，更在测试中帮助推理吞吐量提升了约 8.2%。这证明了",{"type":18,"tag":41,"props":1195,"children":1196},{},[1197],{"type":35,"value":1198},"深入理解原框架逻辑并进行针对性优化，比单纯的代码迁移更有价值。",{"type":18,"tag":37,"props":1200,"children":1201},{},[1202],{"type":18,"tag":41,"props":1203,"children":1204},{},[1205],{"type":35,"value":1206},"# 
04",{"type":18,"tag":37,"props":1208,"children":1209},{},[1210],{"type":18,"tag":41,"props":1211,"children":1212},{},[1213],{"type":18,"tag":41,"props":1214,"children":1215},{},[1216],{"type":35,"value":1217},"开发者说：从学生到开源贡献者",{"type":18,"tag":55,"props":1219,"children":1220},{},[1221,1239,1257,1275,1293],{"type":18,"tag":59,"props":1222,"children":1223},{},[1224,1229,1232,1234,1237],{"type":18,"tag":41,"props":1225,"children":1226},{},[1227],{"type":35,"value":1228},"是什么机缘让你在开源之夏的诸多项目中选择了昇思MindSpore？在选择项目任务和撰写申请书的时候有哪些考虑和准备？",{"type":18,"tag":973,"props":1230,"children":1231},{},[],{"type":35,"value":1233},"贾阔源：选择 MindSpore 主要是出于职业规划的考量。我意识到大模型推理优化是未来的关键技术方向，并计划后期向大模型领域转型。恰逢 MindSpore 社区发布了 vLLM-MindSpore 课题，这与我的学习目标高度契合。",{"type":18,"tag":973,"props":1235,"children":1236},{},[],{"type":35,"value":1238},"在选题上，为了确保能上手，我选择了逻辑相对直观的“Beam Search 采样”作为切入点。在撰写申请书时，我深知一份详实的项目计划书是展示开发者理解程度的关键，因此在提交申请前，我积极浏览了大量网站，查阅了关于 vLLM 技术原理的解读和 Beam Search 算法的详细讲解，为项目的顺利开展做足了理论准备。",{"type":18,"tag":59,"props":1240,"children":1241},{},[1242,1247,1250,1252,1255],{"type":18,"tag":41,"props":1243,"children":1244},{},[1245],{"type":35,"value":1246},"此次开发工作与你以前的项目开发经历有何异同？",{"type":18,"tag":973,"props":1248,"children":1249},{},[],{"type":35,"value":1251},"贾阔源：最大的不同在于工程规模与代码量级。相比于以往小规模的算法 Demo，vllm-mindspore 是基于业界主流推理引擎 vLLM 的庞大开源项目，代码量十分巨大。",{"type":18,"tag":973,"props":1253,"children":1254},{},[],{"type":35,"value":1256},"我面临的挑战不再是简单的逻辑实现，而是要深入理解海量源码：不仅要理解 vLLM-MindSpore 的工作机制，更要理清 vLLM 原生 Beam Search 的数据流转链路、组件 API 调用，以及明确 vLLM 与 MindSpore 
适配层各自承担的职责边界。在阅读源码和确认“正确”的开发策略上，我花费了比以往任何项目都多的功夫。",{"type":18,"tag":59,"props":1258,"children":1259},{},[1260,1265,1268,1270,1273],{"type":18,"tag":41,"props":1261,"children":1262},{},[1263],{"type":35,"value":1264},"通过这个项目任务，你对开源有了什么更深刻的理解吗?",{"type":18,"tag":973,"props":1266,"children":1267},{},[],{"type":35,"value":1269},"贾阔源：我对“聚沙成塔”有了更直观的感受。面对如此庞大的工程项目，我深刻认识到个人开发者的力量往往是薄弱的，它不像简单的算法那样易于独自开发。",{"type":18,"tag":973,"props":1271,"children":1272},{},[],{"type":35,"value":1274},"开源项目需要开发者们齐心协力，共同维护社区，积极交流。每一位开发者贡献的一行行优雅代码，汇聚起来最终构成了功能齐全、合作共赢的开源项目。这种协作精神是开源社区最宝贵的财富。",{"type":18,"tag":59,"props":1276,"children":1277},{},[1278,1283,1286,1288,1291],{"type":18,"tag":41,"props":1279,"children":1280},{},[1281],{"type":35,"value":1282},"作为学生参与开源项目，你认为最大的挑战是什么？又是如何克服的？",{"type":18,"tag":973,"props":1284,"children":1285},{},[],{"type":35,"value":1287},"贾阔源：最大的挑战在于如何驾驭庞大的开源代码库，以及如何制定合适的开发策略。初次面对巨量代码时，很容易迷失方向，难以找到功能缺失的切入点。",{"type":18,"tag":973,"props":1289,"children":1290},{},[],{"type":35,"value":1292},"克服方法： 我花了大量时间沉下心来认真阅读源码，不再急于求成。同时，我通过不断的测试来验证自己的理解，在测试中查找功能漏洞或不完善的地方，逐步理清逻辑链路，最终将这些理解转化为高质量的 PR 
提交。",{"type":18,"tag":59,"props":1294,"children":1295},{},[1296,1298,1303,1306,1308,1311,1313,1316,1318,1321],{"type":35,"value":1297},"作为过来人，有没有",{"type":18,"tag":41,"props":1299,"children":1300},{},[1301],{"type":35,"value":1302},"什么话想对过去的自己/学弟学妹/刚加入昇思MindSpore的开发者说呢？",{"type":18,"tag":973,"props":1304,"children":1305},{},[],{"type":35,"value":1307},"贾阔源：我想送给大家三个关键词：源码、测试、交流。",{"type":18,"tag":973,"props":1309,"children":1310},{},[],{"type":35,"value":1312},"认真阅读源码：这是理解大型项目的基石。",{"type":18,"tag":973,"props":1314,"children":1315},{},[],{"type":35,"value":1317},"多做测试：只有通过不断的测试验证，才能发现潜在的漏洞，确保功能的健壮性。",{"type":18,"tag":973,"props":1319,"children":1320},{},[],{"type":35,"value":1322},"多交流：开源社区非常友好，多向前辈们请教，能让你少走很多弯路。",{"title":7,"searchDepth":1324,"depth":1324,"links":1325},4,[1326,1328,1329,1330],{"id":7,"depth":1327,"text":7},3,{"id":109,"depth":1327,"text":115},{"id":185,"depth":1327,"text":191},{"id":1015,"depth":1327,"text":1021},"markdown","content:technology-blogs:zh:2026-1-8.md","content","technology-blogs/zh/2026-1-8.md","technology-blogs/zh/2026-1-8","md",1776506119035]