[{"data":1,"prerenderedAt":579},["ShallowReactive",2],{"content-query-LNFKSNmk0y":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":573,"_id":574,"_source":575,"_file":576,"_stem":577,"_extension":578},"/technology-blogs/zh/3770","zh",false,"","Whisper加速实战：教你用MindSpore Profiler为推理提速","Whisper 是由 OpenAI 开发的多语言语音识别模型。","2025-06-23","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/06/27/9106a8a1ab4e464688b53cce13fcf20a.png","technology-blogs","实践",{"type":15,"children":16,"toc":570},"root",[17,25,60,72,83,91,99,104,109,114,119,124,129,134,139,144,149,154,159,164,169,174,179,184,192,200,205,213,228,236,249,257,262,270,278,283,291,304,314,322,330,335,348,353,358,363,371,379,384,402,409,417,425,430,438,443,451,459,464,472,477,485,493,498,506,514,522,530,535,545,556,561],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"whisper加速实战教你用mindspore-profiler为推理提速",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29,31,37,39,44,46,51,53,58],{"type":24,"value":30},"Whisper 是由 OpenAI 开发的多语言语音识别模型。一经开源受到开发者广泛关注和使用，在使用中遇到其耗时过高问题，一段 91 秒的音频，识别耗时长达 95 秒， 推理效率不足，难以满足实时应用需求。 本文将系统分享我们在",{"type":18,"tag":32,"props":33,"children":34},"strong",{},[35],{"type":24,"value":36},"MindSpore 2.5.0 + MindSpore NLP 0.4.0",{"type":24,"value":38},"环境下，通过引入",{"type":18,"tag":32,"props":40,"children":41},{},[42],{"type":24,"value":43},"FlashAttention 2",{"type":24,"value":45}," 与",{"type":18,"tag":32,"props":47,"children":48},{},[49],{"type":24,"value":50},"优化Conv1D",{"type":24,"value":52},"，借助",{"type":18,"tag":32,"props":54,"children":55},{},[56],{"type":24,"value":57},"MindSpore 
Profiler",{"type":24,"value":59},"[1]精准定位瓶颈，最终将Whisper模型推理耗时压缩至60秒以内的全过程。",{"type":18,"tag":26,"props":61,"children":62},{},[63,65,70],{"type":24,"value":64},"目前该模型已上线模力方舟，点击",{"type":18,"tag":32,"props":66,"children":67},{},[68],{"type":24,"value":69},"阅读原文",{"type":24,"value":71},"可直接体验。",{"type":18,"tag":26,"props":73,"children":74},{},[75],{"type":18,"tag":76,"props":77,"children":81},"a",{"href":78,"rel":79},"https://ai.gitee.com/serverless-api/packages/1495?model=whisper-large-v3&package=1495",[80],"nofollow",[82],{"type":24,"value":78},{"type":18,"tag":26,"props":84,"children":85},{},[86],{"type":18,"tag":32,"props":87,"children":88},{},[89],{"type":24,"value":90},"# 01",{"type":18,"tag":26,"props":92,"children":93},{},[94],{"type":18,"tag":32,"props":95,"children":96},{},[97],{"type":24,"value":98},"三种注意力机制对比",{"type":18,"tag":26,"props":100,"children":101},{},[102],{"type":24,"value":103},"模式",{"type":18,"tag":26,"props":105,"children":106},{},[107],{"type":24,"value":108},"特点",{"type":18,"tag":26,"props":110,"children":111},{},[112],{"type":24,"value":113},"适用场景",{"type":18,"tag":26,"props":115,"children":116},{},[117],{"type":24,"value":118},"Eager",{"type":18,"tag":26,"props":120,"children":121},{},[122],{"type":24,"value":123},"直接计算完整注意力矩阵",{"type":18,"tag":26,"props":125,"children":126},{},[127],{"type":24,"value":128},"短序列",{"type":18,"tag":26,"props":130,"children":131},{},[132],{"type":24,"value":133},"SDPA",{"type":18,"tag":26,"props":135,"children":136},{},[137],{"type":24,"value":138},"通过缩放点积计算注意力权重，引入部分显存优化",{"type":18,"tag":26,"props":140,"children":141},{},[142],{"type":24,"value":143},"中等长度序列",{"type":18,"tag":26,"props":145,"children":146},{},[147],{"type":24,"value":148},"FlashAttention2",{"type":18,"tag":26,"props":150,"children":151},{},[152],{"type":24,"value":153},"通过“分块+重计算”，避免存储完整矩阵，大幅降低显存消耗",{"type":18,"tag":26,"props":155,"children":156},{},[157],{"type":24,"value":158},"长序列任务",{"type":18,"tag":26,"props":160,"children":161},{},[162
],{"type":24,"value":163},"FlashAttention 2为何能加速？想象一下拼图游戏：",{"type":18,"tag":26,"props":165,"children":166},{},[167],{"type":24,"value":168},"1. 切块处理：就像无法一次性处理超大图片，FA2将长序列切分成与硬件缓存匹配的“小拼图”分批处理，避免内存爆炸",{"type":18,"tag":26,"props":170,"children":171},{},[172],{"type":24,"value":173},"2. 分块统计：在每个“小拼图”内，先扫描计算关键统计量（如最大值、归一化因子），相当于找出每块图像的“关键特征”",{"type":18,"tag":26,"props":175,"children":176},{},[177],{"type":24,"value":178},"3. 按需重算：反向传播时，仅需保存少量统计量，按需重新计算中间结果，极大节省显存",{"type":18,"tag":26,"props":180,"children":181},{},[182],{"type":24,"value":183},"这样的“分块处理＋重计算”策略，显著降低了显存使用，提高了并行计算效率，特别适用于语音识别等需要处理超长序列的任务。",{"type":18,"tag":26,"props":185,"children":186},{},[187],{"type":18,"tag":32,"props":188,"children":189},{},[190],{"type":24,"value":191},"# 02",{"type":18,"tag":26,"props":193,"children":194},{},[195],{"type":18,"tag":32,"props":196,"children":197},{},[198],{"type":24,"value":199},"接入 FlashAttention2 模式",{"type":18,"tag":26,"props":201,"children":202},{},[203],{"type":24,"value":204},"我们将FlashAttention 2集成到MindSpore NLP中，具体流程[2]如下：",{"type":18,"tag":26,"props":206,"children":207},{},[208],{"type":18,"tag":32,"props":209,"children":210},{},[211],{"type":24,"value":212},"1、核心适配",{"type":18,"tag":214,"props":215,"children":216},"ul",{},[217,223],{"type":18,"tag":218,"props":219,"children":220},"li",{},[221],{"type":24,"value":222},"移植flash-attn 库中处理填充(Padding)的关键函数 (index_put_first_axis, index_first_axis, unpad_input, pad_input)",{"type":18,"tag":218,"props":224,"children":225},{},[226],{"type":24,"value":227},"新增modeling_flash_attention_utils.py模块，实现支撑FA2的辅助函数（如 _get_unpad_data, 
_flash_attention_forward等）",{"type":18,"tag":26,"props":229,"children":230},{},[231],{"type":18,"tag":32,"props":232,"children":233},{},[234],{"type":24,"value":235},"2、模型改造",{"type":18,"tag":214,"props":237,"children":238},{},[239,244],{"type":18,"tag":218,"props":240,"children":241},{},[242],{"type":24,"value":243},"在modeling_whisper.py中新增WhisperFlashAttention2模块",{"type":18,"tag":218,"props":245,"children":246},{},[247],{"type":24,"value":248},"用户只需在初始化模型时设置attn_implementation=\"flash_attention_2\" 即可启用FA2",{"type":18,"tag":26,"props":250,"children":251},{},[252],{"type":18,"tag":32,"props":253,"children":254},{},[255],{"type":24,"value":256},"3、初战告捷",{"type":18,"tag":26,"props":258,"children":259},{},[260],{"type":24,"value":261},"91秒音频推理时间从95秒降至约85秒，性能提升约10%，但性能仍有优化空间。",{"type":18,"tag":26,"props":263,"children":264},{},[265],{"type":18,"tag":32,"props":266,"children":267},{},[268],{"type":24,"value":269},"# 03",{"type":18,"tag":26,"props":271,"children":272},{},[273],{"type":18,"tag":32,"props":274,"children":275},{},[276],{"type":24,"value":277},"Profiler精准定位瓶颈：Conv1D成“罪魁祸首”",{"type":18,"tag":26,"props":279,"children":280},{},[281],{"type":24,"value":282},"初步优化后性能仍有瓶颈？MindSpore Profiler成为关键突破口！",{"type":18,"tag":26,"props":284,"children":285},{},[286],{"type":18,"tag":32,"props":287,"children":288},{},[289],{"type":24,"value":290},"1、Profiler: 性能瓶颈的“显微镜”",{"type":18,"tag":214,"props":292,"children":293},{},[294,299],{"type":18,"tag":218,"props":295,"children":296},{},[297],{"type":24,"value":298},"功能强大：MindSpore官方提供的性能调优利器，能够对神经网络的各个环节进行精细的性能采集和分析。",{"type":18,"tag":218,"props":300,"children":301},{},[302],{"type":24,"value":303},"使用便捷：只需先创建并初始化Profiler对象，设置采集级别和调度策略，然后在推理过程中自动收集数据：",{"type":18,"tag":305,"props":306,"children":308},"pre",{"code":307},"import mindspore\nfrom mindspore.profiler import ProfilerLevel, schedule, tensorboard_trace_handler\nexperimental_config = mindspore.profiler._ExperimentalConfig(\n                            
profiler_level=ProfilerLevel.Level0,\n                            aic_metrics=AicoreMetrics.AiCoreNone,\n                            l2_cache=False,\n                            mstx=False,\n                            data_simplification=False,\n                            export_type=[ExportType.Text])\n# Profiler 数据默认存储在路径：\n# ./data/modelfoundry-prod-node-xxx/ASCEND_PROFILER_OUTPUT\nwith mindspore.profiler.profile(\n    activities=[ProfilerActivity.CPU, ProfilerActivity.NPU],\n    schedule=mindspore.profiler.schedule(wait=0, warmup=0, active=1, repeat=1, skip_first=0),\n    on_trace_ready=mindspore.profiler.tensorboard_trace_handler(\"./data\"),\n    profile_memory=False,\n    experimental_config=experimental_config\n    ) as prof:\n    # 运行你的推理代码 (pipe(audio_file))\n    prof.step()\n",[309],{"type":18,"tag":310,"props":311,"children":312},"code",{"__ignoreMap":7},[313],{"type":24,"value":307},{"type":18,"tag":214,"props":315,"children":316},{},[317],{"type":18,"tag":218,"props":318,"children":319},{},[320],{"type":24,"value":321},"可视化分析：使用**MindStudio Insight[3]**或浏览器内置的Trace Viewer分析生成的timeline文件",{"type":18,"tag":26,"props":323,"children":324},{},[325],{"type":18,"tag":32,"props":326,"children":327},{},[328],{"type":24,"value":329},"2、性能瓶颈：低效的Conv1D实现",{"type":18,"tag":26,"props":331,"children":332},{},[333],{"type":24,"value":334},"使用 MindStudio Insight（版本 8.0.RC1） 对 timeline 文件进行分析后，可清晰定位性能瓶颈：",{"type":18,"tag":214,"props":336,"children":337},{},[338,343],{"type":18,"tag":218,"props":339,"children":340},{},[341],{"type":24,"value":342},"瓶颈算子：Conv1D",{"type":18,"tag":218,"props":344,"children":345},{},[346],{"type":24,"value":347},"问题根源：旧版本MindSpore (\u003C=2.5.0)的Conv1D是通过Conv2D间接模拟实现的：",{"type":18,"tag":26,"props":349,"children":350},{},[351],{"type":24,"value":352},"1）存在多余的维度转换操作",{"type":18,"tag":26,"props":354,"children":355},{},[356],{"type":24,"value":357},"2）计算主要在 
CPU上执行，无法利用NPU加速",{"type":18,"tag":26,"props":359,"children":360},{},[361],{"type":24,"value":362},"3）导致频繁的内存拷贝，拖累整体速度",{"type":18,"tag":26,"props":364,"children":365},{},[366],{"type":18,"tag":367,"props":368,"children":370},"img",{"alt":7,"src":369},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/06/27/ba4e32a4e2d84708a99fe3e4cc410861.png",[],{"type":18,"tag":26,"props":372,"children":373},{},[374],{"type":18,"tag":32,"props":375,"children":376},{},[377],{"type":24,"value":378},"3、解决方案：引入高效 Conv1D 实现",{"type":18,"tag":26,"props":380,"children":381},{},[382],{"type":24,"value":383},"自 MindSpore 2.6.0 起，框架已提供更高效的 Conv1D 实现，支持图模式和硬件加速。将框架升级到MindSpore 2.6.0并适配新版的Conv1D后，结合之前集成的FlashAttention 2：",{"type":18,"tag":214,"props":385,"children":386},{},[387,392,397],{"type":18,"tag":218,"props":388,"children":389},{},[390],{"type":24,"value":391},"推理耗时由原来的95秒优化至平均60秒内",{"type":18,"tag":218,"props":393,"children":394},{},[395],{"type":24,"value":396},"相比原始版本提升超过35%",{"type":18,"tag":218,"props":398,"children":399},{},[400],{"type":24,"value":401},"CPU占用显著下降，资源利用更高效",{"type":18,"tag":26,"props":403,"children":404},{},[405],{"type":18,"tag":367,"props":406,"children":408},{"alt":7,"src":407},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/06/27/a28aa777ae7542f082129e2df07ab541.png",[],{"type":18,"tag":26,"props":410,"children":411},{},[412],{"type":18,"tag":32,"props":413,"children":414},{},[415],{"type":24,"value":416},"# 
04",{"type":18,"tag":26,"props":418,"children":419},{},[420],{"type":18,"tag":32,"props":421,"children":422},{},[423],{"type":24,"value":424},"手把手推理教程",{"type":18,"tag":26,"props":426,"children":427},{},[428],{"type":24,"value":429},"想亲身体验优化后的超快Whisper？跟着以下步骤操作：",{"type":18,"tag":26,"props":431,"children":432},{},[433],{"type":18,"tag":32,"props":434,"children":435},{},[436],{"type":24,"value":437},"1、下载镜像",{"type":18,"tag":26,"props":439,"children":440},{},[441],{"type":24,"value":442},"执行以下Shell命令，拉取MindSpore官方容器镜像：",{"type":18,"tag":305,"props":444,"children":446},{"code":445},"docker pull quay.io/ascend/mindspore:openeuler-python3.10-cann8.1.rc1-mindspore2.6.0rc1\n",[447],{"type":18,"tag":310,"props":448,"children":449},{"__ignoreMap":7},[450],{"type":24,"value":445},{"type":18,"tag":26,"props":452,"children":453},{},[454],{"type":18,"tag":32,"props":455,"children":456},{},[457],{"type":24,"value":458},"2、创建并进入容器",{"type":18,"tag":26,"props":460,"children":461},{},[462],{"type":24,"value":463},"执行以下命令创建容器，name 设置为 whisper：",{"type":18,"tag":305,"props":465,"children":467},{"code":466},"\n\ndocker run -itd --privileged  --name=whisper --net=host \\\n   --shm-size 500g \\\n   --device=/dev/davinci0 \\\n   --device=/dev/davinci1 \\\n   --device=/dev/davinci2 \\\n   --device=/dev/davinci3 \\\n   --device=/dev/davinci4 \\\n   --device=/dev/davinci5 \\\n   --device=/dev/davinci6 \\\n   --device=/dev/davinci7 \\\n   --device=/dev/davinci_manager \\\n   --device=/dev/hisi_hdc \\\n   --device /dev/devmm_svm \\\n   -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \\\n   -v /usr/local/Ascend/firmware:/usr/local/Ascend/firmware \\\n   -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \\\n   -v /usr/local/sbin:/usr/local/sbin \\\n   -v /etc/hccn.conf:/etc/hccn.conf \\\n   quay.io/ascend/mindspore:openeuler-python3.10-cann8.1.rc1-mindspore2.6.0rc1 \\\n   
bash\n",[468],{"type":18,"tag":310,"props":469,"children":470},{"__ignoreMap":7},[471],{"type":24,"value":466},{"type":18,"tag":26,"props":473,"children":474},{},[475],{"type":24,"value":476},"进入容器，后续所有操作均在容器内操作。",{"type":18,"tag":305,"props":478,"children":480},{"code":479},"docker exec -it whisper bash\n",[481],{"type":18,"tag":310,"props":482,"children":483},{"__ignoreMap":7},[484],{"type":24,"value":479},{"type":18,"tag":26,"props":486,"children":487},{},[488],{"type":18,"tag":32,"props":489,"children":490},{},[491],{"type":24,"value":492},"3、安装 MindSpore NLP 与依赖",{"type":18,"tag":26,"props":494,"children":495},{},[496],{"type":24,"value":497},"执行以下脚本，安装 MindSpore NLP 及相关依赖包：",{"type":18,"tag":305,"props":499,"children":501},{"code":500},"# 安装相关依赖\nyum install ffmpeg git\n# 配置国内源\npip config set global.index-url https://repo.huaweicloud.com/repository/pypi/simple/ \n# 升级pip\npip install --upgrade pip\n# 安装mindnlp\ngit clone -b 0.4 https://github.com/mindspore-lab/mindnlp.git\ncd mindnlp\nbash scripts/build_and_reinstall.sh\n",[502],{"type":18,"tag":310,"props":503,"children":504},{"__ignoreMap":7},[505],{"type":24,"value":500},{"type":18,"tag":26,"props":507,"children":508},{},[509],{"type":18,"tag":32,"props":510,"children":511},{},[512],{"type":24,"value":513},"4、推理代码示例",{"type":18,"tag":305,"props":515,"children":517},{"code":516},"\n\nimport mindspore\nfrom mindnlp.transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline\n \n# 国内可设置HF镜像 (可选) \nimport os \nos.environ['HF_ENDPOINT'] = \"https://hf-mirror.com\"\n \n# 加载模型与处理器，启用FlashAttention 2\nmodel_id = \"openai/whisper-large-v3\"\nmodel = AutoModelForSpeechSeq2Seq.from_pretrained(\n    model_id, \n    ms_dtype=mindspore.float16, \n    low_cpu_mem_usage=True,\n    use_safetensors=True,\n    attn_implementation=\"flash_attention_2\",\n)\nprocessor = AutoProcessor.from_pretrained(model_id)\n \n# 创建推理管道\npipe = pipeline(\n    \"automatic-speech-recognition\",\n    model=model,\n    
tokenizer=processor.tokenizer,\n    feature_extractor=processor.feature_extractor,\n    ms_dtype=mindspore.float16,\n    return_timestamps=True,\n)\n \n# 执行推理\naudio_file = \"/path/to/your/audio.mp3\" # 替换为你的音频文件路径\nresult = pipe(audio_file)\nprint(result[\"text\"])    # 打印识别结果\n",[518],{"type":18,"tag":310,"props":519,"children":520},{"__ignoreMap":7},[521],{"type":24,"value":516},{"type":18,"tag":26,"props":523,"children":524},{},[525],{"type":18,"tag":32,"props":526,"children":527},{},[528],{"type":24,"value":529},"引用",{"type":18,"tag":26,"props":531,"children":532},{},[533],{"type":24,"value":534},"[1] MindSpore 性能采集工具 Profiler:",{"type":18,"tag":26,"props":536,"children":537},{},[538],{"type":18,"tag":76,"props":539,"children":542},{"href":540,"rel":541},"https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/mindspore/mindspore.Profiler.html",[80],[543],{"type":24,"value":544},"https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/mindspore/mindspore.Profiler.html",{"type":18,"tag":26,"props":546,"children":547},{},[548,550],{"type":24,"value":549},"[2] Whisper接入FlashAttention2 流程: ",{"type":18,"tag":76,"props":551,"children":554},{"href":552,"rel":553},"https://github.com/mindspore-lab/mindnlp/pull/2018",[80],[555],{"type":24,"value":552},{"type":18,"tag":26,"props":557,"children":558},{},[559],{"type":24,"value":560},"[3] MindStudio Insight工具下载:",{"type":18,"tag":26,"props":562,"children":563},{},[564],{"type":18,"tag":76,"props":565,"children":568},{"href":566,"rel":567},"https://www.hiascend.com/developer/download/community/result?module=sto",[80],[569],{"type":24,"value":566},{"title":7,"searchDepth":571,"depth":571,"links":572},4,[],"markdown","content:technology-blogs:zh:3770.md","content","technology-blogs/zh/3770.md","technology-blogs/zh/3770","md",1776506134914]