[{"data":1,"prerenderedAt":460},["ShallowReactive",2],{"content-query-1EDIp0hTk3":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":454,"_id":455,"_source":456,"_file":457,"_stem":458,"_extension":459},"/technology-blogs/zh/3797","zh",false,"","audio_spectrogram_transformer模型论文解读，并基于MindSpore NLP推理复现","作者：洛伦兹的肯定   来源：开源实习","2025-07-16","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/18/fdef6debc4ed439b95816083ce55b4b0.png","technology-blogs","开发者说",{"type":15,"children":16,"toc":451},"root",[17,25,31,36,41,49,54,63,71,76,81,88,93,101,109,117,127,137,142,150,158,166,171,176,184,189,194,199,204,209,217,222,227,232,240,248,258,268,273,281,289,294,301,306,313,320,325,332,337,345,353,358,365,370,382,390,398,406,416,426,436,446],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"audio_spectrogram_transformer模型论文解读并基于mindspore-nlp推理复现",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"作者：洛伦兹的肯定",{"type":18,"tag":26,"props":32,"children":33},{},[34],{"type":24,"value":35},"来源：开源实习",{"type":18,"tag":26,"props":37,"children":38},{},[39],{"type":24,"value":40},"昇思MindSpore开源实习模型论文解读任务已顺利完成，共收到高质量模型论文解读稿件10+篇。欢迎开发者积极参与昇思MindSpore开源实习活动，开源实习暑期活动已开启，更多新任务等你来挑战！",{"type":18,"tag":26,"props":42,"children":43},{},[44],{"type":18,"tag":45,"props":46,"children":48},"img",{"alt":7,"src":47},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/18/00655f07a4c144668da1ee43c129f48e.png",[],{"type":18,"tag":26,"props":50,"children":51},{},[52],{"type":24,"value":53},"开源实习官网",{"type":18,"tag":26,"props":55,"children":56},{},[57],{"type":18,"tag":58,"props":59,"children":60},"strong",{},[61],{"type":24,"value":62},"# 
01",{"type":18,"tag":26,"props":64,"children":65},{},[66],{"type":18,"tag":58,"props":67,"children":68},{},[69],{"type":24,"value":70},"引言",{"type":18,"tag":26,"props":72,"children":73},{},[74],{"type":24,"value":75},"近年来，Transformer架构在自然语言处理和计算机视觉领域取得了巨大成功。然而，在音频处理领域，Transformer的应用相对较少。传统的音频分类方法通常依赖于卷积神经网络（CNN）或循环神经网络（RNN），这些方法在处理长序列音频数据时存在一定的局限性。Transformer的自注意力机制能够更好地捕捉音频信号中的全局依赖关系，因此作者提出了将Transformer应用于音频分类任务的想法。",{"type":18,"tag":26,"props":77,"children":78},{},[79],{"type":24,"value":80},"原论文名为《Audio Spectrogram Transformer》，主要探讨了如何将Transformer架构应用于音频分类任务。作者提出了一种基于音频频谱图的Transformer模型（AST），并通过在多个音频数据集上的实验验证了其有效性。实验结果显示，AST在多个音频分类任务上取得了state-of-the-art的性能，尤其是在AudioSet、ESC-50和Speech Commands数据集上表现优异。",{"type":18,"tag":26,"props":82,"children":83},{},[84],{"type":18,"tag":45,"props":85,"children":87},{"alt":7,"src":86},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/18/9baabc8758094f8e91f9a1ec3f07ed25.png",[],{"type":18,"tag":26,"props":89,"children":90},{},[91],{"type":24,"value":92},"ast主要模型架构",{"type":18,"tag":26,"props":94,"children":95},{},[96],{"type":18,"tag":58,"props":97,"children":98},{},[99],{"type":24,"value":100},"# 02",{"type":18,"tag":26,"props":102,"children":103},{},[104],{"type":18,"tag":58,"props":105,"children":106},{},[107],{"type":24,"value":108},"论文研究方法",{"type":18,"tag":26,"props":110,"children":111},{},[112],{"type":18,"tag":58,"props":113,"children":114},{},[115],{"type":24,"value":116},"1、基于Transformer的音频分类模型：",{"type":18,"tag":26,"props":118,"children":119},{},[120,122],{"type":24,"value":121},"作者提出了一种基于音频频谱图的Transformer模型（AST），该模型将音频信号转换为频谱图，并将其作为输入传递给Transformer编码器。通过这种方式，模型能够捕捉音频信号中的全局依赖关系。 ",{"type":18,"tag":58,"props":123,"children":124},{},[125],{"type":24,"value":126},"2、ImageNet预训练：",{"type":18,"tag":26,"props":128,"children":129},{},[130,132],{"type":24,"value":131},"作者发现，通过在ImageNet数据集上预训练Transformer编码器，可以显著提高模型在音频分类任务上的性能。这种跨模态的预训练策略为音频分类任务提供了新的思路。 
",{"type":18,"tag":58,"props":133,"children":134},{},[135],{"type":24,"value":136},"3、多数据集实验：",{"type":18,"tag":26,"props":138,"children":139},{},[140],{"type":24,"value":141},"作者在多个音频数据集上进行了实验，包括AudioSet、ESC-50和Speech Commands。实验结果表明，AST在这些数据集上均取得了state-of-the-art的性能。",{"type":18,"tag":26,"props":143,"children":144},{},[145],{"type":18,"tag":58,"props":146,"children":147},{},[148],{"type":24,"value":149},"# 03",{"type":18,"tag":26,"props":151,"children":152},{},[153],{"type":18,"tag":58,"props":154,"children":155},{},[156],{"type":24,"value":157},"论文模型及训练细节",{"type":18,"tag":26,"props":159,"children":160},{},[161],{"type":18,"tag":58,"props":162,"children":163},{},[164],{"type":24,"value":165},"1、频谱图输入",{"type":18,"tag":26,"props":167,"children":168},{},[169],{"type":24,"value":170},"输入表示：AST的输入是音频信号的频谱图，通常是通过短时傅里叶变换（STFT）或梅尔频谱图（Mel-spectrogram）生成的。频谱图可以被视为一个二维图像，其中时间轴和频率轴分别对应图像的宽度和高度。",{"type":18,"tag":26,"props":172,"children":173},{},[174],{"type":24,"value":175},"预处理：在输入Transformer之前，频谱图会被分割成固定大小的patch（类似于图像中的小块），这些patch会被展平并作为Transformer的输入。",{"type":18,"tag":26,"props":177,"children":178},{},[179],{"type":18,"tag":58,"props":180,"children":181},{},[182],{"type":24,"value":183},"2、模型架构细节",{"type":18,"tag":26,"props":185,"children":186},{},[187],{"type":24,"value":188},"Patch Embedding：每个patch会被线性投影到一个固定维度的嵌入向量，这些嵌入向量会被输入到Transformer中。",{"type":18,"tag":26,"props":190,"children":191},{},[192],{"type":24,"value":193},"Positional Encoding：由于Transformer本身不包含位置信息，AST会为每个patch添加位置编码，以保留其在频谱图中的位置信息。",{"type":18,"tag":26,"props":195,"children":196},{},[197],{"type":24,"value":198},"Self-Attention Mechanism：Transformer的核心是自注意力机制，它允许模型在处理每个patch时考虑到所有其他patch的信息。这种机制使得AST能够捕捉音频中的全局依赖关系。",{"type":18,"tag":26,"props":200,"children":201},{},[202],{"type":24,"value":203},"Multi-Head Attention：AST使用了多头注意力机制，允许模型在不同的表示子空间中学习不同的特征。",{"type":18,"tag":26,"props":205,"children":206},{},[207],{"type":24,"value":208},"Feed-Forward 
Network：在自注意力层之后，AST会通过一个前馈神经网络进一步处理特征。",{"type":18,"tag":26,"props":210,"children":211},{},[212],{"type":18,"tag":58,"props":213,"children":214},{},[215],{"type":24,"value":216},"3、预训练与微调",{"type":18,"tag":26,"props":218,"children":219},{},[220],{"type":24,"value":221},"ImageNet预训练：AST的Transformer部分是在ImageNet数据集上进行预训练的，这使得模型能够从大规模图像数据中学习到有用的特征表示。",{"type":18,"tag":26,"props":223,"children":224},{},[225],{"type":24,"value":226},"微调：在音频分类任务中，AST会在特定的音频数据集（如AudioSet、ESC-50等）上进行微调，以适应具体的任务需求。",{"type":18,"tag":26,"props":228,"children":229},{},[230],{"type":24,"value":231},"# 04",{"type":18,"tag":26,"props":233,"children":234},{},[235],{"type":18,"tag":58,"props":236,"children":237},{},[238],{"type":24,"value":239},"论文创新点分析",{"type":18,"tag":26,"props":241,"children":242},{},[243],{"type":18,"tag":58,"props":244,"children":245},{},[246],{"type":24,"value":247},"1、将Transformer架构应用于音频分类任务：",{"type":18,"tag":26,"props":249,"children":250},{},[251,253],{"type":24,"value":252},"传统的音频分类方法通常依赖于CNN或RNN，而AST首次将Transformer架构应用于音频分类任务，并取得了显著的效果。这一创新为音频处理领域提供了新的研究方向。 ",{"type":18,"tag":58,"props":254,"children":255},{},[256],{"type":24,"value":257},"2、跨模态预训练策略：",{"type":18,"tag":26,"props":259,"children":260},{},[261,263],{"type":24,"value":262},"作者通过在ImageNet数据集上预训练Transformer编码器，显著提高了模型在音频分类任务上的性能。这种跨模态的预训练策略为音频分类任务提供了新的思路。 ",{"type":18,"tag":58,"props":264,"children":265},{},[266],{"type":24,"value":267},"3、全局依赖关系的捕捉：",{"type":18,"tag":26,"props":269,"children":270},{},[271],{"type":24,"value":272},"Transformer的自注意力机制能够更好地捕捉音频信号中的全局依赖关系，这使得AST在处理长序列音频数据时具有优势。",{"type":18,"tag":26,"props":274,"children":275},{},[276],{"type":18,"tag":58,"props":277,"children":278},{},[279],{"type":24,"value":280},"# 05",{"type":18,"tag":26,"props":282,"children":283},{},[284],{"type":18,"tag":58,"props":285,"children":286},{},[287],{"type":24,"value":288},"结果",{"type":18,"tag":26,"props":290,"children":291},{},[292],{"type":24,"value":293},"作者在多个音频数据集上进行了实验，以下是AST在这些数据集上的训练细节和结果： 
AudioSet：mAP（mean Average Precision）为0.485，相比之前的state-of-the-art模型（PSLA）提高了2.3%。",{"type":18,"tag":26,"props":295,"children":296},{},[297],{"type":18,"tag":45,"props":298,"children":300},{"alt":7,"src":299},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/18/4e920f3cf99a4c68937af63055aa4c4a.png",[],{"type":18,"tag":26,"props":302,"children":303},{},[304],{"type":24,"value":305},"消融实验：作者进行了多项消融实验，验证了不同组件对模型性能的影响。例如，去除ImageNet预训练后，模型性能显著下降，表明预训练对模型的重要性。",{"type":18,"tag":26,"props":307,"children":308},{},[309],{"type":18,"tag":45,"props":310,"children":312},{"alt":7,"src":311},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/18/1d11aeaf082d45d7a21fd714523d09de.png",[],{"type":18,"tag":26,"props":314,"children":315},{},[316],{"type":18,"tag":45,"props":317,"children":319},{"alt":7,"src":318},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/18/f0c87958c4214ec68b4f367a3f10af0e.png",[],{"type":18,"tag":26,"props":321,"children":322},{},[323],{"type":24,"value":324},"ESC-50：准确率为95.6%，相比之前的state-of-the-art模型（PSLA）提高了1.2%。",{"type":18,"tag":26,"props":326,"children":327},{},[328],{"type":18,"tag":45,"props":329,"children":331},{"alt":7,"src":330},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/18/45c0eb6d0d214a5fa2d8f16a5c39d246.png",[],{"type":18,"tag":26,"props":333,"children":334},{},[335],{"type":24,"value":336},"Speech Commands：准确率为98.1%，相比之前的state-of-the-art模型（PSLA）提高了0.5%。",{"type":18,"tag":26,"props":338,"children":339},{},[340],{"type":18,"tag":58,"props":341,"children":342},{},[343],{"type":24,"value":344},"# 06",{"type":18,"tag":26,"props":346,"children":347},{},[348],{"type":18,"tag":58,"props":349,"children":350},{},[351],{"type":24,"value":352},"使用MindSpore 
NLP进行模型评估",{"type":18,"tag":26,"props":354,"children":355},{},[356],{"type":24,"value":357},"我们将使用MindNLP加载AST模型，并在ESC-50数据集上进行评估。以下是评估与推理的结果：",{"type":18,"tag":26,"props":359,"children":360},{},[361],{"type":18,"tag":45,"props":362,"children":364},{"alt":7,"src":363},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/07/18/a0f82dfd3c594074a14d878789fc7e0c.png",[],{"type":18,"tag":26,"props":366,"children":367},{},[368],{"type":24,"value":369},"MindSpore NLP相关实现评估推理代码请看以下链接：",{"type":18,"tag":26,"props":371,"children":372},{},[373],{"type":18,"tag":374,"props":375,"children":379},"a",{"href":376,"rel":377},"https://github.com/guyueyuan/audio_spectrogram_transformer_mindnlp",[378],"nofollow",[380],{"type":24,"value":381},"https://github.com/guyueyuan/audio_spectrogram_transformer_mindnlp",{"type":18,"tag":26,"props":383,"children":384},{},[385],{"type":18,"tag":58,"props":386,"children":387},{},[388],{"type":24,"value":389},"# 07",{"type":18,"tag":26,"props":391,"children":392},{},[393],{"type":18,"tag":58,"props":394,"children":395},{},[396],{"type":24,"value":397},"总结",{"type":18,"tag":26,"props":399,"children":400},{},[401],{"type":18,"tag":58,"props":402,"children":403},{},[404],{"type":24,"value":405},"1、音频谱图Transformer模型的有效性",{"type":18,"tag":26,"props":407,"children":408},{},[409,411],{"type":24,"value":410},"该论文提出的音频谱图Transformer（AST）模型在音频分类任务中表现出色，特别是在AudioSet、ESC-50和Speech Commands数据集上取得了显著的性能提升。 ",{"type":18,"tag":58,"props":412,"children":413},{},[414],{"type":24,"value":415},"2、ImageNet预训练的重要性",{"type":18,"tag":26,"props":417,"children":418},{},[419,421],{"type":24,"value":420},"通过使用ImageNet预训练的视觉Transformer模型作为基础，AST模型能够更好地捕捉音频谱图中的全局特征，从而提高了分类精度。 ",{"type":18,"tag":58,"props":422,"children":423},{},[424],{"type":24,"value":425},"3、模型架构的优化",{"type":18,"tag":26,"props":427,"children":428},{},[429,431],{"type":24,"value":430},"AST模型通过引入Transformer架构，能够有效地处理长序列数据，并且在音频分类任务中表现出比传统卷积神经网络（CNN）更好的性能。 
",{"type":18,"tag":58,"props":432,"children":433},{},[434],{"type":24,"value":435},"4、消融实验的验证",{"type":18,"tag":26,"props":437,"children":438},{},[439,441],{"type":24,"value":440},"通过一系列消融实验，论文验证了不同组件（如位置编码、多头注意力机制等）对模型性能的贡献，进一步证明了AST模型设计的合理性。 ",{"type":18,"tag":58,"props":442,"children":443},{},[444],{"type":24,"value":445},"5、跨数据集的泛化能力",{"type":18,"tag":26,"props":447,"children":448},{},[449],{"type":24,"value":450},"AST模型不仅在AudioSet上表现优异，还在ESC-50和Speech Commands数据集上展现了强大的泛化能力，表明该模型适用于多种音频分类任务。",{"title":7,"searchDepth":452,"depth":452,"links":453},4,[],"markdown","content:technology-blogs:zh:3797.md","content","technology-blogs/zh/3797.md","technology-blogs/zh/3797","md",1776506135489]