[{"data":1,"prerenderedAt":496},["ShallowReactive",2],{"content-query-NBBHXh8lSD":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":490,"_id":491,"_source":492,"_file":493,"_stem":494,"_extension":495},"/technology-blogs/zh/3705","zh",false,"","零样本声音克隆！基于昇腾+MindSpore玩转Spark-TTS ！","Spark-TTS  是一款基于大语言模型（LLM）技术的先进文本转语音系统，能够根据用户需求合成高准确度且自然流畅的定制化语音。","2025-04-21","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/04/25/d6b0a281f05b4e13b1ee1d22f52b22b3.png","technology-blogs","实践",{"type":15,"children":16,"toc":474},"root",[17,25,31,44,66,83,91,99,107,114,129,134,199,214,219,226,231,238,243,248,253,260,265,280,288,295,326,334,344,352,357,365,370,378,386,391,396,404,409,417,426,434,439,446,455,463],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"零样本声音克隆基于昇腾mindspore玩转spark-tts",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"Spark-TTS 是一款基于大语言模型（LLM）技术的先进文本转语音系统，能够根据用户需求合成高准确度且自然流畅的定制化语音。",{"type":18,"tag":26,"props":32,"children":33},{},[34,36,42],{"type":24,"value":35},"MindSpore团队现已完成对Spark-TTS 的适配，并将其开源至",{"type":18,"tag":37,"props":38,"children":39},"strong",{},[40],{"type":24,"value":41},"MindSpore ONE",{"type":24,"value":43},"仓库，本文将要给大家详细介绍，如何基于昇思MindSpore和单机Atlas 800T A2，完整实现Spark-TTS 定制化语音合成的部署流程。",{"type":18,"tag":45,"props":46,"children":47},"ul",{},[48],{"type":18,"tag":49,"props":50,"children":51},"li",{},[52,54,58],{"type":24,"value":53},"MindSpore 
ONE开源代码仓链接：",{"type":18,"tag":55,"props":56,"children":57},"br",{},[],{"type":18,"tag":59,"props":60,"children":64},"a",{"href":61,"rel":62},"https://github.com/mindspore-lab/mindone/tree/master/examples/sparktts",[63],"nofollow",[65],{"type":24,"value":61},{"type":18,"tag":67,"props":68,"children":70},"h3",{"id":69},"_01-效果展示",[71,76,78],{"type":18,"tag":37,"props":72,"children":73},{},[74],{"type":24,"value":75},"# 01",{"type":24,"value":77}," ",{"type":18,"tag":37,"props":79,"children":80},{},[81],{"type":24,"value":82},"效果展示",{"type":18,"tag":26,"props":84,"children":85},{},[86],{"type":18,"tag":37,"props":87,"children":88},{},[89],{"type":24,"value":90},"1、声音克隆",{"type":18,"tag":26,"props":92,"children":93},{},[94],{"type":18,"tag":95,"props":96,"children":98},"img",{"alt":7,"src":97},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/04/25/c8ae323240b842da933a9d1021c8ca56.png",[],{"type":18,"tag":26,"props":100,"children":101},{},[102],{"type":18,"tag":37,"props":103,"children":104},{},[105],{"type":24,"value":106},"2、可控语音合成",{"type":18,"tag":26,"props":108,"children":109},{},[110],{"type":18,"tag":95,"props":111,"children":113},{"alt":7,"src":112},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/04/25/f32baf148f6843c896f40089b060a9f8.png",[],{"type":18,"tag":67,"props":115,"children":117},{"id":116},"_02-核心特性",[118,123,124],{"type":18,"tag":37,"props":119,"children":120},{},[121],{"type":24,"value":122},"# 
02",{"type":24,"value":77},{"type":18,"tag":37,"props":125,"children":126},{},[127],{"type":24,"value":128},"核心特性",{"type":18,"tag":26,"props":130,"children":131},{},[132],{"type":24,"value":133},"SparkTTS模型具有以下特性：",{"type":18,"tag":45,"props":135,"children":136},{},[137,155,173,186],{"type":18,"tag":49,"props":138,"children":139},{},[140,145,148,150,153],{"type":18,"tag":37,"props":141,"children":142},{},[143],{"type":24,"value":144},"简洁高效：",{"type":18,"tag":55,"props":146,"children":147},{},[],{"type":24,"value":149},"1）完全基于 Qwen2.5 架构，无需依赖流匹配（flow matching）等额外生成模型。",{"type":18,"tag":55,"props":151,"children":152},{},[],{"type":24,"value":154},"2）直接通过大语言模型预测的音频编码重建语音，简化流程并提升合成效率。",{"type":18,"tag":49,"props":156,"children":157},{},[158,163,166,168,171],{"type":18,"tag":37,"props":159,"children":160},{},[161],{"type":24,"value":162},"高保真音色克隆：",{"type":18,"tag":55,"props":164,"children":165},{},[],{"type":24,"value":167},"1）支持零样本语音克隆（zero-shot），即使无目标说话人的训练数据，也能复现其音色。",{"type":18,"tag":55,"props":169,"children":170},{},[],{"type":24,"value":172},"2）特别适用于跨语言和语码转换场景，无需针对每种语言或音色单独训练。",{"type":18,"tag":49,"props":174,"children":175},{},[176,181,184],{"type":18,"tag":37,"props":177,"children":178},{},[179],{"type":24,"value":180},"双语混合支持：",{"type":18,"tag":55,"props":182,"children":183},{},[],{"type":24,"value":185},"兼容中文与英文，支持跨语言和语码切换的零样本克隆，实现多语言自然流畅的语音合成。",{"type":18,"tag":49,"props":187,"children":188},{},[189,194,197],{"type":18,"tag":37,"props":190,"children":191},{},[192],{"type":24,"value":193},"可控语音生成：",{"type":18,"tag":55,"props":195,"children":196},{},[],{"type":24,"value":198},"可通过调节性别、音高、语速等参数，自定义虚拟发音人声线特征。",{"type":18,"tag":67,"props":200,"children":202},{"id":201},"_03-模型介绍",[203,208,209],{"type":18,"tag":37,"props":204,"children":205},{},[206],{"type":24,"value":207},"# 
03",{"type":24,"value":77},{"type":18,"tag":37,"props":210,"children":211},{},[212],{"type":24,"value":213},"模型介绍",{"type":18,"tag":26,"props":215,"children":216},{},[217],{"type":24,"value":218},"Spark-TTS的语言模型采用仅解码器（decoder-only）的Transformer架构，与典型的文本语言模型保持一致。它使用预训练的文本LLM Qwen2.5-0.5B作为骨干模型。Spark-TTS不需要流匹配来生成声学特征，而是通过BiCodec的解码器直接处理LM的输出，生成最终的音频。Spark-TTS的语音合成流程如下图所示：",{"type":18,"tag":26,"props":220,"children":221},{},[222],{"type":18,"tag":95,"props":223,"children":225},{"alt":7,"src":224},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/04/25/522b292802f2488ba339f7ca15a707d0.png",[],{"type":18,"tag":26,"props":227,"children":228},{},[229],{"type":24,"value":230},"figure1.infer_control",{"type":18,"tag":26,"props":232,"children":233},{},[234],{"type":18,"tag":95,"props":235,"children":237},{"alt":7,"src":236},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/04/25/28e001e3069b41909ac8afe82e172f13.png",[],{"type":18,"tag":26,"props":239,"children":240},{},[241],{"type":24,"value":242},"figure2.infer_voice_cloning",{"type":18,"tag":26,"props":244,"children":245},{},[246],{"type":24,"value":247},"BiCodec包括一个全局tokenizer 和一个语义tokenizer 。前者从输入音频的梅尔频谱图中提取全局token ，后者使用wav2vec 2.0的特征作为输入提取语义token 。BiCodec的架构遵循标准的VQ-VAE编码器-解码器框架，并增加了tokenizer，解码器将离散token 重构为音频信号。语义tokenizer 的编码器和解码器是基于ConvNeXt块的卷积神经网络，采用单码本矢量量化。全局tokenizer 的编码器使用ECAPA-TDNN架构，并通过交叉注意力机制提取固定长度的全局token 序列，使用FSQ进行量化，以避免训练崩溃的风险。",{"type":18,"tag":26,"props":249,"children":250},{},[251],{"type":24,"value":252},"LM模型结构如下：",{"type":18,"tag":26,"props":254,"children":255},{},[256],{"type":18,"tag":95,"props":257,"children":259},{"alt":7,"src":258},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/04/25/c019061d79f54134b90d9671cc047f0e.png",[],{"type":18,"tag":26,"props":261,"children":262},{},[263],{"type":24,"value":264},"figure3.Speech language model of 
Spark-TTS",{"type":18,"tag":67,"props":266,"children":268},{"id":267},"_04-快速上手",[269,274,275],{"type":18,"tag":37,"props":270,"children":271},{},[272],{"type":24,"value":273},"# 04",{"type":24,"value":77},{"type":18,"tag":37,"props":276,"children":277},{},[278],{"type":24,"value":279},"快速上手",{"type":18,"tag":26,"props":281,"children":282},{},[283],{"type":18,"tag":37,"props":284,"children":285},{},[286],{"type":24,"value":287},"1、环境准备",{"type":18,"tag":26,"props":289,"children":290},{},[291],{"type":18,"tag":95,"props":292,"children":294},{"alt":7,"src":293},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/04/25/cda25f41e8ea479c82b90c446e72c111.png",[],{"type":18,"tag":45,"props":296,"children":297},{},[298,312],{"type":18,"tag":49,"props":299,"children":300},{},[301,303,306],{"type":24,"value":302},"CANN下载：",{"type":18,"tag":55,"props":304,"children":305},{},[],{"type":18,"tag":59,"props":307,"children":310},{"href":308,"rel":309},"https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.RC3.beta1",[63],[311],{"type":24,"value":308},{"type":18,"tag":49,"props":313,"children":314},{},[315,317,320],{"type":24,"value":316},"MindSpore下载：",{"type":18,"tag":55,"props":318,"children":319},{},[],{"type":18,"tag":59,"props":321,"children":324},{"href":322,"rel":323},"https://www.mindspore.cn/install",[63],[325],{"type":24,"value":322},{"type":18,"tag":26,"props":327,"children":328},{},[329],{"type":18,"tag":37,"props":330,"children":331},{},[332],{"type":24,"value":333},"2、安装依赖",{"type":18,"tag":335,"props":336,"children":338},"pre",{"code":337},"git clone https://github.com/mindspore-lab/mindone\ncd mindone/examples/sparktts\npip install -r 
requirements.txt\n",[339],{"type":18,"tag":340,"props":341,"children":342},{"__ignoreMap":7},[343],{"type":24,"value":337},{"type":18,"tag":26,"props":345,"children":346},{},[347],{"type":18,"tag":37,"props":348,"children":349},{},[350],{"type":24,"value":351},"3、模型下载",{"type":18,"tag":26,"props":353,"children":354},{},[355],{"type":24,"value":356},"从Huggingface下载模型权重:",{"type":18,"tag":335,"props":358,"children":360},{"code":359},"\nfrom huggingface_hub import snapshot_download\nsnapshot_download(\"SparkAudio/Spark-TTS-0.5B\", local_dir=\"pretrained_models/Spark-TTS-0.5B\")\n",[361],{"type":18,"tag":340,"props":362,"children":363},{"__ignoreMap":7},[364],{"type":24,"value":359},{"type":18,"tag":26,"props":366,"children":367},{},[368],{"type":24,"value":369},"下载完成后，使用以下命令把wav2vec2模型权重从bin格式转为safetensors格式：",{"type":18,"tag":335,"props":371,"children":373},{"code":372},"python convert.py \\\n    --pt_filename pretrained_models/Spark-TTS-0.5B/wav2vec2-large-xlsr-53/pytorch_model.bin \\\n    --sf_filename pretrained_models/Spark-TTS-0.5B/wav2vec2-large-xlsr-53/model.safetensors \\\n    --config_path pretrained_models/Spark-TTS-0.5B/wav2vec2-large-xlsr-53/config.json\n",[374],{"type":18,"tag":340,"props":375,"children":376},{"__ignoreMap":7},[377],{"type":24,"value":372},{"type":18,"tag":26,"props":379,"children":380},{},[381],{"type":18,"tag":37,"props":382,"children":383},{},[384],{"type":24,"value":385},"4、运行推理",{"type":18,"tag":26,"props":387,"children":388},{},[389],{"type":24,"value":390},"进行语音生成推理也非常简单，运行下面的命令即可：",{"type":18,"tag":26,"props":392,"children":393},{},[394],{"type":24,"value":395},"如果想要自定义语音生成的内容，只需要修改--text，输入你想要生成的文字即可，支持多种语言文字输入。另外，你还可以通过修改--prompt_speech_path中提供的参考语音来控制你想要克隆的音色。",{"type":18,"tag":335,"props":397,"children":399},{"code":398},"python -m cli.inference \\\n    --text \"text to synthesis.\" \\\n    --save_dir \"path/to/save/audio\" \\\n    --model_dir pretrained_models/Spark-TTS-0.5B \\\n    --prompt_text \"transcript of the 
prompt audio\" \\\n    --prompt_speech_path \"path/to/prompt_audio\"\n",[400],{"type":18,"tag":340,"props":401,"children":402},{"__ignoreMap":7},[403],{"type":24,"value":398},{"type":18,"tag":26,"props":405,"children":406},{},[407],{"type":24,"value":408},"如果你想要自定义虚拟发音人声线特征，调节性别、音高、语速等参数，只需要运行以下添加了此类参数的命令即可：",{"type":18,"tag":335,"props":410,"children":412},{"code":411},"\npython -m cli.inference \\\n    --text \"text to synthesis.\" \\\n    --save_dir \"path/to/save/audio\" \\\n    --model_dir pretrained_models/Spark-TTS-0.5B \\\n    --prompt_text \"transcript of the prompt audio\" \\\n    --prompt_speech_path \"path/to/prompt_audio\" \\\n    --gender choices=[\"male\",\"female\"]\\\n    --pitch choices=[\"very_low\",\"low\", \"moderate\", \"high\", \"very_high\"]\\\n    --speed choices=[\"very_low\",\"low\", \"moderate\", \"high\", \"very_high\"]\\\n",[413],{"type":18,"tag":340,"props":414,"children":415},{"__ignoreMap":7},[416],{"type":24,"value":411},{"type":18,"tag":67,"props":418,"children":420},{"id":419},"_05",[421],{"type":18,"tag":37,"props":422,"children":423},{},[424],{"type":24,"value":425},"# 05",{"type":18,"tag":67,"props":427,"children":429},{"id":428},"性能实测",[430],{"type":18,"tag":37,"props":431,"children":432},{},[433],{"type":24,"value":428},{"type":18,"tag":26,"props":435,"children":436},{},[437],{"type":24,"value":438},"基于Atlas 800T A2和MindSpore2.5.0的性能测试结果如下：",{"type":18,"tag":26,"props":440,"children":441},{},[442],{"type":18,"tag":95,"props":443,"children":445},{"alt":7,"src":444},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/04/25/8cfe2e5c5aca4702999024122fb4f13c.png",[],{"type":18,"tag":67,"props":447,"children":449},{"id":448},"_06",[450],{"type":18,"tag":37,"props":451,"children":452},{},[453],{"type":24,"value":454},"# 
06",{"type":18,"tag":67,"props":456,"children":458},{"id":457},"马上体验",[459],{"type":18,"tag":37,"props":460,"children":461},{},[462],{"type":24,"value":457},{"type":18,"tag":26,"props":464,"children":465},{},[466,468],{"type":24,"value":467},"我们在魔乐社区上完成了Spark-TTS的部署！欢迎体验：",{"type":18,"tag":59,"props":469,"children":472},{"href":470,"rel":471},"https://modelers.cn/spaces/MindSpore-Lab/sparktts",[63],[473],{"type":24,"value":470},{"title":7,"searchDepth":475,"depth":475,"links":476},4,[477,480,482,484,486,487,488,489],{"id":69,"depth":478,"text":479},3,"# 01 效果展示",{"id":116,"depth":478,"text":481},"# 02 核心特性",{"id":201,"depth":478,"text":483},"# 03 模型介绍",{"id":267,"depth":478,"text":485},"# 04 快速上手",{"id":419,"depth":478,"text":425},{"id":428,"depth":478,"text":428},{"id":448,"depth":478,"text":454},{"id":457,"depth":478,"text":457},"markdown","content:technology-blogs:zh:3705.md","content","technology-blogs/zh/3705.md","technology-blogs/zh/3705","md",1776506133358]