[{"data":1,"prerenderedAt":942},["ShallowReactive",2],{"content-query-K6YmzkPNgm":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":936,"_id":937,"_source":938,"_file":939,"_stem":940,"_extension":941},"/technology-blogs/zh/2026-1-6","zh",false,"","基于昇思MindSpore，国内首个开源的全自主创新千亿参数细粒度MoE语义大模型TeleChat3-105B-A4.7-Thinking正式发布","国内首个开源的全自主创新训练的千亿参数细粒度MoE语义大模型","2026-1-6","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/28/8e0e0150508a4c5ba4287fa3bec8ea3f.png","technology-blogs","实践",{"type":15,"children":16,"toc":918},"root",[17,25,31,40,48,62,73,83,88,98,105,112,121,133,145,152,171,188,198,205,217,226,231,236,241,246,253,258,263,268,280,285,293,303,308,363,368,376,381,387,390,402,408,412,417,424,428,434,438,450,457,461,467,471,483,493,507,512,526,551,556,570,575,580,588,593,605,610,615,620,625,630,635,640,650,655,663,672,677,682,686,691,711,715,720,724,739,762,767,779,789,803,808,813,818,823,828,833,838,843,848,853,858,863,868,873,881,886,890,895,904,909],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"基于昇思mindspore国内首个开源的全自主创新千亿参数细粒度moe语义大模型telechat3-105b-a47-thinking正式发布",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"TeleChat3-105B-A4.7-Thinking，国内首个开源的全自主创新训练的千亿参数细粒度MoE语义大模型，正式发布！在问答、写作、数学、代码、Agent等多维度比肩业内头部。该模型基于昇思MindSpore AI框架训练完成，现已正式发布，欢迎广大开发者下载体验！",{"type":18,"tag":26,"props":32,"children":33},{},[34],{"type":18,"tag":35,"props":36,"children":37},"strong",{},[38],{"type":24,"value":39},"# 01",{"type":18,"tag":26,"props":41,"children":42},{},[43],{"type":18,"tag":35,"props":44,"children":45},{},[46],{"type":24,"value":47},"模型介绍",{"type":18,"tag":26,"props":49,"children":50},{},[51,53],{"type":24,"value":52},"TeleChat3-105B-A4.7-Thinking是 
TeleChat系列国内首个开源的全自主创新千亿参数细粒度MoE语义大模型，由中国电信人工智能研究院（TeleAI）研发训练，在问答、写作、数学、代码、Agent等多维度，与业内头部模型比肩，特别在代码能力、复杂任务通用问答、细粒度MoE等维度上有显著的效果提升，同时采用创新训练方式，加快模型在训练初期的收敛速度，增强模型在训练中的稳定性。具体请参考 TeleChat3（",{"type":18,"tag":54,"props":55,"children":59},{"href":56,"rel":57},"https://github.com/Tele-AI/TeleChat3",[58],"nofollow",[60],{"type":24,"value":61},"https://github.com/Tele-AI/TeleChat3）。",{"type":18,"tag":63,"props":64,"children":66},{"style":65},"text-align: center;",[67],{"type":18,"tag":68,"props":69,"children":72},{"src":70,"style":71,"alt":7},"/category/information/technology-blogs/banner/2026-1-6-1.jpg","display: block;margin: 0 auto;max-width:70%",[],{"type":18,"tag":74,"props":75,"children":77},{"id":76},"_1代码能力提升复杂任务拿捏住了",[78],{"type":18,"tag":35,"props":79,"children":80},{},[81],{"type":24,"value":82},"1、代码能力提升，复杂任务拿捏住了！",{"type":18,"tag":26,"props":84,"children":85},{},[86],{"type":24,"value":87},"面对综合任务场景，TeleChat3-105B-A4.7-Thinking 高效拆解任务需求，整合多项代码能力，一次性交付出完整可运行的代码。",{"type":18,"tag":26,"props":89,"children":90},{},[91,93],{"type":24,"value":92},"省去大量人工调试时间投入，",{"type":18,"tag":35,"props":94,"children":95},{},[96],{"type":24,"value":97},"运行流畅，审美在线！",{"type":18,"tag":63,"props":99,"children":100},{"style":65},[101],{"type":18,"tag":68,"props":102,"children":104},{"src":103,"style":71,"alt":7},"/category/information/technology-blogs/banner/2026-1-6-2.jpg",[],{"type":18,"tag":63,"props":106,"children":107},{"style":65},[108],{"type":18,"tag":68,"props":109,"children":111},{"src":110,"style":71,"alt":7},"/category/information/technology-blogs/banner/2026-1-6-3.jpg",[],{"type":18,"tag":74,"props":113,"children":115},{"id":114},"_2细粒度moe-术业有专攻协同更高效",[116],{"type":18,"tag":35,"props":117,"children":118},{},[119],{"type":24,"value":120},"2、细粒度MoE, 术业有专攻，协同更高效！",{"type":18,"tag":26,"props":122,"children":123},{},[124,126,131],{"type":24,"value":125},"此前，中国电信人工智能研究院（TeleAI）与中电信人工智能科技有限公司已陆续开源原创打造的 TeleChat、TeleChat2 
及TeleChat2.5系列模型，以传统",{"type":18,"tag":35,"props":127,"children":128},{},[129],{"type":24,"value":130},"稠密参数",{"type":24,"value":132},"架构为主，模型尺寸覆盖十亿到千亿，构建了全尺寸大模型开源布局。",{"type":18,"tag":26,"props":134,"children":135},{},[136,138,143],{"type":24,"value":137},"上半年，星辰语义大模型的首个MoE架构模型TeleChat2-39B-A12B也正式开源,采用",{"type":18,"tag":35,"props":139,"children":140},{},[141],{"type":24,"value":142},"粗粒度MoE架构",{"type":24,"value":144},"，初步实现知识模块化存储，按需唤醒相关专家模块。",{"type":18,"tag":63,"props":146,"children":147},{"style":65},[148],{"type":18,"tag":68,"props":149,"children":151},{"src":150,"style":71,"alt":7},"/category/information/technology-blogs/banner/2026-1-6-4.jpg",[],{"type":18,"tag":26,"props":153,"children":154},{},[155,157,162,164,169],{"type":24,"value":156},"为了进一步提升MoE 大模型的效率与性能，让参数利用更充分，TeleAI团队基于昇腾与MindSpore训练框架完成了",{"type":18,"tag":35,"props":158,"children":159},{},[160],{"type":24,"value":161},"TeleChat3-105B-A4.7-Thinking",{"type":24,"value":163}," 的全自主创新训练。该模型采用",{"type":18,"tag":35,"props":165,"children":166},{},[167],{"type":24,"value":168},"细粒度 MoE 架构",{"type":24,"value":170},"，基础模型训练数据超15T，共包括1个共享专家和192个路由专家（每次激活4个专家），模型整体共105B参数量，实际激活参数为 
4.7B，专家稀疏比处于业界前列。",{"type":18,"tag":26,"props":172,"children":173},{},[174,176,181,183],{"type":24,"value":175},"面对不同的任务类型，更加细分的专家子模块实现了",{"type":18,"tag":35,"props":177,"children":178},{},[179],{"type":24,"value":180},"术业有专攻",{"type":24,"value":182},"，模块之间也实现了",{"type":18,"tag":35,"props":184,"children":185},{},[186],{"type":24,"value":187},"更精准、更任务导向的协同。",{"type":18,"tag":26,"props":189,"children":190},{},[191,193],{"type":24,"value":192},"打个比方，假如大模型是个理综考生，稠密参数大模型就是从一本“十年高考真题大全”合订本里找思路，知识庞杂，效率低下。粗粒度模型，则实现了初步的学科分类和调用，减少了无效的知识调用。细粒度MoE，则是更进一步，特定的题目只调用特定的细分知识点组合，",{"type":18,"tag":35,"props":194,"children":195},{},[196],{"type":24,"value":197},"见招拆招，精准调配。",{"type":18,"tag":63,"props":199,"children":200},{"style":65},[201],{"type":18,"tag":68,"props":202,"children":204},{"src":203,"style":71,"alt":7},"/category/information/technology-blogs/banner/2026-1-6-5.jpg",[],{"type":18,"tag":26,"props":206,"children":207},{},[208,210,215],{"type":24,"value":209},"此外，TeleAI还同步开源了稠密参数模型",{"type":18,"tag":35,"props":211,"children":212},{},[213],{"type":24,"value":214},"TeleChat3-36B-Thinking",{"type":24,"value":216},"模型，在知识、逻辑推理、智能体等维度实现了能力提升，并实现了文本创作、语义理解、角色扮演等任务的针对性优化。",{"type":18,"tag":74,"props":218,"children":220},{"id":219},"_3训练方式创新黑科技拉满收敛稳效率优",[221],{"type":18,"tag":35,"props":222,"children":223},{},[224],{"type":24,"value":225},"3、训练方式创新：黑科技拉满，收敛稳、效率优",{"type":18,"tag":26,"props":227,"children":228},{},[229],{"type":24,"value":230},"TeleAI 科研团队采用细粒度的模型初始化方式和学习率控制，对不同权重采用不同的初始化方式和学习率，加快模型在训练初期的收敛速度，增强模型在训练中的稳定性。",{"type":18,"tag":26,"props":232,"children":233},{},[234],{"type":24,"value":235},"基础模型训练通过两个阶段预训练和一个阶段中训练完成，总计训练 15T tokens。",{"type":18,"tag":26,"props":237,"children":238},{},[239],{"type":24,"value":240},"预训练第一阶段以通识数据（网页、书籍、多语言数据等）为主，主要提升模型知识能力；第二阶段增大 STEM 
和代码相关数据占比，提升模型推理相关能力。",{"type":18,"tag":26,"props":242,"children":243},{},[244],{"type":24,"value":245},"中训练阶段以合成数据为主，包含仓库级代码任务、高质量数理逻辑数据以及智能体任务数据，持续提升模型逻辑推理和智能体相关能力。",{"type":18,"tag":63,"props":247,"children":248},{"style":65},[249],{"type":18,"tag":68,"props":250,"children":252},{"src":251,"style":71,"alt":7},"/category/information/technology-blogs/banner/2026-1-6-6.jpg",[],{"type":18,"tag":26,"props":254,"children":255},{},[256],{"type":24,"value":257},"后训练也包含两个阶段：",{"type":18,"tag":26,"props":259,"children":260},{},[261],{"type":24,"value":262},"第一阶段模型冷启动微调，为取得更好的冷启动效果，针对微调数据难度和多样性做了大量筛选工作，大幅提升模型多任务理解及指令遵从能力。",{"type":18,"tag":26,"props":264,"children":265},{},[266],{"type":24,"value":267},"第二阶段强化学习，采用基于规则校验奖励和 RM 打分模型融合的方式。针对数理、代码、指令遵循等采用规则校验；针对文本创作、语义理解、角色扮演等任务采用专项训练 RM 模型进行打分。",{"type":18,"tag":26,"props":269,"children":270},{},[271,273],{"type":24,"value":272},"执行以下 Python 脚本从魔乐社区下载昇思 MindSpore 版本的 T1-35B 文件至指定路径 /home/teleAI/T1-35B （需提前pip安装openmind_hub库，相关参考文档可参考：openMind Hub Client使用教程（",{"type":18,"tag":54,"props":274,"children":277},{"href":275,"rel":276},"https://modelers.cn/docs/zh/openmind-hub-client/0.9/overview.html%EF%BC%89",[58],[278],{"type":24,"value":279},"https://modelers.cn/docs/zh/openmind-hub-client/0.9/overview.html）",{"type":18,"tag":26,"props":281,"children":282},{},[283],{"type":24,"value":284},"）。下载的文件包含模型代码、权重、分词模型和示例代码，占用约 67GB 的磁盘空间：",{"type":18,"tag":26,"props":286,"children":287},{},[288],{"type":18,"tag":35,"props":289,"children":290},{},[291],{"type":24,"value":292},"# 02",{"type":18,"tag":26,"props":294,"children":295},{},[296,301],{"type":18,"tag":35,"props":297,"children":298},{},[299],{"type":24,"value":300},"基于昇思",{"type":24,"value":302},"MindSpore的系列优化特性，使能模型高效训练",{"type":18,"tag":26,"props":304,"children":305},{},[306],{"type":24,"value":307},"TeleChat3-105B-A4.7-Thinking是基于昇思MindSpore 
AI框架训练的MoE类模型，通过昇思MindSpore提供的各种并行及加速特性，实现千亿级细粒度专家MoE模型高效训练，典型的优化特性如下：",{"type":18,"tag":309,"props":310,"children":311},"ul",{},[312,323,333,343,353],{"type":18,"tag":313,"props":314,"children":315},"li",{},[316,321],{"type":18,"tag":35,"props":317,"children":318},{},[319],{"type":24,"value":320},"核隔离+CPU绑核",{"type":24,"value":322},"： 减少host侧进程抢占，提升算子下发性能。",{"type":18,"tag":313,"props":324,"children":325},{},[326,331],{"type":18,"tag":35,"props":327,"children":328},{},[329],{"type":24,"value":330},"断流优化",{"type":24,"value":332},"：FFN使用异步拷贝，减少流同步耗时。",{"type":18,"tag":313,"props":334,"children":335},{},[336,341],{"type":18,"tag":35,"props":337,"children":338},{},[339],{"type":24,"value":340},"DVM调优",{"type":24,"value":342},"：Cube类算子Tiling在线调优、MM-AssignAdd融合、vector融合，提升算子性能。",{"type":18,"tag":313,"props":344,"children":345},{},[346,351],{"type":18,"tag":35,"props":347,"children":348},{},[349],{"type":24,"value":350},"计算通信掩盖优化",{"type":24,"value":352},"：vpp 1f1b通信掩盖、dw反向通信掩盖等特性，模型并行通信掩盖率 > 60%。",{"type":18,"tag":313,"props":354,"children":355},{},[356,361],{"type":18,"tag":35,"props":357,"children":358},{},[359],{"type":24,"value":360},"策略优化&重计算调优",{"type":24,"value":362},"：扩大BS增加Device算子执行开销，掩盖host的性能抖动；增大TP换取内存空间做重计算调优。",{"type":18,"tag":26,"props":364,"children":365},{},[366],{"type":24,"value":367},"通过端到端的加速特性优化，实现MFU的绝对值提升近14%。",{"type":18,"tag":26,"props":369,"children":370},{},[371],{"type":18,"tag":35,"props":372,"children":373},{},[374],{"type":24,"value":375},"# 
03",{"type":18,"tag":26,"props":377,"children":378},{},[379],{"type":24,"value":380},"快速开始",{"type":18,"tag":74,"props":382,"children":384},{"id":383},"_1环境安装",[385],{"type":24,"value":386},"1、环境安装",{"type":18,"tag":74,"props":388,"children":389},{"id":7},[],{"type":18,"tag":26,"props":391,"children":392},{},[393,395],{"type":24,"value":394},"按照上述版本配套，参考环境安装指南（",{"type":18,"tag":54,"props":396,"children":399},{"href":397,"rel":398},"https://www.mindspore.cn/mindformers/docs/zh-CN/master/installation.html%EF%BC%89%E5%AE%89%E8%A3%85%E8%BF%90%E8%A1%8C%E7%8E%AF%E5%A2%83%E3%80%82",[58],[400],{"type":24,"value":401},"https://www.mindspore.cn/mindformers/docs/zh-CN/master/installation.html）安装运行环境。",{"type":18,"tag":74,"props":403,"children":405},{"id":404},"_2模型下载",[406],{"type":24,"value":407},"2、模型下载",{"type":18,"tag":74,"props":409,"children":411},{"id":410},"_1",[],{"type":18,"tag":26,"props":413,"children":414},{},[415],{"type":24,"value":416},"用户可以从Modelers、Hugging Face、ModelScope等开源社区下载所需的模型文件，包括模型权重、Tokenizer、配置等（从头预训练不需加载权重）。链接如下：",{"type":18,"tag":63,"props":418,"children":419},{"style":65},[420],{"type":18,"tag":68,"props":421,"children":423},{"src":422,"style":71,"alt":7},"/category/information/technology-blogs/banner/2026-1-6-7.jpg",[],{"type":18,"tag":74,"props":425,"children":427},{"id":426},"_2",[],{"type":18,"tag":74,"props":429,"children":431},{"id":430},"_3数据集下载",[432],{"type":24,"value":433},"3、数据集下载",{"type":18,"tag":74,"props":435,"children":437},{"id":436},"_3",[],{"type":18,"tag":26,"props":439,"children":440},{},[441,443],{"type":24,"value":442},"MindSpore Transformers 以下面的数据集为例提供了 TeleChat3 
的预训练流程的使用案例，实际训练时可参考数据集（",{"type":18,"tag":54,"props":444,"children":447},{"href":445,"rel":446},"https://www.mindspore.cn/mindformers/docs/zh-CN/master/feature/dataset.html",[58],[448],{"type":24,"value":449},"https://www.mindspore.cn/mindformers/docs/zh-CN/master/feature/dataset.html）章节制作数据集。请在执行任务前提前下载所需数据集。链接如下：",{"type":18,"tag":63,"props":451,"children":452},{"style":65},[453],{"type":18,"tag":68,"props":454,"children":456},{"src":455,"style":71,"alt":7},"/category/information/technology-blogs/banner/2026-1-6-8.jpg",[],{"type":18,"tag":74,"props":458,"children":460},{"id":459},"_4",[],{"type":18,"tag":74,"props":462,"children":464},{"id":463},"_4预训练样例",[465],{"type":24,"value":466},"4、预训练样例",{"type":18,"tag":74,"props":468,"children":470},{"id":469},"_5",[],{"type":18,"tag":26,"props":472,"children":473},{},[474,476],{"type":24,"value":475},"预训练是指在大规模无标注数据上训练模型，使其能够全面捕捉语言的广泛特性。在MindSpore官网提供了详细的指导。（",{"type":18,"tag":54,"props":477,"children":480},{"href":478,"rel":479},"https://www.mindspore.cn/mindformers/docs/zh-CN/master/guide/pre_training.html",[58],[481],{"type":24,"value":482},"https://www.mindspore.cn/mindformers/docs/zh-CN/master/guide/pre_training.html）",{"type":18,"tag":26,"props":484,"children":485},{},[486,488],{"type":24,"value":487},"1）",{"type":18,"tag":35,"props":489,"children":490},{},[491],{"type":24,"value":492},"数据预处理",{"type":18,"tag":26,"props":494,"children":495},{},[496,498,505],{"type":24,"value":497},"MindSpore Transformers 
预训练阶段当前已支持Megatron格式的数据集（",{"type":18,"tag":54,"props":499,"children":502},{"href":500,"rel":501},"https://www.mindspore.cn/mindformers/docs/zh-CN/master/feature/dataset.html#megatron%E6%95%B0%E6%8D%AE%E9%9B%86%EF%BC%89%E3%80%82%E7%94%A8%E6%88%B7%E5%8F%AF%E4%BB%A5%E5%8F%82%E8%80%83%E6%95%B0%E6%8D%AE%E9%9B%86%E7%AB%A0%E8%8A%82%EF%BC%8C%E4%BD%BF%E7%94%A8",[58],[503],{"type":24,"value":504},"https://www.mindspore.cn/mindformers/docs/zh-CN/master/feature/dataset.html#megatron%E6%95%B0%E6%8D%AE%E9%9B%86）。用户可以参考数据集章节，使用",{"type":24,"value":506}," MindSpore 提供的工具将原始数据集转换为 Megatron 格式。",{"type":18,"tag":26,"props":508,"children":509},{},[510],{"type":24,"value":511},"制作Megatron格式数据集，需要经过两个步骤。首先将原始文本数据集转换为jsonl格式数据，然后使用MindSpore Transformers提供的脚本将jsonl格式数据转换为Megatron格式的.bin和.idx文件。",{"type":18,"tag":309,"props":513,"children":514},{},[515],{"type":18,"tag":313,"props":516,"children":517},{},[518,520,524],{"type":24,"value":519},"wiki.train.tokens",{"type":18,"tag":521,"props":522,"children":523},"br",{},[],{"type":24,"value":525},"转为  jsonl 
格式数据",{"type":18,"tag":26,"props":527,"children":528},{},[529,531,536,537,542,544],{"type":24,"value":530},"用户需要",{"type":18,"tag":35,"props":532,"children":533},{},[534],{"type":24,"value":535},"自行将",{"type":24,"value":519},{"type":18,"tag":35,"props":538,"children":539},{},[540],{"type":24,"value":541},"数据集处理成jsonl格式的文件",{"type":24,"value":543},"。作为参考，文档末尾的FAQ（",{"type":18,"tag":54,"props":545,"children":548},{"href":546,"rel":547},"https://gitee.com/mindspore/mindformers/tree/master/configs/telechat3#faq%EF%BC%89%E9%83%A8%E5%88%86%E6%8F%90%E4%BE%9B%E4%BA%86%E4%B8%80%E4%B8%AA%E4%B8%B4%E6%97%B6%E8%BD%AC%E6%8D%A2%E6%96%B9%E6%A1%88%EF%BC%8C%E7%94%A8%E6%88%B7%E9%9C%80%E8%A6%81%E6%A0%B9%E6%8D%AE%E5%AE%9E%E9%99%85%E9%9C%80%E6%B1%82%E8%87%AA%E8%A1%8C%E5%BC%80%E5%8F%91%E5%92%8C%E9%AA%8C%E8%AF%81%E8%BD%AC%E6%8D%A2%E9%80%BB%E8%BE%91%E3%80%82",[58],[549],{"type":24,"value":550},"https://gitee.com/mindspore/mindformers/tree/master/configs/telechat3#faq）部分提供了一个临时转换方案，用户需要根据实际需求自行开发和验证转换逻辑。",{"type":18,"tag":26,"props":552,"children":553},{},[554],{"type":24,"value":555},"下面是jsonl格式文件的示例：",{"type":18,"tag":26,"props":557,"children":558},{},[559,561,568],{"type":24,"value":560},"{\"src\": \"",{"type":18,"tag":54,"props":562,"children":565},{"href":563,"rel":564},"http://www.nvidia.com",[58],[566],{"type":24,"value":567},"www.nvidia.com",{"type":24,"value":569},"\", \"text\": \"The quick brown fox\", \"type\": \"Eng\", \"id\": \"0\", \"title\": \"First Part\"}",{"type":18,"tag":26,"props":571,"children":572},{},[573],{"type":24,"value":574},"{\"src\": \"The Internet\", \"text\": \"jumps over the lazy dog\", \"type\": \"Eng\", \"id\": \"42\", \"title\": \"Second Part\"}",{"type":18,"tag":26,"props":576,"children":577},{},[578],{"type":24,"value":579},"...",{"type":18,"tag":309,"props":581,"children":582},{},[583],{"type":18,"tag":313,"props":584,"children":585},{},[586],{"type":24,"value":587},"jsonl 格式数据 转为  bin 
格式数据",{"type":18,"tag":26,"props":589,"children":590},{},[591],{"type":24,"value":592},"MindSpore Transformers提供了数据预处理脚本toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py用于将jsonl格式的原始文本语料转换成.bin和.idx文件。",{"type":18,"tag":26,"props":594,"children":595},{},[596,598],{"type":24,"value":597},"这里需要提前下载TeleChat3-105B-A4.7B（",{"type":18,"tag":54,"props":599,"children":602},{"href":600,"rel":601},"https://huggingface.co/TeleChat/TeleChat3-105B-A4.7B",[58],[603],{"type":24,"value":604},"https://huggingface.co/TeleChat/TeleChat3-105B-A4.7B）模型的tokenizer文件。",{"type":18,"tag":26,"props":606,"children":607},{},[608],{"type":24,"value":609},"例如：",{"type":18,"tag":26,"props":611,"children":612},{},[613],{"type":24,"value":614},"python toolkit/data_preprocess/megatron/preprocess_indexed_dataset.py \\",{"type":18,"tag":26,"props":616,"children":617},{},[618],{"type":24,"value":619},"--input /path/to/data.jsonl \\",{"type":18,"tag":26,"props":621,"children":622},{},[623],{"type":24,"value":624},"--output-prefix /path/to/wiki103-megatron \\",{"type":18,"tag":26,"props":626,"children":627},{},[628],{"type":24,"value":629},"--tokenizer-type HuggingFaceTokenizer \\",{"type":18,"tag":26,"props":631,"children":632},{},[633],{"type":24,"value":634},"--tokenizer-dir /path/to/TeleChat3-105B-A4.7B # 其他规格的模型可以调整为对应的tokenizer路径",{"type":18,"tag":26,"props":636,"children":637},{},[638],{"type":24,"value":639},"运行完成后会生成/path/to/wiki103-megatron_text_document.bin和/path/to/wiki103-megatron_text_document.idx文件。 填写数据集路径时需要使用/path/to/wiki103-megatron_text_document，不需要带后缀名。",{"type":18,"tag":26,"props":641,"children":642},{},[643,645],{"type":24,"value":644},"2）",{"type":18,"tag":35,"props":646,"children":647},{},[648],{"type":24,"value":649},"修改任务配置",{"type":18,"tag":26,"props":651,"children":652},{},[653],{"type":24,"value":654},"MindSpore Transformers 
提供了预训练任务的配置文件，用户可以根据实际情况修改配置文件。以下是一个示例配置文件片段，用户需要根据自己的数据集路径和其他参数进行相应修改。",{"type":18,"tag":309,"props":656,"children":657},{},[658],{"type":18,"tag":313,"props":659,"children":660},{},[661],{"type":24,"value":662},"数据集配置",{"type":18,"tag":26,"props":664,"children":665},{},[666],{"type":18,"tag":667,"props":668,"children":669},"em",{},[670],{"type":24,"value":671},"# Dataset configuration",{"type":18,"tag":26,"props":673,"children":674},{},[675],{"type":24,"value":676},"train_dataset: &train_dataset",{"type":18,"tag":26,"props":678,"children":679},{},[680],{"type":24,"value":681},"data_loader:",{"type":18,"tag":26,"props":683,"children":684},{},[685],{"type":24,"value":579},{"type":18,"tag":26,"props":687,"children":688},{},[689],{"type":24,"value":690},"sizes:",{"type":18,"tag":309,"props":692,"children":693},{},[694],{"type":18,"tag":313,"props":695,"children":696},{},[697,699,704,706],{"type":24,"value":698},"8000  ",{"type":18,"tag":667,"props":700,"children":701},{},[702],{"type":24,"value":703},"#",{"type":24,"value":705}," ",{"type":18,"tag":667,"props":707,"children":708},{},[709],{"type":24,"value":710},"数据集的大小，可以根据实际数据集大小进行调整",{"type":18,"tag":26,"props":712,"children":713},{},[714],{"type":24,"value":579},{"type":18,"tag":26,"props":716,"children":717},{},[718],{"type":24,"value":719},"config:",{"type":18,"tag":26,"props":721,"children":722},{},[723],{"type":24,"value":579},{"type":18,"tag":26,"props":725,"children":726},{},[727,729,733,734],{"type":24,"value":728},"data_path: 
",{"type":18,"tag":667,"props":730,"children":731},{},[732],{"type":24,"value":703},{"type":24,"value":705},{"type":18,"tag":667,"props":735,"children":736},{},[737],{"type":24,"value":738},"采样比例和Megatron格式数据集路径",{"type":18,"tag":309,"props":740,"children":741},{},[742,747],{"type":18,"tag":313,"props":743,"children":744},{},[745],{"type":24,"value":746},"'1'",{"type":18,"tag":313,"props":748,"children":749},{},[750,752,756,757],{"type":24,"value":751},"\"/path/to/wiki103-megatron_text_document\"",{"type":18,"tag":667,"props":753,"children":754},{},[755],{"type":24,"value":703},{"type":24,"value":705},{"type":18,"tag":667,"props":758,"children":759},{},[760],{"type":24,"value":761},"替换为实际的Megatron格式数据集路径，此处不带后缀名",{"type":18,"tag":26,"props":763,"children":764},{},[765],{"type":24,"value":766},"数据集路径需要替换为实际的Megatron格式数据集路径。",{"type":18,"tag":26,"props":768,"children":769},{},[770,772],{"type":24,"value":771},"不同规格和序列长度的并行配置可参考并行配置建议。（",{"type":18,"tag":54,"props":773,"children":776},{"href":774,"rel":775},"https://gitee.com/mindspore/mindformers/tree/master/configs/telechat3#%E5%B9%B6%E8%A1%8C%E9%85%8D%E7%BD%AE%E5%BB%BA%E8%AE%AE%EF%BC%89",[58],[777],{"type":24,"value":778},"https://gitee.com/mindspore/mindformers/tree/master/configs/telechat3#%E5%B9%B6%E8%A1%8C%E9%85%8D%E7%BD%AE%E5%BB%BA%E8%AE%AE）",{"type":18,"tag":26,"props":780,"children":781},{},[782,784],{"type":24,"value":783},"**3）**",{"type":18,"tag":35,"props":785,"children":786},{},[787],{"type":24,"value":788},"启动预训练任务",{"type":18,"tag":26,"props":790,"children":791},{},[792,794,801],{"type":24,"value":793},"通过指定模型路径和配置文件configs/telechat3/pretrain_telechat3_105b_a4b_4k.yaml（",{"type":18,"tag":54,"props":795,"children":798},{"href":796,"rel":797},"https://gitee.com/mindspore/mindformers/blob/master/configs/telechat3/pretrain_telechat3_105b_a4b_4k.yaml%EF%BC%89%E4%BB%A5msrun%E7%9A%84%E6%96%B9%E5%BC%8F%E5%90%AF%E5%8A%A8run_mindformer.py%EF%BC%88https://gitee.com/mindspore/mindformers/blob/master/run_mindformer
.py%EF%BC%89%E8%84%9A%E6%9C%AC%EF%BC%8C%E8%BF%9B%E8%A1%8C16%E5%8D%A1%E5%88%86%E5%B8%83%E5%BC%8F%E8%AE%AD%E7%BB%83%E3%80%82%E6%82%A8%E5%8F%AF%E5%8F%82%E8%80%83%E5%A6%82%E4%B8%8B%E6%96%B9%E5%BC%8F%EF%BC%8C%E6%8B%89%E8%B5%B7%E4%B8%A4%E5%8F%B0Atlas",[58],[799],{"type":24,"value":800},"https://gitee.com/mindspore/mindformers/blob/master/configs/telechat3/pretrain_telechat3_105b_a4b_4k.yaml）以msrun的方式启动run_mindformer.py（https://gitee.com/mindspore/mindformers/blob/master/run_mindformer.py）脚本，进行16卡分布式训练。您可参考如下方式，拉起两台Atlas",{"type":24,"value":802}," 800T A2（64G）训练。",{"type":18,"tag":26,"props":804,"children":805},{},[806],{"type":24,"value":807},"在每台服务器上执行如下命令。设置master_ip为主节点IP地址，即Rank 0服务器的IP；node_rank为每个节点的序号；port为当前进程的端口号（可在50000~65536中选择）。",{"type":18,"tag":26,"props":809,"children":810},{},[811],{"type":24,"value":812},"master_ip=192.168.1.1",{"type":18,"tag":26,"props":814,"children":815},{},[816],{"type":24,"value":817},"node_rank=0",{"type":18,"tag":26,"props":819,"children":820},{},[821],{"type":24,"value":822},"port=50001",{"type":18,"tag":26,"props":824,"children":825},{},[826],{"type":24,"value":827},"bash scripts/msrun_launcher.sh \"run_mindformer.py \\",{"type":18,"tag":26,"props":829,"children":830},{},[831],{"type":24,"value":832},"--config configs/telechat3/pretrain_telechat3_105b_a4b_4k.yaml \\",{"type":18,"tag":26,"props":834,"children":835},{},[836],{"type":24,"value":837},"--auto_trans_ckpt False \\",{"type":18,"tag":26,"props":839,"children":840},{},[841],{"type":24,"value":842},"--use_parallel True \\",{"type":18,"tag":26,"props":844,"children":845},{},[846],{"type":24,"value":847},"--run_mode train\" \\",{"type":18,"tag":26,"props":849,"children":850},{},[851],{"type":24,"value":852},"48 8 $master_ip $port $node_rank output/msrun_log False 
7200",{"type":18,"tag":26,"props":854,"children":855},{},[856],{"type":24,"value":857},"此处样例代码假设主节点为192.168.1.1、当前Rank序号为0。实际执行时请将master_ip设置为实际的主节点IP地址；将node_rank设置为当前节点的Rank序号；将port设置为当前进程的端口号。",{"type":18,"tag":26,"props":859,"children":860},{},[861],{"type":24,"value":862},"上述命令执行完毕后，训练任务将在后台执行，过程日志保存在./output/msrun_log下，使用以下命令可查看训练状态（由于开启了流水并行，真实loss只显示在最后一个pipeline stage的日志中，其余pipeline stage会显示loss为0）",{"type":18,"tag":26,"props":864,"children":865},{},[866],{"type":24,"value":867},"tail -f ./output/msrun_log/worker_0.log",{"type":18,"tag":26,"props":869,"children":870},{},[871],{"type":24,"value":872},"训练过程中的权重checkpoint将会保存在./output/checkpoint下。",{"type":18,"tag":26,"props":874,"children":875},{},[876],{"type":18,"tag":35,"props":877,"children":878},{},[879],{"type":24,"value":880},"# 04",{"type":18,"tag":26,"props":882,"children":883},{},[884],{"type":24,"value":885},"结语",{"type":18,"tag":74,"props":887,"children":889},{"id":888},"_6",[],{"type":18,"tag":26,"props":891,"children":892},{},[893],{"type":24,"value":894},"相关训练工作的介绍请参考《Training Report of 
TeleChat3-MoE》",{"type":18,"tag":26,"props":896,"children":897},{},[898],{"type":18,"tag":54,"props":899,"children":902},{"href":900,"rel":901},"https://arxiv.org/abs/2512.24157",[58],[903],{"type":24,"value":900},{"type":18,"tag":26,"props":905,"children":906},{},[907],{"type":24,"value":908},"更多详细的模型训练，请参考开源链接",{"type":18,"tag":26,"props":910,"children":911},{},[912],{"type":18,"tag":54,"props":913,"children":916},{"href":914,"rel":915},"https://gitee.com/mindspore/mindformers/tree/master/configs/telechat3#%E6%A8%A1%E5%9E%8B%E4%B8%8B%E8%BD%BD",[58],[917],{"type":24,"value":914},{"title":7,"searchDepth":919,"depth":919,"links":920},4,[921,923,924,925,926,927,928,929,930,931,932,933,934,935],{"id":76,"depth":922,"text":82},2,{"id":114,"depth":922,"text":120},{"id":219,"depth":922,"text":225},{"id":383,"depth":922,"text":386},{"id":7,"depth":922,"text":7},{"id":404,"depth":922,"text":407},{"id":410,"depth":922,"text":7},{"id":426,"depth":922,"text":7},{"id":430,"depth":922,"text":433},{"id":436,"depth":922,"text":7},{"id":459,"depth":922,"text":7},{"id":463,"depth":922,"text":466},{"id":469,"depth":922,"text":7},{"id":888,"depth":922,"text":7},"markdown","content:technology-blogs:zh:2026-1-6.md","content","technology-blogs/zh/2026-1-6.md","technology-blogs/zh/2026-1-6","md",1776506118836]