[{"data":1,"prerenderedAt":527},["ShallowReactive",2],{"content-query-arSlBEbNqZ":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":521,"_id":522,"_source":523,"_file":524,"_stem":525,"_extension":526},"/news/en/3612","en",false,"","DeepSeek-V3 & MindSpore: Training Deployment Made Easy","Exciting News for AI Developers: DeepSeek-V3 is Here!","2025-02-20","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/02/20/a8cfe1dcb44048dca0dcf8bc3e3a4fbd.png","news",{"type":14,"children":15,"toc":512},"root",[16,24,35,40,45,50,55,68,79,84,93,106,111,116,121,133,138,143,148,153,158,168,173,181,186,191,199,208,213,218,230,242,247,273,278,286,295,314,319,327,332,337,345,350,358,363,371,376,381,389,394,402,411,416,424,457,465,470,478,489,494,507],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"deepseek-v3-mindspore-training-deployment-made-easy",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":28},"h3",{"id":27},"background",[29],{"type":17,"tag":30,"props":31,"children":32},"strong",{},[33],{"type":23,"value":34},"Background",{"type":17,"tag":36,"props":37,"children":38},"p",{},[39],{"type":23,"value":9},{"type":17,"tag":36,"props":41,"children":42},{},[43],{"type":23,"value":44},"The powerful DeepSeek-V3 model is now integrated with the MindSpore AI framework, running on Ascend AI hardware. This unlocks ready-to-use pre-training and inference, validated by large-scale cluster deployments.",{"type":17,"tag":36,"props":46,"children":47},{},[48],{"type":23,"value":49},"Thanks to MindSpore's foundation model kits and its multi-dimensional hybrid distributed capabilities, automatic parallelism, and dryrun simulation, adapting DeepSeek-V3's advanced architectures was remarkably fast. 
Furthermore, MindSpore ensures efficient inference deployment, with optimizations for complex structures like MLA and DeepSeekMoE.",{"type":17,"tag":36,"props":51,"children":52},{},[53],{"type":23,"value":54},"Start building with DeepSeek-V3 on MindSpore today! Dive into the open source code and unleash AI potential:",{"type":17,"tag":36,"props":56,"children":57},{},[58,60],{"type":23,"value":59},"- Pre-training code: ",{"type":17,"tag":61,"props":62,"children":66},"a",{"href":63,"rel":64},"https://github.com/mindspore-lab/mindformers/tree/dev/research/deepseek3",[65],"nofollow",[67],{"type":23,"value":63},{"type":17,"tag":36,"props":69,"children":70},{},[71,73],{"type":23,"value":72},"- Inference code: ",{"type":17,"tag":61,"props":74,"children":77},{"href":75,"rel":76},"https://modelers.cn/models/MindSpore-Lab/DeepSeek-V3",[65],[78],{"type":23,"value":75},{"type":17,"tag":36,"props":80,"children":81},{},[82],{"type":23,"value":83},"Let's now proceed with the tutorial on DeepSeek-V3 pre-training deployment.",{"type":17,"tag":25,"props":85,"children":87},{"id":86},"environment-setup",[88],{"type":17,"tag":30,"props":89,"children":90},{},[91],{"type":23,"value":92},"Environment Setup",{"type":17,"tag":36,"props":94,"children":95},{},[96,98,104],{"type":23,"value":97},"MindSpore Transformers offers seamless support for DeepSeek-V3 pre-training. We've got a sample configuration file for 128 Atlas 800T A2 (64 G) servers in our repository, with WikiText-2 dataset included. Refer to the ",{"type":17,"tag":61,"props":99,"children":101},{"href":63,"rel":100},[65],[102],{"type":23,"value":103},"README",{"type":23,"value":105}," for details.",{"type":17,"tag":36,"props":107,"children":108},{},[109],{"type":23,"value":110},"Want to try it on a single device? 
We provide a modified configuration that reduces the DeepSeek-V3 model parameter count, allowing you to run pre-training on a single Atlas 800T A2 (64 GB) server.",{"type":17,"tag":36,"props":112,"children":113},{},[114],{"type":23,"value":115},"To prepare for training, an Atlas 800T A2 (64 GB) server is required. The environment dependencies for MindSpore Transformers are as follows:",{"type":17,"tag":36,"props":117,"children":118},{},[119],{"type":23,"value":120},"- Python 3.10",{"type":17,"tag":36,"props":122,"children":123},{},[124,126],{"type":23,"value":125},"- ",{"type":17,"tag":61,"props":127,"children":130},{"href":128,"rel":129},"https://www.mindspore.cn/install/en",[65],[131],{"type":23,"value":132},"MindSpore 2.4.10",{"type":17,"tag":36,"props":134,"children":135},{},[136],{"type":23,"value":137},"- CANN 8.0.RC3.beta1",{"type":17,"tag":36,"props":139,"children":140},{},[141],{"type":23,"value":142},"- Firmware and driver 24.1.RC3",{"type":17,"tag":36,"props":144,"children":145},{},[146],{"type":23,"value":147},"MindSpore offers a dedicated Docker image for DeepSeek-V3 pre-training. Follow these steps for usage.",{"type":17,"tag":36,"props":149,"children":150},{},[151],{"type":23,"value":152},"1. Download the Docker image.",{"type":17,"tag":36,"props":154,"children":155},{},[156],{"type":23,"value":157},"Use the following command to download the Docker image.",{"type":17,"tag":159,"props":160,"children":162},"pre",{"code":161},"docker pull swr.cn-central-221.ovaijisuan.com/mindformers/deepseek_v3_mindspore2.4.10-train:20250209\n",[163],{"type":17,"tag":164,"props":165,"children":166},"code",{"__ignoreMap":7},[167],{"type":23,"value":161},{"type":17,"tag":36,"props":169,"children":170},{},[171],{"type":23,"value":172},"2. 
Create a container from the image.",{"type":17,"tag":159,"props":174,"children":176},{"code":175},"image_name=swr.cn-central-221.ovaijisuan.com/mindformers/deepseek_v3_mindspore2.4.10-train:20250209\ndocker_name=deepseek_v3\ndocker run -itd -u root \\\n--ipc=host --net=host \\\n--privileged \\\n--device=/dev/davinci0 \\\n--device=/dev/davinci1 \\\n--device=/dev/davinci2 \\\n--device=/dev/davinci3 \\\n--device=/dev/davinci4 \\\n--device=/dev/davinci5 \\\n--device=/dev/davinci6 \\\n--device=/dev/davinci7 \\\n--device=/dev/davinci_manager \\\n--device=/dev/devmm_svm \\\n--device=/dev/hisi_hdc \\\n-v /etc/localtime:/etc/localtime \\\n-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \\\n-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/bin/hccn_tool \\\n-v /etc/ascend_install.info:/etc/ascend_install.info \\\n-v /var/log/npu:/usr/slog \\\n-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \\\n-v /etc/hccn.conf:/etc/hccn.conf \\\n--name \"$docker_name\" \\\n\"$image_name\" \\\n/bin/bash\n",[177],{"type":17,"tag":164,"props":178,"children":179},{"__ignoreMap":7},[180],{"type":23,"value":175},{"type":17,"tag":36,"props":182,"children":183},{},[184],{"type":23,"value":185},"3. 
Access the container environment.",{"type":17,"tag":36,"props":187,"children":188},{},[189],{"type":23,"value":190},"Enter the newly created container and navigate to the designated code directory using the command below.",{"type":17,"tag":159,"props":192,"children":194},{"code":193},"docker exec -ti deepseek_v3 bash\ncd /home/work/mindformers\n",[195],{"type":17,"tag":164,"props":196,"children":197},{"__ignoreMap":7},[198],{"type":23,"value":193},{"type":17,"tag":25,"props":200,"children":202},{"id":201},"dataset-preparation",[203],{"type":17,"tag":30,"props":204,"children":205},{},[206],{"type":23,"value":207},"Dataset Preparation",{"type":17,"tag":36,"props":209,"children":210},{},[211],{"type":23,"value":212},"Taking the WikiText-2 dataset as an example, follow the steps below to convert the dataset into a Megatron BIN file.",{"type":17,"tag":36,"props":214,"children":215},{},[216],{"type":23,"value":217},"1. Download the dataset and tokenizer model file.",{"type":17,"tag":36,"props":219,"children":220},{},[221,223],{"type":23,"value":222},"Dataset: ",{"type":17,"tag":61,"props":224,"children":227},{"href":225,"rel":226},"https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/MindFormers/dataset/wikitext-2/wikitext-2-v1.zip",[65],[228],{"type":23,"value":229},"WikiText2 dataset",{"type":17,"tag":36,"props":231,"children":232},{},[233,235],{"type":23,"value":234},"Tokenizer model: ",{"type":17,"tag":61,"props":236,"children":239},{"href":237,"rel":238},"https://huggingface.co/deepseek-ai/DeepSeek-V3/resolve/main/tokenizer.json?download=true",[65],[240],{"type":23,"value":241},"DeepSeek-V3 tokenizer.json",{"type":17,"tag":36,"props":243,"children":244},{},[245],{"type":23,"value":246},"2. 
Generate a Megatron BIN file.",{"type":17,"tag":36,"props":248,"children":249},{},[250,252,257,259,264,266,271],{"type":23,"value":251},"Place the dataset file ",{"type":17,"tag":30,"props":253,"children":254},{},[255],{"type":23,"value":256},"wiki.train.tokens",{"type":23,"value":258}," and the tokenizer model file ",{"type":17,"tag":30,"props":260,"children":261},{},[262],{"type":23,"value":263},"tokenizer.json",{"type":23,"value":265}," under the ",{"type":17,"tag":30,"props":267,"children":268},{},[269],{"type":23,"value":270},"/home/work/dataset",{"type":23,"value":272}," directory.",{"type":17,"tag":36,"props":274,"children":275},{},[276],{"type":23,"value":277},"Use the following command to convert the dataset file to Megatron BIN format.",{"type":17,"tag":159,"props":279,"children":281},{"code":280},"cd /home/work/mindformers/research/deepseek3\npython wikitext_to_bin.py \\\n--input /home/work/dataset/wiki.train.tokens \\\n--output-prefix /home/work/dataset/wiki_4096 \\\n--vocab-file /home/work/dataset/tokenizer.json \\\n--seq-length 4096 \\\n--worker 1\n",[282],{"type":17,"tag":164,"props":283,"children":284},{"__ignoreMap":7},[285],{"type":23,"value":280},{"type":17,"tag":25,"props":287,"children":289},{"id":288},"configuration-example",[290],{"type":17,"tag":30,"props":291,"children":292},{},[293],{"type":23,"value":294},"Configuration Example",{"type":17,"tag":36,"props":296,"children":297},{},[298,300,305,307,312],{"type":23,"value":299},"This procedure outlines the steps to configure a single-node environment for DeepSeek-V3 pre-training. 
Start with the ",{"type":17,"tag":30,"props":301,"children":302},{},[303],{"type":23,"value":304},"pretrain_deepseek3_671b.yaml",{"type":23,"value":306}," configuration file and save the modified version as ",{"type":17,"tag":30,"props":308,"children":309},{},[310],{"type":23,"value":311},"pretrain_deepseek3_1b.yaml",{"type":23,"value":313},".",{"type":17,"tag":36,"props":315,"children":316},{},[317],{"type":23,"value":318},"1. Adjust model configuration.",{"type":17,"tag":159,"props":320,"children":322},{"code":321},"# model config\nmodel:\n  model_config:\n    type: DeepseekV3Config\n    auto_register: deepseek3_config.DeepseekV3Config\n    seq_length: 4096\n    hidden_size: 2048                                # Set the value to 2048.\n    num_layers: &num_layers 3                       # Set the value to 3.\n    num_heads: 8                                    # Set the value to 8.\n    max_position_embeddings: 4096\n    intermediate_size: 6144                          # Set the value to 6144.\n    offset: 0                                        # Set the value to 0.\n    ……\n",[323],{"type":17,"tag":164,"props":324,"children":325},{"__ignoreMap":7},[326],{"type":23,"value":321},{"type":17,"tag":36,"props":328,"children":329},{},[330],{"type":23,"value":331},"2. Adjust mixture-of-experts (MoE) configuration.",{"type":17,"tag":36,"props":333,"children":334},{},[335],{"type":23,"value":336},"Follow the instruction below to use the DeepSeek-V3 dedicated Docker image.",{"type":17,"tag":159,"props":338,"children":340},{"code":339},"#moe\nmoe_config:\n  expert_num: &expert_num 16                      # Set the value to 16.\n  first_k_dense_replace: 1                        # Set the value to 1.\n  ……\n",[341],{"type":17,"tag":164,"props":342,"children":343},{"__ignoreMap":7},[344],{"type":23,"value":339},{"type":17,"tag":36,"props":346,"children":347},{},[348],{"type":23,"value":349},"3. 
Modify parallel configuration.",{"type":17,"tag":159,"props":351,"children":353},{"code":352},"# parallel config for devices num=8\nparallel_config:\n  data_parallel: 2                                   # Set the value to 2.\n  model_parallel: 2                                  # Set the value to 2.\n  pipeline_stage: 2                                  # Set the value to 2.\n  expert_parallel: 2                                 # Set the value to 2.\n  micro_batch_num: &micro_batch_num 4           # Set the value to 4.\n      parallel:\n        parallel_optimizer_config:\n          optimizer_weight_shard_size: 8                  # Set the value to 8.\n  ……\n",[354],{"type":17,"tag":164,"props":355,"children":356},{"__ignoreMap":7},[357],{"type":23,"value":352},{"type":17,"tag":36,"props":359,"children":360},{},[361],{"type":23,"value":362},"4. Adjust learning rate configuration.",{"type":17,"tag":159,"props":364,"children":366},{"code":365},"# lr schedule\n   lr_schedule:\n     type: ConstantWarmUpLR\n     warmup_steps: 20                                    # Set the value to 20.\n",[367],{"type":17,"tag":164,"props":368,"children":369},{"__ignoreMap":7},[370],{"type":23,"value":365},{"type":17,"tag":36,"props":372,"children":373},{},[374],{"type":23,"value":375},"5. 
Modify dataset configuration.",{"type":17,"tag":36,"props":377,"children":378},{},[379],{"type":23,"value":380},"- Configure the dataset path:",{"type":17,"tag":159,"props":382,"children":384},{"code":383},"# dataset\n\n   train_dataset: &train_dataset\n\n     data_loader:\n\n       type: BlendedMegatronDatasetDataLoader\n\n       config:\n\n         data_path:\n\n           - 1\n\n           - \"/home/work/dataset/wiki_4096_text_document\"  # Set the dataset path.\n\n    ……\n",[385],{"type":17,"tag":164,"props":386,"children":387},{"__ignoreMap":7},[388],{"type":23,"value":383},{"type":17,"tag":36,"props":390,"children":391},{},[392],{"type":23,"value":393},"- Configure the dataset parallel communication configuration path:",{"type":17,"tag":159,"props":395,"children":397},{"code":396},"# mindspore context init config\n\n   context:\n\n     ascend_config:\n\n       parallel_speed_up_json_path: \"/home/work/mindformers/research/deepseek3/parallel_speed_up.json\"  # Set the dataset parallel communication configuration path.\n",[398],{"type":17,"tag":164,"props":399,"children":400},{"__ignoreMap":7},[401],{"type":23,"value":396},{"type":17,"tag":25,"props":403,"children":405},{"id":404},"training-task-initiation",[406],{"type":17,"tag":30,"props":407,"children":408},{},[409],{"type":23,"value":410},"Training Task Initiation",{"type":17,"tag":36,"props":412,"children":413},{},[414],{"type":23,"value":415},"Navigate to the code root directory and execute the following command to launch a single-node Atlas 800T A2 (64G) pre-training task.",{"type":17,"tag":159,"props":417,"children":419},{"code":418},"cd /home/work/mindformers\nbash scripts/msrun_launcher.sh \"run_mindformer.py \\\n--register_path research/deepseek3 \\\n--config 
research/deepseek3/deepseek3_671b/pretrain_deepseek3_1b.yaml\"\n",[420],{"type":17,"tag":164,"props":421,"children":422},{"__ignoreMap":7},[423],{"type":23,"value":418},{"type":17,"tag":36,"props":425,"children":426},{},[427,429,434,436,441,443,448,450,455],{"type":23,"value":428},"Once the startup script is executed, the task will be launched in the background. Training logs are saved under ",{"type":17,"tag":30,"props":430,"children":431},{},[432],{"type":23,"value":433},"/home/work/mindformers/output/msrun_log",{"type":23,"value":435},". Use the following command to view the training logs. Because pipeline parallelism is enabled with ",{"type":17,"tag":30,"props":437,"children":438},{},[439],{"type":23,"value":440},"pipeline_stage: 2",{"type":23,"value":442},", loss is only displayed in the log of the last card, ",{"type":17,"tag":30,"props":444,"children":445},{},[446],{"type":23,"value":447},"worker_7.log",{"type":23,"value":449},". Loss in other logs will show as ",{"type":17,"tag":30,"props":451,"children":452},{},[453],{"type":23,"value":454},"0",{"type":23,"value":456},".",{"type":17,"tag":159,"props":458,"children":460},{"code":459},"tail -f /home/work/mindformers/output/msrun_log/worker_7.log\n",[461],{"type":17,"tag":164,"props":462,"children":463},{"__ignoreMap":7},[464],{"type":23,"value":459},{"type":17,"tag":36,"props":466,"children":467},{},[468],{"type":23,"value":469},"The training loss chart is shown below:",{"type":17,"tag":36,"props":471,"children":472},{},[473],{"type":17,"tag":474,"props":475,"children":477},"img",{"alt":7,"src":476},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/02/20/20e28aa06fc54928b0b810ab282f723c.png",[],{"type":17,"tag":36,"props":479,"children":480},{},[481,483,488],{"type":23,"value":482},"During the training process, weight checkpoints will be saved in the 
",{"type":17,"tag":30,"props":484,"children":485},{},[486],{"type":23,"value":487},"/home/work/mindformers/output/checkpoint",{"type":23,"value":272},{"type":17,"tag":36,"props":490,"children":491},{},[492],{"type":23,"value":493},"That wraps up our tutorial on DeepSeek-V3 pre-training deployment with MindSpore!",{"type":17,"tag":36,"props":495,"children":496},{},[497,499,505],{"type":23,"value":498},"Now it's your turn — get started today by accessing the ",{"type":17,"tag":61,"props":500,"children":502},{"href":63,"rel":501},[65],[503],{"type":23,"value":504},"DeepSeek-V3 image on MindSpore",{"type":23,"value":506}," to conduct pre-training deployment and unlock streamlined development workflows.",{"type":17,"tag":36,"props":508,"children":509},{},[510],{"type":23,"value":511},"And a sneak peek — next week, we will bring you a tutorial on DeepSeek-V3 inference deployment using MindSpore. Stay tuned for that!",{"title":7,"searchDepth":513,"depth":513,"links":514},4,[515,517,518,519,520],{"id":27,"depth":516,"text":34},3,{"id":86,"depth":516,"text":92},{"id":201,"depth":516,"text":207},{"id":288,"depth":516,"text":294},{"id":404,"depth":516,"text":410},"markdown","content:news:en:3612.md","content","news/en/3612.md","news/en/3612","md",1776506047246]