[{"data":1,"prerenderedAt":670},["ShallowReactive",2],{"content-query-29drhKMypV":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":664,"_id":665,"_source":666,"_file":667,"_stem":668,"_extension":669},"/technology-blogs/en/2516","en",false,"","MindSpore Case Study | An Example of Distributed Parallel Training on the CPU Platform","This blog describes how to use MindSpore on the CPU platform.","2023-05-10","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/05/29/93d63c45feaa45eb9d83e939e37c4062.png","technology-blogs","Practices",{"type":15,"children":16,"toc":662},"root",[17,25,31,47,61,81,89,94,101,106,116,128,137,149,156,168,176,190,202,213,221,228,236,241,246,254,264,274,284,308,313,321,331,348,371,381,393,411,416,424,443,453,463,468,480,502,514,522,556,566,578,585,592,604,612,628,637,642,654],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore-case-study-an-example-of-distributed-parallel-training-on-the-cpu-platform",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"Author: Yeats_Liao | Source: CSDN",{"type":18,"tag":26,"props":32,"children":33},{},[34,36,45],{"type":24,"value":35},"This blog describes how to use MindSpore on the CPU platform. 
For the detailed code, go to the ",{"type":18,"tag":37,"props":38,"children":42},"a",{"href":39,"rel":40},"https://gitee.com/mindspore/docs/tree/r2.0.0-alpha/docs/sample_code/distributed_training_cpu",[41],"nofollow",[43],{"type":24,"value":44},"code repository",{"type":24,"value":46},".",{"type":18,"tag":48,"props":49,"children":50},"ol",{},[51],{"type":18,"tag":52,"props":53,"children":54},"li",{},[55],{"type":18,"tag":56,"props":57,"children":58},"strong",{},[59],{"type":24,"value":60},"Setting Up the Environment",{"type":18,"tag":26,"props":62,"children":63},{},[64,66,73,75,80],{"type":24,"value":65},"Go to the ",{"type":18,"tag":37,"props":67,"children":70},{"href":68,"rel":69},"https://www.mindspore.cn/en",[41],[71],{"type":24,"value":72},"MindSpore official website",{"type":24,"value":74}," and click ",{"type":18,"tag":56,"props":76,"children":77},{},[78],{"type":24,"value":79},"Install",{"type":24,"value":46},{"type":18,"tag":26,"props":82,"children":83},{},[84],{"type":18,"tag":85,"props":86,"children":88},"img",{"alt":7,"src":87},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/05/29/e8f7bcb8027f475e994db303a5fb2433.png",[],{"type":18,"tag":26,"props":90,"children":91},{},[92],{"type":24,"value":93},"Obtain the installation command.",{"type":18,"tag":26,"props":95,"children":96},{},[97],{"type":18,"tag":85,"props":98,"children":100},{"alt":7,"src":99},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/05/29/8782b98fbbf149c3873217e3224a7141.png",[],{"type":18,"tag":26,"props":102,"children":103},{},[104],{"type":24,"value":105},"Open a terminal in ModelArts and enter the installation command.",{"type":18,"tag":107,"props":108,"children":110},"pre",{"code":109},"conda install mindspore=2.0.0a0 -c mindspore -c 
conda-forge\n",[111],{"type":18,"tag":112,"props":113,"children":114},"code",{"__ignoreMap":7},[115],{"type":24,"value":109},{"type":18,"tag":26,"props":117,"children":118},{},[119,121,126],{"type":24,"value":120},"Click ",{"type":18,"tag":56,"props":122,"children":123},{},[124],{"type":24,"value":125},"Clone a Repository",{"type":24,"value":127}," in the sidebar and input the following URL.",{"type":18,"tag":26,"props":129,"children":130},{},[131],{"type":18,"tag":37,"props":132,"children":135},{"href":133,"rel":134},"https://gitee.com/mindspore/docs.git",[41],[136],{"type":24,"value":133},{"type":18,"tag":26,"props":138,"children":139},{},[140,142,147],{"type":24,"value":141},"The ",{"type":18,"tag":56,"props":143,"children":144},{},[145],{"type":24,"value":146},"docs",{"type":24,"value":148}," project is imported successfully.",{"type":18,"tag":26,"props":150,"children":151},{},[152],{"type":18,"tag":85,"props":153,"children":155},{"alt":7,"src":154},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/05/29/e0a4c9edf3cd4e7c8f8375b56bcf8a41.png",[],{"type":18,"tag":48,"props":157,"children":159},{"start":158},2,[160],{"type":18,"tag":52,"props":161,"children":162},{},[163],{"type":18,"tag":56,"props":164,"children":165},{},[166],{"type":24,"value":167},"Preparations",{"type":18,"tag":26,"props":169,"children":170},{},[171],{"type":18,"tag":56,"props":172,"children":173},{},[174],{"type":24,"value":175},"2.1 Downloading a Dataset",{"type":18,"tag":26,"props":177,"children":178},{},[179,181,188],{"type":24,"value":180},"This example uses the ",{"type":18,"tag":37,"props":182,"children":185},{"href":183,"rel":184},"http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz",[41],[186],{"type":24,"value":187},"CIFAR-10 dataset",{"type":24,"value":189},", which consists of 10 types of 32 x 32 color images, with each type containing 6,000 images. 
The training dataset contains 50,000 images, and the test dataset 10,000 images.",{"type":18,"tag":26,"props":191,"children":192},{},[193,195,200],{"type":24,"value":194},"Upload the downloaded dataset and decompress it to obtain the ",{"type":18,"tag":56,"props":196,"children":197},{},[198],{"type":24,"value":199},"cifar-10-batches-bin",{"type":24,"value":201}," folder.",{"type":18,"tag":26,"props":203,"children":204},{},[205,206,211],{"type":24,"value":141},{"type":18,"tag":56,"props":207,"children":208},{},[209],{"type":24,"value":210},"mindspore.dataset",{"type":24,"value":212}," module provides multiple transforms for different data types such as images, texts, and audios, and supports Lambda functions.",{"type":18,"tag":107,"props":214,"children":216},{"code":215},"tar -zxvf cifar-10-binary.tar.gz\n",[217],{"type":18,"tag":112,"props":218,"children":219},{"__ignoreMap":7},[220],{"type":24,"value":215},{"type":18,"tag":26,"props":222,"children":223},{},[224],{"type":18,"tag":85,"props":225,"children":227},{"alt":7,"src":226},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/05/29/75cf41c7af2940bdab31b8be6014da2d.png",[],{"type":18,"tag":26,"props":229,"children":230},{},[231],{"type":18,"tag":56,"props":232,"children":233},{},[234],{"type":24,"value":235},"2.2 Configuring the Distributed Environment",{"type":18,"tag":26,"props":237,"children":238},{},[239],{"type":24,"value":240},"There are two types of data parallelism on the CPU: single-server multi-node parallelism and multi-server multi-node parallelism (a training process can be regarded as a node). 
Before running the training script, set up the networking environment, including configuring environment variables and calling the initialization API in the training script.",{"type":18,"tag":26,"props":242,"children":243},{},[244],{"type":24,"value":245},"Configure the environment variables as follows:",{"type":18,"tag":107,"props":247,"children":249},{"code":248},"export MS_WORKER_NUM=8                # Number of worker nodes\nexport MS_SCHED_HOST=xxx.xxx.xxx.xxx         # Scheduler IP address\nexport MS_SCHED_PORT=6667             # Scheduler port\nexport MS_ROLE=MS_WORKER              # The role of this node. MS_SCHED represents the scheduler, MS_WORKER represents the worker.\n",[250],{"type":18,"tag":112,"props":251,"children":252},{"__ignoreMap":7},[253],{"type":24,"value":248},{"type":18,"tag":26,"props":255,"children":256},{},[257,262],{"type":18,"tag":56,"props":258,"children":259},{},[260],{"type":24,"value":261},"MS_WORKER_NUM",{"type":24,"value":263},": number of worker nodes. In the multi-server scenario, the number of worker nodes is the sum of worker nodes on each server.",{"type":18,"tag":26,"props":265,"children":266},{},[267,272],{"type":18,"tag":56,"props":268,"children":269},{},[270],{"type":24,"value":271},"MS_SCHED_HOST",{"type":24,"value":273},": IP address of the scheduler node.",{"type":18,"tag":26,"props":275,"children":276},{},[277,282],{"type":18,"tag":56,"props":278,"children":279},{},[280],{"type":24,"value":281},"MS_SCHED_PORT",{"type":24,"value":283},": service port of the scheduler node. 
It is used to receive the IP addresses and service ports sent by worker nodes and deliver the collected information to each worker node.",{"type":18,"tag":26,"props":285,"children":286},{},[287,292,294,299,301,306],{"type":18,"tag":56,"props":288,"children":289},{},[290],{"type":24,"value":291},"MS_ROLE",{"type":24,"value":293},": node type, which can be worker (",{"type":18,"tag":56,"props":295,"children":296},{},[297],{"type":24,"value":298},"MS_WORKER",{"type":24,"value":300},") or scheduler (",{"type":18,"tag":56,"props":302,"children":303},{},[304],{"type":24,"value":305},"MS_SCHED",{"type":24,"value":307},"). A scheduler node must be configured for networking no matter whether it is a single-server multi-node or multi-server multi-node scenario.",{"type":18,"tag":26,"props":309,"children":310},{},[311],{"type":24,"value":312},"Call the initialization API in the training script as follows:",{"type":18,"tag":107,"props":314,"children":316},{"code":315},"import mindspore as ms\nfrom mindspore.communication import init\n\nms.set_context(mode=ms.GRAPH_MODE, device_target=\"CPU\")\nms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True)\nms.set_ps_context(enable_ssl=False)\ninit()\n",[317],{"type":18,"tag":112,"props":318,"children":319},{"__ignoreMap":7},[320],{"type":24,"value":315},{"type":18,"tag":26,"props":322,"children":323},{},[324,329],{"type":18,"tag":56,"props":325,"children":326},{},[327],{"type":24,"value":328},"ms.set_context(mode=context.GRAPH_MODE, device_target=\"CPU\")",{"type":24,"value":330},": Sets the mode to graph mode (parallelism is not supported in PyNative mode on the CPU) and the device to CPU.",{"type":18,"tag":26,"props":332,"children":333},{},[334,339,341,346],{"type":18,"tag":56,"props":335,"children":336},{},[337],{"type":24,"value":338},"ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True)",{"type":24,"value":340},": specifies the data parallelism 
mode. ",{"type":18,"tag":56,"props":342,"children":343},{},[344],{"type":24,"value":345},"gradients_mean=True",{"type":24,"value":347}," indicates that the mean value is calculated after gradient reduction. Currently, only the sum operation is supported for gradient reduction on the CPU.",{"type":18,"tag":26,"props":349,"children":350},{},[351,356,358,363,365,370],{"type":18,"tag":56,"props":352,"children":353},{},[354],{"type":24,"value":355},"ms.set_ps_context",{"type":24,"value":357},": configures encrypted communication. You can set ",{"type":18,"tag":56,"props":359,"children":360},{},[361],{"type":24,"value":362},"ms.set_ps_context(enable_ssl=True)",{"type":24,"value":364}," to enable encrypted communication. The default value is ",{"type":18,"tag":56,"props":366,"children":367},{},[368],{"type":24,"value":369},"False",{"type":24,"value":46},{"type":18,"tag":26,"props":372,"children":373},{},[374,379],{"type":18,"tag":56,"props":375,"children":376},{},[377],{"type":24,"value":378},"init",{"type":24,"value":380},": initializes a node. After the initialization is complete, the networking is successful.",{"type":18,"tag":48,"props":382,"children":384},{"start":383},3,[385],{"type":18,"tag":52,"props":386,"children":387},{},[388],{"type":18,"tag":56,"props":389,"children":390},{},[391],{"type":24,"value":392},"Loading the Dataset",{"type":18,"tag":26,"props":394,"children":395},{},[396,398,403,405,409],{"type":24,"value":397},"A dataset is imported in data parallel mode during distributed training. The following uses the CIFAR-10 dataset as an example to describe how to import a dataset in data parallel mode. 
",{"type":18,"tag":56,"props":399,"children":400},{},[401],{"type":24,"value":402},"data_path",{"type":24,"value":404}," indicates the dataset path (path of the ",{"type":18,"tag":56,"props":406,"children":407},{},[408],{"type":24,"value":199},{"type":24,"value":410}," folder in this example).",{"type":18,"tag":26,"props":412,"children":413},{},[414],{"type":24,"value":415},"The sample code is as follows:",{"type":18,"tag":107,"props":417,"children":419},{"code":418},"import mindspore as ms\nimport mindspore.dataset as ds\nimport mindspore.dataset.vision as vision\nimport mindspore.dataset.transforms as transforms\nfrom mindspore.communication import get_rank, get_group_size\n\ndef create_dataset(data_path, repeat_num=1, batch_size=32):\n    \"\"\"Create training dataset\"\"\"\n    resize_height = 224\n    resize_width = 224\n    rescale = 1.0 / 255.0\n    shift = 0.0\n\n    # get rank_id and rank_size\n    rank_size = get_group_size()\n    rank_id = get_rank()\n    data_set = ds.Cifar10Dataset(data_path, num_shards=rank_size, shard_id=rank_id)\n\n    # define map operations\n    random_crop_op = vision.RandomCrop((32, 32), (4, 4, 4, 4))\n    random_horizontal_op = vision.RandomHorizontalFlip()\n    resize_op = vision.Resize((resize_height, resize_width))\n    rescale_op = vision.Rescale(rescale, shift)\n    normalize_op = vision.Normalize((0.4465, 0.4822, 0.4914), (0.2010, 0.1994, 0.2023))\n    changeswap_op = vision.HWC2CHW()\n    type_cast_op = transforms.TypeCast(ms.int32)\n\n    c_trans = [random_crop_op, random_horizontal_op]\n    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]\n\n    # apply map operations on images\n    data_set = data_set.map(operations=type_cast_op, input_columns=\"label\")\n    data_set = data_set.map(operations=c_trans, input_columns=\"image\")\n\n    # apply shuffle operations\n    data_set = data_set.shuffle(buffer_size=10)\n\n    # apply batch operations\n    data_set = data_set.batch(batch_size=batch_size, 
drop_remainder=True)\n\n    # apply repeat operations\n    data_set = data_set.repeat(repeat_num)\n\n    return data_set\n",[420],{"type":18,"tag":112,"props":421,"children":422},{"__ignoreMap":7},[423],{"type":24,"value":418},{"type":18,"tag":26,"props":425,"children":426},{},[427,429,434,436,441],{"type":24,"value":428},"Unlike the single-server system, when Cifar10Dataset is constructed, the ",{"type":18,"tag":56,"props":430,"children":431},{},[432],{"type":24,"value":433},"num_shards",{"type":24,"value":435}," and ",{"type":18,"tag":56,"props":437,"children":438},{},[439],{"type":24,"value":440},"shard_id",{"type":24,"value":442}," parameters need to be passed, which correspond to the number of worker nodes and logical sequence number, respectively. You can obtain the parameter values using the following interfaces:",{"type":18,"tag":26,"props":444,"children":445},{},[446,451],{"type":18,"tag":56,"props":447,"children":448},{},[449],{"type":24,"value":450},"get_group_size",{"type":24,"value":452},": obtains the number of worker nodes in a cluster.",{"type":18,"tag":26,"props":454,"children":455},{},[456,461],{"type":18,"tag":56,"props":457,"children":458},{},[459],{"type":24,"value":460},"get_rank",{"type":24,"value":462},": obtains the logical sequence number of a worker node in the cluster.",{"type":18,"tag":26,"props":464,"children":465},{},[466],{"type":24,"value":467},"When loading a dataset in parallel mode, you are advised to specify the same dataset file for each CPU to ensure calculation accuracy.",{"type":18,"tag":48,"props":469,"children":471},{"start":470},4,[472],{"type":18,"tag":52,"props":473,"children":474},{},[475],{"type":18,"tag":56,"props":476,"children":477},{},[478],{"type":24,"value":479},"Model Definition",{"type":18,"tag":26,"props":481,"children":482},{},[483,485,492,494,501],{"type":24,"value":484},"In data parallel mode, the network definition is the same as that in a single-node system. 
For details, see the ",{"type":18,"tag":37,"props":486,"children":489},{"href":487,"rel":488},"https://gitee.com/mindspore/docs/blob/r2.0.0-alpha/docs/sample_code/distributed_training_cpu/resnet.py",[41],[490],{"type":24,"value":491},"ResNet sample script",{"type":24,"value":493},". For the definitions of the optimizer, loss function, and training model, see the ",{"type":18,"tag":37,"props":495,"children":498},{"href":496,"rel":497},"https://www.mindspore.cn/tutorials/en/r2.0.0-alpha/beginner/train.html",[41],[499],{"type":24,"value":500},"training model definition",{"type":24,"value":46},{"type":18,"tag":26,"props":503,"children":504},{},[505,507,512],{"type":24,"value":506},"For details about the complete training script code, see the ",{"type":18,"tag":37,"props":508,"children":510},{"href":487,"rel":509},[41],[511],{"type":24,"value":491},{"type":24,"value":513},". The training startup code is as follows:",{"type":18,"tag":107,"props":515,"children":517},{"code":516},"import os\nimport mindspore as ms\nimport mindspore.nn as nn\nfrom mindspore import train\nfrom mindspore.communication import init\nfrom resnet import resnet50\n\ndef train_resnet50_with_cifar10(epoch_size=10):\n    \"\"\"Start the training\"\"\"\n    loss_cb = train.LossMonitor()\n    data_path = os.getenv('DATA_PATH')\n    dataset = create_dataset(data_path)\n    batch_size = 32\n    num_classes = 10\n    net = resnet50(batch_size, num_classes)\n    loss = SoftmaxCrossEntropyExpand(sparse=True)\n    opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)\n    model = ms.Model(net, loss_fn=loss, optimizer=opt)\n    model.train(epoch_size, dataset, callbacks=[loss_cb], dataset_sink_mode=True)\n\n\nif __name__ == \"__main__\":\n    ms.set_context(mode=ms.GRAPH_MODE, device_target=\"CPU\")\n    ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True)\n    ms.set_ps_context(enable_ssl=False)\n    init()\n    
train_resnet50_with_cifar10()\n",[518],{"type":18,"tag":112,"props":519,"children":520},{"__ignoreMap":7},[521],{"type":24,"value":516},{"type":18,"tag":26,"props":523,"children":524},{},[525,527,532,533,538,540,547,549,554],{"type":24,"value":526},"In the script, the ",{"type":18,"tag":56,"props":528,"children":529},{},[530],{"type":24,"value":531},"create_dataset",{"type":24,"value":435},{"type":18,"tag":56,"props":534,"children":535},{},[536],{"type":24,"value":537},"SoftmaxCrossEntropyExpand",{"type":24,"value":539}," APIs are referenced from ",{"type":18,"tag":37,"props":541,"children":544},{"href":542,"rel":543},"https://gitee.com/mindspore/docs/blob/r2.0.0-alpha/docs/sample_code/distributed_training_cpu/resnet50_distributed_training.py",[41],[545],{"type":24,"value":546},"distributed_training_cpu",{"type":24,"value":548},", and the ",{"type":18,"tag":56,"props":550,"children":551},{},[552],{"type":24,"value":553},"resnet50",{"type":24,"value":555}," API is referenced from the ResNet sample script.",{"type":18,"tag":26,"props":557,"children":558},{},[559,561],{"type":24,"value":560},"5. ",{"type":18,"tag":56,"props":562,"children":563},{},[564],{"type":24,"value":565},"Starting Training",{"type":18,"tag":26,"props":567,"children":568},{},[569,571,576],{"type":24,"value":570},"The following uses a single-server 8-node system as an example to perform distributed training on the CPU platform. 
Go to the ",{"type":18,"tag":56,"props":572,"children":573},{},[574],{"type":24,"value":575},"/home/ma-user/work/docs/docs/sample_code/distributed_training_cpu",{"type":24,"value":577}," directory.",{"type":18,"tag":26,"props":579,"children":580},{},[581],{"type":18,"tag":85,"props":582,"children":584},{"alt":7,"src":583},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/05/29/9e3f7ba41e824a03bfde7ba6e78e3af0.png",[],{"type":18,"tag":26,"props":586,"children":587},{},[588],{"type":18,"tag":85,"props":589,"children":591},{"alt":7,"src":590},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/05/29/f7e766ba697d4bd0a7d377084175193f.png",[],{"type":18,"tag":26,"props":593,"children":594},{},[595,597,602],{"type":24,"value":596},"Run ",{"type":18,"tag":56,"props":598,"children":599},{},[600],{"type":24,"value":601},"bash run.sh /dataset/cifar-10-batches-bin",{"type":24,"value":603}," to start the training.",{"type":18,"tag":107,"props":605,"children":607},{"code":606},"(PyTorch-1.8) [ma-user distributed_training_cpu]$bash run.sh cifar-10-batches-bin\n==============================================================================================================\nPlease run the script with dataset path, such as: \nbash run.sh DATA_PATH\nFor example: bash run.sh /path/dataset\nIt is better to use the absolute path.\n==============================================================================================================\nscheduler start success!\nworker 0 start success with pid 8240\nworker 1 start success with pid 8241\nworker 2 start success with pid 8242\nworker 3 start success with pid 8243\nworker 4 start success with pid 8244\nworker 5 start success with pid 8245\nworker 6 start success with pid 8246\nworker 7 start success with pid 8247\n#!/bin/bash\n# run data parallel training on CPU\n\necho \"==============================================================================================================\"\necho \"Please 
run the script with dataset path, such as: \"\necho \"bash run.sh DATA_PATH\"\necho \"For example: bash run.sh /path/dataset\"\necho \"It is better to use the absolute path.\"\necho \"==============================================================================================================\"\nset -e\nDATA_PATH=$1\nexport DATA_PATH=${DATA_PATH}\n\nexport MS_WORKER_NUM=8\nexport MS_SCHED_HOST=xxx.xxx.xxx.xxx\nexport MS_SCHED_PORT=8117\n\n# Launch 1 scheduler.\nexport MS_ROLE=MS_SCHED\npython3 resnet50_distributed_training.py >scheduler.txt 2>&1 &\necho \"scheduler start success!\"\n\n# Launch 8 workers.\nexport MS_ROLE=MS_WORKER\nfor((i=0;i\u003C${MS_WORKER_NUM};i++));\ndo\n    python3 resnet50_distributed_training.py >worker_$i.txt 2>&1 &\n    echo \"worker ${i} start success with pid ${!}\"\ndone\n",[608],{"type":18,"tag":112,"props":609,"children":610},{"__ignoreMap":7},[611],{"type":24,"value":606},{"type":18,"tag":26,"props":613,"children":614},{},[615,620,622,626],{"type":18,"tag":56,"props":616,"children":617},{},[618],{"type":24,"value":619},"resnet50_distributed_training.py",{"type":24,"value":621}," is the defined training script. In the multi-server multi-node scenario, run the preceding script to start the training for corresponding worker nodes on each server. Given that there is only one scheduler node, you only need to start the training on the scheduler node for one server only (",{"type":18,"tag":56,"props":623,"children":624},{},[625],{"type":24,"value":271},{"type":24,"value":627},").",{"type":18,"tag":26,"props":629,"children":630},{},[631,635],{"type":18,"tag":56,"props":632,"children":633},{},[634],{"type":24,"value":261},{"type":24,"value":636}," specifies the number of worker nodes that need to be started for training. 
If the number of started worker nodes is not enough, the networking fails.",{"type":18,"tag":26,"props":638,"children":639},{},[640],{"type":24,"value":641},"Although the training script is executed on the scheduler node, the scheduler is mainly used for networking and does not participate in training.",{"type":18,"tag":26,"props":643,"children":644},{},[645,647,652],{"type":24,"value":646},"After training for a period of time, open the ",{"type":18,"tag":56,"props":648,"children":649},{},[650],{"type":24,"value":651},"worker_0",{"type":24,"value":653}," log. The training information is as follows:",{"type":18,"tag":107,"props":655,"children":657},{"code":656},"(PyTorch-1.8) [ma-user distributed_training_cpu]$tail -f worker_0.txt \n\n……\nepoch: 1 step: 1, loss is 1.4686084\nepoch: 1 step: 2, loss is 1.3278534\nepoch: 1 step: 3, loss is 1.4246798\nepoch: 1 step: 4, loss is 1.4920032\nepoch: 1 step: 5, loss is 1.4324203\nepoch: 1 step: 6, loss is 1.432581\nepoch: 1 step: 7, loss is 1.319618\n",[658],{"type":18,"tag":112,"props":659,"children":660},{"__ignoreMap":7},[661],{"type":24,"value":656},{"title":7,"searchDepth":470,"depth":470,"links":663},[],"markdown","content:technology-blogs:en:2516.md","content","technology-blogs/en/2516.md","technology-blogs/en/2516","md",1776506106234]