[{"data":1,"prerenderedAt":1332},["ShallowReactive",2],{"content-query-hneZTK9Wxx":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":1326,"_id":1327,"_source":1328,"_file":1329,"_stem":1330,"_extension":1331},"/technology-blogs/zh/1643","zh",false,"","MindSpore易点通·精讲系列--模型训练之GPU分布式并行训练","在GPU硬件环境下，如何依赖OpenMPI进行多卡训练。","2022-07-18","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/07/25/6f3e08c9632349c0bd751164fb997765.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":1296},"root",[17,25,31,37,42,72,77,120,134,146,151,180,188,199,219,258,263,274,285,314,325,367,380,389,402,410,421,426,431,439,452,457,471,479,484,492,497,508,513,524,529,611,616,624,635,648,656,661,669,681,689,700,705,716,720,803,808,816,827,832,844,849,857,862,870,880,888,899,904,909,917,921,929,939,947,958,1029,1040,1087,1107,1146,1174,1185,1217,1228,1238,1249,1291],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore易点通精讲系列-模型训练之gpu分布式并行训练",[23],{"type":24,"value":8},"text",{"type":18,"tag":19,"props":26,"children":28},{"id":27},"dive-into-mindspore-distributed-training-with-gpu-for-model-train",[29],{"type":24,"value":30},"Dive Into MindSpore – Distributed Training With GPU For Model Train",{"type":18,"tag":32,"props":33,"children":34},"p",{},[35],{"type":24,"value":36},"MindSpore易点通·精讲系列–模型训练之GPU分布式并行训练",{"type":18,"tag":32,"props":38,"children":39},{},[40],{"type":24,"value":41},"本文开发环境",{"type":18,"tag":43,"props":44,"children":45},"ul",{},[46,52,57,62,67],{"type":18,"tag":47,"props":48,"children":49},"li",{},[50],{"type":24,"value":51},"Ubuntu 20.04",{"type":18,"tag":47,"props":53,"children":54},{},[55],{"type":24,"value":56},"Python 3.8",{"type":18,"tag":47,"props":58,"children":59},{},[60],{"type":24,"value":61},"MindSpore 1.7.0",{"type":18,"tag":47,"props":63,"children":64},{},[65],{"type":24,"value":66},"OpenMPI 
4.0.3",{"type":18,"tag":47,"props":68,"children":69},{},[70],{"type":24,"value":71},"RTX 1080Ti * 4",{"type":18,"tag":32,"props":73,"children":74},{},[75],{"type":24,"value":76},"本文内容摘要",{"type":18,"tag":43,"props":78,"children":79},{},[80,85,90,95,100,105,110,115],{"type":18,"tag":47,"props":81,"children":82},{},[83],{"type":24,"value":84},"基础知识",{"type":18,"tag":47,"props":86,"children":87},{},[88],{"type":24,"value":89},"环境搭建",{"type":18,"tag":47,"props":91,"children":92},{},[93],{"type":24,"value":94},"单卡训练",{"type":18,"tag":47,"props":96,"children":97},{},[98],{"type":24,"value":99},"多卡训练–OpenMPI",{"type":18,"tag":47,"props":101,"children":102},{},[103],{"type":24,"value":104},"多卡训练–非OpenMPI",{"type":18,"tag":47,"props":106,"children":107},{},[108],{"type":24,"value":109},"本文总结",{"type":18,"tag":47,"props":111,"children":112},{},[113],{"type":24,"value":114},"遇到问题",{"type":18,"tag":47,"props":116,"children":117},{},[118],{"type":24,"value":119},"本文参考",{"type":18,"tag":121,"props":122,"children":124},"h2",{"id":123},"_1-基础知识",[125,132],{"type":18,"tag":126,"props":127,"children":131},"a",{"href":128,"rel":129},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#1.-%E5%9F%BA%E7%A1%80%E7%9F%A5%E8%AF%86",[130],"nofollow",[],{"type":24,"value":133},"1. 
基础知识",{"type":18,"tag":135,"props":136,"children":138},"h3",{"id":137},"_11-概念介绍",[139,144],{"type":18,"tag":126,"props":140,"children":143},{"href":141,"rel":142},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#1.1-%E6%A6%82%E5%BF%B5%E4%BB%8B%E7%BB%8D",[130],[],{"type":24,"value":145},"1.1 概念介绍",{"type":18,"tag":32,"props":147,"children":148},{},[149],{"type":24,"value":150},"在深度学习中，随着模型和数据的不断增长，在很多情况下需要使用单机多卡或者多机多卡进行训练，即分布式训练。分布式训练策略按照并行方式不同，可以简单的分为数据并行和模型并行两种方式。",{"type":18,"tag":43,"props":152,"children":153},{},[154,167],{"type":18,"tag":47,"props":155,"children":156},{},[157,159],{"type":24,"value":158},"数据并行\n",{"type":18,"tag":43,"props":160,"children":161},{},[162],{"type":18,"tag":47,"props":163,"children":164},{},[165],{"type":24,"value":166},"数据并行是指在不同的 GPU 上都 copy 保存一份模型的副本，然后将不同的数据分配到不同的 GPU 上进行计算，最后将所有 GPU 计算的结果进行合并，从而达到加速模型训练的目的。",{"type":18,"tag":47,"props":168,"children":169},{},[170,172],{"type":24,"value":171},"模型并行\n",{"type":18,"tag":43,"props":173,"children":174},{},[175],{"type":18,"tag":47,"props":176,"children":177},{},[178],{"type":24,"value":179},"与数据并行不同，分布式训练中的模型并行是指将整个神经网络模型拆解分布到不同的 GPU 中，不同的 GPU 负责计算网络模型中的不同部分。这通常是在网络模型很大很大、单个 GPU 的显存已经完全装不下整体网络的情况下才会采用。",{"type":18,"tag":32,"props":181,"children":182},{},[183],{"type":18,"tag":184,"props":185,"children":187},"img",{"alt":7,"src":186},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/15/1657853676059222796.png",[],{"type":18,"tag":135,"props":189,"children":191},{"id":190},"_12-mindspore中的支持",[192,197],{"type":18,"tag":126,"props":193,"children":196},{"href":194,"rel":195},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#1.2-mindspore%E4%B8%AD%E7%9A%84%E6%94%AF%E6%8C%81",[130],[],{"type":24,"value":198},"1.2 
MindSpore中的支持",{"type":18,"tag":32,"props":200,"children":201},{},[202,209,211,217],{"type":18,"tag":203,"props":204,"children":206},"code",{"className":205},[],[207],{"type":24,"value":208},"1.1",{"type":24,"value":210},"中介绍了理论中的并行方式，具体到",{"type":18,"tag":203,"props":212,"children":214},{"className":213},[],[215],{"type":24,"value":216},"MindSpore",{"type":24,"value":218},"框架中，目前支持下述的四种并行模式：",{"type":18,"tag":43,"props":220,"children":221},{},[222,227,232,245],{"type":18,"tag":47,"props":223,"children":224},{},[225],{"type":24,"value":226},"数据并行：用户的网络参数规模在单卡上可以计算的情况下使用。这种模式会在每卡上复制相同的网络参数，训练时输入不同的训练数据，适合大部分用户使用。",{"type":18,"tag":47,"props":228,"children":229},{},[230],{"type":24,"value":231},"半自动并行：用户的神经网络在单卡上无法计算，并且对切分的性能存在较大的需求。用户可以设置这种运行模式，手动指定每个算子的切分策略，达到较佳的训练性能。",{"type":18,"tag":47,"props":233,"children":234},{},[235,237,243],{"type":24,"value":236},"自动并行：用户的神经网络在单卡上无法计算，但是不知道如何配置算子策略。用户启动这种模式，",{"type":18,"tag":203,"props":238,"children":240},{"className":239},[],[241],{"type":24,"value":242},"MindSpore",{"type":24,"value":244},"会自动针对每个算子进行配置策略，适合想要并行训练但是不知道如何配置策略的用户。",{"type":18,"tag":47,"props":246,"children":247},{},[248,250,256],{"type":24,"value":249},"混合并行：完全由用户自己设计并行训练的逻辑和实现，用户可以自己在网络中定义",{"type":18,"tag":203,"props":251,"children":253},{"className":252},[],[254],{"type":24,"value":255},"AllGather",{"type":24,"value":257},"等通信算子。适合熟悉并行训练的用户。",{"type":18,"tag":32,"props":259,"children":260},{},[261],{"type":24,"value":262},"对于大部分用户来说，其实能够用到的是数据并行模式，所以下面的案例中，会以数据并行模式来展开讲解。",{"type":18,"tag":121,"props":264,"children":266},{"id":265},"_2-环境搭建",[267,272],{"type":18,"tag":126,"props":268,"children":271},{"href":269,"rel":270},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#2.-%E7%8E%AF%E5%A2%83%E6%90%AD%E5%BB%BA",[130],[],{"type":24,"value":273},"2. 
环境搭建",{"type":18,"tag":135,"props":275,"children":277},{"id":276},"_21-mindspore安装",[278,283],{"type":18,"tag":126,"props":279,"children":282},{"href":280,"rel":281},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#2.1-mindspore%E5%AE%89%E8%A3%85",[130],[],{"type":24,"value":284},"2.1 MindSpore安装",{"type":18,"tag":32,"props":286,"children":287},{},[288,290,297,299,304,306,312],{"type":24,"value":289},"略。可参考笔者之前的文章",{"type":18,"tag":126,"props":291,"children":294},{"href":292,"rel":293},"https://bbs.huaweicloud.com/forum/thread-179309-1-1.html",[130],[295],{"type":24,"value":296},"MindSpore入门–基于GPU服务器安装MindSpore 1.5.0",{"type":24,"value":298},"，注意将文章中的",{"type":18,"tag":203,"props":300,"children":302},{"className":301},[],[303],{"type":24,"value":242},{"type":24,"value":305},"版本升级到",{"type":18,"tag":203,"props":307,"children":309},{"className":308},[],[310],{"type":24,"value":311},"1.7.0",{"type":24,"value":313},"。",{"type":18,"tag":135,"props":315,"children":317},{"id":316},"_22-openmpi安装",[318,323],{"type":18,"tag":126,"props":319,"children":322},{"href":320,"rel":321},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#2.2-openmpi%E5%AE%89%E8%A3%85",[130],[],{"type":24,"value":324},"2.2 
OpenMPI安装",{"type":18,"tag":32,"props":326,"children":327},{},[328,330,336,338,343,345,351,353,359,361,366],{"type":24,"value":329},"在",{"type":18,"tag":203,"props":331,"children":333},{"className":332},[],[334],{"type":24,"value":335},"GPU",{"type":24,"value":337},"硬件平台上，",{"type":18,"tag":203,"props":339,"children":341},{"className":340},[],[342],{"type":24,"value":242},{"type":24,"value":344},"采用",{"type":18,"tag":203,"props":346,"children":348},{"className":347},[],[349],{"type":24,"value":350},"OpenMPI",{"type":24,"value":352},"的",{"type":18,"tag":203,"props":354,"children":356},{"className":355},[],[357],{"type":24,"value":358},"mpirun",{"type":24,"value":360},"进行分布式训练。所以我们先来安装",{"type":18,"tag":203,"props":362,"children":364},{"className":363},[],[365],{"type":24,"value":350},{"type":24,"value":313},{"type":18,"tag":32,"props":368,"children":369},{},[370,372,378],{"type":24,"value":371},"本文安装的是",{"type":18,"tag":203,"props":373,"children":375},{"className":374},[],[376],{"type":24,"value":377},"4.0.3",{"type":24,"value":379},"版本，安装命令如下：",{"type":18,"tag":381,"props":382,"children":384},"pre",{"code":383},"wget -c https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz\ntar xf openmpi-4.0.3.tar.gz\ncd openmpi-4.0.3/\n./configure --prefix=/usr/local/openmpi-4.0.3\nmake -j 16\nsudo make install\necho -e \"export PATH=/usr/local/openmpi-4.0.3/bin:\\$PATH\" >> ~/.bashrc\necho -e \"export LD_LIBRARY_PATH=/usr/local/openmpi-4.0.3/lib:\\$LD_LIBRARY_PATH\" >> ~/.bashrc\nsource ~/.bashrc\n",[385],{"type":18,"tag":203,"props":386,"children":387},{"__ignoreMap":7},[388],{"type":24,"value":383},{"type":18,"tag":32,"props":390,"children":391},{},[392,394,400],{"type":24,"value":393},"使用",{"type":18,"tag":203,"props":395,"children":397},{"className":396},[],[398],{"type":24,"value":399},"mpirun --version",{"type":24,"value":401},"命令验证是否安装成功，输出如下内容：",{"type":18,"tag":381,"props":403,"children":405},{"code":404},"mpirun (Open MPI) 4.0.3\n\nReport bugs to 
http://www.open-mpi.org/community/help/\n",[406],{"type":18,"tag":203,"props":407,"children":408},{"__ignoreMap":7},[409],{"type":24,"value":404},{"type":18,"tag":135,"props":411,"children":413},{"id":412},"_23-环境验证",[414,419],{"type":18,"tag":126,"props":415,"children":418},{"href":416,"rel":417},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#2.3-%E7%8E%AF%E5%A2%83%E9%AA%8C%E8%AF%81",[130],[],{"type":24,"value":420},"2.3 环境验证",{"type":18,"tag":32,"props":422,"children":423},{},[424],{"type":24,"value":425},"上面基础环境安装完成后，我们对环境进行一个初步验证，来看看是否搭建成功。",{"type":18,"tag":32,"props":427,"children":428},{},[429],{"type":24,"value":430},"验证代码如下：",{"type":18,"tag":381,"props":432,"children":434},{"code":433},"# nccl_allgather.py\nimport numpy as np\nimport mindspore.ops as ops\nimport mindspore.nn as nn\nfrom mindspore import context, Tensor\nfrom mindspore.communication import init, get_rank\n\n\nclass Net(nn.Cell):\n    def __init__(self):\n        super(Net, self).__init__()\n        self.allgather = ops.AllGather()\n\n    def construct(self, x):\n        return self.allgather(x)\n\n\nif __name__ == \"__main__\":\n    context.set_context(mode=context.GRAPH_MODE, device_target=\"GPU\")\n    init(\"nccl\")\n    value = get_rank()\n    input_x = Tensor(np.array([[value]]).astype(np.float32))\n    net = Net()\n    output = net(input_x)\n    
print(output)\n",[435],{"type":18,"tag":203,"props":436,"children":437},{"__ignoreMap":7},[438],{"type":24,"value":433},{"type":18,"tag":32,"props":440,"children":441},{},[442,444,450],{"type":24,"value":443},"将上面代码保存到文件",{"type":18,"tag":203,"props":445,"children":447},{"className":446},[],[448],{"type":24,"value":449},"nccl_allgather.py",{"type":24,"value":451},"中，运行命令：",{"type":18,"tag":32,"props":453,"children":454},{},[455],{"type":24,"value":456},"命令解读：",{"type":18,"tag":43,"props":458,"children":459},{},[460],{"type":18,"tag":47,"props":461,"children":462},{},[463,469],{"type":18,"tag":203,"props":464,"children":466},{"className":465},[],[467],{"type":24,"value":468},"-n",{"type":24,"value":470}," 后面数字代表使用GPU的数量，这里使用了机器内全部GPU。如果读者不想使用全部，记得设置相应的环境变量。",{"type":18,"tag":381,"props":472,"children":474},{"code":473},"mpirun -n 4 python3 nccl_allgather.py\n",[475],{"type":18,"tag":203,"props":476,"children":477},{"__ignoreMap":7},[478],{"type":24,"value":473},{"type":18,"tag":32,"props":480,"children":481},{},[482],{"type":24,"value":483},"输出内容如下：",{"type":18,"tag":381,"props":485,"children":487},{"code":486},"[[0.]\n [1.]\n [2.]\n [3.]]\n[[0.]\n [1.]\n [2.]\n [3.]]\n[[0.]\n [1.]\n [2.]\n [3.]]\n[[0.]\n [1.]\n [2.]\n [3.]]\n",[488],{"type":18,"tag":203,"props":489,"children":490},{"__ignoreMap":7},[491],{"type":24,"value":486},{"type":18,"tag":32,"props":493,"children":494},{},[495],{"type":24,"value":496},"至此，我们的环境搭建完成，且验证成功。",{"type":18,"tag":121,"props":498,"children":500},{"id":499},"_3-单卡训练",[501,506],{"type":18,"tag":126,"props":502,"children":505},{"href":503,"rel":504},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#3.-%E5%8D%95%E5%8D%A1%E8%AE%AD%E7%BB%83",[130],[],{"type":24,"value":507},"3. 
单卡训练",{"type":18,"tag":32,"props":509,"children":510},{},[511],{"type":24,"value":512},"为了能够后续进行对比测试，这里我们先来进行单卡训练，以此作为基准。",{"type":18,"tag":135,"props":514,"children":516},{"id":515},"_31-代码部分",[517,522],{"type":18,"tag":126,"props":518,"children":521},{"href":519,"rel":520},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#3.1-%E4%BB%A3%E7%A0%81%E9%83%A8%E5%88%86",[130],[],{"type":24,"value":523},"3.1 代码部分",{"type":18,"tag":32,"props":525,"children":526},{},[527],{"type":24,"value":528},"代码说明：",{"type":18,"tag":530,"props":531,"children":532},"ol",{},[533,562,590],{"type":18,"tag":47,"props":534,"children":535},{},[536,538,544,546,552,554,561],{"type":24,"value":537},"网络结构采用的是",{"type":18,"tag":203,"props":539,"children":541},{"className":540},[],[542],{"type":24,"value":543},"ResNet-50",{"type":24,"value":545},"，读者可以在",{"type":18,"tag":203,"props":547,"children":549},{"className":548},[],[550],{"type":24,"value":551},"MindSpore Models",{"type":24,"value":553},"仓库进行获取，复制粘贴过来即可，",{"type":18,"tag":126,"props":555,"children":558},{"href":556,"rel":557},"https://gitee.com/mindspore/mindspore/blob/r1.1/model_zoo/official/cv/resnet/src/resnet.py",[130],[559],{"type":24,"value":560},"ResNet-50代码链接",{"type":24,"value":313},{"type":18,"tag":47,"props":563,"children":564},{},[565,567,573,575,582,583],{"type":24,"value":566},"数据集采用的是",{"type":18,"tag":203,"props":568,"children":570},{"className":569},[],[571],{"type":24,"value":572},"Fruit-360",{"type":24,"value":574},"数据集，有关该数据集的更详细介绍可以参看笔者之前的文章",{"type":18,"tag":126,"props":576,"children":579},{"href":577,"rel":578},"https://bbs.huaweicloud.com/forum/thread-190708-1-1.html",[130],[580],{"type":24,"value":581},"MindSpore易点通·精讲系列–数据集加载之ImageFolderDataset",{"type":24,"value":313},{"type":18,"tag":126,"props":584,"children":587},{"href":585,"rel":586},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#",[130],[588],{"type":24,"value":589},"数据集下载链接",{"type":18,"tag":47,"props":591,"childr
en":592},{},[593,595,601,603,609],{"type":24,"value":594},"读者注意将代码中的",{"type":18,"tag":203,"props":596,"children":598},{"className":597},[],[599],{"type":24,"value":600},"train_dataset_dir",{"type":24,"value":602},"和",{"type":18,"tag":203,"props":604,"children":606},{"className":605},[],[607],{"type":24,"value":608},"test_dataset_dir",{"type":24,"value":610},"替换为自己的文件目录。",{"type":18,"tag":32,"props":612,"children":613},{},[614],{"type":24,"value":615},"单卡训练的代码如下：",{"type":18,"tag":381,"props":617,"children":619},{"code":618},"import numpy as np\n\nfrom mindspore import context\nfrom mindspore import nn\nfrom mindspore.common import dtype as mstype\nfrom mindspore.common import set_seed\nfrom mindspore.common import Tensor\nfrom mindspore.communication import init, get_rank, get_group_size\nfrom mindspore.dataset import ImageFolderDataset\nfrom mindspore.dataset.transforms.c_transforms import Compose, TypeCast\nfrom mindspore.dataset.vision.c_transforms import HWC2CHW, Normalize, RandomCrop, RandomHorizontalFlip, Resize\nfrom mindspore.nn.loss import SoftmaxCrossEntropyWithLogits\nfrom mindspore.nn.optim import Momentum\nfrom mindspore.ops import operations as P\nfrom mindspore.ops import functional as F\nfrom mindspore.train import Model\nfrom mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor\nfrom scipy.stats import truncnorm\n\n# define reset50\n\ndef create_dataset(dataset_dir, mode=\"train\", decode=True, batch_size=32, repeat_num=1):\n    if mode == \"train\":\n        shuffle = True\n    else:\n        shuffle = False\n\n    dataset = ImageFolderDataset(\n        dataset_dir=dataset_dir, shuffle=shuffle, decode=decode)\n\n    mean = [127.5, 127.5, 127.5]\n    std = [127.5, 127.5, 127.5]\n    if mode == \"train\":\n        transforms_list = Compose(\n            [RandomCrop((32, 32), (4, 4, 4, 4)),\n             RandomHorizontalFlip(),\n             Resize((100, 100)),\n             Normalize(mean, std),\n             HWC2CHW()])\n 
   else:\n        transforms_list = Compose(\n            [Resize((128, 128)),\n             Normalize(mean, std),\n             HWC2CHW()])\n\n    cast_op = TypeCast(mstype.int32)\n\n    dataset = dataset.map(operations=transforms_list, input_columns=\"image\")\n    dataset = dataset.map(operations=cast_op, input_columns=\"label\")\n    dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)\n    dataset = dataset.repeat(repeat_num)\n\n    return dataset\n\n\ndef run_train():\n    context.set_context(mode=context.GRAPH_MODE, device_target=\"GPU\")\n    set_seed(0)\n\n    train_dataset_dir = \"/mnt/data_0002_24t/xingchaolong/dataset/Fruits_360/fruits-360_dataset/fruits-360/Training\"\n    test_dataset_dir = \"/mnt/data_0002_24t/xingchaolong/dataset/Fruits_360/fruits-360_dataset/fruits-360/Test\"\n    batch_size = 32\n\n    train_dataset = create_dataset(dataset_dir=train_dataset_dir, batch_size=batch_size)\n    test_dataset = create_dataset(dataset_dir=test_dataset_dir, mode=\"test\")\n    train_batch_num = train_dataset.get_dataset_size()\n    test_batch_num = test_dataset.get_dataset_size()\n    print(\"train dataset batch num: {}\".format(train_batch_num), flush=True)\n    print(\"test dataset batch num: {}\".format(test_batch_num), flush=True)\n\n    # build model\n    net = resnet50(class_num=131)\n    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction=\"mean\")\n    optim = Momentum(params=net.trainable_params(), learning_rate=0.01, momentum=0.9, loss_scale=1024.0)\n    model = Model(net, loss_fn=loss, optimizer=optim, metrics={\"accuracy\"})\n\n    # CheckPoint CallBack definition\n    config_ck = CheckpointConfig(save_checkpoint_steps=train_batch_num, keep_checkpoint_max=35)\n    ckpoint_cb = ModelCheckpoint(prefix=\"fruit_360_renet50\", directory=\"./ckpt/\", config=config_ck)\n    # LossMonitor is used to print loss value on screen\n    loss_cb = LossMonitor()\n\n    # model train\n    model.train(10, train_dataset, 
callbacks=[ckpoint_cb, loss_cb], dataset_sink_mode=True)\n\n    # model eval\n    result = model.eval(test_dataset)\n    print(\"eval result: {}\".format(result), flush=True)\n\n\ndef main():\n    run_train()\n\n\nif __name__ == \"__main__\":\n    main()\n\n",[620],{"type":18,"tag":203,"props":621,"children":622},{"__ignoreMap":7},[623],{"type":24,"value":618},{"type":18,"tag":135,"props":625,"children":627},{"id":626},"_32-训练部分",[628,633],{"type":18,"tag":126,"props":629,"children":632},{"href":630,"rel":631},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#3.2-%E8%AE%AD%E7%BB%83%E9%83%A8%E5%88%86",[130],[],{"type":24,"value":634},"3.2 训练部分",{"type":18,"tag":32,"props":636,"children":637},{},[638,640,646],{"type":24,"value":639},"保存代码到",{"type":18,"tag":203,"props":641,"children":643},{"className":642},[],[644],{"type":24,"value":645},"gpu_single_train.py",{"type":24,"value":647},"，使用如下命令进行训练：",{"type":18,"tag":381,"props":649,"children":651},{"code":650},"export CUDA_VISIBLE_DEVICES=0\npython3 gpu_single_train.py\n",[652],{"type":18,"tag":203,"props":653,"children":654},{"__ignoreMap":7},[655],{"type":24,"value":650},{"type":18,"tag":32,"props":657,"children":658},{},[659],{"type":24,"value":660},"训练过程输出内容如下：",{"type":18,"tag":381,"props":662,"children":664},{"code":663},"train dataset batch num: 2115\ntest dataset batch num: 709\nepoch: 1 step: 2115, loss is 4.219570636749268\nepoch: 2 step: 2115, loss is 3.7109947204589844\n......\nepoch: 9 step: 2115, loss is 2.66499400138855\nepoch: 10 step: 2115, loss is 2.540522336959839\neval result: {'accuracy': 0.676348730606488}\n",[665],{"type":18,"tag":203,"props":666,"children":667},{"__ignoreMap":7},[668],{"type":24,"value":663},{"type":18,"tag":32,"props":670,"children":671},{},[672,673,679],{"type":24,"value":393},{"type":18,"tag":203,"props":674,"children":676},{"className":675},[],[677],{"type":24,"value":678},"tree 
ckpt",{"type":24,"value":680},"命令，查看一下模型保存目录的情况，输出内容如下：",{"type":18,"tag":381,"props":682,"children":684},{"code":683},"ckpt/\n├── fruit_360_renet50-10_2115.ckpt\n├── fruit_360_renet50-1_2115.ckpt\n......\n├── fruit_360_renet50-9_2115.ckpt\n└── fruit_360_renet50-graph.meta\n",[685],{"type":18,"tag":203,"props":686,"children":687},{"__ignoreMap":7},[688],{"type":24,"value":683},{"type":18,"tag":121,"props":690,"children":692},{"id":691},"_4-多卡训练openmpi",[693,698],{"type":18,"tag":126,"props":694,"children":697},{"href":695,"rel":696},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#4.-%E5%A4%9A%E5%8D%A1%E8%AE%AD%E7%BB%83%E2%80%93openmpi",[130],[],{"type":24,"value":699},"4. 多卡训练–OpenMPI",{"type":18,"tag":32,"props":701,"children":702},{},[703],{"type":24,"value":704},"下面我们通过实际案例，介绍如何在GPU平台上，采用OpenMPI进行分布式训练。",{"type":18,"tag":135,"props":706,"children":708},{"id":707},"_41-代码部分",[709,714],{"type":18,"tag":126,"props":710,"children":713},{"href":711,"rel":712},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#4.1-%E4%BB%A3%E7%A0%81%E9%83%A8%E5%88%86",[130],[],{"type":24,"value":715},"4.1 
代码部分",{"type":18,"tag":32,"props":717,"children":718},{},[719],{"type":24,"value":528},{"type":18,"tag":43,"props":721,"children":722},{},[723,736,749,769],{"type":18,"tag":47,"props":724,"children":725},{},[726,728,734],{"type":24,"value":727},"前三点说明请参考",{"type":18,"tag":203,"props":729,"children":731},{"className":730},[],[732],{"type":24,"value":733},"3.1",{"type":24,"value":735},"部分的代码说明。",{"type":18,"tag":47,"props":737,"children":738},{},[739,741,747],{"type":24,"value":740},"多卡训练主要修改的是数据集读取和",{"type":18,"tag":203,"props":742,"children":744},{"className":743},[],[745],{"type":24,"value":746},"context",{"type":24,"value":748},"设置部分。",{"type":18,"tag":47,"props":750,"children":751},{},[752,754,760,761,767],{"type":24,"value":753},"数据集读取：需要指定",{"type":18,"tag":203,"props":755,"children":757},{"className":756},[],[758],{"type":24,"value":759},"num_shards",{"type":24,"value":602},{"type":18,"tag":203,"props":762,"children":764},{"className":763},[],[765],{"type":24,"value":766},"shard_id",{"type":24,"value":768},"，详细内容参考代码。",{"type":18,"tag":47,"props":770,"children":771},{},[772,777,779,785,787,793,795,801],{"type":18,"tag":203,"props":773,"children":775},{"className":774},[],[776],{"type":24,"value":746},{"type":24,"value":778},"设置：包含参数一致性和并行模式设定。参数一致性这里使用的是",{"type":18,"tag":203,"props":780,"children":782},{"className":781},[],[783],{"type":24,"value":784},"set_seed",{"type":24,"value":786},"来设定；并行模式通过",{"type":18,"tag":203,"props":788,"children":790},{"className":789},[],[791],{"type":24,"value":792},"set_auto_parallel_context",{"type":24,"value":794},"方法和",{"type":18,"tag":203,"props":796,"children":798},{"className":797},[],[799],{"type":24,"value":800},"parallel_mode",{"type":24,"value":802},"参数来进行设置。",{"type":18,"tag":32,"props":804,"children":805},{},[806],{"type":24,"value":807},"多卡训练的代码如下：",{"type":18,"tag":381,"props":809,"children":811},{"code":810},"import numpy as np\n\nfrom mindspore import context\nfrom mindspore import nn\nfrom mindspore.common 
import dtype as mstype\nfrom mindspore.common import set_seed\nfrom mindspore.common import Tensor\nfrom mindspore.communication import init, get_rank, get_group_size\nfrom mindspore.dataset import ImageFolderDataset\nfrom mindspore.dataset.transforms.c_transforms import Compose, TypeCast\nfrom mindspore.dataset.vision.c_transforms import HWC2CHW, Normalize, RandomCrop, RandomHorizontalFlip, Resize\nfrom mindspore.nn.loss import SoftmaxCrossEntropyWithLogits\nfrom mindspore.nn.optim import Momentum\nfrom mindspore.ops import operations as P\nfrom mindspore.ops import functional as F\nfrom mindspore.train import Model\nfrom mindspore.train.callback import CheckpointConfig, ModelCheckpoint, LossMonitor\nfrom scipy.stats import truncnorm\n\n# define reset50\n\ndef create_dataset(dataset_dir, mode=\"train\", decode=True, batch_size=32, repeat_num=1):\n    if mode == \"train\":\n        shuffle = True\n        rank_id = get_rank()\n        rank_size = get_group_size()\n    else:\n        shuffle = False\n        rank_id = None\n        rank_size = None\n\n    dataset = ImageFolderDataset(\n        dataset_dir=dataset_dir, shuffle=shuffle, decode=decode, num_shards=rank_size, shard_id=rank_id)\n\n    mean = [127.5, 127.5, 127.5]\n    std = [127.5, 127.5, 127.5]\n    if mode == \"train\":\n        transforms_list = Compose(\n            [RandomCrop((32, 32), (4, 4, 4, 4)),\n             RandomHorizontalFlip(),\n             Resize((100, 100)),\n             Normalize(mean, std),\n             HWC2CHW()])\n    else:\n        transforms_list = Compose(\n            [Resize((128, 128)),\n             Normalize(mean, std),\n             HWC2CHW()])\n\n    cast_op = TypeCast(mstype.int32)\n\n    dataset = dataset.map(operations=transforms_list, input_columns=\"image\")\n    dataset = dataset.map(operations=cast_op, input_columns=\"label\")\n    dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)\n    dataset = dataset.repeat(repeat_num)\n\n    return 
dataset\n\n\ndef run_train():\n    context.set_context(mode=context.GRAPH_MODE, device_target=\"GPU\")\n    init(\"nccl\")\n    rank_id = get_rank()\n    rank_size = get_group_size()\n    print(\"rank size: {}, rank id: {}\".format(rank_size, rank_id), flush=True)\n    set_seed(0)\n    context.set_auto_parallel_context(\n        device_num=rank_size, gradients_mean=True, parallel_mode=context.ParallelMode.DATA_PARALLEL)\n\n    train_dataset_dir = \"/mnt/data_0002_24t/xingchaolong/dataset/Fruits_360/fruits-360_dataset/fruits-360/Training\"\n    test_dataset_dir = \"/mnt/data_0002_24t/xingchaolong/dataset/Fruits_360/fruits-360_dataset/fruits-360/Test\"\n    batch_size = 32\n\n    train_dataset = create_dataset(dataset_dir=train_dataset_dir, batch_size=batch_size//rank_size)\n    test_dataset = create_dataset(dataset_dir=test_dataset_dir, mode=\"test\")\n    train_batch_num = train_dataset.get_dataset_size()\n    test_batch_num = test_dataset.get_dataset_size()\n    print(\"train dataset batch num: {}\".format(train_batch_num), flush=True)\n    print(\"test dataset batch num: {}\".format(test_batch_num), flush=True)\n\n    # build model\n    net = resnet50(class_num=131)\n    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction=\"mean\")\n    optim = Momentum(params=net.trainable_params(), learning_rate=0.01, momentum=0.9, loss_scale=1024.0)\n    model = Model(net, loss_fn=loss, optimizer=optim, metrics={\"accuracy\"})\n\n    # CheckPoint CallBack definition\n    config_ck = CheckpointConfig(save_checkpoint_steps=train_batch_num, keep_checkpoint_max=35)\n    ckpoint_cb = ModelCheckpoint(prefix=\"fruit_360_renet50_{}\".format(rank_id), directory=\"./ckpt/\", config=config_ck)\n    # LossMonitor is used to print loss value on screen\n    loss_cb = LossMonitor()\n\n    # model train\n    model.train(10, train_dataset, callbacks=[ckpoint_cb, loss_cb], dataset_sink_mode=True)\n\n    # model eval\n    result = model.eval(test_dataset)\n    print(\"eval result: 
{}\".format(result), flush=True)\n\n\ndef main():\n    run_train()\n\n\nif __name__ == \"__main__\":\n    main()\n\n",[812],{"type":18,"tag":203,"props":813,"children":814},{"__ignoreMap":7},[815],{"type":24,"value":810},{"type":18,"tag":135,"props":817,"children":819},{"id":818},"_42-训练部分",[820,825],{"type":18,"tag":126,"props":821,"children":824},{"href":822,"rel":823},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#4.2-%E8%AE%AD%E7%BB%83%E9%83%A8%E5%88%86",[130],[],{"type":24,"value":826},"4.2 训练部分",{"type":18,"tag":32,"props":828,"children":829},{},[830],{"type":24,"value":831},"下面来介绍如何使用多卡GPU训练。",{"type":18,"tag":833,"props":834,"children":836},"h4",{"id":835},"_421-4卡gpu训练",[837,842],{"type":18,"tag":126,"props":838,"children":841},{"href":839,"rel":840},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#4.2.1-4%E5%8D%A1gpu%E8%AE%AD%E7%BB%83",[130],[],{"type":24,"value":843},"4.2.1 4卡GPU训练",{"type":18,"tag":32,"props":845,"children":846},{},[847],{"type":24,"value":848},"使用如下命令，进行4卡GPU训练：",{"type":18,"tag":381,"props":850,"children":852},{"code":851},"export CUDA_VISIBLE_DEVICES=0,1,2,3\nmpirun -n 4 python3 gpu_distributed_train.py\n",[853],{"type":18,"tag":203,"props":854,"children":855},{"__ignoreMap":7},[856],{"type":24,"value":851},{"type":18,"tag":32,"props":858,"children":859},{},[860],{"type":24,"value":861},"训练过程中，输出内容如下：",{"type":18,"tag":381,"props":863,"children":865},{"code":864},"rank size: 4, rank id: 0\nrank size: 4, rank id: 1\nrank size: 4, rank id: 2\nrank size: 4, rank id: 3\ntrain dataset batch num: 2115\ntest dataset batch num: 709\ntrain dataset batch num: 2115\ntest dataset batch num: 709\ntrain dataset batch num: 2115\ntest dataset batch num: 709\ntrain dataset batch num: 2115\ntest dataset batch num: 709\n[WARNING] PRE_ACT(294248,7fa67e831740,python3):2022-07-13-17:11:24.528.381 [mindspore/ccsrc/backend/common/pass/communication_op_fusion.cc:198] GetAllReduceSplitSegment] Split threshold is 
0. AllReduce nodes will take default fusion strategy.\n[WARNING] PRE_ACT(294245,7f57993a5740,python3):2022-07-13-17:11:26.176.114 [mindspore/ccsrc/backend/common/pass/communication_op_fusion.cc:198] GetAllReduceSplitSegment] Split threshold is 0. AllReduce nodes will take default fusion strategy.\n[WARNING] PRE_ACT(294247,7f36f889b740,python3):2022-07-13-17:11:30.475.177 [mindspore/ccsrc/backend/common/pass/communication_op_fusion.cc:198] GetAllReduceSplitSegment] Split threshold is 0. AllReduce nodes will take default fusion strategy.\n[WARNING] PRE_ACT(294246,7f5f1820c740,python3):2022-07-13-17:11:31.271.259 [mindspore/ccsrc/backend/common/pass/communication_op_fusion.cc:198] GetAllReduceSplitSegment] Split threshold is 0. AllReduce nodes will take default fusion strategy.\nepoch: 1 step: 2115, loss is 4.536644458770752\nepoch: 1 step: 2115, loss is 4.347061634063721\nepoch: 1 step: 2115, loss is 4.557111740112305\nepoch: 1 step: 2115, loss is 4.467658519744873\n......\nepoch: 10 step: 2115, loss is 3.263073205947876\nepoch: 10 step: 2115, loss is 3.169656753540039\nepoch: 10 step: 2115, loss is 3.2040905952453613\nepoch: 10 step: 2115, loss is 3.812671184539795\neval result: {'accuracy': 0.48113540197461213}\neval result: {'accuracy': 0.5190409026798307}\neval result: {'accuracy': 0.4886283497884344}\neval result: {'accuracy': 0.5010578279266573}\n",[866],{"type":18,"tag":203,"props":867,"children":868},{"__ignoreMap":7},[869],{"type":24,"value":864},{"type":18,"tag":32,"props":871,"children":872},{},[873,874,879],{"type":24,"value":393},{"type":18,"tag":203,"props":875,"children":877},{"className":876},[],[878],{"type":24,"value":678},{"type":24,"value":680},{"type":18,"tag":381,"props":881,"children":883},{"code":882},"ckpt/\n├── fruit_360_renet50_0-10_2115.ckpt\n├── fruit_360_renet50_0-1_2115.ckpt\n├── fruit_360_renet50_0-2_2115.ckpt\n├── fruit_360_renet50_0-3_2115.ckpt\n├── fruit_360_renet50_0-4_2115.ckpt\n├── fruit_360_renet50_0-5_2115.ckpt\n├── 
fruit_360_renet50_0-6_2115.ckpt\n├── fruit_360_renet50_0-7_2115.ckpt\n├── fruit_360_renet50_0-8_2115.ckpt\n├── fruit_360_renet50_0-9_2115.ckpt\n├── fruit_360_renet50_0-graph.meta\n......\n├── fruit_360_renet50_3-10_2115.ckpt\n├── fruit_360_renet50_3-1_2115.ckpt\n├── fruit_360_renet50_3-2_2115.ckpt\n├── fruit_360_renet50_3-3_2115.ckpt\n├── fruit_360_renet50_3-4_2115.ckpt\n├── fruit_360_renet50_3-5_2115.ckpt\n├── fruit_360_renet50_3-6_2115.ckpt\n├── fruit_360_renet50_3-7_2115.ckpt\n├── fruit_360_renet50_3-8_2115.ckpt\n├── fruit_360_renet50_3-9_2115.ckpt\n└── fruit_360_renet50_3-graph.meta\n",[884],{"type":18,"tag":203,"props":885,"children":886},{"__ignoreMap":7},[887],{"type":24,"value":882},{"type":18,"tag":833,"props":889,"children":891},{"id":890},"_422-2卡gpu训练",[892,897],{"type":18,"tag":126,"props":893,"children":896},{"href":894,"rel":895},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#4.2.2-2%E5%8D%A1gpu%E8%AE%AD%E7%BB%83",[130],[],{"type":24,"value":898},"4.2.2 2卡GPU训练",{"type":18,"tag":32,"props":900,"children":901},{},[902],{"type":24,"value":903},"为了进行对比，再来进行2卡GPU训练，命令如下：",{"type":18,"tag":32,"props":905,"children":906},{},[907],{"type":24,"value":908},"这里为了验证普遍性，并非依序选择GPU。",{"type":18,"tag":381,"props":910,"children":912},{"code":911},"export CUDA_VISIBLE_DEVICES=2,3\nmpirun -n 2 python3 gpu_distributed_train.py\n",[913],{"type":18,"tag":203,"props":914,"children":915},{"__ignoreMap":7},[916],{"type":24,"value":911},{"type":18,"tag":32,"props":918,"children":919},{},[920],{"type":24,"value":861},{"type":18,"tag":381,"props":922,"children":924},{"code":923},"rank size: 2, rank id: 0\nrank size: 2, rank id: 1\ntrain dataset batch num: 2115\ntest dataset batch num: 709\ntrain dataset batch num: 2115\ntest dataset batch num: 709\n[WARNING] PRE_ACT(295459,7ff930118740,python3):2022-07-13-17:31:07.210.231 [mindspore/ccsrc/backend/common/pass/communication_op_fusion.cc:198] GetAllReduceSplitSegment] Split threshold is 0. 
AllReduce nodes will take default fusion strategy.\n[WARNING] PRE_ACT(295460,7f5fed564740,python3):2022-07-13-17:31:07.649.536 [mindspore/ccsrc/backend/common/pass/communication_op_fusion.cc:198] GetAllReduceSplitSegment] Split threshold is 0. AllReduce nodes will take default fusion strategy.\nepoch: 1 step: 2115, loss is 4.391518592834473\nepoch: 1 step: 2115, loss is 4.337993621826172\n......\nepoch: 10 step: 2115, loss is 2.7631659507751465\nepoch: 10 step: 2115, loss is 3.0124118328094482\neval result: {'accuracy': 0.6057827926657263}\neval result: {'accuracy': 0.6202397743300423}\n",[925],{"type":18,"tag":203,"props":926,"children":927},{"__ignoreMap":7},[928],{"type":24,"value":923},{"type":18,"tag":32,"props":930,"children":931},{},[932,933,938],{"type":24,"value":393},{"type":18,"tag":203,"props":934,"children":936},{"className":935},[],[937],{"type":24,"value":678},{"type":24,"value":680},{"type":18,"tag":381,"props":940,"children":942},{"code":941},"ckpt/\n├── fruit_360_renet50_0-10_2115.ckpt\n├── fruit_360_renet50_0-1_2115.ckpt\n├── fruit_360_renet50_0-2_2115.ckpt\n├── fruit_360_renet50_0-3_2115.ckpt\n├── fruit_360_renet50_0-4_2115.ckpt\n├── fruit_360_renet50_0-5_2115.ckpt\n├── fruit_360_renet50_0-6_2115.ckpt\n├── fruit_360_renet50_0-7_2115.ckpt\n├── fruit_360_renet50_0-8_2115.ckpt\n├── fruit_360_renet50_0-9_2115.ckpt\n├── fruit_360_renet50_0-graph.meta\n├── fruit_360_renet50_1-10_2115.ckpt\n├── fruit_360_renet50_1-1_2115.ckpt\n├── fruit_360_renet50_1-2_2115.ckpt\n├── fruit_360_renet50_1-3_2115.ckpt\n├── fruit_360_renet50_1-4_2115.ckpt\n├── fruit_360_renet50_1-5_2115.ckpt\n├── fruit_360_renet50_1-6_2115.ckpt\n├── fruit_360_renet50_1-7_2115.ckpt\n├── fruit_360_renet50_1-8_2115.ckpt\n├── fruit_360_renet50_1-9_2115.ckpt\n└── 
fruit_360_renet50_1-graph.meta\n",[943],{"type":18,"tag":203,"props":944,"children":945},{"__ignoreMap":7},[946],{"type":24,"value":941},{"type":18,"tag":833,"props":948,"children":950},{"id":949},"_423-多卡对比说明",[951,956],{"type":18,"tag":126,"props":952,"children":955},{"href":953,"rel":954},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#4.2.3-%E5%A4%9A%E5%8D%A1%E5%AF%B9%E6%AF%94%E8%AF%B4%E6%98%8E",[130],[],{"type":24,"value":957},"4.2.3 多卡对比说明",{"type":18,"tag":43,"props":959,"children":960},{},[961,974,1016],{"type":18,"tag":47,"props":962,"children":963},{},[964,966,972],{"type":24,"value":965},"结合",{"type":18,"tag":203,"props":967,"children":969},{"className":968},[],[970],{"type":24,"value":971},"3.2",{"type":24,"value":973},"部分，进行4卡GPU训练和2卡GPU训练的对比。",{"type":18,"tag":47,"props":975,"children":976},{},[977,979,985,987,993,995,1000,1002,1007,1009,1014],{"type":24,"value":978},"三种情况下，分别将",{"type":18,"tag":203,"props":980,"children":982},{"className":981},[],[983],{"type":24,"value":984},"batch_size",{"type":24,"value":986},"设置为了32、8、16，对应到的",{"type":18,"tag":203,"props":988,"children":990},{"className":989},[],[991],{"type":24,"value":992},"batch_num",{"type":24,"value":994},"不变。也可以认为是在",{"type":18,"tag":203,"props":996,"children":998},{"className":997},[],[999],{"type":24,"value":335},{"type":24,"value":1001},"显存不足以支持更大",{"type":18,"tag":203,"props":1003,"children":1005},{"className":1004},[],[1006],{"type":24,"value":984},{"type":24,"value":1008},"时，通过多卡来实现更大",{"type":18,"tag":203,"props":1010,"children":1012},{"className":1011},[],[1013],{"type":24,"value":984},{"type":24,"value":1015},"的方案。",{"type":18,"tag":47,"props":1017,"children":1018},{},[1019,1021,1027],{"type":24,"value":1020},"从实际训练情况来看（都训练了10个epoch），单卡的效果最好，2卡次之，4卡最差。导致这种情况的原因是网络中使用到了",{"type":18,"tag":203,"props":1022,"children":1024},{"className":1023},[],[1025],{"type":24,"value":1026},"BatchNorm2d",{"type":24,"value":1028},"算子，而在多卡情况下，无法跨卡计算，从而导致精度上的差别。在GPU硬件下，笔者暂时并没有找到
合理的解决方案。",{"type":18,"tag":121,"props":1030,"children":1032},{"id":1031},"_5-多卡训练非openmpi",[1033,1038],{"type":18,"tag":126,"props":1034,"children":1037},{"href":1035,"rel":1036},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#5.-%E5%A4%9A%E5%8D%A1%E8%AE%AD%E7%BB%83%E2%80%93%E9%9D%9Eopenmpi",[130],[],{"type":24,"value":1039},"5. 多卡训练–非OpenMPI",{"type":18,"tag":32,"props":1041,"children":1042},{},[1043,1044,1050,1052,1057,1059,1064,1066,1071,1073,1078,1080,1085],{"type":24,"value":329},{"type":18,"tag":203,"props":1045,"children":1047},{"className":1046},[],[1048],{"type":24,"value":1049},"4",{"type":24,"value":1051},"中我们介绍了依赖",{"type":18,"tag":203,"props":1053,"children":1055},{"className":1054},[],[1056],{"type":24,"value":350},{"type":24,"value":1058},"如何来进行",{"type":18,"tag":203,"props":1060,"children":1062},{"className":1061},[],[1063],{"type":24,"value":335},{"type":24,"value":1065},"多卡训练，同时",{"type":18,"tag":203,"props":1067,"children":1069},{"className":1068},[],[1070],{"type":24,"value":242},{"type":24,"value":1072},"也支持不依赖",{"type":18,"tag":203,"props":1074,"children":1076},{"className":1075},[],[1077],{"type":24,"value":350},{"type":24,"value":1079},"来进行",{"type":18,"tag":203,"props":1081,"children":1083},{"className":1082},[],[1084],{"type":24,"value":335},{"type":24,"value":1086},"多卡训练。官方对此的说明如下：",{"type":18,"tag":32,"props":1088,"children":1089},{},[1090,1092,1098,1100,1106],{"type":24,"value":1091},"出于训练时的安全及可靠性要求，",{"type":18,"tag":203,"props":1093,"children":1095},{"className":1094},[],[1096],{"type":24,"value":1097},"MindSpore 
GPU",{"type":24,"value":1099},"还支持",{"type":18,"tag":1101,"props":1102,"children":1103},"strong",{},[1104],{"type":24,"value":1105},"不依赖OpenMPI的分布式训练",{"type":24,"value":313},{"type":18,"tag":32,"props":1108,"children":1109},{},[1110,1115,1117,1123,1125,1130,1132,1137,1139,1144],{"type":18,"tag":203,"props":1111,"children":1113},{"className":1112},[],[1114],{"type":24,"value":350},{"type":24,"value":1116},"在分布式训练的场景中，起到在",{"type":18,"tag":203,"props":1118,"children":1120},{"className":1119},[],[1121],{"type":24,"value":1122},"Host",{"type":24,"value":1124},"侧同步数据以及进程间组网的功能；",{"type":18,"tag":203,"props":1126,"children":1128},{"className":1127},[],[1129],{"type":24,"value":242},{"type":24,"value":1131},"通过",{"type":18,"tag":1101,"props":1133,"children":1134},{},[1135],{"type":24,"value":1136},"复用Parameter Server模式训练架构",{"type":24,"value":1138},"，取代了",{"type":18,"tag":203,"props":1140,"children":1142},{"className":1141},[],[1143],{"type":24,"value":350},{"type":24,"value":1145},"能力。",{"type":18,"tag":32,"props":1147,"children":1148},{},[1149,1151,1157,1159,1165,1167,1173],{"type":24,"value":1150},"不过",{"type":18,"tag":203,"props":1152,"children":1154},{"className":1153},[],[1155],{"type":24,"value":1156},"Parameter Server",{"type":24,"value":1158},"相关的文档及代码示例不够充分。笔者尝试采用此种方式进行训练，参考了官方文档、",{"type":18,"tag":203,"props":1160,"children":1162},{"className":1161},[],[1163],{"type":24,"value":1164},"gitee",{"type":24,"value":1166},"上面的测试用例，最终未能顺利完成整个",{"type":18,"tag":203,"props":1168,"children":1170},{"className":1169},[],[1171],{"type":24,"value":1172},"pipeline",{"type":24,"value":313},{"type":18,"tag":121,"props":1175,"children":1177},{"id":1176},"_6-本文总结",[1178,1183],{"type":18,"tag":126,"props":1179,"children":1182},{"href":1180,"rel":1181},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#6.-%E6%9C%AC%E6%96%87%E6%80%BB%E7%BB%93",[130],[],{"type":24,"value":1184},"6. 
本文总结",{"type":18,"tag":32,"props":1186,"children":1187},{},[1188,1190,1195,1197,1202,1204,1209,1210,1215],{"type":24,"value":1189},"本文重点介绍了在",{"type":18,"tag":203,"props":1191,"children":1193},{"className":1192},[],[1194],{"type":24,"value":335},{"type":24,"value":1196},"硬件环境下，如何依赖",{"type":18,"tag":203,"props":1198,"children":1200},{"className":1199},[],[1201],{"type":24,"value":350},{"type":24,"value":1203},"进行多卡训练。对于非依赖",{"type":18,"tag":203,"props":1205,"children":1207},{"className":1206},[],[1208],{"type":24,"value":350},{"type":24,"value":352},{"type":18,"tag":203,"props":1211,"children":1213},{"className":1212},[],[1214],{"type":24,"value":1156},{"type":24,"value":1216},"本文也有所涉及，但由于官方文档的缺失和相应代码不足，无法形成可行案例。",{"type":18,"tag":121,"props":1218,"children":1220},{"id":1219},"_7-遇到问题",[1221,1226],{"type":18,"tag":126,"props":1222,"children":1225},{"href":1223,"rel":1224},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#7.-%E9%81%87%E5%88%B0%E9%97%AE%E9%A2%98",[130],[],{"type":24,"value":1227},"7. 遇到问题",{"type":18,"tag":32,"props":1229,"children":1230},{},[1231,1236],{"type":18,"tag":203,"props":1232,"children":1234},{"className":1233},[],[1235],{"type":24,"value":1156},{"type":24,"value":1237},"模式下的官方文档跳跃性太大，相关的测试用例缺失中间过程代码，希望能够完善这部分的文档和代码。",{"type":18,"tag":121,"props":1239,"children":1241},{"id":1240},"_8-本文参考",[1242,1247],{"type":18,"tag":126,"props":1243,"children":1246},{"href":1244,"rel":1245},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=194370#8.-%E6%9C%AC%E6%96%87%E5%8F%82%E8%80%83",[130],[],{"type":24,"value":1248},"8. 
本文参考",{"type":18,"tag":43,"props":1250,"children":1251},{},[1252,1261,1271,1281],{"type":18,"tag":47,"props":1253,"children":1254},{},[1255],{"type":18,"tag":126,"props":1256,"children":1258},{"href":585,"rel":1257},[130],[1259],{"type":24,"value":1260},"深度学习中的分布式训练",{"type":18,"tag":47,"props":1262,"children":1263},{},[1264],{"type":18,"tag":126,"props":1265,"children":1268},{"href":1266,"rel":1267},"https://www.mindspore.cn/tutorials/experts/zh-CN/r1.7/parallel/introduction.html",[130],[1269],{"type":24,"value":1270},"MindSpore分布式并行总览",{"type":18,"tag":47,"props":1272,"children":1273},{},[1274],{"type":18,"tag":126,"props":1275,"children":1278},{"href":1276,"rel":1277},"https://www.mindspore.cn/tutorials/experts/zh-CN/r1.7/parallel/train_gpu.html",[130],[1279],{"type":24,"value":1280},"MindSpore分布式并行训练基础样例（GPU）",{"type":18,"tag":47,"props":1282,"children":1283},{},[1284],{"type":18,"tag":126,"props":1285,"children":1288},{"href":1286,"rel":1287},"https://www.mindspore.cn/docs/zh-CN/r1.7/design/parameter_server_training.html",[130],[1289],{"type":24,"value":1290},"MindSpore Parameter 
Server模式",{"type":18,"tag":32,"props":1292,"children":1293},{},[1294],{"type":24,"value":1295},"本文为原创文章，版权归作者所有，未经授权不得转载！",{"title":7,"searchDepth":1297,"depth":1297,"links":1298},4,[1299,1305,1310,1314,1322,1323,1324,1325],{"id":123,"depth":1300,"text":133,"children":1301},2,[1302,1304],{"id":137,"depth":1303,"text":145},3,{"id":190,"depth":1303,"text":198},{"id":265,"depth":1300,"text":273,"children":1306},[1307,1308,1309],{"id":276,"depth":1303,"text":284},{"id":316,"depth":1303,"text":324},{"id":412,"depth":1303,"text":420},{"id":499,"depth":1300,"text":507,"children":1311},[1312,1313],{"id":515,"depth":1303,"text":523},{"id":626,"depth":1303,"text":634},{"id":691,"depth":1300,"text":699,"children":1315},[1316,1317],{"id":707,"depth":1303,"text":715},{"id":818,"depth":1303,"text":826,"children":1318},[1319,1320,1321],{"id":835,"depth":1297,"text":843},{"id":890,"depth":1297,"text":898},{"id":949,"depth":1297,"text":957},{"id":1031,"depth":1300,"text":1039},{"id":1176,"depth":1300,"text":1184},{"id":1219,"depth":1300,"text":1227},{"id":1240,"depth":1300,"text":1248},"markdown","content:technology-blogs:zh:1643.md","content","technology-blogs/zh/1643.md","technology-blogs/zh/1643","md",1776506114696]