[{"data":1,"prerenderedAt":514},["ShallowReactive",2],{"content-query-tKWofSh141":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":508,"_id":509,"_source":510,"_file":511,"_stem":512,"_extension":513},"/technology-blogs/zh/1586","zh",false,"","【AI工程】05-基于MindSpore的Resnet-50模型分布式训练实践","对于训练要求较高的模型，使用分布式训练是更好的选择。","2022-06-29","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/06/30/d8d37ee16b7347e89041c5c9cdf4420d.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":493},"root",[17,25,31,48,93,99,113,119,124,134,139,188,194,199,204,212,217,235,241,246,254,259,272,278,292,298,303,308,316,321,327,332,355,360,368,372,385,391,396,404,409,427,432,440,445,450,468,473,481],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"ai工程05-基于mindspore的resnet-50模型分布式训练实践",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":29},"h2",{"id":28},"概述",[30],{"type":24,"value":28},{"type":18,"tag":32,"props":33,"children":34},"p",{},[35,37,46],{"type":24,"value":36},"在上一篇文章《",{"type":18,"tag":38,"props":39,"children":43},"a",{"href":40,"rel":41},"https://zhuanlan.zhihu.com/p/528964900",[42],"nofollow",[44],{"type":24,"value":45},"为什么AI需要分布式并行？",{"type":24,"value":47},"》中，我们介绍了AI算力为什么需要分布式并行、分布式并行的策略、以及MindSpore框架实现自动、半自动并行的原理。在本篇文章中，我们尝试基于MindSpore，使用自动并行策略完成Resnet-50模型的分布式训练，该实践步骤如下所示。",{"type":18,"tag":49,"props":50,"children":51},"ol",{},[52,58,63,68,73,78,83,88],{"type":18,"tag":53,"props":54,"children":55},"li",{},[56],{"type":24,"value":57},"准备数据集：下载CIFAR-10数据集作为训练数据集。",{"type":18,"tag":53,"props":59,"children":60},{},[61],{"type":24,"value":62},"配置分布式环境：配置昇腾910 
8卡环境。",{"type":18,"tag":53,"props":64,"children":65},{},[66],{"type":24,"value":67},"调用集合通信库：引入HCCL，完成多卡间通信初始化。",{"type":18,"tag":53,"props":69,"children":70},{},[71],{"type":24,"value":72},"加载数据集：基于数据并行模式加载训练数据集。",{"type":18,"tag":53,"props":74,"children":75},{},[76],{"type":24,"value":77},"定义网络：定义Resnet-50网络。",{"type":18,"tag":53,"props":79,"children":80},{},[81],{"type":24,"value":82},"定义损失函数及优化器：定义针对分布式并行场景下的损失函数和优化器。",{"type":18,"tag":53,"props":84,"children":85},{},[86],{"type":24,"value":87},"构建网络训练代码：定义分布式并行策略以及训练代码。",{"type":18,"tag":53,"props":89,"children":90},{},[91],{"type":24,"value":92},"训练脚本：完成训练脚本并执行训练。",{"type":18,"tag":26,"props":94,"children":96},{"id":95},"第1步准备数据集",[97],{"type":24,"value":98},"第1步：准备数据集",{"type":18,"tag":32,"props":100,"children":101},{},[102,104,111],{"type":24,"value":103},"首先需要下载ResNet50训练的CIFAR-10数据集，该数据集由10类32*32的彩色图片组成，每类包含6000张图片。其中训练集共50000张图片，测试集共10000张图片。将数据集下载并解压到本地路径下，解压后的文件夹为cifar-10-batches-bin，数据集",{"type":18,"tag":38,"props":105,"children":108},{"href":106,"rel":107},"https://link.zhihu.com/?target=http%3A//www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz",[42],[109],{"type":24,"value":110},"下载链接",{"type":24,"value":112}," 。",{"type":18,"tag":26,"props":114,"children":116},{"id":115},"第2步-配置分布式环境变量",[117],{"type":24,"value":118},"第2步： 配置分布式环境变量",{"type":18,"tag":32,"props":120,"children":121},{},[122],{"type":24,"value":123},"在裸机环境进行分布式训练时，需要配置当前多卡环境的组网信息文件。以Ascend 910 AI处理器为例，1个8卡环境的json配置文件示例如下，本样例将该配置文件命名为rank_table_8pcs.json。",{"type":18,"tag":125,"props":126,"children":128},"pre",{"code":127}," {      \n    \"board_id\": \"0x0000\",      \n    \"chip_info\": \"910\",      \n    \"deploy_mode\": \"lab\",      \n    \"group_count\": \"1\",      \n    \"group_list\": [      \n        {      \n            \"device_num\": \"8\",      \n            \"server_num\": \"1\",      \n            \"group_name\": \"\",      \n            \"instance_count\": \"8\",      \n            \"instance_list\": [...]      
\n        }      \n    ],      \n    \"para_plane_nic_location\": \"device\",      \n    \"para_plane_nic_name\": [\"eth0\",\"eth1\",\"eth2\",\"eth3\",\"eth4\",\"eth5\",\"eth6\",\"eth7\"],      \n    \"para_plane_nic_num\": \"8\",      \n    \"status\": \"completed\"      \n}      \n",[129],{"type":18,"tag":130,"props":131,"children":132},"code",{"__ignoreMap":7},[133],{"type":24,"value":127},{"type":18,"tag":32,"props":135,"children":136},{},[137],{"type":24,"value":138},"其中，以下参数需要根据实际训练环境修改：",{"type":18,"tag":140,"props":141,"children":142},"ul",{},[143,163,168,173,178,183],{"type":18,"tag":53,"props":144,"children":145},{},[146,148,154,156,161],{"type":24,"value":147},"board_id：表示当前运行的环境，",{"type":18,"tag":149,"props":150,"children":151},"strong",{},[152],{"type":24,"value":153},"x86",{"type":24,"value":155},"设为“0x0000”，",{"type":18,"tag":149,"props":157,"children":158},{},[159],{"type":24,"value":160},"arm",{"type":24,"value":162},"设为“0x0020”。",{"type":18,"tag":53,"props":164,"children":165},{},[166],{"type":24,"value":167},"server_num：表示机器数量， server_id表示本机IP地址。",{"type":18,"tag":53,"props":169,"children":170},{},[171],{"type":24,"value":172},"device_num、para_plane_nic_num及instance_count表示卡的数量。",{"type":18,"tag":53,"props":174,"children":175},{},[176],{"type":24,"value":177},"rank_id：表示卡逻辑序号，固定从0开始编号，device_id表示卡物理序号，即卡所在机器中的实际序号。",{"type":18,"tag":53,"props":179,"children":180},{},[181],{"type":24,"value":182},"device_ip：表示集成网卡的IP地址，可以在当前机器执行指令cat /etc/hccn.conf，address_x的键值就是网卡IP地址。",{"type":18,"tag":53,"props":184,"children":185},{},[186],{"type":24,"value":187},"para_plane_nic_name：对应网卡名称。",{"type":18,"tag":26,"props":189,"children":191},{"id":190},"第3步调用集合通信库",[192],{"type":24,"value":193},"第3步：调用集合通信库",{"type":18,"tag":32,"props":195,"children":196},{},[197],{"type":24,"value":198},"MindSpore分布式并行训练的通信使用了华为集合通信库Huawei Collective Communication Library（以下简称HCCL），可以在Ascend 
AI处理器配套的软件包中找到。同时mindspore.communication.management中封装了HCCL提供的集合通信接口，方便用户配置分布式信息。HCCL实现了基于Ascend AI处理器的多机多卡通信，我们列出使用分布式服务常见的一些使用限制，详细的可以查看HCCL对应的使用文档。",{"type":18,"tag":32,"props":200,"children":201},{},[202],{"type":24,"value":203},"下面是调用集合通信库样例代码：",{"type":18,"tag":125,"props":205,"children":207},{"code":206},"import os      \nfrom mindspore import context      \nfrom mindspore.communication.management import init      \nif __name__ == \"__main__\":      \n     context.set_context(mode=context.GRAPH_MODE, device_target=\"Ascend\", device_id=int(os.environ[\"DEVICE_ID\"]))      \n     init()      \n     ...      \n",[208],{"type":18,"tag":130,"props":209,"children":210},{"__ignoreMap":7},[211],{"type":24,"value":206},{"type":18,"tag":32,"props":213,"children":214},{},[215],{"type":24,"value":216},"其中，",{"type":18,"tag":140,"props":218,"children":219},{},[220,225,230],{"type":18,"tag":53,"props":221,"children":222},{},[223],{"type":24,"value":224},"mode=context.GRAPH_MODE：使用分布式训练需要指定“运行模式”为“图模式”（PyNative模式不支持并行）。",{"type":18,"tag":53,"props":226,"children":227},{},[228],{"type":24,"value":229},"device_id：卡的物理序号，即卡所在机器中的实际序号。",{"type":18,"tag":53,"props":231,"children":232},{},[233],{"type":24,"value":234},"init：使能HCCL通信，并完成分布式训练初始化操作。",{"type":18,"tag":26,"props":236,"children":238},{"id":237},"第4步基于数据并行模式加载数据集",[239],{"type":24,"value":240},"第4步：基于数据并行模式加载数据集",{"type":18,"tag":32,"props":242,"children":243},{},[244],{"type":24,"value":245},"分布式训练时，数据是以数据并行的方式导入的。下面的代码通过数据并行的方式加载了CIFAR-10数据集，代码中的data_path是指数据集的路径，即cifar-10-batches-bin文件夹的路径。",{"type":18,"tag":125,"props":247,"children":249},{"code":248},"import mindspore.common.dtype as mstype      \nimport mindspore.dataset as ds      \nimport mindspore.dataset.transforms.c_transforms as C      \nimport mindspore.dataset.transforms.vision.c_transforms as vision      \nfrom mindspore.communication.management import get_rank, get_group_size      \ndef create_dataset(data_path, repeat_num=1, batch_size=32, rank_id=0, 
rank_size=1):      \n    resize_height = 224      \n    resize_width = 224      \n    rescale = 1.0 / 255.0      \n    shift = 0.0      \n         \n    # get rank_id and rank_size      \n    rank_id = get_rank()      \n    rank_size = get_group_size()      \n    data_set = ds.Cifar10Dataset(data_path, num_shards=rank_size, shard_id=rank_id)      \n         \n    # define map operations      \n    random_crop_op = vision.RandomCrop((32, 32), (4, 4, 4, 4))      \n    random_horizontal_op = vision.RandomHorizontalFlip()      \n    resize_op = vision.Resize((resize_height, resize_width))      \n    rescale_op = vision.Rescale(rescale, shift)      \n    normalize_op = vision.Normalize((0.4465, 0.4822, 0.4914), (0.2010, 0.1994, 0.2023))      \n    changeswap_op = vision.HWC2CHW()      \n    type_cast_op = C.TypeCast(mstype.int32)      \n    c_trans = [random_crop_op, random_horizontal_op]      \n    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]      \n    # apply map operations on images      \n    data_set = data_set.map(input_columns=\"label\", operations=type_cast_op)      \n    data_set = data_set.map(input_columns=\"image\", operations=c_trans)      \n    # apply shuffle operations      \n    data_set = data_set.shuffle(buffer_size=10)      \n    # apply batch operations      \n    data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)      \n    # apply repeat operations      \n    data_set = data_set.repeat(repeat_num)      \n    return data_set   
\n",[250],{"type":18,"tag":130,"props":251,"children":252},{"__ignoreMap":7},[253],{"type":24,"value":248},{"type":18,"tag":32,"props":255,"children":256},{},[257],{"type":24,"value":258},"与单机不同的是，在数据集接口需要传入num_shards和shard_id参数，分别对应卡的数量和逻辑序号，建议通过HCCL接口获取：",{"type":18,"tag":140,"props":260,"children":261},{},[262,267],{"type":18,"tag":53,"props":263,"children":264},{},[265],{"type":24,"value":266},"get_rank：获取当前设备在集群中的ID。",{"type":18,"tag":53,"props":268,"children":269},{},[270],{"type":24,"value":271},"get_group_size：获取集群中的设备数量。",{"type":18,"tag":26,"props":273,"children":275},{"id":274},"第5步定义网络",[276],{"type":24,"value":277},"第5步：定义网络",{"type":18,"tag":32,"props":279,"children":280},{},[281,283,290],{"type":24,"value":282},"数据并行及自动并行模式下，网络定义方式与单机一致。代码请参考",{"type":18,"tag":38,"props":284,"children":287},{"href":285,"rel":286},"https://zhuanlan.zhihu.com/ResNet50%E5%AE%9E%E7%8E%B0",[42],[288],{"type":24,"value":289},"ResNet50实现",{"type":24,"value":291},"。",{"type":18,"tag":26,"props":293,"children":295},{"id":294},"第6步定义损失函数及优化器",[296],{"type":24,"value":297},"第6步：定义损失函数及优化器",{"type":18,"tag":32,"props":299,"children":300},{},[301],{"type":24,"value":302},"自动并行以算子为粒度切分模型，通过算法搜索得到最优并行策略，所以与单机训练不同的是，为了有更好的并行训练效果，损失函数建议使用小算子来实现。",{"type":18,"tag":32,"props":304,"children":305},{},[306],{"type":24,"value":307},"在Loss部分，我们采用SoftmaxCrossEntropyWithLogits的展开形式，即按照数学公式，将其展开为多个小算子进行实现，样例代码如下：",{"type":18,"tag":125,"props":309,"children":311},{"code":310},"from mindspore.ops import operations as P      \nfrom mindspore import Tensor      \nimport mindspore.ops.functional as F      \nimport mindspore.common.dtype as mstype      \nimport mindspore.nn as nn      \nclass SoftmaxCrossEntropyExpand(nn.Cell):      \n    def __init__(self, sparse=False):      \n        super(SoftmaxCrossEntropyExpand, self).__init__()      \n        self.exp = P.Exp()      \n        self.sum = P.ReduceSum(keep_dims=True)      \n        self.onehot = P.OneHot()      \n        self.on_value = Tensor(1.0, 
mstype.float32)      \n        self.off_value = Tensor(0.0, mstype.float32)      \n        self.div = P.Div()      \n        self.log = P.Log()      \n        self.sum_cross_entropy = P.ReduceSum(keep_dims=False)      \n        self.mul = P.Mul()      \n        self.mul2 = P.Mul()      \n        self.mean = P.ReduceMean(keep_dims=False)      \n        self.sparse = sparse      \n        self.max = P.ReduceMax(keep_dims=True)      \n        self.sub = P.Sub()      \n             \n    def construct(self, logit, label):      \n        logit_max = self.max(logit, -1)      \n        exp = self.exp(self.sub(logit, logit_max))      \n        exp_sum = self.sum(exp, -1)      \n        softmax_result = self.div(exp, exp_sum)      \n        if self.sparse:      \n            label = self.onehot(label, F.shape(logit)[1], self.on_value, self.off_value)      \n        softmax_result_log = self.log(softmax_result)      \n        loss = self.sum_cross_entropy((self.mul(softmax_result_log, label)), -1)      \n        loss = self.mul2(F.scalar_to_array(-1.0), loss)      \n        loss = self.mean(loss, -1)      \n        return 
loss\n",[312],{"type":18,"tag":130,"props":313,"children":314},{"__ignoreMap":7},[315],{"type":24,"value":310},{"type":18,"tag":32,"props":317,"children":318},{},[319],{"type":24,"value":320},"定义优化器：采用Momentum优化器作为参数更新工具，这里定义与单机一致，不再展开，具体可以参考样例代码中的实现。",{"type":18,"tag":26,"props":322,"children":324},{"id":323},"第7步训练网络",[325],{"type":24,"value":326},"第7步：训练网络",{"type":18,"tag":32,"props":328,"children":329},{},[330],{"type":24,"value":331},"context.set_auto_parallel_context是配置并行训练参数的接口，必须在Model初始化前调用。如用户未指定参数，框架会自动根据并行模式为用户设置参数的经验值。如数据并行模式下，parameter_broadcast默认打开。主要参数包括：",{"type":18,"tag":140,"props":333,"children":334},{},[335,340,345,350],{"type":18,"tag":53,"props":336,"children":337},{},[338],{"type":24,"value":339},"parallel_mode：分布式并行模式，默认为单机模式ParallelMode.STAND_ALONE。可选数据并行ParallelMode.DATA_PARALLEL及自动并行ParallelMode.AUTO_PARALLEL。",{"type":18,"tag":53,"props":341,"children":342},{},[343],{"type":24,"value":344},"parameter_broadcast： 参数初始化广播开关，DATA_PARALLEL和HYBRID_PARALLEL模式下，默认值为True。",{"type":18,"tag":53,"props":346,"children":347},{},[348],{"type":24,"value":349},"mirror_mean：反向计算时，框架内部会将数据并行参数分散在多台机器的梯度值进行收集，得到全局梯度值后再传入优化器中更新。默认值为False，True对应allreduce_mean操作，False对应allreduce_sum操作。",{"type":18,"tag":53,"props":351,"children":352},{},[353],{"type":24,"value":354},"device_num和global_rank建议采用默认值，框架内会调用HCCL接口获取。",{"type":18,"tag":32,"props":356,"children":357},{},[358],{"type":24,"value":359},"如脚本中存在多个网络用例，请在执行下个用例前调用context.reset_auto_parallel_context将所有参数还原到默认值。在下面的样例中我们指定并行模式为自动并行，用户如需切换为数据并行模式，只需将parallel_mode改为DATA_PARALLEL。",{"type":18,"tag":125,"props":361,"children":363},{"code":362},"from mindspore import context      \nfrom mindspore.nn.optim.momentum import Momentum      \nfrom mindspore.train.callback import LossMonitor      \nfrom mindspore.train.model import Model, ParallelMode      \nfrom resnet import resnet50      \ndevice_id = int(os.getenv('DEVICE_ID'))      \ncontext.set_context(mode=context.GRAPH_MODE, device_target=\"Ascend\")      
\ncontext.set_context(device_id=device_id) # set device_id      \ndef test_train_cifar(epoch_size=10):      \n    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, mirror_mean=True)      \n    loss_cb = LossMonitor()      \n    dataset = create_dataset(data_path, epoch_size)      \n    batch_size = 32      \n    num_classes = 10      \n    net = resnet50(batch_size, num_classes)      \n    loss = SoftmaxCrossEntropyExpand(sparse=True)      \n    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)      \n    model = Model(net, loss_fn=loss, optimizer=opt)      \n    model.train(epoch_size, dataset, callbacks=[loss_cb], dataset_sink_mode=True)      \n",[364],{"type":18,"tag":130,"props":365,"children":366},{"__ignoreMap":7},[367],{"type":24,"value":362},{"type":18,"tag":32,"props":369,"children":370},{},[371],{"type":24,"value":216},{"type":18,"tag":140,"props":373,"children":374},{},[375,380],{"type":18,"tag":53,"props":376,"children":377},{},[378],{"type":24,"value":379},"dataset_sink_mode=True：表示采用数据集的下沉模式，即训练的计算下沉到硬件平台中执行。",{"type":18,"tag":53,"props":381,"children":382},{},[383],{"type":24,"value":384},"LossMonitor：能够通过回调函数返回Loss值，用于监控损失函数。",{"type":18,"tag":26,"props":386,"children":388},{"id":387},"第8步运行脚本",[389],{"type":24,"value":390},"第8步：运行脚本",{"type":18,"tag":32,"props":392,"children":393},{},[394],{"type":24,"value":395},"上述已将训练所需的脚本编辑好了，接下来通过命令调用对应的脚本。目前MindSpore分布式执行采用单卡单进程运行方式，即每张卡上运行1个进程，进程数量与使用的卡的数量一致。其中，0卡在前台执行，其他卡放在后台执行。每个进程创建1个目录，用来保存日志信息以及算子编译信息。下面以使用8张卡的分布式训练脚本为例，演示如何运行脚本：",{"type":18,"tag":125,"props":397,"children":399},{"code":398},"       #!/bin/bash      \nexport DATA_PATH=${DATA_PATH:-$1}      \n       export RANK_TABLE_FILE=$(pwd)/rank_table_8pcs.json      \n       export RANK_SIZE=8      \n       for((i=1;i\u003C${RANK_SIZE};i++))      \n       do      \n           rm -rf device$i      \n           mkdir device$i      \n           cp ./resnet50_distributed_training.py ./resnet.py 
./device$i      \n           cd ./device$i      \n           export DEVICE_ID=$i      \n           export RANK_ID=$i      \n           echo \"start training for device $i\"      \n           env > env$i.log      \n           pytest -s -v ./resnet50_distributed_training.py > train.log$i 2>&1 &      \n           cd ../      \n       done      \n       rm -rf device0      \n       mkdir device0      \n       cp ./resnet50_distributed_training.py ./resnet.py ./device0      \n       cd ./device0      \n       export DEVICE_ID=0      \n       export RANK_ID=0      \n       echo \"start training for device 0\"      \n       env > env0.log      \n       pytest -s -v ./resnet50_distributed_training.py > train.log0 2>&1      \n       if [ $? -eq 0 ];then      \n           echo \"training success\"      \n       else      \n           echo \"training failed\"      \n           exit 2      \n       fi      \n       cd ../   \n",[400],{"type":18,"tag":130,"props":401,"children":402},{"__ignoreMap":7},[403],{"type":24,"value":398},{"type":18,"tag":32,"props":405,"children":406},{},[407],{"type":24,"value":408},"脚本需要传入数据集的路径变量DATA_PATH。其中需要设置的环境变量有，",{"type":18,"tag":140,"props":410,"children":411},{},[412,417,422],{"type":18,"tag":53,"props":413,"children":414},{},[415],{"type":24,"value":416},"RANK_TABLE_FILE：组网信息文件的路径。",{"type":18,"tag":53,"props":418,"children":419},{},[420],{"type":24,"value":421},"DEVICE_ID：当前卡在机器上的实际序号。",{"type":18,"tag":53,"props":423,"children":424},{},[425],{"type":24,"value":426},"RANK_ID: 当前卡的逻辑序号。 其余环境变量请参考安装教程中的配置项。",{"type":18,"tag":32,"props":428,"children":429},{},[430],{"type":24,"value":431},"启动脚本运行时间大约在5分钟内，主要时间是用于算子的编译，实际训练时间在20秒内，而单卡的脚本编译加上训练的时间在10分钟左右。device目录的train.log中运行的日志示例如下：",{"type":18,"tag":125,"props":433,"children":435},{"code":434},"epoch: 1 step: 156, loss is 2.0084016\nepoch: 2 step: 156, loss is 1.6407638\nepoch: 3 step: 156, loss is 1.6164391\nepoch: 4 step: 156, loss is 1.6838071\nepoch: 5 step: 156, loss is 
1.6320667\nepoch: 6 step: 156, loss is 1.3098773\nepoch: 7 step: 156, loss is 1.3515002\nepoch: 8 step: 156, loss is 1.2943741\nepoch: 9 step: 156, loss is 1.2316195\nepoch: 10 step: 156, loss is 1.1533381\n",[436],{"type":18,"tag":130,"props":437,"children":438},{"__ignoreMap":7},[439],{"type":24,"value":434},{"type":18,"tag":26,"props":441,"children":443},{"id":442},"总结",[444],{"type":24,"value":442},{"type":18,"tag":32,"props":446,"children":447},{},[448],{"type":24,"value":449},"从上面Resnet-50的分布式并行训练的案例，我们不难看出，分布式并行相比单卡引入了一些开发上的复杂性：",{"type":18,"tag":49,"props":451,"children":452},{},[453,458,463],{"type":18,"tag":53,"props":454,"children":455},{},[456],{"type":24,"value":457},"多卡的分布式配置及分布式通信。",{"type":18,"tag":53,"props":459,"children":460},{},[461],{"type":24,"value":462},"定义网络时需要考虑损失函数的算子选择。",{"type":18,"tag":53,"props":464,"children":465},{},[466],{"type":24,"value":467},"训练脚本相比单卡更加复杂。",{"type":18,"tag":32,"props":469,"children":470},{},[471],{"type":24,"value":472},"同时，分布式训练下的调试调优也会更难些。虽然引入了这些开发调试的成本，但是分布式训练带来的性能收益可能远高于开发调试的开销，以Resnet-50为例，在昇腾芯片上8卡的性能超过单卡的X倍。因此对于训练要求较高的模型，使用分布式训练是更好的选择。",{"type":18,"tag":26,"props":474,"children":476},{"id":475},"参考资料",[477],{"type":18,"tag":149,"props":478,"children":479},{},[480],{"type":24,"value":475},{"type":18,"tag":32,"props":482,"children":483},{},[484,486],{"type":24,"value":485},"[1] 
",{"type":18,"tag":38,"props":487,"children":490},{"href":488,"rel":489},"https://link.zhihu.com/?target=https%3A//gitee.com/mindspore/docs/tree/master/docs/sample_code/distributed_training",[42],[491],{"type":24,"value":492},"分布式训练完整样例代码",{"title":7,"searchDepth":494,"depth":494,"links":495},4,[496,498,499,500,501,502,503,504,505,506,507],{"id":28,"depth":497,"text":28},2,{"id":95,"depth":497,"text":98},{"id":115,"depth":497,"text":118},{"id":190,"depth":497,"text":193},{"id":237,"depth":497,"text":240},{"id":274,"depth":497,"text":277},{"id":294,"depth":497,"text":297},{"id":323,"depth":497,"text":326},{"id":387,"depth":497,"text":390},{"id":442,"depth":497,"text":442},{"id":475,"depth":497,"text":475},"markdown","content:technology-blogs:zh:1586.md","content","technology-blogs/zh/1586.md","technology-blogs/zh/1586","md",1776506113688]