[{"data":1,"prerenderedAt":216},["ShallowReactive",2],{"content-query-7P7ZRtJ9La":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":210,"_id":211,"_source":212,"_file":213,"_stem":214,"_extension":215},"/technology-blogs/zh/1726","zh",false,"","【MindSpore易点通】如何将MindSpore单机单卡代码拓展为分布式代码","将MindSpore单机单卡代码拓展为分布式代码，并在Ascend芯片上实现单机多卡训练。","2022-08-18","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/08/18/d253038d43f143c998c8a11c3148bbec.png","technology-blogs","实践",{"type":15,"children":16,"toc":202},"root",[17,25,31,42,47,52,57,62,67,72,82,91,96,101,106,114,119,124,132,141,146,154,163,168,176,181,189,194],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore易点通如何将mindspore单机单卡代码拓展为分布式代码",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"本文介绍如何将MindSpore单机单卡代码拓展为分布式代码，并在Ascend芯片上实现单机多卡训练。",{"type":18,"tag":32,"props":33,"children":35},"h2",{"id":34},"一配置运行信息和并行模式",[36],{"type":18,"tag":37,"props":38,"children":39},"strong",{},[40],{"type":24,"value":41},"一、配置运行信息和并行模式",{"type":18,"tag":26,"props":43,"children":44},{},[45],{"type":24,"value":46},"在训练代码中添加如下接口，分别配置运行信息和并行模式，并依次调用。",{"type":18,"tag":26,"props":48,"children":49},{},[50],{"type":24,"value":51},"1.get_rank：获取当前设备在集群中的ID；",{"type":18,"tag":26,"props":53,"children":54},{},[55],{"type":24,"value":56},"2.get_group_size：获取集群总数；",{"type":18,"tag":26,"props":58,"children":59},{},[60],{"type":24,"value":61},"3.context.set_context:配置当前执行环境；",{"type":18,"tag":26,"props":63,"children":64},{},[65],{"type":24,"value":66},"4.init:使能分布式通信，并完成相关初始化操作；",{"type":18,"tag":26,"props":68,"children":69},{},[70],{"type":24,"value":71},"5.context.set_auto_parallel_context:设置自动并行模式，该用例使用数据并行模式。",{"type":18,"tag":73,"props":74,"children":76},"pre",{"code":75},"import os\nimport argparse\nfrom mindspore import context\nfrom 
mindspore.context import ParallelMode\nfrom mindspore.communication.management import init, get_rank, get_group_size\n\ndef network_init(argvs):\n    devid = int(os.getenv('DEVICE_ID', '0'))\n    context.set_context(mode=context.GRAPH_MODE,enable_auto_mixed_precision=True, device_target=argvs.device_target,save_graphs=False, device_id=devid)\n\n    # Init distributed\n\n    if argvs.is_distributed:\n        if argvs.device_target == \"Ascend\":\n            init()\n        else:\n            init(\"nccl\")\n        argvs.rank = get_rank()\n        argvs.group_size = get_group_size()\n\ndef parallel_init(argv):\n    context.reset_auto_parallel_context()\n    parallel_mode = ParallelMode.STAND_ALONE\n    degree = 1\n    if argv.is_distributed:\n        parallel_mode = ParallelMode.DATA_PARALLEL\n        degree = get_group_size()\n    context.set_auto_parallel_context(parallel_mode=parallel_mode,gradients_mean=True, device_num=degree, parameter_broadcast=True)\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(description='MindSpore CIFAR-10 Example')\n    parser.add_argument('--device_target', type=str, default=\"Ascend\",choices=['Ascend', 'GPU', 'CPU'],help='device where the code will be implemented (default: Ascend)')\n    parser.add_argument('--is_distributed', type=int, default=1,help='Distribute train or not, 1 for yes, 0 for no. Default: 1')\n    parser.add_argument('--rank', type=int, default=0,help='Local rank of distributed. Default: 0')\n    parser.add_argument('--group_size', type=int, default=1,help='World size of device. 
Default: 1')\n    parser.add_argument('--pre_trained', type=str, default=None,help='Pretrained checkpoint path')\n    args = parser.parse_args()\n    network_init(args)\n    parallel_init(args)\n",[77],{"type":18,"tag":78,"props":79,"children":80},"code",{"__ignoreMap":7},[81],{"type":24,"value":75},{"type":18,"tag":32,"props":83,"children":85},{"id":84},"二卡间初始化权重保持一致",[86],{"type":18,"tag":37,"props":87,"children":88},{},[89],{"type":24,"value":90},"二、卡间初始化权重保持一致",{"type":18,"tag":26,"props":92,"children":93},{},[94],{"type":24,"value":95},"卡间权重不一致，将为训练带来极大的阻碍，为使集群中每张Ascend芯片上模型的初始化权重保持一致，可以通过以下两种方式进行设置：",{"type":18,"tag":26,"props":97,"children":98},{},[99],{"type":24,"value":100},"1.固定随机种子",{"type":18,"tag":26,"props":102,"children":103},{},[104],{"type":24,"value":105},"在脚本开头添加如下代码：",{"type":18,"tag":73,"props":107,"children":109},{"code":108},"import mindspore\nmindspore.set_seed(0)\n",[110],{"type":18,"tag":78,"props":111,"children":112},{"__ignoreMap":7},[113],{"type":24,"value":108},{"type":18,"tag":26,"props":115,"children":116},{},[117],{"type":24,"value":118},"2.广播参数",{"type":18,"tag":26,"props":120,"children":121},{},[122],{"type":24,"value":123},"在context.set_auto_parallel_context接口中设置parameter_broadcast=True，即训练开始前自动广播0号卡上数据并行的参数权值到其他卡上，默认值为False",{"type":18,"tag":73,"props":125,"children":127},{"code":126},"context.set_auto_parallel_context(parallel_mode=parallel_mode,gradients_mean=True, device_num=degree, 
parameter_broadcast=True)\n",[128],{"type":18,"tag":78,"props":129,"children":130},{"__ignoreMap":7},[131],{"type":24,"value":126},{"type":18,"tag":32,"props":133,"children":135},{"id":134},"三数据并行模式加载数据集",[136],{"type":18,"tag":37,"props":137,"children":138},{},[139],{"type":24,"value":140},"三、数据并行模式加载数据集",{"type":18,"tag":26,"props":142,"children":143},{},[144],{"type":24,"value":145},"与单机不同，加载训练数据时，数据集接口中需要传入num_shards和shard_id参数，分别对应卡的数量和逻辑序号。这两个值可通过“配置运行信息和并行模式”一节中的get_group_size和get_rank接口获取。",{"type":18,"tag":73,"props":147,"children":149},{"code":148},"if do_train:\n     cifar_ds = ds.Cifar10Dataset(dataset_dir=data_home, shuffle=True,num_shards=device_num, shard_id=rank, usage='train')\nelse:\n     cifar_ds = ds.Cifar10Dataset(dataset_dir=data_home,shuffle=False, usage='test')\n",[150],{"type":18,"tag":78,"props":151,"children":152},{"__ignoreMap":7},[153],{"type":24,"value":148},{"type":18,"tag":32,"props":155,"children":157},{"id":156},"四运行脚本",[158],{"type":18,"tag":37,"props":159,"children":160},{},[161],{"type":24,"value":162},"四、运行脚本",{"type":18,"tag":26,"props":164,"children":165},{},[166],{"type":24,"value":167},"目前MindSpore分布式执行采用单卡单进程运行方式，即每张卡上运行1个进程，进程数量与使用的卡的数量一致。其中，0卡在前台执行，其他卡放在后台执行。每个进程创建1个目录，用来保存日志信息以及算子编译信息。下面以使用8张卡的分布式训练脚本为例，演示如何运行脚本：",{"type":18,"tag":73,"props":169,"children":171},{"code":170},"export RANK_SIZE=8\n\ncurrent_exec_path=$(pwd)\necho ${current_exec_path}\n\necho 'start training'\n\nfor((i=0;i\u003C=$RANK_SIZE-1;i++));\n\ndo\n\n    echo 'start rank '$i\n    mkdir ${current_exec_path}/device$i\n    cd ${current_exec_path}/device$i\n    export RANK_ID=$i\n    dev=`expr $i`\n    export DEVICE_ID=$dev\n    python ../MindSpore_8P.py \\\n        --device_target='Ascend' \\\n        --is_distributed=1 > train.log  2>&1 
&\n\ndone\n",[172],{"type":18,"tag":78,"props":173,"children":174},{"__ignoreMap":7},[175],{"type":24,"value":170},{"type":18,"tag":26,"props":177,"children":178},{},[179],{"type":24,"value":180},"启动shell脚本进行分布式训练(8卡)：",{"type":18,"tag":73,"props":182,"children":184},{"code":183},"bash run_distribute.sh\n",[185],{"type":18,"tag":78,"props":186,"children":187},{"__ignoreMap":7},[188],{"type":24,"value":183},{"type":18,"tag":26,"props":190,"children":191},{},[192],{"type":24,"value":193},"启动shell脚本后，将在后台运行分布式训练，执行 tail -f device0/train.log 命令，可查看实时运行结果。如下图所示：",{"type":18,"tag":26,"props":195,"children":196},{},[197],{"type":18,"tag":198,"props":199,"children":201},"img",{"alt":7,"src":200},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/08/18/79f2af8c6db745648de919abc102f3d3.png",[],{"title":7,"searchDepth":203,"depth":203,"links":204},4,[205,207,208,209],{"id":34,"depth":206,"text":41},2,{"id":84,"depth":206,"text":90},{"id":134,"depth":206,"text":140},{"id":156,"depth":206,"text":162},"markdown","content:technology-blogs:zh:1726.md","content","technology-blogs/zh/1726.md","technology-blogs/zh/1726","md",1776506115551]