[{"data":1,"prerenderedAt":391},["ShallowReactive",2],{"content-query-7ihDrwViPb":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":385,"_id":386,"_source":387,"_file":388,"_stem":389,"_extension":390},"/technology-blogs/zh/647","zh",false,"","开发者分享 | 获取MindSpore运行环境信息及解决dataset并行数报错","dataset并行数报错","2021-07-12","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/58503e67c7c740f88d270700f2b82b67.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":378},"root",[17,25,34,48,60,65,70,77,88,93,100,109,114,121,126,133,138,145,156,165,170,177,186,191,196,203,210,215,222,229,234,239,244,254,259,264,271,276,283,288,296,301,308,318,330,340,345,356,361,366,371],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"开发者分享-获取mindspore运行环境信息及解决dataset并行数报错",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":18,"tag":30,"props":31,"children":33},"img",{"alt":7,"src":32},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/1b195b80e711440790873d74038d6b42.gif",[],{"type":18,"tag":26,"props":35,"children":36},{},[37,39],{"type":24,"value":38},"本文来源于：",{"type":18,"tag":40,"props":41,"children":45},"a",{"href":42,"rel":43},"https://bbs.huaweicloud.com/forum/forum-1076-1.html",[44],"nofollow",[46],{"type":24,"value":47},"MindSpore论坛",{"type":18,"tag":26,"props":49,"children":50},{},[51,53],{"type":24,"value":52},"作者：",{"type":18,"tag":40,"props":54,"children":57},{"href":55,"rel":56},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=134920",[44],[58],{"type":24,"value":59},"Gongliyao",{"type":18,"tag":26,"props":61,"children":62},{},[63],{"type":24,"value":64},"本文主要分享一个和运行环境相关报错的解决方案，不知道大家有没有遇到过\"ValueError: num_parallel_workers exceeds the boundary between 1 and XXX !\"的报错。然而并不清楚变量\"num_parallel_workers\"的取值有何依据，为啥MindSpore/mindspore仓内model_zoo的网络脚本，别人能用，而自己却频繁报错。",{"type":18,"tag":26,"props":66,"children":67},{},[68],{"type":24,"value":69},"其实这一切都是因为每个人的运行环境存在差异，从硬件信息到驱动包，以及第三方依赖库，都可能导致报错的产生。所以，想要随心所欲的用好MindSpore框架，做到知彼知己是必需的。下面将从了解自己的运行环境开始，详细介绍dataset并行数报错的解决过程。",{"type":18,"tag":26,"props":71,"children":72},{},[73],{"type":18,"tag":30,"props":74,"children":76},{"alt":7,"src":75},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/348493d154de46faae5c1513cb8e707b.png",[],{"type":18,"tag":78,"props":79,"children":81},"h3",{"id":80},"_11-操作系统",[82],{"type":18,"tag":83,"props":84,"children":85},"strong",{},[86],{"type":24,"value":87},"1.1 操作系统",{"type":18,"tag":26,"props":89,"children":90},{},[91],{"type":24,"value":92},"首先，Linux系统可以使用\"uname -a\"和\"cat /etc/os-release\"来查看操作系统版本",{"type":18,"tag":26,"props":94,"children":95},{},[96],{"type":18,"tag":30,"props":97,"children":99},{"alt":7,"src":98},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/20d91adce7d34f4f993947640725bcc2.jpg",[],{"type":18,"tag":78,"props":101,"children":103},{"id":102},"_12-硬件信息",[104],{"type":18,"tag":83,"props":105,"children":106},{},[107],{"type":24,"value":108},"1.2 硬件信息",{"type":18,"tag":26,"props":110,"children":111},{},[112],{"type":24,"value":113},"1）使用\"lscpu\"查看cpu信息，包括处理器架构和线程数",{"type":18,"tag":26,"props":115,"children":116},{},[117],{"type":18,"tag":30,"props":118,"children":120},{"alt":7,"src":119},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/bf5c89f67bc645deaf92329e87042310.jpg",[],{"type":18,"tag":26,"props":122,"children":123},{},[124],{"type":24,"value":125},"2）使用\"free -h\"或\"cat /proc/cpuinfo |grep \"processor\" |wc -l\"查看内存大小",{"type":18,"tag":26,"props":127,"children":128},{},[129],{"type":18,"tag":30,"props":130,"children":132},{"alt":7,"src":131},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/a8281a2bc7c041f0a5edcbe8e6a49aa0.jpg",[],{"type":18,"tag":26,"props":134,"children":135},{},[136],{"type":24,"value":137},"3）使用\"npu-smi info\"查看昇腾系列NPU型号及工作状态（需Atlas run包）",{"type":18,"tag":26,"props":139,"children":140},{},[141],{"type":18,"tag":30,"props":142,"children":144},{"alt":7,"src":143},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/2d12967d41ac4d55ae26d0fdb783d93a.jpg",[],{"type":18,"tag":146,"props":147,"children":149},"ol",{"start":148},4,[150],{"type":18,"tag":151,"props":152,"children":153},"li",{},[154],{"type":24,"value":155},"使用\"nvidia-smi\"查看英伟达系列GPU型号、驱动版本及状态",{"type":18,"tag":78,"props":157,"children":159},{"id":158},"_13-mindspore版本",[160],{"type":18,"tag":83,"props":161,"children":162},{},[163],{"type":24,"value":164},"1.3 mindspore版本",{"type":18,"tag":26,"props":166,"children":167},{},[168],{"type":24,"value":169},"在conda环境类或者原生python环境下，使用\"pip show mindspore-ascend\"或\"pip show mindspore-gpu\"查看MindSpore版本号",{"type":18,"tag":26,"props":171,"children":172},{},[173],{"type":18,"tag":30,"props":174,"children":176},{"alt":7,"src":175},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/6ad516c2cddd404281e4f29beed03c56.jpg",[],{"type":18,"tag":78,"props":178,"children":180},{"id":179},"_14-run-package版本",[181],{"type":18,"tag":83,"props":182,"children":183},{},[184],{"type":24,"value":185},"1.4 run package版本",{"type":18,"tag":26,"props":187,"children":188},{},[189],{"type":24,"value":190},"通常情况下，使用root用户安装的Ascend芯片run包都会在/usr/local/Ascend目录下，而使用非root用户进行安装的话，会在/home/HwHiAiUser/目录下。",{"type":18,"tag":26,"props":192,"children":193},{},[194],{"type":24,"value":195},"以root用户的安装地址为例，使用\"cat /usr/local/Ascend/version.info\"可以查看Atlas版run包的版本号",{"type":18,"tag":26,"props":197,"children":198},{},[199],{"type":18,"tag":30,"props":200,"children":202},{"alt":7,"src":201},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/db67e4f0d2f24d58b9d34b05f12b437f.jpg",[],{"type":18,"tag":26,"props":204,"children":205},{},[206],{"type":18,"tag":30,"props":207,"children":209},{"alt":7,"src":208},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/9bfcf1ee42e8445a8684e0d6d79e89b6.png",[],{"type":18,"tag":26,"props":211,"children":212},{},[213],{"type":24,"value":214},"前文提及的dataset报错如下： 我们以使用FasterRCNN网络进行推理遇到的报错为例",{"type":18,"tag":26,"props":216,"children":217},{},[218],{"type":18,"tag":30,"props":219,"children":221},{"alt":7,"src":220},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/6e97ac85acdd46e1a94a484b8faeb096.jpg",[],{"type":18,"tag":26,"props":223,"children":224},{},[225],{"type":18,"tag":30,"props":226,"children":228},{"alt":7,"src":227},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/316c1655368849c689efe84a51d4b503.png",[],{"type":18,"tag":26,"props":230,"children":231},{},[232],{"type":24,"value":233},"通过查看调用栈可知，报错位置在“src/dataset.py\"文件line 497的”create_fasterrcnn_dataset“ 函数的内。",{"type":18,"tag":26,"props":235,"children":236},{},[237],{"type":24,"value":238},"下层的调用都在conda环境的python依赖库中，根据路径大致可分析出是MindSpore的接口抛出的错误。",{"type":18,"tag":26,"props":240,"children":241},{},[242],{"type":24,"value":243},"暂且认为MindSpore的接口功能正常，分析接口调用的逻辑，将create_fasterrcnn_dataset函数的代码部分拷贝，如下：",{"type":18,"tag":245,"props":246,"children":248},"pre",{"code":247},"import mindspore.dataset as de\n\ndef create_fasterrcnn_dataset(mindrecord_file, batch_size=2, repeat_num=12, device_num=1, rank_id=0,\n                              is_training=True, num_parallel_workers=4):\n    ds = de.MindDataset(mindrecord_file, columns_list=[\"image\", \"annotation\"], num_shards=device_num, shard_id=rank_id,\n                        num_parallel_workers=1, shuffle=is_training)\n    decode = C.Decode()\n    ds = ds.map(operations=decode, input_columns=[\"image\"], num_parallel_workers=1)\n    compose_map_func = (lambda image, annotation: preprocess_fn(image, annotation, is_training))\n\n    ...\n\n    if is_training:\n        ...\n\n    else:\n        ds = ds.map(operations=compose_map_func,\n                    input_columns=[\"image\", \"annotation\"],\n                    output_columns=[\"image\", \"image_shape\", \"box\", \"label\", \"valid_num\"],\n                    column_order=[\"image\", \"image_shape\", \"box\", \"label\", \"valid_num\"],\n                    num_parallel_workers=num_parallel_workers)\n\n        ds = ds.map(operations=[normalize_op, hwc_to_chw, type_cast1], input_columns=[\"image\"],\n                    num_parallel_workers=24)\n    ...\n\n    return ds\n",[249],{"type":18,"tag":250,"props":251,"children":252},"code",{"__ignoreMap":7},[253],{"type":24,"value":247},{"type":18,"tag":26,"props":255,"children":256},{},[257],{"type":24,"value":258},"通过阅读代码可知，该方法是在调用MindSpore框架的MindDataset接口，同时对数据进行预处理。",{"type":18,"tag":26,"props":260,"children":261},{},[262],{"type":24,"value":263},"那么报错信息中“num_parallel_workers”的取值是与mindspore.dataset.MindDataset接口息息相关的，通过查看官网API文档可知：",{"type":18,"tag":26,"props":265,"children":266},{},[267],{"type":18,"tag":30,"props":268,"children":270},{"alt":7,"src":269},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/2b9fea78321b4f99b81a1c17560394f7.jpg",[],{"type":18,"tag":26,"props":272,"children":273},{},[274],{"type":24,"value":275},"该参数是用来设置MindRecord格式文件的读取器并行数的，可以理解为多线程数，这里就和上文查询到的处理器线程数有直接关系了。",{"type":18,"tag":26,"props":277,"children":278},{},[279],{"type":18,"tag":30,"props":280,"children":282},{"alt":7,"src":281},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/755600c6a8be4132b60e4c328cd4fb10.png",[],{"type":18,"tag":26,"props":284,"children":285},{},[286],{"type":24,"value":287},"通过Linux命令查看当前环境的CPU硬件信息，得知处理器为4核8线程设计，故应将多线程数量限制在0-7的范围内。",{"type":18,"tag":245,"props":289,"children":291},{"code":290},"# lscpu\n\n>>>\n    Architecture:          x86_64\n    CPU op-mode(s):        32-bit, 64-bit\n    Byte Order:            Little Endian\n    CPU(s):                8\n    On-line CPU(s) list:   0-7\n    Thread(s) per core:    2\n    Core(s) per socket:    4\n    Socket(s):             1\n    NUMA node(s):          1\n    Vendor ID:             GenuineIntel\n    CPU family:            6\n    Model:                 158\n    Model name:            Intel(R) Core(TM) i7-7700 CPU @ 3.60GHz\n    Stepping:              9\n    CPU MHz:               4200.292\n    CPU max MHz:           4200.0000\n    CPU min MHz:           800.0000\n    BogoMIPS:              7200.00\n    Virtualization:        VT-x\n    L1d cache:             32K\n    L1i cache:             32K\n    L2 cache:              256K\n    L3 cache:              8192K\n    NUMA node0 CPU(s):     0-7\n",[292],{"type":18,"tag":250,"props":293,"children":294},{"__ignoreMap":7},[295],{"type":24,"value":290},{"type":18,"tag":26,"props":297,"children":298},{},[299],{"type":24,"value":300},"将“num_parallel_workers”参数的硬编码部分修改为[0,8)的区间内，重新进行模型推理计算，报错不复现并得到该网络预期的推理精度：",{"type":18,"tag":26,"props":302,"children":303},{},[304],{"type":18,"tag":30,"props":305,"children":307},{"alt":7,"src":306},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/e25ad2d44ab24030999fee474faa27bd.jpg",[],{"type":18,"tag":26,"props":309,"children":310},{},[311],{"type":18,"tag":40,"props":312,"children":315},{"href":313,"rel":314},"http://mp.weixin.qq.com/s?__biz=MzAxMDA1MDM0NQ==&mid=2247537121&idx=1&sn=6cffafd09399f650a2fe7e095535972e&chksm=9b545fa7ac23d6b1845476340d5dea2bdeb31af027df09b80aa0d28eb096b3ce487ce339cd22&scene=21#wechat_redirect",[44],[316],{"type":24,"value":317},"爱数智慧加入MindSpore社区，共同加速开源生态创新！",{"type":18,"tag":26,"props":319,"children":320},{},[321,328],{"type":18,"tag":40,"props":322,"children":325},{"href":323,"rel":324},"http://mp.weixin.qq.com/s?__biz=MzAxMDA1MDM0NQ==&mid=2247537283&idx=1&sn=e0a103179ae45be3cb2d1ade32bfc69c&chksm=9b545cc5ac23d5d33f37eff2d117b19238ebd2ce1bc0769c313d7f0f6810428132c82895aac0&scene=21#wechat_redirect",[44],[326],{"type":24,"value":327},"WAIC | MindSpore开源社区发起AI数据生态联盟",{"type":24,"value":329},"！",{"type":18,"tag":26,"props":331,"children":332},{},[333],{"type":18,"tag":40,"props":334,"children":337},{"href":335,"rel":336},"http://mp.weixin.qq.com/s?__biz=MzAxMDA1MDM0NQ==&mid=2247536962&idx=1&sn=855bb1799f6fe80bef0ab56fc44a356a&chksm=9b545f04ac23d6123e59527c581a5607fbcc7210706d16236740b67fd541d9995c7697d19645&scene=21#wechat_redirect",[44],[338],{"type":24,"value":339},"招募：对组织活动感兴趣，想结识更多开源大咖的你",{"type":18,"tag":26,"props":341,"children":342},{},[343],{"type":24,"value":344},"MindSpore官方资料",{"type":18,"tag":26,"props":346,"children":347},{},[348,350],{"type":24,"value":349},"GitHub : ",{"type":18,"tag":40,"props":351,"children":354},{"href":352,"rel":353},"https://github.com/mindspore-ai/mindspore",[44],[355],{"type":24,"value":352},{"type":18,"tag":26,"props":357,"children":358},{},[359],{"type":24,"value":360},"Gitee:https : //gitee.com/mindspore/mindspore",{"type":18,"tag":26,"props":362,"children":363},{},[364],{"type":24,"value":365},"官方QQ群 : 871543426",{"type":18,"tag":26,"props":367,"children":368},{},[369],{"type":24,"value":370},"长按下方二维码加入MindSpore项目↓",{"type":18,"tag":26,"props":372,"children":373},{},[374],{"type":18,"tag":30,"props":375,"children":377},{"alt":7,"src":376},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/12/afd397774c314939b2bad54807380cae.jpg",[],{"title":7,"searchDepth":148,"depth":148,"links":379},[380,382,383,384],{"id":80,"depth":381,"text":87},3,{"id":102,"depth":381,"text":108},{"id":158,"depth":381,"text":164},{"id":179,"depth":381,"text":185},"markdown","content:technology-blogs:zh:647.md","content","technology-blogs/zh/647.md","technology-blogs/zh/647","md",1776506139207]