[{"data":1,"prerenderedAt":423},["ShallowReactive",2],{"content-query-JwXk99EoIB":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":417,"_id":418,"_source":419,"_file":420,"_stem":421,"_extension":422},"/technology-blogs/zh/1092","zh",false,"","DCPUE网络性能优化 -- CPU训练篇","不绑定核的情况下，MindSpore 训练为什么会需要如此多的 CPU 资源？","2022-03-23","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/03/23/a7e148f0713140b5a5380e4443cece5c.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":410},"root",[17,25,35,48,62,73,120,131,139,149,159,167,177,185,195,203,211,219,227,239,247,257,265,275,283,291,301,309,321,329,337,347,355,365,377],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"dcpue网络性能优化-cpu训练篇",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":29},"h3",{"id":28},"问题描述",[30],{"type":18,"tag":31,"props":32,"children":33},"strong",{},[34],{"type":24,"value":28},{"type":18,"tag":36,"props":37,"children":38},"ol",{},[39],{"type":18,"tag":40,"props":41,"children":42},"li",{},[43],{"type":18,"tag":31,"props":44,"children":45},{},[46],{"type":24,"value":47},"1.训练条件：",{"type":18,"tag":49,"props":50,"children":51},"p",{},[52],{"type":18,"tag":31,"props":53,"children":54},{},[55],{"type":18,"tag":56,"props":57,"children":59},"code",{"className":58},[],[60],{"type":24,"value":61},"Linux Euler OS x86;  8 显卡;  物理 CPU 2;  每个物理 CPU 中的核数 26;  逻辑 CPU 104;  MindSpore 1.2.0  TensorFlow 
1.15.0",{"type":18,"tag":49,"props":63,"children":64},{},[65],{"type":18,"tag":31,"props":66,"children":67},{},[68],{"type":18,"tag":69,"props":70,"children":72},"img",{"alt":7,"src":71},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/03/23/48af2a8baef34f8195d2d6b22dfb0dd0.png",[],{"type":18,"tag":36,"props":74,"children":75},{},[76,97],{"type":18,"tag":40,"props":77,"children":78},{},[79,84,88],{"type":18,"tag":31,"props":80,"children":81},{},[82],{"type":24,"value":83},"2.CPU 训练时长",{"type":18,"tag":85,"props":86,"children":87},"br",{},[],{"type":18,"tag":31,"props":89,"children":90},{},[91],{"type":18,"tag":56,"props":92,"children":94},{"className":93},[],[95],{"type":24,"value":96},"MindSpore(不绑定核): 33:00  TensorFlow(不绑定核): 01:06   MindSpore(绑定一个逻辑 CPU)：3小时左右  TensorFlow(绑定一个逻辑 CPU): 01:47",{"type":18,"tag":40,"props":98,"children":99},{},[100,105,108],{"type":18,"tag":31,"props":101,"children":102},{},[103],{"type":24,"value":104},"3.优化目的",{"type":18,"tag":85,"props":106,"children":107},{},[],{"type":18,"tag":31,"props":109,"children":110},{},[111,113,118],{"type":24,"value":112},"在相同条件下，使用 MindSpore 框架训练网络的时长 ",{"type":18,"tag":31,"props":114,"children":115},{},[116],{"type":24,"value":117},"小于或等于",{"type":24,"value":119}," 使用 TensorFlow 框架训练的时长",{"type":18,"tag":26,"props":121,"children":123},{"id":122},"问题分析",[124],{"type":18,"tag":31,"props":125,"children":126},{},[127],{"type":18,"tag":31,"props":128,"children":129},{},[130],{"type":24,"value":122},{"type":18,"tag":49,"props":132,"children":133},{},[134],{"type":18,"tag":31,"props":135,"children":136},{},[137],{"type":24,"value":138},"1.不绑定核的情况下，MindSpore 训练会占用几乎所有56个逻辑 CPU。不知道它为什么会需要如此多的 CPU 资源。而绑定一个逻辑 CPU 的情况下，MindSpore 
的训练慢到几乎跑不动",{"type":18,"tag":49,"props":140,"children":141},{},[142],{"type":18,"tag":31,"props":143,"children":144},{},[145],{"type":18,"tag":69,"props":146,"children":148},{"alt":7,"src":147},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/03/23/cfc5d13f07cd46b1ac4d471e2742b5c3.png",[],{"type":18,"tag":49,"props":150,"children":151},{},[152],{"type":18,"tag":31,"props":153,"children":154},{},[155],{"type":18,"tag":69,"props":156,"children":158},{"alt":7,"src":157},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/03/23/009f8acf4a25455ea7db29bfcdce90d4.png",[],{"type":18,"tag":49,"props":160,"children":161},{},[162],{"type":18,"tag":31,"props":163,"children":164},{},[165],{"type":24,"value":166},"2.在绑定一个逻辑 CPU 的情况下，通过 MindSpore 配套的网络分析工具 MindInsight 查看具体算子的执行时间信息：",{"type":18,"tag":49,"props":168,"children":169},{},[170],{"type":18,"tag":31,"props":171,"children":172},{},[173],{"type":18,"tag":69,"props":174,"children":176},{"alt":7,"src":175},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/03/23/c457d92b20ca422fa8c744f2e7436a9c.png",[],{"type":18,"tag":49,"props":178,"children":179},{},[180],{"type":18,"tag":31,"props":181,"children":182},{},[183],{"type":24,"value":184},"3.再用 TensorFlow 训练该神经网络，并通过 TensorFlow 的 profiler 工具分析具体算子的执行信息（耗时最长的5个算子）：",{"type":18,"tag":49,"props":186,"children":187},{},[188],{"type":18,"tag":31,"props":189,"children":190},{},[191],{"type":18,"tag":69,"props":192,"children":194},{"alt":7,"src":193},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/03/23/1cb7d0c4cae84ce08c873434fa00861d.png",[],{"type":18,"tag":49,"props":196,"children":197},{},[198],{"type":18,"tag":31,"props":199,"children":200},{},[201],{"type":24,"value":202},"4.对比发现，TensorFlow 算子的执行时间都在 us 级别，而 MindSpore 耗时前5的算子的执行时间在 s 
级别。相差一百万倍，这显然是不正常的。",{"type":18,"tag":49,"props":204,"children":205},{},[206],{"type":18,"tag":31,"props":207,"children":208},{},[209],{"type":24,"value":210},"5.阅读 MindSpore 耗时前5的算子（AddN、Relu、Mul、MatMul、Abs）的源代码，发现它们都是通过第三方库 MKL-DNN 实现的。",{"type":18,"tag":49,"props":212,"children":213},{},[214],{"type":18,"tag":31,"props":215,"children":216},{},[217],{"type":24,"value":218},"6.MKL-DNN 全称 Math Kernel Library for Deep Neural Networks。该库是几年前，由 Intel 官方发布。主要优化了深度学习中一些常用算子在 CPU 上的性能表现。因此不太可能是这个库的实现出了问题。估计是应用这个库的方式出了问题。",{"type":18,"tag":49,"props":220,"children":221},{},[222],{"type":18,"tag":31,"props":223,"children":224},{},[225],{"type":24,"value":226},"7.咨询负责这部分代码实现的同事，并查阅相关信息，发现在应用该库时，需要设置环境变量：",{"type":18,"tag":49,"props":228,"children":229},{},[230],{"type":18,"tag":31,"props":231,"children":232},{},[233],{"type":18,"tag":56,"props":234,"children":236},{"className":235},[],[237],{"type":24,"value":238},"export OMP_NUM_THREADS = num physical cores",{"type":18,"tag":49,"props":240,"children":241},{},[242],{"type":18,"tag":31,"props":243,"children":244},{},[245],{"type":24,"value":246},"8.设置该环境变量为1（因为该网络绑定一个逻辑 CPU 训练，而一般一个物理 CPU 含有 N 个逻辑 CPU，因此理论上该环境变量应设为 1/N，向上取整为1），测试该神经网络的性能。发现时间性能从原来的3小时提升到了3分13秒，单步执行时间从19260ms左右提升到了6ms左右",{"type":18,"tag":49,"props":248,"children":249},{},[250],{"type":18,"tag":31,"props":251,"children":252},{},[253],{"type":18,"tag":69,"props":254,"children":256},{"alt":7,"src":255},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/03/23/4faccf3f1f504166b81f505e6cadab34.png",[],{"type":18,"tag":49,"props":258,"children":259},{},[260],{"type":18,"tag":31,"props":261,"children":262},{},[263],{"type":24,"value":264},"9.查看算子执行时间的具体信息，发现原先5个耗时最长的算子都有大幅的性能提升，其中 AddN、Relu、Abs、Mul 均达到了 TensorFlow 的性能量级。MatMul 与 
TensorFlow的相比仍有差距，但也有不小的提升。",{"type":18,"tag":49,"props":266,"children":267},{},[268],{"type":18,"tag":31,"props":269,"children":270},{},[271],{"type":18,"tag":69,"props":272,"children":274},{"alt":7,"src":273},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/03/23/1f1248646792499190f4a38e6edc9bdb.png",[],{"type":18,"tag":49,"props":276,"children":277},{},[278],{"type":18,"tag":31,"props":279,"children":280},{},[281],{"type":24,"value":282},"10.至此，该神经网络的 CPU 训练性能从原来的3小时提升到了3分13秒，训练时间减少了约98.2%（单步执行时间减少了约99.97%）。但相比 TensorFlow 的 1分47秒，仍有不小差距。",{"type":18,"tag":49,"props":284,"children":285},{},[286],{"type":18,"tag":31,"props":287,"children":288},{},[289],{"type":24,"value":290},"11.继续分析。从算子时间信息中可以发现，优化 MKL-DNN 后，ReluGrad（Relu 算子的反向） 在算子总耗时中占比最高，为76%。阅读该算子的源代码，发现该算子的实现有很大问题：",{"type":18,"tag":49,"props":292,"children":293},{},[294],{"type":18,"tag":31,"props":295,"children":296},{},[297],{"type":18,"tag":69,"props":298,"children":300},{"alt":7,"src":299},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/03/23/0375cdf3077f469bae2279a5d0e4b4fc.png",[],{"type":18,"tag":49,"props":302,"children":303},{},[304],{"type":18,"tag":31,"props":305,"children":306},{},[307],{"type":24,"value":308},"12.该算子在实现多线程时，并没有使用统一的线程池，而是自起了最大线程数的线程。这会导致：",{"type":18,"tag":49,"props":310,"children":311},{},[312],{"type":18,"tag":31,"props":313,"children":314},{},[315],{"type":18,"tag":56,"props":316,"children":318},{"className":317},[],[319],{"type":24,"value":320},"1. 同时起如此多的线程，会造成 CPU 资源浪费； 2. 除此之外，还有可能引起线程竞争； 3. 
线程的启用和销毁，也会增加额外的开销；",{"type":18,"tag":49,"props":322,"children":323},{},[324],{"type":18,"tag":31,"props":325,"children":326},{},[327],{"type":24,"value":328},"这些都会大大增加 CPU 的负担，降低神经网络的时间性能。",{"type":18,"tag":49,"props":330,"children":331},{},[332],{"type":18,"tag":31,"props":333,"children":334},{},[335],{"type":24,"value":336},"13.重构这部分代码，使用统一的线程池管理线程，并删除冗余操作。再测试该神经网络的性能。发现时间性能从原来的3分13秒提升到了1分52秒，单步执行时间从6ms左右提升到了3.5ms左右：",{"type":18,"tag":49,"props":338,"children":339},{},[340],{"type":18,"tag":31,"props":341,"children":342},{},[343],{"type":18,"tag":69,"props":344,"children":346},{"alt":7,"src":345},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/03/23/360a02b54d1645419e4af6ecb3a4e790.png",[],{"type":18,"tag":49,"props":348,"children":349},{},[350],{"type":18,"tag":31,"props":351,"children":352},{},[353],{"type":24,"value":354},"14.查看算子执行时间的具体信息，发现包括 ReluGrad 在内的所有算子，执行时间都降到了 us 级别，与 TensorFlow 的基本相当。",{"type":18,"tag":49,"props":356,"children":357},{},[358],{"type":18,"tag":31,"props":359,"children":360},{},[361],{"type":18,"tag":69,"props":362,"children":364},{"alt":7,"src":363},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/03/23/4a6fccf54b6246d7960178161759d893.png",[],{"type":18,"tag":26,"props":366,"children":368},{"id":367},"reference",[369],{"type":18,"tag":31,"props":370,"children":371},{},[372],{"type":18,"tag":31,"props":373,"children":374},{},[375],{"type":24,"value":376},"Reference",{"type":18,"tag":49,"props":378,"children":379},{},[380],{"type":18,"tag":31,"props":381,"children":382},{},[383,392,394,400,402,408],{"type":18,"tag":384,"props":385,"children":389},"a",{"href":386,"rel":387},"https://bbs.huaweicloud.com/forum/thread-166289-1-1.html#",[388],"nofollow",[390],{"type":24,"value":391},"Accelerating Deep Learning on CPU with Intel MKL-DNN",{"type":24,"value":393},", Apache MXNet, May 11, 2018 
",{"type":18,"tag":384,"props":395,"children":397},{"href":386,"rel":396},[388],[398],{"type":24,"value":399},"Maximize TensorFlow* Performance on CPU: Considerations and Recommendations for Inference Workloads",{"type":24,"value":401},", Intel Official ",{"type":18,"tag":384,"props":403,"children":405},{"href":386,"rel":404},[388],[406],{"type":24,"value":407},"linux top命令查看内存及多核CPU的使用讲述",{"type":24,"value":409},", 长风破浪, 2016-05-20",{"title":7,"searchDepth":411,"depth":411,"links":412},4,[413,415,416],{"id":28,"depth":414,"text":28},3,{"id":122,"depth":414,"text":122},{"id":367,"depth":414,"text":376},"markdown","content:technology-blogs:zh:1092.md","content","technology-blogs/zh/1092.md","technology-blogs/zh/1092","md",1776506112145]