[{"data":1,"prerenderedAt":234},["ShallowReactive",2],{"content-query-fE1gbJ5CPv":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":228,"_id":229,"_source":230,"_file":231,"_stem":232,"_extension":233},"/technology-blogs/zh/2189","zh",false,"","MindSpore网络推理时使用Matmul矩阵乘法算子计算速度较慢","profiler 后发现大部分的时间都花在了 全连接层的 Matmul 矩阵乘法上，应该如何优化？","2023-03-01","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/03/09/119f0ca1f6144fd997e50e08ea15ae4d.png","technology-blogs","实践",{"type":15,"children":16,"toc":214},"root",[17,25,32,38,43,48,53,58,64,71,76,82,86,92,97,103,108,118,128,144,150,163,171,178,186,191],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore网络推理时使用matmul矩阵乘法算子计算速度较慢",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":29},"h2",{"id":28},"_1-系统环境",[30],{"type":24,"value":31},"1 系统环境",{"type":18,"tag":33,"props":34,"children":35},"p",{},[36],{"type":24,"value":37},"硬件环境(Ascend/GPU/CPU): Ascend/GPU/CPU",{"type":18,"tag":33,"props":39,"children":40},{},[41],{"type":24,"value":42},"MindSpore版本: 不限版本",{"type":18,"tag":33,"props":44,"children":45},{},[46],{"type":24,"value":47},"执行模式（PyNative/ Graph）: 不限模式",{"type":18,"tag":33,"props":49,"children":50},{},[51],{"type":24,"value":52},"Python版本: Python=3.7/3.8/3.9",{"type":18,"tag":33,"props":54,"children":55},{},[56],{"type":24,"value":57},"操作系统平台: 不限",{"type":18,"tag":26,"props":59,"children":61},{"id":60},"_2-报错信息",[62],{"type":24,"value":63},"2 报错信息",{"type":18,"tag":65,"props":66,"children":68},"h3",{"id":67},"_21-问题描述",[69],{"type":24,"value":70},"2.1 问题描述",{"type":18,"tag":33,"props":72,"children":73},{},[74],{"type":24,"value":75},"在使用 MindSpore 训练网络时，发现网络推理的时间过长。",{"type":18,"tag":26,"props":77,"children":79},{"id":78},"_22-报错信息",[80],{"type":24,"value":81},"2.2 
报错信息",{"type":18,"tag":33,"props":83,"children":84},{},[85],{"type":24,"value":9},{"type":18,"tag":65,"props":87,"children":89},{"id":88},"_23-脚本代码",[90],{"type":24,"value":91},"2.3 脚本代码",{"type":18,"tag":33,"props":93,"children":94},{},[95],{"type":24,"value":96},"可根据描述自行构造",{"type":18,"tag":26,"props":98,"children":100},{"id":99},"_3-根因分析",[101],{"type":24,"value":102},"3 根因分析",{"type":18,"tag":33,"props":104,"children":105},{},[106],{"type":24,"value":107},"根据报错信息可知，训练速度慢的主要原因在于Matmul 矩阵乘法。对于计算量比较密集的算子，使用float32精度计算会比float16精度计算耗时更多。为了提升速度，节省时间，可以在执行前先转化成float16精度类型，计算结束后再转换回float32精度类型，这样可以加快计算速度。",{"type":18,"tag":109,"props":110,"children":111},"ul",{},[112],{"type":18,"tag":113,"props":114,"children":115},"li",{},[116],{"type":24,"value":117},"自定义代码测试，数值相乘，看运行时间差距",{"type":18,"tag":119,"props":120,"children":122},"pre",{"code":121},"import numpy as np\nimport mindspore.nn as nn\nfrom mindspore.ops import operations as ops\nimport mindspore as ms\nimport time\n\nms.set_context(mode=ms.GRAPH_MODE, device_target=\"GPU\")\n\nclass Net(nn.Cell):\n    def __init__(self):\n        super(Net, self).__init__()\n        self.matmul = ops.MatMul(transpose_b=True)\n\n    def construct(self, x, y):\n        return self.matmul(x, y)\n\nx = ms.Tensor(np.arange(10240*10240).reshape(10240, 10240).astype(np.float32))\ny = ms.Tensor(np.arange(10240*10240).reshape(10240, 10240).astype(np.float32))\n\nnet = Net()\n# print(net(x, y))\n\n# 计时\na = time.time()\noutput = net(x, y)\ntime32 = time.time() - a\n# print(output)\nprint(output.shape)\nprint (time32)\n\nnet2 = Net()\n# 类型转换\nx2 = ms.Tensor(x, dtype=ms.float16)\n# 计时\nb = time.time()\noutput = net(x, y)\ntime16 = time.time() - b\n\n# print(output)\nprint(output.shape)\nprint 
(time16)\n",[123],{"type":18,"tag":124,"props":125,"children":126},"code",{"__ignoreMap":7},[127],{"type":24,"value":121},{"type":18,"tag":109,"props":129,"children":130},{},[131,136],{"type":18,"tag":113,"props":132,"children":133},{},[134],{"type":24,"value":135},"输出结果：可以看出float16要比float32快数倍",{"type":18,"tag":113,"props":137,"children":138},{},[139],{"type":18,"tag":140,"props":141,"children":143},"img",{"alt":7,"src":142},"https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/944/db8/9cb/1cf64b2b1a944db89cbd8cbbfb1d7f4c.20230301005516.38588674814756158929595353011407:50540308074638:2400:4114F75E626FBD16967D3439FC966E5E73DF8E56B4FFE13983B02B43F3055770.png",[],{"type":18,"tag":26,"props":145,"children":147},{"id":146},"_4-解决方案",[148],{"type":24,"value":149},"4 解决方案",{"type":18,"tag":109,"props":151,"children":152},{},[153,158],{"type":18,"tag":113,"props":154,"children":155},{},[156],{"type":24,"value":157},"执行前先转化成float16精度类型，计算结束后再转换回float32精度类型，这样可以加快计算速度。",{"type":18,"tag":113,"props":159,"children":160},{},[161],{"type":24,"value":162},"根据报错信息可知，作者是在使用全连接层时遇到该问题的，因此我们用全连接层运算进行验证，发现数据类型转换后速度提升约50倍。",{"type":18,"tag":119,"props":164,"children":166},{"code":165},"import numpy as np\nimport mindspore.nn as nn\nfrom mindspore.ops import operations as ops\nimport mindspore as ms\nimport time\n\nms.set_context(mode=ms.GRAPH_MODE, device_target=\"GPU\")\n\n\nx = ms.Tensor(np.arange(10240*10240).reshape(10240, 10240).astype(np.float32))\n\nnet = nn.Dense(10240, 60)\n# 计时\na = time.time()\noutput = net(x)\ntime32 = time.time() - a\n# print(output)\nprint(output.shape)\nprint (time32)\n\nnet2 = nn.Dense(10240, 60)\n# 类型转换\nx2 = ms.Tensor(x, dtype=ms.float16)\n# 计时\nb = time.time()\noutput = net(x)\ntime16 = time.time() - b\n\n# print(output)\nprint(output.shape)\nprint 
(time16)\n",[167],{"type":18,"tag":124,"props":168,"children":169},{"__ignoreMap":7},[170],{"type":24,"value":165},{"type":18,"tag":33,"props":172,"children":173},{},[174],{"type":18,"tag":140,"props":175,"children":177},{"alt":7,"src":176},"https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/944/db8/9cb/1cf64b2b1a944db89cbd8cbbfb1d7f4c.20230301005814.24498924698832328830253116961931:50540308074638:2400:0681711F6CD91F6E938227D26D8F62C76D63D2398CF1F55F7D92D6453A330E1B.png",[],{"type":18,"tag":109,"props":179,"children":180},{},[181],{"type":18,"tag":113,"props":182,"children":183},{},[184],{"type":24,"value":185},"提速将近2.15/0.04 = 50 倍",{"type":18,"tag":33,"props":187,"children":188},{},[189],{"type":24,"value":190},"参考官方文档",{"type":18,"tag":109,"props":192,"children":193},{},[194,205],{"type":18,"tag":113,"props":195,"children":196},{},[197],{"type":18,"tag":198,"props":199,"children":203},"a",{"href":200,"rel":201},"https://www.mindspore.cn/docs/zh-CN/r2.0.0-alpha/api_python/mindspore/mindspore.dtype.html#mindspore.dtype",[202],"nofollow",[204],{"type":24,"value":200},{"type":18,"tag":113,"props":206,"children":207},{},[208],{"type":18,"tag":198,"props":209,"children":212},{"href":210,"rel":211},"https://www.mindspore.cn/docs/zh-CN/r2.0.0-alpha/api_python/nn/mindspore.nn.Dense.html#mindspore.nn.Dense",[202],[213],{"type":24,"value":210},{"title":7,"searchDepth":215,"depth":215,"links":216},4,[217,219,223,226,227],{"id":28,"depth":218,"text":31},2,{"id":60,"depth":218,"text":63,"children":220},[221],{"id":67,"depth":222,"text":70},3,{"id":78,"depth":218,"text":81,"children":224},[225],{"id":88,"depth":222,"text":91},{"id":99,"depth":218,"text":102},{"id":146,"depth":218,"text":149},"markdown","content:technology-blogs:zh:2189.md","content","technology-blogs/zh/2189.md","technology-blogs/zh/2189","md",1776506120759]