[{"data":1,"prerenderedAt":210},["ShallowReactive",2],{"content-query-tVzyyZFDX3":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"body":13,"_type":204,"_id":205,"_source":206,"_file":207,"_stem":208,"_extension":209},"/news/en/2596","en",false,"","Implementing Efficient Computation of Matmul for Accelerated MindSpore Network Inference","The Profiler analysis shows that most of the computation time is spent on the Matmul matrix multiplication at the fully-connected layer. Optimization is required.","2023-03-01","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/07/03/1e5134a685e945478ec039d86d82f1ec.png","news",{"type":14,"children":15,"toc":201},"root",[16,24,34,39,44,49,54,59,67,72,77,82,87,92,97,105,110,115,125,130,138,146,151,156,164,171,176,181,192],{"type":17,"tag":18,"props":19,"children":21},"element","h1",{"id":20},"implementing-efficient-computation-of-matmul-for-accelerated-mindspore-network-inference",[22],{"type":23,"value":8},"text",{"type":17,"tag":25,"props":26,"children":27},"p",{},[28],{"type":17,"tag":29,"props":30,"children":31},"strong",{},[32],{"type":23,"value":33},"1 System Environment",{"type":17,"tag":25,"props":35,"children":36},{},[37],{"type":23,"value":38},"Hardware environment: Ascend/GPU/CPU",{"type":17,"tag":25,"props":40,"children":41},{},[42],{"type":23,"value":43},"MindSpore version: any version",{"type":17,"tag":25,"props":45,"children":46},{},[47],{"type":23,"value":48},"Execution mode (PyNative/Graph): any mode",{"type":17,"tag":25,"props":50,"children":51},{},[52],{"type":23,"value":53},"Python version: 3.7/3.8/3.9",{"type":17,"tag":25,"props":55,"children":56},{},[57],{"type":23,"value":58},"OS platform: any OS",{"type":17,"tag":25,"props":60,"children":61},{},[62],{"type":17,"tag":29,"props":63,"children":64},{},[65],{"type":23,"value":66},"2 Error Information",{"type":17,"tag":25,"props":68,"children":69},{},[70],{"type":23,"value":71},"2.1 Error Description",{"type":17,"tag":25,"props":73,"children":74},{},[75],{"type":23,"value":76},"When training a network using MindSpore, it is found that the network inference time is relatively long and requires optimization.",{"type":17,"tag":25,"props":78,"children":79},{},[80],{"type":23,"value":81},"2.2 Error Message",{"type":17,"tag":25,"props":83,"children":84},{},[85],{"type":23,"value":86},"The Profiler analysis shows that most of the time is spent on the Matmul matrix multiplication at the fully-connected layer.",{"type":17,"tag":25,"props":88,"children":89},{},[90],{"type":23,"value":91},"2.3 Script Code",{"type":17,"tag":25,"props":93,"children":94},{},[95],{"type":23,"value":96},"Construct the code based on the description.",{"type":17,"tag":25,"props":98,"children":99},{},[100],{"type":17,"tag":29,"props":101,"children":102},{},[103],{"type":23,"value":104},"3 Root Cause Analysis",{"type":17,"tag":25,"props":106,"children":107},{},[108],{"type":23,"value":109},"According to the error message, the primary cause for slow training is the Matmul matrix multiplication. When dealing with computationally intensive operators, using float32 precision takes longer than using float16 precision. To speed up computations and save time, you can convert the precision type to float16 before performing the computation, and then convert it back to float32 once the computation is complete.",{"type":17,"tag":25,"props":111,"children":112},{},[113],{"type":23,"value":114},"Test the custom code and multiply the values to check the running time difference.",{"type":17,"tag":116,"props":117,"children":119},"pre",{"code":118},"import numpy as np\nimport mindspore.nn as nn\nfrom mindspore.ops import operations as ops\nimport mindspore as ms\nimport time\n\nms.set_context(mode=ms.GRAPH_MODE, device_target=\"GPU\")\n\nclass Net(nn.Cell):\n    def __init__(self):\n        super(Net, self).__init__()\n        self.matmul = ops.MatMul(transpose_b=True)\n\n    def construct(self, x, y):\n        return self.matmul(x, y)\n\nx = ms.Tensor(np.arange(10240*10240).reshape(10240, 10240).astype(np.float32))\ny = ms.Tensor(np.arange(10240*10240).reshape(10240, 10240).astype(np.float32))\n\nnet = Net()\n# print(net(x, y))\n\n# Timing\na = time.time()\noutput = net(x, y)\ntime32 = time.time() - a\n# print(output)\nprint(output.shape)\nprint (time32)\n\nnet2 = Net()\n# Type conversion\nx2 = ms.Tensor(x, dtype=ms.float16)\ny2 = ms.Tensor(y, dtype=ms.float16)\n# Timing\nb = time.time()\noutput = net2(x2, y2)\ntime16 = time.time() - b\n\n# print(output)\nprint(output.shape)\nprint (time16)\n",[120],{"type":17,"tag":121,"props":122,"children":123},"code",{"__ignoreMap":7},[124],{"type":23,"value":118},{"type":17,"tag":25,"props":126,"children":127},{},[128],{"type":23,"value":129},"The output shows that using float16 is several times faster than using float32.",{"type":17,"tag":25,"props":131,"children":132},{},[133],{"type":17,"tag":134,"props":135,"children":137},"img",{"alt":7,"src":136},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/07/03/21b803ad958f40e0bb62e2000167c18f.png",[],{"type":17,"tag":25,"props":139,"children":140},{},[141],{"type":17,"tag":29,"props":142,"children":143},{},[144],{"type":23,"value":145},"4 Solution",{"type":17,"tag":25,"props":147,"children":148},{},[149],{"type":23,"value":150},"Convert the precision type to float16 before performing the computation, and then convert it back to float32 once the computation is complete. This accelerates the computation.",{"type":17,"tag":25,"props":152,"children":153},{},[154],{"type":23,"value":155},"According to the error message, we can locate the error at the fully-connected layer. After conducting computation tests at this layer, we can find that the computation speed is improved by approximately 50 times following the conversion of data types.",{"type":17,"tag":116,"props":157,"children":159},{"code":158},"import numpy as np\nimport mindspore.nn as nn\nfrom mindspore.ops import operations as ops\nimport mindspore as ms\nimport time\n\nms.set_context(mode=ms.GRAPH_MODE, device_target=\"GPU\")\n\n\nx = ms.Tensor(np.arange(10240*10240).reshape(10240, 10240).astype(np.float32))\n\nnet = nn.Dense(10240, 60)\n# Timing\na = time.time()\noutput = net(x)\ntime32 = time.time() - a\n# print(output)\nprint(output.shape)\nprint (time32)\n\nnet2 = nn.Dense(10240, 60)\n# Type conversion\nx2 = ms.Tensor(x, dtype=ms.float16)\n# Timing\nb = time.time()\noutput = net2(x2)\ntime16 = time.time() - b\n\n# print(output)\nprint(output.shape)\nprint (time16)\n",[160],{"type":17,"tag":121,"props":161,"children":162},{"__ignoreMap":7},[163],{"type":23,"value":158},{"type":17,"tag":25,"props":165,"children":166},{},[167],{"type":17,"tag":134,"props":168,"children":170},{"alt":7,"src":169},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/07/03/ae3f284d85384ce5b36a63599fe0df69.png",[],{"type":17,"tag":25,"props":172,"children":173},{},[174],{"type":23,"value":175},"Acceleration: nearly 2.15/0.04 = 50 times",{"type":17,"tag":25,"props":177,"children":178},{},[179],{"type":23,"value":180},"For details, see the official documents.",{"type":17,"tag":25,"props":182,"children":183},{},[184],{"type":17,"tag":185,"props":186,"children":190},"a",{"href":187,"rel":188},"https://www.mindspore.cn/docs/en/r2.0.0-alpha/api_python/mindspore.html",[189],"nofollow",[191],{"type":23,"value":187},{"type":17,"tag":25,"props":193,"children":194},{},[195],{"type":17,"tag":185,"props":196,"children":199},{"href":197,"rel":198},"https://www.mindspore.cn/docs/en/r2.0.0-alpha/api_python/nn/mindspore.nn.Dense.html",[189],[200],{"type":23,"value":197},{"title":7,"searchDepth":202,"depth":202,"links":203},4,[],"markdown","content:news:en:2596.md","content","news/en/2596.md","news/en/2596","md",1776506045126]