[{"data":1,"prerenderedAt":810},["ShallowReactive",2],{"content-query-6JwRJPL4x0":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":804,"_id":805,"_source":806,"_file":807,"_stem":808,"_extension":809},"/technology-blogs/zh/1697","zh",false,"","【MindSpore易点通】如何根据profiler数据查看性能瓶颈","使用混合精度可以加速训练，减少前反向时间。","2022-08-12","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/08/15/4aca50d157fd482a8e2f978097cc3d3c.png","technology-blogs","实践",{"type":15,"children":16,"toc":784},"root",[17,25,42,59,64,73,83,91,101,109,119,129,137,147,155,165,170,178,192,208,213,221,236,241,249,254,262,267,272,277,292,297,302,307,312,316,321,326,331,336,341,349,359,374,379,383,391,396,401,406,411,416,421,426,431,436,444,448,452,456,460,464,469,474,478,485,490,494,499,503,507,512,516,523,527,531,535,539,543,548,556,565,570,585,590,598,612,622,630,635,650,655,660,665,670,678,683,691,700,715,720,728,743,747,752,757,762,770,779],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore易点通如何根据profiler数据查看性能瓶颈",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":29},"h2",{"id":28},"mindinsight主要功能介绍",[30],{"type":18,"tag":31,"props":32,"children":33},"strong",{},[34],{"type":18,"tag":31,"props":35,"children":36},{},[37],{"type":18,"tag":31,"props":38,"children":39},{},[40],{"type":24,"value":41},"MindInsight主要功能介绍",{"type":18,"tag":43,"props":44,"children":45},"p",{},[46,48,57],{"type":24,"value":47},"MindInsight详细功能介绍请参考",{"type":18,"tag":49,"props":50,"children":54},"a",{"href":51,"rel":52},"https://www.mindspore.cn/mindinsight/docs/zh-CN/master/performance_profiling_ascend.html",[53],"nofollow",[55],{"type":24,"value":56},"MindInsight性能调试",{"type":24,"value":58},"。",{"type":18,"tag":43,"props":60,"children":61},{},[62],{"type":24,"value":63},"MindInsight主页面包括主要功能的概览信息：迭代轨迹、算子耗时统计、数据准备、时间线、调优助手。",{"type":18,"tag":43,"props":65,"childre
n":66},{},[67],{"type":18,"tag":68,"props":69,"children":72},"img",{"alt":70,"src":71},"cke_20000.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143435.91317730700584046258812815609310:20220815004352:2400:C9D94F69EC399F3B4C3D18D9FF817B90D9F8970CF0553315CD19EB9EFEA02C27.png",[],{"type":18,"tag":43,"props":74,"children":75},{},[76,81],{"type":18,"tag":31,"props":77,"children":78},{},[79],{"type":24,"value":80},"迭代轨迹",{"type":24,"value":82},"：展示了每个迭代各个阶段的性能信息：包括迭代间隙、前向反向、迭代拖尾以及每个all_reduce的信息。下方展示各个阶段的变化趋势。",{"type":18,"tag":43,"props":84,"children":85},{},[86],{"type":18,"tag":68,"props":87,"children":90},{"alt":88,"src":89},"cke_44240.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143500.40198466978772743189894332172212:20220815004352:2400:0C680FEDB21B502D4A95C3B34B773A23F3FB551B44D0B0DDAD31BF3E57F27860.png",[],{"type":18,"tag":43,"props":92,"children":93},{},[94,99],{"type":18,"tag":31,"props":95,"children":96},{},[97],{"type":24,"value":98},"算子耗时统计",{"type":24,"value":100},"：通过类型、详细信息两个维度展示AICORE和AICPU算子耗时统计信息。",{"type":18,"tag":43,"props":102,"children":103},{},[104],{"type":18,"tag":68,"props":105,"children":108},{"alt":106,"src":107},"cke_61975.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143524.87913704007323387430258578445691:20220815004352:2400:4801FE240697D044A8A269901277933D64503AD96AB742BA462497DD4ABBFED5.png",[],{"type":18,"tag":43,"props":110,"children":111},{},[112,117],{"type":18,"tag":31,"props":113,"children":114},{},[115],{"type":24,"value":116},"数据准备",{"type":24,"value":118},"：性能分析分为两部分：1、迭代间隙数据处理分析；2、数据处理pipeline分析。",{"type":18,"tag":43,"props":120,"children":121},{},[122,127],{"type":18,"tag":31,"props":123,"children":124},{},[125],{"type":24,"value":126},"迭代间隙阶段",{"type":24,"value":128},"：下图展示了迭代
间隙阶段执行的操作的流程，通过分析队列中数据的情况判断出现性能问题的步骤。",{"type":18,"tag":43,"props":130,"children":131},{},[132],{"type":18,"tag":68,"props":133,"children":136},{"alt":134,"src":135},"cke_86296.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143547.68607015532129078076391802076618:20220815004352:2400:0B2D26F228A4BE906DAE41D988149706B6DCD9CB1D36B2EA8C9823FDA621FDBF.png",[],{"type":18,"tag":43,"props":138,"children":139},{},[140,145],{"type":18,"tag":31,"props":141,"children":142},{},[143],{"type":24,"value":144},"数据处理阶段",{"type":24,"value":146},"：分析用于已经定位到了该阶段的问题时，定位其中哪个算子存在问题，通过各个算子之间的队列的使用率，判断前面的算子能否提供足够的数据到队列中供下一个算子使用。",{"type":18,"tag":43,"props":148,"children":149},{},[150],{"type":18,"tag":68,"props":151,"children":154},{"alt":152,"src":153},"cke_112932.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143623.28087117052047031477589199811343:20220815004352:2400:EAEFFF837FEFDF283C4EF417D82A1C08E7D107A81F28E8EBB7075D999A210B49.png",[],{"type":18,"tag":43,"props":156,"children":157},{},[158,163],{"type":18,"tag":31,"props":159,"children":160},{},[161],{"type":24,"value":162},"时间线",{"type":24,"value":164},"：展示了算子在各个stream上的起止时刻，执行顺序、算子间隙、allreduce信息，从详细的粒度展示算子执行情况。",{"type":18,"tag":43,"props":166,"children":167},{},[168],{"type":24,"value":169},"用户可参考下图，进行算子分析（常用的为MindData阶段分析和算子性能分析）。",{"type":18,"tag":43,"props":171,"children":172},{},[173],{"type":18,"tag":68,"props":174,"children":177},{"alt":175,"src":176},"cke_137408.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143649.63619260614944567356738054808609:20220815004352:2400:7F2EEA0B5C85AFE16CCEFE12DBD61244CFC66E3C5435E953065EEE59FE4D3914.png",[],{"type":18,"tag":26,"props":179,"children":181},{"id":180},"案例问题分析",[182],{"type":18,"tag":31,"props":183,"children":184},{},[185],{"type":1
8,"tag":31,"props":186,"children":187},{},[188],{"type":18,"tag":31,"props":189,"children":190},{},[191],{"type":24,"value":180},{"type":18,"tag":193,"props":194,"children":196},"h3",{"id":195},"问题1迭代间隙过长",[197],{"type":18,"tag":31,"props":198,"children":199},{},[200],{"type":18,"tag":31,"props":201,"children":202},{},[203],{"type":18,"tag":31,"props":204,"children":205},{},[206],{"type":24,"value":207},"问题1：迭代间隙过长",{"type":18,"tag":43,"props":209,"children":210},{},[211],{"type":24,"value":212},"通过迭代轨迹发现，迭代间隙过长：",{"type":18,"tag":43,"props":214,"children":215},{},[216],{"type":18,"tag":68,"props":217,"children":220},{"alt":218,"src":219},"cke_167495.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143713.20361765391907751849654280105558:20220815004352:2400:38E585A1ED9934A844EEE22C3F44BFB2B20D990C48331006898526184318972C.png",[],{"type":18,"tag":222,"props":223,"children":225},"h4",{"id":224},"问题分析",[226],{"type":18,"tag":31,"props":227,"children":228},{},[229],{"type":18,"tag":31,"props":230,"children":231},{},[232],{"type":18,"tag":31,"props":233,"children":234},{},[235],{"type":24,"value":224},{"type":18,"tag":43,"props":237,"children":238},{},[239],{"type":24,"value":240},"1.迭代间隙过长，通常因为数据处理过程导致，进入数据处理阶段分析。",{"type":18,"tag":43,"props":242,"children":243},{},[244],{"type":18,"tag":68,"props":245,"children":248},{"alt":246,"src":247},"cke_184283.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143732.08311953867599775084551019771100:20220815004352:2400:6C9D959DB6995060B98B9C56917CBDC42CDB7C06F70FB90836C444B50982A8D6.png",[],{"type":18,"tag":43,"props":250,"children":251},{},[252],{"type":24,"value":253},"2.主机队列几乎为空，判定是数据处理算子问题，进入数据处理pipeline查看具体问题。",{"type":18,"tag":43,"props":255,"children":256},{},[257],{"type":18,"tag":68,"props":258,"children":261},{"alt":259,"src":260},"cke_208950.png","https://fil
eserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143749.79716934196880325857405485570167:20220815004352:2400:B4EA16D2C1D846D3EB192F16DC9B50A2BB03EE8BEE17C690EE291CFED22876B4.png",[],{"type":18,"tag":43,"props":263,"children":264},{},[265],{"type":24,"value":266},"3.当算子左边连接的Queue使用率都比较高，右边连接的Queue使用率比较低时，该算子可能是性能瓶颈：图中红框内数据可以看出，map操作过程中的队列使用率高，而右边连接的队列使用率较低，判断map中的数据处理过程存在性能瓶颈。",{"type":18,"tag":43,"props":268,"children":269},{},[270],{"type":24,"value":271},"4.分析map中数据处理相关代码：发现数据处理进程数为默认值1，可以尝试调整数据处理进程数。",{"type":18,"tag":43,"props":273,"children":274},{},[275],{"type":24,"value":276},"5.分析map中数据处理相关代码：发现存在c_transform和py_transform混用的问题，降低了训练性能。",{"type":18,"tag":222,"props":278,"children":280},{"id":279},"措施1调整数据处理进程数",[281],{"type":18,"tag":31,"props":282,"children":283},{},[284],{"type":18,"tag":31,"props":285,"children":286},{},[287],{"type":18,"tag":31,"props":288,"children":289},{},[290],{"type":24,"value":291},"措施1：调整数据处理进程数",{"type":18,"tag":43,"props":293,"children":294},{},[295],{"type":24,"value":296},"调整数据处理进程数为8：",{"type":18,"tag":43,"props":298,"children":299},{},[300],{"type":24,"value":301},"if do_train:",{"type":18,"tag":43,"props":303,"children":304},{},[305],{"type":24,"value":306},"cifar_ds = ds.Cifar10Dataset(dataset_dir=data_home,",{"type":18,"tag":43,"props":308,"children":309},{},[310],{"type":24,"value":311},"num_parallel_workers=8, shuffle=True, usage='train')else:",{"type":18,"tag":43,"props":313,"children":314},{},[315],{"type":24,"value":306},{"type":18,"tag":43,"props":317,"children":318},{},[319],{"type":24,"value":320},"num_parallel_workers=8, shuffle=False, usage='test')",{"type":18,"tag":43,"props":322,"children":323},{},[324],{"type":24,"value":325},"cifar_ds = cifar_ds.map(operations=transform_label, num_parallel_workers=8, python_multiprocessing=True, input_columns=\"label\")",{"type":18,"tag":43,"props":327,"children":328},{},[329],{"type":24,"value":330},"cifar_ds 
= cifar_ds.map(operations=transform_data, num_parallel_workers=8, python_multiprocessing=True, input_columns=\"image\")",{"type":18,"tag":43,"props":332,"children":333},{},[334],{"type":24,"value":335},"cifar_ds = cifar_ds.batch(batch_size, num_parallel_workers=8, drop_remainder=True)",{"type":18,"tag":43,"props":337,"children":338},{},[339],{"type":24,"value":340},"修改后，重新训练，将训练后代码继续做profiling，发现迭代间隙明显缩短。",{"type":18,"tag":43,"props":342,"children":343},{},[344],{"type":18,"tag":68,"props":345,"children":348},{"alt":346,"src":347},"cke_237040.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143810.37835734922079862752171799501033:20220815004352:2400:5B97EDCF362F747EC1377AFF5F2637FBC66B2E4331D487E5C48657883543BA52.png",[],{"type":18,"tag":43,"props":350,"children":351},{},[352,357],{"type":18,"tag":31,"props":353,"children":354},{},[355],{"type":24,"value":356},"性能对比",{"type":24,"value":358},": 改进前：1100imgs/sec 改进后: 2150imgs/sec",{"type":18,"tag":222,"props":360,"children":362},{"id":361},"措施2避免c_transform和py_transform混用",[363],{"type":18,"tag":31,"props":364,"children":365},{},[366],{"type":18,"tag":31,"props":367,"children":368},{},[369],{"type":18,"tag":31,"props":370,"children":371},{},[372],{"type":24,"value":373},"措施2：避免c_transform和py_transform混用",{"type":18,"tag":43,"props":375,"children":376},{},[377],{"type":24,"value":378},"数据处理过程中发现存在c_transform和py_transform混用的情况：",{"type":18,"tag":43,"props":380,"children":381},{},[382],{"type":24,"value":301},{"type":18,"tag":43,"props":384,"children":385},{},[386],{"type":18,"tag":31,"props":387,"children":388},{},[389],{"type":24,"value":390},"# Transformation on train data",{"type":18,"tag":43,"props":392,"children":393},{},[394],{"type":24,"value":395},"transform_data = py_trans.Compose([",{"type":18,"tag":43,"props":397,"children":398},{},[399],{"type":24,"value":400},"CV.RandomCrop((32, 32), (4, 4, 4, 
4)),",{"type":18,"tag":43,"props":402,"children":403},{},[404],{"type":24,"value":405},"py_vision.ToPIL(),",{"type":18,"tag":43,"props":407,"children":408},{},[409],{"type":24,"value":410},"py_vision.RandomHorizontalFlip(),",{"type":18,"tag":43,"props":412,"children":413},{},[414],{"type":24,"value":415},"CV.Rescale(rescale, shift),",{"type":18,"tag":43,"props":417,"children":418},{},[419],{"type":24,"value":420},"CV.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),",{"type":18,"tag":43,"props":422,"children":423},{},[424],{"type":24,"value":425},"CV.HWC2CHW()",{"type":18,"tag":43,"props":427,"children":428},{},[429],{"type":24,"value":430},"])",{"type":18,"tag":43,"props":432,"children":433},{},[434],{"type":24,"value":435},"else:",{"type":18,"tag":43,"props":437,"children":438},{},[439],{"type":18,"tag":31,"props":440,"children":441},{},[442],{"type":24,"value":443},"# Transformation on validation data",{"type":18,"tag":43,"props":445,"children":446},{},[447],{"type":24,"value":395},{"type":18,"tag":43,"props":449,"children":450},{},[451],{"type":24,"value":415},{"type":18,"tag":43,"props":453,"children":454},{},[455],{"type":24,"value":420},{"type":18,"tag":43,"props":457,"children":458},{},[459],{"type":24,"value":425},{"type":18,"tag":43,"props":461,"children":462},{},[463],{"type":24,"value":430},{"type":18,"tag":43,"props":465,"children":466},{},[467],{"type":24,"value":468},"cifar_ds = cifar_ds.map(operations=transform_data, input_columns=\"image\")",{"type":18,"tag":43,"props":470,"children":471},{},[472],{"type":24,"value":473},"将数据处理过程中代码进行如下修改：",{"type":18,"tag":43,"props":475,"children":476},{},[477],{"type":24,"value":301},{"type":18,"tag":43,"props":479,"children":480},{},[481],{"type":18,"tag":31,"props":482,"children":483},{},[484],{"type":24,"value":390},{"type":18,"tag":43,"props":486,"children":487},{},[488],{"type":24,"value":489},"transform_data = 
C.Compose([",{"type":18,"tag":43,"props":491,"children":492},{},[493],{"type":24,"value":400},{"type":18,"tag":43,"props":495,"children":496},{},[497],{"type":24,"value":498},"CV.RandomHorizontalFlip(),",{"type":18,"tag":43,"props":500,"children":501},{},[502],{"type":24,"value":415},{"type":18,"tag":43,"props":504,"children":505},{},[506],{"type":24,"value":420},{"type":18,"tag":43,"props":508,"children":509},{},[510],{"type":24,"value":511},"CV.HWC2CHW()])",{"type":18,"tag":43,"props":513,"children":514},{},[515],{"type":24,"value":435},{"type":18,"tag":43,"props":517,"children":518},{},[519],{"type":18,"tag":31,"props":520,"children":521},{},[522],{"type":24,"value":443},{"type":18,"tag":43,"props":524,"children":525},{},[526],{"type":24,"value":489},{"type":18,"tag":43,"props":528,"children":529},{},[530],{"type":24,"value":415},{"type":18,"tag":43,"props":532,"children":533},{},[534],{"type":24,"value":420},{"type":18,"tag":43,"props":536,"children":537},{},[538],{"type":24,"value":511},{"type":18,"tag":43,"props":540,"children":541},{},[542],{"type":24,"value":468},{"type":18,"tag":43,"props":544,"children":545},{},[546],{"type":24,"value":547},"修改后，重新训练，将训练后代码继续做profiling。迭代间隙有所缩短。",{"type":18,"tag":43,"props":549,"children":550},{},[551],{"type":18,"tag":68,"props":552,"children":555},{"alt":553,"src":554},"cke_261826.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143833.27195400453362362842700134837649:20220815004352:2400:89F853E730C3F66AC2D6773A95C0D87E57FFC3BEA396DE097CEA331B04BBBC2E.png",[],{"type":18,"tag":43,"props":557,"children":558},{},[559,563],{"type":18,"tag":31,"props":560,"children":561},{},[562],{"type":24,"value":356},{"type":24,"value":564},": 改进前：2150imgs/sec 改进后: 
2250imgs/sec",{"type":18,"tag":43,"props":566,"children":567},{},[568],{"type":24,"value":569},"经过这两步优化，迭代间隙由原来的77.5027ms减少到17.0623ms，有明显改善。",{"type":18,"tag":193,"props":571,"children":573},{"id":572},"问题2数据队列为空",[574],{"type":18,"tag":31,"props":575,"children":576},{},[577],{"type":18,"tag":31,"props":578,"children":579},{},[580],{"type":18,"tag":31,"props":581,"children":582},{},[583],{"type":24,"value":584},"问题2：数据队列为空",{"type":18,"tag":43,"props":586,"children":587},{},[588],{"type":24,"value":589},"继续分析优化后的Profiler数据，发现数据准备中的主机队列几乎为空：",{"type":18,"tag":43,"props":591,"children":592},{},[593],{"type":18,"tag":68,"props":594,"children":597},{"alt":595,"src":596},"cke_283298.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143852.98629754705506281942180173352173:20220815004352:2400:FDEA4A59D7EA62A137B07D420BEE2246A8120492B1CA9C24F458B455C7AE4691.png",[],{"type":18,"tag":222,"props":599,"children":601},{"id":600},"问题分析-1",[602],{"type":18,"tag":31,"props":603,"children":604},{},[605],{"type":18,"tag":31,"props":606,"children":607},{},[608],{"type":18,"tag":31,"props":609,"children":610},{},[611],{"type":24,"value":224},{"type":18,"tag":613,"props":614,"children":615},"ol",{},[616],{"type":18,"tag":617,"props":618,"children":619},"li",{},[620],{"type":24,"value":621},"进入数据处理pipeline查看具体问题。",{"type":18,"tag":43,"props":623,"children":624},{},[625],{"type":18,"tag":68,"props":626,"children":629},{"alt":627,"src":628},"cke_308197.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143910.01294800351039354063719378388458:20220815004352:2400:3C2B136EE0AA83CD0B23936908DB4DD14D6D47DBCE48B5805CBED8ACF48BCAA4.png",[],{"type":18,"tag":43,"props":631,"children":632},{},[633],{"type":24,"value":634},"2.如红框所示，数据处理过程中的队列使用率高。对于最右侧的算子，如果其左边所有Queue的使用率都比较高，该算子可能是性能瓶颈。而此处最右侧算子为框架自动插入算子，因此判断数据下发存在性能瓶颈，应提升数据从Host传输到Device的速度。
",{"type":18,"tag":222,"props":636,"children":638},{"id":637},"措施3采用数据下沉模式",[639],{"type":18,"tag":31,"props":640,"children":641},{},[642],{"type":18,"tag":31,"props":643,"children":644},{},[645],{"type":18,"tag":31,"props":646,"children":647},{},[648],{"type":24,"value":649},"措施3：采用数据下沉模式",{"type":18,"tag":43,"props":651,"children":652},{},[653],{"type":24,"value":654},"采用数据下沉模式，实现整图下沉到Device执行，避免Host-Device频繁交互，减小了数据传输开销。",{"type":18,"tag":43,"props":656,"children":657},{},[658],{"type":24,"value":659},"将Model.train接口中dataset_sink_mode值设为True，即可采用数据下沉模式。",{"type":18,"tag":43,"props":661,"children":662},{},[663],{"type":24,"value":664},"model.train(..., dataset_sink_mode=True, sink_size=steps_per_epoch_train)",{"type":18,"tag":43,"props":666,"children":667},{},[668],{"type":24,"value":669},"修改后，重新训练，将训练后代码继续做profiling，发现主机队列为空比例大大下降。",{"type":18,"tag":43,"props":671,"children":672},{},[673],{"type":18,"tag":68,"props":674,"children":677},{"alt":675,"src":676},"cke_334361.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143935.56039004543698328302863431793748:20220815004352:2400:189B9ED1A3752F0E09278C92DE003508671E70D828692C6BAAB0B83BA7F4A8ED.png",[],{"type":18,"tag":43,"props":679,"children":680},{},[681],{"type":24,"value":682},"数据处理过程中的队列使用率明显降低。",{"type":18,"tag":43,"props":684,"children":685},{},[686],{"type":18,"tag":68,"props":687,"children":690},{"alt":688,"src":689},"cke_358276.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814143952.53246642977906887619869351037934:20220815004352:2400:663E5C4A772A443AB085174D42A351601315B538029F2A4E42A2988BA67727A7.png",[],{"type":18,"tag":43,"props":692,"children":693},{},[694,698],{"type":18,"tag":31,"props":695,"children":696},{},[697],{"type":24,"value":356},{"type":24,"value":699},": 改进前：2250imgs/sec 改进后: 
2350imgs/sec",{"type":18,"tag":193,"props":701,"children":703},{"id":702},"问题3前向反向时间较长",[704],{"type":18,"tag":31,"props":705,"children":706},{},[707],{"type":18,"tag":31,"props":708,"children":709},{},[710],{"type":18,"tag":31,"props":711,"children":712},{},[713],{"type":24,"value":714},"问题3：前向+反向时间较长",{"type":18,"tag":43,"props":716,"children":717},{},[718],{"type":24,"value":719},"继续分析优化后的Profiler数据，由迭代轨迹看出，前反向时间相对较长，可能存在优化空间：",{"type":18,"tag":43,"props":721,"children":722},{},[723],{"type":18,"tag":68,"props":724,"children":727},{"alt":725,"src":726},"cke_381099.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814144007.31151968025568271699382697281509:20220815004352:2400:14FA39B3780FE13BFA1A4F073F3B7C5BCDC5EC97FED08D22337EC1342633A47A.png",[],{"type":18,"tag":222,"props":729,"children":731},{"id":730},"措施4使用混合精度",[732],{"type":18,"tag":31,"props":733,"children":734},{},[735],{"type":18,"tag":31,"props":736,"children":737},{},[738],{"type":18,"tag":31,"props":739,"children":740},{},[741],{"type":24,"value":742},"措施4：使用混合精度",{"type":18,"tag":43,"props":744,"children":745},{},[746],{"type":24,"value":9},{"type":18,"tag":43,"props":748,"children":749},{},[750],{"type":24,"value":751},"修改高阶API代码中的Model接口，将amp_level设置成\"O3\"，网络将采用FP16进行训练。",{"type":18,"tag":43,"props":753,"children":754},{},[755],{"type":24,"value":756},"net = Model(net, loss, opt, metrics=metrics, 
amp_level=\"O3\")",{"type":18,"tag":43,"props":758,"children":759},{},[760],{"type":24,"value":761},"修改后，重新训练，将训练后代码继续做profiling。发现前反向时间明显减少。",{"type":18,"tag":43,"props":763,"children":764},{},[765],{"type":18,"tag":68,"props":766,"children":769},{"alt":767,"src":768},"cke_406270.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/5e4/e02/8f7/550440a1fe5e4e028f77e5cf18005adc.20220814144029.73783835158267462249677263915612:20220815004352:2400:76654B9CAE7E22D8D59D124BA6E71AAB98E6AA01AB08BB885FFA19EA956F7BF3.png",[],{"type":18,"tag":43,"props":771,"children":772},{},[773,777],{"type":18,"tag":31,"props":774,"children":775},{},[776],{"type":24,"value":356},{"type":24,"value":778},": 改进前：2350imgs/sec 改进后: 3500imgs/sec",{"type":18,"tag":43,"props":780,"children":781},{},[782],{"type":24,"value":783},"经过以上优化，训练性能得到明显提升。由初始的1100imgs/sec，改进到3500imgs/sec。",{"title":7,"searchDepth":785,"depth":785,"links":786},4,[787,789],{"id":28,"depth":788,"text":41},2,{"id":180,"depth":788,"text":180,"children":790},[791,797,801],{"id":195,"depth":792,"text":207,"children":793},3,[794,795,796],{"id":224,"depth":785,"text":224},{"id":279,"depth":785,"text":291},{"id":361,"depth":785,"text":373},{"id":572,"depth":792,"text":584,"children":798},[799,800],{"id":600,"depth":785,"text":224},{"id":637,"depth":785,"text":649},{"id":702,"depth":792,"text":714,"children":802},[803],{"id":730,"depth":785,"text":742},"markdown","content:technology-blogs:zh:1697.md","content","technology-blogs/zh/1697.md","technology-blogs/zh/1697","md",1776506115136]