[{"data":1,"prerenderedAt":899},["ShallowReactive",2],{"content-query-Pp0OkQVVyN":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":893,"_id":894,"_source":895,"_file":896,"_stem":897,"_extension":898},"/technology-blogs/zh/399","zh",false,"","MindSpore大V博文之架构系列（5）：AI框架中数据处理的挑战与解决思路","数据处理其实在AI的训练和推理中占了很大的比重，但是业界在这一块的分析比较少，本文期望通过MindSpore实践给大家一些参考。","2021-02-24","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/02/24/68de39ac1eed47528c980012e9eb5bbd.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":878},"root",[17,25,31,44,55,60,65,75,79,86,91,99,104,109,114,119,124,129,136,141,146,151,157,162,167,173,178,192,197,202,213,218,223,230,235,241,252,257,268,273,278,283,288,293,298,303,308,313,318,323,328,333,338,344,350,355,362,367,372,382,387,392,397,402,407,413,424,431,436,455,460,465,476,481,488,493,498,503,508,519,524,529,535,540,551,556,561,566,571,582,589,594,605,610,615,622,627,635,642,650,657,665,672,677,688,693,700,713,724,729,736,741,747,758,763,768,775,780,787,798,803,810,815,820,825,830,841,846,857,862,873],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore大v博文之架构系列5ai框架中数据处理的挑战与解决思路",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"作者：金雪锋",{"type":18,"tag":26,"props":32,"children":33},{},[34,36],{"type":24,"value":35},"作者主页：",{"type":18,"tag":37,"props":38,"children":42},"a",{"href":39,"rel":40},"https://www.zhihu.com/people/jin-xue-feng",[41],"nofollow",[43],{"type":24,"value":39},{"type":18,"tag":26,"props":45,"children":46},{},[47,49],{"type":24,"value":48},"文章链接：",{"type":18,"tag":37,"props":50,"children":53},{"href":51,"rel":52},"https://zhuanlan.zhihu.com/p/352487023",[41],[54],{"type":24,"value":51},{"type":18,"tag":26,"props":56,"children":57},{},[58],{"type":24,"value":59},"本文是AI框架分析专栏的第五篇，总体目录参见：",{"type":18,"tag":26,"props
":61,"children":62},{},[63],{"type":24,"value":64},"AI框架的演进趋势和MindSpore的构想：",{"type":18,"tag":26,"props":66,"children":67},{},[68],{"type":18,"tag":37,"props":69,"children":72},{"href":70,"rel":71},"https://zhuanlan.zhihu.com/p/225392622",[41],[73],{"type":24,"value":74},"金雪锋：AI框架的演进趋势和MindSpore的构想",{"type":18,"tag":26,"props":76,"children":77},{},[78],{"type":24,"value":9},{"type":18,"tag":80,"props":81,"children":83},"h2",{"id":82},"ai框架中的数据处理",[84],{"type":24,"value":85},"AI框架中的数据处理",{"type":18,"tag":26,"props":87,"children":88},{},[89],{"type":24,"value":90},"在构建深度学习模型时，数据处理是我们最先面临的挑战。任务开始之前，由于数据量受限，或者为了得到更好的结果，通常需要进行数据增强操作，来获得能使网络受益的数据输入。典型的训练数据处理流程如下图所示：",{"type":18,"tag":26,"props":92,"children":93},{},[94],{"type":18,"tag":95,"props":96,"children":98},"img",{"alt":7,"src":97},"https://pic4.zhimg.com/80/v2-f611e82c6f6fa060e9e25c36c357429b_720w.jpg",[],{"type":18,"tag":26,"props":100,"children":101},{},[102],{"type":24,"value":103},"加载：指从各种异构存储中将训练数据加载到内存中，加载时涉及数据的访问、解析等处理；",{"type":18,"tag":26,"props":105,"children":106},{},[107],{"type":24,"value":108},"Shuffle：训练一般是多个epoch，通过shuffle打乱数据集不同epoch的数据排序，防止训练过拟合。如果数据集支持随机访问，则只需按不同顺序随机选择数据就可以非常有效地进行混洗shuffle。如果数据集不支持随机访问（或仅部分随机访问像多个文件对象），那么一个子集的数据可以加载到一个特殊的混洗缓冲区shuffle 
buffer中。",{"type":18,"tag":26,"props":110,"children":111},{},[112],{"type":24,"value":113},"map：完成训练数据的处理，包括图像类数据增强、Text类分词等处理。其中，数据增强是一种创造有着不同方向的“新”数据的方法，一是从有限数据中生成“更多数据”，二是防止过拟合。",{"type":18,"tag":26,"props":115,"children":116},{},[117],{"type":24,"value":118},"batch：训练时一般都是使用mini-batch的方式，即一个批次训练少量数据；batch算子负责构造一个批次的数据发送给训练；",{"type":18,"tag":26,"props":120,"children":121},{},[122],{"type":24,"value":123},"repeat：可以通过repeat的方式增加训练的总数据量；一次repeat就是加载一遍整个训练集。",{"type":18,"tag":26,"props":125,"children":126},{},[127],{"type":24,"value":128},"模型在进行推理时，同样会涉及到数据处理，典型的过程如下图所示：",{"type":18,"tag":26,"props":130,"children":131},{},[132],{"type":18,"tag":95,"props":133,"children":135},{"alt":7,"src":134},"https://pic4.zhimg.com/80/v2-e2cefbd7bfebf411b42121af8dfd34fb_720w.jpg",[],{"type":18,"tag":26,"props":137,"children":138},{},[139],{"type":24,"value":140},"推理过程的数据处理，将一张图片进行解码、缩放、中心截图、归一化、通道变换等操作后，送入模型中进行推理并得到结果。与训练相比，推理时的数据转换基本一致，不同的是推理时一般加载单样本进行处理，而非数据集。",{"type":18,"tag":26,"props":142,"children":143},{},[144],{"type":24,"value":145},"本文将重点分析AI框架在数据处理时面临的挑战以及MindSpore的解决思路。",{"type":18,"tag":80,"props":147,"children":149},{"id":148},"难点与挑战",[150],{"type":24,"value":148},{"type":18,"tag":80,"props":152,"children":154},{"id":153},"_21-数据处理的高效性",[155],{"type":24,"value":156},"2.1 数据处理的高效性",{"type":18,"tag":26,"props":158,"children":159},{},[160],{"type":24,"value":161},"当前各AI框架的数据处理主要利用CPU运算，训练则利用GPU/AI芯片，两者是并行的。理想情况下，应该在每轮迭代开始前，就准备好完成增强之后的数据，保持训练过程持续地进行。然而在实际的训练中，很多时候数据处理成为了阻碍性能提升的瓶颈：或是因为从存储中读取数据的速度不足（I/O bound），或是因为数据增强操作效率过低（CPU bound）。",{"type":18,"tag":26,"props":163,"children":164},{},[165],{"type":24,"value":166},"根据黄氏定律，GPU/AI芯片的算力每一年会提升一倍，相比于即将失效的摩尔定律，AI芯片的算力提升速度会远大于CPU。模型迭代计算效率的不断提升，对数据处理也提出了更高的要求：数据处理过程必须足够高效，才能够避免GPU/AI芯片因为等待训练数据而空闲。",{"type":18,"tag":80,"props":168,"children":170},{"id":169},"_22-数据处理的灵活性",[171],{"type":24,"value":172},"2.2 
数据处理的灵活性",{"type":18,"tag":26,"props":174,"children":175},{},[176],{"type":24,"value":177},"数据处理的灵活性挑战主要体现在以下两个方面：",{"type":18,"tag":179,"props":180,"children":181},"ul",{},[182],{"type":18,"tag":183,"props":184,"children":185},"li",{},[186],{"type":18,"tag":187,"props":188,"children":189},"strong",{},[190],{"type":24,"value":191},"数据集种类繁多，难以统一",{"type":18,"tag":26,"props":193,"children":194},{},[195],{"type":24,"value":196},"目前已知常用的开源数据集有几百种，每种由不同的组织/机构来产生，有各自的格式与组织方式。根据深度学习任务的不同，每种类型的输入也有自身的特点。以图像为例，其包含类型、长、宽、大小等属性信息，每张训练图像被标记成某一个类别，这些图像及其对应类别的列表数据被用来进行分类等训练；以音频为例，通过训练可以直接将语音转换为文字，进一步作为智能AI的输入，完成语义理解及指令性操作。",{"type":18,"tag":26,"props":198,"children":199},{},[200],{"type":24,"value":201},"在数据集加载时，如何支持种类繁多的图像、文本、音频、视频格式，屏蔽IO差异并将其映射到内存结构中进行下一步处理，是AI框架重点需要解决的问题。",{"type":18,"tag":179,"props":203,"children":204},{},[205],{"type":18,"tag":183,"props":206,"children":207},{},[208],{"type":18,"tag":187,"props":209,"children":210},{},[211],{"type":24,"value":212},"数据增强算法非常灵活，需要框架提供足够易用的接口来支持用户定制数据处理过程",{"type":18,"tag":26,"props":214,"children":215},{},[216],{"type":24,"value":217},"例如CV类场景，图像作为网络的输入，需要保证一定的一致性（如：大小、通道数、归一化等），也需要有一定的泛化能力（如：镜像、旋转、混合、颜色变换等），才能使训练得到的模型具有更好的精度。研究已经证明，采用不同的数据处理逻辑，训练得到的模型精度会有明显的不同。",{"type":18,"tag":26,"props":219,"children":220},{},[221],{"type":24,"value":222},"以resnet50和ssd为例，数据处理过程对比如下：",{"type":18,"tag":26,"props":224,"children":225},{},[226],{"type":18,"tag":95,"props":227,"children":229},{"alt":7,"src":228},"https://pic2.zhimg.com/80/v2-a945efba0558bfa9771e7f9ded9497a1_720w.jpg",[],{"type":18,"tag":26,"props":231,"children":232},{},[233],{"type":24,"value":234},"其中既涉及到经典的数据处理逻辑，又包含了用户自定义的处理过程。AutoML和动态shape等场景，也对数据处理的灵活性提出了更高要求。所以，提供更多、更灵活的数据处理机制，也是框架需要重点考虑的。",{"type":18,"tag":80,"props":236,"children":238},{"id":237},"_23-端云统一",[239],{"type":24,"value":240},"2.3 
端云统一",{"type":18,"tag":179,"props":242,"children":243},{},[244],{"type":18,"tag":183,"props":245,"children":246},{},[247],{"type":18,"tag":187,"props":248,"children":249},{},[250],{"type":24,"value":251},"训练导出的模型，在推理时如何高效进行数据处理：",{"type":18,"tag":26,"props":253,"children":254},{},[255],{"type":24,"value":256},"网络训练生成的模型文件中，记录了训练时的计算图及权重信息，但是数据处理过程往往没有统一存储到模型中，这就导致AI工程师在使用模型进行推理时，需要重新编写数据处理代码，十分不便。",{"type":18,"tag":179,"props":258,"children":259},{},[260],{"type":18,"tag":183,"props":261,"children":262},{},[263],{"type":18,"tag":187,"props":264,"children":265},{},[266],{"type":24,"value":267},"端侧资源受限，需要提供更轻量化的数据处理方式：",{"type":18,"tag":26,"props":269,"children":270},{},[271],{"type":24,"value":272},"在端侧场景下，CPU和内存资源往往比较少，在提供数据处理的能力时，要求API尽可能简单、轻便，以最少的资源占用获得最快的执行效率。同时，需要AI框架提供的库尽可能的小。所以，使用云化场景提供的数据处理机制（启动慢、资源占用大）就不是特别适用。如何在不同的场景下提供最合适的数据处理方法是AI框架面临的挑战。",{"type":18,"tag":26,"props":274,"children":275},{},[276],{"type":24,"value":277},"典型的云侧场景与端侧场景资源对比如下：",{"type":18,"tag":26,"props":279,"children":280},{},[281],{"type":24,"value":282},"资源",{"type":18,"tag":26,"props":284,"children":285},{},[286],{"type":24,"value":287},"云侧",{"type":18,"tag":26,"props":289,"children":290},{},[291],{"type":24,"value":292},"端侧",{"type":18,"tag":26,"props":294,"children":295},{},[296],{"type":24,"value":297},"CPU",{"type":18,"tag":26,"props":299,"children":300},{},[301],{"type":24,"value":302},"100+核",{"type":18,"tag":26,"props":304,"children":305},{},[306],{"type":24,"value":307},"4核-8核",{"type":18,"tag":26,"props":309,"children":310},{},[311],{"type":24,"value":312},"内存",{"type":18,"tag":26,"props":314,"children":315},{},[316],{"type":24,"value":317},"250G-750G",{"type":18,"tag":26,"props":319,"children":320},{},[321],{"type":24,"value":322},"3G-8G",{"type":18,"tag":26,"props":324,"children":325},{},[326],{"type":24,"value":327},"Device",{"type":18,"tag":26,"props":329,"children":330},{},[331],{"type":24,"value":332},"GPU/Ascend910",{"type":18,"tag":26,"props":334,"children":335},{},[336]
,{"type":24,"value":337},"CPU/Ascend310",{"type":18,"tag":80,"props":339,"children":341},{"id":340},"mindspore设计思考",[342],{"type":24,"value":343},"MindSpore设计思考",{"type":18,"tag":80,"props":345,"children":347},{"id":346},"_31-设计目标与思路",[348],{"type":24,"value":349},"3.1 设计目标与思路",{"type":18,"tag":26,"props":351,"children":352},{},[353],{"type":24,"value":354},"MindSpore的设计中，充分考虑了数据处理的高效性、灵活性以及在不同场景下的适配性。整个数据处理子系统分为以下模块：",{"type":18,"tag":26,"props":356,"children":357},{},[358],{"type":18,"tag":95,"props":359,"children":361},{"alt":7,"src":360},"https://pic3.zhimg.com/80/v2-8875652651f78b5ce8ac75c1e8766076_720w.jpg",[],{"type":18,"tag":26,"props":363,"children":364},{},[365],{"type":24,"value":366},"**API：**数据处理过程在MindSpore中以图的形式表示，称为“数据图”。MindSpore对外提供Python API来定义数据图，内部实现图优化和图执行。",{"type":18,"tag":26,"props":368,"children":369},{},[370],{"type":24,"value":371},"整个数据加载和预处理的流程实现为多步并行流水线（data processing pipeline），包括：",{"type":18,"tag":26,"props":373,"children":374},{},[375,380],{"type":18,"tag":187,"props":376,"children":377},{},[378],{"type":24,"value":379},"Adaptor",{"type":24,"value":381},"：将上层语言（如Python）构建的数据图，转换为下层可执行的C++数据图（Execution tree）",{"type":18,"tag":26,"props":383,"children":384},{},[385],{"type":24,"value":386},"**Optimizer：**数据图优化器",{"type":18,"tag":26,"props":388,"children":389},{},[390],{"type":24,"value":391},"**Runtime：**运行优化后Execution tree的执行引擎",{"type":18,"tag":26,"props":393,"children":394},{},[395],{"type":24,"value":396},"**数据集算子（dataset operators）：**Execution tree中的某个节点，对应数据处理流水线中的一步具体操作，比如从文件夹加载训练数据的ImageFolder算子，做各种数据增强的Map算子，Repeat算子等。",{"type":18,"tag":26,"props":398,"children":399},{},[400],{"type":24,"value":401},"**数据增强算子（data augmentation operators）：**也可称为tensor算子，是对某个tensor变换的，比如Decode，Resize，Crop，Pad等，通常是被dataset 
operator中的Map算子调用。",{"type":18,"tag":26,"props":403,"children":404},{},[405],{"type":24,"value":406},"数据增强后的结果，通过队列和前向反向计算系统相连。下面将介绍这套子系统如何达到极致的数据处理性能。",{"type":18,"tag":80,"props":408,"children":410},{"id":409},"_32-极致的处理性能",[411],{"type":24,"value":412},"3.2 极致的处理性能",{"type":18,"tag":179,"props":414,"children":415},{},[416],{"type":18,"tag":183,"props":417,"children":418},{},[419],{"type":18,"tag":187,"props":420,"children":421},{},[422],{"type":24,"value":423},"多段pipeline流水线:",{"type":18,"tag":26,"props":425,"children":426},{},[427],{"type":18,"tag":95,"props":428,"children":430},{"alt":7,"src":429},"https://pic2.zhimg.com/80/v2-8271226637ccc0100142433711e39b11_720w.jpg",[],{"type":18,"tag":26,"props":432,"children":433},{},[434],{"type":24,"value":435},"相比于业界其他框架，MindSpore采用了多段并行流水线（multi-stage parallel pipeline）的方式来构建数据处理pipeline。这种架构一方面可以更加细粒度地规划CPU等计算资源的使用，另一方面天然支持各段使用异构硬件进行流水处理，从而提高数据处理过程的吞吐量。如上图所示，每个数据集算子（inline除外）都包含一个输出Connector：由一组阻塞队列和计数器组成的保序缓冲队列。每当一个数据集算子完成一块缓存数据的处理，这个算子会将这块缓存推送到自身的输出Connector。下游的数据集算子会从上游的输出Connector里取出缓存进行后续处理。这种机制的优势包括：",{"type":18,"tag":437,"props":438,"children":439},"ol",{},[440,445,450],{"type":18,"tag":183,"props":441,"children":442},{},[443],{"type":24,"value":444},"数据集加载、map、batch等操作以任务调度机制来驱动，每个操作的任务互相独立，上下文通过Connector来实现Pipeline；",{"type":18,"tag":183,"props":446,"children":447},{},[448],{"type":24,"value":449},"每个操作均可以实现细粒度的多线程/多进程并行加速。数据框架为用户提供调整算子线程数/多进程处理的接口，可以灵活控制各个节点的处理速度，进而实现整个数据处理Pipeline性能最优；",{"type":18,"tag":183,"props":451,"children":452},{},[453],{"type":24,"value":454},"Connector支持用户对其大小进行设置，在一定程度上可以有效地控制内存的使用率，适用于不同网络对于数据处理速度的要求。",{"type":18,"tag":26,"props":456,"children":457},{},[458],{"type":24,"value":459},"在这种数据处理机制下，对输出数据进行保序处理是保证训练精度的关键。保序意味着数据处理流水线运行时，同样顺序的原始数据输入，需要保证数据处理完成后，得到同样顺序的数据输出。MindSpore采用轮询算法来保证多个线程数据处理的有序性：",{"type":18,"tag":26,"props":461,"children":462},{},[463],{"type":24,"value":464},"在上面的数据处理pipeline示例中，保序操作发生在下游map操作（4并发）的取出操作（单线程执行）中，其以轮询的方式取出上游队列中的数据。Connector内部有两个计数器expect_consumer
_记录已经有多少个consumer从queues_中取了数据，pop_from_记录了哪个内部阻塞队列将要进行下一次取出。expect_consumer_以consumer个数取余，而pop_from_以producer个数取余。在expect_consumer_再次为0时，说明所有的local_queues_已经都处理完上一批任务，然后继续下一批任务分配及处理，进而实现了上游至下游map操作的多并发保序处理。",{"type":18,"tag":179,"props":466,"children":467},{},[468],{"type":18,"tag":183,"props":469,"children":470},{},[471],{"type":18,"tag":187,"props":472,"children":473},{},[474],{"type":24,"value":475},"数据处理与计算过程pipeline",{"type":18,"tag":26,"props":477,"children":478},{},[479],{"type":24,"value":480},"数据pipeline会不断地进行数据处理，并把处理后的数据发送到device侧的缓存；当一个step执行结束后，会直接从device的缓存中读取下一个step的数据。",{"type":18,"tag":26,"props":482,"children":483},{},[484],{"type":18,"tag":95,"props":485,"children":487},{"alt":7,"src":486},"https://pic4.zhimg.com/80/v2-0d65cbdf05f54323b0d91e976788b913_720w.jpg",[],{"type":18,"tag":26,"props":489,"children":490},{},[491],{"type":24,"value":492},"数据处理：负责将数据集处理成网络需要的输入，并传递到发送队列中，保证数据处理的高效性；",{"type":18,"tag":26,"props":494,"children":495},{},[496],{"type":24,"value":497},"发送队列Queue：维护数据队列，保证数据处理与网络计算过程互不影响，实现桥梁的作用；",{"type":18,"tag":26,"props":499,"children":500},{},[501],{"type":24,"value":502},"网络计算：从发送队列中获取数据，用于迭代训练。",{"type":18,"tag":26,"props":504,"children":505},{},[506],{"type":24,"value":507},"以上三者各司其职，相互独立，构筑整个训练过程的Pipeline。这样，只要数据队列不为空，模型训练就不会因为等待训练数据而产生瓶颈。",{"type":18,"tag":179,"props":509,"children":510},{},[511],{"type":18,"tag":183,"props":512,"children":513},{},[514],{"type":18,"tag":187,"props":515,"children":516},{},[517],{"type":24,"value":518},"缓存技术",{"type":18,"tag":26,"props":520,"children":521},{},[522],{"type":24,"value":523},"当数据集的size较大时，无法全部加载到memory 
cache，此时训练中的部分数据需要从磁盘读取，可能会遇到I/O瓶颈，增大每个epoch的cache命中率就显得尤为关键。传统的缓存管理策略采用LRU策略，没有考虑深度学习数据的读取特点：在不同的epoch之间数据是重复读取的，而在同一个epoch中随机读取。每条数据的读取概率都是相同的，因此哪个数据被缓存并不是最重要的，已经cache的数据在被使用之前不被换出更加重要。针对这个特点，我们使用了一个简单高效的缓存算法：数据一旦被缓存，就不会从cache中被换出。",{"type":18,"tag":26,"props":525,"children":526},{},[527],{"type":24,"value":528},"在数据图优化的过程中，会根据流水线结构自动生成缓存算子，既可以缓存原始数据集，也可以缓存数据增强处理后的结果。",{"type":18,"tag":80,"props":530,"children":532},{"id":531},"_33-灵活的定制能力",[533],{"type":24,"value":534},"3.3 灵活的定制能力",{"type":18,"tag":26,"props":536,"children":537},{},[538],{"type":24,"value":539},"整个数据处理pipeline中，用户往往需要特定的处理逻辑，这些处理逻辑有其特殊性，无法固化到框架中。因此，框架需要具备开放的能力，让用户能够订制不同的数据处理逻辑。MindSpore提供了灵活的数据集加载方式、丰富的数据处理算子、自动数据增强、数据动态Shape、数据处理Callback等机制等供开发人员在各种场景使用。",{"type":18,"tag":179,"props":541,"children":542},{},[543],{"type":18,"tag":183,"props":544,"children":545},{},[546],{"type":18,"tag":187,"props":547,"children":548},{},[549],{"type":24,"value":550},"灵活的数据集加载方式",{"type":18,"tag":26,"props":552,"children":553},{},[554],{"type":24,"value":555},"针对数据集种类繁多、格式与组织各异的难题，MindSpore提供了三种不同的数据集加载方式：",{"type":18,"tag":26,"props":557,"children":558},{},[559],{"type":24,"value":560},"1）如果用户使用常用数据集，那么可以使用MindSpore内置的API接口直接进行加载。MindSpore实现了包括CelebADataset、Cifar10Dataset、CocoDataset、ImageFolderDataset、MnistDataset、VOCDataset等数据集的C++ IO Reader加载，在保证性能的同时，实现了数据集的开箱即用。",{"type":18,"tag":26,"props":562,"children":563},{},[564],{"type":24,"value":565},"2）用户将数据集转换为MindSpore数据格式，即MindRecord，然后通过MindSpore的API进行加载。MindRecord可以将不同的数据集格式归一化，有聚合存储、高效读取、快速编解码、灵活控制分区大小等多种优势。",{"type":18,"tag":26,"props":567,"children":568},{},[569],{"type":24,"value":570},"3）如果用户已经有自己数据集的Python读取类，那么可以使用MindSpore的GeneratorDataset API调用该类实现数据集加载。这种方式可以快速集成已有代码，改动最小，但因为是Python IO 
Reader，需要额外关注数据加载性能。",{"type":18,"tag":179,"props":572,"children":573},{},[574],{"type":18,"tag":183,"props":575,"children":576},{},[577],{"type":18,"tag":187,"props":578,"children":579},{},[580],{"type":24,"value":581},"通过Python层自定义、C层插件的方式支持更多算子",{"type":18,"tag":26,"props":583,"children":584},{},[585],{"type":18,"tag":95,"props":586,"children":588},{"alt":7,"src":587},"https://pic2.zhimg.com/80/v2-41c4a059808f76f7e2a0f1f90c55a885_720w.jpg",[],{"type":18,"tag":26,"props":590,"children":591},{},[592],{"type":24,"value":593},"MindSpore内置了丰富的数据处理算子。这些算子可以分为C层以及Python层，C层算子能提供较高的执行性能；Python层算子可以很方便集成第三方包完成数据处理的功能，但是性能较低，好处是易开发易使用。MindSpore支持用户扩展自定义的数据处理算子，用户可以开发C层算子代码，编译后以插件的形式注册到MindSpore的数据处理中进行调用。",{"type":18,"tag":179,"props":595,"children":596},{},[597],{"type":18,"tag":183,"props":598,"children":599},{},[600],{"type":18,"tag":187,"props":601,"children":602},{},[603],{"type":24,"value":604},"支持自动数据增强策略",{"type":18,"tag":26,"props":606,"children":607},{},[608],{"type":24,"value":609},"MindSpore提供了基于特定策略自动对图像进行数据增强处理的机制：通过基于概率或者回调参数的数据增强策略，可以实现算子自动选择执行，达到训练精度提升的目的。",{"type":18,"tag":26,"props":611,"children":612},{},[613],{"type":24,"value":614},"例如对 ImageNet 数据集，自动数据增强最终搜索出的方案包含 25 个子策略组合，每个子策略包含两种变换，针对每幅图像都随机的挑选一个子策略组合，然后以一定的概率来决定是否执行子策略中的每种变换。",{"type":18,"tag":26,"props":616,"children":617},{},[618],{"type":18,"tag":95,"props":619,"children":621},{"alt":7,"src":620},"https://pic2.zhimg.com/80/v2-42a0bece984fc819a14c44961e06b5c9_720w.jpg",[],{"type":18,"tag":26,"props":623,"children":624},{},[625],{"type":24,"value":626},"这些策略包括：",{"type":18,"tag":26,"props":628,"children":629},{},[630],{"type":18,"tag":187,"props":631,"children":632},{},[633],{"type":24,"value":634},"RandomSelectSubpolicy - 
多批概率算子，随机选择其中一批执行",{"type":18,"tag":26,"props":636,"children":637},{},[638],{"type":18,"tag":95,"props":639,"children":641},{"alt":7,"src":640},"https://pic1.zhimg.com/80/v2-202508289823035dd23241252f1f08d4_720w.jpg",[],{"type":18,"tag":26,"props":643,"children":644},{},[645],{"type":18,"tag":187,"props":646,"children":647},{},[648],{"type":24,"value":649},"RandomChoice - 多个算子选择其中一个执行",{"type":18,"tag":26,"props":651,"children":652},{},[653],{"type":18,"tag":95,"props":654,"children":656},{"alt":7,"src":655},"https://pic2.zhimg.com/80/v2-fb7d2f241ae346cb776df97eee6d06a1_720w.jpg",[],{"type":18,"tag":26,"props":658,"children":659},{},[660],{"type":18,"tag":187,"props":661,"children":662},{},[663],{"type":24,"value":664},"RandomApply -基于某个概率执行这批算子",{"type":18,"tag":26,"props":666,"children":667},{},[668],{"type":18,"tag":95,"props":669,"children":671},{"alt":7,"src":670},"https://pic4.zhimg.com/80/v2-c8b8f6c4608b797a40ba4a3f7bc5fd23_720w.jpg",[],{"type":18,"tag":26,"props":673,"children":674},{},[675],{"type":24,"value":676},"通过自动数据增强操作，在ImageNet数据集上可以提升1%左右的训练精度。",{"type":18,"tag":179,"props":678,"children":679},{},[680],{"type":18,"tag":183,"props":681,"children":682},{},[683],{"type":18,"tag":187,"props":684,"children":685},{},[686],{"type":24,"value":687},"支持动态shape",{"type":18,"tag":26,"props":689,"children":690},{},[691],{"type":24,"value":692},"MindSpore通过 per_batch_map支持用户自定义控制输出不同Shape的训练数据，满足网络需要基于不同场景调整数据Shape的诉求。",{"type":18,"tag":26,"props":694,"children":695},{},[696],{"type":18,"tag":95,"props":697,"children":699},{"alt":7,"src":698},"https://pic4.zhimg.com/80/v2-59287731a18844ea4120108a4ba1c577_720w.jpg",[],{"type":18,"tag":437,"props":701,"children":702},{},[703,708],{"type":18,"tag":183,"props":704,"children":705},{},[706],{"type":24,"value":707},"用户自定义数据Shape生成逻辑udf，如：第n个step生成shape为(x,y,z,...)的数据；",{"type":18,"tag":183,"props":709,"children":710},{},[711],{"type":24,"value":712},"通过batch(…, 
per_batch_map=udf)将第一步定义的逻辑生效，最终得到不同Shape训练数据。",{"type":18,"tag":179,"props":714,"children":715},{},[716],{"type":18,"tag":183,"props":717,"children":718},{},[719],{"type":18,"tag":187,"props":720,"children":721},{},[722],{"type":24,"value":723},"Callback机制让数据处理更加灵活",{"type":18,"tag":26,"props":725,"children":726},{},[727],{"type":24,"value":728},"通过callback机制实现根据训练结果动态调整数据增强的逻辑，为数据增强过程提供更灵活的操作。",{"type":18,"tag":26,"props":730,"children":731},{},[732],{"type":18,"tag":95,"props":733,"children":735},{"alt":7,"src":734},"https://pic1.zhimg.com/80/v2-e7e04f614f033c9fc639b7f5afc535cc_720w.jpg",[],{"type":18,"tag":26,"props":737,"children":738},{},[739],{"type":24,"value":740},"MindSpore支持用户基于数据处理提供的DSCallback（包含epoch开始、step开始、step结束、epoch结束等）实现自己的数据增强逻辑UDF，并将其添加至map操作中，以实现更灵活的数据增强操作。",{"type":18,"tag":80,"props":742,"children":744},{"id":743},"_34-端云统一架构",[745],{"type":24,"value":746},"3.4 端云统一架构",{"type":18,"tag":179,"props":748,"children":749},{},[750],{"type":18,"tag":183,"props":751,"children":752},{},[753],{"type":18,"tag":187,"props":754,"children":755},{},[756],{"type":24,"value":757},"数据图与计算图的统一",{"type":18,"tag":26,"props":759,"children":760},{},[761],{"type":24,"value":762},"MindIR是MindSpore基于图表示的函数式IR，其最核心的目的是服务于自动微分变换。自动微分采用的是基于函数式编程框架的变换方法，因此IR采用了接近于ANF函数式的语义。",{"type":18,"tag":26,"props":764,"children":765},{},[766],{"type":24,"value":767},"推理数据图典型的场景为：数据集样本大小缩放、中间截图、归一化、通道变换。",{"type":18,"tag":26,"props":769,"children":770},{},[771],{"type":18,"tag":95,"props":772,"children":774},{"alt":7,"src":773},"https://pic3.zhimg.com/80/v2-08190f351dd9a5aa81c31813d3ee09b6_720w.jpg",[],{"type":18,"tag":26,"props":776,"children":777},{},[778],{"type":24,"value":779},"我们将推理数据图以子图的方式保存至生成的模型文件（MindIR）中，那么在推理时，可以通过统一的接口加载模型中数据处理流程，自动进行数据处理得到模型需要的输入数据，简化用户的操作，提升易用性。",{"type":18,"tag":26,"props":781,"children":782},{},[783],{"type":18,"tag":95,"props":784,"children":786},{"alt":7,"src":785},"https://pic1.zhimg.com/80/v2-2223182cfbc898e592ceeaa43c9a4e3c_720w.jpg",[],{"type
":18,"tag":179,"props":788,"children":789},{},[790],{"type":18,"tag":183,"props":791,"children":792},{},[793],{"type":18,"tag":187,"props":794,"children":795},{},[796],{"type":24,"value":797},"轻量化的数据处理",{"type":18,"tag":26,"props":799,"children":800},{},[801],{"type":24,"value":802},"云侧训练时，可以被使用的资源往往比较充裕，数据处理Pipeline在运行过程中是会占用比较多系统资源（CPU和内存），以ImageNet为例，训练过程中CPU占用20%，内存占用30-50G，这显然在端侧是不可被接收的，并且数据处理Pipeline在初始化时启动会慢些，这也不适用于端侧需要快速启动、多次训练/推理的前提条件。故：MindSpore基于现有数据处理算子，提供一套更轻量化、更适用于端侧的API，解决云化场景数据处理Pipeline不适用于端侧的问题。",{"type":18,"tag":26,"props":804,"children":805},{},[806],{"type":18,"tag":95,"props":807,"children":809},{"alt":7,"src":808},"https://pic2.zhimg.com/80/v2-6c1aab48a2246bd45a2a88daa8a20bf9_720w.jpg",[],{"type":18,"tag":26,"props":811,"children":812},{},[813],{"type":24,"value":814},"MindSpore基于Pipeline调整架构，支持数据处理单算子独立使用（Eager Mode），支持各种场景推理，提供给AI开发人员更大的灵活性；同时，将Pipeline轻量化，实现基于Pull Base的轻量化pipeline，减少资源占用并且处理速度快。",{"type":18,"tag":26,"props":816,"children":817},{},[818],{"type":24,"value":819},"通过上述两种方法，MindSpore保证了统一的数据处理架构支撑多种不同的应用场景。",{"type":18,"tag":80,"props":821,"children":823},{"id":822},"未来计划",[824],{"type":24,"value":822},{"type":18,"tag":26,"props":826,"children":827},{},[828],{"type":24,"value":829},"在未来，MindSpore将继续完善数据处理的易用性，提供更丰富的算子和内置数据集。与此同时，探索大规模数据处理的加速技术，包括：",{"type":18,"tag":179,"props":831,"children":832},{},[833],{"type":18,"tag":183,"props":834,"children":835},{},[836],{"type":18,"tag":187,"props":837,"children":838},{},[839],{"type":24,"value":840},"资源自适应分配",{"type":18,"tag":26,"props":842,"children":843},{},[844],{"type":24,"value":845},"当前各数据增强算子使用的处理线程数目由用户手工配置，对用户的调优经验要求极高。通过自适应判断Pipeline瓶颈，由框架给各个数据增强算子合理分配CPU资源，可以在训练过程中动态优化数据处理性能，免去用户繁琐的调优过程。",{"type":18,"tag":179,"props":847,"children":848},{},[849],{"type":18,"tag":183,"props":850,"children":851},{},[852],{"type":18,"tag":187,"props":853,"children":854},{},[855],{"type":24,"value":856},"异构硬件加速",{"type":18,"tag":26,"props":858,"children":859},{},[860],{"type":24,"value":861},"当前
的数据处理Pipeline操作在CPU执行，一旦出现瓶颈，带来AI芯片/GPU等待空闲，用户无法充分利用所有硬件的计算能力。MindSpore期望构建用户无感知的异构硬件资源调度能力：通过监测硬件资源使用，完善Ascend/GPU上的数据处理算子，采用代价模型自适应地将数据处理任务调度至合适的资源，实现异构硬件的充分利用。",{"type":18,"tag":179,"props":863,"children":864},{},[865],{"type":18,"tag":183,"props":866,"children":867},{},[868],{"type":18,"tag":187,"props":869,"children":870},{},[871],{"type":24,"value":872},"分布式缓存",{"type":18,"tag":26,"props":874,"children":875},{},[876],{"type":24,"value":877},"当前如GPT-3等网络的训练需要使用超大规模数据集，这些数据难以在本地存储，直接从OBS读取会受网络IO限制而影响性能。与此同时，在AutoML场景下，用户经常在集群中运行同一类模型的多个作业（只是超参设置不同），每个作业独立进行数据加载和处理效率极低。MindSpore期望构建分布式缓存能力，加速这些场景下的数据处理。",{"title":7,"searchDepth":879,"depth":879,"links":880},4,[881,883,884,885,886,887,888,889,890,891,892],{"id":82,"depth":882,"text":85},2,{"id":148,"depth":882,"text":148},{"id":153,"depth":882,"text":156},{"id":169,"depth":882,"text":172},{"id":237,"depth":882,"text":240},{"id":340,"depth":882,"text":343},{"id":346,"depth":882,"text":349},{"id":409,"depth":882,"text":412},{"id":531,"depth":882,"text":534},{"id":743,"depth":882,"text":746},{"id":822,"depth":882,"text":822},"markdown","content:technology-blogs:zh:399.md","content","technology-blogs/zh/399.md","technology-blogs/zh/399","md",1776506136843]