[{"data":1,"prerenderedAt":637},["ShallowReactive",2],{"content-query-6gjFlZObLh":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":631,"_id":632,"_source":633,"_file":634,"_stem":635,"_extension":636},"/technology-blogs/zh/1700","zh",false,"","【MindSpore易点通】数据处理经验总结","MindRecord数据在MindSpore中读取性能更优，推荐用户将其他格式的数据集转换为MindRecord格式。","2022-08-15","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/09/02/f53e7955f0684ba091f8ec306db0b551.png","technology-blogs","实践",{"type":15,"children":16,"toc":606},"root",[17,25,36,51,56,70,75,80,85,90,95,100,105,110,115,120,125,130,135,145,150,159,173,178,192,197,202,207,212,217,222,231,245,250,264,274,279,284,289,294,299,304,309,314,322,327,337,347,356,365,370,379,384,389,394,399,407,412,417,426,431,436,441,450,458,467,481,486,500,505,510,515,520,525,535,545,555,560,568,582,587,601],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore易点通数据处理经验总结",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":29},"h2",{"id":28},"多进程转mindrecord",[30],{"type":18,"tag":31,"props":32,"children":33},"strong",{},[34],{"type":24,"value":35},"多进程转MindRecord",{"type":18,"tag":37,"props":38,"children":40},"h3",{"id":39},"背景信息",[41],{"type":18,"tag":31,"props":42,"children":43},{},[44],{"type":18,"tag":31,"props":45,"children":46},{},[47],{"type":18,"tag":31,"props":48,"children":49},{},[50],{"type":24,"value":39},{"type":18,"tag":52,"props":53,"children":54},"p",{},[55],{"type":24,"value":9},{"type":18,"tag":37,"props":57,"children":59},{"id":58},"经验总结",[60],{"type":18,"tag":31,"props":61,"children":62},{},[63],{"type":18,"tag":31,"props":64,"children":65},{},[66],{"type":18,"tag":31,"props":67,"children":68},{},[69],{"type":24,"value":58},{"type":18,"tag":52,"props":71,"children":72},{},[73],{"type":24,"value":74},"利用multiprocessing.Pool接口，实现多进程运行。将Cifar10图片格式数据集，转换成MindRecord格式，并进行性能对比。与
单进程方式相比，多进程中需进行如下配置（具体差异详见代码附件）：",{"type":18,"tag":52,"props":76,"children":77},{},[78],{"type":24,"value":79},"total_data_length = len(os.listdir(os.path.join(argv.data_define_path,",{"type":18,"tag":52,"props":81,"children":82},{},[83],{"type":24,"value":84},"argv.path)))",{"type":18,"tag":52,"props":86,"children":87},{},[88],{"type":24,"value":89},"part_num = math.ceil(total_data_length / argv.multi_num)",{"type":18,"tag":52,"props":91,"children":92},{},[93],{"type":24,"value":94},"left = total_data_length % part_numif left == 0:",{"type":18,"tag":52,"props":96,"children":97},{},[98],{"type":24,"value":99},"left = part_num",{"type":18,"tag":52,"props":101,"children":102},{},[103],{"type":24,"value":104},"total_index_with_length = []for i in range(argv.multi_num - 1):",{"type":18,"tag":52,"props":106,"children":107},{},[108],{"type":24,"value":109},"total_index_with_length.append([i * part_num, part_num])",{"type":18,"tag":52,"props":111,"children":112},{},[113],{"type":24,"value":114},"total_index_with_length.append([(argv.multi_num - 1) * part_num, left])",{"type":18,"tag":52,"props":116,"children":117},{},[118],{"type":24,"value":119},"pool_list = []for i, index_with_length in enumerate(total_index_with_length):",{"type":18,"tag":52,"props":121,"children":122},{},[123],{"type":24,"value":124},"pool_list.append((argv, index_with_length[0], index_with_length[1], i))with Pool(processes=argv.multi_num) as pool:",{"type":18,"tag":52,"props":126,"children":127},{},[128],{"type":24,"value":129},"pool.map(process_single_writer, pool_list)",{"type":18,"tag":52,"props":131,"children":132},{},[133],{"type":24,"value":134},"其中argv.multi_num为进程数；process_single_writer和单进程中的转换方式相同；pool_list为每个进程传入参数。",{"type":18,"tag":52,"props":136,"children":137},{},[138,143],{"type":18,"tag":31,"props":139,"children":140},{},[141],{"type":24,"value":142},"注",{"type":24,"value":144},"：当数据集较大时，推荐在训练环境中利用训练环境机器的本地SSD（开发环境中使用EFS会影响数据转换性能），并选用8卡规格进行转换（若选用1卡的规格，CPU/内存/SSD空间（一共3.3T） 都只会被分到1/8 
）。需要注意的是8卡任务会起8次脚本导致重复转换，可以通过RANK_ID来控制，使仅有一个脚本正常执行；",{"type":18,"tag":52,"props":146,"children":147},{},[148],{"type":24,"value":149},"性能对比：多进程转MindRecord: 8886.35imgs/sec ； 单进程转MindRecord：2133.79imgs/sec",{"type":18,"tag":26,"props":151,"children":153},{"id":152},"一个epoch最后一个batch的数据不足",[154],{"type":18,"tag":31,"props":155,"children":156},{},[157],{"type":24,"value":158},"一个Epoch最后一个Batch的数据不足",{"type":18,"tag":37,"props":160,"children":162},{"id":161},"背景信息-1",[163],{"type":18,"tag":31,"props":164,"children":165},{},[166],{"type":18,"tag":31,"props":167,"children":168},{},[169],{"type":18,"tag":31,"props":170,"children":171},{},[172],{"type":24,"value":39},{"type":18,"tag":52,"props":174,"children":175},{},[176],{"type":24,"value":177},"一个Epoch的迭代次数=图片总数/Batch Size，如果不能整除，就会出现最后一个Batch的数据不足。目前如果Batch Size数据不足会报错。",{"type":18,"tag":37,"props":179,"children":181},{"id":180},"经验总结-1",[182],{"type":18,"tag":31,"props":183,"children":184},{},[185],{"type":18,"tag":31,"props":186,"children":187},{},[188],{"type":18,"tag":31,"props":189,"children":190},{},[191],{"type":24,"value":58},{"type":18,"tag":52,"props":193,"children":194},{},[195],{"type":24,"value":196},"如果迭代一个Epoch的末尾训练报错结束，可以检查是否该问题导致的，可以通过如下配置规避drop_remainder=True",{"type":18,"tag":52,"props":198,"children":199},{},[200],{"type":24,"value":201},"代码示例如下：",{"type":18,"tag":52,"props":203,"children":204},{},[205],{"type":24,"value":206},"import mindspore.dataset as ds__# data is an instance of Dataset object# declare an apply_func function which returns a Dataset object__def apply_func(ds):",{"type":18,"tag":52,"props":208,"children":209},{},[210],{"type":24,"value":211},"ds = ds.batch(2, drop_remainder=True)",{"type":18,"tag":52,"props":213,"children":214},{},[215],{"type":24,"value":216},"return ds__# use apply to call apply_func__",{"type":18,"tag":52,"props":218,"children":219},{},[220],{"type":24,"value":221},"data = 
data.apply(apply_func)",{"type":18,"tag":26,"props":223,"children":225},{"id":224},"tfrecord数据读取",[226],{"type":18,"tag":31,"props":227,"children":228},{},[229],{"type":24,"value":230},"TFRecord数据读取",{"type":18,"tag":37,"props":232,"children":234},{"id":233},"背景信息-2",[235],{"type":18,"tag":31,"props":236,"children":237},{},[238],{"type":18,"tag":31,"props":239,"children":240},{},[241],{"type":18,"tag":31,"props":242,"children":243},{},[244],{"type":24,"value":39},{"type":18,"tag":52,"props":246,"children":247},{},[248],{"type":24,"value":249},"TFRecord数据文件是一种将图像数据和标签统一存储的二进制文件，能更好的利用内存，在TensorFlow中快速的复制，移动，读取，存储等。",{"type":18,"tag":37,"props":251,"children":253},{"id":252},"经验总结-2",[254],{"type":18,"tag":31,"props":255,"children":256},{},[257],{"type":18,"tag":31,"props":258,"children":259},{},[260],{"type":18,"tag":31,"props":261,"children":262},{},[263],{"type":24,"value":58},{"type":18,"tag":265,"props":266,"children":267},"ol",{},[268],{"type":18,"tag":269,"props":270,"children":271},"li",{},[272],{"type":24,"value":273},"TFRecord的读取",{"type":18,"tag":52,"props":275,"children":276},{},[277],{"type":24,"value":278},"FP32图片数据使用tostring保存为TFRecord，若直接使用TFRecordDataset读取为UINT8格式，而MindSpore中暂时缺少tf.decode_raw这样类似功能的算子，需要自己手动进行数据转化，转化代码如下：",{"type":18,"tag":52,"props":280,"children":281},{},[282],{"type":24,"value":283},"def trans_dtype(data, target):",{"type":18,"tag":52,"props":285,"children":286},{},[287],{"type":24,"value":288},"trans_to_float32 = lambda x: np.frombuffer(np.ndarray.tobytes(x), dtype=np.float32)",{"type":18,"tag":52,"props":290,"children":291},{},[292],{"type":24,"value":293},"input_size = 128",{"type":18,"tag":52,"props":295,"children":296},{},[297],{"type":24,"value":298},"data = np.reshape(trans_to_float32(data), (input_size, input_size, 10))",{"type":18,"tag":52,"props":300,"children":301},{},[302],{"type":24,"value":303},"target = np.reshape(trans_to_float32(target), (input_size, input_size, 
10))",{"type":18,"tag":52,"props":305,"children":306},{},[307],{"type":24,"value":308},"return data, target",{"type":18,"tag":52,"props":310,"children":311},{},[312],{"type":24,"value":313},"tfdataset = tfdataset.map(input_columns=['data', 'target'], operations=trans_dtype)",{"type":18,"tag":265,"props":315,"children":316},{},[317],{"type":18,"tag":269,"props":318,"children":319},{},[320],{"type":24,"value":321},"自动生成Schema与预先定义Schema",{"type":18,"tag":52,"props":323,"children":324},{},[325],{"type":24,"value":326},"训练数据含有两批不同大小的数据（Key值不同），如A中包含{a,b,c}3种Key及其值，B中包含{a,b,c,d}4种Key及其值，目前训练任务中只需要两批数据中{a,b}2种Key及其值即可。 以读取TFRecord为例：",{"type":18,"tag":52,"props":328,"children":329},{},[330,332],{"type":24,"value":331},"DATA ",{"type":18,"tag":31,"props":333,"children":334},{},[335],{"type":24,"value":336},"#A+B",{"type":18,"tag":52,"props":338,"children":339},{},[340,342],{"type":24,"value":341},"tfdataset = de.TFRecordDataset(dataset_files=DATA) ",{"type":18,"tag":31,"props":343,"children":344},{},[345],{"type":24,"value":346},"# (1)",{"type":18,"tag":52,"props":348,"children":349},{},[350,351],{"type":24,"value":331},{"type":18,"tag":31,"props":352,"children":353},{},[354],{"type":24,"value":355},"#B+A",{"type":18,"tag":52,"props":357,"children":358},{},[359,360],{"type":24,"value":341},{"type":18,"tag":31,"props":361,"children":362},{},[363],{"type":24,"value":364},"# (2)",{"type":18,"tag":52,"props":366,"children":367},{},[368],{"type":24,"value":369},"以上两种写法中，(2)写法会报错。其关键在于DATA的构造，在读取数据过程中，如果不进行Schema的定义，MindSpore会在读取第一个数据的时候根据读取到的数据本身自行定义Schema。所以如果先读取B类数据，自动生成的Schema会包含{a,b,c,d}4个Key，而之后读取A类数据的时候得不到key=d的值，此时便会报错。所以较为恰当的做法是事先在程序中定义好Schema，如：",{"type":18,"tag":52,"props":371,"children":372},{},[373,374],{"type":24,"value":331},{"type":18,"tag":31,"props":375,"children":376},{},[377],{"type":24,"value":378},"# B+A或A+B",{"type":18,"tag":52,"props":380,"children":381},{},[382],{"type":24,"value":383},"schema = 
de.Schema()",{"type":18,"tag":52,"props":385,"children":386},{},[387],{"type":24,"value":388},"schema.add_column('a')",{"type":18,"tag":52,"props":390,"children":391},{},[392],{"type":24,"value":393},"schema.add_column('b')",{"type":18,"tag":52,"props":395,"children":396},{},[397],{"type":24,"value":398},"tfdataset = de.TFRecordDataset(dataset_files=DATA, schema=schema)",{"type":18,"tag":265,"props":400,"children":401},{},[402],{"type":18,"tag":269,"props":403,"children":404},{},[405],{"type":24,"value":406},"NHWC到NCHW的转化",{"type":18,"tag":52,"props":408,"children":409},{},[410],{"type":24,"value":411},"数据读入时是NHWC的形式，需转化为NCHW的形式送入网络",{"type":18,"tag":52,"props":413,"children":414},{},[415],{"type":24,"value":416},"data = data.transpose(2,0,1)",{"type":18,"tag":52,"props":418,"children":419},{},[420,422],{"type":24,"value":421},"target = target.transpose(2,0,1) ",{"type":18,"tag":31,"props":423,"children":424},{},[425],{"type":24,"value":346},{"type":18,"tag":52,"props":427,"children":428},{},[429],{"type":24,"value":430},"import mindspore.transforms.py_transforms as T",{"type":18,"tag":52,"props":432,"children":433},{},[434],{"type":24,"value":435},"transforms = T.ComposeOp([T.HWC2CHW()])",{"type":18,"tag":52,"props":437,"children":438},{},[439],{"type":24,"value":440},"tfdataset = tfdataset.map(input_columns='target', operations=())",{"type":18,"tag":52,"props":442,"children":443},{},[444,446],{"type":24,"value":445},"tfdataset = tfdataset.map(input_columns='data', operations=transforms()) 
",{"type":18,"tag":31,"props":447,"children":448},{},[449],{"type":24,"value":364},{"type":18,"tag":265,"props":451,"children":452},{},[453],{"type":18,"tag":269,"props":454,"children":455},{},[456],{"type":24,"value":457},"得到的数据Shape是正确的，但是数据会被打乱，图片写出不正常；(2)使用T.HWC2CHW()则可以得到Shape正确数据顺序正确的图片，图片写出正常。",{"type":18,"tag":26,"props":459,"children":461},{"id":460},"minddata数据处理流程推荐顺序",[462],{"type":18,"tag":31,"props":463,"children":464},{},[465],{"type":24,"value":466},"MindData数据处理流程推荐顺序",{"type":18,"tag":37,"props":468,"children":470},{"id":469},"背景信息-3",[471],{"type":18,"tag":31,"props":472,"children":473},{},[474],{"type":18,"tag":31,"props":475,"children":476},{},[477],{"type":18,"tag":31,"props":478,"children":479},{},[480],{"type":24,"value":39},{"type":18,"tag":52,"props":482,"children":483},{},[484],{"type":24,"value":485},"MindData包含的数据处理操作包括Repeat、Batch、Shuffle、Map和Zip，一般训练过程中都会用到Repeat、Batch、Shuffle和Map的操作。",{"type":18,"tag":37,"props":487,"children":489},{"id":488},"经验总结-3",[490],{"type":18,"tag":31,"props":491,"children":492},{},[493],{"type":18,"tag":31,"props":494,"children":495},{},[496],{"type":18,"tag":31,"props":497,"children":498},{},[499],{"type":24,"value":58},{"type":18,"tag":52,"props":501,"children":502},{},[503],{"type":24,"value":504},"在实际使用过程中，需要组合使用这几个操作时，为达到最优性能，推荐按照如下顺序：数据集加载并shuffle -> Map -> Batch -> Repeat。 原因是shuffle操作需要填充足够数据之后才会往下走，如果是先Load和Map的话，Shuffle很容易受到Map的阻塞导致Pipeline无法并行处理。",{"type":18,"tag":52,"props":506,"children":507},{},[508],{"type":24,"value":509},"示例代码如下：",{"type":18,"tag":52,"props":511,"children":512},{},[513],{"type":24,"value":514},"import mindspore.dataset as dsimport mindspore.dataset.transforms.vision.py_transforms as transforms",{"type":18,"tag":52,"props":516,"children":517},{},[518],{"type":24,"value":519},"DATA_DIR = \"custom_dataset_dir/\"",{"type":18,"tag":52,"props":521,"children":522},{},[523],{"type":24,"value":524},"imagefolder_dataset = ds.ImageFolderDatasetV2(dataset_dir, 
shuffle=True)",{"type":18,"tag":52,"props":526,"children":527},{},[528,530],{"type":24,"value":529},"resize_op = transforms.Resize(size=(500,500))",{"type":18,"tag":31,"props":531,"children":532},{},[533],{"type":24,"value":534},"# map()",{"type":18,"tag":52,"props":536,"children":537},{},[538,540],{"type":24,"value":539},"dataset.map(input_columns=\"image\", operations=resize_op)",{"type":18,"tag":31,"props":541,"children":542},{},[543],{"type":24,"value":544},"# batch()",{"type":18,"tag":52,"props":546,"children":547},{},[548,550],{"type":24,"value":549},"dataset = dataset.batch(32, drop_remainder=True)",{"type":18,"tag":31,"props":551,"children":552},{},[553],{"type":24,"value":554},"# repeat()",{"type":18,"tag":52,"props":556,"children":557},{},[558],{"type":24,"value":559},"dataset = dataset.repeat(10)",{"type":18,"tag":26,"props":561,"children":563},{"id":562},"数据分布式方式",[564],{"type":18,"tag":31,"props":565,"children":566},{},[567],{"type":24,"value":562},{"type":18,"tag":37,"props":569,"children":571},{"id":570},"背景信息-4",[572],{"type":18,"tag":31,"props":573,"children":574},{},[575],{"type":18,"tag":31,"props":576,"children":577},{},[578],{"type":18,"tag":31,"props":579,"children":580},{},[581],{"type":24,"value":39},{"type":18,"tag":52,"props":583,"children":584},{},[585],{"type":24,"value":586},"PyTorch的数据分布式方式是ddp（distributeddataparallel）或者dp（dataparallel），而MindSpore的数据分布式方式是ddp（distributeddataparallel）。",{"type":18,"tag":37,"props":588,"children":590},{"id":589},"经验总结内容",[591],{"type":18,"tag":31,"props":592,"children":593},{},[594],{"type":18,"tag":31,"props":595,"children":596},{},[597],{"type":18,"tag":31,"props":598,"children":599},{},[600],{"type":24,"value":589},{"type":18,"tag":52,"props":602,"children":603},{},[604],{"type":24,"value":605},"dp和ddp的区别就是数据集是集中在一台机器还是分发到每台机器。差别是用户对batchsize的设置，如果是dp，batchsize=total_batchsize。如果是ddp，batchsize=per_gpu_batch。因此MindSpore的batchsize=per_gpu_batch。",{"title":7,"searchDepth":607,"depth":607,"links":608},4,[6
09,615,619,623,627],{"id":28,"depth":610,"text":35,"children":611},2,[612,614],{"id":39,"depth":613,"text":39},3,{"id":58,"depth":613,"text":58},{"id":152,"depth":610,"text":158,"children":616},[617,618],{"id":161,"depth":613,"text":39},{"id":180,"depth":613,"text":58},{"id":224,"depth":610,"text":230,"children":620},[621,622],{"id":233,"depth":613,"text":39},{"id":252,"depth":613,"text":58},{"id":460,"depth":610,"text":466,"children":624},[625,626],{"id":469,"depth":613,"text":39},{"id":488,"depth":613,"text":58},{"id":562,"depth":610,"text":562,"children":628},[629,630],{"id":570,"depth":613,"text":39},{"id":589,"depth":613,"text":589},"markdown","content:technology-blogs:zh:1700.md","content","technology-blogs/zh/1700.md","technology-blogs/zh/1700","md",1776506115185]