[{"data":1,"prerenderedAt":464},["ShallowReactive",2],{"content-query-PWqnoNNcGh":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":458,"_id":459,"_source":460,"_file":461,"_stem":462,"_extension":463},"/technology-blogs/zh/2196","zh",false,"","AI设计模式 | 如何通过昇思MindSpore实践特征哈希模式","特征哈希模式在使用时有它适用的场景，它的主要问题是损失了模型精度。","2023-03-13","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/03/17/2c99603b1bf049dbae6275f802cb9a10.png","technology-blogs","实践",{"type":15,"children":16,"toc":452},"root",[17,25,34,45,50,55,62,77,82,96,101,111,116,124,129,134,144,149,157,162,171,176,190,195,200,205,210,215,220,228,235,240,245,250,255,269,274,282,287,295,309,314,319,324,329,337,350,357,365,375,391,407,422,437],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"ai设计模式-如何通过昇思mindspore实践特征哈希模式",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":18,"tag":30,"props":31,"children":33},"img",{"alt":7,"src":32},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/03/17/f7292866ffe645e0a765233509770611.gif",[],{"type":18,"tag":26,"props":35,"children":36},{},[37,39],{"type":24,"value":38},"作者：王磊 ｜",{"type":18,"tag":40,"props":41,"children":42},"strong",{},[43],{"type":24,"value":44},"来源：知乎",{"type":18,"tag":26,"props":46,"children":47},{},[48],{"type":24,"value":49},"在AI软件的开发过程中，设计模式的选择会影响到效率以及结果的准确程度，AI软件开发需要重点考虑数据处理和具体算法问题，传统软件的设计模式侧重于解决业务代码封装抽象问题，无法对AI软件开发关心的重点问题产生实质性的帮助，所以需要拓展新的设计模式来解决AI领域常见的开发问题。",{"type":18,"tag":26,"props":51,"children":52},{},[53],{"type":24,"value":54},"当前业界已有针对AI设计模式的总结，整体上可分为设计和运行类，从端到端的流程上可分为数据表示、数据处理、问题表示、网络设计、数据处理、模型训练和弹性部署。每个分类下针对不同的问题提供了具体的模式。在AI框架，如昇思MindSpore，从框架层面已经支持了部分模式。本篇文章将介绍AI设计模式中的一种数据表示模式 - 特征哈希（Feature 
Hashed）模式，并探讨如何使用昇思MindSpore实践该模式。",{"type":18,"tag":26,"props":56,"children":57},{},[58],{"type":18,"tag":30,"props":59,"children":61},{"alt":7,"src":60},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/03/17/f5362128c48846d9a933d651431d04f1.png",[],{"type":18,"tag":26,"props":63,"children":64},{},[65,70,72],{"type":18,"tag":40,"props":66,"children":67},{},[68],{"type":24,"value":69},"01",{"type":24,"value":71}," ",{"type":18,"tag":40,"props":73,"children":74},{},[75],{"type":24,"value":76},"模式定义",{"type":18,"tag":26,"props":78,"children":79},{},[80],{"type":24,"value":81},"特征哈希是AI设计模式中的一种数据表示模式，能够有效解决分类数据不完整、高基数（特征类别不均）、以及冷启动问题（推理时无法处理新出现的类别）。结合昇思MindSpore提供的数据处理接口，开发者可以很容易的应用该实践。",{"type":18,"tag":26,"props":83,"children":84},{},[85,90,91],{"type":18,"tag":40,"props":86,"children":87},{},[88],{"type":24,"value":89},"02",{"type":24,"value":71},{"type":18,"tag":40,"props":92,"children":93},{},[94],{"type":24,"value":95},"问题",{"type":18,"tag":26,"props":97,"children":98},{},[99],{"type":24,"value":100},"机器学习在数据处理时，通常使用独热编码（one-hot encoding）的方式将分类数据转换为数值数据。独热编码是用N个状态对N个分类数据编码，这样在任意时刻，只有一位是有效的。比如，假设我们有6个邮政编码[1,2,3,4,5,6]，然后通过独热编码对这些分类数据进行编码：",{"type":18,"tag":102,"props":103,"children":105},"pre",{"code":104},"import numpy as np\nimport mindspore.dataset.transforms.c_transforms as c_transforms\nimport mindspore.dataset as ds\n\ncode = [1,2,3,4,5,6]\ndata = np.array(code)   # 将结果列表转为Numpy的数组\ndataset = ds.NumpySlicesDataset(data, column_names=[\"clz\"], shuffle=False)  # 基于MindSpore的Dataset接口把Numpy数组转为Dataset对象\nonehot_op = c_transforms.OneHot(num_classes=7)                                  # 定义操作，这里num_class要大于code中最大数的值\ndataset = dataset.map(operations=onehot_op, input_columns=[\"clz\"])          # 应用独热编码\n\nfor item in dataset:\n    
print(item)\n",[106],{"type":18,"tag":107,"props":108,"children":109},"code",{"__ignoreMap":7},[110],{"type":24,"value":104},{"type":18,"tag":26,"props":112,"children":113},{},[114],{"type":24,"value":115},"执行后可以看到编码的结果：",{"type":18,"tag":102,"props":117,"children":119},{"code":118},"[Tensor(shape=[7], dtype=Int32, value= [0, 1, 0, 0, 0, 0, 0])]\n[Tensor(shape=[7], dtype=Int32, value= [0, 0, 1, 0, 0, 0, 0])]\n[Tensor(shape=[7], dtype=Int32, value= [0, 0, 0, 1, 0, 0, 0])]\n[Tensor(shape=[7], dtype=Int32, value= [0, 0, 0, 0, 1, 0, 0])]\n[Tensor(shape=[7], dtype=Int32, value= [0, 0, 0, 0, 0, 1, 0])]\n[Tensor(shape=[7], dtype=Int32, value= [0, 0, 0, 0, 0, 0, 1])]\n",[120],{"type":18,"tag":107,"props":121,"children":122},{"__ignoreMap":7},[123],{"type":24,"value":118},{"type":18,"tag":26,"props":125,"children":126},{},[127],{"type":24,"value":128},"这样可以确保分类数据的输入的唯一性。",{"type":18,"tag":26,"props":130,"children":131},{},[132],{"type":24,"value":133},"处理分类输入需要提前知道所有的类别，语言、日期等相对确定的数据很容易处理，而对于比较难预测的数据会存在一些问题：",{"type":18,"tag":135,"props":136,"children":138},"h3",{"id":137},"_1-数据不完整",[139],{"type":18,"tag":40,"props":140,"children":141},{},[142],{"type":24,"value":143},"1. 数据不完整",{"type":18,"tag":26,"props":145,"children":146},{},[147],{"type":24,"value":148},"训练数据中没有包含所有的特征类别。如果训练数据不完整，可能无法提前获得所有可能的单词，导致编码以后的数据也不完整。比如，针对医疗方面的一些模型，训练数据的词汇表中无法包含所有的医院和医生信息。",{"type":18,"tag":26,"props":150,"children":151},{},[152],{"type":18,"tag":40,"props":153,"children":154},{},[155],{"type":24,"value":156},"2. 高基数（某个分类特征的类别特别多）",{"type":18,"tag":26,"props":158,"children":159},{},[160],{"type":24,"value":161},"单个分类特征的不同值很多，可能需要长度数百万的特征向量， 如IP地址、家庭住址等，导致模型也需要很大空间，无法在小设备上部署。",{"type":18,"tag":135,"props":163,"children":165},{"id":164},"_3-冷启动问题推理时无法处理新出现的类别",[166],{"type":18,"tag":40,"props":167,"children":168},{},[169],{"type":24,"value":170},"3. 
冷启动问题（推理时无法处理新出现的类别）",{"type":18,"tag":26,"props":172,"children":173},{},[174],{"type":24,"value":175},"对于新的分类数据，生产环境中的模型无法正确的预测，会出现错误，需要专门的服务来处理这种冷启动的问题。",{"type":18,"tag":26,"props":177,"children":178},{},[179,184,185],{"type":18,"tag":40,"props":180,"children":181},{},[182],{"type":24,"value":183},"03",{"type":24,"value":71},{"type":18,"tag":40,"props":186,"children":187},{},[188],{"type":24,"value":189},"解决方案",{"type":18,"tag":26,"props":191,"children":192},{},[193],{"type":24,"value":194},"以参考图书中预测航班的准点率模型场景为例，美国约有350个机场，机场间的差别会比较大，有些机场航班很多，有些机场航班很少，同时，每年会有新的机场出现。这个场景同时存在了独热编码时的数据不完整、高基数和冷启动问题。",{"type":18,"tag":26,"props":196,"children":197},{},[198],{"type":24,"value":199},"通过特征哈希模式来解决分类数据在独热编码存在的问题。具体的操作如下：",{"type":18,"tag":26,"props":201,"children":202},{},[203],{"type":24,"value":204},"1、将机场的分类数据，把输入转化为唯一的字符串，如把机场名称数据改为缩写并保证数据不重复；",{"type":18,"tag":26,"props":206,"children":207},{},[208],{"type":24,"value":209},"2、对字符串使用稳定可移植（训练和推理场景都可用）的哈希算法进行哈希；",{"type":18,"tag":26,"props":211,"children":212},{},[213],{"type":24,"value":214},"3、对哈希结果取余数。",{"type":18,"tag":26,"props":216,"children":217},{},[218],{"type":24,"value":219},"通过farmhash算法，对于这些机场进行哈希，然后分别放入10，1000个桶中，结果如下：",{"type":18,"tag":102,"props":221,"children":223},{"code":222},">>> airports = [\"DTW\", \"LBB\", \"SNA\", \"MSO\", \"ANC\"]\n>>> list(map(lambda x: farmhash.hash64withseed(x, 10) % 10, airports))\n[9, 9, 4, 0, 1]\n>>> list(map(lambda x: farmhash.hash64withseed(x, 1000) % 1000, airports))\n[416, 532, 193, 538, 
971]\n",[224],{"type":18,"tag":107,"props":225,"children":226},{"__ignoreMap":7},[227],{"type":24,"value":222},{"type":18,"tag":26,"props":229,"children":230},{},[231],{"type":18,"tag":30,"props":232,"children":234},{"alt":7,"src":233},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/03/17/cdde7dfcbb954f2eb29909e3a7551dad.png",[],{"type":18,"tag":26,"props":236,"children":237},{},[238],{"type":24,"value":239},"特征哈希如何解决分类数据的问题:",{"type":18,"tag":26,"props":241,"children":242},{},[243],{"type":24,"value":244},"1、针对数据不完整问题：即使有些机场数据不在训练数据集中，但它通过特征值哈希后在桶的大小范围内，不用担心数据不完整的情况。",{"type":18,"tag":26,"props":246,"children":247},{},[248],{"type":24,"value":249},"2、针对高基数问题：通过哈希的方式可以将数据的规模降低，减少了系统内存占用和模型大小，即便有百万的数据规模，哈希后也只会落入到有限的桶中。",{"type":18,"tag":26,"props":251,"children":252},{},[253],{"type":24,"value":254},"3、针对冷启动问题：如果新的分类数据添加到系统中，它在哈希后落入和其它机场相同的桶，所以不用担心在生产环境中预测时会出错的情况。之后通过训练更新的模型获得更好的预测。比如，对于350个机场，哈希桶设置为70，大约每个桶有5个机场，每个桶都有数据，生产环境预测就不会落空，只是预测的数据可能不会特别精确，需要后续训练来优化模型。",{"type":18,"tag":26,"props":256,"children":257},{},[258,263,264],{"type":18,"tag":40,"props":259,"children":260},{},[261],{"type":24,"value":262},"04",{"type":24,"value":71},{"type":18,"tag":40,"props":265,"children":266},{},[267],{"type":24,"value":268},"案例",{"type":18,"tag":26,"props":270,"children":271},{},[272],{"type":24,"value":273},"这里沿用了上面提到的预测机场航班准点率的例子，首先对机场数据应用模式，而后通过昇思MindSpore的独热编码接口，完成数据的编码准备。其中依赖哈希算法库需要通过`pip install pyfarmhash`安装。",{"type":18,"tag":102,"props":275,"children":277},{"code":276},"import farmhash\nimport numpy as np\nimport mindspore.dataset.transforms.c_transforms as c_transforms\nimport mindspore.dataset as ds\n\nairports = [\"DTW\", \"LBB\", \"SNA\", \"MSO\", \"ANC\", \"ABC\", \"CDE\", \"FGH\"]    #  将机场名称缩写\nhashed_data = list(map(lambda x: farmhash.hash64withseed(x, 1000) % 4, airports))  #  对字符串应用特征哈希模式\n\ndata = np.array(hashed_data)   # 将结果列表转为Numpy的数组\ndataset = ds.NumpySlicesDataset(data, column_names=[\"airport_name\"], shuffle=False)  # 
基于MindSpore的Dataset接口把Numpy数组转为Dataset对象\nonehot_op = c_transforms.OneHot(num_classes=4)                                       # 定义独热编码操作，这里num_class的数量和桶的数量保持一致\ndataset = dataset.map(operations=onehot_op, input_columns=[\"airport_name\"])          # 对机场信息数据应用编码\n\nfor item in dataset:\n    print(item)\n",[278],{"type":18,"tag":107,"props":279,"children":280},{"__ignoreMap":7},[281],{"type":24,"value":276},{"type":18,"tag":26,"props":283,"children":284},{},[285],{"type":24,"value":286},"编码的输出结果如下：",{"type":18,"tag":102,"props":288,"children":290},{"code":289},"[Tensor(shape=[4], dtype=Int32, value= [1, 0, 0, 0])]\n[Tensor(shape=[4], dtype=Int32, value= [1, 0, 0, 0])]\n[Tensor(shape=[4], dtype=Int32, value= [0, 1, 0, 0])]\n[Tensor(shape=[4], dtype=Int32, value= [0, 0, 1, 0])]\n[Tensor(shape=[4], dtype=Int32, value= [0, 0, 0, 1])]\n[Tensor(shape=[4], dtype=Int32, value= [0, 0, 0, 1])]\n[Tensor(shape=[4], dtype=Int32, value= [0, 0, 1, 0])]\n[Tensor(shape=[4], dtype=Int32, value= [1, 0, 0, 
0])]\n",[291],{"type":18,"tag":107,"props":292,"children":293},{"__ignoreMap":7},[294],{"type":24,"value":289},{"type":18,"tag":26,"props":296,"children":297},{},[298,303,304],{"type":18,"tag":40,"props":299,"children":300},{},[301],{"type":24,"value":302},"05",{"type":24,"value":71},{"type":18,"tag":40,"props":305,"children":306},{},[307],{"type":24,"value":308},"总结",{"type":18,"tag":26,"props":310,"children":311},{},[312],{"type":24,"value":313},"特征哈希模式在使用时有它适用的场景，它的主要问题是损失了模型精度。特征哈希模式不适合分类数据明确，词汇表大小相对较小（1000量级），并且不存在冷启动的场景。取模是有损操作，特征哈希模式将不同的分类放到了同一个桶中，损失了数据的准确性。在分类的数据特别不平衡时，会导致推理的误差比较大。比如榆林机场的流量比较小，西安机场的流量比它大两个量级，如果它们被放到同一个桶中，当成一种编码处理，模型的结果将更偏向于西安的场景，导致对于起飞等待时间等预测出现偏差。",{"type":18,"tag":26,"props":315,"children":316},{},[317],{"type":24,"value":318},"有两种方式可以缓解模式造成的模型精度损失，可以在实践时考虑应用：",{"type":18,"tag":26,"props":320,"children":321},{},[322],{"type":24,"value":323},"1、添加聚合特征：如果分类变量的分布偏斜，或者桶的数量少导致冲突多，可以通过添加聚合特征作为模型的输入来缓解。比如，对于每个机场，都可以在训练数据集中找到准时航班的概率，并将其作为一个特征添加到模型中，避免在散列机场代码时丢失与个别机场相关的信息。在某些情况下，可以完全避免将机场名称作为一个特征，因为有航班准点的相对频率数据可能就够了。",{"type":18,"tag":26,"props":325,"children":326},{},[327],{"type":24,"value":328},"2、把桶的数量作为超参来调整，以达到精度的平衡。",{"type":18,"tag":26,"props":330,"children":331},{},[332],{"type":18,"tag":40,"props":333,"children":334},{},[335],{"type":24,"value":336},"下载链接",{"type":18,"tag":26,"props":338,"children":339},{},[340,342],{"type":24,"value":341},"[1]",{"type":18,"tag":343,"props":344,"children":348},"a",{"href":345,"rel":346},"https://www.oreilly.com/library/view/machine-learning-design/9781098115777/",[347],"nofollow",[349],{"type":24,"value":345},{"type":18,"tag":26,"props":351,"children":352},{},[353],{"type":18,"tag":30,"props":354,"children":356},{"alt":7,"src":355},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/03/17/0d2f909ca76c4a2ca2f1604def1b7895.png",[],{"type":18,"tag":26,"props":358,"children":359},{},[360],{"type":18,"tag":40,"props":361,"children":362},{},[363],{"type":24,"value":364},"MindSpore官方资料",{"type"
:18,"tag":26,"props":366,"children":367},{},[368,373],{"type":18,"tag":40,"props":369,"children":370},{},[371],{"type":24,"value":372},"官方QQ群",{"type":24,"value":374}," : 871543426",{"type":18,"tag":26,"props":376,"children":377},{},[378,383,385],{"type":18,"tag":40,"props":379,"children":380},{},[381],{"type":24,"value":382},"官网",{"type":24,"value":384},"：",{"type":18,"tag":343,"props":386,"children":389},{"href":387,"rel":388},"https://www.mindspore.cn/",[347],[390],{"type":24,"value":387},{"type":18,"tag":26,"props":392,"children":393},{},[394,399,401],{"type":18,"tag":40,"props":395,"children":396},{},[397],{"type":24,"value":398},"Gitee",{"type":24,"value":400}," : ",{"type":18,"tag":343,"props":402,"children":405},{"href":403,"rel":404},"https://gitee.com/mindspore/mindspore",[347],[406],{"type":24,"value":403},{"type":18,"tag":26,"props":408,"children":409},{},[410,415,416],{"type":18,"tag":40,"props":411,"children":412},{},[413],{"type":24,"value":414},"GitHub",{"type":24,"value":400},{"type":18,"tag":343,"props":417,"children":420},{"href":418,"rel":419},"https://github.com/mindspore-ai/mindspore",[347],[421],{"type":24,"value":418},{"type":18,"tag":26,"props":423,"children":424},{},[425,430,431],{"type":18,"tag":40,"props":426,"children":427},{},[428],{"type":24,"value":429},"论坛",{"type":24,"value":384},{"type":18,"tag":343,"props":432,"children":435},{"href":433,"rel":434},"https://www.hiascend.com/forum/forum-0106101385921175002-1.html",[347],[436],{"type":24,"value":433},{"type":18,"tag":26,"props":438,"children":439},{},[440,445,446],{"type":18,"tag":40,"props":441,"children":442},{},[443],{"type":24,"value":444},"OpenI启智社区",{"type":24,"value":384},{"type":18,"tag":343,"props":447,"children":450},{"href":448,"rel":449},"https://openi.org.cn",[347],[451],{"type":24,"value":448},{"title":7,"searchDepth":453,"depth":453,"links":454},4,[455,457],{"id":137,"depth":456,"text":143},3,{"id":164,"depth":456,"text":170},"markdown","content:technology-blogs:zh:21
96.md","content","technology-blogs/zh/2196.md","technology-blogs/zh/2196","md",1776506120840]