[{"data":1,"prerenderedAt":923},["ShallowReactive",2],{"content-query-qrTNOpKSHo":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":917,"_id":918,"_source":919,"_file":920,"_stem":921,"_extension":922},"/technology-blogs/zh/1508","zh",false,"","解决图像分类任务的利器——Vision Transformer","ViT则是自然语言处理和计算机视觉两个领域的融合结晶——在不依赖卷积操作的情况下，依然可以在图像分类任务上达到很好的效果。","2022-05-23","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/11331f8de86146b3b941c325d4961110.png","technology-blogs","实践",{"type":15,"children":16,"toc":914},"root",[17,25,34,43,55,70,78,83,90,98,103,123,128,136,141,146,159,171,176,181,188,198,203,210,218,223,231,236,243,248,255,260,265,270,278,283,288,305,310,317,333,338,345,350,357,373,378,383,390,395,400,405,412,417,425,433,438,445,463,468,476,484,489,494,517,522,530,535,573,578,583,588,596,601,609,614,621,629,637,642,647,655,663,671,676,681,686,691,696,701,706,711,719,724,729,737,742,747,755,760,765,773,778,783,790,798,803,814,822,827,832,839,847,857,873,883,899],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"解决图像分类任务的利器vision-transformer",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":18,"tag":30,"props":31,"children":33},"img",{"alt":7,"src":32},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/afaf0c9683254d8eba27173ce4b8b7ec.gif",[],{"type":18,"tag":26,"props":35,"children":36},{},[37],{"type":18,"tag":38,"props":39,"children":40},"strong",{},[41],{"type":24,"value":42},"Vision Transformer (ViT) 
简介",{"type":18,"tag":26,"props":44,"children":45},{},[46,48,53],{"type":24,"value":47},"近些年，随着基于",{"type":18,"tag":38,"props":49,"children":50},{},[51],{"type":24,"value":52},"自注意（Self-Attention）结构",{"type":24,"value":54},"的模型的发展，特别是Transformer模型的提出，极大地促进了自然语言处理模型的发展。由于Transformers的计算效率和可扩展性，它已经能够训练具有超过100B参数的空前规模的模型。",{"type":18,"tag":26,"props":56,"children":57},{},[58,63,65],{"type":18,"tag":38,"props":59,"children":60},{},[61],{"type":24,"value":62},"ViT则是自然语言处理和计算机视觉两个领域的融合结晶",{"type":24,"value":64},"——",{"type":18,"tag":38,"props":66,"children":67},{},[68],{"type":24,"value":69},"在不依赖卷积操作的情况下，依然可以在图像分类任务上达到很好的效果。",{"type":18,"tag":26,"props":71,"children":72},{},[73],{"type":18,"tag":38,"props":74,"children":75},{},[76],{"type":24,"value":77},"模型结构",{"type":18,"tag":26,"props":79,"children":80},{},[81],{"type":24,"value":82},"ViT模型的主体结构是基于Transformer模型的Encoder部分（部分结构顺序有调整，如：Normalization的位置与标准Transformer不同），其结构图[1]如下：",{"type":18,"tag":26,"props":84,"children":85},{},[86],{"type":18,"tag":30,"props":87,"children":89},{"alt":7,"src":88},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/31ab2f254588405493c9b08801c5545c.png",[],{"type":18,"tag":26,"props":91,"children":92},{},[93],{"type":18,"tag":38,"props":94,"children":95},{},[96],{"type":24,"value":97},"模型特点",{"type":18,"tag":26,"props":99,"children":100},{},[101],{"type":24,"value":102},"ViT模型主要应用于图像分类领域。因此，其模型结构相较于传统的Transformer有以下几个特点：",{"type":18,"tag":104,"props":105,"children":106},"ul",{},[107,113,118],{"type":18,"tag":108,"props":109,"children":110},"li",{},[111],{"type":24,"value":112},"数据集的原图像被划分为多个patch后，将二维patch（不考虑channel）转换为一维向量，再加上类别向量与位置向量作为模型输入。",{"type":18,"tag":108,"props":114,"children":115},{},[116],{"type":24,"value":117},"模型主体的Block结构是基于Transformer的Encoder结构，但是调整了Normalization的位置，其中，最主要的结构依然是Multi-head 
Attention结构。",{"type":18,"tag":108,"props":119,"children":120},{},[121],{"type":24,"value":122},"模型在Blocks堆叠后接全连接层，接受类别向量的输出作为输入并用于分类。通常情况下，我们将最后的全连接层称为Head，Transformer Encoder部分为backbone。",{"type":18,"tag":26,"props":124,"children":125},{},[126],{"type":24,"value":127},"下面将通过代码实例来详细解释基于ViT实现ImageNet分类任务。",{"type":18,"tag":26,"props":129,"children":130},{},[131],{"type":18,"tag":38,"props":132,"children":133},{},[134],{"type":24,"value":135},"环境准备与数据读取",{"type":18,"tag":26,"props":137,"children":138},{},[139],{"type":24,"value":140},"开始实验之前，请确保本地已经安装了Python环境并安装了MindSpore Vision套件。",{"type":18,"tag":26,"props":142,"children":143},{},[144],{"type":24,"value":145},"首先导入相关模块，配置相关超参数并读取数据集，该部分代码在MindSpore Vision套件中都有API可直接调用，详情可以参考以下链接：",{"type":18,"tag":26,"props":147,"children":148},{},[149,157],{"type":18,"tag":150,"props":151,"children":155},"a",{"href":152,"rel":153},"https://www.mindspore.cn/vision/docs/zh-CN/master/index.html",[154],"nofollow",[156],{"type":24,"value":152},{"type":24,"value":158}," 。",{"type":18,"tag":26,"props":160,"children":161},{},[162,164],{"type":24,"value":163},"可通过",{"type":18,"tag":150,"props":165,"children":168},{"href":166,"rel":167},"http://image-net.org",[154],[169],{"type":24,"value":170},"http://image-net.org 下载完整的ImageNet数据集。",{"type":18,"tag":26,"props":172,"children":173},{},[174],{"type":24,"value":175},"本案例应用的数据集是从ImageNet中筛选出来的子集，运行第一段代码时会自动下载并解压。",{"type":18,"tag":26,"props":177,"children":178},{},[179],{"type":24,"value":180},"请确保你的数据集路径如以下结构。",{"type":18,"tag":26,"props":182,"children":183},{},[184],{"type":18,"tag":30,"props":185,"children":187},{"alt":7,"src":186},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/84c0a8a6a4414c8f99b076aa51a3a1b5.png",[],{"type":18,"tag":189,"props":190,"children":192},"pre",{"code":191},"from mindvision.dataset import DownLoad\n\ndataset_url = 
\"https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/notebook/datasets/vit_dataset.zip\"\n\ndl = DownLoad()\ndl.download_and_extract_archive(dataset_url, \"./\")\n",[193],{"type":18,"tag":194,"props":195,"children":196},"code",{"__ignoreMap":7},[197],{"type":24,"value":191},{"type":18,"tag":26,"props":199,"children":200},{},[201],{"type":24,"value":202},"510368768B [03:36, 2353191.94B/s]",{"type":18,"tag":26,"props":204,"children":205},{},[206],{"type":18,"tag":30,"props":207,"children":209},{"alt":7,"src":208},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/45ad152fa2444fabb74ebcc74a4cddda.png",[],{"type":18,"tag":26,"props":211,"children":212},{},[213],{"type":18,"tag":38,"props":214,"children":215},{},[216],{"type":24,"value":217},"模型解析",{"type":18,"tag":26,"props":219,"children":220},{},[221],{"type":24,"value":222},"下面将通过代码来细致剖析ViT模型的内部结构。",{"type":18,"tag":26,"props":224,"children":225},{},[226],{"type":18,"tag":38,"props":227,"children":228},{},[229],{"type":24,"value":230},"Transformer基本原理",{"type":18,"tag":26,"props":232,"children":233},{},[234],{"type":24,"value":235},"Transformer模型源于2017年的一篇文章[2]。在这篇文章中提出的基于Attention机制的编码器-解码器型结构在自然语言处理领域获得了巨大的成功。模型结构如下图所示：",{"type":18,"tag":26,"props":237,"children":238},{},[239],{"type":18,"tag":30,"props":240,"children":242},{"alt":7,"src":241},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/4884e3f53ffc438986a30b00f013ddd0.png",[],{"type":18,"tag":26,"props":244,"children":245},{},[246],{"type":24,"value":247},"其主要结构为多个Encoder和Decoder模块所组成，其中Encoder和Decoder的详细结构如下图[2]所示：",{"type":18,"tag":26,"props":249,"children":250},{},[251],{"type":18,"tag":30,"props":252,"children":254},{"alt":7,"src":253},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/cbefb075200c4fb3986816ca8863b515.png",[],{"type":18,"tag":26,"props":256,"children":257},{},[258],{"type":24,"value":259},"Encoder与Decoder由许多结构组成，如：多头注意力（Multi-Head Attention）层，Feed 
Forward层，Normalization层，甚至残差连接（Residual Connection，图中的“Add”）。",{"type":18,"tag":26,"props":261,"children":262},{},[263],{"type":24,"value":264},"不过，其中最重要的结构是多头注意力（Multi-Head Attention）结构，该结构基于自注意力（Self-Attention）机制，是多个Self-Attention的并行组成。",{"type":18,"tag":26,"props":266,"children":267},{},[268],{"type":24,"value":269},"所以，理解了Self-Attention就抓住了Transformer的核心。",{"type":18,"tag":26,"props":271,"children":272},{},[273],{"type":18,"tag":38,"props":274,"children":275},{},[276],{"type":24,"value":277},"Attention模块",{"type":18,"tag":26,"props":279,"children":280},{},[281],{"type":24,"value":282},"以下是Self-Attention的解释，其核心内容是为输入向量的每个单词学习一个权重。通过给定一个任务相关的查询向量Query向量，计算Query和各个Key的相似性或者相关性得到注意力分布，即得到每个Key对应Value的权重系数，然后对Value进行加权求和得到最终的Attention数值。",{"type":18,"tag":26,"props":284,"children":285},{},[286],{"type":24,"value":287},"在Self-Attention中:",{"type":18,"tag":104,"props":289,"children":290},{},[291],{"type":18,"tag":108,"props":292,"children":293},{},[294,296,300],{"type":24,"value":295},"最初的输入向量首先会经过Embedding层映射成Q（Query），K（Key），V（Value）三个向量，由于是并行操作，所以代码中是映射成为dim x 3的向量然后进行分割，换言之，如果你的输入向量为一个向量序列（$x_1$，$x_2$，$x_3$），其中的$x_1$，$x_2$，$x_3$都是一维向量，那么每一个一维向量都会经过Embedding层映射出Q，K，V三个向量，只是Embedding矩阵不同，矩阵参数也是通过学习得到的。",{"type":18,"tag":297,"props":298,"children":299},"br",{},[],{"type":18,"tag":38,"props":301,"children":302},{},[303],{"type":24,"value":304},"这里大家可以认为，Q，K，V三个矩阵是发现向量之间关联信息的一种手段，需要经过学习得到，至于为什么是Q，K，V三个，主要是因为需要两个向量点乘以获得权重，又需要另一个向量来承载权重相加的结果，所以，最少需要3个矩阵。",{"type":18,"tag":26,"props":306,"children":307},{},[308],{"type":24,"value":309},"$$ \\begin{cases} q_i = W_q \\cdot x_i & \\\\ k_i = W_k \\cdot x_i,\\hspace{1em} &i = 1,2,3 \\ldots \\\\ v_i = W_v \\cdot x_i & \\end{cases} \\tag{1} 
$$",{"type":18,"tag":26,"props":311,"children":312},{},[313],{"type":18,"tag":30,"props":314,"children":316},{"alt":7,"src":315},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/dc269c069dee4337b1937d0ce1c142eb.png",[],{"type":18,"tag":104,"props":318,"children":319},{},[320],{"type":18,"tag":108,"props":321,"children":322},{},[323,325,328],{"type":24,"value":324},"自注意力机制的自注意主要体现在它的Q，K，V都来源于其自身，也就是该过程是在提取输入的不同顺序的向量的联系与特征，最终通过不同顺序向量之间的联系紧密性（Q与K乘积经过Softmax的结果）来表现出来。",{"type":18,"tag":297,"props":326,"children":327},{},[],{"type":18,"tag":38,"props":329,"children":330},{},[331],{"type":24,"value":332},"Q，K，V得到后就需要获取向量间权重，需要对Q和K进行点乘并除以维度的平方根，对所有向量的结果进行Softmax处理，通过公式(2)的操作，我们获得了向量之间的关系权重。",{"type":18,"tag":26,"props":334,"children":335},{},[336],{"type":24,"value":337},"$$ \\begin{cases} a_{1,1} = q_1 \\cdot k_1 / \\sqrt d \\\\ a_{1,2} = q_1 \\cdot k_2 / \\sqrt d \\\\ a_{1,3} = q_1 \\cdot k_3 / \\sqrt d \\end{cases} \\tag{2} $$",{"type":18,"tag":26,"props":339,"children":340},{},[341],{"type":18,"tag":30,"props":342,"children":344},{"alt":7,"src":343},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/2e954d747fa34915bd2b8c68138a35ce.png",[],{"type":18,"tag":26,"props":346,"children":347},{},[348],{"type":24,"value":349},"$$ Softmax: \\hat a_{1,i} = exp(a_{1,i}) / \\sum_j exp(a_{1,j}),\\hspace{1em} j = 1,2,3 \\ldots \\tag{3}$$",{"type":18,"tag":26,"props":351,"children":352},{},[353],{"type":18,"tag":30,"props":354,"children":356},{"alt":7,"src":355},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/42202be6c70843afacb9eda986cb4951.png",[],{"type":18,"tag":104,"props":358,"children":359},{},[360],{"type":18,"tag":108,"props":361,"children":362},{},[363,365,368],{"type":24,"value":364},"其最终输出则是通过V这个映射后的向量与Q，K经过Softmax结果进行weight 
sum获得，这个过程可以理解为在全局上进行自注意表示。",{"type":18,"tag":297,"props":366,"children":367},{},[],{"type":18,"tag":38,"props":369,"children":370},{},[371],{"type":24,"value":372},"每一组Q，K，V最后都有一个V输出，这是Self-Attention得到的最终结果，是当前向量在结合了它与其他向量关联权重后得到的结果。",{"type":18,"tag":26,"props":374,"children":375},{},[376],{"type":24,"value":377},"$$ b_1 = \\sum_i \\hat a_{1,i}v_i,\\hspace{1em} i = 1,2,3... \\tag{4} $$",{"type":18,"tag":26,"props":379,"children":380},{},[381],{"type":24,"value":382},"通过下图可以整体把握Self-Attention的全部过程。",{"type":18,"tag":26,"props":384,"children":385},{},[386],{"type":18,"tag":30,"props":387,"children":389},{"alt":7,"src":388},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/986c448cc7614fc3919189b34fa1a610.png",[],{"type":18,"tag":26,"props":391,"children":392},{},[393],{"type":24,"value":394},"多头注意力机制就是将原本self-Attention处理的向量分割为多个Head进行处理，这一点也可以从代码中体现，这也是attention结构可以进行并行加速的一个方面。",{"type":18,"tag":26,"props":396,"children":397},{},[398],{"type":24,"value":399},"总结来说，多头注意力机制在保持参数总量不变的情况下，将同样的query, key和value映射到原来的高维空间（Q,K,V）的不同子空间(Q_0,K_0,V_0)中进行自注意力的计算，最后再合并不同子空间中的注意力信息。",{"type":18,"tag":26,"props":401,"children":402},{},[403],{"type":24,"value":404},"所以，对于同一个输入向量，多个注意力机制可以同时对其进行处理，即利用并行计算加速处理过程，又在处理的时候更充分的分析和利用了向量特征。下图展示了多头注意力机制，其并行能力的主要体现在下图中的$a_1$和$a_2$是同一个向量进行分割获得的。",{"type":18,"tag":26,"props":406,"children":407},{},[408],{"type":18,"tag":30,"props":409,"children":411},{"alt":7,"src":410},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/a8f770d684bd4c108a1dac3e26dcc0ff.png",[],{"type":18,"tag":26,"props":413,"children":414},{},[415],{"type":24,"value":416},"以下是vision套件中的Multi-Head Attention代码，结合上文的解释，代码清晰的展现了这一过程。",{"type":18,"tag":189,"props":418,"children":420},{"code":419},"import mindspore.nn as nn\n\nclass Attention(nn.Cell):\n    def __init__(self,\n                 dim: int,\n                 num_heads: int = 8,\n                 keep_prob: float = 1.0,\n                 attention_keep_prob: float = 
1.0):\n        super(Attention, self).__init__()\n\n        self.num_heads = num_heads\n        head_dim = dim // num_heads\n        self.scale = Tensor(head_dim ** -0.5)\n\n        self.qkv = nn.Dense(dim, dim * 3)\n        self.attn_drop = nn.Dropout(attention_keep_prob)\n        self.out = nn.Dense(dim, dim)\n        self.out_drop = nn.Dropout(keep_prob)\n\n        self.mul = P.Mul()\n        self.reshape = P.Reshape()\n        self.transpose = P.Transpose()\n        self.unstack = P.Unstack(axis=0)\n        self.attn_matmul_v = P.BatchMatMul()\n        self.q_matmul_k = P.BatchMatMul(transpose_b=True)\n        self.softmax = nn.Softmax(axis=-1)\n\n    def construct(self, x):\n        \"\"\"Attention construct.\"\"\"\n        b, n, c = x.shape\n\n        # 最初的输入向量首先会经过Embedding层映射成Q(Query)，K(Key)，V(Value)三个向量\n        # 由于是并行操作，所以代码中是映射成为dim*3的向量然后进行分割\n        qkv = self.qkv(x)\n\n        #多头注意力机制就是将原本self-Attention处理的向量分割为多个Head进行处理\n        qkv = self.reshape(qkv, (b, n, 3, self.num_heads, c // self.num_heads))\n        qkv = self.transpose(qkv, (2, 0, 3, 1, 4))\n        q, k, v = self.unstack(qkv)\n\n        # 自注意力机制的自注意主要体现在它的Q，K，V都来源于其自身\n        # 也就是该过程是在提取输入的不同顺序的向量的联系与特征\n        # 最终通过不同顺序向量之间的联系紧密性（Q与K乘积经过Softmax的结果）来表现出来\n        attn = self.q_matmul_k(q, k)\n        attn = self.mul(attn, self.scale)\n        attn = self.softmax(attn)\n        attn = self.attn_drop(attn)\n\n        # 其最终输出则是通过V这个映射后的向量与QK经过Softmax结果进行weight sum获得\n        # 这个过程可以理解为在全局上进行自注意表示\n        out = self.attn_matmul_v(attn, v)\n        out = self.transpose(out, (0, 2, 1, 3))\n        out = self.reshape(out, (b, n, c))\n        out = self.out(out)\n        out = self.out_drop(out)\n\n        return out\n",[421],{"type":18,"tag":194,"props":422,"children":423},{"__ignoreMap":7},[424],{"type":24,"value":419},{"type":18,"tag":26,"props":426,"children":427},{},[428],{"type":18,"tag":38,"props":429,"children":430},{},[431],{"type":24,"value":432},"Transformer 
Encoder",{"type":18,"tag":26,"props":434,"children":435},{},[436],{"type":24,"value":437},"在了解了Self-Attention结构之后，通过与Feed Forward，Residual Connection等结构的拼接就可以形成Transformer的基础结构，接下来就利用Self-Attention来构建ViT模型中的TransformerEncoder部分，类似于构建了一个Transformer的编码器部分，如下图所示：",{"type":18,"tag":26,"props":439,"children":440},{},[441],{"type":18,"tag":30,"props":442,"children":444},{"alt":7,"src":443},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/75b341660e0c440e808058f9cd13f4b4.png",[],{"type":18,"tag":104,"props":446,"children":447},{},[448,453,458],{"type":18,"tag":108,"props":449,"children":450},{},[451],{"type":24,"value":452},"ViT模型中的基础结构与标准Transformer有所不同，主要在于Normalization的位置是放在Self-Attention和Feed Forward之前，其他结构如Residual Connection，Feed Forward，Normalization都如Transformer中所设计。",{"type":18,"tag":108,"props":454,"children":455},{},[456],{"type":24,"value":457},"从Transformer结构的图片可以发现，多个子encoder的堆叠就完成了模型编码器的构建，在ViT模型中，依然沿用这个思路，通过配置超参数num_layers，就可以确定堆叠层数。",{"type":18,"tag":108,"props":459,"children":460},{},[461],{"type":24,"value":462},"Residual Connection，Normalization的结构可以保证模型有很强的扩展性（保证信息经过深层处理不会出现退化的现象，这是Residual Connection的作用），Normalization和dropout的应用可以增强模型泛化能力。",{"type":18,"tag":26,"props":464,"children":465},{},[466],{"type":24,"value":467},"从以下源码中就可以清晰看到Transformer的结构。将TransformerEncoder结构和一个多层感知器（MLP）结合，就构成了ViT模型的backbone部分。",{"type":18,"tag":189,"props":469,"children":471},{"code":470},"class TransformerEncoder(nn.Cell):\n    def __init__(self,\n                 dim: int,\n                 num_layers: int,\n                 num_heads: int,\n                 mlp_dim: int,\n                 keep_prob: float = 1.,\n                 attention_keep_prob: float = 1.0,\n                 drop_path_keep_prob: float = 1.0,\n                 activation: nn.Cell = nn.GELU,\n                 norm: nn.Cell = nn.LayerNorm):\n        super(TransformerEncoder, self).__init__()\n        layers = []\n\n        # 
从vit_architecture图可以发现，多个子encoder的堆叠就完成了模型编码器的构建\n        # 在ViT模型中，依然沿用这个思路，通过配置超参数num_layers，就可以确定堆叠层数\n        for _ in range(num_layers):\n            normalization1 = norm((dim,))\n            normalization2 = norm((dim,))\n            attention = Attention(dim=dim,\n                                  num_heads=num_heads,\n                                  keep_prob=keep_prob,\n                                  attention_keep_prob=attention_keep_prob)\n\n            feedforward = FeedForward(in_features=dim,\n                                      hidden_features=mlp_dim,\n                                      activation=activation,\n                                      keep_prob=keep_prob)\n\n            # ViT模型中的基础结构与标准Transformer有所不同\n            # 主要在于Normalization的位置是放在Self-Attention和Feed Forward之前\n            # 其他结构如Residual Connection，Feed Forward，Normalization都如Transformer中所设计\n            layers.append(\n                nn.SequentialCell([\n                    # Residual Connection，Normalization的结构可以保证模型有很强的扩展性\n                    # 保证信息经过深层处理不会出现退化的现象，这是Residual Connection的作用\n                    # Normalization和dropout的应用可以增强模型泛化能力\n                    ResidualCell(nn.SequentialCell([normalization1,\n                                                    attention])),\n\n                    ResidualCell(nn.SequentialCell([normalization2,\n                                                    feedforward]))\n                ])\n            )\n        self.layers = nn.SequentialCell(layers)\n\n    def construct(self, x):\n        \"\"\"Transformer construct.\"\"\"\n        return 
self.layers(x)\n",[472],{"type":18,"tag":194,"props":473,"children":474},{"__ignoreMap":7},[475],{"type":24,"value":470},{"type":18,"tag":26,"props":477,"children":478},{},[479],{"type":18,"tag":38,"props":480,"children":481},{},[482],{"type":24,"value":483},"ViT模型的输入",{"type":18,"tag":26,"props":485,"children":486},{},[487],{"type":24,"value":488},"传统的Transformer结构主要用于处理自然语言领域的词向量（Word Embedding or Word Vector），词向量与传统图像数据的主要区别在于，词向量通常是1维向量进行堆叠，而图片则是二维矩阵的堆叠，多头注意力机制在处理1维词向量的堆叠时会提取词向量之间的联系也就是上下文语义，这使得Transformer在自然语言处理领域非常好用，而2维图片矩阵如何与1维词向量进行转化就成为了Transformer进军图像处理领域的一个小门槛。",{"type":18,"tag":26,"props":490,"children":491},{},[492],{"type":24,"value":493},"在ViT模型中：",{"type":18,"tag":104,"props":495,"children":496},{},[497,512],{"type":18,"tag":108,"props":498,"children":499},{},[500,502,505,510],{"type":24,"value":501},"通过将输入图像在每个channel上划分为16*16个patch，这一步是通过卷积操作来完成的，当然也可以人工进行划分，但卷积操作也可以达到目的同时还可以进行一次额外的数据处理；",{"type":18,"tag":297,"props":503,"children":504},{},[],{"type":18,"tag":38,"props":506,"children":507},{},[508],{"type":24,"value":509},"例如一幅输入224 x 224的图像，首先经过卷积处理得到16 x 16个patch，那么每一个patch的大小就是14 x 14",{"type":24,"value":511},"。",{"type":18,"tag":108,"props":513,"children":514},{},[515],{"type":24,"value":516},"再将每一个patch的矩阵拉伸成为一个1维向量，从而获得了近似词向量堆叠的效果。上一步得到的14 x 14的patch就转换为长度为196的向量。",{"type":18,"tag":26,"props":518,"children":519},{},[520],{"type":24,"value":521},"这是图像输入网络经过的第一步处理。具体Patch Embedding的代码如下所示：",{"type":18,"tag":189,"props":523,"children":525},{"code":524},"class PatchEmbedding(nn.Cell):\n    MIN_NUM_PATCHES = 4\n    def __init__(self,\n                 image_size: int = 224,\n                 patch_size: int = 16,\n                 embed_dim: int = 768,\n                 input_channels: int = 3):\n        super(PatchEmbedding, self).__init__()\n\n        self.image_size = image_size\n        self.patch_size = patch_size\n        self.num_patches = (image_size // patch_size) ** 2\n\n        # 通过将输入图像在每个channel上划分为16*16个patch\n        self.conv = 
nn.Conv2d(input_channels, embed_dim, kernel_size=patch_size, stride=patch_size, has_bias=True)\n        self.reshape = P.Reshape()\n        self.transpose = P.Transpose()\n\n    def construct(self, x):\n        \"\"\"Path Embedding construct.\"\"\"\n        x = self.conv(x)\n        b, c, h, w = x.shape\n\n        # 再将每一个patch的矩阵拉伸成为一个1维向量，从而获得了近似词向量堆叠的效果；\n        x = self.reshape(x, (b, c, h * w))\n        x = self.transpose(x, (0, 2, 1))\n\n        return x\n",[526],{"type":18,"tag":194,"props":527,"children":528},{"__ignoreMap":7},[529],{"type":24,"value":524},{"type":18,"tag":26,"props":531,"children":532},{},[533],{"type":24,"value":534},"输入图像在划分为patch之后，会经过pos_embedding 和 class_embedding两个过程。",{"type":18,"tag":104,"props":536,"children":537},{},[538,548,558,563],{"type":18,"tag":108,"props":539,"children":540},{},[541,543],{"type":24,"value":542},"class_embedding主要借鉴了BERT模型的用于文本分类时的思想，在每一个word vector之前增加一个类别值，通常是加在向量的第一位，",{"type":18,"tag":38,"props":544,"children":545},{},[546],{"type":24,"value":547},"上一步得到的196维的向量加上class_embedding后变为197维。",{"type":18,"tag":108,"props":549,"children":550},{},[551,553],{"type":24,"value":552},"增加的class_embedding是一个可以学习的参数，经过网络的不断训练，最终以输出向量的第一个维度的输出来决定最后的输出类别；",{"type":18,"tag":38,"props":554,"children":555},{},[556],{"type":24,"value":557},"由于输入是16 x 16个patch，所以输出进行分类时是取 16 x 
16个class_embedding进行分类。",{"type":18,"tag":108,"props":559,"children":560},{},[561],{"type":24,"value":562},"pos_embedding也是一组可以学习的参数，会被加入到经过处理的patch矩阵中。",{"type":18,"tag":108,"props":564,"children":565},{},[566,568],{"type":24,"value":567},"由于pos_embedding也是可以学习的参数，所以它的加入类似于全连接网络和卷积的bias。",{"type":18,"tag":38,"props":569,"children":570},{},[571],{"type":24,"value":572},"这一步就是创造一个长度为197的可训练向量加入到经过class_embedding的向量中。",{"type":18,"tag":26,"props":574,"children":575},{},[576],{"type":24,"value":577},"实际上，pos_embedding总共有4种方案。",{"type":18,"tag":26,"props":579,"children":580},{},[581],{"type":24,"value":582},"但是经过作者的论证，只有加上pos_embedding和不加pos_embedding有明显影响，至于pos_embedding是1维还是2维对分类结果影响不大，所以，在我们的代码中，也是采用了1维的pos_embedding，由于class_embedding是加在pos_embedding之前，所以pos_embedding的维度会比patch拉伸后的维度加1。",{"type":18,"tag":26,"props":584,"children":585},{},[586],{"type":24,"value":587},"总的来说，ViT模型还是利用了Transformer模型在处理上下文语义时的优势，将图像转换为一种“变种词向量”然后进行处理，而这样转换的意义在于，多个patch之间本身具有空间联系，这类似于一种“空间语义”，从而获得了比较好的处理效果。",{"type":18,"tag":26,"props":589,"children":590},{},[591],{"type":18,"tag":38,"props":592,"children":593},{},[594],{"type":24,"value":595},"整体构建ViT",{"type":18,"tag":26,"props":597,"children":598},{},[599],{"type":24,"value":600},"以下代码构建了一个完整的ViT模型。",{"type":18,"tag":189,"props":602,"children":604},{"code":603},"from typing import Optional\n\nclass ViT(nn.Cell):\n    def __init__(self,\n                 image_size: int = 224,\n                 input_channels: int = 3,\n                 patch_size: int = 16,\n                 embed_dim: int = 768,\n                 num_layers: int = 12,\n                 num_heads: int = 12,\n                 mlp_dim: int = 3072,\n                 keep_prob: float = 1.0,\n                 attention_keep_prob: float = 1.0,\n                 drop_path_keep_prob: float = 1.0,\n                 activation: nn.Cell = nn.GELU,\n                 norm: Optional[nn.Cell] = nn.LayerNorm,\n                 pool: str = 'cls') -> None:\n        super(ViT, 
self).__init__()\n\n        self.patch_embedding = PatchEmbedding(image_size=image_size,\n                                              patch_size=patch_size,\n                                              embed_dim=embed_dim,\n                                              input_channels=input_channels)\n        num_patches = self.patch_embedding.num_patches\n\n        # 此处增加class_embedding和pos_embedding，如果不是进行分类任务\n        # 可以只增加pos_embedding，通过pool参数进行控制\n        self.cls_token = init(init_type=Normal(sigma=1.0),\n                              shape=(1, 1, embed_dim),\n                              dtype=ms.float32,\n                              name='cls',\n                              requires_grad=True)\n\n        # pos_embedding也是一组可以学习的参数，会被加入到经过处理的patch矩阵中\n        self.pos_embedding = init(init_type=Normal(sigma=1.0),\n                                  shape=(1, num_patches + 1, embed_dim),\n                                  dtype=ms.float32,\n                                  name='pos_embedding',\n                                  requires_grad=True)\n\n        # axis=1定义了会在向量的开头加入class_embedding\n        self.concat = P.Concat(axis=1)\n\n        self.pool = pool\n        self.pos_dropout = nn.Dropout(keep_prob)\n        self.norm = norm((embed_dim,))\n        self.tile = P.Tile()\n        self.transformer = TransformerEncoder(dim=embed_dim,\n                                              num_layers=num_layers,\n                                              num_heads=num_heads,\n                                              mlp_dim=mlp_dim,\n                                              keep_prob=keep_prob,\n                                              attention_keep_prob=attention_keep_prob,\n                                              drop_path_keep_prob=drop_path_keep_prob,\n                                              activation=activation,\n                                              norm=norm)\n\n    def construct(self, x):\n        
\"\"\"ViT construct.\"\"\"\n        x = self.patch_embedding(x)\n\n        # class_embedding主要借鉴了BERT模型的用于文本分类时的思想\n        # 在每一个word vector之前增加一个类别值，通常是加在向量的第一位\n        cls_tokens = self.tile(self.cls_token, (x.shape[0], 1, 1))\n        x = self.concat((cls_tokens, x))\n        x += self.pos_embedding\n\n        x = self.pos_dropout(x)\n        x = self.transformer(x)\n        x = self.norm(x)\n\n        # 增加的class_embedding是一个可以学习的参数，经过网络的不断训练\n        # 最终以输出向量的第一个维度的输出来决定最后的输出类别；\n        x = x[:, 0]\n\n        return x\n",[605],{"type":18,"tag":194,"props":606,"children":607},{"__ignoreMap":7},[608],{"type":24,"value":603},{"type":18,"tag":26,"props":610,"children":611},{},[612],{"type":24,"value":613},"整体流程图如下所示：",{"type":18,"tag":26,"props":615,"children":616},{},[617],{"type":18,"tag":30,"props":618,"children":620},{"alt":7,"src":619},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/62653c286eed427d9ca986d6d72cb15d.png",[],{"type":18,"tag":26,"props":622,"children":623},{},[624],{"type":18,"tag":38,"props":625,"children":626},{},[627],{"type":24,"value":628},"模型训练与推理",{"type":18,"tag":26,"props":630,"children":631},{},[632],{"type":18,"tag":38,"props":633,"children":634},{},[635],{"type":24,"value":636},"模型训练",{"type":18,"tag":26,"props":638,"children":639},{},[640],{"type":24,"value":641},"模型开始训练前，需要设定损失函数，优化器，回调函数等，直接调用MindSpore Vision提供的接口可以方便完成实例化。",{"type":18,"tag":26,"props":643,"children":644},{},[645],{"type":24,"value":646},"完整训练ViT模型需要很长的时间，实际应用时建议根据项目需要调整epoch_size，当正常输出每个Epoch的step信息时，意味着训练正在进行，通过模型输出可以查看当前训练的loss值和时间等指标。",{"type":18,"tag":189,"props":648,"children":650},{"code":649},"import mindspore.nn as nn\nfrom mindspore import Model\nfrom mindspore import ModelCheckpoint, CheckpointConfig\n\nfrom mindvision.classification.models import vit_b_16\nfrom mindvision.engine.callback import LossMonitor\nfrom mindvision.engine.loss import CrossEntropySmooth\n\n# 定义超参数\nepoch_size = 10\nmomentum = 0.9\nstep_size = 
dataset_train.get_dataset_size()\nnum_classes = 1000\n\n# 构建模型\nnetwork = vit_b_16(num_classes=num_classes, image_size=resize, pretrained=True)\n\n# 定义递减的学习率\nlr = nn.cosine_decay_lr(min_lr=float(0),\n                        max_lr=0.003,\n                        total_step=epoch_size * step_size,\n                        step_per_epoch=step_size,\n                        decay_epoch=90)\n\n# 定义优化器\nnetwork_opt = nn.Adam(network.trainable_params(), lr, momentum)\n\n# 定义损失函数\nnetwork_loss = CrossEntropySmooth(sparse=True,\n                                  reduction=\"mean\",\n                                  smooth_factor=0.1,\n                                  classes_num=num_classes)\n\n# 设定checkpoint\nckpt_config = CheckpointConfig(save_checkpoint_steps=step_size, keep_checkpoint_max=100)\nckpt_callback = ModelCheckpoint(prefix='vit_b_16', directory='./ViT', config=ckpt_config)\n\n# 初始化模型\nmodel = Model(network, loss_fn=network_loss, optimizer=network_opt, metrics={\"acc\"})\n\n# 训练\nmodel.train(epoch_size,\n            dataset_train,\n            callbacks=[ckpt_callback, LossMonitor(lr)],\n            dataset_sink_mode=False)\n",[651],{"type":18,"tag":194,"props":652,"children":653},{"__ignoreMap":7},[654],{"type":24,"value":649},{"type":18,"tag":189,"props":656,"children":658},{"code":657},"346281984B [02:29, 2316804.40B/s]                                \nEpoch:[  0/ 10], step:[    1/  125], loss:[1.908/1.908], time:9580.221 ms, lr:0.00300\nEpoch:[  0/ 10], step:[    2/  125], loss:[13.546/7.727], time:756.571 ms, lr:0.00300\nEpoch:[  0/ 10], step:[    3/  125], loss:[6.916/7.457], time:767.550 ms, lr:0.00300\nEpoch:[  0/ 10], step:[    4/  125], loss:[10.087/8.114], time:767.763 ms, lr:0.00300\nEpoch:[  0/ 10], step:[    5/  125], loss:[7.964/8.084], time:767.676 ms, lr:0.00300\nEpoch:[  0/ 10], step:[    6/  125], loss:[7.451/7.979], time:773.207 ms, lr:0.00300\nEpoch:[  0/ 10], step:[    7/  125], loss:[7.434/7.901], time:769.799 ms, 
lr:0.00300\n...\nEpoch:[  9/ 10], step:[  120/  125], loss:[6.524/6.403], time:788.128 ms, lr:0.00293\nEpoch:[  9/ 10], step:[  121/  125], loss:[6.467/6.404], time:788.564 ms, lr:0.00293\nEpoch:[  9/ 10], step:[  122/  125], loss:[6.483/6.405], time:788.837 ms, lr:0.00293\nEpoch:[  9/ 10], step:[  123/  125], loss:[6.233/6.403], time:788.030 ms, lr:0.00293\nEpoch:[  9/ 10], step:[  124/  125], loss:[6.781/6.406], time:791.158 ms, lr:0.00293\nEpoch:[  9/ 10], step:[  125/  125], loss:[6.584/6.408], time:1614.970 ms, lr:0.00293\nEpoch time: 99938.756 ms, per step time: 799.510 ms, avg loss: 6.408\n",[659],{"type":18,"tag":194,"props":660,"children":661},{"__ignoreMap":7},[662],{"type":24,"value":657},{"type":18,"tag":26,"props":664,"children":665},{},[666],{"type":18,"tag":38,"props":667,"children":668},{},[669],{"type":24,"value":670},"模型验证",{"type":18,"tag":26,"props":672,"children":673},{},[674],{"type":24,"value":675},"模型验证过程主要应用了Model，ImageNet，CrossEntropySmooth和vit_b_16等接口。",{"type":18,"tag":26,"props":677,"children":678},{},[679],{"type":24,"value":680},"Model主要用于编译模型。",{"type":18,"tag":26,"props":682,"children":683},{},[684],{"type":24,"value":685},"ImageNet主要用于读取数据集。",{"type":18,"tag":26,"props":687,"children":688},{},[689],{"type":24,"value":690},"CrossEntropySmooth是损失函数实例化接口。",{"type":18,"tag":26,"props":692,"children":693},{},[694],{"type":24,"value":695},"vit_b_16是MindSpore Vision提供的模型实例化接口。",{"type":18,"tag":26,"props":697,"children":698},{},[699],{"type":24,"value":700},"通过改变ImageNet接口的split参数即可调用验证集。",{"type":18,"tag":26,"props":702,"children":703},{},[704],{"type":24,"value":705},"与训练过程相似，首先调用vit_b_16接口定义网络结构，加载预训练模型参数。随后设置损失函数，评价指标等，编译模型后进行验证。本案例采用了业界通用的评价标准Top_1_Accuracy和Top_5_Accuracy评价指标来评价模型表现。",{"type":18,"tag":26,"props":707,"children":708},{},[709],{"type":24,"value":710},"在本案例中，这两个指标代表了在输出的1000维向量中，以最大值或前5的输出值所代表的类别为预测结果时，模型预测的准确率。这两个指标的值越大，代表模型准确率越高。",{"type":18,"tag":189,"props":712,"children":714},{"code":713},"dataset_analyse = 
ImageNet(data_url,\n                           split=\"val\",\n                           num_parallel_workers=1,\n                           resize=resize,\n                           batch_size=batch_size)\ndataset_eval = dataset_analyse.run()\n\nnetwork = vit_b_16(num_classes=num_classes, image_size=resize, pretrained=True)\n\nnetwork_loss = CrossEntropySmooth(sparse=True,\n                                  reduction=\"mean\",\n                                  smooth_factor=0.1,\n                                  classes_num=num_classes)\n\n# 定义评价指标\neval_metrics = {'Top_1_Accuracy': nn.Top1CategoricalAccuracy(),\n                'Top_5_Accuracy': nn.Top5CategoricalAccuracy()}\n\nmodel = Model(network, network_loss, metrics=eval_metrics)\n\n# 评估模型\nresult = model.eval(dataset_eval)\nprint(result)\n",[715],{"type":18,"tag":194,"props":716,"children":717},{"__ignoreMap":7},[718],{"type":24,"value":713},{"type":18,"tag":26,"props":720,"children":721},{},[722],{"type":24,"value":723},"{'Top_1_Accuracy': 0.73524, 'Top_5_Accuracy': 0.91756}",{"type":18,"tag":26,"props":725,"children":726},{},[727],{"type":24,"value":728},"从结果可以看出，由于我们加载了预训练模型参数，模型的Top_1_Accuracy和Top_5_Accuracy达到了很高的水平，实际项目中也可以以此准确率为标准。如果未使用预训练模型参数，则需要更多的epoch来训练。",{"type":18,"tag":26,"props":730,"children":731},{},[732],{"type":18,"tag":38,"props":733,"children":734},{},[735],{"type":24,"value":736},"模型推理",{"type":18,"tag":26,"props":738,"children":739},{},[740],{"type":24,"value":741},"在进行模型推理之前，首先要定义一个对推理图片进行数据预处理的方法。该方法可以对我们的推理图片进行resize和normalize处理，这样才能与我们训练时的输入数据匹配。",{"type":18,"tag":26,"props":743,"children":744},{},[745],{"type":24,"value":746},"本案例采用了一张Doberman的图片作为推理图片来测试模型表现，期望模型可以给出正确的预测结果。",{"type":18,"tag":189,"props":748,"children":750},{"code":749},"import mindspore.dataset.vision.c_transforms as transforms\n\n# 数据预处理操作\ndef infer_transform(dataset, columns_list, resize):\n\n    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]\n    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]\n\n    
trans = [transforms.Decode(),\n             transforms.Resize([resize, resize]),\n             transforms.Normalize(mean=mean, std=std),\n             transforms.HWC2CHW()]\n\n    dataset = dataset.map(operations=trans,\n                          input_columns=columns_list[0],\n                          num_parallel_workers=1)\n    dataset = dataset.batch(1)\n\n    return dataset\n",[751],{"type":18,"tag":194,"props":752,"children":753},{"__ignoreMap":7},[754],{"type":24,"value":749},{"type":18,"tag":26,"props":756,"children":757},{},[758],{"type":24,"value":759},"接下来，我们将调用模型的predict方法进行模型推理，利用read_dataset接口读推理图片路径，利用GeneratorDataset来生成测试集。",{"type":18,"tag":26,"props":761,"children":762},{},[763],{"type":24,"value":764},"在推理过程中，ImageNet接口主要负责对原数据集标签和模型输出进行配对。通过index2label就可以获取对应标签，再通过show_result接口将结果写在对应图片上。",{"type":18,"tag":189,"props":766,"children":768},{"code":767},"import numpy as np\n\nimport mindspore.dataset as ds\nfrom mindspore import Tensor\n\nfrom mindvision.dataset.generator import DatasetGenerator\nfrom mindvision.dataset.download import read_dataset\nfrom mindvision.classification.utils.image import show_result\n\n# 读取推理图片\nimage_list, image_label = read_dataset('./dataset/infer')\ncolumns_list = ('image', 'label')\n\ndataset_infer = ds.GeneratorDataset(DatasetGenerator(image_list, image_label),\n                                    column_names=list(columns_list),\n                                    num_parallel_workers=1)\n\ndataset_infer = infer_transform(dataset_infer, columns_list, resize)\n\n# 读取数据进行推理\nfor i, image in enumerate(dataset_infer.create_dict_iterator(output_numpy=True)):\n    image = image[\"image\"]\n    image = Tensor(image)\n    prob = model.predict(image)\n    label = np.argmax(prob.asnumpy(), axis=1)\n\n    predict = dataset_analyse.index2label[int(label)]\n    output = {int(label): predict}\n    print(output)\n    show_result(img=image_list[i], result=output, 
out_file=image_list[i])\n",[769],{"type":18,"tag":194,"props":770,"children":771},{"__ignoreMap":7},[772],{"type":24,"value":767},{"type":18,"tag":26,"props":774,"children":775},{},[776],{"type":24,"value":777},"{236: 'Doberman'}",{"type":18,"tag":26,"props":779,"children":780},{},[781],{"type":24,"value":782},"推理过程完成后，在推理文件夹下可以找到图片的推理结果，可以看出预测结果是Doberman，与期望结果相同，验证了模型的准确性。",{"type":18,"tag":26,"props":784,"children":785},{},[786],{"type":18,"tag":30,"props":787,"children":789},{"alt":7,"src":788},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/1d1a59a1a044488e9a0d6a7c4ae5d00e.png",[],{"type":18,"tag":26,"props":791,"children":792},{},[793],{"type":18,"tag":38,"props":794,"children":795},{},[796],{"type":24,"value":797},"总结",{"type":18,"tag":26,"props":799,"children":800},{},[801],{"type":24,"value":802},"本案例完成了一个ViT模型在ImageNet数据上进行训练，验证和推理的过程，其中，对关键的ViT模型结构和原理作了讲解。",{"type":18,"tag":26,"props":804,"children":805},{},[806,808],{"type":24,"value":807},"通过学习本案例，理解源码可以帮助用户掌握Multi-Head Attention，TransformerEncoder，pos_embedding等关键概念，如果要详细理解ViT的模型原理，建议基于源码更深层次的详细阅读，可以参考vision套件：",{"type":18,"tag":150,"props":809,"children":812},{"href":810,"rel":811},"https://gitee.com/mindspore/vision/tree/master/examples/classification/vit",[154],[813],{"type":24,"value":810},{"type":18,"tag":26,"props":815,"children":816},{},[817],{"type":18,"tag":38,"props":818,"children":819},{},[820],{"type":24,"value":821},"【引用】",{"type":18,"tag":26,"props":823,"children":824},{},[825],{"type":24,"value":826},"[1] Dosovitskiy, Alexey, et al. \"An image is worth 16x16 words: Transformers for image recognition at scale.\" arXiv preprint arXiv:2010.11929 (2020).",{"type":18,"tag":26,"props":828,"children":829},{},[830],{"type":24,"value":831},"[2] Vaswani, Ashish, et al. \"Attention is all you need.\" Advances in Neural Information Processing Systems. 
(2017).",{"type":18,"tag":26,"props":833,"children":834},{},[835],{"type":18,"tag":30,"props":836,"children":838},{"alt":7,"src":837},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/05/30/aacfde2b0c9d481288a561f9b72a5308.jpg",[],{"type":18,"tag":26,"props":840,"children":841},{},[842],{"type":18,"tag":38,"props":843,"children":844},{},[845],{"type":24,"value":846},"MindSpore官方资料",{"type":18,"tag":26,"props":848,"children":849},{},[850,855],{"type":18,"tag":38,"props":851,"children":852},{},[853],{"type":24,"value":854},"官方QQ群",{"type":24,"value":856}," : 486831414",{"type":18,"tag":26,"props":858,"children":859},{},[860,865,867],{"type":18,"tag":38,"props":861,"children":862},{},[863],{"type":24,"value":864},"官网",{"type":24,"value":866},"：",{"type":18,"tag":150,"props":868,"children":871},{"href":869,"rel":870},"https://www.mindspore.cn/",[154],[872],{"type":24,"value":869},{"type":18,"tag":26,"props":874,"children":875},{},[876,881],{"type":18,"tag":38,"props":877,"children":878},{},[879],{"type":24,"value":880},"Gitee",{"type":24,"value":882}," : https://gitee.com/mindspore/mindspore",{"type":18,"tag":26,"props":884,"children":885},{},[886,891,893],{"type":18,"tag":38,"props":887,"children":888},{},[889],{"type":24,"value":890},"GitHub",{"type":24,"value":892}," : ",{"type":18,"tag":150,"props":894,"children":897},{"href":895,"rel":896},"https://github.com/mindspore-ai/mindspore",[154],[898],{"type":24,"value":895},{"type":18,"tag":26,"props":900,"children":901},{},[902,907,908],{"type":18,"tag":38,"props":903,"children":904},{},[905],{"type":24,"value":906},"论坛",{"type":24,"value":866},{"type":18,"tag":150,"props":909,"children":912},{"href":910,"rel":911},"https://bbs.huaweicloud.com/forum/forum-1076-1.html",[154],[913],{"type":24,"value":910},{"title":7,"searchDepth":915,"depth":915,"links":916},4,[],"markdown","content:technology-blogs:zh:1508.md","content","technology-blogs/zh/1508.md","technology-blogs/zh/1508","md",1776506113000]