[{"data":1,"prerenderedAt":765},["ShallowReactive",2],{"content-query-J6qqZeDDRC":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":759,"_id":760,"_source":761,"_file":762,"_stem":763,"_extension":764},"/technology-blogs/zh/2836","zh",false,"","项目分享 | 如何通过昇思MindSpore实现强化学习玩游戏","《Playing Atari with Deep Reinforcement Learning》是首篇将强化学习与深度学习结合起来的深度强化学习经典论文，由DeepMind团队设计开发，算法在Atari 2600 游戏环境进行测试，在部分游戏中的测试表现优于人类玩家。","2023-10-19","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/10/25/6b7699f53be340108fe79e0132d65a2e.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":750},"root",[17,25,42,50,56,69,77,85,98,103,108,113,122,127,134,139,146,151,183,192,197,205,210,218,226,238,243,248,253,258,263,268,273,280,288,296,304,312,317,332,337,345,362,371,376,384,392,401,409,417,425,455,463,506,514,522,570,578,583,591,599,607,612,620,625,633,638,646,654,662,670,677,685,693,698,705,710,720,730,740],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"项目分享-如何通过昇思mindspore实现强化学习玩游戏",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29,35,37],{"type":18,"tag":30,"props":31,"children":32},"strong",{},[33],{"type":24,"value":34},"作者：Hamwon",{"type":24,"value":36}," ",{"type":18,"tag":30,"props":38,"children":39},{},[40],{"type":24,"value":41},"来源：CSDN",{"type":18,"tag":26,"props":43,"children":44},{},[45],{"type":18,"tag":30,"props":46,"children":47},{},[48],{"type":24,"value":49},"摘要",{"type":18,"tag":51,"props":52,"children":54},"h3",{"id":53},"playing-atari-with-deep-reinforcement-learning是首篇将强化学习与深度学习结合起来的深度强化学习经典论文由deepmind团队设计开发算法在atari-2600-游戏环境进行测试在部分游戏中的测试表现优于人类玩家",[55],{"type":24,"value":9},{"type":18,"tag":26,"props":57,"children":58},{},[59,61],{"type":24,"value":60},"论文网址：",{"type":18,"tag":62,"props":63,"children":67},"a",{"href":64,"rel":65},"https://paperswithcode.com/paper/playing-atari-with-deep-reinforcement",[66],"nofollow",[68],{"type":24,"value":64},{"type":18,"tag":26,"props":70,"children":71},{},[72],{"type":18,"tag":30,"props":73,"children":74},{},[75],{"type":24,"value":76},"01",{"type":18,"tag":26,"props":78,"children":79},{},[80],{"type":18,"tag":30,"props":81,"children":82},{},[83],{"type":24,"value":84},"用Pycharm创建虚拟环境项目",{"type":18,"tag":26,"props":86,"children":87},{},[88,90,96],{"type":24,"value":89},"项目代码和训练结果上传到百度网盘了，可以先下载下来，但是由于虚拟环境太大了所以没有上传，需要自己下载安装一遍，具体操作可以查看下文介绍。 链接：",{"type":18,"tag":62,"props":91,"children":94},{"href":92,"rel":93},"https://pan.baidu.com/s/1zoh0glqH4xcNSbOUuR2r7g?pwd=00wd",[66],[95],{"type":24,"value":92},{"type":24,"value":97}," 提取码：00wd",{"type":18,"tag":26,"props":99,"children":100},{},[101],{"type":24,"value":102},"首先使用Pycharm创建一个新项目，然后如下图所示在设置中添加虚拟环境：",{"type":18,"tag":26,"props":104,"children":105},{},[106],{"type":24,"value":107},"创建虚拟环境项目的目的在于使当前项目的运行环境与自己的Python环境分开，后续会在虚拟环境中安装需要的包，以免影响自己之前的Python环境。我用的Pycharm版本是2019版的，新版Pycharm的设置应该是类似的，可以根据自身情况百度。每个人的Anaconda路径不同，需要根据自己安装位置选择基本解释器。",{"type":18,"tag":26,"props":109,"children":110},{},[111],{"type":24,"value":112},"虚拟环境的配置参考CSDN文章：Pycharm 
Once the virtual environment has been created, you also need to set the terminal program in the settings:

![image.png](https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231025014033.96277622176543006207490699119590:50541024031313:2400:8312AF8F21A76308618AC2A7A0E858302106AF5A085429687AD19812C90D0C71.png)

Now open the Terminal tab at the bottom of PyCharm; the prompt is prefixed with (venv), which means the terminal is running inside the virtual environment:

![image.png](https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231025014050.77575176526129988714699763684126:50541024031313:2400:88707D9EBD366B26F994225558B9EECD3285D07D11EB9751C3E87E1D877CCF3A.png)

From this point on, every package we need can be installed with pip from this terminal.
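If you want to double-check that the terminal really is using the venv's interpreter rather than your system Python, a quick check (this snippet is my addition, not part of the original project):

```python
import sys

print(sys.executable)  # should point to a python inside the project's venv folder
```

Run it as `python -c "import sys; print(sys.executable)"` in the PyCharm terminal.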
Remember to copy the three folders **code**, **Imgs**, and **model** from the Baidu Netdisk download into the current project folder. The Python packages the project needs are listed in the **requirements.txt** file under the **code** folder. Open PyCharm's Terminal tab and cd into the **code** folder:

```
cd code
```

Then pip-install the required packages:

```
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
```

Normally, once the environment is configured as above, all the code under the code folder should run. If it does not, the problem is most likely the Atari game environment; see this CSDN article: "【gym】新版安装（0.21以上）以及配置Atari环境，超简单（Windows）" (installing the new gym, 0.21 and above, and configuring the Atari environment on Windows).

**02**

**The Model in the Paper**

Briefly, the paper designs a DQN that stacks 4 consecutive game frames, each cropped to 84×84, into a 4×84×84 input, then passes it through conv + ReLU, conv + ReLU, Flatten, fully connected + ReLU, and a final fully connected layer to produce an output whose dimension matches the number of actions. This article trains and tests on Breakout, which has 4 actions, so the output dimension here is 4.

The 4-dimensional output gives the Q(s, a) value of each of the 4 actions, and the index of the largest Q value is the action the network outputs:

- 0: do not move
- 1: start the game (if the game has already started, 1 also does not move the paddle)
- 2: move right
- 3: move left

Convolution output size: output = (input - kernel size + 2 × padding) / stride + 1
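Plugging in the layer settings the DQN below actually uses (an 8×8 kernel with stride 4, then a 4×4 kernel with stride 2, both with padding 0) confirms the flattened feature size:

```python
def conv_out(size, kernel, stride, padding=0):
    # output = (input - kernel + 2 * padding) / stride + 1
    return (size - kernel + 2 * padding) // stride + 1

h = conv_out(84, kernel=8, stride=4)  # (84 - 8) / 4 + 1 = 20
h = conv_out(h, kernel=4, stride=2)   # (20 - 4) / 2 + 1 = 9
print(32 * h * h)                     # 32 channels * 9 * 9 = 2592, the first Dense layer's in_channels
```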
![image.png](https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231025014122.90940851895781620272068123732246:50541024031313:2400:A3E0D2A39F01CD12A01D685129759825AF6DDCCF00BDF980850B08C68E2CD9B4.png)

![cke_6108.png](https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231025014146.55492156277183711940124602511233:50541024031313:2400:2DC3E34429750BC5984E86CB2EEC7C394B2506E60FFCC388FB3988A7F09152DE.png)

![cke_8210.png](https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231025014212.20451017065958895093324276358917:50541024031313:2400:D32C7702549EB4D983818D2B6845A46A10935F5C59FBFB5A7A41791BD95711AC.png)

**03**

**Implementing the Code with MindSpore**

Open the playing_atari.py file in the code folder; the code works as follows.

### 3.1 Creating the Game Environment

After importing the required libraries, first create the game environment env:

```python
env = gym.make("BreakoutNoFrameskip-v4")  # game environment
env = gym.wrappers.RecordEpisodeStatistics(env)
env = gym.wrappers.ResizeObservation(env, (84, 84))  # resize the frames
env = gym.wrappers.GrayScaleObservation(env)  # convert the frames to grayscale
env = gym.wrappers.FrameStack(env, 4)  # stack 4 frames together as one observation
env = MaxAndSkipEnv(env, skip=4)  # frame skipping: each action is held for 4 frames
```

> The env is already wrapped here so that its frames are preprocessed: every observation it emits is a 4×84×84 stack of grayscale frames.
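As a quick sanity check of the wrapped environment (my addition; it assumes gym's pre-0.26 API, where reset() returns only the observation):

```python
import numpy as np

obs = env.reset()
print(np.array(obs).shape)  # expected: (4, 84, 84)
```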
### 3.2 Defining the DQN Network

Define the DQN network with MindSpore. Using nn.SequentialCell(), we can simply lay out the layers of the designed architecture:

```python
class DQN(nn.Cell):
    def __init__(self, nb_actions):
        super().__init__()
        self.network = nn.SequentialCell(
            nn.Conv2d(in_channels=4, out_channels=16, kernel_size=8, stride=4, pad_mode='valid'),
            nn.ReLU(),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=2, pad_mode='valid'),
            nn.ReLU(),
            nn.Flatten(),
            nn.Dense(in_channels=2592, out_channels=256),
            nn.ReLU(),
            nn.Dense(in_channels=256, out_channels=nb_actions),
        )

    def construct(self, x):
        return self.network(x / 255.)
```

> construct() computes the network's output, similar to forward() in the PyTorch framework.

### 3.3 Designing the Experience Replay Buffer

```python
class ReplayBuffer():
    def __init__(self, replay_memory_size):
        ...

    def add(self, obs, next_obs, action, reward, done):
        ...

    def sample(self, sample_num):
        ...
        return (Tensor(temp_obs, ms.float32), Tensor(temp_next_obs, ms.float32),
                Tensor(temp_action, ms.int32), Tensor(temp_reward, ms.float32),
                Tensor(temp_done, ms.float32))
```

> The full code is not shown here. In short, the buffer stores experience tuples and samples them in batches for the later training of the neural network.
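Since the author omits the implementation, here is a minimal sketch of what such a buffer might look like. Only the method signatures and the returned temp_* tensors come from the skeleton above; the deque-based internals are my assumption:

```python
import random
from collections import deque

import numpy as np
import mindspore as ms
from mindspore import Tensor


class ReplayBuffer():
    def __init__(self, replay_memory_size):
        # fixed-size FIFO store of (obs, next_obs, action, reward, done) tuples
        self.buffer = deque(maxlen=replay_memory_size)

    def add(self, obs, next_obs, action, reward, done):
        self.buffer.append((np.array(obs), np.array(next_obs), action, reward, done))

    def sample(self, sample_num):
        batch = random.sample(self.buffer, sample_num)
        temp_obs, temp_next_obs, temp_action, temp_reward, temp_done = map(np.array, zip(*batch))
        temp_action = temp_action.reshape(-1, 1)  # shape (batch, 1) so gather_elements(dim=1) works later
        return (Tensor(temp_obs, ms.float32), Tensor(temp_next_obs, ms.float32),
                Tensor(temp_action, ms.int32), Tensor(temp_reward, ms.float32),
                Tensor(temp_done, ms.float32))
```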
### 3.4 Defining the Loss Function, Optimizer, and Training Function

> First instantiate the DQN class as a network q_network, then define the optimizer as nn.Adam and the loss function as nn.HuberLoss().

```python
q_network = DQN(nb_actions=env.action_space.n)  # instantiate the network
optimizer = nn.Adam(params=q_network.trainable_params(), learning_rate=1.25e-4)  # optimizer
loss_fn = nn.HuberLoss()  # loss function
```

What follows is a step particular to training networks in MindSpore, called functional automatic differentiation; see the official tutorial on **functional automatic differentiation**. Concretely: first define a loss-computation function forward_fn; from it, generate a gradient-computation function grad_fn; then use grad_fn to define train_step, the function that trains the network for one step. With train_step, feeding in the required data updates the network parameters once, completing one training step.

```python
# loss computation function
def forward_fn(observations, actions, y):
    current_q_value = q_network(observations).gather_elements(dim=1, index=actions).squeeze()  # extract the Q value of the action stored in each experience tuple
    loss = loss_fn(current_q_value, y)
    return loss
```

![cke_14837.png](https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231025014358.78034916337968451122528118734273:50541024031313:2400:63ED79E86C5BDEE75B14B9AE7F804E7FFB55B91B7287E5F1B1EDFEAF1C1136BA.png)

> ms.ops.value_and_grad, given the loss-computation function forward_fn defined above, returns a gradient-computation function grad_fn.

> Then, inside the training function train_step, we use grad_fn to compute the gradients and the optimizer optimizer to back-propagate them, updating the network parameters and completing one training step.
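The screenshot above contains the author's actual code for this step; as a reference, here is a sketch of what this pairing typically looks like with MindSpore's functional autodiff (my reconstruction, not copied from the project):

```python
# differentiate forward_fn with respect to the trainable parameters held by the optimizer
grad_fn = ms.ops.value_and_grad(forward_fn, None, optimizer.parameters)


def train_step(observations, actions, y):
    loss, grads = grad_fn(observations, actions, y)
    optimizer(grads)  # apply the gradients, updating the network parameters in place
    return loss
```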
### 3.5 Training the Network

Now the network can be trained. The key parts of the code are explained below:

```python
def Deep_Q_Learning(env, replay_memory_size=100_000, nb_epochs=40000_000, update_frequency=4, batch_size=32,
                    discount_factor=0.99, replay_start_size=5000, initial_exploration=1, final_exploration=0.01,
                    exploration_steps=100_000):
```

> First define the training parameters: a replay buffer of capacity 100_000; 40000_000 training epochs in total; the network parameters are updated every 4 epochs; the discount factor is 0.99; training starts once the buffer holds 5000 experiences; the exploration probability starts at 1 and decays to a final 0.01 over 100_000 exploration epochs.

> Exploration here means that, so the DQN can learn a better policy, actions are at first generated randomly; the exploration probability shrinks gradually, after which actions come entirely from the DQN. This is the ε-greedy policy.
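The decay code itself is not shown in the article; a linear schedule consistent with the parameters above might look like this (an assumption on my part):

```python
def epsilon_at(step, initial_exploration=1.0, final_exploration=0.01, exploration_steps=100_000):
    # linear interpolation from initial_exploration down to final_exploration
    slope = (final_exploration - initial_exploration) / exploration_steps
    return max(final_exploration, initial_exploration + slope * step)


print(epsilon_at(0), epsilon_at(50_000), epsilon_at(200_000))  # 1.0 0.505 0.01
```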
Before training, set the network to training mode:

```python
q_network.set_train()  # set the network to training mode
```

Then let the DQN interact with the game. The code that produces an action (by random exploration or from the DQN) is:

```python
if random.random() < epsilon:  # With probability ε select a random action a
    action = np.array(env.action_space.sample())
else:  # Otherwise select a = max_a Q∗(φ(st), a; θ)
    temp_input = Tensor(obs, ms.float32).unsqueeze(0)
    q_values = q_network(temp_input)
    action = q_values.argmax(axis=1).item().asnumpy()
```

Each experience tuple is saved into the replay buffer:

```python
rb.add(obs, real_next_obs, action, reward, done)
```

![cke_21372.png](https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231025014440.87794678570098192223947423537648:50541024031313:2400:FBB4009BBEA6D342A010BF0BBD32398D4CF3FC887BACF5288651CAA64728FF31.png)

```python
data_obs, data_next_obs, data_action, data_reward, data_done = rb.sample(batch_size)
# no gradients are needed for this part, so it sits outside forward_fn and train_step
max_q_value = q_network(data_next_obs).max(1)
y = data_reward.flatten() + discount_factor * max_q_value * (1 - data_done.flatten())
loss = train_step(data_obs, data_action, y)
```

![cke_26946.png](https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231025014508.22305351042198701192712813633393:50541024031313:2400:1AC566CA75D72714B55F8FE5D0155993CEA8B27B1EE0FE2A02B488DC2DEFE64E.png)

![image.png](https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231025014516.69060803615290247542641417262676:50541024031313:2400:1F1BF77B16B34AD0C67BF3B2A3F5098333F307FEA24B9FF4E72AF50D6D919499.png)

**04**

**Experimental Results**

As you can see, the trained DQN has learned to play the game. It usually scores around 150, and with some luck it can reach 300, as here:

![image.png](https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231025014532.07682971029168148454198968324786:50541024031313:2400:654EE84F03860679B8BD853083ECE8B591477405D77379F605EDEA5B5EDE4840.png)

Previous posts:

[Project Share | Grafting HuggingFace Datasets onto MindSpore with MindNLP](http://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247609430&idx=1&sn=ff3fe18b51a20849f737972d978d3222&chksm=c11e3c19f669b50f0128f508ebcd2b6373eb930b8b0a0e54f493fdfb7a5dc30b10266656e003&scene=21#wechat_redirect)

[Project Share | New Environments and Algorithms for Reinforcement Learning in MindSpore](http://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247608163&idx=1&sn=6a07ea1ffa8e7276d22bb3d331a4affa&chksm=c11e372cf669be3ad60b6f6338c351daadf8265e87fca703d7cfe0e8cf89bf54fce84f19011c&scene=21#wechat_redirect)

[Project Share | 腺形智消: A New-Generation Leader in Diagnosing and Treating Pediatric Adenoid Hypertrophy](http://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247599998&idx=2&sn=5391bde94de0eee39b5ece32c7fab0a3&chksm=c11e5731f669de273eafad815cae90cdd22ef754ab568f84b1709949fa124fe125a899ca3282&scene=21#wechat_redirect)

[Project Share | The First-Prize 咸鱼 Team's Approach to Kidney Tumor Segmentation Based on the MindSpore AI Framework](http://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247596222&idx=2&sn=4d7bb8621a8818b4bb9cc77c44bafbd3&chksm=c11e68f1f669e1e7537b5c0871c451d32cf41dd12d5aff758bbb1dec639e38489a9e27f650a3&scene=21#wechat_redirect)