[{"data":1,"prerenderedAt":826},["ShallowReactive",2],{"content-query-S0DrfjDAVl":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":820,"_id":821,"_source":822,"_file":823,"_stem":824,"_extension":825},"/technology-blogs/zh/3160","zh",false,"","MindSpore A2C 强化学习","今天我们使用A2C算法进行训练。 Advantage Actor-Critic (A2C)算法是一个强化学习算法，它结合了策略梯度（Actor）和价值函数（Critic）的方法。","2024-06-04","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/28/c309646d58e540bd8f04528d26101693.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":808},"root",[17,25,31,38,74,80,85,95,100,204,209,214,222,227,235,240,248,256,261,269,276,281,287,295,431,437,445,564,570,578,688,694,702,774,780,788,803],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore-a2c-强化学习",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"今天我们使用A2C算法进行训练。 Advantage Actor-Critic (A2C)算法是一个强化学习算法，它结合了策略梯度（Actor）和价值函数（Critic）的方法。A2C算法在许多强化学习任务中表现优越，因为它能够利用价值函数来减少策略梯度的方差，同时直接优化策略。",{"type":18,"tag":32,"props":33,"children":35},"h3",{"id":34},"a2c算法的核心思想",[36],{"type":24,"value":37},"A2C算法的核心思想",{"type":18,"tag":39,"props":40,"children":41},"ul",{},[42,54,64],{"type":18,"tag":43,"props":44,"children":45},"li",{},[46,52],{"type":18,"tag":47,"props":48,"children":49},"strong",{},[50],{"type":24,"value":51},"Actor",{"type":24,"value":53},"：根据当前策略选择动作。",{"type":18,"tag":43,"props":55,"children":56},{},[57,62],{"type":18,"tag":47,"props":58,"children":59},{},[60],{"type":24,"value":61},"Critic",{"type":24,"value":63},"：评估一个状态-动作对的值（通常是使用状态值函数或动作值函数）。",{"type":18,"tag":43,"props":65,"children":66},{},[67,72],{"type":18,"tag":47,"props":68,"children":69},{},[70],{"type":24,"value":71},"优势函数（Advantage Function）",{"type":24,"value":73},"：用来衡量某个动作相对于平均水平的好坏，通常定义为 $A(s, a) = Q(s, a) - V(s)$。",{"type":18,"tag":32,"props":75,"children":77},{"id":76},"a2c算法的伪代码",[78],{"type":24,"value":79},"A2C算法的伪代码",{"type":18,"tag":26,"props":81,"children":82},{},[83],{"type":24,"value":84},"以下是A2C算法的伪代码：",{"type":18,"tag":86,"props":87,"children":89},"pre",{"code":88},"Initialize policy network (actor) π with parameters θ\nInitialize value network (critic) V with parameters w\nInitialize learning rates α_θ for policy network and α_w for value network\n\nfor each episode do\n    Initialize state s\n    while state s is not terminal do\n        # Actor: select action a according to the current policy π(a|s; θ)\n        a = select_action(s, θ)\n        \n        # Execute action a in the environment, observe reward r and next state s'\n        r, s' = environment.step(a)\n        \n        # Critic: compute the value of the current state V(s; w)\n        V_s = V(s, w)\n        \n        # Critic: compute the value of the next state V(s'; w)\n        V_s_prime = V(s', w)\n        \n        # Compute the TD error (δ)\n        δ = r + γ * V_s_prime - V_s\n        \n        # Critic: update the value network parameters w\n        w = w + α_w * δ * ∇_w V(s; w)\n        \n        # Compute the advantage function A(s, a)\n        A = δ\n        \n        # Actor: update the policy network parameters θ\n        θ = θ + α_θ * A * ∇_θ log π(a|s; θ)\n        \n        # Move to the next state\n        s = s'\n    end while\nend 
for\n",[90],{"type":18,"tag":91,"props":92,"children":93},"code",{"__ignoreMap":7},[94],{"type":24,"value":88},{"type":18,"tag":32,"props":96,"children":98},{"id":97},"解释",[99],{"type":24,"value":97},{"type":18,"tag":101,"props":102,"children":103},"ol",{},[104,114,124,134,144,154,164,174,184,194],{"type":18,"tag":43,"props":105,"children":106},{},[107,112],{"type":18,"tag":47,"props":108,"children":109},{},[110],{"type":24,"value":111},"初始化",{"type":24,"value":113},"：初始化策略网络（Actor）和价值网络（Critic）的参数，以及它们的学习率。",{"type":18,"tag":43,"props":115,"children":116},{},[117,122],{"type":18,"tag":47,"props":118,"children":119},{},[120],{"type":24,"value":121},"循环每个Episode",{"type":24,"value":123},"：在每个Episode开始时，初始化状态。",{"type":18,"tag":43,"props":125,"children":126},{},[127,132],{"type":18,"tag":47,"props":128,"children":129},{},[130],{"type":24,"value":131},"选择动作",{"type":24,"value":133},"：根据当前策略从Actor中选择动作。",{"type":18,"tag":43,"props":135,"children":136},{},[137,142],{"type":18,"tag":47,"props":138,"children":139},{},[140],{"type":24,"value":141},"执行动作",{"type":24,"value":143},"：在环境中执行动作，并观察奖励和下一个状态。",{"type":18,"tag":43,"props":145,"children":146},{},[147,152],{"type":18,"tag":47,"props":148,"children":149},{},[150],{"type":24,"value":151},"计算状态值",{"type":24,"value":153},"：用Critic评估当前状态和下一个状态的值。",{"type":18,"tag":43,"props":155,"children":156},{},[157,162],{"type":18,"tag":47,"props":158,"children":159},{},[160],{"type":24,"value":161},"计算TD误差",{"type":24,"value":163},"：计算时序差分误差（Temporal Difference Error），它是当前奖励加上下一个状态的折扣值与当前状态值的差。",{"type":18,"tag":43,"props":165,"children":166},{},[167,172],{"type":18,"tag":47,"props":168,"children":169},{},[170],{"type":24,"value":171},"更新Critic",{"type":24,"value":173},"：根据TD误差更新价值网络的参数。",{"type":18,"tag":43,"props":175,"children":176},{},[177,182],{"type":18,"tag":47,"props":178,"children":179},{},[180],{"type":24,"value":181},"计算优势函数",{"type":24,"value":183},"：使用TD误差计算优势函数。",{"type":18,"tag":43,"props":185,"children":186},{},[187,192],{"type":18,"tag":47,"props":188,"children":189},{},[190],{"type":24,"value":191},"更新Actor",{"type":24,"value":193},"：根据优势函数更新策略网络的参数。",{"type":18,"tag":43,"props":195,"children":196},{},[197,202],{"type":18,"tag":47,"props":198,"children":199},{},[200],{"type":24,"value":201},"更新状态",{"type":24,"value":203},"：移动到下一个状态，重复上述步骤，直到Episode结束。",{"type":18,"tag":26,"props":205,"children":206},{},[207],{"type":24,"value":208},"这个伪代码展示了A2C算法的核心步骤，实际实现中可能会有更多细节，如使用折扣因子γ、多个并行环境等。",{"type":18,"tag":26,"props":210,"children":211},{},[212],{"type":24,"value":213},"代码如下：",{"type":18,"tag":86,"props":215,"children":217},{"code":216},"import argparse\n\nfrom mindspore import context\nfrom mindspore import dtype as mstype\nfrom mindspore.communication import init\n\nfrom mindspore_rl.algorithm.a2c import config\nfrom mindspore_rl.algorithm.a2c.a2c_session import A2CSession\nfrom mindspore_rl.algorithm.a2c.a2c_trainer import A2CTrainer\n\nparser = argparse.ArgumentParser(description=\"MindSpore Reinforcement A2C\")\nparser.add_argument(\"--episode\", type=int, default=10000, help=\"total episode numbers.\")\nparser.add_argument(\n    \"--device_target\",\n    type=str,\n    default=\"CPU\",\n    choices=[\"CPU\", \"GPU\", \"Ascend\", \"Auto\"],\n    help=\"Choose a devioptions.device_targece to run the ac example(Default: Auto).\",\n)\nparser.add_argument(\n    \"--precision_mode\",\n    type=str,\n    default=\"fp32\",\n    choices=[\"fp32\", \"fp16\"],\n    help=\"Precision mode\",\n)\nparser.add_argument(\n    \"--env_yaml\",\n    type=str,\n 
   default=\"../env_yaml/CartPole-v0.yaml\",\n    help=\"Choose an environment yaml to update the a2c example(Default: CartPole-v0.yaml).\",\n)\nparser.add_argument(\n    \"--algo_yaml\",\n    type=str,\n    default=None,\n    help=\"Choose an algo yaml to update the a2c example(Default: None).\",\n)\nparser.add_argument(\n    \"--enable_distribute\",\n    type=bool,\n    default=False,\n    help=\"Train in distribute mode (Default: False).\",\n)\nparser.add_argument(\n    \"--worker_num\",\n    type=int,\n    default=2,\n    help=\"Worker num (Default: 2).\",\n)\noptions, _ = parser.parse_known_args()\n",[218],{"type":18,"tag":91,"props":219,"children":220},{"__ignoreMap":7},[221],{"type":24,"value":216},{"type":18,"tag":26,"props":223,"children":224},{},[225],{"type":24,"value":226},"首先初始化参数，然后我这里用cpu运行：options.device_targe = \"CPU\"",{"type":18,"tag":86,"props":228,"children":230},{"code":229},"episode=options.episode\n\"\"\"Train a2c\"\"\"\nif options.device_target != \"Auto\":\n    context.set_context(device_target=options.device_target)\nif context.get_context(\"device_target\") in [\"CPU\", \"GPU\"]:\n    context.set_context(enable_graph_kernel=True)\ncontext.set_context(mode=context.GRAPH_MODE)\ncompute_type = (\n    mstype.float32 if options.precision_mode == \"fp32\" else mstype.float16\n)\nconfig.algorithm_config[\"policy_and_network\"][\"params\"][\n    \"compute_type\"\n] = compute_type\nif compute_type == mstype.float16 and options.device_target != \"Ascend\":\n    raise ValueError(\"Fp16 mode is supported by Ascend backend.\")\nis_distribte = options.enable_distribute\nif is_distribte:\n    init()\n    context.set_context(enable_graph_kernel=False)\n    config.deploy_config[\"worker_num\"] = options.worker_num\na2c_session = A2CSession(options.env_yaml, options.algo_yaml, is_distribte)\n",[231],{"type":18,"tag":91,"props":232,"children":233},{"__ignoreMap":7},[234],{"type":24,"value":229},{"type":18,"tag":26,"props":236,"children":237},{},[238],{"type":24,"value":239},"设置上下文管理器",{"type":18,"tag":86,"props":241,"children":243},{"code":242},"import sys\nimport time\nfrom io import StringIO\n\nclass RealTimeCaptureAndDisplayOutput(object):\n    def __init__(self):\n        self._original_stdout = sys.stdout\n        self._original_stderr = sys.stderr\n        self.captured_output = StringIO()\n\n    def write(self, text):\n        self._original_stdout.write(text)  # 实时打印\n        self.captured_output.write(text)   # 保存到缓冲区\n\n    def flush(self):\n        self._original_stdout.flush()\n        self.captured_output.flush()\n\n    def __enter__(self):\n        sys.stdout = self\n        sys.stderr = self\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        sys.stdout = self._original_stdout\n        sys.stderr = self._original_stderr\n\nepisode=10\n# dqn_session.run(class_type=DQNTrainer, episode=episode)\nwith RealTimeCaptureAndDisplayOutput() as captured_new:\n    a2c_session.run(class_type=A2CTrainer, episode=episode)\n",[244],{"type":18,"tag":91,"props":245,"children":246},{"__ignoreMap":7},[247],{"type":24,"value":242},{"type":18,"tag":86,"props":249,"children":251},{"code":250},"import re\nimport matplotlib.pyplot as plt\n\n# 原始输出\nraw_output = captured_new.captured_output.getvalue()\n\n# 使用正则表达式从输出中提取loss和rewards\nloss_pattern = r\"loss=(\\d+\\.\\d+)\"\nreward_pattern = r\"running_reward=(\\d+\\.\\d+)\"\nloss_values = [float(match.group(1)) for match in re.finditer(loss_pattern, raw_output)]\nreward_values = [float(match.group(1)) for match 
```python
import re
import matplotlib.pyplot as plt

# Raw training output
raw_output = captured_new.captured_output.getvalue()

# Extract loss and reward values from the output with regular expressions
loss_pattern = r"loss=(\d+\.\d+)"
reward_pattern = r"running_reward=(\d+\.\d+)"
loss_values = [float(match.group(1)) for match in re.finditer(loss_pattern, raw_output)]
reward_values = [float(match.group(1)) for match in re.finditer(reward_pattern, raw_output)]

# Plot the loss curve
plt.figure()
plt.plot(loss_values, label='Loss')
plt.xlabel('Episode')
plt.ylabel('Loss')
plt.title('Loss Curve')
plt.legend()
plt.show()

# Plot the reward curve
plt.figure()
plt.plot(reward_values, label='Rewards')
plt.xlabel('Episode')
plt.ylabel('Rewards')
plt.title('Rewards Curve')
plt.legend()
plt.show()
```

The results:

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/14/e9688e9cd887406d953648bd08b31d3c.png)

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/14/f03a7c2427024fb79d8fc4d8a1889c85.png)

Below I explain each part of the MindSpore A2C training configuration in detail:

### Actor configuration

```text
'actor': {
  'number': 1,
  'type': mindspore_rl.algorithm.a2c.a2c.A2CActor,
  'params': {
    'collect_environment': PyFuncWrapper<
      (_envs): GymEnvironment<>
    >,
    'eval_environment': PyFuncWrapper<
      (_envs): GymEnvironment<>
    >,
    'replay_buffer': None,
    'a2c_net': ActorCriticNet<
      (common): Dense
      (actor): Dense
      (critic): Dense
      (relu): LeakyReLU<>
    >},
  'policies': [],
  'networks': ['a2c_net']
}
```

- `number`: the number of Actor instances; 1 here, i.e. a single Actor.
- `type`: the Actor class, here `mindspore_rl.algorithm.a2c.a2c.A2CActor`.
- `params`: the Actor's parameters.
  - `collect_environment` and `eval_environment`: `GymEnvironment` instances wrapped in `PyFuncWrapper`, used for data collection and for evaluation.
  - `replay_buffer`: set to `None`, i.e. no experience replay buffer is used.
  - `a2c_net`: the Actor-Critic network, consisting of a shared layer, an actor head, a critic head, and a Leaky ReLU activation (see the sketch after this list).
- `policies` and `networks`: the policies and networks attached to the Actor, here mainly `a2c_net`.
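The printed structure above suggests a small shared-trunk network. A rough sketch of what it might look like (not MindSpore RL's actual source; the `construct` signature is an assumption based on the dump, with dimensions taken from the configuration sections below):

```python
import mindspore.nn as nn

class ActorCriticNet(nn.Cell):
    """Sketch of a shared-trunk actor-critic net matching the printed layout."""

    def __init__(self, state_dim=4, hidden_size=128, action_dim=2):
        super().__init__()
        self.common = nn.Dense(state_dim, hidden_size)   # shared trunk
        self.actor = nn.Dense(hidden_size, action_dim)   # policy logits head
        self.critic = nn.Dense(hidden_size, 1)           # state-value head
        self.relu = nn.LeakyReLU()

    def construct(self, x):
        h = self.relu(self.common(x))
        return self.actor(h), self.critic(h)
```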
",{"type":18,"tag":91,"props":370,"children":372},{"className":371},[],[373],{"type":24,"value":374},"GymEnvironment",{"type":24,"value":376},"，用于数据收集和评估环境。",{"type":18,"tag":43,"props":378,"children":379},{},[380,386,388,394],{"type":18,"tag":91,"props":381,"children":383},{"className":382},[],[384],{"type":24,"value":385},"replay_buffer",{"type":24,"value":387},": 设置为 ",{"type":18,"tag":91,"props":389,"children":391},{"className":390},[],[392],{"type":24,"value":393},"None",{"type":24,"value":395},"，表示不使用经验回放缓冲区。",{"type":18,"tag":43,"props":397,"children":398},{},[399,405],{"type":18,"tag":91,"props":400,"children":402},{"className":401},[],[403],{"type":24,"value":404},"a2c_net",{"type":24,"value":406},": Actor-Critic 网络，包含一个公共层、一个 Actor 层和一个 Critic 层，以及一个 Leaky ReLU 激活函数。",{"type":18,"tag":43,"props":408,"children":409},{},[410,416,417,423,425,430],{"type":18,"tag":91,"props":411,"children":413},{"className":412},[],[414],{"type":24,"value":415},"policies",{"type":24,"value":352},{"type":18,"tag":91,"props":418,"children":420},{"className":419},[],[421],{"type":24,"value":422},"networks",{"type":24,"value":424},": Actor 关联的策略和网络，这里主要是 ",{"type":18,"tag":91,"props":426,"children":428},{"className":427},[],[429],{"type":24,"value":404},{"type":24,"value":327},{"type":18,"tag":32,"props":432,"children":434},{"id":433},"learner-配置",[435],{"type":24,"value":436},"Learner 配置",{"type":18,"tag":86,"props":438,"children":440},{"code":439},"'learner': {\n  'number': 1,\n  'type': mindspore_rl.algorithm.a2c.a2c.A2CLearner,\n  'params': {\n    'gamma': 0.99,\n    'state_space_dim': 4,\n    'action_space_dim': 2,\n    'a2c_net': ActorCriticNet\u003C\n      (common): Dense\n      (actor): Dense\n      (critic): Dense\n      (relu): LeakyReLU\u003C>\n    >,\n    'a2c_net_train': TrainOneStepCell\u003C\n      (network): Loss\u003C\n        (a2c_net): ActorCriticNet\u003C\n          (common): Dense\n          (actor): Dense\n          (critic): Dense\n          (relu): LeakyReLU\u003C>\n        >\n        (smoothl1_loss): SmoothL1Loss\u003C>\n      >\n      (optimizer): Adam\u003C>\n      (grad_reducer): Identity\u003C>\n    >\n  },\n  'networks': ['a2c_net_train', 'a2c_net']\n}\n",[441],{"type":18,"tag":91,"props":442,"children":443},{"__ignoreMap":7},[444],{"type":24,"value":439},{"type":18,"tag":39,"props":446,"children":447},{},[448,458,475,542],{"type":18,"tag":43,"props":449,"children":450},{},[451,456],{"type":18,"tag":91,"props":452,"children":454},{"className":453},[],[455],{"type":24,"value":306},{"type":24,"value":457},": Learner 的实例数量，这里设置为1，表示使用一个 Learner 实例。",{"type":18,"tag":43,"props":459,"children":460},{},[461,466,468,474],{"type":18,"tag":91,"props":462,"children":464},{"className":463},[],[465],{"type":24,"value":317},{"type":24,"value":467},": Learner 的类型，这里使用 ",{"type":18,"tag":91,"props":469,"children":471},{"className":470},[],[472],{"type":24,"value":473},"mindspore_rl.algorithm.a2c.a2c.A2CLearner",{"type":24,"value":327},{"type":18,"tag":43,"props":476,"children":477},{},[478,483,485],{"type":18,"tag":91,"props":479,"children":481},{"className":480},[],[482],{"type":24,"value":336},{"type":24,"value":484},": Learner 的参数配置。\n",{"type":18,"tag":39,"props":486,"children":487},{},[488,499,510,521,531],{"type":18,"tag":43,"props":489,"children":490},{},[491,497],{"type":18,"tag":91,"props":492,"children":494},{"className":493},[],[495],{"type":24,"value":496},"gamma",{"type":24,"value":498},": 
### Policy and network configuration

```text
'policy_and_network': {
  'type': mindspore_rl.algorithm.a2c.a2c.A2CPolicyAndNetwork,
  'params': {
    'lr': 0.01,
    'state_space_dim': 4,
    'action_space_dim': 2,
    'hidden_size': 128,
    'gamma': 0.99,
    'compute_type': mindspore.float32,
    'environment_config': {
      'id': 'CartPole-v0',
      'entry_point': 'gym.envs.classic_control:CartPoleEnv',
      'reward_threshold': 195.0,
      'nondeterministic': False,
      'max_episode_steps': 200,
      '_kwargs': {},
      '_env_name': 'CartPole'
    }
  }
}
```

- `type`: the policy-and-network class, here `mindspore_rl.algorithm.a2c.a2c.A2CPolicyAndNetwork`.
- `params`: the policy and network parameters.
  - `lr`: the learning rate, 0.01 here.
  - `state_space_dim` and `action_space_dim`: the dimensions of the state and action spaces.
  - `hidden_size`: the hidden layer size, 128 here.
  - `gamma`: the discount factor.
  - `compute_type`: the compute precision, here `mindspore.float32`.
  - `environment_config`: the environment settings, including the environment ID, entry point, reward threshold, maximum episode steps, and so on.
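Since the training script above already patches `config.algorithm_config` in place (for `compute_type`), the hyperparameters listed here can be adjusted the same way before `A2CSession` is constructed, for example (illustrative values):

```python
from mindspore_rl.algorithm.a2c import config

# Illustrative tweaks; apply before constructing A2CSession so they take effect.
config.algorithm_config["policy_and_network"]["params"]["lr"] = 0.001
config.algorithm_config["learner"]["params"]["gamma"] = 0.95
```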
### Collect environment configuration

```text
'collect_environment': {
  'number': 1,
  'type': mindspore_rl.environment.gym_environment.GymEnvironment,
  'wrappers': [mindspore_rl.environment.pyfunc_wrapper.PyFuncWrapper],
  'params': {
    'GymEnvironment': {
      'name': 'CartPole-v0',
      'seed': 42
    },
    'name': 'CartPole-v0'
  }
}
```

- `number`: the number of environment instances, 1 here.
- `type`: the environment class, here `mindspore_rl.environment.gym_environment.GymEnvironment`.
- `wrappers`: the wrappers applied to the environment, here `PyFuncWrapper`.
- `params`: the environment's parameters, including the environment name `CartPole-v0` and the random seed `42`.
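To poke at the environment on its own, the class shown in this dump can be used directly. A minimal sketch, assuming `GymEnvironment` accepts the params dict shown above (return types follow MindSpore RL's environment API and may differ by version):

```python
from mindspore_rl.environment.gym_environment import GymEnvironment

# Construct CartPole-v0 with the same name/seed as the collect environment.
env = GymEnvironment({"name": "CartPole-v0", "seed": 42})
state = env.reset()   # initial observation as a MindSpore tensor
print(state.shape)    # CartPole observations have 4 dimensions
```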
### Eval environment configuration

```text
'eval_environment': {
  'number': 1,
  'type': mindspore_rl.environment.gym_environment.GymEnvironment,
  'wrappers': [mindspore_rl.environment.pyfunc_wrapper.PyFuncWrapper],
  'params': {
    'GymEnvironment': {
      'name': 'CartPole-v0',
      'seed': 42
    },
    'name': 'CartPole-v0'
  }
}
```

- The configuration mirrors `collect_environment` and is used to evaluate the model's performance.

To sum up, these settings define the concrete implementation of the Actor-Critic algorithm in the MindSpore framework: the Actor and Learner setup, the policy and network parameters, and the training and evaluation environments. This is still a fairly basic setup.