[{"data":1,"prerenderedAt":249},["ShallowReactive",2],{"content-query-8sZR4iSbU8":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":243,"_id":244,"_source":245,"_file":246,"_stem":247,"_extension":248},"/technology-blogs/zh/3159","zh",false,"","MindSpore AC模型强化学习","AC算法，也称为Actor-Critic算法，是强化学习中的一种重要方法。","2024-06-04","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/28/616ac975ce554ac2a85f301a12476989.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":240},"root",[17,25,31,177,182,192,197,205,210,218,226],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore-ac模型强化学习",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"AC算法，也称为Actor-Critic算法，是强化学习中的一种重要方法。它结合了策略梯度方法和价值函数方法的优点，主要由两部分组成：演员（Actor）和评论家（Critic）。",{"type":18,"tag":32,"props":33,"children":34},"ol",{},[35,66,93,120,147,152,157,162,167,172],{"type":18,"tag":36,"props":37,"children":38},"li",{},[39,45,47],{"type":18,"tag":40,"props":41,"children":42},"strong",{},[43],{"type":24,"value":44},"演员（Actor）",{"type":24,"value":46},"：\n",{"type":18,"tag":48,"props":49,"children":50},"ul",{},[51,56,61],{"type":18,"tag":36,"props":52,"children":53},{},[54],{"type":24,"value":55},"负责根据当前状态选择动作。",{"type":18,"tag":36,"props":57,"children":58},{},[59],{"type":24,"value":60},"通常采用策略函数 π(a|s) 来表示在给定状态 s 下采取动作 a 
的概率。",{"type":18,"tag":36,"props":62,"children":63},{},[64],{"type":24,"value":65},"目标是学习一种策略，以最大化长期的累积奖励。",{"type":18,"tag":36,"props":67,"children":68},{},[69,74,75],{"type":18,"tag":40,"props":70,"children":71},{},[72],{"type":24,"value":73},"评论家（Critic）",{"type":24,"value":46},{"type":18,"tag":48,"props":76,"children":77},{},[78,83,88],{"type":18,"tag":36,"props":79,"children":80},{},[81],{"type":24,"value":82},"评估演员采取的动作有多好。",{"type":18,"tag":36,"props":84,"children":85},{},[86],{"type":24,"value":87},"使用价值函数 V(s) 或 Q(s, a) 来衡量在状态 s 或在状态 s 下采取动作 a 的预期回报。",{"type":18,"tag":36,"props":89,"children":90},{},[91],{"type":24,"value":92},"目标是准确预测未来的回报，以指导演员的决策。",{"type":18,"tag":36,"props":94,"children":95},{},[96,101,102],{"type":18,"tag":40,"props":97,"children":98},{},[99],{"type":24,"value":100},"训练过程",{"type":24,"value":46},{"type":18,"tag":48,"props":103,"children":104},{},[105,110,115],{"type":18,"tag":36,"props":106,"children":107},{},[108],{"type":24,"value":109},"演员根据当前策略选择动作，环境根据这一动作返回新的状态和奖励。",{"type":18,"tag":36,"props":111,"children":112},{},[113],{"type":24,"value":114},"评论家根据奖励和新状态来评估这一动作的价值，并提供反馈给演员。",{"type":18,"tag":36,"props":116,"children":117},{},[118],{"type":24,"value":119},"演员根据评论家的反馈通过策略梯度方法调整其策略，以提高未来动作的预期回报。",{"type":18,"tag":36,"props":121,"children":122},{},[123,128,129],{"type":18,"tag":40,"props":124,"children":125},{},[126],{"type":24,"value":127},"算法特点",{"type":24,"value":46},{"type":18,"tag":48,"props":130,"children":131},{},[132,137,142],{"type":18,"tag":36,"props":133,"children":134},{},[135],{"type":24,"value":136},"平衡探索与利用：AC 算法通过持续更新策略来平衡探索（探索新动作）和利用（重复已知的好动作）。",{"type":18,"tag":36,"props":138,"children":139},{},[140],{"type":24,"value":141},"减少方差：由于评论家的引导，演员的策略更新更加稳定，减少了策略梯度方法中的方差。",{"type":18,"tag":36,"props":143,"children":144},{},[145],{"type":24,"value":146},"适用性：AC 算法适用于离散和连续动作空间，可以处理复杂的决策问题。 伪代码方面，Actor-Critic算法的一个典型流程包括以下步骤：",{"type":18,"tag":36,"props":148,"children":149},{},[150],{"type":24,"value":151},"使用来自演员（Actor）网络的策略 
πθ 对 {s_t, a_t} 进行采样。",{"type":18,"tag":36,"props":153,"children":154},{},[155],{"type":24,"value":156},"评估优势函数 A_t，也称为TD误差 δt。在Actor-Critic算法中，优势函数是由评论家（Critic）网络产生的。",{"type":18,"tag":36,"props":158,"children":159},{},[160],{"type":24,"value":161},"使用策略梯度表达式 ∇θ log πθ(a_t|s_t)·A_t 评估梯度。",{"type":18,"tag":36,"props":163,"children":164},{},[165],{"type":24,"value":166},"更新策略参数 θ。",{"type":18,"tag":36,"props":168,"children":169},{},[170],{"type":24,"value":171},"更新评论家网络（基于价值的强化学习，如 Q 学习）的权重，其中 δt 等于优势函数。",{"type":18,"tag":36,"props":173,"children":174},{},[175],{"type":24,"value":176},"重复以上步骤，直到找到最佳策略 πθ。 这个算法框架是一个很好的起点，但要应用于实际还需要进一步的发展。主要挑战在于如何有效管理两个神经网络（演员和评论家）的梯度更新，并确保二者在相互依赖的同时保持协调。",{"type":18,"tag":26,"props":178,"children":179},{},[180],{"type":24,"value":181},"导入相关包",{"type":18,"tag":183,"props":184,"children":186},"pre",{"code":185},"import argparse\nfrom mindspore_rl.algorithm.ac.ac_trainer import ACTrainer\nfrom mindspore_rl.algorithm.ac.ac_session import ACSession\nfrom mindspore import context\n\nparser = argparse.ArgumentParser(description='MindSpore Reinforcement AC')\nparser.add_argument('--episode', type=int, default=1000, help='total episode numbers.')\nparser.add_argument('--device_target', type=str, default='Auto', choices=['Ascend', 'CPU', 'GPU', 'Auto'],\n                    help='Choose a device to run the ac example(Default: Auto).')\nparser.add_argument('--env_yaml', type=str, default='../env_yaml/CartPole-v0.yaml',\n                    help='Choose an environment yaml to update the ac example(Default: CartPole-v0.yaml).')\nparser.add_argument('--algo_yaml', type=str, default=None,\n                    help='Choose an algo yaml to update the ac example(Default: None).')\noptions, _ = 
parser.parse_known_args()\n",[187],{"type":18,"tag":188,"props":189,"children":190},"code",{"__ignoreMap":7},[191],{"type":24,"value":185},{"type":18,"tag":26,"props":193,"children":194},{},[195],{"type":24,"value":196},"启动环境",{"type":18,"tag":183,"props":198,"children":200},{"code":199},"episode=options.episode\n\"\"\"start to train ac algorithm\"\"\"\nif options.device_target != 'Auto':\n    context.set_context(device_target=options.device_target)\nif context.get_context('device_target') in ['CPU']:\n    context.set_context(enable_graph_kernel=True)\ncontext.set_context(mode=context.GRAPH_MODE)\nac_session = ACSession(options.env_yaml, options.algo_yaml)\n",[201],{"type":18,"tag":188,"props":202,"children":203},{"__ignoreMap":7},[204],{"type":24,"value":199},{"type":18,"tag":26,"props":206,"children":207},{},[208],{"type":24,"value":209},"上下文管理",{"type":18,"tag":183,"props":211,"children":213},{"code":212},"import sys\nimport time\nfrom io import StringIO\n\nclass RealTimeCaptureAndDisplayOutput(object):\n    def __init__(self):\n        self._original_stdout = sys.stdout\n        self._original_stderr = sys.stderr\n        self.captured_output = StringIO()\n\n    def write(self, text):\n        self._original_stdout.write(text)  # 实时打印\n        self.captured_output.write(text)   # 保存到缓冲区\n\n    def flush(self):\n        self._original_stdout.flush()\n        self.captured_output.flush()\n\n    def __enter__(self):\n        sys.stdout = self\n        sys.stderr = self\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        sys.stdout = self._original_stdout\n        sys.stderr = self._original_stderr\n\nepisode=100\n# dqn_session.run(class_type=DQNTrainer, episode=episode)\nwith RealTimeCaptureAndDisplayOutput() as captured_new:\n    ac_session.run(class_type=ACTrainer, 
episode=episode)\n",[214],{"type":18,"tag":188,"props":215,"children":216},{"__ignoreMap":7},[217],{"type":24,"value":212},{"type":18,"tag":183,"props":219,"children":221},{"code":220},"import re\nimport matplotlib.pyplot as plt\n\n# 原始输出\nraw_output = captured_new.captured_output.getvalue()\n\n# 使用正则表达式从输出中提取loss和rewards\nloss_pattern = r\"loss is (\\d+\\.\\d+)\"\nreward_pattern = r\"rewards is (\\d+\\.\\d+)\"\nloss_values = [float(match.group(1)) for match in re.finditer(loss_pattern, raw_output)]\nreward_values = [float(match.group(1)) for match in re.finditer(reward_pattern, raw_output)]\n\n# 绘制loss曲线\nplt.plot(loss_values, label='Loss')\nplt.xlabel('Episode')\nplt.ylabel('Loss')\nplt.title('Loss Curve')\nplt.legend()\nplt.show()\n\n# 绘制reward曲线\nplt.plot(reward_values, label='Rewards')\nplt.xlabel('Episode')\nplt.ylabel('Rewards')\nplt.title('Rewards Curve')\nplt.legend()\nplt.show()\n",[222],{"type":18,"tag":188,"props":223,"children":224},{"__ignoreMap":7},[225],{"type":24,"value":220},{"type":18,"tag":26,"props":227,"children":228},{},[229,234,236],{"type":18,"tag":230,"props":231,"children":233},"img",{"alt":7,"src":232},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/14/7fd424c8f9a8478fbbf04707ba6db3ce.png",[],{"type":24,"value":235}," ",{"type":18,"tag":230,"props":237,"children":239},{"alt":7,"src":238},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/06/14/dbcf3cf741424cc2ac0aad54beef4d4e.png",[],{"title":7,"searchDepth":241,"depth":241,"links":242},4,[],"markdown","content:technology-blogs:zh:3159.md","content","technology-blogs/zh/3159.md","technology-blogs/zh/3159","md",1776506126785]