[{"data":1,"prerenderedAt":293},["ShallowReactive",2],{"content-query-GcegeHdbio":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":287,"_id":288,"_source":289,"_file":290,"_stem":291,"_extension":292},"/technology-blogs/en/3439","en",false,"","MindSpore AC Model of Reinforcement Learning","The actor-critic algorithm, also referred to as the AC algorithm, is an important method of reinforcement learning.","2024-06-04","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/11/28/b754e755a4c14096a500f4c5e8cfb3e1.png","technology-blogs","Practices",{"type":15,"children":16,"toc":284},"root",[17,25,31,42,47,73,78,88,93,129,134,144,149,154,159,169,174,179,184,202,221,226,237,248,259,269,277],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore-ac-model-of-reinforcement-learning",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"The actor-critic algorithm, also referred to as the AC algorithm, is an important method of reinforcement learning. It combines the advantages of the policy gradient method and the value function method. The AC algorithm mainly consists of two parts: actor and critic.",{"type":18,"tag":26,"props":32,"children":33},{},[34,36],{"type":24,"value":35},"1. 
",{"type":18,"tag":37,"props":38,"children":39},"strong",{},[40],{"type":24,"value":41},"Actor",{"type":18,"tag":26,"props":43,"children":44},{},[45],{"type":24,"value":46},"o An actor selects actions based on its current state.",{"type":18,"tag":26,"props":48,"children":49},{},[50,52,57,59,64,66,71],{"type":24,"value":51},"o Generally, a policy function ",{"type":18,"tag":37,"props":53,"children":54},{},[55],{"type":24,"value":56},"π(a|s)",{"type":24,"value":58}," is used to represent a probability of taking action ",{"type":18,"tag":37,"props":60,"children":61},{},[62],{"type":24,"value":63},"a",{"type":24,"value":65}," in a given state ",{"type":18,"tag":37,"props":67,"children":68},{},[69],{"type":24,"value":70},"s",{"type":24,"value":72},".",{"type":18,"tag":26,"props":74,"children":75},{},[76],{"type":24,"value":77},"o The actor's goal is to learn a strategy to maximize long-term cumulative rewards.",{"type":18,"tag":26,"props":79,"children":80},{},[81,83],{"type":24,"value":82},"2. 
",{"type":18,"tag":37,"props":84,"children":85},{},[86],{"type":24,"value":87},"Critic",{"type":18,"tag":26,"props":89,"children":90},{},[91],{"type":24,"value":92},"o A critic assesses how well the actor has done.",{"type":18,"tag":26,"props":94,"children":95},{},[96,98,103,105,110,112,116,118,122,124,128],{"type":24,"value":97},"o The value function ",{"type":18,"tag":37,"props":99,"children":100},{},[101],{"type":24,"value":102},"V(s)",{"type":24,"value":104}," or ",{"type":18,"tag":37,"props":106,"children":107},{},[108],{"type":24,"value":109},"Q(s, a)",{"type":24,"value":111}," measures the expected rewards of state ",{"type":18,"tag":37,"props":113,"children":114},{},[115],{"type":24,"value":70},{"type":24,"value":117}," or of taking action ",{"type":18,"tag":37,"props":119,"children":120},{},[121],{"type":24,"value":63},{"type":24,"value":123}," in state ",{"type":18,"tag":37,"props":125,"children":126},{},[127],{"type":24,"value":70},{"type":24,"value":72},{"type":18,"tag":26,"props":130,"children":131},{},[132],{"type":24,"value":133},"o The critic's goal is to accurately predict future rewards to guide the actor's decision-making.",{"type":18,"tag":26,"props":135,"children":136},{},[137,139],{"type":24,"value":138},"3. 
",{"type":18,"tag":37,"props":140,"children":141},{},[142],{"type":24,"value":143},"Training process",{"type":18,"tag":26,"props":145,"children":146},{},[147],{"type":24,"value":148},"o The actor selects an action based on the current policy, and the environment returns a new state and reward based on the action.",{"type":18,"tag":26,"props":150,"children":151},{},[152],{"type":24,"value":153},"o The critic evaluates the value of the action based on the reward and new state, and provides feedback to the actor.",{"type":18,"tag":26,"props":155,"children":156},{},[157],{"type":24,"value":158},"o The actor adjusts its policy through a policy gradient approach based on the feedback from the critic to improve the expected reward of its future actions.",{"type":18,"tag":26,"props":160,"children":161},{},[162,164],{"type":24,"value":163},"4. ",{"type":18,"tag":37,"props":165,"children":166},{},[167],{"type":24,"value":168},"Algorithm features",{"type":18,"tag":26,"props":170,"children":171},{},[172],{"type":24,"value":173},"o Balanced exploration and utilization: The AC algorithm balances exploration (exploring new actions) and utilization (repeating known good actions) by continuously updating policies.",{"type":18,"tag":26,"props":175,"children":176},{},[177],{"type":24,"value":178},"o Reduced variance: Due to the critic's guidance, the actor's policy update is more stable, reducing the variance in the policy gradient method.",{"type":18,"tag":26,"props":180,"children":181},{},[182],{"type":24,"value":183},"o Applicability: The AC algorithm is applicable to discrete and continuous action spaces and can handle complex decision-making problems. In terms of pseudocode, a typical procedure of the AC algorithm includes the following steps:",{"type":18,"tag":26,"props":185,"children":186},{},[187,189,194,196,201],{"type":24,"value":188},"5. 
Use the policy ",{"type":18,"tag":37,"props":190,"children":191},{},[192],{"type":24,"value":193},"πθ",{"type":24,"value":195}," from the actor network to sample ",{"type":18,"tag":37,"props":197,"children":198},{},[199],{"type":24,"value":200},"{s_t, a_t}",{"type":24,"value":72},{"type":18,"tag":26,"props":203,"children":204},{},[205,207,212,214,219],{"type":24,"value":206},"6. Evaluate the advantage function ",{"type":18,"tag":37,"props":208,"children":209},{},[210],{"type":24,"value":211},"A_t",{"type":24,"value":213},", which is also referred to as a TD error ",{"type":18,"tag":37,"props":215,"children":216},{},[217],{"type":24,"value":218},"δt",{"type":24,"value":220},". In the AC algorithm, the advantage function is generated by the critic network.",{"type":18,"tag":26,"props":222,"children":223},{},[224],{"type":24,"value":225},"7. Evaluate gradients using specific expressions.",{"type":18,"tag":26,"props":227,"children":228},{},[229,231,236],{"type":24,"value":230},"8. Update the policy parameter ",{"type":18,"tag":37,"props":232,"children":233},{},[234],{"type":24,"value":235},"θ",{"type":24,"value":72},{"type":18,"tag":26,"props":238,"children":239},{},[240,242,246],{"type":24,"value":241},"9. Update the weights of the critic network using value-based RL (Q-learning). ",{"type":18,"tag":37,"props":243,"children":244},{},[245],{"type":24,"value":218},{"type":24,"value":247}," is equal to the advantage function.",{"type":18,"tag":26,"props":249,"children":250},{},[251,253,257],{"type":24,"value":252},"10. Repeat the preceding steps until the optimal policy ",{"type":18,"tag":37,"props":254,"children":255},{},[256],{"type":24,"value":193},{"type":24,"value":258}," is found. The AC algorithm framework is a good starting point, but its real-world application requires further development. 
The main challenge is how to effectively manage the gradient updates of two neural networks (actor and critic) and ensure that they are interdependent and coordinated.",{"type":18,"tag":260,"props":261,"children":263},"pre",{"code":262},"Importing packages\nimport argparse\nfrom mindspore_rl.algorithm.ac.ac_trainer import ACTrainer\nfrom mindspore_rl.algorithm.ac.ac_session import ACSession\nfrom mindspore import context\n\nparser = argparse.ArgumentParser(description='MindSpore Reinforcement AC')\nparser.add_argument('--episode', type=int, default=1000, help='total episode numbers.')\nparser.add_argument('--device_target', type=str, default='Auto', choices=['Ascend', 'CPU', 'GPU', 'Auto'],\n                    help='Choose a device to run the ac example(Default: Auto).')\nparser.add_argument('--env_yaml', type=str, default='../env_yaml/CartPole-v0.yaml',\n                    help='Choose an environment yaml to update the ac example(Default: CartPole-v0.yaml).')\nparser.add_argument('--algo_yaml', type=str, default=None,\n                    help='Choose an algo yaml to update the ac example(Default: None).')\noptions, _ = parser.parse_known_args()\nStarting the environment\nepisode=options.episode\n\"\"\"start to train ac algorithm\"\"\"\nif options.device_target != 'Auto':\n    context.set_context(device_target=options.device_target)\nif context.get_context('device_target') in ['CPU']:\n    context.set_context(enable_graph_kernel=True)\ncontext.set_context(mode=context.GRAPH_MODE)\nac_session = ACSession(options.env_yaml, options.algo_yaml)\nManaging the context\nimport sys\nimport time\nfrom io import StringIO\n\nclass RealTimeCaptureAndDisplayOutput(object):\n    def __init__(self):\n        self._original_stdout = sys.stdout\n        self._original_stderr = sys.stderr\n        self.captured_output = StringIO()\n\n    def write(self, text):\n        self._original_stdout.write(text)  # Print in real time.\n        self.captured_output.write(text)   # Save to 
the buffer.\n\n    def flush(self):\n        self._original_stdout.flush()\n        self.captured_output.flush()\n\n    def __enter__(self):\n        sys.stdout = self\n        sys.stderr = self\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        sys.stdout = self._original_stdout\n        sys.stderr = self._original_stderr\n\nepisode=100\n# dqn_session.run(class_type=DQNTrainer, episode=episode)\nwith RealTimeCaptureAndDisplayOutput() as captured_new:\n    ac_session.run(class_type=ACTrainer, episode=episode)\nimport re\nimport matplotlib.pyplot as plt\n\n# Original output\nraw_output = captured_new.captured_output.getvalue()\n\n# Use the regular expression to extract loss and rewards from the output.\nloss_pattern = r\"loss is (\\d+\\.\\d+)\"\nreward_pattern = r\"rewards is (\\d+\\.\\d+)\"\nloss_values = [float(match.group(1)) for match in re.finditer(loss_pattern, raw_output)]\nreward_values = [float(match.group(1)) for match in re.finditer(reward_pattern, raw_output)]\n\n# Draw the loss curve.\nplt.plot(loss_values, label='Loss')\nplt.xlabel('Episode')\nplt.ylabel('Loss')\nplt.title('Loss Curve')\nplt.legend()\nplt.show()\n\n# Draw the rewards curve.\nplt.plot(reward_values, label='Rewards')\nplt.xlabel('Episode')\nplt.ylabel('Rewards')\nplt.title('Rewards 
Curve')\nplt.legend()\nplt.show()\n",[264],{"type":18,"tag":265,"props":266,"children":267},"code",{"__ignoreMap":7},[268],{"type":24,"value":262},{"type":18,"tag":26,"props":270,"children":271},{},[272],{"type":18,"tag":273,"props":274,"children":276},"img",{"alt":7,"src":275},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/10/25/459a02aa41fe44bba1fa28a4cef2fdca.png",[],{"type":18,"tag":26,"props":278,"children":279},{},[280],{"type":18,"tag":273,"props":281,"children":283},{"alt":7,"src":282},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2024/10/25/275ca357febc4aa685b7b691d365b0d3.png",[],{"title":7,"searchDepth":285,"depth":285,"links":286},4,[],"markdown","content:technology-blogs:en:3439.md","content","technology-blogs/en/3439.md","technology-blogs/en/3439","md",1776506111319]