Source code for mindspore_rl.environment.tic_tac_toe_environment

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tic-Tac-Toe game"""
#pylint: disable=C0325
import numpy as np

import mindspore as ms
from mindspore.ops import operations as P
from mindspore_rl.environment import Environment
from mindspore_rl.environment import Space


class TicTacToeEnvironment(Environment):
    """
    Tic-Tac-Toe is a famous paper-and-pencil game (https://en.wikipedia.org/wiki/Tic-tac-toe).
    Two players take turns drawing Os or Xs in a three-by-three grid. The player who first places
    three of their marks in a horizontal, vertical or diagonal row wins the game. The following
    figure shows an example of a Tic-Tac-Toe board.

    +---+---+---+
    | o |   | x |
    +---+---+---+
    | x | o |   |
    +---+---+---+
    |   | x | o |
    +---+---+---+

    Args:
        params (dict): A dictionary that contains all the parameters used in this class.
        env_id (int, optional): An integer used to set the seed of this environment. The default
            value refers to the 0th environment. Default: ``0`` .

    Supported Platforms:
        ``Ascend`` ``GPU`` ``CPU``

    Examples:
        >>> from mindspore_rl.environment import TicTacToeEnvironment
        >>> env_params = {}
        >>> environment = TicTacToeEnvironment(env_params, 0)
        >>> print(environment)
        TicTacToeEnvironment<>
    """

    def __init__(self, params, env_id=0):
        super().__init__()
        self._board = np.zeros((3, 3), np.float32)
        self._current_player_var = 0
        self._total_num_player = 2.0
        self._avail_action = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8], np.int32)
        self._max_utility = 1.0
        self.params = params
        self.env_id = env_id
        self._player_one_win = np.array([1.0, -1.0], np.float32)
        self._player_two_win = np.array([-1.0, 1.0], np.float32)
        self._draw_or_no_result = np.array([0, 0], np.float32)
        self._done = np.array([False], np.bool_)

        self._observation_space = Space((3, 3), np.float32, low=-1, high=2)
        self._action_space = Space((1,), np.int32, low=0, high=9)
        self._reward_space = Space((2,), np.float32, low=-1, high=2)
        self._done_space = Space((1,), np.bool_)

        self._reset_ops = P.PyFunc(self._reset, (), (),
                                   (self._observation_space.ms_dtype,),
                                   (self._observation_space.shape,))
        step_out_dtype = (self._observation_space.ms_dtype,
                          self._reward_space.ms_dtype,
                          self._done_space.ms_dtype)
        step_out_shape = (self._observation_space.shape,
                          self._reward_space.shape,
                          self._done_space.shape)
        self._step_ops = P.PyFunc(self._step,
                                  (self._action_space.ms_dtype,),
                                  (self._action_space.shape,),
                                  step_out_dtype, step_out_shape)
        self._save_ops = P.PyFunc(self._save, (), (),
                                  (self._observation_space.ms_dtype,),
                                  (self._observation_space.shape,))
        self._load_ops = P.PyFunc(self._load,
                                  (self._observation_space.ms_dtype,),
                                  (self._observation_space.shape,),
                                  step_out_dtype, step_out_shape)
        self._legal_action_ops = P.PyFunc(self._legal_action, (), (), (ms.int32,), ((9,),))
        self._current_player_ops = P.PyFunc(self._current_player, (), (), (ms.int32,), ((1,),))
        self._is_terminal_ops = P.PyFunc(self._is_terminal, (), (), (ms.bool_,), ((1,),))
        self._reward_ops = P.PyFunc(self._rewards, (), (), (ms.float32,), ((2,),))

    @property
    def action_space(self):
        """
        Get the action space of the environment.

        Returns:
            The action space of the environment.
        """
        return self._action_space

    @property
    def config(self):
        """
        Get the config of the environment.

        Returns:
            A dictionary which contains the environment's information.
        """
        return {}

    @property
    def done_space(self):
        """
        Get the done space of the environment.

        Returns:
            The done space of the environment.
        """
        return self._done_space

    @property
    def observation_space(self):
        """
        Get the state space of the environment.

        Returns:
            The state space of the environment.
        """
        return self._observation_space

    @property
    def reward_space(self):
        """
        Get the reward space of the environment.

        Returns:
            The reward space of the environment.
        """
        return self._reward_space
    def reset(self):
        """
        Reset the environment to the initial state. It is always used at the beginning of each
        episode. It returns the value of the initial state.

        Returns:
            A Tensor which represents the initial state.
        """
        return self._reset_ops()[0]
    def step(self, action):
        r"""
        Execute one environment step, i.e. interact with the environment once.

        Args:
            action (Tensor): A tensor that contains the action information.

        Returns:
            - state (Tensor), the environment state after performing the action.
            - reward (Tensor), the reward after performing the action.
            - done (Tensor), whether the simulation finishes or not.
        """
        return self._step_ops(action)
    def save(self):
        """
        Return a replica of the environment. Tic-Tac-Toe does not need a replica,
        so it returns the current state.

        Returns:
            A tensor which represents the current state.
        """
        return self._save_ops()[0]
    def load(self, state):
        """
        Load the input state. It updates the legal action, current state and done info of the
        game to match the input checkpoint.

        Args:
            state (Tensor): The input checkpoint state.

        Returns:
            - state (Tensor), the state of the checkpoint.
            - reward (Tensor), the reward of the checkpoint.
            - done (Tensor), whether the checkpoint is terminal.
        """
        return self._load_ops(state)
    def calculate_rewards(self):
        """
        Return the rewards of the current state.

        Returns:
            A tensor which represents the rewards of the current state.
        """
        return self._reward_ops()[0]
    def legal_action(self):
        """
        Return the legal actions of the current state.

        Returns:
            A tensor which represents the legal actions.
        """
        return self._legal_action_ops()[0]
    def max_utility(self):
        """
        Return the max utility of Tic-Tac-Toe.

        Returns:
            The max utility of Tic-Tac-Toe.
        """
        return self._max_utility
    def total_num_player(self):
        """
        Return the total number of players.

        Returns:
            The total number of players.
        """
        return self._total_num_player
    def current_player(self):
        """
        Return the current player of the current state.

        Returns:
            A tensor which represents the current player.
        """
        return self._current_player_ops()[0][0]
    def is_terminal(self):
        """
        Return whether the current state is terminal.

        Returns:
            Whether the current state is terminal or not.
        """
        return self._is_terminal_ops()[0]
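    # The private helpers below are plain NumPy callbacks; they are wrapped by the
    # P.PyFunc primitives created in __init__ so that the public methods above can
    # be used as MindSpore operators.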
    def _reset(self):
        """private reset function"""
        self._board = np.zeros_like(self._board)
        return self._board

    def _step(self, action):
        """private step function"""
        action = action[0]
        if action not in self._avail_action or action == -1:
            raise ValueError("action {} is not available, please check the input "
                             "of step function".format(action))
        self._avail_action[action] = -1
        row, column = np.divmod(action, 3)
        if self._current_player_var == 0:
            self._board[row][column] = 1
        else:
            self._board[row][column] = -1
        self._current_player_var = 1 - self._current_player_var
        reward = self._rewards()
        if reward[0] != 0 or self._avail_action.sum() == -9:
            self._done = np.array([True], np.bool_)
        return self._board, reward, self._done

    def _save(self):
        """private save function"""
        return self._board

    def _load(self, state):
        """private load function"""
        self._board = state
        new_avail = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8], np.int32)
        for row in range(3):
            for column in range(3):
                if state[row][column] != 0:
                    new_avail[row * 3 + column] = -1
        out_reward = self._rewards()
        if (new_avail.sum() == -9) or (out_reward == self._player_one_win).all() \
                or (out_reward == self._player_two_win).all():
            self._done = np.array([True], np.bool_)
        else:
            self._done = np.array([False], np.bool_)
        if state.sum() == 0:
            self._current_player_var = 0
        else:
            self._current_player_var = 1
        self._avail_action = new_avail
        return self._board, out_reward, self._done

    def _legal_action(self):
        """private legal action function"""
        return self._avail_action

    def _current_player(self):
        """private current player function"""
        return np.array([self._current_player_var], np.int32)

    def _is_terminal(self):
        """private is terminal function"""
        return self._done

    def _rewards(self):
        """private rewards function"""
        if (self._board[0].sum() == 3) or (self._board[1].sum() == 3) or (self._board[2].sum() == 3):
            return self._player_one_win
        if (self._board[0].sum() == -3) or (self._board[1].sum() == -3) or (self._board[2].sum() == -3):
            return self._player_two_win
        for column in range(3):
            if (self._board[0][column] + self._board[1][column] + self._board[2][column] == 3):
                return self._player_one_win
            if (self._board[0][column] + self._board[1][column] + self._board[2][column] == -3):
                return self._player_two_win
        cross_one = self._board[0][0] + self._board[1][1] + self._board[2][2]
        cross_two = self._board[0][2] + self._board[1][1] + self._board[2][0]
        if cross_one == 3 or cross_two == 3:
            return self._player_one_win
        if cross_one == -3 or cross_two == -3:
            return self._player_two_win
        return self._draw_or_no_result
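
# A minimal random self-play sketch, not part of the original module, assuming PyNative
# execution so the PyFunc-backed methods can be called eagerly. It only uses the public
# API documented above: reset(), legal_action(), step() and is_terminal().
if __name__ == "__main__":
    env = TicTacToeEnvironment({}, 0)
    state = env.reset()
    while not env.is_terminal().asnumpy()[0]:
        # legal_action() returns a length-9 vector in which already-used cells are marked -1.
        legal = env.legal_action().asnumpy()
        choice = np.random.choice(legal[legal != -1])
        state, reward, done = env.step(ms.Tensor(np.array([choice], np.int32)))
    # reward is [1, -1] if player one wins, [-1, 1] if player two wins and [0, 0] for a draw.
    print(state, reward)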