Source code for alf.environments.suite_tic_tac_toe

# Copyright (c) 2020 Horizon Robotics and ALF Contributors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

import alf
from alf.data_structures import TimeStep, StepType

from .alf_environment import AlfEnvironment


class TicTacToeEnvironment(AlfEnvironment):
    """A simple batched 3x3 board game.

    Two players, X and O, take turns marking the spaces of a 3x3 grid. The
    player who succeeds in placing three of their marks in a horizontal,
    vertical, or diagonal line is the winner.

    The reward is +1 if player 0 wins, -1 if player 1 wins and 0 for a draw.
    An invalid move (choosing an occupied cell) ends the game and gives the
    win to the opponent.

    Encoding used by this class:

    * An action ``a`` in ``[0, 8]`` addresses the cell at column ``a % 3`` and
      row ``a // 3``.
    * A board cell holds ``0`` when empty, ``-1`` for player 0 (rendered as
      ``x``) and ``1`` for player 1 (rendered as ``o``).
    * Player 0 moves whenever the number of marks on the board is even.
    """

    def __init__(self, batch_size):
        """
        Args:
            batch_size (int): the number of games simulated in parallel.
        """
        # NOTE(review): the initializer of ``AlfEnvironment`` is not invoked
        # here (it was not invoked originally either) -- confirm the base
        # class needs no initialization.
        self._batch_size = batch_size
        self._observation_spec = alf.TensorSpec((3, 3))
        self._action_spec = alf.BoundedTensorSpec(
            (), minimum=0, maximum=8, dtype=torch.int64)
        # Coordinates of the 8 winning lines, each of shape [1, 8, 3]:
        # 3 columns (x fixed), 3 rows (y fixed), then the two diagonals.
        self._line_x = torch.tensor(
            [[0, 0, 0], [1, 1, 1], [2, 2, 2],
             [0, 1, 2], [0, 1, 2], [0, 1, 2],
             [0, 1, 2], [0, 1, 2]]).unsqueeze(0)
        self._line_y = torch.tensor(
            [[0, 1, 2], [0, 1, 2], [0, 1, 2],
             [0, 0, 0], [1, 1, 1], [2, 2, 2],
             [0, 1, 2], [2, 1, 0]]).unsqueeze(0)
        # Batch index used for advanced indexing into ``self._boards``.
        self._B = torch.arange(self._batch_size)
        self._empty_board = self._observation_spec.zeros()
        self._boards = self._observation_spec.zeros((self._batch_size, ))
        self._env_ids = torch.arange(batch_size)
        self._player_0 = torch.tensor(-1.)
        self._player_1 = torch.tensor(1.)
        # Create the per-game state here as well (``_reset`` re-creates it) so
        # that ``render()`` / ``_step()`` on a freshly constructed environment
        # do not fail with ``AttributeError``.
        self._game_over = torch.zeros((self._batch_size, ), dtype=torch.bool)
        self._prev_action = self._action_spec.zeros((self._batch_size, ))

    @property
    def batched(self):
        """Always ``True``: this environment steps a whole batch of games."""
        return True

    @property
    def batch_size(self):
        """The number of games simulated in parallel."""
        return self._batch_size

    def env_info_spec(self):
        """Return the specs of the ``env_info`` entries of each ``TimeStep``."""
        return {
            "play0_win": alf.TensorSpec(()),
            "play1_win": alf.TensorSpec(()),
            "draw": alf.TensorSpec(()),
            "invalid_move": alf.TensorSpec(()),
        }

    def observation_spec(self):
        """Return the spec of one observation (a single 3x3 board)."""
        return self._observation_spec

    def observation_desc(self):
        """Return the (empty) textual description of the observation."""
        return ""

    def action_spec(self):
        """Return the spec of one action (an int64 scalar in ``[0, 8]``)."""
        return self._action_spec

    def _reset(self):
        """Clear every board and return the initial ``TimeStep``."""
        self._boards = self._observation_spec.zeros((self._batch_size, ))
        self._game_over = torch.zeros((self._batch_size, ), dtype=torch.bool)
        self._prev_action = self._action_spec.zeros((self._batch_size, ))
        return TimeStep(
            observation=self._boards.clone().detach(),
            step_type=torch.full((self._batch_size, ), StepType.FIRST),
            reward=torch.zeros((self._batch_size, )),
            discount=torch.ones((self._batch_size, )),
            prev_action=self._action_spec.zeros((self._batch_size, )),
            env_id=self._env_ids,
            env_info={
                "play0_win": torch.zeros(self._batch_size),
                "play1_win": torch.zeros(self._batch_size),
                "draw": torch.zeros(self._batch_size),
                "invalid_move": torch.zeros(self._batch_size),
            })

    def _step(self, action):
        """Apply one move per game and return the resulting ``TimeStep``.

        Games that ended on the previous step are restarted instead of
        played: their board is cleared, their step type is ``FIRST``, their
        reported ``prev_action`` is 0 and the given action leaves no mark.

        Args:
            action (Tensor): int64 tensor of shape ``[batch_size]`` with the
                cell chosen in each game.
        Returns:
            TimeStep: the batched result of the move.
        """
        prev_game_over = self._game_over
        prev_action = action.clone()
        prev_action[prev_game_over] = 0
        # Finished games start over from an empty board.
        self._boards[prev_game_over] = self._empty_board
        step_type = torch.full((self._batch_size, ), int(StepType.MID))

        player = self._get_current_player().to(torch.float32)
        x = action % 3
        y = action // 3
        # A move is valid only if the addressed cell is still empty; invalid
        # moves leave the board untouched.
        valid = self._boards[self._B, y, x] == 0
        self._boards[self._B[valid], y[valid], x[valid]] = player[valid]

        won = self._check_player_win(player)
        # Player 0 carries mark -1 and earns +1, hence the sign flip.
        reward = torch.where(won, -player, torch.tensor(0.))
        # An invalid move hands the win to the opponent.
        reward = torch.where(valid, reward, player)

        game_over = self._check_game_over() | ~valid
        step_type[game_over] = int(StepType.LAST)
        step_type[prev_game_over] = int(StepType.FIRST)
        discount = torch.ones(self._batch_size)
        discount[game_over] = 0.

        # Restarted games must present an empty board, so wipe the mark that
        # the move above placed on them.
        self._boards[prev_game_over] = self._empty_board
        self._game_over = game_over
        self._prev_action = action

        player0_win = self._check_player_win(self._player_0)
        player1_win = self._check_player_win(self._player_1)
        draw = game_over & (reward == 0)
        return TimeStep(
            observation=self._boards.clone().detach(),
            reward=reward.detach(),
            step_type=step_type.detach(),
            discount=discount.detach(),
            prev_action=prev_action.detach(),
            env_id=self._env_ids,
            env_info={
                "play0_win": player0_win.to(torch.float32),
                "play1_win": player1_win.to(torch.float32),
                "draw": draw.to(torch.float32),
                "invalid_move": (~valid).to(torch.float32),
            })

    def _check_player_win(self, player):
        """Tell, for each game, whether ``player`` owns a complete line.

        Args:
            player (Tensor): the mark to look for; either a scalar or a
                tensor of shape ``[batch_size]`` (one mark per game).
        Returns:
            Tensor: bool tensor of shape ``[batch_size]``.
        """
        B = self._B.unsqueeze(-1).unsqueeze(-1)
        player = player.unsqueeze(-1).unsqueeze(-1)
        # [batch_size, 8, 3]: the three cells of each of the 8 lines.
        lines = self._boards[B, self._line_y, self._line_x]
        return ((lines == player).sum(dim=2) == 3).any(dim=1)

    def _check_game_over(self):
        """Tell, for each game, whether the board is full or somebody won.

        Returns:
            Tensor: bool tensor of shape ``[batch_size]``.
        """
        board_full = (self._boards == 0).sum(dim=(1, 2)) == 0
        player0_won = self._check_player_win(self._player_0)
        player1_won = self._check_player_win(self._player_1)
        return board_full | player0_won | player1_won

    def _get_current_player(self):
        """Return the mark of the player to move in each game.

        Returns:
            Tensor: -1 where the number of marks on the board is even
            (player 0 to move), 1 where it is odd (player 1 to move).
        """
        return ((self._boards != 0).sum(dim=(1, 2)) % 2) * 2 - 1

    def render(self, mode):
        """Print the board of the first game of the batch.

        The cell addressed by the most recent action is shown in upper case.

        Args:
            mode (str): only ``'human'`` is supported.
        Raises:
            ValueError: if ``mode`` is not ``'human'``.
        """
        if mode != 'human':
            raise ValueError("Unsupported render mode %s" % mode)

        ay, ax = divmod(int(self._prev_action[0]), 3)
        board = self._boards[0].tolist()
        symbols = {0: ' ', -1: 'x', 1: 'o'}
        rows = ['-----']
        for y in range(3):
            cells = []
            for x in range(3):
                mark = symbols.get(board[y][x], '')
                if x == ax and y == ay:
                    mark = mark.upper()
                cells.append(mark)
            rows.append('|' + ''.join(cells) + '|')
        rows.append('-----')
        print('\n'.join(rows) + '\n')
@alf.configurable(whitelist=[])
def load(name='', batch_size=1):
    """Create a batched tic-tac-toe environment.

    Args:
        name (str): not used
        batch_size (int): the number of games in the simulation.
    Returns:
        TicTacToeEnvironment: an environment running ``batch_size`` games.
    """
    environment = TicTacToeEnvironment(batch_size)
    return environment
# environments.utils.create_environment() check this flag to see if load() # has direct support for batched environment or not. load.batched = True