# Copyright (c) 2020 Horizon Robotics and ALF Contributors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import alf
from alf.data_structures import TimeStep, StepType
from .alf_environment import AlfEnvironment
class TicTacToeEnvironment(AlfEnvironment):
    """A simple batched 3x3 board game.

    For two players, X and O, who take turns marking the spaces in a 3×3 grid.
    The player who succeeds in placing three of their marks in a horizontal,
    vertical, or diagonal line is the winner.

    The reward is +1 if player 0 wins, -1 if player 1 wins and 0 for a draw.
    An invalid move (marking an occupied cell) ends the game and gives the
    reward for the opponent.

    Conventions used by the implementation:

    * A board cell is 0 when empty, -1 when marked by player 0 ('x') and
      1 when marked by player 1 ('o'). Player 0 moves first.
    * Action ``a`` in ``[0, 8]`` marks row ``a // 3``, column ``a % 3``.
    * The step following the end of a game ignores its action and returns a
      ``StepType.FIRST`` time step with an empty board.
    """

    def __init__(self, batch_size):
        """
        Args:
            batch_size (int): the number of games simulated in parallel.
        """
        # NOTE(review): assumes ``AlfEnvironment.__init__`` takes no
        # arguments -- confirm against the base class.
        super().__init__()
        self._batch_size = batch_size
        self._observation_spec = alf.TensorSpec((3, 3))
        self._action_spec = alf.BoundedTensorSpec(
            (), minimum=0, maximum=8, dtype=torch.int64)

        # Column/row indices of the 8 winning lines (3 columns, 3 rows and
        # 2 diagonals). Shape [1, 8, 3] so that they broadcast over the batch.
        self._line_x = torch.tensor([
            [0, 0, 0], [1, 1, 1], [2, 2, 2],
            [0, 1, 2], [0, 1, 2], [0, 1, 2],
            [0, 1, 2], [0, 1, 2],
        ]).unsqueeze(0)
        self._line_y = torch.tensor([
            [0, 1, 2], [0, 1, 2], [0, 1, 2],
            [0, 0, 0], [1, 1, 1], [2, 2, 2],
            [0, 1, 2], [2, 1, 0],
        ]).unsqueeze(0)

        self._B = torch.arange(self._batch_size)
        self._empty_board = self._observation_spec.zeros()
        self._boards = self._observation_spec.zeros((self._batch_size, ))
        self._env_ids = torch.arange(batch_size)
        self._player_0 = torch.tensor(-1.)
        self._player_1 = torch.tensor(1.)

        # Initialize the per-game state here as well (not only in _reset())
        # so that render() and _step() do not fail with AttributeError when
        # they are called before the first reset.
        self._game_over = torch.zeros((self._batch_size, ), dtype=torch.bool)
        self._prev_action = self._action_spec.zeros((self._batch_size, ))

    @property
    def batched(self):
        """bool: always True, this environment simulates a batch of games."""
        return True

    @property
    def batch_size(self):
        """int: the number of games simulated in parallel."""
        return self._batch_size

    def env_info_spec(self):
        """Return the specs of the ``env_info`` entries of each time step."""
        return {
            "play0_win": alf.TensorSpec(()),
            "play1_win": alf.TensorSpec(()),
            "draw": alf.TensorSpec(()),
            "invalid_move": alf.TensorSpec(()),
        }

    def observation_spec(self):
        """Return the spec of the observation: a 3x3 board."""
        return self._observation_spec

    def observation_desc(self):
        """Return the description of the observation (an empty string)."""
        return ""

    def action_spec(self):
        """Return the spec of the action: an int64 scalar in [0, 8]."""
        return self._action_spec

    def _reset(self):
        """Clear all the boards and return the first time step.

        Returns:
            TimeStep: ``StepType.FIRST`` steps with empty boards, zero reward,
            discount 1 and all-zero ``env_info``.
        """
        batch_shape = (self._batch_size, )
        self._boards = self._observation_spec.zeros(batch_shape)
        self._game_over = torch.zeros(batch_shape, dtype=torch.bool)
        self._prev_action = self._action_spec.zeros(batch_shape)
        return TimeStep(
            observation=self._boards.clone().detach(),
            # int() keeps the fill value handling identical to _step().
            step_type=torch.full(batch_shape, int(StepType.FIRST)),
            reward=torch.zeros(batch_shape),
            discount=torch.ones(batch_shape),
            prev_action=self._action_spec.zeros(batch_shape),
            env_id=self._env_ids,
            env_info={
                "play0_win": torch.zeros(self._batch_size),
                "play1_win": torch.zeros(self._batch_size),
                "draw": torch.zeros(self._batch_size),
                "invalid_move": torch.zeros(self._batch_size),
            })

    def _step(self, action):
        """Apply one move to every game of the batch.

        Args:
            action (Tensor): int64 tensor of shape ``[batch_size]`` with
                values in ``[0, 8]``; the cell to be marked by the current
                player of each game.

        Returns:
            TimeStep: the result of the moves. For the games which ended at
            the previous step, the action is ignored and a ``StepType.FIRST``
            step with an empty board is returned.
        """
        prev_game_over = self._game_over

        # The action given to an already finished game is ignored, so it is
        # reported as 0 in ``prev_action``.
        prev_action = action.clone()
        prev_action[prev_game_over] = 0

        # Start from an empty board where the previous game has ended, so that
        # the (ignored) move below is always valid and never ends the game.
        self._boards[prev_game_over] = self._empty_board

        step_type = torch.full((self._batch_size, ), int(StepType.MID))
        player = self._get_current_player().to(torch.float32)
        x = action % 3
        y = action // 3

        # Only empty cells can be marked.
        valid = self._boards[self._B, y, x] == 0
        self._boards[self._B[valid], y[valid], x[valid]] = player[valid]

        # Player 0 is encoded as -1 and is rewarded +1 for winning, hence the
        # reward of a win is ``-player``. An invalid move gives the reward of
        # the opponent's win, i.e. ``player``.
        won = self._check_player_win(player)
        reward = torch.where(won, -player, torch.zeros_like(player))
        reward = torch.where(valid, reward, player)

        # An invalid move also terminates the game.
        game_over = self._check_game_over() | ~valid
        step_type[game_over] = int(StepType.LAST)
        step_type[prev_game_over] = int(StepType.FIRST)
        discount = torch.ones(self._batch_size)
        discount[game_over] = 0.

        # Discard the move made on the boards of the restarted games: a FIRST
        # step always shows an empty board.
        self._boards[prev_game_over] = self._empty_board
        self._game_over = game_over
        self._prev_action = action

        player0_win = self._check_player_win(self._player_0)
        player1_win = self._check_player_win(self._player_1)
        # A win or an invalid move always has a non-zero reward.
        draw = game_over & (reward == 0)

        return TimeStep(
            observation=self._boards.clone().detach(),
            reward=reward.detach(),
            step_type=step_type.detach(),
            discount=discount.detach(),
            prev_action=prev_action.detach(),
            env_id=self._env_ids,
            env_info={
                "play0_win": player0_win.to(torch.float32),
                "play1_win": player1_win.to(torch.float32),
                "draw": draw.to(torch.float32),
                "invalid_move": (~valid).to(torch.float32),
            })

    def _check_player_win(self, player):
        """Check whether ``player`` has three marks in a line.

        Args:
            player (Tensor): the mark of the player (-1 or 1); either a scalar
                tensor or a tensor of shape ``[batch_size]``.

        Returns:
            Tensor: bool tensor of shape ``[batch_size]``.
        """
        B = self._B.unsqueeze(-1).unsqueeze(-1)
        player = player.unsqueeze(-1).unsqueeze(-1)
        # lines[b, i, j] is the j-th cell of the i-th line of board b.
        lines = self._boards[B, self._line_y, self._line_x]
        return ((lines == player).sum(dim=2) == 3).any(dim=1)

    def _check_game_over(self):
        """Check whether a board is full or has been won by either player.

        Returns:
            Tensor: bool tensor of shape ``[batch_size]``.
        """
        board_full = (self._boards == 0).sum(dim=(1, 2)) == 0
        someone_won = (self._check_player_win(self._player_0)
                       | self._check_player_win(self._player_1))
        return board_full | someone_won

    def _get_current_player(self):
        """Return the mark of the player to move for every game.

        Returns:
            Tensor: integer tensor of shape ``[batch_size]``; -1 (player 0)
            when the number of marks on the board is even, 1 (player 1)
            otherwise.
        """
        return ((self._boards != 0).sum(dim=(1, 2)) % 2) * 2 - 1

    def render(self, mode):
        """Print the board of the first game of the batch.

        The cell addressed by the most recent action is shown in upper case.

        Args:
            mode (str): only 'human' is supported.

        Raises:
            ValueError: if ``mode`` is not 'human'.
        """
        if mode != 'human':
            raise ValueError("Unsupported render mode %s" % mode)

        ay, ax = divmod(int(self._prev_action[0]), 3)
        board = self._boards[0].cpu().numpy()
        symbols = {0: ' ', -1: 'x', 1: 'o'}
        rows = ['-----']
        for y in range(3):
            cells = []
            for x in range(3):
                mark = symbols.get(int(board[y, x]), ' ')
                if x == ax and y == ay:
                    mark = mark.upper()
                cells.append(mark)
            rows.append('|' + ''.join(cells) + '|')
        rows.append('-----')
        print('\n'.join(rows) + '\n')
@alf.configurable(whitelist=[])
def load(name='', batch_size=1):
    """Load TicTacToeEnvironment.

    Args:
        name (str): not used.
        batch_size (int): the number of games in the simulation.

    Returns:
        TicTacToeEnvironment: an environment simulating ``batch_size`` games.
    """
    return TicTacToeEnvironment(batch_size)
# environments.utils.create_environment() checks this flag to see whether
# load() directly supports batched environments.
load.batched = True