Answer To: We have built an agent that learns the numeric tic tac toe game by Q-Learning. There is no change to...
Ujjwal answered on Feb 03 2021
TCGame_Env.py
from __future__ import print_function
import argparse
import matplotlib.pyplot as plt
import os
import pickle
import random
import sys
from agent import QLearner, SarsaLearner, Teacher
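# NOTE: this import assumes the learner/teacher listing further below
# (TicTacToe_Agent.ipynb.py) is saved on the import path as agent.py.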
def plot_agent_reward(rewards, agent_type):
""" Function to plot agent's accumulated reward vs. episode """
plt.plot(rewards)
if agent_type == 'q':
plt.title('Q-Learning Agent Cumulative Reward vs. Episode')
else:
plt.title('Sarsa Agent Cumulative Reward vs. Episode')
plt.ylabel('Reward')
plt.xlabel('Episode')
plt.show()
class Game(object):
""" The game class. New instance created for each new game. """
def __init__(self, agent, teacher=None):
self.computer = agent
self.teacher = teacher
# initialize the game board
self.board = [['-', '-', '-'], ['-', '-', '-'], ['-', '-', '-']]
def printBoard(self):
""" Prints the game board as text output to the terminal. """
print(' 0 1 2\n')
row_num = 0
for row in self.board:
print('%i ' % row_num, end='')
for elt in row:
print('%s ' % elt, end='')
print('\n')
row_num += 1
print('\n')
def playerMove(self):
""" Querry player for a move and update the board accordingly. """
if self.teacher is not None:
action = self.teacher.makeMove(self.board)
self.board[action[0]][action[1]] = 'X'
else:
self.printBoard()
while True:
                move = input("Your move! Please select a row and column from 0-2 "
                             "in the format row,col: ")
                try:
                    # split on the comma so malformed input (e.g. "1" or "1,2,3")
                    # raises ValueError instead of crashing
                    row, col = [int(x) for x in move.split(',')]
                except ValueError:
                    print("INVALID INPUT! Please use the correct format.")
                    continue
                if row not in range(3) or col not in range(3) or self.board[row][col] != '-':
print("INVALID MOVE! Choose again.")
continue
self.board[row][col] = 'X'
break
def computerMove(self, action):
""" Update board according to computer move. """
self.board[action[0]][action[1]] = 'O'
def checkForWin(self, key):
"""
Check to see whether the player/agent with token 'key' has won.
Returns a boolean holding truth value.
"""
# check for player win on diagonals
a = [self.board[0][0], self.board[1][1], self.board[2][2]]
b = [self.board[0][2], self.board[1][1], self.board[2][0]]
if a.count(key) == 3 or b.count(key) == 3:
return True
# check for player win on rows/columns
for i in range(3):
col = [self.board[0][i], self.board[1][i], self.board[2][i]]
row = [self.board[i][0], self.board[i][1], self.board[i][2]]
if col.count(key) == 3 or row.count(key) == 3:
return True
return False
def checkForDraw(self):
"""
Check to see whether the game has ended in a draw. Returns a
boolean holding truth value.
"""
draw = True
for row in self.board:
for elt in row:
if elt == '-':
draw = False
return draw
def checkForEnd(self, key):
"""
Checks if player/agent with token 'key' has ended the game. Returns -1
if the game is still going, 0 if it is a draw, and 1 if the player/agent
has won.
"""
if self.checkForWin(key):
if self.teacher is None:
self.printBoard()
if key == 'X':
print("Player wins!")
else:
print("RL agent wins!")
return 1
elif self.checkForDraw():
if self.teacher is None:
self.printBoard()
print("It's a draw!")
return 0
return -1
def getStateKey(self):
"""
Converts 2D list representing the board state into a string key
for that state. Keys are used for Q-value hashing.
"""
key = ''
for row in self.board:
for elt in row:
key += elt
return key
def playGame(self, agent_type, player_first):
""" Begin the tic-tac-toe game loop. """
# Initialize the agent's state and action
if player_first:
self.playerMove()
oldState = self.getStateKey()
if agent_type == 's':
oldAction = self.computer.get_action(oldState)
if agent_type == 'q':
# Dealing with QLearner agent
while True:
action = self.computer.get_action(oldState)
self.computerMove(action)
check = self.checkForEnd('O')
                if check != -1:
reward = check
break
self.playerMove()
state = self.getStateKey()
check = self.checkForEnd('X')
                if check != -1:
reward = -1*check
break
else:
reward = 0
self.computer.update(oldState, state, action, reward)
oldState = state
else:
# Dealing with Sarsa agent
while True:
self.computerMove(oldAction)
check = self.checkForEnd('O')
                if check != -1:
reward = check
break
self.playerMove()
check = self.checkForEnd('X')
                if check != -1:
reward = -1*check
break
else:
reward = 0
state = self.getStateKey()
action = self.computer.get_action(state)
self.computer.update(oldState, state, oldAction, action, reward)
oldState = state
oldAction = action
self.computer.total_reward += reward
        self.computer.rewards.append(self.computer.total_reward)
# Final update and save
if agent_type == 'q':
self.computer.update(oldState, None, action, reward)
self.computer.save_agent('./qlearner_agent.pkl')
else:
self.computer.update(oldState, None, oldAction, None, reward)
self.computer.save_agent('./sarsa_agent.pkl')
def start(self, agent_type):
"""
Function to determine how to play. Options include whether to employ
teacher and whether to have computer or player go first.
"""
if self.teacher is not None:
            # During teaching, choose who goes first randomly with equal probability
if random.random() < 0.5:
self.playGame(agent_type, False)
else:
self.playGame(agent_type, True)
else:
while True:
                response = input("Would you like to go first? [y/n]: ")
                if response == 'n' or response == 'no':
                    self.playGame(agent_type, False)
                    break
                elif response == 'y' or response == 'yes':
                    self.playGame(agent_type, True)
                    break
break
else:
print("Invalid input. Please enter 'y' or 'n'.")
class GameLearning(object):
"""
A class that holds the state of the learning process. Learning
agents are created/loaded here, and a count is kept of the
games that have been played.
"""
def __init__(self, args, alpha=0.5, gamma=0.9, epsilon=0.1):
self.games_played = 0
if args.load:
# load agent
if args.learner_type == 'q':
# QLearner
try:
f = open('./qlearner_agent.pkl','rb')
except IOError:
print("The agent file does not exist. Quitting.")
sys.exit(0)
self.type = 'q'
else:
# SarsaLearner
try:
f = open('./sarsa_agent.pkl','rb')
except IOError:
print("The agent file does not exist. Quitting.")
sys.exit(0)
self.type = 's'
self.agent = pickle.load(f)
f.close()
# If plotting, show plot and quit
if args.plot:
plot_agent_reward(self.agent.rewards, self.type)
sys.exit(0)
else:
# check if agent state file already exists, and ask user whether to overwrite if so
            if ((args.learner_type == "q" and os.path.isfile('./qlearner_agent.pkl')) or
                    (args.learner_type == "s" and os.path.isfile('./sarsa_agent.pkl'))):
while True:
                    response = input("An agent state is already saved for this type. "
"Are you sure you want to overwrite? [y/n]: ")
if response == 'y' or response == 'yes':
break
elif response == 'n' or response == 'no':
print("OK. Quitting.")
sys.exit(0)
else:
print("Invalid input. Please choose 'y' or 'n'.")
if args.learner_type == "q":
self.agent = QLearner(alpha,gamma,epsilon)
self.type = 'q'
else:
self.agent = SarsaLearner(alpha,gamma,epsilon)
self.type = 's'
def beginPlaying(self):
""" Loop through game iterations with a human player. """
print("Welcome to Tic-Tac-Toe. You are 'X' and the computer is 'O'.")
def play_again():
print("Games played: %i" % self.games_played)
while True:
                play = input("Do you want to play again? [y/n]: ")
if play == 'y' or play == 'yes':
return True
elif play == 'n' or play == 'no':
return False
else:
print("Invalid input. Please choose 'y' or 'n'.")
while True:
game = Game(self.agent)
game.start(self.type)
self.games_played += 1
if not play_again():
print("OK. Quitting.")
break
def beginTeaching(self, episodes):
""" Loop through game iterations with a teaching agent. """
teacher = Teacher()
        # Train for the allotted number of episodes
while self.games_played < episodes:
game = Game(self.agent, teacher=teacher)
game.start(self.type)
self.games_played += 1
# Monitor progress
if self.games_played % 500 == 0:
print("Games played: %i" % self.games_played)
if __name__ == "__main__":
# Parse command line arguments
parser = argparse.ArgumentParser(description="Play Tic-Tac-Toe.")
parser.add_argument("learner_type", help="Specify the computer agent learning algorithm."
"'q' for Q-learning and 's' for Sarsa-learning",
type=str, default="q")
parser.add_argument("-l", "--load", help="load trained agent", action="store_true")
parser.add_argument("-t", "--teacher", help="employ teacher agent who knows the optimal strategy",
default=None, type=int)
parser.add_argument("-p", "--plot", help="plot reward vs. episode of stored agent and quit",
action="store_true")
args = parser.parse_args()
assert args.learner_type == 'q' or args.learner_type == 's', \
"learner type must be either 'q' or 's'."
if args.plot:
assert args.load, "Must load an agent to plot reward."
assert args.teacher is None, \
"Cannot plot and teach concurrently; must chose one or the other."
gl = GameLearning(args)
if args.teacher is not None:
gl.beginTeaching(args.teacher)
else:
gl.beginPlaying()
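Assuming the file is saved as TCGame_Env.py, typical invocations (based on the argparse flags above) look like:

python TCGame_Env.py q -t 5000    # train a Q-learning agent against the teacher for 5000 episodes
python TCGame_Env.py s -t 5000    # same, but with the Sarsa agent
python TCGame_Env.py q -l         # load the pickled Q-learning agent and play against it yourself
python TCGame_Env.py q -l -p      # plot the stored agent's cumulative reward per episode and quit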
Testing_States_tracked.ipynb.py
import unittest
from agent import QLearner, SarsaLearner, Teacher
from game import Game, GameLearning
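# NOTE: these imports assume the two listings in this answer are saved as
# agent.py and game.py respectively (rename TCGame_Env.py to game.py, or
# adjust this import, before running the tests).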
class TestGameAndAgents(unittest.TestCase):
def setUp(self):
# Use epsilon = 0 so that actions are deterministic
# and therefore testable
self.q_agent = QLearner(0.5, 0.9, 0)
self.s_agent = SarsaLearner(0.5, 0.9, 0)
# test deterministic teacher
self.teacher = Teacher(1)
self.game = Game(self.q_agent)
def testGame(self):
self.game.computerMove((1,1))
self.assertEqual(self.game.board[1][1], 'O')
with self.assertRaises(IndexError):
self.game.computerMove((3, 3))
self.assertFalse(self.game.checkForWin('O'))
self.assertFalse(self.game.checkForDraw())
self.assertEqual(self.game.checkForEnd('O'), -1)
self.game.computerMove((0, 0))
self.game.computerMove((2, 2))
self.assertTrue(self.game.checkForWin('O'))
self.assertFalse(self.game.checkForDraw())
self.assertEqual(self.game.checkForEnd('O'), 1)
def testLearningAgents(self):
a1 = self.q_agent.get_action('---------')
self.assertEqual(a1, (0, 0))
a2 = self.q_agent.get_action('O--------')
self.assertEqual(a2, (1, 0))
        self.q_agent.update('---------', '----OX---', (1, 1), 1)
        self.s_agent.update('---------', '----OX---', (1, 1), (2, 1), -1)
self.assertEqual(self.q_agent.Q[(1, 1)]['---------'], 0.5)
self.assertEqual(self.s_agent.Q[(1, 1)]['---------'], -0.5)
def testTeachingAgent(self):
board1 = [['-', '-', '-'], ['-', '-', '-'], ['-', '-', '-']]
board2 = [['X', '-', 'O'], ['O', 'X', '-'], ['-', '-', '-']]
# Should be center
self.assertEqual(self.teacher.makeMove(board1), (1, 1))
# Should be corner for win
self.assertEqual(self.teacher.makeMove(board2), (2, 2))
if __name__ == '__main__':
unittest.main()
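Because of the unittest.main() hook, the test file can be run directly with python, or picked up with python -m unittest.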
TicTacToe_Agent.ipynb.py
import collections
import numpy as np
import os
import pickle
import random
class Learner(object):
"""
Parent class for Q-learning and Sarsa-learning agents.
"""
def __init__(self, alpha, gamma, epsilon):
# Reward accumulator
self.total_reward = 0
        # Keep a list of accumulated reward for each episode
self.rewards = []
self.alpha = alpha
self.gamma = gamma
self.epsilon = epsilon
        # Possible actions correspond to the set of all (row, col) coordinate pairs
self.actions = []
for i in range(3):
self.actions += [(0, i), (1, i), (2, i)]
self.Q = {}
for action in self.actions:
# Initialize Q values of all state-action pairs to 0
self.Q[action] = collections.defaultdict(int)
def get_action(self, s):
# Make sure we only consider empty board spaces
possible_actions = [a for a in self.actions if s[a[0]*3 + a[1]] == '-']
if random.random() < self.epsilon:
            # Explore: choose randomly among the available actions.
            action = random.choice(possible_actions)
else:
# Greedy choose. At least one action will always be possible
# when this function is called.
Q_max = -np.inf
for a in possible_actions:
if self.Q[a][s] > Q_max:
Q_max = self.Q[a][s]
action = a
return action
def save_agent(self, path):
""" Pickle the agent object instance to save the agent's state. """
if os.path.isfile(path):
os.remove(path)
        with open(path, 'wb') as f:
            pickle.dump(self, f)
class QLearner(Learner):
"""
A class to implement the Q-learning agent.
"""
def __init__(self, alpha, gamma, epsilon):
Learner.__init__(self, alpha, gamma, epsilon)
def update(self, s, s_, a, r):
""" Perform the Q-Learning step update of Q values. """
# Update Q(s,a)
if s_ is not None:
# Hold list of Q values for all a_,s_ pairs so we can access max later
Q_options = []
for action in self.actions:
Q_options += [self.Q[action][s_]]
self.Q[a][s] = (1 - self.alpha)*self.Q[a][s] + self.alpha*(r + self.gamma*max(Q_options))
else:
self.Q[a][s] = (1 - self.alpha)*self.Q[a][s] + self.alpha*r
class SarsaLearner(Learner):
"""
A class to implement the Sarsa-learning agent.
"""
def __init__(self, alpha, gamma, epsilon):
Learner.__init__(self, alpha, gamma, epsilon)
def update(self, s, s_, a, a_, r):
""" Perform the Sarsa step update of Q values. """
# Update Q(s,a)
if s_ is not None:
self.Q[a][s] = (1 - self.alpha)*self.Q[a][s] + self.alpha*(r + self.gamma*self.Q[a_][s_])
else:
self.Q[a][s] = (1 - self.alpha)*self.Q[a][s] + self.alpha*r
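# Worked example of the two backups above (a sketch, assuming alpha=0.5,
# gamma=0.9 and an empty Q-table, matching the values in the unit tests):
#   QLearner: update(s, s_, (1,1), r=1)   ->  Q[(1,1)][s] = 0.5*0 + 0.5*(1 + 0.9*max_a' Q[a'][s_]) = 0.5
#   Sarsa:    update(s, s_, (1,1), (2,1), r=-1)  ->  Q[(1,1)][s] = 0.5*0 + 0.5*(-1 + 0.9*Q[(2,1)][s_]) = -0.5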
class Teacher(object):
""" A class to implement a teacher that knows the optimal playing strategy.
Teacher returns the best move at any time given the current state of the game.
    Note: things are a bit more hard-coded here, as this was not the main focus
    of the exercise, so I did not spend as much time on design/style. Everything
    works properly when tested. """
def __init__(self, level=0.9):
"""
Ability level determines the probability that the teacher will follow
the optimal strategy as opposed to choosing a random available move.
"""
self.ability_level = level
def win(self, board, key='X'):
""" If we have two in a row and the 3rd is available, take it. """
# Check for diagonal wins
a = [board[0][0], board[1][1], board[2][2]]
b = [board[0][2], board[1][1], board[2][0]]
if a.count('-') == 1 and a.count(key) == 2:
ind = a.index('-')
return ind, ind
elif b.count('-') == 1 and b.count(key) == 2:
ind = b.index('-')
if ind == 0:
return 0, 2
elif ind == 1:
return 1, 1
else:
return 2, 0
# Now check for 2 in a row/column + empty 3rd
for i in range(3):
c = [board[0][i], board[1][i], board[2][i]]
d = [board[i][0], board[i][1], board[i][2]]
if c.count('-') == 1 and c.count(key) == 2:
ind = c.index('-')
return ind, i
elif d.count('-') == 1 and d.count(key) == 2:
ind = d.index('-')
return i, ind
return None
def blockWin(self, board):
""" Block the opponent if she has a win available. """
return self.win(board, key='O')
def fork(self, board):
""" Create a fork opportunity such that we have 2 threats to win. """
# Check all adjacent side middles
if board[1][0] == 'X' and board[0][1] == 'X':
if board[0][0] == '-' and board[2][0] == '-' and board[0][2] == '-':
return 0, 0
elif board[1][1] == '-' and board[2][1] == '-' and board[1][2] == '-':
return 1, 1
elif board[1][0] == 'X' and board[2][1] == 'X':
if board[2][0] == '-' and board[0][0] == '-' and board[2][2] == '-':
return 2, 0
elif board[1][1] == '-' and board[0][1] == '-' and board[1][2] == '-':
return 1, 1
elif board[2][1] == 'X' and board[1][2] == 'X':
if board[2][2] == '-' and board[2][0] == '-' and board[0][2] == '-':
return 2, 2
elif board[1][1] == '-' and board[1][0] == '-' and board[0][1] == '-':
return 1, 1
elif board[1][2] == 'X' and board[0][1] == 'X':
if board[0][2] == '-' and board[0][0] == '-' and board[2][2] == '-':
return 0, 2
elif board[1][1] == '-' and board[1][0] == '-' and board[2][1] == '-':
return 1, 1
# Check all cross corners
elif board[0][0] == 'X' and board[2][2] == 'X':
if board[1][0] == '-' and board[2][1] == '-' and board[2][0] == '-':
return 2, 0
elif board[0][1] == '-' and board[1][2] == '-' and board[0][2] == '-':
return 0, 2
elif board[2][0] == 'X' and board[0][2] == 'X':
if board[2][1] == '-' and board[1][2] == '-' and board[2][2] == '-':
return 2, 2
elif board[1][0] == '-' and board[0][1] == '-' and board[0][0] == '-':
return 0, 0
return None
def blockFork(self, board):
""" Block the opponents fork if she has one available. """
corners = [board[0][0], board[2][0], board[0][2], board[2][2]]
# Check all adjacent side middles
if board[1][0] == 'O' and board[0][1] == 'O':
if board[0][0] == '-' and board[2][0] == '-' and board[0][2] == '-':
return 0, 0
elif board[1][1] == '-' and board[2][1] == '-' and board[1][2] == '-':
return 1, 1
elif board[1][0] == 'O' and board[2][1] == 'O':
if board[2][0] == '-' and board[0][0] == '-' and board[2][2] == '-':
return 2, 0
elif board[1][1] == '-' and board[0][1] == '-' and board[1][2] == '-':
return 1, 1
elif board[2][1] == 'O' and board[1][2] == 'O':
if board[2][2] == '-' and board[2][0] == '-' and board[0][2] == '-':
return 2, 2
elif board[1][1] == '-' and board[1][0] == '-' and board[0][1] == '-':
return 1, 1
elif board[1][2] == 'O' and board[0][1] == 'O':
if board[0][2] == '-' and board[0][0] == '-' and board[2][2] == '-':
return 0, 2
elif board[1][1] == '-' and board[1][0] == '-' and board[2][1] == '-':
return 1, 1
# Check all cross corners (first check for double fork opp using the corners array)
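        # (blocking a single corner would still leave O a second fork; the
        # hard-coded side move below forces O to respond there instead)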
elif corners.count('-') == 1 and corners.count('O') == 2:
return 1, 2
elif board[0][0] == 'O' and board[2][2] == 'O':
if board[1][0] == '-' and board[2][1] == '-' and board[2][0] == '-':
return 2, 0
elif board[0][1] == '-' and board[1][2] == '-' and board[0][2] == '-':
return 0, 2
elif board[2][0] == 'O' and board[0][2] == 'O':
if board[2][1] == '-' and board[1][2] == '-' and board[2][2] == '-':
return 2, 2
elif board[1][0] == '-' and board[0][1] == '-' and board[0][0] == '-':
return 0, 0
return None
def center(self, board):
""" Pick the center if it is available. """
if board[1][1] == '-':
return 1, 1
return None
def corner(self, board):
""" Pick a corner move. """
# Pick opposite corner of opponent if available
if board[0][0] == 'O' and board[2][2] == '-':
return 2, 2
elif board[2][0] == 'O' and board[0][2] == '-':
return 0, 2
elif board[0][2] == 'O' and board[2][0] == '-':
return 2, 0
elif board[2][2] == 'O' and board[0][0] == '-':
return 0, 0
# Pick any corner if no opposites are available
elif board[0][0] == '-':
return 0, 0
elif board[2][0] == '-':
return 2, 0
elif board[0][2] == '-':
return 0, 2
elif board[2][2] == '-':
return 2, 2
return None
def sideEmpty(self, board):
""" Pick an empty side. """
if board[1][0] == '-':
return 1, 0
elif board[2][1] == '-':
return 2, 1
elif board[1][2] == '-':
return 1, 2
elif board[0][1] == '-':
return 0, 1
return None
def randomMove(self, board):
""" Chose a random move from the available options. """
possibles = []
for i in range(3):
for j in range(3):
if board[i][j] == '-':
possibles += [(i, j)]
        return random.choice(possibles)
def makeMove(self, board):
"""
        Teacher goes through a hierarchy of moves, making the best move that
        is currently available each time. A tuple is returned that represents
        (row, col).
        """
        # Choose randomly with some probability so that the teacher does not always win
if random.random() > self.ability_level:
return self.randomMove(board)
# Follow optimal strategy
a = self.win(board)
if a is not None:
return a
a = self.blockWin(board)
if a is not None:
return a
a = self.fork(board)
if a is not None:
return a
a = self.blockFork(board)
if a is not None:
return a
a = self.center(board)
if a is not None:
return a
a = self.corner(board)
if a is not None:
return a
a = self.sideEmpty(board)
if a is not None:
return a
        # Fallback; should be unreachable, since center/corner/sideEmpty
        # together cover every remaining square
        return self.randomMove(board)
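As a quick sanity check, the Teacher can be exercised on its own (a minimal sketch, assuming the listing above is saved as agent.py; the board values come from the unit tests):

from agent import Teacher

teacher = Teacher(1)  # ability level 1: always follow the optimal strategy
board = [['X', '-', 'O'], ['O', 'X', '-'], ['-', '-', '-']]
print(teacher.makeMove(board))  # (2, 2) -- completes X's diagonal for the win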