$$Q_{\text{new}} = Q_{\text{prev}} + \alpha \left( R + \gamma \max Q - Q_{\text{prev}} \right)$$

$$Q_{t+1}(s_t, a_t) = Q_t(s_t, a_t) + \alpha_t(s_t, a_t) \left( R_{t+1} + \gamma \max_{a} Q_t(s_{t+1}, a) - Q_t(s_t, a_t) \right)$$
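For a concrete sense of the update (the numbers here are purely illustrative): a cell currently valued at 0.5, with learning rate 0.3, no immediate reward, discount 0.9, and a best next-state value of 0.8, moves to

$$Q_{\text{new}} = 0.5 + 0.3 \, (0 + 0.9 \cdot 0.8 - 0.5) = 0.5 + 0.3 \cdot 0.22 = 0.566$$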
import random

class TicTacToe:
    def __init__(self, playerX, playerO):
        self.board = [' '] * 9          # flat list of 9 cells, row-major
        self.playerX, self.playerO = playerX, playerO
        self.playerX_turn = random.choice([True, False])

    def player_wins(self, char):
        for a, b, c in [(0,1,2), (3,4,5), (6,7,8),   # rows
                        (0,3,6), (1,4,7), (2,5,8),   # columns
                        (0,4,8), (2,4,6)]:           # diagonals
            if char == self.board[a] == self.board[b] == self.board[c]:
                return True
        return False

    def board_full(self):
        return not any(space == ' ' for space in self.board)

    def display_board(self):
        row = " {} | {} | {}"
        hr = "\n-----------\n"
        print (row + hr + row + hr + row).format(*self.board)
...
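A quick sanity check of the board helpers (hypothetical usage, not part of the game loop):

game = TicTacToe(None, None)      # the helpers never touch the players
game.board = ['X', 'X', 'X',
              'O', 'O', ' ',
              ' ', ' ', ' ']
game.display_board()
print game.player_wins('X')       # True: X holds the top row
print game.board_full()           # False: empty cells remain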
class TicTacToe: # cont'd
...
    def play_game(self):
        self.playerX.start_game('X')
        self.playerO.start_game('O')
        while True: #yolo
            if self.playerX_turn:
                player, char, other_player = self.playerX, 'X', self.playerO
            else:
                player, char, other_player = self.playerO, 'O', self.playerX
            if player.breed == "human":
                self.display_board()
            space = player.move(self.board)
            if self.board[space-1] != ' ': # illegal move
                player.reward(-99, self.board) # score of shame
                break
            self.board[space-1] = char
            if self.player_wins(char):
                player.reward(1, self.board)
                other_player.reward(-1, self.board)
                break
            if self.board_full(): # tie game
                player.reward(0.5, self.board)
                other_player.reward(0.5, self.board)
                break
            # game continues: neutral reward lets the other player
            # learn from the state after this reply
            other_player.reward(0, self.board)
            self.playerX_turn = not self.playerX_turn
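With the pieces above, training and playing are just repeated games. A minimal sketch (the game count and matchup are illustrative, not the exact training script):

p1, p2 = QLearningPlayer(), QLearningPlayer()
for i in xrange(200000):                # self-play to populate the Q table
    TicTacToe(p1, p2).play_game()

p1.epsilon = 0                          # stop exploring once trained
TicTacToe(p1, Player()).play_game()     # a human takes the other side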
class Player(object):
    def __init__(self):
        self.breed = "human"

    def start_game(self, char):
        print "\nNew game!"

    def move(self, board):
        return int(raw_input("Your move? "))

    def reward(self, value, board):
        print "{} rewarded: {}".format(self.breed, value)

    def available_moves(self, board):
        # moves are numbered 1-9, hence the i+1
        return [i+1 for i in range(9) if board[i] == ' ']
class QLearningPlayer(Player):
    def __init__(self, epsilon=0.2, alpha=0.3, gamma=0.9):
        self.breed = "Qlearner"
        self.harm_humans = False
        self.q = {} # (state, action) keys: Q values
        self.epsilon = epsilon # e-greedy chance of random exploration
        self.alpha = alpha # learning rate
        self.gamma = gamma # discount factor for future rewards

    def start_game(self, char):
        self.last_board = (' ',)*9
        self.last_move = None
...
class QLearningPlayer(Player): # cont'd
...
    def move(self, board):
        self.last_board = tuple(board)
        actions = self.available_moves(board)

        if random.random() < self.epsilon: # explore!
            self.last_move = random.choice(actions)
            return self.last_move

        qs = [self.getQ(self.last_board, a) for a in actions]
        maxQ = max(qs)

        if qs.count(maxQ) > 1:
            # more than 1 best option; choose among them randomly
            best_options = [i for i in range(len(actions)) if qs[i] == maxQ]
            i = random.choice(best_options)
        else:
            i = qs.index(maxQ)

        self.last_move = actions[i]
        return actions[i]
    def learn(self, state, action, reward, result_state):
        prev = self.getQ(state, action)
        # max over the moves available in the *resulting* state, matching
        # the update rule above; zero future value if the game ended there
        next_moves = self.available_moves(result_state)
        maxqnew = max([self.getQ(result_state, a) for a in next_moves]) if next_moves else 0.0
        self.q[(state, action)] = prev + self.alpha * ((reward + self.gamma*maxqnew) - prev)
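move() and learn() lean on two methods elided above. A minimal sketch of what they could look like (the optimistic 1.0 default is an assumption, chosen to invite exploration, not necessarily the talk's exact value):

    def getQ(self, state, action):
        # unseen (state, action) pairs start optimistic (assumed default)
        if (state, action) not in self.q:
            self.q[(state, action)] = 1.0
        return self.q[(state, action)]

    def reward(self, value, board):
        if self.last_move is not None:
            self.learn(self.last_board, self.last_move, value, tuple(board))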
class RandomPlayer(Player):
    def __init__(self):
        self.breed = "random"

    def reward(self, value, board):
        pass

    def start_game(self, char):
        pass

    def move(self, board):
        return random.choice(self.available_moves(board))
class MinimaxPlayer(Player):
    def __init__(self):
        self.breed = "minimax"
        self.best_moves = {}

    def start_game(self, char):
        self.me = char
        self.enemy = self.other(char)

    def other(self, char):
        return 'O' if char == 'X' else 'X'
...
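The search itself is elided. A compact sketch of how move() might fill best_moves with a full-depth minimax (the helpers wins() and score_after() are hypothetical names, and ties break arbitrarily):

    WIN_LINES = [(0,1,2), (3,4,5), (6,7,8),
                 (0,3,6), (1,4,7), (2,5,8),
                 (0,4,8), (2,4,6)]

    def wins(self, board, char):
        return any(board[a] == board[b] == board[c] == char
                   for a, b, c in self.WIN_LINES)

    def minimax(self, board, whose_turn):
        # score a position from self.me's point of view
        if self.wins(board, self.me):
            return 1
        if self.wins(board, self.enemy):
            return -1
        moves = self.available_moves(board)
        if not moves:
            return 0 # draw
        scores = []
        for space in moves:
            board[space-1] = whose_turn
            scores.append(self.minimax(board, self.other(whose_turn)))
            board[space-1] = ' ' # undo the trial move
        return max(scores) if whose_turn == self.me else min(scores)

    def score_after(self, board, space):
        board[space-1] = self.me
        score = self.minimax(board, self.enemy)
        board[space-1] = ' '
        return score

    def move(self, board):
        key = tuple(board)
        if key not in self.best_moves: # memoize across games
            self.best_moves[key] = max(
                self.available_moves(board),
                key=lambda space: self.score_after(board, space))
        return self.best_moves[key]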
class MinimuddledPlayer(MinimaxPlayer):
    def __init__(self):
        super(MinimuddledPlayer, self).__init__()
        self.breed = "muddled"
        self.ideal_player = MinimaxPlayer()

    def start_game(self, char):
        self.ideal_player.start_game(char)

    def move(self, board):
        # plays the perfect move 90% of the time, a random one otherwise
        if random.random() > 0.1:
            return self.ideal_player.move(board)
        else:
            return random.choice(self.available_moves(board))
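The muddled player exists, presumably, because a perfect opponent never blunders, so a Q-learner trained only against it never sees a mistake worth punishing. A hypothetical matchup (game count illustrative):

learner = QLearningPlayer()
teacher = MinimuddledPlayer()
for i in xrange(100000):
    TicTacToe(learner, teacher).play_game()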