Q-learning

Code currently available at:
- Reinforcement learning
- Exploration vs Exploitation
- Unsupervised "online learning"
- States, Actions, Rewards
- Take action a_t to move from state s_t to state s_{t+1} -> reward r_{t+1}

- Learning via policy iteration
- Q-learning: value iteration
- Value iteration:
  Q_new = Q_prev + α * (R + γ * max(Q) - Q_prev)
  - α = learning rate
  - R = reward for the latest action
  - γ = discount factor
  - max(Q) = estimated value of the best action available from the new state

Q_{t+1}(s_t, a_t) = Q_t(s_t, a_t) + α_t(s_t, a_t) * (R_{t+1} + γ * max_a Q_t(s_{t+1}, a) - Q_t(s_t, a_t))
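As a concrete illustration of this update (the numbers are chosen for the example, not taken from the talk), one step with α = 0.3 and γ = 0.9 after a winning move:

alpha, gamma = 0.3, 0.9
q_prev = 0.0        # current estimate Q_t(s_t, a_t)
reward = 1.0        # R_{t+1}: the move just won the game
max_q_next = 0.0    # max over a of Q_t(s_{t+1}, a); terminal state, nothing left to gain
q_new = q_prev + alpha * (reward + gamma * max_q_next - q_prev)
print(q_new)        # 0.3 -- the estimate moves a fraction alpha toward the target

Each update nudges Q(s, a) a fraction α of the way toward the one-step target R + γ * max(Q).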

- "ɛ-greedy"
- When choosing the next action:
  - With probability ɛ, explore: take a random action
  - Otherwise, choose randomly among the actions with the maximum Q value
- ɛ can vary (decrease) over time
- Initial Q values affect exploration (optimistic starting values make untried moves look attractive; see the sketch below)
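A minimal sketch of ɛ-greedy selection plus one possible decay schedule. The Q-table shape matches the player class shown later, but the helper names and the decay constant are illustrative assumptions, not from the talk:

import random

def epsilon_greedy(q, state, actions, epsilon):
    # with probability epsilon, explore: pick any legal action at random
    if random.random() < epsilon:
        return random.choice(actions)
    # otherwise exploit: pick randomly among the actions with the highest Q value
    qs = [q.get((state, a), 0.0) for a in actions]
    best = max(qs)
    return random.choice([a for a, v in zip(actions, qs) if v == best])

def decayed_epsilon(initial_epsilon, games_played, decay=0.001):
    # one simple schedule: explore a lot early, less as experience accumulates
    return initial_epsilon / (1.0 + decay * games_played)

Raising the default used for unseen pairs in q.get (e.g. to 1.0) is the "initial Q values" effect: untried moves look promising, so they get explored even when ɛ is small.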
The Game
import random

class TicTacToe:
    def __init__(self, playerX, playerO):
        self.board = [' ']*9
        self.playerX, self.playerO = playerX, playerO
        self.playerX_turn = random.choice([True, False])

    def player_wins(self, char):
        # every winning row, column, and diagonal
        for a,b,c in [(0,1,2), (3,4,5), (6,7,8),
                      (0,3,6), (1,4,7), (2,5,8),
                      (0,4,8), (2,4,6)]:
            if char == self.board[a] == self.board[b] == self.board[c]:
                return True
        return False

    def board_full(self):
        return not any([space == ' ' for space in self.board])

    def display_board(self):
        row = " {} | {} | {}"
        hr = "\n-----------\n"
        print (row + hr + row + hr + row).format(*self.board)
    ...
class TicTacToe: # cnt'd
    ...
    def play_game(self):
        self.playerX.start_game('X')
        self.playerO.start_game('O')
        while True: #yolo
            if self.playerX_turn:
                player, char, other_player = self.playerX, 'X', self.playerO
            else:
                player, char, other_player = self.playerO, 'O', self.playerX
            if player.breed == "human":
                self.display_board()
            space = player.move(self.board)
            if self.board[space-1] != ' ': # illegal move
                player.reward(-99, self.board) # score of shame
                break
            self.board[space-1] = char
            if self.player_wins(char):
                player.reward(1, self.board)
                other_player.reward(-1, self.board)
                break
            if self.board_full(): # tie game
                player.reward(0.5, self.board)
                other_player.reward(0.5, self.board)
                break
            other_player.reward(0, self.board)
            self.playerX_turn = not self.playerX_turn
Enter Player 1
class Player(object):
    def __init__(self):
        self.breed = "human"

    def start_game(self, char):
        print "\nNew game!"

    def move(self, board):
        return int(raw_input("Your move? "))

    def reward(self, value, board):
        print "{} rewarded: {}".format(self.breed, value)

    def available_moves(self, board):
        return [i+1 for i in range(0,9) if board[i] == ' ']

class QLearningPlayer(Player):
    def __init__(self, epsilon=0.2, alpha=0.3, gamma=0.9):
        self.breed = "Qlearner"
        self.harm_humans = False
        self.q = {} # (state, action) keys: Q values
        self.epsilon = epsilon # e-greedy chance of random exploration
        self.alpha = alpha # learning rate
        self.gamma = gamma # discount factor for future rewards

    def start_game(self, char):
        self.last_board = (' ',)*9
        self.last_move = None
    ...
class QLearningPlayer(Player): #cnt'd
    ...
    def move(self, board):
        self.last_board = tuple(board)
        actions = self.available_moves(board)
        if random.random() < self.epsilon: # explore!
            self.last_move = random.choice(actions)
            return self.last_move
        qs = [self.getQ(self.last_board, a) for a in actions]
        maxQ = max(qs)
        if qs.count(maxQ) > 1:
            # more than 1 best option; choose among them randomly
            best_options = [i for i in range(len(actions)) if qs[i] == maxQ]
            i = random.choice(best_options)
        else:
            i = qs.index(maxQ)
        self.last_move = actions[i]
        return actions[i]

    def learn(self, state, action, reward, result_state):
        prev = self.getQ(state, action)
        maxqnew = max([self.getQ(result_state, a) for a in self.available_moves(state)])
        # the value-iteration update from earlier: Q <- Q + alpha * (R + gamma * max(Q') - Q)
        self.q[(state, action)] = prev + self.alpha * ((reward + self.gamma*maxqnew) - prev)
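The slides elide getQ and the Q-learner's reward handling ("..." above); here is a sketch of what they could look like, consistent with learn() and with the earlier note about initial Q values. The optimistic 1.0 default and the exact wiring are assumptions, not confirmed by the slides:

class QLearningPlayer(Player): # sketch of the elided helpers
    def getQ(self, state, action):
        # optimistic default for unseen (state, action) pairs encourages exploration
        if (state, action) not in self.q:
            self.q[(state, action)] = 1.0
        return self.q[(state, action)]

    def reward(self, value, board):
        # fold the reward for the previous move back into the Q table
        if self.last_move is not None:
            self.learn(self.last_board, self.last_move, value, tuple(board))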

class RandomPlayer(Player):
    def __init__(self):
        self.breed = "random"

    def reward(self, value, board):
        pass

    def start_game(self, char):
        pass

    def move(self, board):
        return random.choice(self.available_moves(board))

class MinimaxPlayer(Player):
    def __init__(self):
        self.breed = "minimax"
        self.best_moves = {}

    def start_game(self, char):
        self.me = char
        self.enemy = self.other(char)

    def other(self, char):
        return 'O' if char == 'X' else 'X'
    ...

- Perfect minimax: not fast enough
- Minimax with memory: not random enough
- Minimax with random selection: defeatist (a sketch combining the last two ideas follows)
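To make those variants concrete, here is a sketch (my own, not the speaker's code) of a minimax player that combines "memory" (a memo table over positions) with random selection among equally good moves. It assumes the same board and move conventions as the classes above:

class SketchMinimaxPlayer(Player):
    WIN_LINES = [(0,1,2), (3,4,5), (6,7,8),
                 (0,3,6), (1,4,7), (2,5,8),
                 (0,4,8), (2,4,6)]

    def __init__(self):
        self.breed = "minimax-sketch"
        self.memo = {} # (board, player to move) -> value of the position for self.me

    def start_game(self, char):
        if getattr(self, 'me', None) != char:
            self.memo = {} # cached values depend on which mark we play
        self.me = char
        self.enemy = 'O' if char == 'X' else 'X'

    def reward(self, value, board):
        pass # minimax does not learn from rewards

    def winner(self, board):
        for a, b, c in self.WIN_LINES:
            if board[a] != ' ' and board[a] == board[b] == board[c]:
                return board[a]
        return None

    def value(self, board, to_move):
        key = (tuple(board), to_move)
        if key in self.memo:
            return self.memo[key] # "memory": never re-search a known position
        w = self.winner(board)
        if w is not None:
            v = 1 if w == self.me else -1
        elif ' ' not in board:
            v = 0 # draw
        else:
            child_values = []
            for m in self.available_moves(board):
                board[m-1] = to_move
                child_values.append(self.value(board, 'O' if to_move == 'X' else 'X'))
                board[m-1] = ' '
            v = max(child_values) if to_move == self.me else min(child_values)
        self.memo[key] = v
        return v

    def move(self, board):
        board = list(board)
        scored = []
        for m in self.available_moves(board):
            board[m-1] = self.me
            scored.append((self.value(board, self.enemy), m))
            board[m-1] = ' '
        best = max(v for v, m in scored)
        # random selection among equally good moves, so play is not deterministic
        return random.choice([m for v, m in scored if v == best])

With only win/loss/draw values the search is "defeatist" in lost positions: every move scores -1, so it picks among them arbitrarily instead of setting traps a fallible opponent might step into.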

Q-learner vs:
- random player: learns a probability-maximizing strategy
- minimax player: learns to play against only that one policy
- minimax with random selection: falls for previously unseen "traps"

class MinimuddledPlayer(MinimaxPlayer):
    def __init__(self):
        super(MinimuddledPlayer, self).__init__()
        self.breed = "muddled"
        self.ideal_player = MinimaxPlayer()

    def start_game(self, char):
        self.ideal_player.me = char
        self.ideal_player.enemy = self.other(char)

    def move(self, board):
        # plays the perfect move 90% of the time, a random legal move otherwise
        if random.random() > .1:
            return self.ideal_player.move(board)
        else:
            return random.choice(self.available_moves(board))
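One way the pieces might be wired together for training (a sketch: the game counts and pairings are my assumptions, and it presumes the elided QLearningPlayer and MinimaxPlayer methods are filled in):

if __name__ == "__main__":
    # self-play: both Q-learners update their Q tables as the games stream by
    p1, p2 = QLearningPlayer(), QLearningPlayer()
    for i in range(200000):
        TicTacToe(p1, p2).play_game()

    # continue training against a different, partly random policy
    muddled = MinimuddledPlayer()
    for i in range(10000):
        TicTacToe(p1, muddled).play_game()

    # finally, stop exploring and take on a human
    p1.epsilon = 0
    TicTacToe(p1, Player()).play_game()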
Next steps:
- "natural" ɛ-greedy decay
- symmetry (collapse rotations/reflections of the board; sketch below)
- neural-net approximation of the Q function
- more complex games!
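For the symmetry item, a sketch of one possible approach (an assumption, not from the talk): collapse each board to a canonical representative of its rotation/reflection class before using it as a Q-table key, so the eight symmetric versions of a position share one value.

def symmetries(board):
    # board is a tuple of 9 cells; return all 8 rotations/reflections
    def rotate(b):
        return tuple(b[i] for i in (6, 3, 0, 7, 4, 1, 8, 5, 2))
    def flip(b):
        return tuple(b[i] for i in (2, 1, 0, 5, 4, 3, 8, 7, 6))
    out, b = [], tuple(board)
    for _ in range(4):
        out.append(b)
        out.append(flip(b))
        b = rotate(b)
    return out

def canonical(board):
    # a fixed representative: the lexicographically smallest equivalent board
    return min(symmetries(board))

Actions would need to be mapped through the same transformation before and after the Q-table lookup, which adds a bit of bookkeeping.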
