Q-learning

  • Reinforcement learning
     

  • Exploration vs Exploitation
     

  • Unsupervised "online learning"

  • States, Actions, Rewards
     

  • Take action a_t to move from state s_t to state s_{t+1} -> reward r_{t+1}

  • Learning via policy iteration
     

  • Q-learning: value iteration

  • Value iteration:

    Qnew = Qprev + α * (R + γ * Max(Q) - Qprev)
    
    
  • α = learning rate

  • R = reward for latest action

  • γ = discount factor

  • Max(Q) = highest estimated Q value over the actions available from the new state

Q_{t+1}(s_t, a_t) = Q_t(s_t, a_t) + α_t(s_t, a_t) *
                    (R_{t+1} + γ * max_a Q_t(s_{t+1}, a) - Q_t(s_t, a_t))
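
For example, with the defaults used in the code below (α = 0.3, γ = 0.9), updating a
state-action pair currently valued at 1.0 after a move that earned R = 0, when the best
follow-up action is estimated at Max(Q) = 1.2 (illustrative numbers):

    Q_new = 1.0 + 0.3 * (0 + 0.9 * 1.2 - 1.0) = 1.024
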
  • "ɛ - greedy"

  • When choosing next action:

    • With probability ɛ, explore: pick a random action

    • Else, choose randomly among the 
      actions with the maximum Q value

  • ɛ can vary (decrease) over time (one possible schedule is sketched after this list)

  • Initial Q values affect exploration
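
One way the decreasing ɛ could be implemented (a sketch; the schedule, bounds, and
function name are assumptions, not part of the original code):

def decayed_epsilon(games_played, start=0.9, end=0.05, decay_games=10000):
    # Linearly anneal exploration from `start` to `end` over `decay_games` games,
    # then hold it at `end`: explore a lot early, exploit the Q table later.
    frac = min(games_played / float(decay_games), 1.0)
    return start + (end - start) * frac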

The Game

import random

class TicTacToe:
    def __init__(self, playerX, playerO):
        self.board = [' ']*9
        self.playerX, self.playerO = playerX, playerO
        self.playerX_turn = random.choice([True, False])

    def player_wins(self, char):
        for a,b,c in [(0,1,2), (3,4,5), (6,7,8),
                      (0,3,6), (1,4,7), (2,5,8),
                      (0,4,8), (2,4,6)]:
            if char == self.board[a] == self.board[b] == self.board[c]:
                return True
        return False

    def board_full(self):
        return not any([space == ' ' for space in self.board])

    def display_board(self):
        row = " {} | {} | {}"
        hr = "\n-----------\n"
        print (row + hr + row + hr + row).format(*self.board)

    ...
class TicTacToe: # cont'd
    ...

    def play_game(self):
        self.playerX.start_game('X')
        self.playerO.start_game('O')
        while True: #yolo
            if self.playerX_turn:
                player, char, other_player = self.playerX, 'X', self.playerO
            else:
                player, char, other_player = self.playerO, 'O', self.playerX
            if player.breed == "human":
                self.display_board()
            space = player.move(self.board)
            if self.board[space-1] != ' ': # illegal move
                player.reward(-99, self.board) # score of shame
                break
            self.board[space-1] = char
            if self.player_wins(char):
                player.reward(1, self.board)
                other_player.reward(-1, self.board)
                break
            if self.board_full(): # tie game
                player.reward(0.5, self.board)
                other_player.reward(0.5, self.board)
                break
            other_player.reward(0, self.board) # game continues: zero reward, but lets the other player learn from the new board
            self.playerX_turn = not self.playerX_turn

Enter Player 1

class Player(object):
    def __init__(self):
        self.breed = "human"

    def start_game(self, char):
        print "\nNew game!"

    def move(self, board):
        return int(raw_input("Your move? "))

    def reward(self, value, board):
        print "{} rewarded: {}".format(self.breed, value)

    def available_moves(self, board):
        return [i+1 for i in range(0,9) if board[i] == ' ']
class QLearningPlayer(Player):
    def __init__(self, epsilon=0.2, alpha=0.3, gamma=0.9):
        self.breed = "Qlearner"
        self.harm_humans = False
        self.q = {} # (state, action) keys: Q values
        self.epsilon = epsilon # e-greedy chance of random exploration
        self.alpha = alpha # learning rate
        self.gamma = gamma # discount factor for future rewards

    def start_game(self, char):
        self.last_board = (' ',)*9
        self.last_move = None

    ...
class QLearningPlayer(Player): # cont'd
    ...

    def move(self, board):
        self.last_board = tuple(board)
        actions = self.available_moves(board)

        if random.random() < self.epsilon: # explore!
            self.last_move = random.choice(actions)
            return self.last_move

        qs = [self.getQ(self.last_board, a) for a in actions]
        maxQ = max(qs)

        if qs.count(maxQ) > 1:
            # more than 1 best option; choose among them randomly
            best_options = [i for i in range(len(actions)) if qs[i] == maxQ]
            i = random.choice(best_options)
        else:
            i = qs.index(maxQ)

        self.last_move = actions[i]
        return actions[i]

    def learn(self, state, action, reward, result_state):
        prev = self.getQ(state, action)
        next_moves = self.available_moves(result_state)
        # Best estimated follow-up value from the resulting board; 0.0 if the game ended there.
        maxqnew = max([self.getQ(result_state, a) for a in next_moves]) if next_moves else 0.0
        self.q[(state, action)] = prev + self.alpha * ((reward + self.gamma * maxqnew) - prev)
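
The getQ lookup and the reward override are elided from these slides; a minimal sketch of
what they might look like, assuming an optimistic default of 1.0 for unseen pairs so that
untried moves look worth exploring (the default value and the exact bookkeeping are assumptions):

class QLearningPlayer(Player): # sketch of the elided methods (assumed, not from the slides)
    ...

    def getQ(self, state, action):
        # Unseen (state, action) pairs default to an optimistic 1.0, which nudges
        # the learner to try them ("initial Q values affect exploration").
        return self.q.get((state, action), 1.0)

    def reward(self, value, board):
        # Fold the reward for the previous move into the Q table; `board` is the
        # position seen after the opponent replied (or the terminal position).
        if self.last_move is not None:
            self.learn(self.last_board, self.last_move, value, tuple(board))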
class RandomPlayer(Player):
    def __init__(self):
        self.breed = "random"

    def reward(self, value, board):
        pass

    def start_game(self, char):
        pass

    def move(self, board):
        return random.choice(self.available_moves(board))
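
The slides do not show a training loop; a minimal sketch of how the pieces could be wired
together (the number of games and the choice of opponent are assumptions):

# Train the Q-learner against a random opponent, then play it greedily.
p1 = QLearningPlayer()
p2 = RandomPlayer()

for i in range(200000):
    TicTacToe(p1, p2).play_game()

p1.epsilon = 0 # stop exploring; always pick the best-known move
TicTacToe(p1, Player()).play_game() # one game against a human player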
class MinimaxPlayer(Player):
    def __init__(self):
        self.breed = "minimax"
        self.best_moves = {}

    def start_game(self, char):
        self.me = char
        self.enemy = self.other(char)

    def other(self, char):
        return 'O' if char == 'X' else 'X'

    ...
  • Perfect minimax:
    not fast enough

 

  • Minimax with memory:
    not random enough
     

  • Minimax with random selection:
    defeatist

Q-learner vs:

 

  • random player: probability-maximizing strategy

  • minimax player: learns to play against only one policy

  • minimax with random selection: falls for previously unencountered "traps"

class MinimuddledPlayer(MinimaxPlayer):
    def __init__(self):
        super(MinimuddledPlayer, self).__init__()
        self.breed = "muddled"
        self.ideal_player = MinimaxPlayer()

    def start_game(self, char):
        self.ideal_player.me = char
        self.ideal_player.enemy = self.other(char)

    def move(self, board):
        # Play the perfect minimax move 90% of the time; otherwise
        # pick a random legal move, so opponents see occasional mistakes.
        if random.random() > .1:
            return self.ideal_player.move(board)
        else:
            return random.choice(self.available_moves(board))

Next steps:

 

  • "natural" ɛ-greedy decay

  • symmetry (see the sketch after this list)

  • neural net approximation

  • more complex games!
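
For the "symmetry" item, one common approach is to collapse the eight rotations and
reflections of a board into a single canonical key before it is used in the Q table, so
that equivalent positions share values. A sketch (the permutation table and function name
are assumptions; the chosen action would also need to be mapped through the same
permutation, which is omitted here):

SYMMETRIES = [
    (0, 1, 2, 3, 4, 5, 6, 7, 8), # identity
    (6, 3, 0, 7, 4, 1, 8, 5, 2), # rotate 90
    (8, 7, 6, 5, 4, 3, 2, 1, 0), # rotate 180
    (2, 5, 8, 1, 4, 7, 0, 3, 6), # rotate 270
    (2, 1, 0, 5, 4, 3, 8, 7, 6), # mirror left-right
    (6, 7, 8, 3, 4, 5, 0, 1, 2), # mirror top-bottom
    (0, 3, 6, 1, 4, 7, 2, 5, 8), # mirror main diagonal
    (8, 5, 2, 7, 4, 1, 6, 3, 0), # mirror anti-diagonal
]

def canonical(board):
    # Smallest tuple over all eight symmetric views of the board.
    return min(tuple(board[i] for i in perm) for perm in SYMMETRIES)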

Q-learning Tic-tac-toe

By fheisler

AI to play Tic-tac-toe against various strategies using the Q-learning algorithm