Udacity RL - Taxi Problem

2021-12-25

This post works through the Taxi environment in OpenAI Gym, based on the problem from the Udacity RL course. The content below is copied from a notebook that I ran locally.

See the details of the Taxi Gym here: https://github.com/openai/gym/blob/master/gym/envs/toy_text/taxi.py.

OpenAI Gym defines “solving” this task as getting an average return of 9.7 over 100 consecutive trials.
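As a point of reference, a minimal check of that criterion could look like this (just a sketch; scores here stands for a list of per-episode returns, like the one produced by the run function further down):

import numpy as np

def is_solved(scores, threshold=9.7, window=100):
    """True if the mean return over the last `window` episodes meets the threshold."""
    return len(scores) >= window and np.mean(scores[-window:]) >= threshold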

import gym
import numpy as np
import sys

env = gym.make("Taxi-v3").env

env.render()
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))
Action Space Discrete(6)
State Space Discrete(500)
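The 500 states encode the taxi position (a 5x5 grid), the passenger location (4 stops plus in-taxi), and the destination (4 stops), so 25 * 5 * 4 = 500. The encode/decode helpers in the taxi.py source linked above make this easy to inspect (a small sketch; the argument order is taxi row, taxi column, passenger index, destination index):

state = env.encode(3, 1, 2, 0)
print("State:", state)
print("Decoded:", list(env.decode(state)))  # should give back [3, 1, 2, 0]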

Solving Problem with Random Agent

from IPython.display import clear_output
import time

state = env.reset()
score = 0
for t in range(200):
    clear_output(wait=True)
    action = env.action_space.sample() # Pick one of the 6 actions at random
    env.render()
    state, reward, done, _ = env.step(action)
    score += reward
    if done:
        print('done')
        break
    print(f"State: {state}")
    print(f"Action: {action}")
    print(f"Reward: {reward}")
    time.sleep(0.1)
print('Final score:', score)
env.close()
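For a rough baseline before any learning, one can average the random agent's return over a number of episodes (a sketch; the step cap of 200 mirrors the loop above, since the unwrapped env has no time limit):

def random_baseline(env, episodes=100, max_steps=200):
    """Average return of a uniformly random policy."""
    totals = []
    for _ in range(episodes):
        env.reset()
        total = 0
        for _ in range(max_steps):
            _, reward, done, _ = env.step(env.action_space.sample())
            total += reward
            if done:
                break
        totals.append(total)
    return np.mean(totals)

print("Random baseline:", random_baseline(env))

This comes out far below the 9.7 threshold, which gives a sense of how much the Q-learner has to improve.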

Q Learner

class QLearningAgent:
    """Q-Learning agent for the taxi problem."""

    def __init__(self, env, alpha=0.02, gamma=0.99, epsilon=1.0, epsilon_decay_rate=0.9995, min_epsilon=.01, seed=43):
        """Initialize variables, create grid for discretization."""
        # Environment info
        self.env = env
        self.state_size = self.env.observation_space.n
        self.action_size = self.env.action_space.n  # 1-dimensional discrete action space
        self.seed = np.random.seed(seed)
        print("Environment:", self.env)
        print("State space size:", self.state_size)
        print("Action space size:", self.action_size)

        # Learning parameters
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.epsilon = self.initial_epsilon = epsilon  # initial exploration rate
        self.epsilon_decay_rate = epsilon_decay_rate # how quickly should we decrease epsilon
        self.min_epsilon = min_epsilon

        # Create Q-table
        self.q_table = np.zeros((self.state_size, self.action_size))
        print("Q table size:", self.q_table.shape)

    def reset_episode(self, state):
        """Reset variables for a new episode."""
        # Gradually decrease exploration rate
        self.epsilon *= self.epsilon_decay_rate
        self.epsilon = max(self.epsilon, self.min_epsilon)

        # Decide initial action
        self.last_state = state
        self.last_action = np.argmax(self.q_table[self.last_state])
        return self.last_action

    def reset_exploration(self, epsilon=None):
        """Reset exploration rate used when training."""
        self.epsilon = epsilon if epsilon is not None else self.initial_epsilon

    def act(self, state, reward=None, done=None, mode='train'):
        """Pick next action and update internal Q table (when mode != 'test')."""
        if mode == 'test':
            # Test mode: Simply produce an action
            action = np.argmax(self.q_table[state])
        else:
            # Train mode (default): Update Q table, pick next action
            # Note: We update the Q table entry for the *last* (state, action) pair with current state, reward
            self.q_table[self.last_state,self.last_action] += self.alpha * \
                (reward + self.gamma * max(self.q_table[state]) - self.q_table[self.last_state,self.last_action])

            # Exploration vs. exploitation
            do_exploration = np.random.uniform(0, 1) < self.epsilon
            if do_exploration:
                # Pick a random action
                action = np.random.randint(0, self.action_size)
            else:
                # Pick the best action from Q table
                action = np.argmax(self.q_table[state])

        # Roll over current state, action for next step
        self.last_state = state
        self.last_action = action
        return action

def run(agent, env, num_episodes=20000, mode='train'):
    """Run agent in given reinforcement learning environment and return scores."""
    scores = []
    max_avg_score = -np.inf
    for i_episode in range(1, num_episodes+1):
        # Initialize episode
        state = env.reset()
        action = agent.reset_episode(state)
        total_reward = 0
        done = False

        # Roll out steps until done
        while not done:
            state, reward, done, info = env.step(action)
            total_reward += reward
            action = agent.act(state, reward, done, mode)

        # Save final score
        scores.append(total_reward)

        # Print episode stats
        if mode == 'train':
            if len(scores) > 100:
                avg_score = np.mean(scores[-100:])
                if avg_score > max_avg_score:
                    max_avg_score = avg_score
            if i_episode % 100 == 0:
                print("\rEpisode {}/{} | Max Average Score: {}".format(i_episode, num_episodes, max_avg_score), end="")
                sys.stdout.flush()

    return scores

Now I run the Q Learner with different hyperparameter settings and record the results.

q_agent = QLearningAgent(env, epsilon=1)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.5395
q_agent = QLearningAgent(env, epsilon=0.8)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.7437
q_agent = QLearningAgent(env, epsilon=0.8, alpha=0.1)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.537
q_agent = QLearningAgent(env, epsilon=0.8, alpha=0.2)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.632
q_agent = QLearningAgent(env, epsilon=0.8, alpha=0.5)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.374
q_agent = QLearningAgent(env, epsilon=0.8, gamma=0.9)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.2583
q_agent = QLearningAgent(env, epsilon=0.8, gamma=0.5)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 6.7922
q_agent = QLearningAgent(env, epsilon=0.8, gamma=1)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.6249
q_agent = QLearningAgent(env, epsilon=0.1, gamma=0.6, alpha=0.1)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.578
q_agent = QLearningAgent(env, epsilon=0.1, gamma=0.6, alpha=0.1)
scores = run(q_agent, env, 100001)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 100000/100001 | Max Average Score: 8.63
# Test
new_scores = run(q_agent, env, 100, 'test')
np.mean(new_scores)
7.88
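To compare the different settings more easily than eyeballing the final max average score, one option is to plot a rolling average of the returns (a sketch, assuming matplotlib is available and scores is the list returned by run):

import matplotlib.pyplot as plt

def plot_rolling(scores, window=100):
    """Plot the rolling mean of per-episode returns."""
    rolling = [np.mean(scores[max(0, i - window):i]) for i in range(1, len(scores) + 1)]
    plt.plot(rolling)
    plt.xlabel("Episode")
    plt.ylabel("Average return (last {} episodes)".format(window))
    plt.show()

plot_rolling(scores)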