Udacity RL - Taxi Problem
2021-12-25
This works through the Taxi environment from OpenAI Gym, based on the problem in the Udacity RL course. The code below is copied from a notebook that I ran locally.
See the details of the Taxi Gym here: https://github.com/openai/gym/blob/master/gym/envs/toy_text/taxi.py.
OpenAI Gym's leaderboard defines “solving” this task as getting an average return of 9.7 over 100 consecutive trials.
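As a quick sanity check, the threshold that gym itself registers for the environment can be read from its spec; depending on the gym version, the registered value may differ from the 9.7 leaderboard figure:
import gym
print(gym.spec("Taxi-v3").reward_threshold)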
import gym
import numpy as np
import sys
env = gym.make("Taxi-v3").env  # .env unwraps the TimeLimit wrapper, so episodes are not cut off at 200 steps
env.render()
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))
Action Space Discrete(6)
State Space Discrete(500)
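The 500 states come from 25 taxi positions × 5 passenger locations (the four depots plus "in taxi") × 4 destinations. TaxiEnv has a decode helper to unpack a state index into these components:
# Unpack a state index into (taxi_row, taxi_col, passenger_location, destination)
taxi_row, taxi_col, pass_loc, dest_idx = env.decode(env.reset())
print(taxi_row, taxi_col, pass_loc, dest_idx)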
Solving the Problem with a Random Agent
from IPython.display import clear_output
import time
state = env.reset()
score = 0
for t in range(200):
    clear_output(wait=True)
    action = env.action_space.sample()  # Pick one of the 6 actions at random
    env.render()
    state, reward, done, _ = env.step(action)
    score += reward
    if done:
        print('done')
        break
    print(f"State: {state}")
    print(f"Action: {action}")
    print(f"Reward: {reward}")
    time.sleep(0.1)
print('Final score:', score)
env.close()
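The rendered loop above only shows a single episode. To get a baseline number to compare the Q-learner against, here is a small helper I sketched (random_baseline is my own name, not from the course) that averages the random policy's return over many episodes, capped at 200 steps each like the loop above:
def random_baseline(env, episodes=100, max_steps=200):
    totals = []
    for _ in range(episodes):
        env.reset()
        total = 0
        for _ in range(max_steps):
            _, reward, done, _ = env.step(env.action_space.sample())
            total += reward
            if done:
                break
        totals.append(total)
    return np.mean(totals)

print(random_baseline(env))  # strongly negative: random play mostly collects penalties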
Q Learner
class QLearningAgent:
    """Q-Learning agent for the taxi problem."""

    def __init__(self, env, alpha=0.02, gamma=0.99, epsilon=1.0, epsilon_decay_rate=0.9995, min_epsilon=.01, seed=43):
        """Initialize learning parameters and the Q-table."""
        # Environment info
        self.env = env
        self.state_size = self.env.observation_space.n
        self.action_size = self.env.action_space.n  # 1-dimensional discrete action space
        np.random.seed(seed)
        print("Environment:", self.env)
        print("State space size:", self.state_size)
        print("Action space size:", self.action_size)

        # Learning parameters
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.epsilon = self.initial_epsilon = epsilon  # initial exploration rate
        self.epsilon_decay_rate = epsilon_decay_rate  # how quickly we decrease epsilon
        self.min_epsilon = min_epsilon

        # Create Q-table
        self.q_table = np.zeros(shape=(self.state_size, self.action_size))
        print("Q table size:", self.q_table.shape)

    def reset_episode(self, state):
        """Reset variables for a new episode."""
        # Gradually decrease exploration rate
        self.epsilon *= self.epsilon_decay_rate
        self.epsilon = max(self.epsilon, self.min_epsilon)

        # Decide initial action greedily from the Q-table
        self.last_state = state
        self.last_action = np.argmax(self.q_table[self.last_state])
        return self.last_action

    def reset_exploration(self, epsilon=None):
        """Reset exploration rate, used when training."""
        self.epsilon = epsilon if epsilon is not None else self.initial_epsilon

    def act(self, state, reward=None, done=None, mode='train'):
        """Pick next action and update internal Q table (when mode != 'test')."""
        if mode == 'test':
            # Test mode: simply produce the greedy action
            action = np.argmax(self.q_table[state])
        else:
            # Train mode (default): update Q table, pick next action
            # Note: We update the Q table entry for the *last* (state, action) pair
            # with the current state and reward
            self.q_table[self.last_state, self.last_action] += self.alpha * \
                (reward + self.gamma * np.max(self.q_table[state]) - self.q_table[self.last_state, self.last_action])

            # Exploration vs. exploitation
            do_exploration = np.random.uniform(0, 1) < self.epsilon
            if do_exploration:
                # Pick a random action
                action = np.random.randint(0, self.action_size)
            else:
                # Pick the best action from the Q table
                action = np.argmax(self.q_table[state])

        # Roll over current state, action for next step
        self.last_state = state
        self.last_action = action
        return action
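A note on the epsilon schedule: decaying by 0.9995 per episode from 1.0, epsilon reaches the 0.01 floor after about ln(0.01)/ln(0.9995) ≈ 9,200 episodes, so roughly the first half of a 20,000-episode run still explores. A tiny check:
import math
# Episodes until epsilon decays from 1.0 down to the 0.01 floor
print(math.log(0.01) / math.log(0.9995))  # ~9208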
def run(agent, env, num_episodes=20000, mode='train'):
    """Run agent in given reinforcement learning environment and return scores."""
    scores = []
    max_avg_score = -np.inf
    for i_episode in range(1, num_episodes + 1):
        # Initialize episode
        state = env.reset()
        action = agent.reset_episode(state)
        total_reward = 0
        done = False

        # Roll out steps until done
        while not done:
            state, reward, done, info = env.step(action)
            total_reward += reward
            action = agent.act(state, reward, done, mode)

        # Save final score
        scores.append(total_reward)

        # Print episode stats
        if mode == 'train':
            if len(scores) > 100:
                avg_score = np.mean(scores[-100:])
                if avg_score > max_avg_score:
                    max_avg_score = avg_score
            if i_episode % 100 == 0:
                print("\rEpisode {}/{} | Max Average Score: {}".format(i_episode, num_episodes, max_avg_score), end="")
                sys.stdout.flush()
    return scores
Now I run the Q-learner with different hyperparameter settings and record the results.
q_agent = QLearningAgent(env, epsilon=1)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.5395
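Rather than just the max, the whole learning curve is worth a look; a 100-episode rolling average of the returned scores can be plotted (assuming matplotlib is available):
import matplotlib.pyplot as plt

# 100-episode rolling average of the returns from the last run
rolling = np.convolve(scores, np.ones(100) / 100, mode='valid')
plt.plot(rolling)
plt.xlabel('Episode')
plt.ylabel('Average return over last 100 episodes')
plt.show()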
q_agent = QLearningAgent(env, epsilon=0.8)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.7437
q_agent = QLearningAgent(env, epsilon=0.8, alpha=0.1)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.537
q_agent = QLearningAgent(env, epsilon=0.8, alpha=0.2)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.632
q_agent = QLearningAgent(env, epsilon=0.8, alpha=0.5)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.374
q_agent = QLearningAgent(env, epsilon=0.8, gamma=0.9)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.2583
q_agent = QLearningAgent(env, epsilon=0.8, gamma=0.5)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 6.7922
q_agent = QLearningAgent(env, epsilon=0.8, gamma=1)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.6249
q_agent = QLearningAgent(env, epsilon=0.1, gamma=0.6, alpha=0.1)
scores = run(q_agent, env)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 20000/20000 | Max Average Score: 8.578
q_agent = QLearningAgent(env, epsilon=0.1, gamma=0.6, alpha=0.1)
scores = run(q_agent, env, 100001)
Environment: <TaxiEnv<Taxi-v3>>
State space size: 500
Action space size: 6
Q table size: (500, 6)
Episode 100000/100001 | Max Average Score: 8.63
# Test
new_scores = run(q_agent, env, 100, 'test')
np.mean(new_scores)
7.88
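Finally, to watch the trained policy in action, here is a greedy rollout with rendering, following the same pattern as the random-agent loop and reusing clear_output and time from above:
state = env.reset()
done = False
while not done:
    clear_output(wait=True)
    env.render()
    # Always take the greedy action from the learned Q-table
    state, reward, done, _ = env.step(np.argmax(q_agent.q_table[state]))
    time.sleep(0.5)
env.close()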