Commit d744568b by 20200318029

homework8

parent e7f5bc75
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from skimage.color import rgb2gray
from skimage.transform import resize
from collections import deque
import os.path
import os, time
# Define Hyperparameters
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_float('optimiser_learning_rate', 0.00025, help='learning rate for optimiser')
tf.flags.DEFINE_integer('observe_step_num', 100000, help='number of steps to observe before choosing actions')
tf.flags.DEFINE_integer('batch_size', 32, help='size of batch')
tf.flags.DEFINE_float('initial_epsilon', 1.0, help='initial value for epsilon')
tf.flags.DEFINE_integer('epsilon_anneal_num', 500000, help='number of steps over which to anneal epsilon')
tf.flags.DEFINE_float('final_epsilon', 0.01, help='final value for epsilon')
tf.flags.DEFINE_float('gamma', 0.99, help='decay rate for future reward')
tf.flags.DEFINE_integer('replay_memory', 200000, 'number of previous transitions to remember') # 200000 = 10GB RAM
tf.flags.DEFINE_integer('n_episodes', 100000, 'number of episodes to let the model train for')
tf.flags.DEFINE_integer('no_op_steps', 2, 'number of steps to do nothing at start of each episode')
tf.flags.DEFINE_integer('update_target_model_steps', 100000, 'update target Q model every n steps')
tf.flags.DEFINE_string('train_dir', 'MK7_train_data', 'location for training data and logs')
# tf.flags.DEFINE_boolean('render', True, 'whether to render the image')
tf.flags.DEFINE_boolean('render', False, 'whether to render the image')
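# Training schedule implied by the flags above: the agent acts randomly for the first
# observe_step_num steps to fill the replay memory, then trains while epsilon anneals
# from initial_epsilon down to final_epsilon over the next epsilon_anneal_num steps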
Input_shape = (84, 84, 4) # input image size to model
Action_size = 3
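# The network input is a stack of the 4 most recent 84x84 grey-scale frames.
# Pong only needs 3 of its actions; real_action = action + 1 in the training loop maps the
# model's {0, 1, 2} onto the environment's {1, 2, 3} (roughly: stay/fire, move up, move down)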
# Create a pre-processing function
# Converts colour image into a smaller grey-scale image
# Converts floats to integers
def pre_processing(observation):
    # Converting to grey normalises the values: [0,255] -> [0,1]
    # We need to store the values as integers in the next step to save space, so we undo the normalisation
    processed_observation = rgb2gray(observation) * 255
    processed_observation = resize(processed_observation, (84, 84), mode='constant')
    # Convert from floating point to integers ranging from [0,255]
    processed_observation = np.uint8(processed_observation)
    return processed_observation
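# For example, a raw Atari RGB frame of shape (210, 160, 3) comes back as a uint8 array of
# shape (84, 84), which keeps each stored transition in the replay memory small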
# Create the model to approximate Qvalues for a given set of states and actions
def atari_model():
    # Task-1: fill in the network structure of atari_model
    # Define the inputs
    frames_input = keras.Input(shape=Input_shape, name='frames')
    actions_input = keras.layers.Input(shape=(Action_size,), name='action_mask')
    # Normalise the inputs from [0,255] to [0,1] - to make processing easier
    normalised = keras.layers.Lambda(lambda x: x / 255.0, name='normalised')(frames_input)
    # Conv1 is 16 8x8 filters with a stride of 4 and a ReLU
    conv1 = keras.layers.Conv2D(16, (8, 8), strides=(4, 4), activation='relu', name='conv1')(normalised)
    # Conv2 is 32 4x4 filters with a stride of 2 and a ReLU
    conv2 = keras.layers.Conv2D(32, (4, 4), strides=(2, 2), activation='relu', name='conv2')(conv1)
    # Flatten the output from Conv2
    conv2_flatten = keras.layers.Flatten()(conv2)
    # Then a fully connected layer with 128 ReLU units
    dense1 = keras.layers.Dense(128, activation='relu', name='dense1')(conv2_flatten)
    # Then a fully connected layer with a unit to map to each of the actions and no activation
    output = keras.layers.Dense(Action_size, name='output')(dense1)
    # Then we multiply the output by the action mask
    # When trying to find the value of all the actions this will be a mask full of 1s
    # When trying to find the value of a specific action, the mask will only be 1 for a single action
    filtered_output = keras.layers.Multiply(name='Qvalue')([output, actions_input])
    # Create a model that maps frames and actions to the filtered output
    model = keras.Model(inputs=[frames_input, actions_input], outputs=filtered_output)
    # Print a summary of the model
    model.summary()
    # Define optimiser
    optimiser = tf.train.AdamOptimizer(learning_rate=FLAGS.optimiser_learning_rate)
    # Task-2: choose the loss function
    # Huber loss is the usual choice for DQN - it behaves like MSE for small errors and like
    # MAE for large ones, which keeps gradients from exploding on noisy targets
    loss = tf.losses.huber_loss
    # Compile model
    model.compile(optimizer=optimiser, loss=loss)
    # Return the model
    return model
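# To get Q-values for every action at once, pass a mask of ones alongside the frame stack,
# e.g. model.predict([state_history, np.ones((1, Action_size))]) - this is what get_action() does below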
# Create a model to use as a target
def atari_model_target():
    # Task-3: the target network uses exactly the same architecture as the online network,
    # so we simply build a second copy; its weights are overwritten from the online model
    # periodically during training (see set_weights in the training loop)
    model = atari_model()
    return model
# get action from model using epsilon-greedy policy
def get_action(history, epsilon, step, model):
    if np.random.rand() <= epsilon or step <= FLAGS.observe_step_num:
        return random.randrange(Action_size)
    else:
        q_value = model.predict([history, np.ones(Action_size).reshape(1, Action_size)])
        return np.argmax(q_value[0])
# save sample <s,a,r,s'> to the replay memory
def store_memory(memory, history, action, reward, next_history):
    memory.append((history, action, reward, next_history))
def get_one_hot(targets, nb_classes):
    return np.eye(nb_classes)[np.array(targets).reshape(-1)]
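# For example, get_one_hot([2, 0], 3) returns [[0, 0, 1], [1, 0, 0]] - one row per sampled action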
# train model by random batch
def train_memory_batch(memory, model):
    # Sample a minibatch
    mini_batch = random.sample(memory, FLAGS.batch_size)
    # Create empty arrays to load our minibatch into
    # These objects have multiple values, hence need defined shapes
    state = np.zeros((FLAGS.batch_size, Input_shape[0], Input_shape[1], Input_shape[2]))
    next_state = np.zeros((FLAGS.batch_size, Input_shape[0], Input_shape[1], Input_shape[2]))
    # These objects have a single value, so we can just create lists that we append to later
    action = []
    reward = []
    # Create an array that will carry the target q values, computed from the q values of the next states
    target_q = np.zeros((FLAGS.batch_size,))
    # Fill up our arrays with our minibatch
    for id, val in enumerate(mini_batch):
        state[id] = val[0]
        next_state[id] = val[3]
        action.append(val[1])
        reward.append(val[2])
    # We want the model to predict the q value for all actions, hence:
    actions_mask = np.ones((FLAGS.batch_size, Action_size))
    # Predict the q values for all actions in the next states
    next_q_values = model.predict([next_state, actions_mask])
    # Fill out target q values based on the max q value in the next state
    for i in range(FLAGS.batch_size):
        # Standard discounted reward formula
        # q(s,a) = r + discount * max_a' q(s',a')
        target_q[i] = reward[i] + FLAGS.gamma * np.amax(next_q_values[i])
    # Convert all the actions into one hot vectors
    action_one_hot = get_one_hot(action, Action_size)
    # Apply the one hot mask onto the target vector
    # This results in a vector that has the target q value in the position corresponding to the action
    target_one_hot = action_one_hot * target_q[:, None]
    # Then we fit the model
    # We map the state and the action from the memory bank to the q value of that state action pair
    # s,a -> q(s,a|w)
    h = model.fit([state, action_one_hot], target_one_hot, epochs=1, batch_size=FLAGS.batch_size, verbose=0)
    # Return the loss - it's just for monitoring progress
    return h.history['loss'][0]
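# Worked example of the mask trick: if the sampled action was 1 and its target value is 0.5,
# then action_one_hot = [0, 1, 0] and target_one_hot = [0, 0.5, 0]. Because the model's output
# is multiplied by the same mask, the loss only penalises the Q-value of the action actually taken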
def train():
    # Define which game to play
    env = gym.make('PongDeterministic-v4')
    # Create a space for our memory
    # We will use a deque - a double-ended queue with a maximum size,
    # so that once all the space is filled, the oldest entries are removed to make room for new ones
    memory = deque(maxlen=FLAGS.replay_memory)
    # Start episode counter
    episode_number = 0
    # Set epsilon
    epsilon = FLAGS.initial_epsilon
    # Define epsilon decay
    epsilon_decay = (FLAGS.initial_epsilon - FLAGS.final_epsilon) / FLAGS.epsilon_anneal_num
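    # With the default flags this anneals epsilon by (1.0 - 0.01) / 500000, about 2e-6 per step,
    # i.e. from 1.0 down to 0.01 over the 500000 steps that follow the observation phase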
    # Start global step
    global_step = 0
    # Define model
    model = atari_model()
    # Define target model
    model_target = atari_model_target()
    # Define where to store logs
    log_dir = "{}/run-{}-log".format(FLAGS.train_dir, 'MK10')
    # Pass the graph to TensorBoard
    file_writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())
    # Start the optimisation loop
    while episode_number < FLAGS.n_episodes:
        # Initialise done as false
        done = False
        step = 0
        score = 0
        loss = 0.0
        # Initialise environment
        observation = env.reset()
        # For the very start of the episode, we will do nothing but observe
        # This way we can get a sense of what's going on
        for _ in range(random.randint(1, FLAGS.no_op_steps)):
            observation, _, _, _ = env.step(1)
        # At the start of the episode there are no preceding frames
        # So we just copy the initial state into a stack to make the state history
        state = pre_processing(observation)
        state_history = np.stack((state, state, state, state), axis=2)
        state_history = np.reshape([state_history], (1, 84, 84, 4))
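        # state_history now has shape (1, 84, 84, 4): a batch of one observation made of
        # four stacked 84x84 frames, matching Input_shape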
        # Play steps until the episode is over (in Pong, when either side reaches 21 points)
        while not done:
            # Render the image if selected to do so
            if FLAGS.render:
                env.render()
            # Select an action based on our current model
            # Task-4: in standard DQN the behaviour policy uses the online network,
            # while the target network is only used for the bootstrap targets
            select_model = model
            action = get_action(state_history, epsilon, global_step, select_model)
            # Convert the action from the model's index to the environment's action number
            real_action = action + 1
            # After we're done observing, start scaling down epsilon
            if global_step > FLAGS.observe_step_num and epsilon > FLAGS.final_epsilon:
                epsilon -= epsilon_decay
            # Record output from the environment
            observation, reward, done, info = env.step(real_action)
            # Process the observation
            next_state = pre_processing(observation)
            next_state = np.reshape([next_state], (1, 84, 84, 1))
            # Update the history with the next state - also drop the oldest frame
            state_history_w_next = np.append(next_state, state_history[:, :, :, :3], axis=3)
            # Update score
            score += reward
            # Save the (s, a, r, s') tuple to memory
            store_memory(memory, state_history, action, reward, state_history_w_next)
            # Train the model once we are done observing
            if global_step > FLAGS.observe_step_num:
                loss = loss + train_memory_batch(memory, model)
                # Check if we are ready to update the target model with the weights we have been training
                if global_step % FLAGS.update_target_model_steps == 0:
                    model_target.set_weights(model.get_weights())
                    print("UPDATING TARGET WEIGHTS")
            state_history = state_history_w_next
            # print("step: ", global_step)
            global_step += 1
            step += 1
        # Check if the episode is over - in Pong, when either player reaches 21 points
        if done:
            # Check if we are still observing
            if global_step <= FLAGS.observe_step_num:
                current_position = 'observe'
            # Check if we are still annealing epsilon
            elif FLAGS.observe_step_num < global_step <= FLAGS.observe_step_num + FLAGS.epsilon_anneal_num:
                current_position = 'explore'
            else:
                current_position = 'train'
            # Print status
            print(
                'current position: {}, epsilon: {} , episode: {}, score: {}, global_step: {}, avg loss: {}, step: {}, memory length: {}'
                .format(current_position, epsilon, episode_number, score, global_step, loss / float(step), step,
                        len(memory)))
            # Save the model every 100 episodes and on the final episode
            if episode_number % 100 == 0 or (episode_number + 1) == FLAGS.n_episodes:
                file_name = "pong_model_{}.h5".format(episode_number)
                model_path = os.path.join(FLAGS.train_dir, file_name)
                model.save(model_path)
            # Add loss and score data to TensorBoard
            loss_summary = tf.Summary(
                value=[tf.Summary.Value(tag="loss", simple_value=loss / float(step))])
            file_writer.add_summary(loss_summary, global_step=episode_number)
            score_summary = tf.Summary(
                value=[tf.Summary.Value(tag="score", simple_value=score)])
            file_writer.add_summary(score_summary, global_step=episode_number)
            # Increment the episode number
            episode_number += 1
    file_writer.close()
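# Training curves logged above can be inspected with TensorBoard, e.g. `tensorboard --logdir MK7_train_data`
# (assuming the default value of the train_dir flag)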
if __name__ == "__main__":
    t_start = time.time()
    train()
    t_end = time.time()
    t_all = t_end - t_start
    print('train.py: whole time: {:.2f} h ({:.2f} min)'.format(t_all / 3600., t_all / 60.))