import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from skimage.color import rgb2gray
from skimage.transform import resize
from collections import deque
import os.path
import os, time

# Define hyperparameters
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_float('optimiser_learning_rate', 0.00025, help='learning rate for the optimiser')
tf.flags.DEFINE_integer('observe_step_num', 100000, help='number of steps to observe before choosing actions')
tf.flags.DEFINE_integer('batch_size', 32, help='size of a training batch')
tf.flags.DEFINE_float('initial_epsilon', 1.0, help='initial value for epsilon')
tf.flags.DEFINE_integer('epsilon_anneal_num', 500000, help='number of steps over which to anneal epsilon')
tf.flags.DEFINE_float('final_epsilon', 0.01, help='final value for epsilon')
tf.flags.DEFINE_float('gamma', 0.99, help='discount rate for future rewards')
tf.flags.DEFINE_integer('replay_memory', 200000, 'number of previous transitions to remember')  # 200000 ~ 10 GB RAM
tf.flags.DEFINE_integer('n_episodes', 100000, 'number of episodes to let the model train for')
tf.flags.DEFINE_integer('no_op_steps', 2, 'number of steps to do nothing at the start of each episode')
tf.flags.DEFINE_integer('update_target_model_steps', 100000, 'update the target Q model every n steps')
tf.flags.DEFINE_string('train_dir', 'MK7_train_data', 'location for training data and logs')
# tf.flags.DEFINE_boolean('render', True, 'whether to render the image')
tf.flags.DEFINE_boolean('render', False, 'whether to render the image')

Input_shape = (84, 84, 4)  # input image size to the model
Action_size = 3  # we only use three of Pong's actions (shifted by +1 when stepping the env)


# Pre-processing function:
# converts the colour observation into a smaller grey-scale image
# and stores it as integers to save space.
def pre_processing(observation):
    processed_observation = rgb2gray(observation) * 255
    # Converting to grey-scale also normalises the values [0,255] -> [0,1].
    # We need to store the values as integers in the next step to save space, so we undo the normalisation.
    processed_observation = resize(processed_observation, (84, 84), mode='constant')
    # Convert from floating point to integers ranging from [0,255]
    processed_observation = np.uint8(processed_observation)
    return processed_observation


# Create the model to approximate Q-values for a given set of states and actions
def atari_model():
    # Task-1: fill in the network structure of atari_model
    # Define the inputs
    frames_input = keras.Input(shape=Input_shape, name='frames')
    actions_input = keras.layers.Input(shape=(Action_size,), name='action_mask')
    # Normalise the inputs from [0,255] to [0,1] - to make processing easier
    normalised = keras.layers.Lambda(lambda x: x / 255.0, name='normalised')(frames_input)
    # Conv1 is 16 8x8 filters with a stride of 4 and a ReLU
    conv1 = '?'
    # Conv2 is 32 4x4 filters with a stride of 2 and a ReLU
    conv2 = '?'
    # Flatten the output from Conv2
    conv2_flatten = keras.layers.Flatten()(conv2)
    # Then a fully connected layer with 128 ReLU units
    dense1 = '?'
    # Then a fully connected layer with one unit per action and no activation
    output = '?'
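    # A minimal sketch of one possible fill-in for Task-1, assuming the layer sizes
    # described in the comments above (illustrative only; kept commented out so the
    # placeholders remain the point of the exercise):
    # conv1 = keras.layers.Conv2D(16, (8, 8), strides=(4, 4), activation='relu', name='conv1')(normalised)
    # conv2 = keras.layers.Conv2D(32, (4, 4), strides=(2, 2), activation='relu', name='conv2')(conv1)
    # dense1 = keras.layers.Dense(128, activation='relu', name='dense1')(conv2_flatten)
    # output = keras.layers.Dense(Action_size, name='output')(dense1)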
    # Then we multiply the output by the action mask.
    # When trying to find the value of all the actions, the mask is full of 1s.
    # When trying to find the value of a specific action, the mask is only 1 for that single action.
    filtered_output = keras.layers.Multiply(name='Qvalue')([output, actions_input])
    # Create a model that maps frames and actions to the filtered output
    model = keras.Model(inputs=[frames_input, actions_input], outputs=filtered_output)
    # Print a summary of the model
    model.summary()
    # Define the optimiser
    optimiser = tf.train.AdamOptimizer(learning_rate=FLAGS.optimiser_learning_rate)
    # Compile the model
    loss = '?'  # Task-2: choose the loss function
    model.compile(optimizer=optimiser, loss=loss)
    # Return the model
    return model


# Create a model to use as a target
def atari_model_target():
    # Task-3: implement the code for atari_model_target
    model = '?'
    return model
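# Sketches for Task-2 and Task-3 (assumptions, not the required answers):
# - Task-2: a simple choice that works with this scaffold is loss = 'mse'; the DQN
#   paper effectively uses an error-clipped (Huber-style) loss, which could be
#   supplied as a custom Keras loss function instead.
# - Task-3: the target network is normally just a second copy of the same
#   architecture; its weights are overwritten periodically from the online model
#   via set_weights() in train() below, so one option is simply:
#
#       def atari_model_target():
#           model = atari_model()
#           return model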
# Get an action from the model using an epsilon-greedy policy
def get_action(history, epsilon, step, model):
    if np.random.rand() <= epsilon or step <= FLAGS.observe_step_num:
        return random.randrange(Action_size)
    else:
        q_value = model.predict([history, np.ones(Action_size).reshape(1, Action_size)])
        return np.argmax(q_value[0])


# Save a sample <s, a, r, s'> to the replay memory
def store_memory(memory, history, action, reward, next_history):
    memory.append((history, action, reward, next_history))


def get_one_hot(targets, nb_classes):
    return np.eye(nb_classes)[np.array(targets).reshape(-1)]


# Train the model on a random batch from the replay memory
def train_memory_batch(memory, model):
    # Sample a minibatch
    mini_batch = random.sample(memory, FLAGS.batch_size)
    # Create empty arrays to load our minibatch into.
    # These objects have multiple values, hence they need defined shapes.
    state = np.zeros((FLAGS.batch_size, Input_shape[0], Input_shape[1], Input_shape[2]))
    next_state = np.zeros((FLAGS.batch_size, Input_shape[0], Input_shape[1], Input_shape[2]))
    # These objects have a single value, so we can just create lists that we append to later
    action = []
    reward = []
    # Create an array that will carry the target Q-values
    target_q = np.zeros((FLAGS.batch_size,))
    # Fill up our arrays with our minibatch
    for id, val in enumerate(mini_batch):
        state[id] = val[0]
        # print(val[0].shape)  # debug
        next_state[id] = val[3]
        action.append(val[1])
        reward.append(val[2])
    # We want the model to predict the Q-value for all actions, hence:
    actions_mask = np.ones((FLAGS.batch_size, Action_size))
    # Predict the Q-values for all actions in the next state
    next_q_values = model.predict([next_state, actions_mask])
    # Fill out the target Q-values based on the max Q-value in the next state
    for i in range(FLAGS.batch_size):
        # Standard discounted reward formula:
        # q(s,a) = r + discount * max future reward
        target_q[i] = reward[i] + FLAGS.gamma * np.amax(next_q_values[i])
    # Convert all the actions into one-hot vectors
    action_one_hot = get_one_hot(action, Action_size)
    # Apply the one-hot mask onto the target vector.
    # This results in a vector that has the target Q-value in the position corresponding to the action.
    target_one_hot = action_one_hot * target_q[:, None]
    # Then we fit the model.
    # We map the state and the action from the memory bank to the Q-value of that state-action pair:
    # s,a -> q(s,a|w)
    h = model.fit([state, action_one_hot], target_one_hot, epochs=1, batch_size=FLAGS.batch_size, verbose=0)
    # Return the loss - it's just for monitoring progress
    return h.history['loss'][0]


def train():
    # Define which game to play
    env = gym.make('PongDeterministic-v4')
    # Create a space for our memory.
    # We will use a deque - a double-ended queue with a max size - so that once all
    # the space is filled, older entries are removed to make room for new ones.
    memory = deque(maxlen=FLAGS.replay_memory)
    # Start episode counter
    episode_number = 0
    # Set epsilon
    epsilon = FLAGS.initial_epsilon
    # Define epsilon decay
    epsilon_decay = (FLAGS.initial_epsilon - FLAGS.final_epsilon) / FLAGS.epsilon_anneal_num
    # Start global step
    global_step = 0
    # Define the model
    model = atari_model()
    # Define the target model
    model_target = atari_model_target()
    # Define where to store logs
    log_dir = "{}/run-{}-log".format(FLAGS.train_dir, 'MK10')
    # Pass the graph to TensorBoard
    file_writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())
    # Start the optimisation loop
    while episode_number < FLAGS.n_episodes:
        # Initialise done as False
        done = False
        step = 0
        score = 0
        loss = 0.0
        # Initialise the environment
        observation = env.reset()
        # For the very start of the episode, we will do nothing but observe.
        # This way we can get a sense of what's going on.
        for _ in range(random.randint(1, FLAGS.no_op_steps)):
            observation, _, _, _ = env.step(1)
        # At the start of the episode there are no preceding frames,
        # so we just copy the initial state into a stack to make the state history.
        state = pre_processing(observation)
        state_history = np.stack((state, state, state, state), axis=2)
        state_history = np.reshape([state_history], (1, 84, 84, 4))
        # Play until the episode is over
        while not done:
            # Render the image if selected to do so
            if FLAGS.render:
                env.render()
            # Select an action based on our current model
            # Task-4: select_model = model_target or select_model = model
            select_model = '?'
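            # One reasonable choice for Task-4 (an assumption, not the required answer):
            # act with the online network, i.e. select_model = model; in standard DQN
            # the target network is used for computing training targets rather than
            # for action selection.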
            action = get_action(state_history, epsilon, global_step, select_model)
            # Convert the action index into the environment's action id
            real_action = action + 1
            # After we're done observing, start scaling down epsilon
            if global_step > FLAGS.observe_step_num and epsilon > FLAGS.final_epsilon:
                epsilon -= epsilon_decay
            # Record the output from the environment
            observation, reward, done, info = env.step(real_action)
            # Process the observation
            next_state = pre_processing(observation)
            next_state = np.reshape([next_state], (1, 84, 84, 1))
            # Update the history with the next state - also remove the oldest state
            state_history_w_next = np.append(next_state, state_history[:, :, :, :3], axis=3)
            # Update the score
            score += reward
            # Save the (s, a, r, s') set to memory
            store_memory(memory, state_history, action, reward, state_history_w_next)
            # Train the model - check if we are done observing
            if global_step > FLAGS.observe_step_num:
                loss = loss + train_memory_batch(memory, model)
                # Check if we are ready to update the target model with the model we have been training
                if global_step % FLAGS.update_target_model_steps == 0:
                    model_target.set_weights(model.get_weights())
                    print("UPDATING TARGET WEIGHTS")
            state_history = state_history_w_next
            # print("step: ", global_step)
            global_step += 1
            step += 1
            # Check if the episode is over (for Pong, one player has reached 21 points)
            if done:
                # Check if we are still observing
                if global_step <= FLAGS.observe_step_num:
                    current_position = 'observe'
                # Check if we are still annealing epsilon
                elif FLAGS.observe_step_num < global_step <= FLAGS.observe_step_num + FLAGS.epsilon_anneal_num:
                    current_position = 'explore'
                else:
                    current_position = 'train'
                # Print status
                print(
                    'current position: {}, epsilon: {}, episode: {}, score: {}, global_step: {}, avg loss: {}, step: {}, memory length: {}'
                    .format(current_position, epsilon, episode_number, score, global_step,
                            loss / float(step), step, len(memory)))
                # Save the model every 100 episodes and on the final episode
                if episode_number % 100 == 0 or (episode_number + 1) == FLAGS.n_episodes:
                    file_name = "pong_model_{}.h5".format(episode_number)
                    model_path = os.path.join(FLAGS.train_dir, file_name)
                    model.save(model_path)
                # Add loss and score data to TensorBoard
                loss_summary = tf.Summary(
                    value=[tf.Summary.Value(tag="loss", simple_value=loss / float(step))])
                file_writer.add_summary(loss_summary, global_step=episode_number)
                score_summary = tf.Summary(
                    value=[tf.Summary.Value(tag="score", simple_value=score)])
                file_writer.add_summary(score_summary, global_step=episode_number)
                # Increment the episode number
                episode_number += 1

    file_writer.close()


if __name__ == "__main__":
    t_start = time.time()
    train()
    t_end = time.time()
    t_all = t_end - t_start
    print('train.py: whole time: {:.2f} h ({:.2f} min)'.format(t_all / 3600., t_all / 60.))
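
# Hypothetical usage note (not part of the original scaffold): to resume training or
# evaluate a saved checkpoint, one could rebuild the network and restore its weights,
# for example:
#
#   model = atari_model()
#   model.load_weights(os.path.join(FLAGS.train_dir, 'pong_model_100.h5'))
#
# where 'pong_model_100.h5' stands in for one of the checkpoints written by train()
# above; models saved with model.save() can also be reloaded with
# keras.models.load_model(path, compile=False) if the custom optimiser is not needed.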