From d744568bb571f072bd1bf8461578dd1b4718629f Mon Sep 17 00:00:00 2001
From: xiaoy <zhaoyin214@qq.com>
Date: Wed, 12 Aug 2020 19:27:40 +0800
Subject: [PATCH] homework8

---
 homework8/table_tennis.py | 292 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 292 insertions(+)
 create mode 100644 homework8/table_tennis.py

diff --git a/homework8/table_tennis.py b/homework8/table_tennis.py
new file mode 100644
index 0000000..665146a
--- /dev/null
+++ b/homework8/table_tennis.py
@@ -0,0 +1,292 @@
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from skimage.color import rgb2gray
from skimage.transform import resize
from collections import deque
import os
import os.path
import time

# Define hyperparameters
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_float('optimiser_learning_rate', 0.00025, help='learning rate for optimiser')
tf.flags.DEFINE_integer('observe_step_num', 100000, help='number of steps to observe before choosing actions')
tf.flags.DEFINE_integer('batch_size', 32, help='size of batch')
tf.flags.DEFINE_float('initial_epsilon', 1.0, help='initial value for epsilon')
tf.flags.DEFINE_integer('epsilon_anneal_num', 500000, help='number of steps over which to anneal epsilon')
tf.flags.DEFINE_float('final_epsilon', 0.01, help='final value for epsilon')
tf.flags.DEFINE_float('gamma', 0.99, help='discount rate for future rewards')
tf.flags.DEFINE_integer('replay_memory', 200000, 'number of previous transitions to remember')  # 200000 ~ 10 GB RAM
tf.flags.DEFINE_integer('n_episodes', 100000, 'number of episodes to let the model train for')
tf.flags.DEFINE_integer('no_op_steps', 2, 'number of steps to do nothing at the start of each episode')
tf.flags.DEFINE_integer('update_target_model_steps', 100000, 'update the target Q model every n steps')
tf.flags.DEFINE_string('train_dir', 'MK7_train_data', 'location for training data and logs')
# tf.flags.DEFINE_boolean('render', True, 'whether to render the image')
tf.flags.DEFINE_boolean('render', False, 'whether to render the image')

Input_shape = (84, 84, 4)  # input image size to the model
Action_size = 3

# Pre-processing function:
# converts the colour frame into a smaller grey-scale image and stores it as integers
def pre_processing(observation):
    processed_observation = rgb2gray(observation) * 255
    # Converting to grey normalises the values from [0,255] to [0,1];
    # we want to store the values as integers to save space, so we undo the normalisation
    processed_observation = resize(processed_observation, (84, 84), mode='constant')
    # Convert from floating point to integers in the range [0,255]
    processed_observation = np.uint8(processed_observation)
    return processed_observation

# Create the model to approximate Q-values for a given set of states and actions
def atari_model():
    # Task-1: fill in the network structure of atari_model
    # Define the inputs
    frames_input = keras.Input(shape=Input_shape, name='frames')
    actions_input = keras.layers.Input(shape=(Action_size,), name='action_mask')

    # Normalise the inputs from [0,255] to [0,1] - to make processing easier
    normalised = keras.layers.Lambda(lambda x: x / 255.0, name='normalised')(frames_input)
    # Conv1 is 16 8x8 filters with a stride of 4 and a ReLU
    conv1 = '?'
    # Conv2 is 32 4x4 filters with a stride of 2 and a ReLU
    conv2 = '?'
    # Flatten the output from Conv2
    conv2_flatten = keras.layers.Flatten()(conv2)
    # Then a fully connected layer with 128 ReLU units
    dense1 = '?'
    # Then a fully connected layer with one unit per action and no activation
    output = '?'
    # Then we multiply the output by the action mask:
    # when we want the value of all the actions, the mask is all 1s;
    # when we want the value of a specific action, the mask is 1 only for that action
    filtered_output = keras.layers.Multiply(name='Qvalue')([output, actions_input])

    # Create a model that maps frames and actions to the filtered output
    model = keras.Model(inputs=[frames_input, actions_input], outputs=filtered_output)
    # Print a summary of the model
    model.summary()
    # Define the optimiser, using the learning-rate flag defined above
    optimiser = tf.train.AdamOptimizer(learning_rate=FLAGS.optimiser_learning_rate)
    # Compile the model
    loss = '?'  # Task-2: choose the loss function
    model.compile(optimizer=optimiser, loss=loss)
    # Return the model
    return model
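
# A possible completed version of atari_model(), offered only as a hedged sketch of
# Task-1 and Task-2. The layer sizes follow the comments above; the function name
# 'atari_model_example' and the Huber loss are assumptions, not necessarily the
# intended homework answers ('mse' would also be a reasonable loss).
def atari_model_example():
    frames_input = keras.Input(shape=Input_shape, name='frames')
    actions_input = keras.layers.Input(shape=(Action_size,), name='action_mask')
    normalised = keras.layers.Lambda(lambda x: x / 255.0, name='normalised')(frames_input)
    # 16 8x8 filters, stride 4, ReLU
    conv1 = keras.layers.Conv2D(16, (8, 8), strides=(4, 4), activation='relu', name='conv1')(normalised)
    # 32 4x4 filters, stride 2, ReLU
    conv2 = keras.layers.Conv2D(32, (4, 4), strides=(2, 2), activation='relu', name='conv2')(conv1)
    conv2_flatten = keras.layers.Flatten()(conv2)
    # Fully connected layer with 128 ReLU units
    dense1 = keras.layers.Dense(128, activation='relu', name='dense1')(conv2_flatten)
    # Linear output layer with one Q-value per action
    output = keras.layers.Dense(Action_size, name='output')(dense1)
    filtered_output = keras.layers.Multiply(name='Qvalue')([output, actions_input])
    model = keras.Model(inputs=[frames_input, actions_input], outputs=filtered_output)
    # Huber loss is a common DQN choice because it is less sensitive to large TD errors than MSE
    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=FLAGS.optimiser_learning_rate),
                  loss=tf.losses.huber_loss)
    return model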

# Create a model to use as a target network
def atari_model_target():
    # Task-3: implement the code for atari_model_target
    model = '?'
    return model
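
# A hedged sketch for Task-3: the target network normally has exactly the same
# architecture as the online network, so one option (an assumption, not the only
# valid answer) is simply to build the same Keras graph again, e.g.
#     def atari_model_target():
#         return atari_model()
# and rely on model_target.set_weights(model.get_weights()) in train() to copy the
# online weights across.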

# Get an action from the model using an epsilon-greedy policy
def get_action(history, epsilon, step, model):
    if np.random.rand() <= epsilon or step <= FLAGS.observe_step_num:
        return random.randrange(Action_size)
    else:
        q_value = model.predict([history, np.ones(Action_size).reshape(1, Action_size)])
        return np.argmax(q_value[0])

# Save a sample <s, a, r, s'> to the replay memory
def store_memory(memory, history, action, reward, next_history):
    memory.append((history, action, reward, next_history))

def get_one_hot(targets, nb_classes):
    return np.eye(nb_classes)[np.array(targets).reshape(-1)]

# Train the model on a random minibatch from the replay memory
def train_memory_batch(memory, model):
    # Sample a minibatch
    mini_batch = random.sample(memory, FLAGS.batch_size)
    # Create empty arrays to load our minibatch into
    # These objects have multiple values, hence they need defined shapes
    state = np.zeros((FLAGS.batch_size, Input_shape[0], Input_shape[1], Input_shape[2]))
    next_state = np.zeros((FLAGS.batch_size, Input_shape[0], Input_shape[1], Input_shape[2]))
    # These objects have a single value, so we can just create lists and append to them
    action = []
    reward = []
    # Create an array that will carry the target q values
    target_q = np.zeros((FLAGS.batch_size,))

    # Fill up our arrays with our minibatch
    for idx, val in enumerate(mini_batch):
        state[idx] = val[0]
        # print(val[0].shape)  # debug only
        next_state[idx] = val[3]
        action.append(val[1])
        reward.append(val[2])

    # We want the model to predict the q value for all actions, hence:
    actions_mask = np.ones((FLAGS.batch_size, Action_size))
    # Predict the q values for all actions in the next state
    next_q_values = model.predict([next_state, actions_mask])

    # Fill out target q values based on the max q value in the next state
    for i in range(FLAGS.batch_size):
        # Standard discounted reward formula:
        # q(s,a) = r + discount * max future reward
        target_q[i] = reward[i] + FLAGS.gamma * np.amax(next_q_values[i])

    # Convert all the actions into one-hot vectors
    action_one_hot = get_one_hot(action, Action_size)
    # Apply the one-hot mask to the target vector:
    # this gives a vector with the target q value in the position corresponding to the action taken
    target_one_hot = action_one_hot * target_q[:, None]

    # Then we fit the model
    # We map the state and the action from the memory bank to the q value of that state-action pair
    # s,a -> q(s,a|w)
    h = model.fit([state, action_one_hot], target_one_hot, epochs=1, batch_size=FLAGS.batch_size, verbose=0)

    # Return the loss - it's just for monitoring progress
    return h.history['loss'][0]

def train():
    # Define which game to play
    env = gym.make('PongDeterministic-v4')

    # Create a space for our memory
    # We use a deque (double-ended queue) with a maximum size, so that once the space
    # is filled, older entries are removed to make room for new ones
    memory = deque(maxlen=FLAGS.replay_memory)

    # Start the episode counter
    episode_number = 0

    # Set epsilon
    epsilon = FLAGS.initial_epsilon
    # Define the epsilon decay per step
    epsilon_decay = (FLAGS.initial_epsilon - FLAGS.final_epsilon) / FLAGS.epsilon_anneal_num

    # Start the global step counter
    global_step = 0

    # Define the model
    model = atari_model()

    # Define the target model
    model_target = atari_model_target()

    # Define where to store logs
    log_dir = "{}/run-{}-log".format(FLAGS.train_dir, 'MK10')
    # Pass the graph to TensorBoard
    file_writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())

    # Start the optimisation loop
    while episode_number < FLAGS.n_episodes:
        # Initialise done as false
        done = False
        step = 0
        score = 0
        loss = 0.0

        # Initialise the environment
        observation = env.reset()

        # For the very start of the episode, do nothing but observe,
        # so we can get a sense of what's going on
        for _ in range(random.randint(1, FLAGS.no_op_steps)):
            observation, _, _, _ = env.step(1)

        # At the start of the episode there are no preceding frames,
        # so we just copy the initial state into a stack to make the state history
        state = pre_processing(observation)
        state_history = np.stack((state, state, state, state), axis=2)
        state_history = np.reshape([state_history], (1, 84, 84, 4))

        # Step through the episode until it ends
        while not done:
            # Render the image if selected to do so
            if FLAGS.render:
                env.render()

            # Select an action based on our current model
            # Task-4: select_model = model_target or select_model = model
            select_model = '?'
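            # A hedged note on Task-4: in standard DQN the online network ('model') is
            # normally the one used to pick actions epsilon-greedily, with the target
            # network reserved for computing training targets, so one reasonable choice
            # would be:
            #     select_model = model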
            action = get_action(state_history, epsilon, global_step, select_model)

            # Map the model's action index (0..2) onto the environment's action id (1..3)
            real_action = action + 1

            # Once we're done observing, start scaling down epsilon
            if global_step > FLAGS.observe_step_num and epsilon > FLAGS.final_epsilon:
                epsilon -= epsilon_decay

            # Record the output from the environment
            observation, reward, done, info = env.step(real_action)

            # Process the observation
            next_state = pre_processing(observation)
            next_state = np.reshape([next_state], (1, 84, 84, 1))
            # Update the history with the next state - also drop the oldest state
            state_history_w_next = np.append(next_state, state_history[:, :, :, :3], axis=3)

            # Update the score
            score += reward

            # Save the (s, a, r, s') set to memory
            store_memory(memory, state_history, action, reward, state_history_w_next)

            # Train the model once we are done observing
            if global_step > FLAGS.observe_step_num:
                loss = loss + train_memory_batch(memory, model)
                # Check if we are ready to update the target model with the model we have been training
                if global_step % FLAGS.update_target_model_steps == 0:
                    model_target.set_weights(model.get_weights())
                    print("UPDATING TARGET WEIGHTS")
            state_history = state_history_w_next

            # print("step: ", global_step)
            global_step += 1
            step += 1

            # Check if the episode is over
            if done:
                # Check if we are still observing
                if global_step <= FLAGS.observe_step_num:
                    current_position = 'observe'
                # Check if we are still annealing epsilon
                elif FLAGS.observe_step_num < global_step <= FLAGS.observe_step_num + FLAGS.epsilon_anneal_num:
                    current_position = 'explore'
                else:
                    current_position = 'train'
                # Print status
                print(
                    'current position: {}, epsilon: {} , episode: {}, score: {}, global_step: {}, avg loss: {}, step: {}, memory length: {}'
                    .format(current_position, epsilon, episode_number, score, global_step, loss / float(step), step,
                            len(memory)))

                # Save the model every 100 episodes and on the final episode
                if episode_number % 100 == 0 or (episode_number + 1) == FLAGS.n_episodes:
                    file_name = "pong_model_{}.h5".format(episode_number)
                    model_path = os.path.join(FLAGS.train_dir, file_name)
                    model.save(model_path)

                # Add loss and score data to TensorBoard
                loss_summary = tf.Summary(
                    value=[tf.Summary.Value(tag="loss", simple_value=loss / float(step))])
                file_writer.add_summary(loss_summary, global_step=episode_number)

                score_summary = tf.Summary(
                    value=[tf.Summary.Value(tag="score", simple_value=score)])
                file_writer.add_summary(score_summary, global_step=episode_number)

                # Increment the episode number
                episode_number += 1

    file_writer.close()

if __name__ == "__main__":
    t_start = time.time()
    train()
    t_end = time.time()
    t_all = t_end - t_start
    print('train.py: whole time: {:.2f} h ({:.2f} min)'.format(t_all / 3600., t_all / 60.))
-- 
libgit2 0.26.0
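
# A minimal standalone sketch (under the same 84x84x4 frame-stack assumption as Input_shape)
# of the sliding-window history update used in train(): the newest frame is prepended on the
# channel axis and the oldest of the four stacked frames is dropped.
import numpy as np

state_history = np.zeros((1, 84, 84, 4), dtype=np.uint8)   # four stacked frames
next_state = np.ones((1, 84, 84, 1), dtype=np.uint8)       # newest processed frame
state_history_w_next = np.append(next_state, state_history[:, :, :, :3], axis=3)
assert state_history_w_next.shape == (1, 84, 84, 4)        # still four frames
assert state_history_w_next[0, 0, 0, 0] == 1               # newest frame is now channel 0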