From d744568bb571f072bd1bf8461578dd1b4718629f Mon Sep 17 00:00:00 2001
From: xiaoy <zhaoyin214@qq.com>
Date: Wed, 12 Aug 2020 19:27:40 +0800
Subject: [PATCH] homework8

---
 homework8/table_tennis.py | 292 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 292 insertions(+)
 create mode 100644 homework8/table_tennis.py

diff --git a/homework8/table_tennis.py b/homework8/table_tennis.py
new file mode 100644
index 0000000..665146a
--- /dev/null
+++ b/homework8/table_tennis.py
@@ -0,0 +1,292 @@
+import gym
+import random
+import numpy as np
+import tensorflow as tf
+from tensorflow import keras
+from skimage.color import rgb2gray
+from skimage.transform import resize
+from collections import deque
+import os.path
+import os, time
+
+# Define Hyperparameters
+FLAGS = tf.flags.FLAGS
+tf.flags.DEFINE_float('optimiser_learning_rate', 0.00025, help='learning rate for optimiser')
+tf.flags.DEFINE_integer('observe_step_num', 100000, help='number of random-action steps to observe before training starts')
+tf.flags.DEFINE_integer('batch_size', 32, help='size of batch')
+tf.flags.DEFINE_float('initial_epsilon', 1.0, help='initial value for epsilon')
+tf.flags.DEFINE_integer('epsilon_anneal_num', 500000, help='number of steps over which to anneal epsilon')
+tf.flags.DEFINE_float('final_epsilon', 0.01, help='final value for epsilon')
+tf.flags.DEFINE_float('gamma', 0.99, help='discount factor for future rewards')
+tf.flags.DEFINE_integer('replay_memory', 200000, 'number of previous transitions to remember')  # 200000 = 10GB RAM
+tf.flags.DEFINE_integer('n_episodes', 100000, 'number of episodes to let the model train for')
+tf.flags.DEFINE_integer('no_op_steps', 2, 'number of steps to do nothing at start of each episode')
+tf.flags.DEFINE_integer('update_target_model_steps', 100000, 'update the target Q model every n global steps')
+tf.flags.DEFINE_string('train_dir', 'MK7_train_data', 'location for training data and logs')
+# tf.flags.DEFINE_boolean('render', True, 'whether to render the image')
+tf.flags.DEFINE_boolean('render', False, 'whether to render the image')
+
+Input_shape = (84, 84, 4)  # input image size to model
+Action_size = 3
+
+# Create a pre-processing function
+# Converts colour image into a smaller grey-scale image
+# Converts floats to integers
+def pre_processing(observation):
+    processed_observation = rgb2gray(observation) * 255
+    # Converting to greyscale normalises the values from [0,255] to [0,1]
+    # We store the values as integers in the next step to save space, so we undo the normalisation here
+    processed_observation = resize(processed_observation, (84, 84), mode='constant')
+    # Convert from floating point to integers ranging from [0,255]
+    processed_observation = np.uint8(processed_observation)
+    return processed_observation
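+
+# Shape note: the raw Pong observation from Gym is a (210, 160, 3) uint8 image;
+# pre_processing returns a single (84, 84) uint8 frame, and four of these frames
+# are later stacked to form the (84, 84, 4) input the network expects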
+
+# Create the model to approximate Qvalues for a given set of states and actions
+def atari_model():
+    # Task-1 : to fill the network structure of atari_model
+    # Define the inputs
+    frames_input = keras.Input(shape=Input_shape, name='frames')
+    actions_input = keras.layers.Input(shape=(Action_size,), name='action_mask')
+
+    # Normalise the inputs from [0,255] to [0,1] - to make processing easier
+    normalised = keras.layers.Lambda(lambda x: x/255.0, name='normalised')(frames_input)
+    # Conv1 is 16 8x8 filters with a stride of 4 and a ReLU
+    conv1 = keras.layers.Conv2D(16, (8, 8), strides=(4, 4), activation='relu', name='conv1')(normalised)
+    # Conv2 is 32 4x4 filters with a stride of 2 and a ReLU
+    conv2 = keras.layers.Conv2D(32, (4, 4), strides=(2, 2), activation='relu', name='conv2')(conv1)
+    # Flatten the output from Conv2
+    conv2_flatten = keras.layers.Flatten()(conv2)
+    # Then a fully connected layer with 128 ReLU units
+    dense1 = keras.layers.Dense(128, activation='relu', name='dense1')(conv2_flatten)
+    # Then a fully connected layer with a unit to map to each of the actions and no activation
+    output = keras.layers.Dense(Action_size, name='q_values')(dense1)
+    # Then we multiply the output by the action mask
+    # When trying to find the value of all the actions this will be a mask full of 1s
+    # When trying to find the value of a specific action, the mask will only be 1 for a single action
+    filtered_output = keras.layers.Multiply(name='Qvalue')([output, actions_input])
+
+    # Create the model
+    # Create a model that maps frames and actions to the filtered output
+    model = keras.Model(inputs=[frames_input, actions_input], outputs=filtered_output)
+    # Print a summary of the model
+    model.summary()
+    # Define optimiser, using the learning rate defined in the flags above
+    optimiser = tf.train.AdamOptimizer(learning_rate=FLAGS.optimiser_learning_rate)
+    # Compile model
+    # Task-2: choose the loss function - the Huber loss is the usual choice for DQN,
+    # since it is less sensitive to occasional large TD errors than plain MSE
+    loss = tf.losses.huber_loss
+    model.compile(optimizer=optimiser, loss=loss)
+    # Return the model
+    return model
+
+# Create a model to use as a target
+def atari_model_target():
+    # Task-3: the target network has the same architecture as the online network
+    # and gets its weights periodically synchronised via set_weights() in train()
+    model = atari_model()
+    return model
+
+# get action from model using epsilon-greedy policy
+def get_action(history, epsilon, step, model):
+    if np.random.rand() <= epsilon or step <= FLAGS.observe_step_num:
+        return random.randrange(Action_size)
+    else:
+        q_value = model.predict([history, np.ones(Action_size).reshape(1, Action_size)])
+        return np.argmax(q_value[0])
+
+# save sample <s,a,r,s'> to the replay memory
+def store_memory(memory, history, action, reward, next_history):
+    memory.append((history, action, reward, next_history))
+
+def get_one_hot(targets, nb_classes):
+    return np.eye(nb_classes)[np.array(targets).reshape(-1)]
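+    # e.g. get_one_hot([0, 2], 3) -> [[1., 0., 0.], [0., 0., 1.]]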
+
+# train model by random batch
+def train_memory_batch(memory, model, model_target):
+    # Sample a minibatch
+    mini_batch = random.sample(memory, FLAGS.batch_size)
+    # Create empty arrays to load our minibatch into
+    # These objects have multiple values hence need defined shapes
+    state = np.zeros((FLAGS.batch_size, Input_shape[0], Input_shape[1], Input_shape[2]))
+    next_state = np.zeros((FLAGS.batch_size, Input_shape[0], Input_shape[1], Input_shape[2]))
+    # These objects have a single value each, so we can just create lists and append to them later
+    action = []
+    reward = []
+    # Create an array that will hold the target q values - computed from the target network's weights
+    target_q = np.zeros((FLAGS.batch_size,))
+
+    # Fill up our arrays with our minibatch
+    for idx, val in enumerate(mini_batch):
+        state[idx] = val[0]
+        next_state[idx] = val[3]
+        action.append(val[1])
+        reward.append(val[2])
+
+    # We want the model to predict the q value for all actions hence:
+    actions_mask = np.ones((FLAGS.batch_size, Action_size))
+    # Get the target model to predict the q values for all actions
+    next_q_values = model_target.predict([next_state, actions_mask])
+
+    # Fill out target q values based on the max q value in the next state
+    for i in range(FLAGS.batch_size):
+        # Standard discounted reward formula
+        # q(s,a) = r + discount * cumulative future rewards
+        target_q[i] = reward[i] + FLAGS.gamma * np.amax(next_q_values[i])
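+    # Note: this simplified version does not store the 'done' flag in the replay
+    # memory, so terminal transitions are not treated specially; a full DQN would
+    # use target_q[i] = reward[i] when the next state is terminal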
+
+    # Convert all the actions into one hot vectors
+    action_one_hot = get_one_hot(action, Action_size)
+    # Apply one hot mask onto target vector
+    # This results in a vector that has the max q value in the position corresponding to the action
+    target_one_hot = action_one_hot * target_q[:, None]
+
+    # Then we fit the model
+    # We map the state and the action from the memory bank to the q value of that state action pair
+    # s,a -> q(s,a|w)
+    h = model.fit([state, action_one_hot], target_one_hot, epochs=1, batch_size=FLAGS.batch_size, verbose=0)
+
+    # Return the loss
+    # It's just for monitoring progress
+    return h.history['loss'][0]
+
+def train():
+    # Define which game to play
+    env = gym.make('PongDeterministic-v4')
+
+    # Create a space for our memory
+    # We will use a deque - a double-ended queue
+    # It has a maximum length, so once it is full the oldest entries
+    # are removed to make room for new ones
+    memory = deque(maxlen=FLAGS.replay_memory)
+
+    # Start episode counter
+    episode_number = 0
+
+    # Set epsilon
+    epsilon = FLAGS.initial_epsilon
+    # Define epsilon decay
+    epsilon_decay = (FLAGS.initial_epsilon - FLAGS.final_epsilon) / FLAGS.epsilon_anneal_num
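+    # With the default flags this is (1.0 - 0.01) / 500000 ~= 1.98e-6 per step, so
+    # epsilon reaches final_epsilon roughly 500k steps after the observation phase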
+
+    # Start global step
+    global_step = 0
+
+    # Define model
+    model = atari_model()
+
+    # Define target model
+    model_target = atari_model_target()
+
+    # Define where to store logs
+    log_dir = "{}/run-{}-log".format(FLAGS.train_dir, 'MK10')
+    # Pass graph to TensorBoard
+    file_writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())
+
+    # Start the optimisation loop
+    while episode_number < FLAGS.n_episodes:
+        # Initialise done as False
+        done = False
+        step = 0
+        score = 0
+        loss = 0.0
+
+        # Initialise environment
+        observation = env.reset()
+
+        # At the very start of each episode, take a random number of "do nothing" steps
+        # so that episodes do not all start from exactly the same state
+        for _ in range(random.randint(1, FLAGS.no_op_steps)):
+            observation, _, _, _ = env.step(1)
+
+        # At the start of the episode there are no preceding frames
+        # So we just copy the initial frame four times to build the state history
+        state = pre_processing(observation)
+        state_history = np.stack((state, state, state, state), axis=2)
+        state_history = np.reshape([state_history], (1, 84, 84, 4))
+
+        # Step through the environment until the episode is over
+        while not done:
+            # Render the image if selected to do so
+            if FLAGS.render:
+                env.render()
+
+            # Select an action based on our current model
+            # Task-4: in standard DQN the behaviour policy uses the online network
+            # (model); model_target is only used to compute the bootstrap targets
+            select_model = model
+            action = get_action(state_history, epsilon, global_step, select_model)
+
+            # Map the agent's action index (0, 1, 2) to the corresponding environment action (1, 2, 3)
+            real_action = action + 1
+
+            # After we're done observing, start scaling down epsilon
+            if global_step > FLAGS.observe_step_num and epsilon > FLAGS.final_epsilon:
+                epsilon -= epsilon_decay
+
+            # Record output from the environment
+            observation, reward, done, info = env.step(real_action)
+
+            # Process the observation
+            next_state = pre_processing(observation)
+            next_state = np.reshape([next_state], (1, 84, 84, 1))
+            # Update the history with the next state - also remove oldest state
+            state_history_w_next = np.append(next_state, state_history[:, :, :, :3], axis=3)
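+            # (np.append here puts the newest frame at channel index 0 and drops the
+            # oldest of the four stacked frames)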
+
+            # Update score
+            score += reward
+
+            # Save the (s, a, r, s') set to memory
+            store_memory(memory, state_history, action, reward, state_history_w_next)
+
+            # Train model
+            # Check if we are done observing
+            if global_step > FLAGS.observe_step_num:
+                loss = loss + train_memory_batch(memory, model, model_target)
+                # Check if we are ready to update target model with the model we have been training
+                if global_step % FLAGS.update_target_model_steps == 0:
+                    model_target.set_weights(model.get_weights())
+                    print("UPDATING TARGET WEIGHTS")
+            state_history = state_history_w_next
+
+            #print("step: ", global_step)
+            global_step += 1
+            step += 1
+
+            # Check if the episode is over - in Pong, when either player reaches 21 points
+            if done:
+                # Check if we are still observing
+                if global_step <= FLAGS.observe_step_num:
+                    current_position = 'observe'
+                # Check if we are still annealing epsilon
+                elif FLAGS.observe_step_num < global_step <= FLAGS.observe_step_num + FLAGS.epsilon_anneal_num:
+                    current_position = 'explore'
+                else:
+                    current_position = 'train'
+                # Print status
+                print(
+                    'current position: {}, epsilon: {} , episode: {}, score: {}, global_step: {}, avg loss: {}, step: {}, memory length: {}'
+                    .format(current_position, epsilon, episode_number, score, global_step, loss / float(step), step,
+                            len(memory)))
+
+                # Save model every 100 episodes and final episode
+                if episode_number % 100 == 0 or (episode_number + 1) == FLAGS.n_episodes:
+                    file_name = "pong_model_{}.h5".format(episode_number)
+                    model_path = os.path.join(FLAGS.train_dir, file_name)
+                    model.save(model_path)
+
+                # Add loss and score data to TensorBoard
+                loss_summary = tf.Summary(
+                    value=[tf.Summary.Value(tag="loss", simple_value=loss / float(step))])
+                file_writer.add_summary(loss_summary, global_step=episode_number)
+
+                score_summary = tf.Summary(
+                    value=[tf.Summary.Value(tag="score", simple_value=score)])
+                file_writer.add_summary(score_summary, global_step=episode_number)
+
+                # Increment episode number
+                episode_number += 1
+
+    file_writer.close()
+
+if __name__ == "__main__":
+    t_start = time.time()
+    train()
+    t_end = time.time()
+    t_all = t_end - t_start
+    print('train.py: whole time: {:.2f} h ({:.2f} min)'.format(t_all / 3600., t_all / 60.))
--
libgit2 0.26.0