20200318029 / ml2_MiniAssignments · Commits

Commit d744568b, authored Aug 12, 2020 by 20200318029
Commit message: homework8
Parent: e7f5bc75
Showing 1 changed file with 292 additions and 0 deletions.

homework8/table_tennis.py (new file, mode 100644): +292, -0

import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from skimage.color import rgb2gray
from skimage.transform import resize
from collections import deque
import os.path
import os, time

# Define Hyperparameters
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_float('optimiser_learning_rate', 0.00025, help='learning rate for optimiser')
tf.flags.DEFINE_integer('observe_step_num', 100000, help='number of steps to observe before choosing actions')
tf.flags.DEFINE_integer('batch_size', 32, help='size of batch')
tf.flags.DEFINE_float('initial_epsilon', 1.0, help='initial value for epsilon')
tf.flags.DEFINE_integer('epsilon_anneal_num', 500000, help='number of steps over which to anneal epsilon')
tf.flags.DEFINE_float('final_epsilon', 0.01, help='final value for epsilon')
tf.flags.DEFINE_float('gamma', 0.99, help='decay rate for future reward')
tf.flags.DEFINE_integer('replay_memory', 200000, 'number of previous transitions to remember')  # 200000 = 10GB RAM
tf.flags.DEFINE_integer('n_episodes', 100000, 'number of episodes to let the model train for')
tf.flags.DEFINE_integer('no_op_steps', 2, 'number of steps to do nothing at start of each episode')
tf.flags.DEFINE_integer('update_target_model_steps', 100000, 'update target Q model every n steps')
tf.flags.DEFINE_string('train_dir', 'MK7_train_data', 'location for training data and logs')
# tf.flags.DEFINE_boolean('render', True, 'whether to render the image')
tf.flags.DEFINE_boolean('render', False, 'whether to render the image')
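
# Note (editorial, hedged): tf.flags here is the TensorFlow 1.x flag API (a thin
# wrapper around absl.flags); under TensorFlow 2.x the equivalent lives in
# tf.compat.v1.flags.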

Input_shape = (84, 84, 4)  # input image size to model
Action_size = 3

# Create a pre-processing function
# Converts a colour image into a smaller grey-scale image
# Converts floats to integers
def pre_processing(observation):
    processed_observation = rgb2gray(observation) * 255
    # Converting to grey normalises the values from [0, 255] to [0, 1]
    # We need to store the values as integers in the next step to save space, so we undo the normalisation
    processed_observation = resize(processed_observation, (84, 84), mode='constant')
    # Convert from floating point to integers ranging from [0, 255]
    processed_observation = np.uint8(processed_observation)
    return processed_observation
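
# Illustrative usage (a hedged sketch, not part of the assignment): for a raw
# Atari RGB frame of shape (210, 160, 3), pre_processing returns an (84, 84)
# uint8 grey-scale image, e.g.
#   frame = np.zeros((210, 160, 3), dtype=np.uint8)
#   assert pre_processing(frame).shape == (84, 84)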

# Create the model to approximate Q values for a given set of states and actions
def atari_model():
    # Task-1: to fill in the network structure of atari_model
    # Define the inputs
    frames_input = keras.Input(shape=Input_shape, name='frames')
    actions_input = keras.layers.Input(shape=(Action_size,), name='action_mask')
    # Normalise the inputs from [0, 255] to [0, 1] - to make processing easier
    normalised = keras.layers.Lambda(lambda x: x / 255.0, name='normalised')(frames_input)
    # Conv1 is 16 8x8 filters with a stride of 4 and a ReLU
    conv1 = '?'
    # Conv2 is 32 4x4 filters with a stride of 2 and a ReLU
    conv2 = '?'
    # Flatten the output from Conv2
    conv2_flatten = keras.layers.Flatten()(conv2)
    # Then a fully connected layer with 128 ReLU units
    dense1 = '?'
    # Then a fully connected layer with one unit per action and no activation
    output = '?'
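    # A possible completion of Task-1 (a hedged sketch following the layer
    # descriptions in the comments above, not necessarily the intended answer):
    #   conv1 = keras.layers.Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(normalised)
    #   conv2 = keras.layers.Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv1)
    #   dense1 = keras.layers.Dense(128, activation='relu')(conv2_flatten)
    #   output = keras.layers.Dense(Action_size)(dense1)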
    # Then we multiply the output by the action mask
    # When trying to find the value of all the actions this will be a mask full of 1s
    # When trying to find the value of a specific action, the mask will only be 1 for that single action
    filtered_output = keras.layers.Multiply(name='Qvalue')([output, actions_input])
    # Create a model that maps frames and actions to the filtered output
    model = keras.Model(inputs=[frames_input, actions_input], outputs=filtered_output)
    # Print a summary of the model
    model.summary()
    # Define optimiser (using the learning-rate flag defined above)
    optimiser = tf.train.AdamOptimizer(learning_rate=FLAGS.optimiser_learning_rate)
    # Compile model
    loss = '?'  # Task-2: to choose the loss function
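    # A possible choice for Task-2 (hedged, not necessarily the intended answer):
    # DQN implementations commonly use the Huber loss, which behaves like MSE for
    # small errors and like MAE for large ones, e.g.
    #   loss = tf.losses.huber_loss
    # A simpler (though less robust) alternative would be loss = 'mse'.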
    model.compile(optimizer=optimiser, loss=loss)
    # Return the model
    return model

# Create a model to use as a target
def atari_model_target():
    # Task-3: to implement the code for atari_model_target
    model = '?'
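    # A possible completion of Task-3 (hedged sketch): the target network usually
    # has the same architecture as the online network, just with its own copy of
    # the weights (kept in sync later via set_weights), e.g.
    #   model = atari_model()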
    return model

# Get an action from the model using an epsilon-greedy policy
def get_action(history, epsilon, step, model):
    if np.random.rand() <= epsilon or step <= FLAGS.observe_step_num:
        return random.randrange(Action_size)
    else:
        q_value = model.predict([history, np.ones(Action_size).reshape(1, Action_size)])
        return np.argmax(q_value[0])

# Save a sample <s, a, r, s'> to the replay memory
def store_memory(memory, history, action, reward, next_history):
    memory.append((history, action, reward, next_history))


def get_one_hot(targets, nb_classes):
    return np.eye(nb_classes)[np.array(targets).reshape(-1)]
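
# Illustrative usage (not part of the assignment): each target index selects a
# row of the identity matrix, e.g.
#   get_one_hot([0, 2, 1], 3) -> [[1, 0, 0], [0, 0, 1], [0, 1, 0]]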

# Train the model on a random minibatch sampled from the replay memory
def train_memory_batch(memory, model):
    # Sample a minibatch
    mini_batch = random.sample(memory, FLAGS.batch_size)
    # Create empty arrays to load our minibatch into
    # These objects have multiple values, hence need defined shapes
    state = np.zeros((FLAGS.batch_size, Input_shape[0], Input_shape[1], Input_shape[2]))
    next_state = np.zeros((FLAGS.batch_size, Input_shape[0], Input_shape[1], Input_shape[2]))
    # These objects have a single value, so we can just create lists that we append to later
    action = []
    reward = []
    # Create an array that will carry the target q values
    target_q = np.zeros((FLAGS.batch_size,))
    # Fill up our arrays with our minibatch
    for id, val in enumerate(mini_batch):
        state[id] = val[0]
        print(val[0].shape)
        next_state[id] = val[3]
        action.append(val[1])
        reward.append(val[2])
    # We want the model to predict the q value for all actions, hence:
    actions_mask = np.ones((FLAGS.batch_size, Action_size))
    # Get the model to predict the q values for all actions
    next_q_values = model.predict([next_state, actions_mask])
    # Fill out target q values based on the max q value in the next state
    for i in range(FLAGS.batch_size):
        # Standard discounted reward formula
        # q(s, a) = r + discount * max future q value
        target_q[i] = reward[i] + FLAGS.gamma * np.amax(next_q_values[i])
    # Convert all the actions into one-hot vectors
    action_one_hot = get_one_hot(action, Action_size)
    # Apply the one-hot mask onto the target vector
    # This results in a vector that has the target q value in the position corresponding to the action
    target_one_hot = action_one_hot * target_q[:, None]
    # Then we fit the model
    # We map the state and the action from the memory bank to the q value of that state-action pair
    # s, a -> q(s, a | w)
    h = model.fit([state, action_one_hot], target_one_hot, epochs=1, batch_size=FLAGS.batch_size, verbose=0)
    # Return the loss - it's just for monitoring progress
    return h.history['loss'][0]

def train():
    # Define which game to play
    env = gym.make('PongDeterministic-v4')
    # Create a space for our memory
    # We will use a deque - a double-ended queue with a maximum size,
    # so that once all the space is filled, older entries are removed to make room for new ones
    memory = deque(maxlen=FLAGS.replay_memory)
    # Start episode counter
    episode_number = 0
    # Set epsilon
    epsilon = FLAGS.initial_epsilon
    # Define epsilon decay
    epsilon_decay = (FLAGS.initial_epsilon - FLAGS.final_epsilon) / FLAGS.epsilon_anneal_num
    # Start global step
    global_step = 0
    # Define model
    model = atari_model()
    # Define target model
    model_target = atari_model_target()
    # Define where to store logs
    log_dir = "{}/run-{}-log".format(FLAGS.train_dir, 'MK10')
    # Pass the graph to TensorBoard
    file_writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())
    # Start the optimisation loop
    while episode_number < FLAGS.n_episodes:
        # Initialise done as False
        done = False
        step = 0
        score = 0
        loss = 0.0
        # Initialise environment
        observation = env.reset()
        # For the very start of the episode, we will do nothing but observe
        # This way we can get a sense of what's going on
        for _ in range(random.randint(1, FLAGS.no_op_steps)):
            observation, _, _, _ = env.step(1)
        # At the start of the episode there are no preceding frames
        # So we just copy the initial state into a stack to make the state history
        state = pre_processing(observation)
        state_history = np.stack((state, state, state, state), axis=2)
        state_history = np.reshape([state_history], (1, 84, 84, 4))
        # Play while the episode is not over
        while not done:
            # Render the image if selected to do so
            if FLAGS.render:
                env.render()
            # Select an action based on our current model
            # Task-4: select_model = model_target or select_model = model
            select_model = '?'
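            # A possible choice for Task-4 (hedged, not necessarily the intended
            # answer): in standard DQN the behaviour policy acts with the online
            # network, while the target network is only used for bootstrapping,
            # so one reasonable completion is
            #   select_model = model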
            action = get_action(state_history, epsilon, global_step, select_model)
            # Convert the action index into the corresponding environment action
            real_action = action + 1
            # After we're done observing, start scaling down epsilon
            if global_step > FLAGS.observe_step_num and epsilon > FLAGS.final_epsilon:
                epsilon -= epsilon_decay
            # Record output from the environment
            observation, reward, done, info = env.step(real_action)
            # Process the observation
            next_state = pre_processing(observation)
            next_state = np.reshape([next_state], (1, 84, 84, 1))
            # Update the history with the next state - also remove the oldest state
            state_history_w_next = np.append(next_state, state_history[:, :, :, :3], axis=3)
            # Update score
            score += reward
            # Save the (s, a, r, s') set to memory
            store_memory(memory, state_history, action, reward, state_history_w_next)
            # Train the model - but only once we are done observing
            if global_step > FLAGS.observe_step_num:
                loss = loss + train_memory_batch(memory, model)
                # Check if we are ready to update the target model with the model we have been training
                if global_step % FLAGS.update_target_model_steps == 0:
                    model_target.set_weights(model.get_weights())
                    print("UPDATING TARGET WEIGHTS")
            state_history = state_history_w_next
            # print("step: ", global_step)
            global_step += 1
            step += 1
            # Check if the episode is over
            if done:
                # Check if we are still observing
                if global_step <= FLAGS.observe_step_num:
                    current_position = 'observe'
                # Check if we are still annealing epsilon
                elif FLAGS.observe_step_num < global_step <= FLAGS.observe_step_num + FLAGS.epsilon_anneal_num:
                    current_position = 'explore'
                else:
                    current_position = 'train'
                # Print status
                print('current position: {}, epsilon: {}, episode: {}, score: {}, global_step: {}, avg loss: {}, step: {}, memory length: {}'
                      .format(current_position, epsilon, episode_number, score, global_step, loss / float(step), step, len(memory)))
                # Save the model every 100 episodes and on the final episode
                if episode_number % 100 == 0 or (episode_number + 1) == FLAGS.n_episodes:
                    file_name = "pong_model_{}.h5".format(episode_number)
                    model_path = os.path.join(FLAGS.train_dir, file_name)
                    model.save(model_path)
                # Add loss and score data to TensorBoard
                loss_summary = tf.Summary(value=[tf.Summary.Value(tag="loss", simple_value=loss / float(step))])
                file_writer.add_summary(loss_summary, global_step=episode_number)
                score_summary = tf.Summary(value=[tf.Summary.Value(tag="score", simple_value=score)])
                file_writer.add_summary(score_summary, global_step=episode_number)
                # Increment episode number
                episode_number += 1
    file_writer.close()

if __name__ == "__main__":
    t_start = time.time()
    train()
    t_end = time.time()
    t_all = t_end - t_start
    print('train.py: whole time: {:.2f} h ({:.2f} min)'.format(t_all / 3600., t_all / 60.))