#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# from DiscreteHFO.HFOAttackingPlayer import HFOAttackingPlayer
# from DiscreteHFO.Agent import Agent
import argparse
import copy
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from hfoEnv import hfoEnv

class QLearningAgent(object):
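	"""Tabular Q-learning agent with an epsilon-greedy behaviour policy.

	Q-values are kept in a defaultdict keyed by the string representation of a
	state, with one entry per action in self.actions.
	"""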
	def __init__(self, learningRate, discountFactor, epsilon, initVals=0.0):
		super(QLearningAgent, self).__init__()
		self.learningRate = learningRate
		self.discountFactor = discountFactor
		self.epsilon = epsilon
		self.currentState = None
		self.actions = ['DRIBBLE_UP', 'DRIBBLE_DOWN', 'DRIBBLE_LEFT', 'DRIBBLE_RIGHT', 'SHOOT']
		self.Qs = defaultdict(lambda: np.ones(len(self.actions)) * initVals)
		self.random = np.random.RandomState(0)

	def learn(self):
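		"""Apply one Q-learning update to the most recently cached transition.

		Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))

		Returns the learning-rate-scaled TD error that was applied.
		"""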
		state, action, reward, status, nextState = self.cache
		# No bootstrap from the next state once the episode has terminated.
		max_Q_next = 0.0 if status != 0 else np.max(self.Qs[nextState])
		actionIdx = self.actions.index(action)
		# Learning-rate-scaled TD error for the cached transition.
		error = self.learningRate * (
			reward + self.discountFactor * max_Q_next - self.Qs[state][actionIdx]
		)
		self.Qs[state][actionIdx] += error
		return error

	def act(self):
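		"""Pick an action epsilon-greedily with respect to the current state's Q-values."""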
		p = self.random.uniform()
		Q = self.Qs[self.currentState]
		if p >= self.epsilon:
			# Exploit: greedy action (ties broken by the lowest index).
			return self.actions[np.argmax(Q)]
		# Explore: pick an action uniformly at random.
		return self.actions[self.random.randint(len(self.actions))]

	def toStateRepresentation(self, state):
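		# States are keyed by their string form so arbitrary observations can index the Q table.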
		return str(state)

	def setState(self, state):
		self.currentState = state

	def setExperience(self, state, action, reward, status, nextState):
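		# Cache the transition; it is consumed by the next call to learn().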
		self.cache = (state, action, reward, status, nextState)

	def setLearningRate(self, learningRate):
		self.learningRate = learningRate

	def setEpsilon(self, epsilon):
		self.epsilon = epsilon

	def reset(self):
		pass
		
	def computeHyperparameters(self, numTakenActions, episodeNumber):
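		"""Return the (learning rate, epsilon) to use for the next action.

		The learning rate stays constant; epsilon is reduced by 1/4000 on every
		call and pinned to 0 once episodeNumber exceeds 4000.
		"""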
		if episodeNumber > 4000:
			epsilon = 0.0
		else:
			epsilon = max(0.0, self.epsilon - 1.0 / 4000)
		return self.learningRate, epsilon
		# Alternative: linear schedules over 4000 episodes for both hyperparameters.
		# return ((1e-3 - 1.0)/4000)*episodeNumber + 1.0, ((0.0 - 1.0)/4000)*episodeNumber + 1.0

if __name__ == '__main__':

	parser = argparse.ArgumentParser()
	parser.add_argument('--id', type=int, default=0)
	parser.add_argument('--numOpponents', type=int, default=0)
	parser.add_argument('--numTeammates', type=int, default=0)
	parser.add_argument('--numEpisodes', type=int, default=500)

	args=parser.parse_args()

	# Initialize connection with the HFO server
	# hfoEnv = HFOAttackingPlayer(numOpponents = args.numOpponents, numTeammates = args.numTeammates, agentId = args.id)
	# hfoEnv.connectToServer()

	env = hfoEnv()
	# Initialize a Q-Learning Agent
	agent = QLearningAgent(learningRate = 0.1, discountFactor = 0.99, epsilon = 0.6)
	numEpisodes = args.numEpisodes
	log_freq = 50      # episodes per logging window
	rewards_list = []  # per-window goal rates, for plotting
	rewards = 0.0      # goals scored in the current window
	# Run training using Q-Learning
	numTakenActions = 0 
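	# Each step: refresh the hyperparameters, act epsilon-greedily, advance the
	# environment, cache the transition, and apply one Q-learning update.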
	for episode in range(numEpisodes):
		status = 0
		observation = env.reset()
		while status == 0:
			learningRate, epsilon = agent.computeHyperparameters(numTakenActions, episode)
			agent.setEpsilon(epsilon)
			agent.setLearningRate(learningRate)
			
			# Copy the observation so later in-place changes cannot corrupt the stored experience.
			obsCopy = copy.deepcopy(observation)
			agent.setState(agent.toStateRepresentation(obsCopy))
			action = agent.act()
			numTakenActions += 1
			
			nextObservation, reward, done, status = env.step(action, obsCopy)
			agent.setExperience(agent.toStateRepresentation(obsCopy), action, reward, status, agent.toStateRepresentation(nextObservation))
			agent.learn()
			# Count episodes that finish with a goal so a success rate can be logged.
			if status == 1 and nextObservation == "GOAL":
				rewards += 1.0
			observation = nextObservation
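		# Every log_freq episodes, report and plot the fraction that ended in a goal.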
		if (episode + 1) % log_freq == 0:
			print("episode %d: goal rate %.3f" % (episode + 1, rewards / log_freq))
			rewards_list.append(rewards/log_freq)
			rewards = 0
			plt.cla()
			plt.plot(rewards_list)
			plt.pause(0.01)
	print(rewards_list)
	plt.cla()
	plt.plot(rewards_list)
	plt.savefig("Q-Learning.png")
	plt.show()