# coding=utf-8
from MDP import MDP
import numpy as np
import matplotlib.pyplot as plt


def plot_value_and_policy(values, policy):
    data = np.zeros((5, 5))

    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.title('Value')
    for y in range(data.shape[0]):
        for x in range(data.shape[1]):
            data[y][x] = values[(x, y)]
            plt.text(x + 0.5, y + 0.5, '%.4f' % data[y, x],
                     horizontalalignment='center', verticalalignment='center')
    heatmap = plt.pcolor(data)
    plt.gca().invert_yaxis()
    plt.colorbar(heatmap)

    plt.subplot(1, 2, 2)
    plt.title('Policy')
    for y in range(5):
        for x in range(5):
            for action in policy[(x, y)]:
                if action == 'DRIBBLE_UP':
                    plt.annotate('', xy=(x + 0.5, y), xytext=(x + 0.5, y + 0.5),
                                 arrowprops={'width': 0.1})
                if action == 'DRIBBLE_DOWN':
                    plt.annotate('', xy=(x + 0.5, y + 1), xytext=(x + 0.5, y + 0.5),
                                 arrowprops={'width': 0.1})
                if action == 'DRIBBLE_RIGHT':
                    plt.annotate('', xy=(x + 1, y + 0.5), xytext=(x + 0.5, y + 0.5),
                                 arrowprops={'width': 0.1})
                if action == 'DRIBBLE_LEFT':
                    plt.annotate('', xy=(x, y + 0.5), xytext=(x + 0.5, y + 0.5),
                                 arrowprops={'width': 0.1})
                if action == 'SHOOT':
                    plt.text(x + 0.5, y + 0.5, action,
                             horizontalalignment='center', verticalalignment='center')
    heatmap = plt.pcolor(data)
    plt.gca().invert_yaxis()
    plt.colorbar(heatmap)
    plt.show()


class BellmanDPSolver(object):
    def __init__(self, discountRate=0.9):
        self.MDP = MDP()
        self.discountRate = discountRate
        self.initVs()

    def initVs(self):
        # Initialise every state value to 0 and every policy to a uniform
        # distribution over the available actions.
        self.V = {}
        self.policy = {}
        for state in self.MDP.S:
            self.V[state] = 0
            self.policy[state] = np.ones(len(self.MDP.A)) / len(self.MDP.A)

    def BellmanUpdate(self):
        # One synchronous sweep of the Bellman optimality backup: every state
        # (27 in total) has its value V[state] recomputed from the old values.
        updatedStateValue = self.V.copy()
        for state in self.MDP.S:
            tmp_V = [0.0] * len(self.MDP.A)  # backed-up value of each action
            for idx, action in enumerate(self.MDP.A):
                transitions = self.MDP.probNextStates(state, action)
                for newState, prob in transitions.items():
                    # Values are computed the same way as the action-values
                    # below: transitions are only exposed per action, so there
                    # is no separate state-transition matrix to use directly.
                    reward = self.MDP.getRewards(state, action, newState)
                    tmp_V[idx] += prob * (reward + self.discountRate * self.V[newState])
            updatedStateValue[state] = np.max(tmp_V)
        self.V = updatedStateValue

        # Greedy policy improvement with respect to the updated values.
        policy = {}
        for state in self.MDP.S:
            action_value = np.zeros(len(self.MDP.A))
            for idx, action in enumerate(self.MDP.A):
                transitions = self.MDP.probNextStates(state, action)
                for new_state, prob in transitions.items():
                    reward = self.MDP.getRewards(state, action, new_state)
                    action_value[idx] += prob * (reward + self.discountRate * self.V[new_state])
            self.policy[state] = np.zeros(len(self.MDP.A))
            max_actions = np.sort(np.argwhere(action_value == np.amax(action_value)).flatten())
            # Split the probability mass equally over all maximising actions
            # (purely greedy; no epsilon-greedy exploration).
            self.policy[state][max_actions] = 1. / len(max_actions)
            policy[state] = np.array(self.MDP.A)[max_actions].tolist()  # indices -> action names
        return self.V, policy


if __name__ == '__main__':
    solution = BellmanDPSolver()
    for i in range(100):
        values, policy = solution.BellmanUpdate()
    print("Values : ", values)
    print("Policy : ", policy)
    plot_value_and_policy(values, policy)
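

# ---------------------------------------------------------------------------
# Note: the imported MDP class is not part of this file. BellmanDPSolver only
# relies on four of its members: MDP.S (iterable of states), MDP.A (list of
# action names), MDP.probNextStates(state, action) -> {nextState: probability},
# and MDP.getRewards(state, action, nextState) -> float. The class below is a
# hypothetical, minimal stand-in that sketches this assumed interface on a
# two-state chain; it is illustrative only and is not used by the script above.
# ---------------------------------------------------------------------------
class _ToyMDP(object):
    """Minimal sketch of the interface BellmanDPSolver expects (assumed)."""

    S = [0, 1]            # two states: 0 (start) and 1 (goal)
    A = ['STAY', 'MOVE']  # two actions

    def probNextStates(self, state, action):
        # Deterministic toy dynamics: MOVE reaches state 1, STAY stays put.
        if action == 'MOVE':
            return {1: 1.0}
        return {state: 1.0}

    def getRewards(self, state, action, nextState):
        # Reward of 1 for reaching state 1 from state 0, 0 otherwise.
        return 1.0 if (state == 0 and nextState == 1) else 0.0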