File size: 3,758 Bytes
93d8108
 
 
 
 
5fc752e
8ae24a2
93d8108
 
5fc752e
cf8b7c4
 
 
8ae24a2
93d8108
8ae24a2
 
cf8b7c4
 
 
93d8108
f929afb
 
93d8108
8ae24a2
93d8108
8ae24a2
93d8108
 
 
 
f929afb
 
 
 
cf8b7c4
 
 
e282b5d
cf8b7c4
 
 
f929afb
 
 
 
93d8108
 
 
 
 
8ae24a2
 
93d8108
 
f929afb
 
 
 
 
 
 
cf8b7c4
 
 
 
8ae24a2
 
cf8b7c4
 
8ae24a2
f929afb
93d8108
 
 
786f010
17d4626
93d8108
cf8b7c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93d8108
 
 
 
8ae24a2
93d8108
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import gymnasium as gym
import numpy as np
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
from matplotlib import pyplot as plt
from tqdm import trange
from Shared import Shared
import warnings


class DPAgent(Shared):
    """Dynamic-programming (value iteration) agent for tabular Gymnasium envs.

    Requires an environment exposing the tabular transition model
    ``env.P[state][action] -> [(probability, next_state, reward, done), ...]``
    (e.g. FrozenLake). ``train`` runs value iteration until convergence and
    then extracts a deterministic greedy policy as a one-hot matrix ``Pi``.
    """

    def __init__(self, /, **kwargs):
        super().__init__(run_name=self.__class__.__name__, **kwargs)
        # Convergence threshold: stop when the largest per-state value change
        # in a sweep falls below theta.
        self.theta = kwargs.get("theta", 1e-10)
        # State-value function, one entry per state.
        self.V = np.zeros(self.env.observation_space.n)
        # BUG FIX: np.zeros takes the shape as a single tuple; the original
        # np.zeros(n_states, n_actions) passed n_actions as the dtype argument
        # and raised TypeError.
        self.Pi = np.zeros(
            (self.env.observation_space.n, self.env.action_space.n)
        )
        if self.gamma >= 1.0:
            # With gamma == 1.0 the Bellman backup need not be a contraction,
            # so the delta < theta stopping test may never trigger.
            warnings.warn(
                "DP will never converge with a gamma value =1.0. Try 0.99?", UserWarning
            )

    def policy(self, state):
        """Return the action distribution (one-hot after training) for *state*."""
        return self.Pi[state]

    def train(self, *args, **kwargs):
        """Run value iteration in place on ``self.V``, then build greedy ``self.Pi``.

        Sweeps all states, replacing V(s) with max_a sum_s' P(s'|s,a) *
        (r + gamma * V(s')), until the largest change in one sweep is below
        ``self.theta``. Extra positional/keyword args are accepted for
        interface compatibility with other agents and ignored.
        """
        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n

        iteration = 0
        while True:
            delta = 0.0
            V_prev = np.copy(self.V)
            for state in range(n_states):
                # Backed-up action-values for every action in this state.
                Q = np.zeros(n_actions)
                for action in range(n_actions):
                    expected_value = 0.0
                    for probability, next_state, reward, done in self.env.P[state][
                        action
                    ]:
                        expected_value += probability * (
                            reward + self.gamma * self.V[next_state]
                        )
                    Q[action] = expected_value
                # In-place (Gauss-Seidel) update: later states in this sweep
                # already see the new value.
                self.V[state] = np.max(Q)
                delta = max(delta, abs(V_prev[state] - self.V[state]))
            if delta < self.theta:
                break
            iteration += 1
            print(f"Iteration {iteration}: delta={delta}")

        # Greedy policy extraction: one more backup to get Q(s, a) under the
        # converged V, then one-hot the argmax per state.
        Q_table = np.empty((n_states, n_actions))
        for s in range(n_states):
            for a in range(n_actions):
                expected_value = 0.0
                for probability, next_state, reward, done in self.env.P[s][a]:
                    expected_value += probability * (
                        reward + self.gamma * self.V[next_state]
                    )
                Q_table[s, a] = expected_value
        best_actions = np.argmax(Q_table, axis=1)
        self.Pi = np.zeros((n_states, n_actions))
        self.Pi[np.arange(n_states), best_actions] = 1.0


if __name__ == "__main__":
    # Train a value-iteration agent on the default FrozenLake environment
    # (environment construction is handled by Shared via the `env` kwarg).
    dp = DPAgent(env="FrozenLake-v1", gamma=0.99)
    dp.train()
    dp.save_policy("dp_policy.npy")

    # NOTE(review): this rendering env is an 8x8 *non-slippery* map, while the
    # agent above was trained on whatever "FrozenLake-v1" config Shared builds.
    # If that is the default 4x4 slippery map, the learned policy covers only
    # 16 states and will not match this 64-state layout — confirm Shared uses
    # the same desc/is_slippery settings.
    env = gym.make(
        "FrozenLake-v1",
        render_mode="human",
        is_slippery=False,
        desc=[
            "SFFFFFFF",
            "FFFFFFFH",
            "FFFHFFFF",
            "FFFFFHFF",
            "FFFHFFFF",
            "FHHFFFHF",
            "FHFFHFHF",
            "FFFHFFFG",
        ],
    )

    # Roll out the learned policy until the episode ends.
    state, _ = env.reset()
    done = False
    while not done:
        action = dp.choose_action(state)
        # BUG FIX: gymnasium's step() returns (obs, reward, terminated,
        # truncated, info). The original bound `terminated` to `done` and
        # discarded `truncated`, so a time-limit truncation looped forever.
        state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        env.render()