File size: 4,719 Bytes
e0c3c75
 
93d8108
 
 
 
e0c3c75
93d8108
e0c3c75
46b0409
93d8108
 
46b0409
cf8b7c4
 
 
8ae24a2
93d8108
8ae24a2
 
cf8b7c4
 
 
93d8108
f929afb
 
93d8108
8ae24a2
e0c3c75
93d8108
8ae24a2
93d8108
 
 
 
f929afb
 
 
 
cf8b7c4
 
 
46b0409
 
 
 
3266489
cf8b7c4
 
 
f929afb
 
 
 
93d8108
 
e0c3c75
 
 
 
 
 
 
 
93d8108
 
 
46b0409
e0c3c75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f929afb
 
 
 
 
46b0409
 
e0c3c75
46b0409
 
cf8b7c4
 
 
 
8ae24a2
cf8b7c4
 
93d8108
 
 
cf8b7c4
 
e0c3c75
 
cf8b7c4
 
e0c3c75
 
 
 
 
 
 
 
 
 
 
 
93d8108
 
 
 
6a48762
93d8108
e0c3c75
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import warnings

import gymnasium as gym
import numpy as np
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
from matplotlib import pyplot as plt
from PIL import Image
from tqdm import trange

from AgentBase import AgentBase


class DPAgent(AgentBase):
    """Value-iteration (dynamic programming) agent for tabular Gymnasium
    environments that expose the transition model ``env.P``.

    ``theta`` plays a dual role (see ``train``): values below 1 are a
    convergence threshold on the Bellman residual; values >= 1 are a
    maximum sweep count.
    """

    def __init__(self, /, **kwargs):
        super().__init__(run_name=self.__class__.__name__, **kwargs)
        # theta < 1  -> convergence threshold; theta >= 1 -> sweep budget.
        self.theta = kwargs.get("theta", 1e-10)
        # State-value table, one entry per state.
        self.V = np.zeros(self.env.observation_space.n)
        # FIX: the shape must be a single tuple argument; np.zeros(n, m)
        # interprets m as a dtype and raises TypeError.
        self.Pi = np.zeros(
            (self.env.observation_space.n, self.env.action_space.n)
        )
        if self.gamma >= 1.0:
            warnings.warn(
                "DP will never converge with a gamma value =1.0. Try 0.99?", UserWarning
            )

    def policy(self, state):
        """Return the (one-hot) action distribution for ``state``."""
        return self.Pi[state]

    def _backup(self, state):
        """One Bellman optimality backup: Q-values for every action in ``state``."""
        Q = np.zeros(self.env.action_space.n)
        for action in range(self.env.action_space.n):
            expected_value = 0.0
            for probability, next_state, reward, done in self.env.P[state][action]:
                # CliffWalking's terminal state carries no positive reward by
                # default; reshape it so the value function has a target.
                if (
                    self.env_name == "CliffWalking-v0"
                    and state == self.env.observation_space.n - 1
                ):
                    reward = 1
                expected_value += probability * (
                    reward + self.gamma * self.V[next_state]
                )
            Q[action] = expected_value
        return Q

    def train(self, *args, **kwargs):
        """Run value iteration.

        Stops when the largest per-sweep update falls below ``theta``
        (theta < 1), or after ``theta`` sweeps (theta >= 1).

        Returns:
            list: greedy success rate measured by ``self.test`` after each sweep.
        """
        success_rate = []
        i = 0
        while True:
            delta = 0
            V_prev = np.copy(self.V)
            for state in range(self.env.observation_space.n):
                # Greedy Bellman update for this state.
                self.V[state] = np.max(self._backup(state))
                delta = max(delta, abs(V_prev[state] - self.V[state]))
            self.make_pi()
            suc = self.test(verbose=False, greedy=True)
            success_rate.append(suc)
            if self.theta < 1:
                # Convergence mode: stop once updates are negligible.
                if delta < self.theta:
                    print(f"breaking at {delta}, {self.theta}")
                    break
            elif i > self.theta:
                # Budget mode. FIX: the original tested `theta > 1`, so a
                # theta of exactly 1 satisfied neither branch and looped forever.
                print(f"breaking at {i}, {self.theta}")
                break
            i += 1
            print(f"Iteration {i}: delta={delta}")

        return success_rate

    def write_v(self, i):
        """Dump the state-value table as a grayscale image at ``imgs/{i}.png``.

        NOTE(review): the (12, 4) grid shape is hard-coded for one specific
        environment layout — confirm before reusing elsewhere.
        """
        v_cop = np.copy(self.V).reshape((12, 4))
        v_cop -= np.min(v_cop)
        peak = np.max(v_cop)
        # FIX: guard against a constant value function (division by zero -> NaN).
        if peak > 0:
            v_cop /= peak
        img = Image.fromarray(np.uint8(v_cop * 255), "L")
        # FIX: PIL resize takes (width, height) = (cols, rows); the original
        # passed (rows, cols) and distorted the image.
        img = img.resize(
            (v_cop.shape[1] * 100, v_cop.shape[0] * 100),
            resample=Image.Resampling.NEAREST,
        )
        img.save(f"imgs/{i}.png")

    def make_pi(self):
        """Rebuild ``self.Pi`` as the one-hot greedy policy w.r.t. ``self.V``."""
        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n
        # Q-table via the shared backup (deduplicates the loop from train()).
        q_table = np.array([self._backup(s) for s in range(n_states)])
        greedy = np.argmax(q_table, axis=1)
        self.Pi = np.zeros((n_states, n_actions))
        self.Pi[np.arange(n_states), greedy] = 1


if __name__ == "__main__":
    # Deterministic 8x8 FrozenLake with a fixed random map for reproducibility.
    env = gym.make(
        "FrozenLake-v1",
        render_mode="ansi",
        desc=generate_random_map(8, seed=24),
        is_slippery=False,
    )
    dp = DPAgent(env="FrozenLake-v1", gamma=0.99)
    # Swap in the custom-mapped environment built above and resize the tables
    # to match its state/action spaces.
    dp.env = env
    dp.env_name = "FrozenLake-v1"
    dp.V = np.zeros(dp.env.observation_space.n)
    # FIX: np.zeros takes the shape as one tuple; two positional ints raise
    # TypeError (the second is interpreted as a dtype).
    dp.Pi = np.zeros((dp.env.observation_space.n, dp.env.action_space.n))
    dp.n_states, dp.n_actions = (
        dp.env.observation_space.n,
        dp.env.action_space.n,
    )
    dp.train()

    print(dp.test())

    # Roll out the learned greedy policy once, printing each ANSI frame.
    state, _ = env.reset()
    done = False
    while not done:
        action = dp.choose_action(dp.Pi, state)
        state, reward, terminated, truncated, _ = env.step(action)
        # FIX: honour Gymnasium's truncation flag too; ignoring it can spin
        # forever if the deterministic policy cycles.
        done = terminated or truncated
        s = env.render()
        print(s)
    # NOTE(review): nothing has been drawn on the active matplotlib figure,
    # so this saves a blank canvas — confirm whether it is still wanted.
    plt.savefig(f"imgs/{0}.png")