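"""Dynamic-programming (value iteration) agent for Gymnasium's FrozenLake.

Runs value iteration to convergence on the tabular transition model, derives a
greedy policy from the learned state-value function, and (when executed as a
script) rolls that policy out on a fixed 8x8 map.
"""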
import gymnasium as gym
import numpy as np
from matplotlib import pyplot as plt


class DPAgent:
    def __init__(self, env_name, gamma=0.9, theta=1e-10, **kwargs):
        self.env = gym.make(env_name, **kwargs)
        self.gamma = gamma
        self.theta = theta
        self.V = np.zeros(self.env.observation_space.n)
        self.epsilon = 0

    def policy(self, state, return_value=False):
        """Greedy one-step lookahead on the current value function.

        Returns the vector of action-values Q(state, .) by default, or the
        greedy action together with its value when return_value=True.
        """
        Q = np.zeros(self.env.action_space.n)
        for action in range(self.env.action_space.n):
            expected_value = 0
            # P[state][action] is the list of (probability, next_state, reward, done)
            # transitions of the tabular dynamics model.
            for probability, next_state, reward, done in self.env.unwrapped.P[state][action]:
                # Hack: treat transitions out of the terminal goal state as rewarding
                # so that the goal keeps a positive value under value iteration.
                if state == self.env.observation_space.n - 1:
                    reward = 1
                expected_value += probability * (reward + self.gamma * self.V[next_state])
            Q[action] = expected_value
        if return_value:
            return np.argmax(Q), np.max(Q)
        return Q

    def train(self):
        """Run value iteration until the value function changes by less than theta."""
        i = 0
        while True:
            delta = 0
            V_prev = np.copy(self.V)
            for state in range(self.env.observation_space.n):
                # Bellman optimality backup: V(s) <- max_a Q(s, a)
                _, value = self.policy(state, return_value=True)
                self.V[state] = value
                delta = max(delta, abs(V_prev[state] - self.V[state]))
            if delta < self.theta:
                break
            i += 1
            print(f"Iteration {i}: delta={delta}")

        # Extract the greedy policy from the converged value function
        policy = [self.policy(state, return_value=True)[0] for state in range(self.env.observation_space.n)]
        return self.V, policy
    
    def save_policy(self, pth):
        np.save(pth, self.V)

    def load_policy(self, pth):
        """
        Not really loading a 'policy' but the state-value function; the name is
        kept for the sake of a consistent interface.
        """
        self.V = np.load(pth)

    def generate_episode(self, max_steps, render=False, **kwargs):
        state, _ = self.env.reset()
        episode_hist, solved, rgb_array = [], False, None

        # Generate an episode following the current greedy policy
        for _ in range(max_steps):
            rgb_array = self.env.render() if render else None
            # Pick the greedy action from the one-step lookahead Q-values
            action = np.argmax(self.policy(state))
            # Take the action and observe the reward and next state
            next_state, reward, done, truncated, _ = self.env.step(action)
            # Keep track of the trajectory
            episode_hist.append((state, action, reward))
            state = next_state

            yield episode_hist, solved, rgb_array

            if done or truncated:
                # 'done' also fires when the agent falls into a hole, so only count
                # the episode as solved if it ended with a positive (goal) reward.
                # In CliffWalking-style environments, stepping off the cliff merely
                # respawns the agent at the start without terminating the episode.
                solved = done and reward > 0
                break

        rgb_array = self.env.render() if render else None

        yield episode_hist, solved, rgb_array


if __name__ == "__main__":
    desc = [
        "SFFFFFFF",
        "FFFFFFFH",
        "FFFHFFFF",
        "FFFFFHFF",
        "FFFHFFFF",
        "FHHFFFHF",
        "FHFFHFHF",
        "FFFHFFFG",
    ]

    # Train on a fixed 8x8 map and save the resulting value function
    dp = DPAgent("FrozenLake-v1", is_slippery=False, desc=desc)
    dp.train()
    dp.save_policy("dp_policy.npy")

    # Roll the greedy policy out on the same map with human rendering
    env = gym.make("FrozenLake-v1", render_mode="human", is_slippery=False, desc=desc)
    state, _ = env.reset()
    done = truncated = False
    while not (done or truncated):
        action = np.argmax(dp.policy(state))
        state, reward, done, truncated, _ = env.step(action)
        env.render()

    # plt.savefig(f"imgs/{0}.png")
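
    # Optional sketch: reload the saved value function and replay an episode
    # through generate_episode instead of retraining. Reuses `desc` from above;
    # adjust the path if the policy was saved elsewhere.
    replay = DPAgent("FrozenLake-v1", is_slippery=False, desc=desc)
    replay.load_policy("dp_policy.npy")
    for episode_hist, solved, _ in replay.generate_episode(max_steps=200):
        pass
    print(f"Replay finished after {len(episode_hist)} steps, solved={solved}")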