import os
import numpy as np
import gymnasium as gym

class Shared:
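    # Shared plumbing for tabular agents: environment setup, action selection,
    # episode generation/rollout, testing, and policy persistence.
    # Subclasses are expected to define self.Pi, an (n_states, n_actions)
    # array whose rows are probability distributions over actions, since
    # choose_action samples with p=self.Pi[state].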
    
    def __init__(
        self,
        env="CliffWalking-v0",
        gamma=0.99,
        epsilon=0.1,
        run_name=None,
        **kwargs,
    ):
        print("=" * 80)
        print(f"# Init Agent - {env}")
        print(f"- epsilon: {epsilon}")
        print(f"- gamma: {gamma}")
        print(f"- run_name: {run_name}")
        self.run_name = run_name
        self.env_name = env
        self.epsilon, self.gamma = epsilon, gamma

        self.env_kwargs = {k: v for k, v in kwargs.items() if k in ["render_mode"]}
        if self.env_name == "FrozenLake-v1":
            # Use a built-in map by setting map_name ("4x4" or "8x8"), or a custom map by setting desc
            # self.env_kwargs["map_name"] = "8x8"
            self.env_kwargs["desc"] = [
                "SFFFFFFF",
                "FFFFFFFH",
                "FFFHFFFF",
                "FFFFFHFF",
                "FFFHFFFF",
                "FHHFFFHF",
                "FHFFHFHF",
                "FFFHFFFG",
            ]
            self.env_kwargs["is_slippery"] = False

        self.env = gym.make(self.env_name, **self.env_kwargs)

        self.n_states, self.n_actions = (
            self.env.observation_space.n,
            self.env.action_space.n,
        )
        print(f"- n_states: {self.n_states}")
        print(f"- n_actions: {self.n_actions}")

    def choose_action(self, state, epsilon_override=None, greedy=False, **kwargs):
        # Sample an action from the current policy.
        # epsilon_override forces an epsilon-greedy choice with a different
        # epsilon than the one used during training; the override was added
        # mostly for testing purposes and for the demo.
        greedy_action = np.argmax(self.Pi[state])

        if greedy or epsilon_override == 0:
            return greedy_action

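        # By default, sample from the stochastic policy's action distribution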
        if epsilon_override is None:
            return np.random.choice(self.n_actions, p=self.Pi[state])

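        # Epsilon-greedy with the overridden epsilon: keep the greedy action
        # with probability 1 - epsilon, otherwise draw a uniformly random
        # action (which may itself coincide with the greedy one)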
        return np.random.choice(
            [greedy_action, np.random.randint(self.n_actions)],
            p=[1 - epsilon_override, epsilon_override],
        )
        
    def generate_episode(self, max_steps=500, render=False, **kwargs):
        state, _ = self.env.reset()
        episode_hist, solved, rgb_array = (
            [],
            False,
            self.env.render() if render else None,
        )

        # Generate an episode following the current policy
        for _ in range(max_steps):
            # Sample an action from the policy
            action = self.choose_action(state, **kwargs)
            # Take the action and observe the reward and next state
            next_state, reward, terminated, truncated, _ = self.env.step(action)
            done = terminated or truncated

            if self.env_name == "FrozenLake-v1":
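                # Reward shaping: +100 for reaching the goal, -10 for falling
                # into a hole, -1 per step to encourage shorter paths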
                if terminated:
                    reward = 100 if reward == 1 else -10
                else:
                    reward = -1

            # Keeping track of the trajectory
            episode_hist.append((state, action, reward))
            yield episode_hist, solved, rgb_array

            # Rendering new frame if needed
            rgb_array = self.env.render() if render else None

            # For CliffWalking-v0 and Taxi-v3, the episode is solved when it terminates
            if terminated and self.env_name in ["CliffWalking-v0", "Taxi-v3"]:
                solved = True
                break

            # For FrozenLake-v1, the episode terminates when the agent moves into a hole or reaches the goal
            # We consider the episode solved when the agent reaches the goal
            if terminated and self.env_name == "FrozenLake-v1":
                if next_state == self.env.unwrapped.nrow * self.env.unwrapped.ncol - 1:
                    solved = True
                    break
                else:
                    # Instead of ending the episode when the agent falls into a hole,
                    # we reset the environment to keep behavior consistent with the other environments
                    done = False
                    next_state, _ = self.env.reset()

            if solved or done:
                break

            state = next_state

        rgb_array = self.env.render() if render else None
        yield episode_hist, solved, rgb_array

    def run_episode(self, max_steps=500, render=False, **kwargs):
        # Drain the generator and keep only the final yielded result
        # (avoids buffering every intermediate render frame in memory)
        for episode_hist, solved, rgb_array in self.generate_episode(
            max_steps, render, **kwargs
        ):
            pass
        return episode_hist, solved, rgb_array

    def test(self, n_test_episodes=100, verbose=True, greedy=True, **kwargs):
        if verbose:
            print(f"Testing agent for {n_test_episodes} episodes...")
        num_successes = 0
        for e in range(n_test_episodes):
            _, solved, _ = self.run_episode(greedy=greedy, **kwargs)
            num_successes += solved
            if verbose:
                word = "reached" if solved else "did not reach"
                emoji = "🏁" if solved else "🚫"
                print(
                    f"({e + 1:>{len(str(n_test_episodes))}}/{n_test_episodes}) - Agent {word} the goal {emoji}"
                )

        success_rate = num_successes / n_test_episodes
        if verbose:
            print(
                f"Agent reached the goal in {num_successes}/{n_test_episodes} episodes ({success_rate * 100:.2f}%)"
            )
        return success_rate
    
    def save_policy(self, fname="policy.npy", save_dir=None):
        if save_dir is not None:
            os.makedirs(save_dir, exist_ok=True)
            fname = os.path.join(save_dir, fname)
        print(f"Saving policy to: {fname}")
        np.save(fname, self.Pi)

    def load_policy(self, fname="policy.npy"):
        print(f"Loading policy from: {fname}")
        self.Pi = np.load(fname)
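

if __name__ == "__main__":
    # Minimal usage sketch (hypothetical): Shared defines no policy of its
    # own, so we seed self.Pi with a uniform random policy purely to exercise
    # the rollout/testing plumbing; real agents subclass Shared and learn Pi.
    agent = Shared(env="CliffWalking-v0", gamma=0.99, epsilon=0.1)
    agent.Pi = np.full((agent.n_states, agent.n_actions), 1 / agent.n_actions)
    # With a uniform policy, a non-greedy rollout is a random walk, so the
    # success rate is expected to be near zero; this only checks the plumbing
    agent.test(n_test_episodes=5, greedy=False)
    agent.save_policy("uniform_policy.npy")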