Andrei Cozma committed
Commit b8a5bf6 · 1 Parent(s): a7331e4
Files changed (2):
  1. MonteCarloAgent.py +3 -149
  2. Shared.py +163 -0
MonteCarloAgent.py CHANGED
@@ -5,50 +5,14 @@ from tqdm import tqdm
 import argparse
 from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 import wandb
+from .Shared import Shared
 
-
-class MonteCarloAgent:
+class MonteCarloAgent(Shared):
     def __init__(
         self,
-        env_name="CliffWalking-v0",
-        gamma=0.99,
-        epsilon=0.1,
-        run_name=None,
        **kwargs,
    ):
-        print("=" * 80)
-        print(f"# MonteCarloAgent - {env_name}")
-        print(f"- epsilon: {epsilon}")
-        print(f"- gamma: {gamma}")
-        print(f"- run_name: {run_name}")
-        self.run_name = run_name
-        self.env_name = env_name
-        self.epsilon, self.gamma = epsilon, gamma
-
-        self.env_kwargs = kwargs
-        if self.env_name == "FrozenLake-v1":
-            # Can use defaults by defining map_name (4x4 or 8x8) or custom map by defining desc
-            # self.env_kwargs["map_name"] = "8x8"
-            self.env_kwargs["desc"] = [
-                "SFFFFFFF",
-                "FFFFFFFH",
-                "FFFHFFFF",
-                "FFFFFHFF",
-                "FFFHFFFF",
-                "FHHFFFHF",
-                "FHFFHFHF",
-                "FFFHFFFG",
-            ]
-            self.env_kwargs["is_slippery"] = False
-
-        self.env = gym.make(self.env_name, **self.env_kwargs)
-
-        self.n_states, self.n_actions = (
-            self.env.observation_space.n,
-            self.env.action_space.n,
-        )
-        print(f"- n_states: {self.n_states}")
-        print(f"- n_actions: {self.n_actions}")
+        super().__init__(**kwargs)
        self.reset()
 
    def reset(self):
@@ -71,85 +35,6 @@ class MonteCarloAgent:
        print(self.Pi)
        print("=" * 80)
 
-    def choose_action(self, state, epsilon_override=None, greedy=False, **kwargs):
-        # Sample an action from the policy.
-        # The epsilon_override argument allows forcing the use of a new epsilon value than the one previously used during training.
-        # The ability to override was mostly added for testing purposes and for the demo.
-        greedy_action = np.argmax(self.Pi[state])
-
-        if greedy or epsilon_override == 0:
-            return greedy_action
-
-        if epsilon_override is None:
-            return np.random.choice(self.n_actions, p=self.Pi[state])
-
-        return np.random.choice(
-            [greedy_action, np.random.randint(self.n_actions)],
-            p=[1 - epsilon_override, epsilon_override],
-        )
-
-    def generate_episode(self, max_steps=500, render=False, **kwargs):
-        state, _ = self.env.reset()
-        episode_hist, solved, rgb_array = (
-            [],
-            False,
-            self.env.render() if render else None,
-        )
-
-        # Generate an episode following the current policy
-        for _ in range(max_steps):
-            # Sample an action from the policy
-            action = self.choose_action(state, **kwargs)
-            # Take the action and observe the reward and next state
-            next_state, reward, done, _, _ = self.env.step(action)
-
-            if self.env_name == "FrozenLake-v1":
-                if done:
-                    reward = 100 if reward == 1 else -10
-                else:
-                    reward = -1
-
-            # Keeping track of the trajectory
-            episode_hist.append((state, action, reward))
-            yield episode_hist, solved, rgb_array
-
-            # Rendering new frame if needed
-            rgb_array = self.env.render() if render else None
-
-            # For CliffWalking-v0 and Taxi-v3, the episode is solved when it terminates
-            if done and self.env_name in ["CliffWalking-v0", "Taxi-v3"]:
-                solved = True
-                break
-
-            # For FrozenLake-v1, the episode terminates when the agent moves into a hole or reaches the goal
-            # We consider the episode solved when the agent reaches the goal
-            if done and self.env_name == "FrozenLake-v1":
-                if next_state == self.env.nrow * self.env.ncol - 1:
-                    solved = True
-                    break
-                else:
-                    # Instead of terminating the episode when the agent moves into a hole, we reset the environment
-                    # This is to keep consistent with the other environments
-                    done = False
-                    next_state, _ = self.env.reset()
-
-            if solved or done:
-                break
-
-            state = next_state
-
-        rgb_array = self.env.render() if render else None
-        yield episode_hist, solved, rgb_array
-
-    def run_episode(self, max_steps=500, render=False, **kwargs):
-        # Run the generator until the end
-        episode_hist, solved, rgb_array = None, False, None
-        for episode_hist, solved, rgb_array in self.generate_episode(
-            max_steps, render, **kwargs
-        ):
-            pass
-        return episode_hist, solved, rgb_array
-
    def update_first_visit(self, episode_hist):
        G = 0
        # For each step of the episode, in reverse order
@@ -265,37 +150,6 @@ class MonteCarloAgent:
        if log_wandb:
            wandb.log(stats)
 
-    def test(self, n_test_episodes=100, verbose=True, greedy=True, **kwargs):
-        if verbose:
-            print(f"Testing agent for {n_test_episodes} episodes...")
-        num_successes = 0
-        for e in range(n_test_episodes):
-            _, solved, _ = self.run_episode(greedy=greedy, **kwargs)
-            num_successes += solved
-            if verbose:
-                word = "reached" if solved else "did not reach"
-                emoji = "🏁" if solved else "🚫"
-                print(
-                    f"({e + 1:>{len(str(n_test_episodes))}}/{n_test_episodes}) - Agent {word} the goal {emoji}"
-                )
-
-        success_rate = num_successes / n_test_episodes
-        if verbose:
-            print(
-                f"Agent reached the goal in {num_successes}/{n_test_episodes} episodes ({success_rate * 100:.2f}%)"
-            )
-        return success_rate
-
-    def save_policy(self, fname="policy.npy", save_dir=None):
-        if save_dir is not None:
-            os.makedirs(save_dir, exist_ok=True)
-            fname = os.path.join(save_dir, fname)
-        print(f"Saving policy to: {fname}")
-        np.save(fname, self.Pi)
-
-    def load_policy(self, fname="policy.npy"):
-        print(f"Loading policy from: {fname}")
-        self.Pi = np.load(fname)
 
    def wandb_log_img(self, episode=None):
        caption_suffix = "Initial" if episode is None else f"After Episode {episode}"
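
After this change, MonteCarloAgent keeps only its Monte Carlo-specific pieces (the tables built in reset() and the first-visit/every-visit updates), while environment construction, action selection, episode generation, testing, and policy save/load come from the new Shared base class. As a rough illustration of the pattern this enables — not code from this commit, and with a hypothetical QLearningAgent name and update rule — another tabular agent could plug into the same base like this:

import numpy as np
from Shared import Shared  # the repo itself uses the package-relative form: from .Shared import Shared

class QLearningAgent(Shared):  # hypothetical subclass, shown for illustration only
    def __init__(self, alpha=0.1, **kwargs):
        super().__init__(**kwargs)  # Shared builds the env and stores gamma/epsilon/n_states/n_actions
        self.alpha = alpha
        self.reset()

    def reset(self):
        # Tabular action values plus an epsilon-soft policy table Pi,
        # which Shared.choose_action samples actions from.
        self.Q = np.zeros((self.n_states, self.n_actions))
        self.Pi = np.full((self.n_states, self.n_actions), 1.0 / self.n_actions)

    def update(self, state, action, reward, next_state):
        # One-step TD update instead of a Monte Carlo return (illustrative only).
        td_target = reward + self.gamma * np.max(self.Q[next_state])
        self.Q[state, action] += self.alpha * (td_target - self.Q[state, action])
        # Re-derive the epsilon-greedy policy row for this state.
        self.Pi[state] = self.epsilon / self.n_actions
        self.Pi[state, np.argmax(self.Q[state])] += 1.0 - self.epsilon

Because Shared.choose_action samples from self.Pi, a subclass only has to keep Pi up to date for the shared episode-generation and testing code to work unchanged.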
Shared.py ADDED
@@ -0,0 +1,163 @@
+import os
+import numpy as np
+import gymnasium as gym
+from tqdm import tqdm
+import argparse
+import wandb
+
+class Shared:
+
+    def __init__(
+        self,
+        env_name="CliffWalking-v0",
+        gamma=0.99,
+        epsilon=0.1,
+        run_name=None,
+        **kwargs,
+    ):
+        print("=" * 80)
+        print(f"# Init Agent - {env_name}")
+        print(f"- epsilon: {epsilon}")
+        print(f"- gamma: {gamma}")
+        print(f"- run_name: {run_name}")
+        self.run_name = run_name
+        self.env_name = env_name
+        self.epsilon, self.gamma = epsilon, gamma
+
+        self.env_kwargs = kwargs
+        if self.env_name == "FrozenLake-v1":
+            # Can use defaults by defining map_name (4x4 or 8x8) or custom map by defining desc
+            # self.env_kwargs["map_name"] = "8x8"
+            self.env_kwargs["desc"] = [
+                "SFFFFFFF",
+                "FFFFFFFH",
+                "FFFHFFFF",
+                "FFFFFHFF",
+                "FFFHFFFF",
+                "FHHFFFHF",
+                "FHFFHFHF",
+                "FFFHFFFG",
+            ]
+            self.env_kwargs["is_slippery"] = False
+
+        self.env = gym.make(self.env_name, **self.env_kwargs)
+
+        self.n_states, self.n_actions = (
+            self.env.observation_space.n,
+            self.env.action_space.n,
+        )
+        print(f"- n_states: {self.n_states}")
+        print(f"- n_actions: {self.n_actions}")
+
+    def choose_action(self, state, epsilon_override=None, greedy=False, **kwargs):
+        # Sample an action from the policy.
+        # The epsilon_override argument allows forcing the use of a new epsilon value than the one previously used during training.
+        # The ability to override was mostly added for testing purposes and for the demo.
+        greedy_action = np.argmax(self.Pi[state])
+
+        if greedy or epsilon_override == 0:
+            return greedy_action
+
+        if epsilon_override is None:
+            return np.random.choice(self.n_actions, p=self.Pi[state])
+
+        return np.random.choice(
+            [greedy_action, np.random.randint(self.n_actions)],
+            p=[1 - epsilon_override, epsilon_override],
+        )
+
+    def get_policy():
+        pass
+
+
+    def generate_episode(self, max_steps=500, render=False, **kwargs):
+        state, _ = self.env.reset()
+        episode_hist, solved, rgb_array = (
+            [],
+            False,
+            self.env.render() if render else None,
+        )
+
+        # Generate an episode following the current policy
+        for _ in range(max_steps):
+            # Sample an action from the policy
+            action = self.choose_action(state, **kwargs)
+            # Take the action and observe the reward and next state
+            next_state, reward, done, _, _ = self.env.step(action)
+
+            if self.env_name == "FrozenLake-v1":
+                if done:
+                    reward = 100 if reward == 1 else -10
+                else:
+                    reward = -1
+
+            # Keeping track of the trajectory
+            episode_hist.append((state, action, reward))
+            yield episode_hist, solved, rgb_array
+
+            # Rendering new frame if needed
+            rgb_array = self.env.render() if render else None
+
+            # For CliffWalking-v0 and Taxi-v3, the episode is solved when it terminates
+            if done and self.env_name in ["CliffWalking-v0", "Taxi-v3"]:
+                solved = True
+                break
+
+            # For FrozenLake-v1, the episode terminates when the agent moves into a hole or reaches the goal
+            # We consider the episode solved when the agent reaches the goal
+            if done and self.env_name == "FrozenLake-v1":
+                if next_state == self.env.nrow * self.env.ncol - 1:
+                    solved = True
+                    break
+                else:
+                    # Instead of terminating the episode when the agent moves into a hole, we reset the environment
+                    # This is to keep consistent with the other environments
+                    done = False
+                    next_state, _ = self.env.reset()
+
+            if solved or done:
+                break
+
+            state = next_state
+
+        rgb_array = self.env.render() if render else None
+        yield episode_hist, solved, rgb_array
+
+    def run_episode(self, max_steps=500, render=False, **kwargs):
+        # Run the generator until the end
+        episode_hist, solved, rgb_array = list(self.generate_episode(
+            max_steps, render, **kwargs
+        ))[-1]
+        return episode_hist, solved, rgb_array
+
+    def test(self, n_test_episodes=100, verbose=True, greedy=True, **kwargs):
+        if verbose:
+            print(f"Testing agent for {n_test_episodes} episodes...")
+        num_successes = 0
+        for e in range(n_test_episodes):
+            _, solved, _ = self.run_episode(greedy=greedy, **kwargs)
+            num_successes += solved
+            if verbose:
+                word = "reached" if solved else "did not reach"
+                emoji = "🏁" if solved else "🚫"
+                print(
+                    f"({e + 1:>{len(str(n_test_episodes))}}/{n_test_episodes}) - Agent {word} the goal {emoji}"
+                )
+
+        success_rate = num_successes / n_test_episodes
+        if verbose:
+            print(
+                f"Agent reached the goal in {num_successes}/{n_test_episodes} episodes ({success_rate * 100:.2f}%)"
+            )
+        return success_rate
+
+    def save_policy(self, fname="policy.npy", save_dir=None):
+        if save_dir is not None:
+            os.makedirs(save_dir, exist_ok=True)
+            fname = os.path.join(save_dir, fname)
+        print(f"Saving policy to: {fname}")
+        np.save(fname, self.Pi)
+
+    def load_policy(self, fname="policy.npy"):
+        print(f"Loading policy from: {fname}")
+        self.Pi = np.load(fname)
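
For reference, a minimal sketch of how the Shared methods fit together when driven through MonteCarloAgent. The training entry point is not part of this diff and is omitted here; the file and directory names below are placeholders, not values from the repository:

from MonteCarloAgent import MonteCarloAgent  # inside the package: from .MonteCarloAgent import ...

# Shared.__init__ builds the gymnasium env and stores gamma/epsilon;
# MonteCarloAgent.reset() then initializes the policy table Pi.
agent = MonteCarloAgent(env_name="CliffWalking-v0", gamma=0.99, epsilon=0.1)

# Roll out a single epsilon-soft episode; run_episode drains the
# generate_episode generator and returns its final (history, solved, frame) yield.
episode_hist, solved, _ = agent.run_episode(max_steps=500)

# Greedy evaluation, then persist and restore the policy table
# (without training, the greedy policy is whatever reset() initialized).
success_rate = agent.test(n_test_episodes=10, greedy=True)
agent.save_policy(fname="policy_mc_cliffwalking.npy", save_dir="policies")
agent.load_policy("policies/policy_mc_cliffwalking.npy")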