Andrei Cozma committed · Commit 53c3925 · 1 Parent(s): 1663f39
MonteCarloAgent.py CHANGED
@@ -27,9 +27,18 @@ class MonteCarloAgent:
 
         self.env_kwargs = kwargs
         if self.env_name == "FrozenLake-v1":
-            self.env_kwargs["desc"] = None
-            self.env_kwargs["map_name"] = "4x4"
-            self.env_kwargs["is_slippery"] = "False"
+            self.env_kwargs["desc"] = [
+                "SFFFFFFF",
+                "FFFFFFFH",
+                "FFFHFFFF",
+                "FFFFFHFF",
+                "FFFHFFFF",
+                "FHHFFFHF",
+                "FHFFHFHF",
+                "FFFHFFFG",
+            ]
+            # self.env_kwargs["map_name"] = "8x8"
+            self.env_kwargs["is_slippery"] = False
 
         self.env = gym.make(self.env_name, **self.env_kwargs)
 
@@ -67,7 +76,7 @@ class MonteCarloAgent:
         # The ability to override was mostly added for testing purposes and for the demo.
         greedy_action = np.argmax(self.Pi[state])
 
-        if greedy:
+        if greedy or epsilon_override == 0:
            return greedy_action
 
        if epsilon_override is None:
@@ -80,21 +89,30 @@ class MonteCarloAgent:
 
     def generate_episode(self, max_steps=500, render=False, **kwargs):
         state, _ = self.env.reset()
-        episode_hist, solved, rgb_array = [], False, None
+        episode_hist, solved, rgb_array = (
+            [],
+            False,
+            self.env.render() if render else None,
+        )
 
         # Generate an episode following the current policy
-        while len(episode_hist) < max_steps:
-            rgb_array = self.env.render() if render else None
-
+        for _ in range(max_steps):
             # Sample an action from the policy
             action = self.choose_action(state, **kwargs)
             # Take the action and observe the reward and next state
-            next_state, reward, done, truncated, _ = self.env.step(action)
+            next_state, reward, done, _, _ = self.env.step(action)
+
+            if self.env_name == "FrozenLake-v1":
+                if done:
+                    reward = 100 if reward == 1 else -10
+                else:
+                    reward = -1
 
             # Keeping track of the trajectory
             episode_hist.append((state, action, reward))
             yield episode_hist, solved, rgb_array
 
+            rgb_array = self.env.render() if render else None
             # For CliffWalking-v0 and Taxi-v3, the episode is solved when it terminates
             if done and (
                 self.env_name == "CliffWalking-v0" or self.env_name == "Taxi-v3"
@@ -103,12 +121,17 @@ class MonteCarloAgent:
                 break
 
             # For FrozenLake-v1, the episode terminates when the agent moves into a hole or reaches the goal
-            # We consider the episode solved when the agent reaches the goal (done == True and reward == 1)
-            if done and self.env_name == "FrozenLake-v1" and reward == 1:
-                solved = True
-                break
-
-            if done or truncated:
+            # We consider the episode solved when the agent reaches the goal
+            if done and self.env_name == "FrozenLake-v1":
+                if next_state == self.env.nrow * self.env.ncol - 1:
+                    solved = True
+                    # print("Solved!")
+                    break
+                else:
+                    done = False
+                    next_state, _ = self.env.reset()
+
+            if solved or done:
                 break
 
             state = next_state
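Note on the FrozenLake changes above: the commit switches to a fixed 8x8 map, fixes the is_slippery flag (previously the string "False", which is truthy in Python), and shapes the reward to -1 per step, -10 for ending in a hole, and +100 for reaching the goal. Below is a minimal standalone sketch of that shaping rule; the helper name shape_frozenlake_reward is illustrative and not part of the commit.

def shape_frozenlake_reward(reward: float, done: bool) -> float:
    # Mirrors the shaping added in generate_episode:
    # -1 per non-terminal step, -10 for terminating in a hole,
    # +100 for terminating at the goal (where the env returns reward == 1).
    if done:
        return 100 if reward == 1 else -10
    return -1

# Quick check of the three cases:
print(shape_frozenlake_reward(1, True))   # 100 (goal)
print(shape_frozenlake_reward(0, True))   # -10 (hole)
print(shape_frozenlake_reward(0, False))  # -1  (ordinary step)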
demo.py CHANGED
@@ -134,6 +134,8 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
        return f"{step + 1}"
 
    for episode in range(n_test_episodes):
+        time.sleep(1.0)
+
        for step, (episode_hist, solved, rgb_array) in enumerate(
            agent.generate_episode(
                max_steps=max_steps, render=True, epsilon_override=live_epsilon
@@ -145,7 +147,7 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
            state, action, reward = episode_hist[-1]
            curr_policy = agent.Pi[state]
 
-            rgb_array_height, rgb_array_width = 128, 512
+            rgb_array_height, rgb_array_width = 150, 512
            rgb_array = cv2.resize(
                rgb_array,
                (
@@ -202,7 +204,7 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
            )
 
            if env_action_map:
-                action_name = env_action_map.get(action, action)
+                action_name = env_action_map.get(action, "")
 
                cv2.putText(
                    policy_viz,
@@ -222,7 +224,7 @@ def run(policy_fname, n_test_episodes, max_steps, render_fps, epsilon):
            )
 
            print(
-                f"Episode: {ep_str(episode + 1)} - step: {step_str(step)} - state: {state} - action: {action} - reward: {reward} (epsilon: {live_epsilon:.2f}) (frame time: {1 / render_fps:.2f}s)"
+                f"Episode: {ep_str(episode + 1)} - step: {step_str(step)} - state: {state} - action: {action} - reward: {reward} (epsilon: {live_epsilon:.2f}) (frame time: {1 / live_render_fps:.2f}s)"
            )
 
            yield agent_type, env_name, rgb_array, policy_viz, ep_str(
@@ -396,5 +398,5 @@ with gr.Blocks(title="CS581 Demo") as demo:
        ],
    )
 
-    demo.queue(concurrency_count=3)
+    demo.queue(concurrency_count=2)
    demo.launch()
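Note on the demo.py changes above: the per-frame time in the log line now uses live_render_fps (presumably the value adjustable from the UI while the demo runs) instead of the initial render_fps argument, and a one-second pause is added between episodes. Below is a minimal sketch of that kind of FPS-based frame pacing; frame_source and treating live_render_fps as a plain module-level variable are illustrative assumptions, since the demo reads the value from a Gradio control.

import time

live_render_fps = 5  # assumed stand-in for the value coming from the UI slider

def paced(frame_source):
    # Hold each frame for roughly 1 / live_render_fps seconds, re-reading the
    # rate on every iteration so a change made mid-run takes effect immediately.
    for frame in frame_source:
        start = time.time()
        yield frame
        elapsed = time.time() - start
        time.sleep(max(0.0, 1.0 / live_render_fps - elapsed))

for frame in paced(range(3)):
    print(f"frame {frame}, target frame time {1.0 / live_render_fps:.2f}s")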
policies/MonteCarloAgent_FrozenLake-v1_e2500_s200_g1.0_e0.2_first_visit.npy ADDED
Binary file (2.18 kB)
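The added .npy file is a pretrained FrozenLake policy (2500 episodes, 200 max steps, gamma 1.0, epsilon 0.2, first-visit MC, going by the filename). Below is a minimal sketch of inspecting it; it assumes the file holds the agent's Pi table saved with np.save, one row of action probabilities per state, which is not verified from this diff.

import numpy as np

# Path taken from the file added in this commit.
policy_path = "policies/MonteCarloAgent_FrozenLake-v1_e2500_s200_g1.0_e0.2_first_visit.npy"

# Assumption: the array is the Pi table, shape (n_states, n_actions).
Pi = np.load(policy_path)
print("Policy table shape:", Pi.shape)

# Greedy action for the start state, matching choose_action's greedy branch.
print("Greedy action in state 0:", int(np.argmax(Pi[0])))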