Andrei Cozma committed
Commit 73cd2cf · Parent(s): f422d2f
Files changed (2):
  1. MonteCarloAgent.py +31 -5
  2. run_tests.py +1 -1
MonteCarloAgent.py CHANGED
@@ -45,7 +45,7 @@ class MonteCarloAgent:
         # Sample an action from the policy
         return np.random.choice(self.n_actions, p=self.Pi[state])
 
-    def run_episode(self, max_steps=250, **kwargs):
+    def run_episode(self, max_steps=500, **kwargs):
         state, _ = self.env.reset()
         episode_hist = []
         finished = False
@@ -65,7 +65,7 @@ class MonteCarloAgent:
 
         return episode_hist, finished
 
-    def update(self, episode_hist):
+    def update_first_visit(self, episode_hist):
         G = 0
         # For each step of the episode, in reverse order
         for t in range(len(episode_hist) - 1, -1, -1):
@@ -84,6 +84,24 @@ class MonteCarloAgent:
                 1 - self.epsilon + self.epsilon / self.n_actions
             )
 
+    def update_every_visit(self, episode_hist):
+        G = 0
+        # For each step of the episode, in reverse order
+        for t in range(len(episode_hist) - 1, -1, -1):
+            state, action, reward = episode_hist[t]
+            # Update the expected return
+            G = self.gamma * G + reward
+            # We update the Q-table and policy even if we have visited this state-action pair before
+            # This is the every-visit MC method
+            self.R[state][action].append(G)
+            self.Q[state, action] = np.mean(self.R[state][action])
+            # Epsilon-greedy policy update
+            self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
+            # the greedy action is the one with the highest Q-value
+            self.Pi[state, np.argmax(self.Q[state])] = (
+                1 - self.epsilon + self.epsilon / self.n_actions
+            )
+
     def train(self, n_train_episodes=2000, test_every=100, log_wandb=False, **kwargs):
         print(f"Training agent for {n_train_episodes} episodes...")
         train_running_success_rate, test_success_rate = 0.0, 0.0
@@ -104,7 +122,7 @@ class MonteCarloAgent:
             train_running_success_rate = (
                 0.99 * train_running_success_rate + 0.01 * finished
             )
-            self.update(episode_hist)
+            self.update_first_visit(episode_hist)
 
             stats = {
                 "train_running_success_rate": train_running_success_rate,
@@ -207,10 +225,18 @@ def main():
     parser.add_argument(
         "--max_steps",
         type=int,
-        default=250,
+        default=500,
         help="The maximum number of steps per episode before the episode is forced to end. (default: 500)",
     )
 
+    parser.add_argument(
+        "--update_type",
+        type=str,
+        choices=["first-visit", "every-visit"],
+        default="first-visit",
+        help="The type of update to use. (default: first-visit)",
+    )
+
     parser.add_argument(
         "--no_save",
         action="store_true",
@@ -227,7 +253,7 @@ def main():
     parser.add_argument(
         "--epsilon",
        type=float,
-        default=0.5,
+        default=0.7,
         help="The value for the epsilon-greedy policy to use. (default: 0.1)",
     )
 
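The rename update -> update_first_visit pairs with the new update_every_visit: the only behavioral difference between the two rules is whether a (state, action) pair that occurs several times in one episode contributes one return sample or several. The diff shows only the first-visit method's signature, so the guard below is a sketch of what such a body typically looks like, not the repository's verified implementation; it assumes the same (state, action, reward) tuples in episode_hist:

    def update_first_visit(self, episode_hist):
        G = 0
        # Iterate the episode backwards, accumulating the discounted return
        for t in range(len(episode_hist) - 1, -1, -1):
            state, action, reward = episode_hist[t]
            G = self.gamma * G + reward
            # Only record G if (state, action) does not occur earlier in the
            # episode -- this guard is what makes the update "first-visit"
            if all((s, a) != (state, action) for s, a, _ in episode_hist[:t]):
                self.R[state][action].append(G)
                self.Q[state, action] = np.mean(self.R[state][action])
                # Epsilon-greedy policy improvement around the new greedy action
                self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
                self.Pi[state, np.argmax(self.Q[state])] = (
                    1 - self.epsilon + self.epsilon / self.n_actions
                )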
 
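The two Pi assignments shared by both update methods implement an epsilon-soft policy: every action gets probability epsilon / n_actions, and the greedy action gets the remaining 1 - epsilon on top, so each row of Pi sums to 1. A quick numeric check with the commit's new default epsilon = 0.7 and an illustrative 4-action space (the action count is an assumption):

    import numpy as np

    epsilon, n_actions = 0.7, 4  # n_actions = 4 is illustrative
    Pi_row = np.full(n_actions, epsilon / n_actions)  # every action: 0.175
    Pi_row[2] = 1 - epsilon + epsilon / n_actions     # greedy action: 0.475
    assert np.isclose(Pi_row.sum(), 1.0)              # 3 * 0.175 + 0.475 == 1.0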
 
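As committed, train() calls update_first_visit directly, so the new --update_type choice would need to be routed to the matching method somewhere. One hypothetical wiring, assuming the parsed arguments are available as args and the agent instance as agent (this dispatch is not code from the commit):

    # "first-visit" -> update_first_visit, "every-visit" -> update_every_visit
    update_fn = getattr(agent, "update_" + args.update_type.replace("-", "_"))
    update_fn(episode_hist)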
 
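Assuming the script is invoked directly, the new flag and defaults would be exercised like this (argparse takes the effective values from default=, not from the help strings, which still advertise an older epsilon of 0.1):

    python MonteCarloAgent.py --update_type every-visit --epsilon 0.7 --max_steps 500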
run_tests.py CHANGED
@@ -13,7 +13,7 @@ def run_test(args):
     )
 
 
-with multiprocessing.Pool(8) as p:
+with multiprocessing.Pool(16) as p:
     tests = []
     for gamma in vals_gamma:
        for eps in vals_eps:
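
For context, the surrounding code presumably builds one argument tuple per (gamma, epsilon) combination and fans them out across the pool, so growing the pool from 8 to 16 workers lets twice as many parameter combinations run concurrently. A self-contained sketch of that pattern; the grid values, the worker body, and the p.map call are assumptions mirroring the identifiers in the diff:

    import multiprocessing

    # Hypothetical parameter grids matching the loop variables above
    vals_gamma = [0.9, 0.95, 0.99]
    vals_eps = [0.1, 0.4, 0.7]

    def run_test(args):
        gamma, eps = args
        # Placeholder: train/test one agent configuration and return its result
        return (gamma, eps)

    if __name__ == "__main__":
        with multiprocessing.Pool(16) as p:
            tests = []
            for gamma in vals_gamma:
                for eps in vals_eps:
                    tests.append((gamma, eps))
            results = p.map(run_test, tests)  # one task per (gamma, eps) pair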