Andrei Cozma committed
Commit f422d2f · 1 Parent(s): 370ef8f
Files changed (3):
  1. MonteCarloAgent.py +3 -3
  2. README.md +3 -1
  3. run_tests.py +5 -4
MonteCarloAgent.py CHANGED
@@ -45,7 +45,7 @@ class MonteCarloAgent:
         # Sample an action from the policy
         return np.random.choice(self.n_actions, p=self.Pi[state])
 
-    def run_episode(self, max_steps=500, **kwargs):
+    def run_episode(self, max_steps=250, **kwargs):
         state, _ = self.env.reset()
         episode_hist = []
         finished = False
@@ -84,7 +84,7 @@ class MonteCarloAgent:
             1 - self.epsilon + self.epsilon / self.n_actions
         )
 
-    def train(self, n_train_episodes=2500, test_every=100, log_wandb=False, **kwargs):
+    def train(self, n_train_episodes=2000, test_every=100, log_wandb=False, **kwargs):
         print(f"Training agent for {n_train_episodes} episodes...")
         train_running_success_rate, test_success_rate = 0.0, 0.0
         stats = {
@@ -125,7 +125,7 @@ class MonteCarloAgent:
         if log_wandb:
             wandb.log(stats)
 
-    def test(self, n_test_episodes=50, verbose=True, **kwargs):
+    def test(self, n_test_episodes=100, verbose=True, **kwargs):
         if verbose:
             print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
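Note on the unchanged context lines around these new defaults: `self.Pi` is a per-state table of action probabilities, sampled with `np.random.choice` as shown above, and `1 - self.epsilon + self.epsilon / self.n_actions` is the standard epsilon-greedy weight given to the greedy action. A minimal sketch of that pattern outside the class (the helper names and the `Q_state` argument are assumptions for illustration, not code from this commit):

```python
import numpy as np

def epsilon_greedy_probs(Q_state, epsilon, n_actions):
    """Build action probabilities for one state from its Q-values.

    Every action receives epsilon / n_actions probability mass, and the
    greedy action additionally receives the remaining 1 - epsilon, i.e.
    the `1 - epsilon + epsilon / n_actions` weight seen in the diff above.
    """
    probs = np.full(n_actions, epsilon / n_actions)
    probs[np.argmax(Q_state)] = 1 - epsilon + epsilon / n_actions
    return probs

def sample_action(Pi, n_actions, state):
    # Sample an action from the policy, mirroring the context line above.
    return np.random.choice(n_actions, p=Pi[state])
```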
README.md CHANGED
@@ -12,7 +12,9 @@ Evolution of Reinforcement Learning methods from pure Dynamic Programming-based
 
 ## Monte-Carlo Agent
 
-The implementation of the epsilon-greedy Monte-Carlo agent for the [Cliff Walking](https://gymnasium.farama.org/environments/toy_text/cliff_walking/) toy environment as part of Gymnasium.
+The implementation of the Monte-Carlo agent for the [Cliff Walking](https://gymnasium.farama.org/environments/toy_text/cliff_walking/) toy environment.
+
+The agent starts with a randomly initialized epsilon-greedy policy, and uses the first-visit Monte-Carlo method to learn the optimal policy.
 
 ### Training
 
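The first-visit Monte-Carlo update mentioned in the new README paragraph works roughly as sketched below. This is a generic illustration under assumed names (`Q`, `returns_count`, and `episode_hist` as a list of `(state, action, reward)` tuples), not the repository's exact implementation:

```python
import numpy as np

def first_visit_mc_update(Q, returns_count, episode_hist, gamma):
    """One first-visit Monte-Carlo update over a finished episode.

    Q and returns_count are (n_states, n_actions) arrays; only the first
    occurrence of each (state, action) pair in the episode contributes
    a return to its running average.
    """
    G = 0.0
    # Walk the episode backwards, accumulating the discounted return.
    for t in reversed(range(len(episode_hist))):
        state, action, reward = episode_hist[t]
        G = gamma * G + reward
        # First-visit check: skip if this (state, action) pair appears earlier.
        if any(s == state and a == action for s, a, _ in episode_hist[:t]):
            continue
        returns_count[state, action] += 1
        # Incremental mean of the returns observed for this pair.
        Q[state, action] += (G - Q[state, action]) / returns_count[state, action]
    return Q, returns_count

# Usage sketch for Cliff Walking's tabular setting (48 states, 4 actions):
# Q = np.zeros((48, 4)); returns_count = np.zeros((48, 4))
```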
run_tests.py CHANGED
@@ -1,10 +1,10 @@
 import os
 import multiprocessing
-
-vals_eps = [0.1, 0.25, 0.5, 0.75, 0.9]
-vals_gamma = [1.0, 0.99, 0.98, 0.97, 0.95]
+import random
 
 num_tests = 10
+vals_eps = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+vals_gamma = [1.0, 0.99, 0.98, 0.97, 0.95]
 
 
 def run_test(args):
@@ -14,9 +14,10 @@ def run_test(args):
 
 
 with multiprocessing.Pool(8) as p:
-    # make all the tests
     tests = []
     for gamma in vals_gamma:
         for eps in vals_eps:
             tests.extend((gamma, eps, i) for i in range(num_tests))
+    random.shuffle(tests)
+
     p.map(run_test, tests)
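Shuffling `tests` before `p.map` spreads slow and fast hyperparameter combinations across the pool's eight workers instead of handing any one worker a contiguous block of similar runs. The body of `run_test` is not part of these hunks; one plausible shape, assuming `MonteCarloAgent.py` accepts `--gamma` and `--epsilon` command-line flags (a hypothetical interface, not confirmed by this diff), would be:

```python
import os

def run_test(args):
    # Each work item is a (gamma, epsilon, run-index) tuple from the grid above.
    gamma, eps, i = args
    # Hypothetical CLI call; the real flags of MonteCarloAgent.py are not shown in this commit.
    os.system(f"python3 MonteCarloAgent.py --gamma {gamma} --epsilon {eps}")
    print(f"Finished run {i}: gamma={gamma}, eps={eps}")
```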