Andrei Cozma committed
Commit f422d2f · 1 Parent(s): 370ef8f
Files changed (3):
  1. MonteCarloAgent.py +3 -3
  2. README.md +3 -1
  3. run_tests.py +5 -4
MonteCarloAgent.py CHANGED
@@ -45,7 +45,7 @@ class MonteCarloAgent:
         # Sample an action from the policy
         return np.random.choice(self.n_actions, p=self.Pi[state])
 
-    def run_episode(self, max_steps=500, **kwargs):
+    def run_episode(self, max_steps=250, **kwargs):
         state, _ = self.env.reset()
         episode_hist = []
         finished = False
@@ -84,7 +84,7 @@ class MonteCarloAgent:
             1 - self.epsilon + self.epsilon / self.n_actions
         )
 
-    def train(self, n_train_episodes=2500, test_every=100, log_wandb=False, **kwargs):
+    def train(self, n_train_episodes=2000, test_every=100, log_wandb=False, **kwargs):
         print(f"Training agent for {n_train_episodes} episodes...")
         train_running_success_rate, test_success_rate = 0.0, 0.0
         stats = {
@@ -125,7 +125,7 @@ class MonteCarloAgent:
         if log_wandb:
             wandb.log(stats)
 
-    def test(self, n_test_episodes=50, verbose=True, **kwargs):
+    def test(self, n_test_episodes=100, verbose=True, **kwargs):
         if verbose:
             print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
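Note on the unchanged context lines around these new defaults: `self.Pi` is a per-state table of action probabilities, sampled with `np.random.choice` as shown above, and `1 - self.epsilon + self.epsilon / self.n_actions` is the standard epsilon-greedy weight given to the greedy action. A minimal sketch of that pattern outside the class (the helper names and the `Q_state` argument are assumptions for illustration, not code from this commit):

```python
import numpy as np

def epsilon_greedy_probs(Q_state, epsilon, n_actions):
    """Build action probabilities for one state from its Q-values.

    Every action receives epsilon / n_actions probability mass, and the
    greedy action additionally receives the remaining 1 - epsilon, i.e.
    the `1 - epsilon + epsilon / n_actions` weight seen in the diff above.
    """
    probs = np.full(n_actions, epsilon / n_actions)
    probs[np.argmax(Q_state)] = 1 - epsilon + epsilon / n_actions
    return probs

def sample_action(Pi, n_actions, state):
    # Sample an action from the policy, mirroring the context line above.
    return np.random.choice(n_actions, p=Pi[state])
```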
README.md CHANGED
@@ -12,7 +12,9 @@ Evolution of Reinforcement Learning methods from pure Dynamic Programming-based
 
 ## Monte-Carlo Agent
 
-The implementation of the epsilon-greedy Monte-Carlo agent for the [Cliff Walking](https://gymnasium.farama.org/environments/toy_text/cliff_walking/) toy environment as part of Gymnasium.
+The implementation of the Monte-Carlo agent for the [Cliff Walking](https://gymnasium.farama.org/environments/toy_text/cliff_walking/) toy environment.
+
+The agent starts with a randomly initialized epsilon-greedy policy, and uses the first-visit Monte-Carlo method to learn the optimal policy.
 
 ### Training
 
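The first-visit Monte-Carlo update mentioned in the new README paragraph works roughly as sketched below. This is a generic illustration under assumed names (`Q`, `returns_count`, and `episode_hist` as a list of `(state, action, reward)` tuples), not the repository's exact implementation:

```python
import numpy as np

def first_visit_mc_update(Q, returns_count, episode_hist, gamma):
    """One first-visit Monte-Carlo update over a finished episode.

    Q and returns_count are (n_states, n_actions) arrays; only the first
    occurrence of each (state, action) pair in the episode contributes
    a return to its running average.
    """
    G = 0.0
    # Walk the episode backwards, accumulating the discounted return.
    for t in reversed(range(len(episode_hist))):
        state, action, reward = episode_hist[t]
        G = gamma * G + reward
        # First-visit check: skip if this (state, action) pair appears earlier.
        if any(s == state and a == action for s, a, _ in episode_hist[:t]):
            continue
        returns_count[state, action] += 1
        # Incremental mean of the returns observed for this pair.
        Q[state, action] += (G - Q[state, action]) / returns_count[state, action]
    return Q, returns_count

# Usage sketch for Cliff Walking's tabular setting (48 states, 4 actions):
# Q = np.zeros((48, 4)); returns_count = np.zeros((48, 4))
```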
run_tests.py CHANGED
@@ -1,10 +1,10 @@
 import os
 import multiprocessing
-
-vals_eps = [0.1, 0.25, 0.5, 0.75, 0.9]
-vals_gamma = [1.0, 0.99, 0.98, 0.97, 0.95]
+import random
 
 num_tests = 10
+vals_eps = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+vals_gamma = [1.0, 0.99, 0.98, 0.97, 0.95]
 
 
 def run_test(args):
@@ -14,9 +14,10 @@ def run_test(args):
 
 
 with multiprocessing.Pool(8) as p:
-    # make all the tests
     tests = []
     for gamma in vals_gamma:
         for eps in vals_eps:
             tests.extend((gamma, eps, i) for i in range(num_tests))
+    random.shuffle(tests)
+
     p.map(run_test, tests)
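Shuffling `tests` before `p.map` spreads slow and fast hyperparameter combinations across the pool's eight workers instead of handing any one worker a contiguous block of similar runs. The body of `run_test` is not part of these hunks; one plausible shape, assuming `MonteCarloAgent.py` accepts `--gamma` and `--epsilon` command-line flags (a hypothetical interface, not confirmed by this diff), would be:

```python
import os

def run_test(args):
    # Each work item is a (gamma, epsilon, run-index) tuple from the grid above.
    gamma, eps, i = args
    # Hypothetical CLI call; the real flags of MonteCarloAgent.py are not shown in this commit.
    os.system(f"python3 MonteCarloAgent.py --gamma {gamma} --epsilon {eps}")
    print(f"Finished run {i}: gamma={gamma}, eps={eps}")
```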