Commit f422d2f · Andrei Cozma · 1 parent: 370ef8f

Updates

Files changed:
- MonteCarloAgent.py +3 -3
- README.md +3 -1
- run_tests.py +5 -4
MonteCarloAgent.py (CHANGED)

@@ -45,7 +45,7 @@ class MonteCarloAgent:
         # Sample an action from the policy
         return np.random.choice(self.n_actions, p=self.Pi[state])
 
-    def run_episode(self, max_steps=…
+    def run_episode(self, max_steps=250, **kwargs):
         state, _ = self.env.reset()
         episode_hist = []
         finished = False
@@ -84,7 +84,7 @@ class MonteCarloAgent:
             1 - self.epsilon + self.epsilon / self.n_actions
         )
 
-    def train(self, n_train_episodes=…
+    def train(self, n_train_episodes=2000, test_every=100, log_wandb=False, **kwargs):
         print(f"Training agent for {n_train_episodes} episodes...")
         train_running_success_rate, test_success_rate = 0.0, 0.0
         stats = {
@@ -125,7 +125,7 @@ class MonteCarloAgent:
         if log_wandb:
             wandb.log(stats)
 
-    def test(self, n_test_episodes=…
+    def test(self, n_test_episodes=100, verbose=True, **kwargs):
         if verbose:
             print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
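The changed lines only adjust the default keyword arguments (max_steps=250, n_train_episodes=2000, test_every=100, n_test_episodes=100) and accept extra **kwargs. A minimal usage sketch of the updated interface, assuming a constructor that takes a Gymnasium environment (the constructor is not shown in this diff):

# Sketch only: MonteCarloAgent's constructor is not part of this diff,
# so the call below assumes it accepts a Gymnasium environment.
import gymnasium as gym

env = gym.make("CliffWalking-v0")
agent = MonteCarloAgent(env)  # assumed constructor signature

# With the new defaults these calls also work without arguments;
# values are passed explicitly here just to show the updated signatures.
agent.train(n_train_episodes=2000, test_every=100, log_wandb=False)
agent.test(n_test_episodes=100, verbose=True)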
README.md (CHANGED)

@@ -12,7 +12,9 @@ Evolution of Reinforcement Learning methods from pure Dynamic Programming-based
 
 ## Monte-Carlo Agent
 
-The implementation of the …
+The implementation of the Monte-Carlo agent for the [Cliff Walking](https://gymnasium.farama.org/environments/toy_text/cliff_walking/) toy environment.
+
+The agent starts with a randomly initialized epsilon-greedy policy, and uses the first-visit Monte-Carlo method to learn the optimal policy.
 
 ### Training
 
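The added README text names the learning rule, but the update itself is not part of this diff. Below is a minimal first-visit Monte-Carlo control sketch; the names (first_visit_mc_update, Q, returns_count) and the assumption that episode_hist holds (state, action, reward) tuples are illustrative, not taken from the repository.

from collections import defaultdict

def first_visit_mc_update(Q, returns_count, episode_hist, gamma):
    """One first-visit Monte-Carlo update from a single episode.

    Q             : dict mapping (state, action) -> estimated return
    returns_count : dict mapping (state, action) -> number of first visits seen
    episode_hist  : list of (state, action, reward) tuples, oldest first
    """
    G = 0.0
    for t in reversed(range(len(episode_hist))):
        state, action, reward = episode_hist[t]
        G = gamma * G + reward
        # First-visit rule: only update when (state, action) does not occur earlier.
        if any((s, a) == (state, action) for s, a, _ in episode_hist[:t]):
            continue
        returns_count[(state, action)] += 1
        # Incremental average of the returns observed after first visits.
        Q[(state, action)] += (G - Q[(state, action)]) / returns_count[(state, action)]
    return Q

# Usage: Q = defaultdict(float); returns_count = defaultdict(int).
# After each update, an epsilon-greedy policy puts probability
# 1 - epsilon + epsilon / n_actions on the greedy action and
# epsilon / n_actions on the others, matching the expression visible
# at line 84 of MonteCarloAgent.py.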
run_tests.py (CHANGED)

@@ -1,10 +1,10 @@
 import os
 import multiprocessing
-
-vals_eps = [0.1, 0.25, 0.5, 0.75, 0.9]
-vals_gamma = [1.0, 0.99, 0.98, 0.97, 0.95]
+import random
 
 num_tests = 10
+vals_eps = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
+vals_gamma = [1.0, 0.99, 0.98, 0.97, 0.95]
 
 
 def run_test(args):
@@ -14,9 +14,10 @@ def run_test(args):
 
 
 with multiprocessing.Pool(8) as p:
-    # make all the tests
     tests = []
     for gamma in vals_gamma:
         for eps in vals_eps:
             tests.extend((gamma, eps, i) for i in range(num_tests))
+    random.shuffle(tests)
+
     p.map(run_test, tests)
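The body of run_test falls outside the hunks shown here; a plausible sketch is below. The run.py entry point and flag names are assumptions for illustration; the only things this diff confirms are that each work item is a (gamma, eps, i) tuple and that the os module is imported.

# Hypothetical body for run_test; entry point and flags are illustrative,
# not taken from the repository.
def run_test(args):
    gamma, eps, run_id = args
    os.system(
        f"python run.py --agent MonteCarlo "
        f"--gamma {gamma} --epsilon {eps} --run_id {run_id}"
    )

Shuffling tests before p.map presumably spreads cheap and expensive (gamma, eps) settings more evenly across the pool's 8 workers, since p.map hands each worker contiguous chunks of the list.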