Commit 73cd2cf · 1 Parent(s): f422d2f
Andrei Cozma committed

Updates

- MonteCarloAgent.py +31 -5
- run_tests.py +1 -1
MonteCarloAgent.py CHANGED
@@ -45,7 +45,7 @@ class MonteCarloAgent:
         # Sample an action from the policy
         return np.random.choice(self.n_actions, p=self.Pi[state])

-    def run_episode(self, max_steps=
+    def run_episode(self, max_steps=500, **kwargs):
         state, _ = self.env.reset()
         episode_hist = []
         finished = False
@@ -65,7 +65,7 @@ class MonteCarloAgent:

         return episode_hist, finished

-    def
+    def update_first_visit(self, episode_hist):
         G = 0
         # For each step of the episode, in reverse order
         for t in range(len(episode_hist) - 1, -1, -1):
@@ -84,6 +84,24 @@ class MonteCarloAgent:
                 1 - self.epsilon + self.epsilon / self.n_actions
             )

+    def update_every_visit(self, episode_hist):
+        G = 0
+        # For each step of the episode, in reverse order
+        for t in range(len(episode_hist) - 1, -1, -1):
+            state, action, reward = episode_hist[t]
+            # Update the expected return
+            G = self.gamma * G + reward
+            # We update the Q-table and policy even if we have visited this state-action pair before
+            # This is the every-visit MC method
+            self.R[state][action].append(G)
+            self.Q[state, action] = np.mean(self.R[state][action])
+            # Epsilon-greedy policy update
+            self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
+            # the greedy action is the one with the highest Q-value
+            self.Pi[state, np.argmax(self.Q[state])] = (
+                1 - self.epsilon + self.epsilon / self.n_actions
+            )
+
     def train(self, n_train_episodes=2000, test_every=100, log_wandb=False, **kwargs):
         print(f"Training agent for {n_train_episodes} episodes...")
         train_running_success_rate, test_success_rate = 0.0, 0.0
@@ -104,7 +122,7 @@ class MonteCarloAgent:
             train_running_success_rate = (
                 0.99 * train_running_success_rate + 0.01 * finished
             )
-            self.
+            self.update_first_visit(episode_hist)

             stats = {
                 "train_running_success_rate": train_running_success_rate,
@@ -207,10 +225,18 @@ def main():
     parser.add_argument(
         "--max_steps",
         type=int,
-        default=
+        default=500,
         help="The maximum number of steps per episode before the episode is forced to end. (default: 500)",
     )

+    parser.add_argument(
+        "--update_type",
+        type=str,
+        choices=["first-visit", "every-visit"],
+        default="first-visit",
+        help="The type of update to use. (default: first-visit)",
+    )
+
     parser.add_argument(
         "--no_save",
         action="store_true",
@@ -227,7 +253,7 @@ def main():
     parser.add_argument(
         "--epsilon",
         type=float,
-        default=0.
+        default=0.7,
         help="The value for the epsilon-greedy policy to use. (default: 0.1)",
     )

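Note on usage: the commit adds update_every_visit alongside a new --update_type choice, while train() in this diff still calls self.update_first_visit directly. The following is a minimal sketch, not code from the repository, of how the CLI choice could be routed to the matching update method; the helper name select_update_fn and the idea of passing args.update_type into the training loop are assumptions for illustration.

# Illustrative sketch only -- not part of the commit shown above.
# Assumes a MonteCarloAgent instance exposing the update_first_visit and
# update_every_visit methods from the diff.

def select_update_fn(agent, update_type="first-visit"):
    # Map the CLI choice ("first-visit" / "every-visit") onto the
    # corresponding agent method, e.g. "every-visit" -> agent.update_every_visit.
    return getattr(agent, f"update_{update_type.replace('-', '_')}")

# Hypothetical use inside the training loop:
#   update_fn = select_update_fn(agent, args.update_type)
#   update_fn(episode_hist)

Keeping the mapping in one place would mean a third update rule only requires defining another update_* method on the agent.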
run_tests.py CHANGED
@@ -13,7 +13,7 @@ def run_test(args):
     )


-with multiprocessing.Pool(
+with multiprocessing.Pool(16) as p:
    tests = []
    for gamma in vals_gamma:
        for eps in vals_eps:
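For context on the run_tests.py change: the updated line runs the gamma/epsilon grid through a fixed pool of 16 worker processes. Below is a minimal, self-contained sketch of that pattern; the grid values and the body of run_test are placeholders, since only the pool setup and loop headers are visible in the diff.

# Illustrative sketch of the Pool(16) pattern -- grid values and run_test body are placeholders.
import multiprocessing


def run_test(args):
    gamma, eps = args
    # A real test would launch a training run with these hyperparameters.
    return f"gamma={gamma}, eps={eps}"


if __name__ == "__main__":
    vals_gamma = [0.9, 0.95, 0.99]  # placeholder grid
    vals_eps = [0.1, 0.4, 0.7]      # placeholder grid
    tests = [(gamma, eps) for gamma in vals_gamma for eps in vals_eps]
    with multiprocessing.Pool(16) as p:
        # map blocks until every (gamma, eps) combination has been processed
        results = p.map(run_test, tests)
    print(results)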