Merge branch 'main' of github.com:andreicozma1/CS581-Algorithms-Project
Files changed:
- AgentBase.py +19 -9
- DPAgent.py +1 -1
- MCAgent.py +95 -38
- run.py +5 -6
- test_params.py +38 -13
AgentBase.py
CHANGED
@@ -62,22 +62,28 @@ class AgentBase:
         print(f"- n_states: {self.n_states}")
         print(f"- n_actions: {self.n_actions}")

-    def choose_action(self, state, greedy=False, **kwargs):
+    def choose_action(self, policy, state, greedy=False, **kwargs):
         """
         Sample an action from the policy.
         Also allows the ability to override the epsilon value (for the purpose of the demo)
         :param state: The current state
+        :param policy: The policy to sample from. Must be of shape (n_states, n_actions)
         :param greedy: If True, always return the greedy action (argmax of the policy at the current state)
         :return: The sampled action
         """
+        assert policy.shape == (self.n_states, self.n_actions), (
+            f"ERROR: Policy must be of shape (n_states, n_actions) = ({self.n_states}, {self.n_actions}). "
+            f"Got {policy.shape}."
+        )
+
         # If greedy is True, always return the greedy action
-        greedy_action = np.argmax(
+        greedy_action = np.argmax(policy[state])
         if greedy or self.epsilon_override == 0.0:
             return greedy_action

         # Otherwise, sample an action from the soft policy (epsilon-greedy)
         if self.epsilon_override is None:
-            return np.random.choice(self.n_actions, p=
+            return np.random.choice(self.n_actions, p=policy[state])

         # If we ever want to manually override the epsilon value, it happens here
         return np.random.choice(
@@ -85,9 +91,13 @@
             p=[1.0 - self.epsilon_override, self.epsilon_override],
         )

-    def generate_episode(self, max_steps=
+    def generate_episode(self, policy, max_steps=None, render=False, **kwargs):
+        if max_steps is None:
+            # If max_steps is not specified, we use a rough estimate of
+            # the maximum number of steps it should take to solve the environment
+            max_steps = self.n_states * self.n_actions
+
         state, _ = self.env.reset()
-        # action = self.choose_action(state, **kwargs)
         episode_hist, solved, done = [], False, False
         rgb_array = self.env.render() if render else None

@@ -97,7 +107,7 @@
             # Render the environment if needed
             rgb_array = self.env.render() if render else None
             # Sample the next action from the policy
-            action = self.choose_action(state, **kwargs)
+            action = self.choose_action(policy, state, **kwargs)
             # Keeping track of the trajectory
             episode_hist.append((state, action, None))
             # Take the action and observe the reward and next state
@@ -134,10 +144,10 @@
             rgb_array = self.env.render() if render else None
             yield episode_hist, solved, rgb_array

-    def run_episode(self, max_steps=
+    def run_episode(self, policy, max_steps=None, render=False, **kwargs):
         # Run the generator until the end
         episode_hist, solved, rgb_array = list(
-            self.generate_episode(max_steps, render, **kwargs)
+            self.generate_episode(policy, max_steps, render, **kwargs)
         )[-1]
         return episode_hist, solved, rgb_array

@@ -146,7 +156,7 @@
         print(f"Testing agent for {n_test_episodes} episodes...")
         num_successes = 0
         for e in range(n_test_episodes):
-            _, solved, _ = self.run_episode(greedy=greedy, **kwargs)
+            _, solved, _ = self.run_episode(policy=self.Pi, greedy=greedy, **kwargs)
             num_successes += solved
             if verbose:
                 word = "reached" if solved else "did not reach"
DPAgent.py
CHANGED
@@ -125,7 +125,7 @@ if __name__ == "__main__":
     state, _ = env.reset()
     done = False
     while not done:
-        action = dp.choose_action(
+        action = dp.choose_action(dp.Pi, state)
         state, reward, done, _, _ = env.step(action)
         s = env.render()
         print(s)
MCAgent.py
CHANGED
@@ -5,35 +5,70 @@ from AgentBase import AgentBase


 class MCAgent(AgentBase):
-    def __init__(
+    def __init__(
+        self, /, update_type="on-policy", **kwargs  # "on-policy" or "off-policy"
+    ):
         super().__init__(run_name=self.__class__.__name__, **kwargs)
+        self.update_type = update_type
+        self.run_name = f"{self.run_name}_{self.update_type}"
         self.initialize()

     def initialize(self):
         print("Resetting all state variables...")
         # The Q-Table holds the current expected return for each state-action pair
-
-
-
-        #
-        #
-        self.
-
-        )
-
-
-
-
-
-
-
+        # random uniform initialization
+        self.Q = np.random.uniform(-1, 1, size=(self.n_states, self.n_actions))
+        # other alternatives:
+        # self.Q = np.zeros((self.n_states, self.n_actions))
+        # self.Q = np.random.rand(self.n_states, self.n_actions)
+        # self.Q = np.random.normal(0, 1, size=(self.n_states, self.n_actions))
+
+        if self.update_type.startswith("on_policy"):
+            # For On-Policy update type:
+            # R keeps track of all the returns that have been observed for each state-action pair to update Q
+            self.R = [[[] for _ in range(self.n_actions)] for _ in range(self.n_states)]
+            # An arbitrary e-greedy policy:
+            self.Pi = self.create_soft_policy()
+        elif self.update_type.startswith("off_policy"):
+            # For Off-Policy update type:
+            self.C = np.zeros((self.n_states, self.n_actions))
+            # Target policy is greedy with respect to the current Q (ties broken consistently)
+            self.Pi = np.zeros((self.n_states, self.n_actions))
+            self.Pi[np.arange(self.n_states), np.argmax(self.Q, axis=1)] = 1.0
+            # Behavior policy is e-greedy with respect to the current Q
+            self.Pi_behaviour = self.create_soft_policy(coverage_policy=self.Pi)
+        else:
+            raise ValueError(
+                f"update_type must be either 'on_policy' or 'off_policy', but got {self.update_type}"
+            )
         print("=" * 80)
         print("Initial policy:")
         print(self.Pi)
         print("=" * 80)

-    def
-
+    def create_soft_policy(self, coverage_policy=None):
+        """
+        Create a soft policy (epsilon-greedy).
+        If coverage_policy is None, the soft policy is initialized randomly.
+        Otherwise, the soft policy is e-greedy with respect to the coverage policy. (useful for off-policy)
+        """
+        # With probability epsilon, sample an action uniformly at random
+        Pi = np.full((self.n_states, self.n_actions), self.epsilon / self.n_actions)
+        # The greedy action receives the remaining probability mass
+        # If coverage_policy is not provided, the greedy action is sampled randomly
+        # Otherwise we give the remaining probability mass according to the coverage policy
+        Pi[
+            np.arange(self.n_states),
+            np.random.randint(self.n_actions, size=self.n_states)
+            if coverage_policy is None
+            else np.argmax(coverage_policy, axis=1),
+        ] = (
+            1.0 - self.epsilon + self.epsilon / self.n_actions
+        )
+        return Pi
+
+    def update_on_policy(self, episode_hist):
+        G = 0.0
         # For each step of the episode, in reverse order
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
@@ -52,30 +87,51 @@ class MCAgent(AgentBase):
                 1 - self.epsilon + self.epsilon / self.n_actions
            )

-    def update_every_visit(self, episode_hist):
-
-
+    # def update_every_visit(self, episode_hist):
+    #     G = 0
+    #     # Backward pass through the trajectory
+    #     for t in range(len(episode_hist) - 1, -1, -1):
+    #         state, action, reward = episode_hist[t]
+    #         # Updating the expected return
+    #         G = self.gamma * G + reward
+    #         # Every-visit MC method:
+    #         # Updating the expected return and policy for every visit to this state-action pair
+    #         self.R[state][action].append(G)
+    #         self.Q[state, action] = np.mean(self.R[state][action])
+    #         # Updating the epsilon-greedy policy.
+    #         # With probability epsilon, sample an action uniformly at random
+    #         self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
+    #         # The greedy action receives the remaining probability mass
+    #         self.Pi[state, np.argmax(self.Q[state])] = (
+    #             1 - self.epsilon + self.epsilon / self.n_actions
+    #         )
+
+    def update_off_policy(self, episode_hist):
+        G, W = 0.0, 1.0
         for t in range(len(episode_hist) - 1, -1, -1):
             state, action, reward = episode_hist[t]
             # Updating the expected return
             G = self.gamma * G + reward
-
-
-
-            self.Q[state, action]
-            # Updating the
-
-            self.Pi[state] = np.
-
-
-
-
+            self.C[state, action] = self.C[state, action] + W
+            self.Q[state, action] = self.Q[state, action] + (
+                W / self.C[state, action]
+            ) * (G - self.Q[state, action])
+            # Updating the target policy to be greedy with respect to the current Q
+            greedy_action = np.argmax(self.Q[state])
+            self.Pi[state] = np.zeros(self.n_actions)
+            self.Pi[state, greedy_action] = 1.0
+            # If the greedy action is not the action taken by the behavior policy, then break
+            if action != greedy_action:
+                break
+            W = W * (1.0 / self.Pi_behaviour[state, action])
+
+        # Update the behavior policy such that it has coverage of the target policy
+        self.Pi_behaviour = self.create_soft_policy(coverage_policy=self.Pi)

     def train(
         self,
         n_train_episodes=2000,
         test_every=100,
-        update_type="first_visit",
         log_wandb=False,
         save_best=True,
         save_best_dir=None,
@@ -83,7 +139,6 @@ class MCAgent(AgentBase):
         **kwargs,
     ):
         print(f"Training agent for {n_train_episodes} episodes...")
-        self.run_name = f"{self.run_name}_{update_type}"

         (
             train_running_success_rate,
@@ -99,7 +154,7 @@ class MCAgent(AgentBase):
             "avg_ep_len": avg_ep_len,
         }

-        update_func = getattr(self, f"update_{update_type}")
+        update_func = getattr(self, f"update_{self.update_type}")

         tqrange = tqdm(range(n_train_episodes))
         tqrange.set_description("Training")
@@ -108,7 +163,8 @@ class MCAgent(AgentBase):
             self.wandb_log_img(episode=None)

         for e in tqrange:
-
+            policy = self.Pi_behaviour if self.update_type == "off_policy" else self.Pi
+            episode_hist, solved, _ = self.run_episode(policy=policy, **kwargs)
             rewards = [x[2] for x in episode_hist]
             total_reward, avg_reward = sum(rewards), np.mean(rewards)

@@ -129,8 +185,9 @@ class MCAgent(AgentBase):
            }
            tqrange.set_postfix(stats)

-            # Test the agent every test_every episodes
-            if e % test_every == 0:
+            # Test the agent every test_every episodes
+            if test_every > 0 and e % test_every == 0:
+                # For off policy, self.Pi is the target policy. For on policy, self.Pi is the soft policy
                 test_success_rate = self.test(verbose=False, **kwargs)
                 if log_wandb:
                     self.wandb_log_img(episode=e)
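The core of the new update_off_policy method is incremental off-policy Monte-Carlo control with weighted importance sampling: returns are accumulated backwards through the episode, C holds the cumulative importance weights, and the loop breaks as soon as the behaviour action deviates from the greedy target action. Here is a self-contained sketch of that update, using standalone names (Q, C, behaviour_pi) in place of the class attributes:

import numpy as np

def off_policy_mc_update(episode_hist, Q, C, behaviour_pi, gamma=1.0):
    # episode_hist: list of (state, action, reward); Q, C: arrays of shape (n_states, n_actions)
    G, W = 0.0, 1.0
    for state, action, reward in reversed(episode_hist):
        G = gamma * G + reward
        C[state, action] += W
        # Move Q toward the return G by the weighted-importance-sampling step size
        Q[state, action] += (W / C[state, action]) * (G - Q[state, action])
        greedy_action = int(np.argmax(Q[state]))
        # The target policy is greedy w.r.t. Q; if the behaviour action is not greedy,
        # the importance weight of all earlier steps becomes zero, so stop early.
        if action != greedy_action:
            break
        W *= 1.0 / behaviour_pi[state, action]
    return Q, C

After each episode the agent rebuilds the behaviour policy as an epsilon-soft policy around the greedy target (create_soft_policy(coverage_policy=self.Pi)), which keeps the coverage assumption of importance sampling satisfied.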
run.py
CHANGED
@@ -39,8 +39,8 @@ def main():
     parser.add_argument(
         "--max_steps",
         type=int,
-        default=
-        help="The maximum number of steps per episode before the episode is forced to end. (default:
+        default=None,
+        help="The maximum number of steps per episode before the episode is forced to end. If not provided, defaults to the number of states in the environment. (default: None)",
     )

     ### Agent parameters
@@ -68,9 +68,9 @@ def main():
     parser.add_argument(
         "--update_type",
         type=str,
-        choices=["
-        default="
-        help="The type of update to use. Only supported by Monte-Carlo agent. (default:
+        choices=["on_policy", "off_policy"],
+        default="off_policy",
+        help="The type of update to use. Only supported by Monte-Carlo agent. (default: off_policy)",
     )

     ### Environment parameters
@@ -159,7 +159,6 @@ def main():
         test_every=args.test_every,
         n_test_episodes=args.n_test_episodes,
         max_steps=args.max_steps,
-        update_type=args.update_type,
         log_wandb=args.wandb_project is not None,
         save_best=True,
         save_best_dir=args.save_dir,
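With these changes, --update_type configures the Monte-Carlo agent at construction time rather than being forwarded to train(), and --max_steps may be omitted so that generate_episode falls back to its rough estimate. The two argument changes can be checked in isolation; a minimal sketch reproducing them (not run.py itself):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--max_steps",
    type=int,
    default=None,
    help="Maximum steps per episode; None lets the agent estimate it from the state/action space size.",
)
parser.add_argument(
    "--update_type",
    type=str,
    choices=["on_policy", "off_policy"],
    default="off_policy",
)
args = parser.parse_args([])  # parse an empty argument list for a quick check
print(args.max_steps, args.update_type)  # -> None off_policy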
test_params.py
CHANGED
@@ -9,13 +9,13 @@ parser = argparse.ArgumentParser(description="Run parameter tests for MC agent")
 parser.add_argument(
     "--env",
     type=str,
-    default="
+    default="FrozenLake-v1",
     help="environment to run",
 )
 parser.add_argument(
     "--num_tests",
     type=int,
-    default=
+    default=10,
     help="number of tests to run for each parameter combination",
 )
 parser.add_argument(
@@ -31,31 +31,35 @@ env, num_tests, wandb_project = args.env, args.num_tests, args.wandb_project
 agent = "MCAgent"

 vals_update_type = [
-    "
+    # "on_policy",
+    "off_policy",
 ]  # Note: Every visit takes too long due to these environment's reward structure
-vals_gamma = [1.0, 0.98, 0.96, 0.94]
+# vals_gamma = [1.0, 0.98, 0.96, 0.94]
 vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
-
+vals_gamma = [1.0]
 # vals_epsilon = [0.5]

+vals_size = [8, 16, 32, 64]
+
 if env == "CliffWalking-v0":
     n_train_episodes = 2500
-    max_steps = 200
+    # max_steps = 200
 elif env == "FrozenLake-v1":
-    n_train_episodes =
-    max_steps = 200
+    n_train_episodes = 25000
+    # max_steps = 200
 elif env == "Taxi-v3":
     n_train_episodes = 10000
-    max_steps = 500
+    # max_steps = 500
 else:
     raise ValueError(f"Unsupported environment: {env}")


 def run_test(args):
     command = f"python3 run.py --train --agent {agent} --env {env}"
-    command += f" --n_train_episodes {n_train_episodes} --max_steps {max_steps}"
-    command += f" --
-
+    # command += f" --n_train_episodes {n_train_episodes} --max_steps {max_steps}"
+    command += f" --n_train_episodes {n_train_episodes}"
+    for k, v in args.items():
+        command += f" --{k} {v}"
     if wandb_project is not None:
         command += f" --wandb_project {wandb_project}"
     command += " --no_save"
@@ -67,7 +71,28 @@ with multiprocessing.Pool(8) as p:
     for update_type in vals_update_type:
         for gamma in vals_gamma:
             for eps in vals_epsilon:
-
+                if env == "FrozenLake-v1":
+                    for size in vals_size:
+                        tests.extend(
+                            {
+                                "gamma": gamma,
+                                "epsilon": eps,
+                                "update_type": update_type,
+                                "size": size,
+                                "run_name_suffix": i,
+                            }
+                            for i in range(num_tests)
+                        )
+                else:
+                    tests.extend(
+                        {
+                            "gamma": gamma,
+                            "epsilon": eps,
+                            "update_type": update_type,
+                            "run_name_suffix": i,
+                        }
+                        for i in range(num_tests)
+                    )
     random.shuffle(tests)

     p.map(run_test, tests)
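The grid construction above repeats the parameter dictionary for the FrozenLake-specific size sweep. An equivalent, de-duplicated way to build the same list of test configurations, as a sketch using the script's variable names (build_tests is a hypothetical helper, not part of the repository):

import itertools

def build_tests(env, vals_update_type, vals_gamma, vals_epsilon, vals_size, num_tests):
    # Builds the same parameter grid without duplicating the dict literal
    tests = []
    for update_type, gamma, eps in itertools.product(vals_update_type, vals_gamma, vals_epsilon):
        sizes = vals_size if env == "FrozenLake-v1" else [None]
        for size in sizes:
            for i in range(num_tests):
                params = {"gamma": gamma, "epsilon": eps,
                          "update_type": update_type, "run_name_suffix": i}
                if size is not None:
                    params["size"] = size  # FrozenLake map size, forwarded as --size
                tests.append(params)
    return tests

The shuffling and p.map dispatch then work unchanged on the returned list.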