Spaces:
Sleeping
Sleeping
Andrei Cozma
committed on
Commit
·
6a48762
1
Parent(s):
50efa30
Updates
Browse files
- AgentBase.py +14 -8
- DPAgent.py +1 -1
- MCAgent.py +83 -36
- run.py +3 -4
AgentBase.py
CHANGED
@@ -62,22 +62,28 @@ class AgentBase:
|
|
62 |
print(f"- n_states: {self.n_states}")
|
63 |
print(f"- n_actions: {self.n_actions}")
|
64 |
|
65 |
-
def choose_action(self, state, greedy=False, **kwargs):
|
66 |
"""
|
67 |
Sample an action from the policy.
|
68 |
Also allows the ability to override the epsilon value (for the purpose of the demo)
|
69 |
:param state: The current state
|
|
|
70 |
:param greedy: If True, always return the greedy action (argmax of the policy at the current state)
|
71 |
:return: The sampled action
|
72 |
"""
|
|
|
|
|
|
|
|
|
|
|
73 |
# If greedy is True, always return the greedy action
|
74 |
-
greedy_action = np.argmax(
|
75 |
if greedy or self.epsilon_override == 0.0:
|
76 |
return greedy_action
|
77 |
|
78 |
# Otherwise, sample an action from the soft policy (epsilon-greedy)
|
79 |
if self.epsilon_override is None:
|
80 |
-
return np.random.choice(self.n_actions, p=
|
81 |
|
82 |
# If we ever want to manually override the epsilon value, it happens here
|
83 |
return np.random.choice(
|
@@ -85,7 +91,7 @@ class AgentBase:
|
|
85 |
p=[1.0 - self.epsilon_override, self.epsilon_override],
|
86 |
)
|
87 |
|
88 |
-
def generate_episode(self, max_steps=500, render=False, **kwargs):
|
89 |
state, _ = self.env.reset()
|
90 |
# action = self.choose_action(state, **kwargs)
|
91 |
episode_hist, solved, done = [], False, False
|
@@ -97,7 +103,7 @@ class AgentBase:
|
|
97 |
# Render the environment if needed
|
98 |
rgb_array = self.env.render() if render else None
|
99 |
# Sample the next action from the policy
|
100 |
-
action = self.choose_action(state, **kwargs)
|
101 |
# Keeping track of the trajectory
|
102 |
episode_hist.append((state, action, None))
|
103 |
# Take the action and observe the reward and next state
|
@@ -134,10 +140,10 @@ class AgentBase:
|
|
134 |
rgb_array = self.env.render() if render else None
|
135 |
yield episode_hist, solved, rgb_array
|
136 |
|
137 |
-
def run_episode(self, max_steps=500, render=False, **kwargs):
|
138 |
# Run the generator until the end
|
139 |
episode_hist, solved, rgb_array = list(
|
140 |
-
self.generate_episode(max_steps, render, **kwargs)
|
141 |
)[-1]
|
142 |
return episode_hist, solved, rgb_array
|
143 |
|
@@ -146,7 +152,7 @@ class AgentBase:
|
|
146 |
print(f"Testing agent for {n_test_episodes} episodes...")
|
147 |
num_successes = 0
|
148 |
for e in range(n_test_episodes):
|
149 |
-
_, solved, _ = self.run_episode(greedy=greedy, **kwargs)
|
150 |
num_successes += solved
|
151 |
if verbose:
|
152 |
word = "reached" if solved else "did not reach"
|
|
|
62 |
print(f"- n_states: {self.n_states}")
|
63 |
print(f"- n_actions: {self.n_actions}")
|
64 |
|
65 |
+
def choose_action(self, policy, state, greedy=False, **kwargs):
|
66 |
"""
|
67 |
Sample an action from the policy.
|
68 |
Also allows the ability to override the epsilon value (for the purpose of the demo)
|
69 |
:param state: The current state
|
70 |
+
:param policy: The policy to sample from. Must be of shape (n_states, n_actions)
|
71 |
:param greedy: If True, always return the greedy action (argmax of the policy at the current state)
|
72 |
:return: The sampled action
|
73 |
"""
|
74 |
+
assert policy.shape == (self.n_states, self.n_actions), (
|
75 |
+
f"ERROR: Policy must be of shape (n_states, n_actions) = ({self.n_states}, {self.n_actions}). "
|
76 |
+
f"Got {policy.shape}."
|
77 |
+
)
|
78 |
+
|
79 |
# If greedy is True, always return the greedy action
|
80 |
+
greedy_action = np.argmax(policy[state])
|
81 |
if greedy or self.epsilon_override == 0.0:
|
82 |
return greedy_action
|
83 |
|
84 |
# Otherwise, sample an action from the soft policy (epsilon-greedy)
|
85 |
if self.epsilon_override is None:
|
86 |
+
return np.random.choice(self.n_actions, p=policy[state])
|
87 |
|
88 |
# If we ever want to manually override the epsilon value, it happens here
|
89 |
return np.random.choice(
|
|
|
91 |
p=[1.0 - self.epsilon_override, self.epsilon_override],
|
92 |
)
|
93 |
|
94 |
+
def generate_episode(self, policy, max_steps=500, render=False, **kwargs):
|
95 |
state, _ = self.env.reset()
|
96 |
# action = self.choose_action(state, **kwargs)
|
97 |
episode_hist, solved, done = [], False, False
|
|
|
103 |
# Render the environment if needed
|
104 |
rgb_array = self.env.render() if render else None
|
105 |
# Sample the next action from the policy
|
106 |
+
action = self.choose_action(policy, state, **kwargs)
|
107 |
# Keeping track of the trajectory
|
108 |
episode_hist.append((state, action, None))
|
109 |
# Take the action and observe the reward and next state
|
|
|
140 |
rgb_array = self.env.render() if render else None
|
141 |
yield episode_hist, solved, rgb_array
|
142 |
|
143 |
+
def run_episode(self, policy, max_steps=500, render=False, **kwargs):
|
144 |
# Run the generator until the end
|
145 |
episode_hist, solved, rgb_array = list(
|
146 |
+
self.generate_episode(policy, max_steps, render, **kwargs)
|
147 |
)[-1]
|
148 |
return episode_hist, solved, rgb_array
|
149 |
|
|
|
152 |
print(f"Testing agent for {n_test_episodes} episodes...")
|
153 |
num_successes = 0
|
154 |
for e in range(n_test_episodes):
|
155 |
+
_, solved, _ = self.run_episode(policy=self.Pi, greedy=greedy, **kwargs)
|
156 |
num_successes += solved
|
157 |
if verbose:
|
158 |
word = "reached" if solved else "did not reach"
|
DPAgent.py
CHANGED
@@ -103,7 +103,7 @@ if __name__ == "__main__":
|
|
103 |
state, _ = env.reset()
|
104 |
done = False
|
105 |
while not done:
|
106 |
-
action = dp.choose_action(state)
|
107 |
state, reward, done, _, _ = env.step(action)
|
108 |
env.render()
|
109 |
|
|
|
103 |
state, _ = env.reset()
|
104 |
done = False
|
105 |
while not done:
|
106 |
+
action = dp.choose_action(dp.Pi, state)
|
107 |
state, reward, done, _, _ = env.step(action)
|
108 |
env.render()
|
109 |
|
MCAgent.py
CHANGED
@@ -5,35 +5,60 @@ from AgentBase import AgentBase
|
|
5 |
|
6 |
|
7 |
class MCAgent(AgentBase):
|
8 |
-
def __init__(
|
|
|
|
|
9 |
super().__init__(run_name=self.__class__.__name__, **kwargs)
|
|
|
|
|
10 |
self.initialize()
|
11 |
|
12 |
def initialize(self):
|
13 |
print("Resetting all state variables...")
|
14 |
# The Q-Table holds the current expected return for each state-action pair
|
15 |
-
self.Q = np.
|
16 |
-
#
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
# An arbitrary e-greedy policy:
|
19 |
# With probability epsilon, sample an action uniformly at random
|
20 |
-
|
21 |
-
(self.n_states, self.n_actions), self.epsilon / self.n_actions
|
22 |
-
)
|
23 |
# For the initial policy, we randomly select a greedy action for each state
|
24 |
-
|
25 |
np.arange(self.n_states),
|
26 |
-
np.random.randint(self.n_actions, size=self.n_states)
|
|
|
|
|
27 |
] = (
|
28 |
-
1 - self.epsilon + self.epsilon / self.n_actions
|
29 |
)
|
30 |
-
|
31 |
-
print("Initial policy:")
|
32 |
-
print(self.Pi)
|
33 |
-
print("=" * 80)
|
34 |
|
35 |
-
def
|
36 |
-
G = 0
|
37 |
# For each step of the episode, in reverse order
|
38 |
for t in range(len(episode_hist) - 1, -1, -1):
|
39 |
state, action, reward = episode_hist[t]
|
@@ -52,30 +77,51 @@ class MCAgent(AgentBase):
|
|
52 |
1 - self.epsilon + self.epsilon / self.n_actions
|
53 |
)
|
54 |
|
55 |
-
def update_every_visit(self, episode_hist):
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
for t in range(len(episode_hist) - 1, -1, -1):
|
59 |
state, action, reward = episode_hist[t]
|
60 |
# Updating the expected return
|
61 |
G = self.gamma * G + reward
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
self.Q[state, action]
|
66 |
-
# Updating the
|
67 |
-
|
68 |
-
self.Pi[state] = np.
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
|
|
|
|
|
|
|
|
73 |
|
74 |
def train(
|
75 |
self,
|
76 |
n_train_episodes=2000,
|
77 |
test_every=100,
|
78 |
-
update_type="first_visit",
|
79 |
log_wandb=False,
|
80 |
save_best=True,
|
81 |
save_best_dir=None,
|
@@ -83,7 +129,6 @@ class MCAgent(AgentBase):
|
|
83 |
**kwargs,
|
84 |
):
|
85 |
print(f"Training agent for {n_train_episodes} episodes...")
|
86 |
-
self.run_name = f"{self.run_name}_{update_type}"
|
87 |
|
88 |
(
|
89 |
train_running_success_rate,
|
@@ -99,7 +144,7 @@ class MCAgent(AgentBase):
|
|
99 |
"avg_ep_len": avg_ep_len,
|
100 |
}
|
101 |
|
102 |
-
update_func = getattr(self, f"update_{update_type}")
|
103 |
|
104 |
tqrange = tqdm(range(n_train_episodes))
|
105 |
tqrange.set_description("Training")
|
@@ -108,7 +153,8 @@ class MCAgent(AgentBase):
|
|
108 |
self.wandb_log_img(episode=None)
|
109 |
|
110 |
for e in tqrange:
|
111 |
-
|
|
|
112 |
rewards = [x[2] for x in episode_hist]
|
113 |
total_reward, avg_reward = sum(rewards), np.mean(rewards)
|
114 |
|
@@ -129,8 +175,9 @@ class MCAgent(AgentBase):
|
|
129 |
}
|
130 |
tqrange.set_postfix(stats)
|
131 |
|
132 |
-
# Test the agent every test_every episodes
|
133 |
-
if e % test_every == 0:
|
|
|
134 |
test_success_rate = self.test(verbose=False, **kwargs)
|
135 |
if log_wandb:
|
136 |
self.wandb_log_img(episode=e)
|
|
|
5 |
|
6 |
|
7 |
class MCAgent(AgentBase):
|
8 |
+
def __init__(
|
9 |
+
self, /, update_type="on-policy", **kwargs # "on-policy" or "off-policy
|
10 |
+
):
|
11 |
super().__init__(run_name=self.__class__.__name__, **kwargs)
|
12 |
+
self.update_type = update_type
|
13 |
+
self.run_name = f"{self.run_name}_{self.update_type}"
|
14 |
self.initialize()
|
15 |
|
16 |
def initialize(self):
|
17 |
print("Resetting all state variables...")
|
18 |
# The Q-Table holds the current expected return for each state-action pair
|
19 |
+
self.Q = np.random.rand(self.n_states, self.n_actions)
|
20 |
+
# self.Q = np.zeros((self.n_states, self.n_actions))
|
21 |
+
|
22 |
+
if self.update_type.startswith("on_policy"):
|
23 |
+
# For On-Policy update type:
|
24 |
+
# R keeps track of all the returns that have been observed for each state-action pair to update Q
|
25 |
+
self.R = [[[] for _ in range(self.n_actions)] for _ in range(self.n_states)]
|
26 |
+
# An arbitrary e-greedy policy:
|
27 |
+
self.Pi = self.create_soft_policy()
|
28 |
+
elif self.update_type.startswith("off_policy"):
|
29 |
+
# For Off-Policy update type:
|
30 |
+
self.C = np.zeros((self.n_states, self.n_actions))
|
31 |
+
# Target policy is greedy with respect to the current Q
|
32 |
+
self.Pi = np.zeros((self.n_states, self.n_actions))
|
33 |
+
self.Pi[np.arange(self.n_states), np.argmax(self.Q, axis=1)] = 1.0
|
34 |
+
# Behavior policy is e-greedy with respect to the current Q
|
35 |
+
self.Pi_behaviour = self.create_soft_policy(random=False)
|
36 |
+
else:
|
37 |
+
raise ValueError(
|
38 |
+
f"update_type must be either 'on_policy' or 'off_policy', but got {self.update_type}"
|
39 |
+
)
|
40 |
+
print("=" * 80)
|
41 |
+
print("Initial policy:")
|
42 |
+
print(self.Pi)
|
43 |
+
print("=" * 80)
|
44 |
+
|
45 |
+
def create_soft_policy(self, random=True):
|
46 |
# An arbitrary e-greedy policy:
|
47 |
# With probability epsilon, sample an action uniformly at random
|
48 |
+
Pi = np.full((self.n_states, self.n_actions), self.epsilon / self.n_actions)
|
|
|
|
|
49 |
# For the initial policy, we randomly select a greedy action for each state
|
50 |
+
Pi[
|
51 |
np.arange(self.n_states),
|
52 |
+
np.random.randint(self.n_actions, size=self.n_states)
|
53 |
+
if random
|
54 |
+
else np.argmax(self.Q, axis=1),
|
55 |
] = (
|
56 |
+
1.0 - self.epsilon + self.epsilon / self.n_actions
|
57 |
)
|
58 |
+
return Pi
|
|
|
|
|
|
|
59 |
|
60 |
+
def update_on_policy(self, episode_hist):
|
61 |
+
G = 0.0
|
62 |
# For each step of the episode, in reverse order
|
63 |
for t in range(len(episode_hist) - 1, -1, -1):
|
64 |
state, action, reward = episode_hist[t]
|
|
|
77 |
1 - self.epsilon + self.epsilon / self.n_actions
|
78 |
)
|
79 |
|
80 |
+
# def update_every_visit(self, episode_hist):
|
81 |
+
# G = 0
|
82 |
+
# # Backward pass through the trajectory
|
83 |
+
# for t in range(len(episode_hist) - 1, -1, -1):
|
84 |
+
# state, action, reward = episode_hist[t]
|
85 |
+
# # Updating the expected return
|
86 |
+
# G = self.gamma * G + reward
|
87 |
+
# # Every-visit MC method:
|
88 |
+
# # Updating the expected return and policy for every visit to this state-action pair
|
89 |
+
# self.R[state][action].append(G)
|
90 |
+
# self.Q[state, action] = np.mean(self.R[state][action])
|
91 |
+
# # Updating the epsilon-greedy policy.
|
92 |
+
# # With probability epsilon, sample an action uniformly at random
|
93 |
+
# self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
|
94 |
+
# # The greedy action receives the remaining probability mass
|
95 |
+
# self.Pi[state, np.argmax(self.Q[state])] = (
|
96 |
+
# 1 - self.epsilon + self.epsilon / self.n_actions
|
97 |
+
# )
|
98 |
+
|
99 |
+
def update_off_policy(self, episode_hist):
|
100 |
+
G, W = 0.0, 1.0
|
101 |
for t in range(len(episode_hist) - 1, -1, -1):
|
102 |
state, action, reward = episode_hist[t]
|
103 |
# Updating the expected return
|
104 |
G = self.gamma * G + reward
|
105 |
+
self.C[state, action] = self.C[state, action] + W
|
106 |
+
self.Q[state, action] = self.Q[state, action] + (
|
107 |
+
W / self.C[state, action]
|
108 |
+
) * (G - self.Q[state, action])
|
109 |
+
# Updating the target policy to be greedy with respect to the current Q
|
110 |
+
greedy_action = np.argmax(self.Q[state])
|
111 |
+
self.Pi[state] = np.zeros(self.n_actions)
|
112 |
+
self.Pi[state, greedy_action] = 1.0
|
113 |
+
# if At != At*, then break
|
114 |
+
if action != greedy_action:
|
115 |
+
break
|
116 |
+
W = W * (1.0 / self.Pi_behaviour[state, action])
|
117 |
+
|
118 |
+
# Update the behavior policy such that it has coverage of the target policy
|
119 |
+
self.Pi_behaviour = self.create_soft_policy(random=False)
|
120 |
|
121 |
def train(
|
122 |
self,
|
123 |
n_train_episodes=2000,
|
124 |
test_every=100,
|
|
|
125 |
log_wandb=False,
|
126 |
save_best=True,
|
127 |
save_best_dir=None,
|
|
|
129 |
**kwargs,
|
130 |
):
|
131 |
print(f"Training agent for {n_train_episodes} episodes...")
|
|
|
132 |
|
133 |
(
|
134 |
train_running_success_rate,
|
|
|
144 |
"avg_ep_len": avg_ep_len,
|
145 |
}
|
146 |
|
147 |
+
update_func = getattr(self, f"update_{self.update_type}")
|
148 |
|
149 |
tqrange = tqdm(range(n_train_episodes))
|
150 |
tqrange.set_description("Training")
|
|
|
153 |
self.wandb_log_img(episode=None)
|
154 |
|
155 |
for e in tqrange:
|
156 |
+
policy = self.Pi_behaviour if self.update_type == "off_policy" else self.Pi
|
157 |
+
episode_hist, solved, _ = self.run_episode(policy=policy, **kwargs)
|
158 |
rewards = [x[2] for x in episode_hist]
|
159 |
total_reward, avg_reward = sum(rewards), np.mean(rewards)
|
160 |
|
|
|
175 |
}
|
176 |
tqrange.set_postfix(stats)
|
177 |
|
178 |
+
# Test the agent every test_every episodes
|
179 |
+
if test_every > 0 and e % test_every == 0:
|
180 |
+
# For off policy, self.Pi is the target policy. For on policy, self.Pi is the soft policy
|
181 |
test_success_rate = self.test(verbose=False, **kwargs)
|
182 |
if log_wandb:
|
183 |
self.wandb_log_img(episode=e)
|
run.py
CHANGED
@@ -68,9 +68,9 @@ def main():
|
|
68 |
parser.add_argument(
|
69 |
"--update_type",
|
70 |
type=str,
|
71 |
-
choices=["
|
72 |
-
default="
|
73 |
-
help="The type of update to use. Only supported by Monte-Carlo agent. (default:
|
74 |
)
|
75 |
|
76 |
### Environment parameters
|
@@ -159,7 +159,6 @@ def main():
|
|
159 |
test_every=args.test_every,
|
160 |
n_test_episodes=args.n_test_episodes,
|
161 |
max_steps=args.max_steps,
|
162 |
-
update_type=args.update_type,
|
163 |
log_wandb=args.wandb_project is not None,
|
164 |
save_best=True,
|
165 |
save_best_dir=args.save_dir,
|
|
|
68 |
parser.add_argument(
|
69 |
"--update_type",
|
70 |
type=str,
|
71 |
+
choices=["on_policy", "off_policy"],
|
72 |
+
default="off_policy",
|
73 |
+
help="The type of update to use. Only supported by Monte-Carlo agent. (default: off_policy)",
|
74 |
)
|
75 |
|
76 |
### Environment parameters
|
|
|
159 |
test_every=args.test_every,
|
160 |
n_test_episodes=args.n_test_episodes,
|
161 |
max_steps=args.max_steps,
|
|
|
162 |
log_wandb=args.wandb_project is not None,
|
163 |
save_best=True,
|
164 |
save_best_dir=args.save_dir,
|