Andrei Cozma committed
Commit 4a6d8ec · 1 Parent(s): 6a48762
Updates

- AgentBase.py +0 -1
- MCAgent.py +20 -10
AgentBase.py
CHANGED
@@ -93,7 +93,6 @@ class AgentBase:
|
|
93 |
|
94 |
def generate_episode(self, policy, max_steps=500, render=False, **kwargs):
|
95 |
state, _ = self.env.reset()
|
96 |
-
# action = self.choose_action(state, **kwargs)
|
97 |
episode_hist, solved, done = [], False, False
|
98 |
rgb_array = self.env.render() if render else None
|
99 |
|
|
|
93 |
|
94 |
def generate_episode(self, policy, max_steps=500, render=False, **kwargs):
|
95 |
state, _ = self.env.reset()
|
|
|
96 |
episode_hist, solved, done = [], False, False
|
97 |
rgb_array = self.env.render() if render else None
|
98 |
|
MCAgent.py
CHANGED
@@ -16,8 +16,12 @@ class MCAgent(AgentBase):
     def initialize(self):
         print("Resetting all state variables...")
         # The Q-Table holds the current expected return for each state-action pair
-
+        # random uniform initialization
+        self.Q = np.random.uniform(-1, 1, size=(self.n_states, self.n_actions))
+        # other alternatives:
         # self.Q = np.zeros((self.n_states, self.n_actions))
+        # self.Q = np.random.rand(self.n_states, self.n_actions)
+        # self.Q = np.random.normal(0, 1, size=(self.n_states, self.n_actions))
 
         if self.update_type.startswith("on_policy"):
             # For On-Policy update type:
@@ -28,11 +32,11 @@ class MCAgent(AgentBase):
         elif self.update_type.startswith("off_policy"):
             # For Off-Policy update type:
             self.C = np.zeros((self.n_states, self.n_actions))
-            # Target policy is greedy with respect to the current Q
+            # Target policy is greedy with respect to the current Q (ties broken consistently)
             self.Pi = np.zeros((self.n_states, self.n_actions))
             self.Pi[np.arange(self.n_states), np.argmax(self.Q, axis=1)] = 1.0
             # Behavior policy is e-greedy with respect to the current Q
-            self.Pi_behaviour = self.create_soft_policy(
+            self.Pi_behaviour = self.create_soft_policy(coverage_policy=self.Pi)
         else:
             raise ValueError(
                 f"update_type must be either 'on_policy' or 'off_policy', but got {self.update_type}"
@@ -42,16 +46,22 @@ class MCAgent(AgentBase):
         print(self.Pi)
         print("=" * 80)
 
-    def create_soft_policy(self,
-
+    def create_soft_policy(self, coverage_policy=None):
+        """
+        Create a soft policy (epsilon-greedy).
+        If coverage_policy is None, the soft policy is initialized randomly.
+        Otherwise, the soft policy is e-greedy with respect to the coverage policy. (useful for off-policy)
+        """
         # With probability epsilon, sample an action uniformly at random
         Pi = np.full((self.n_states, self.n_actions), self.epsilon / self.n_actions)
-        #
+        # The greedy action receives the remaining probability mass
+        # If coverage_policy is not provided, the greedy action is sampled randomly
+        # Otherwise we give the remaining probability mass according to the coverage policy
         Pi[
             np.arange(self.n_states),
             np.random.randint(self.n_actions, size=self.n_states)
-            if
-            else np.argmax(
+            if coverage_policy is None
+            else np.argmax(coverage_policy, axis=1),
         ] = (
             1.0 - self.epsilon + self.epsilon / self.n_actions
         )
@@ -110,13 +120,13 @@ class MCAgent(AgentBase):
             greedy_action = np.argmax(self.Q[state])
             self.Pi[state] = np.zeros(self.n_actions)
             self.Pi[state, greedy_action] = 1.0
-            #
+            # If the greedy action is not the action taken by the behavior policy, then break
            if action != greedy_action:
                 break
             W = W * (1.0 / self.Pi_behaviour[state, action])
 
         # Update the behavior policy such that it has coverage of the target policy
-        self.Pi_behaviour = self.create_soft_policy(
+        self.Pi_behaviour = self.create_soft_policy(coverage_policy=self.Pi)
 
     def train(
         self,
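For reference, the epsilon-soft construction that create_soft_policy now performs can be sanity-checked in isolation. The sketch below is an illustrative reconstruction, not code from the repository: make_soft_policy, n_states, n_actions, and epsilon are stand-ins for the corresponding MCAgent attributes. It mirrors the vectorized indexing from the hunk above and checks that every row of the resulting policy is a valid probability distribution.

# Standalone sketch of the epsilon-soft policy construction shown in the diff.
# Assumptions (not from the repo): make_soft_policy is a hypothetical free-function
# version of MCAgent.create_soft_policy; n_states, n_actions, epsilon are plain values.
import numpy as np

def make_soft_policy(n_states, n_actions, epsilon, coverage_policy=None):
    # Every action gets at least epsilon / n_actions probability
    Pi = np.full((n_states, n_actions), epsilon / n_actions)
    # Column that receives the remaining mass: random if no coverage policy is given,
    # otherwise the greedy action of the coverage policy
    greedy_cols = (
        np.random.randint(n_actions, size=n_states)
        if coverage_policy is None
        else np.argmax(coverage_policy, axis=1)
    )
    Pi[np.arange(n_states), greedy_cols] = 1.0 - epsilon + epsilon / n_actions
    return Pi

if __name__ == "__main__":
    Pi = make_soft_policy(n_states=5, n_actions=3, epsilon=0.1)
    assert np.allclose(Pi.sum(axis=1), 1.0)  # each row sums to 1
    assert (Pi > 0).all()                    # soft: every action has nonzero probability
    print(Pi)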
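The last hunk shows only fragments of the off-policy update: the greedy target-policy update, the early break, and the importance weight W. For context, here is a minimal sketch of the standard off-policy every-visit Monte Carlo control update with weighted importance sampling (Sutton and Barto, ch. 5) that those fragments belong to. The function name off_policy_mc_update, the gamma parameter, and the (state, action, reward) layout of episode_hist are assumptions; only the variable names Q, C, Pi, and Pi_behaviour mirror the diff.

# Illustrative reconstruction of an off-policy MC control update with weighted
# importance sampling. Names mirror the diff, but this is not the repository's method.
import numpy as np

def off_policy_mc_update(Q, C, Pi, Pi_behaviour, episode_hist, gamma=0.99):
    G, W = 0.0, 1.0
    # Walk the episode backwards; episode_hist holds (state, action, reward) tuples
    for state, action, reward in reversed(episode_hist):
        G = gamma * G + reward
        C[state, action] += W
        Q[state, action] += (W / C[state, action]) * (G - Q[state, action])
        # Target policy is greedy with respect to the updated Q
        greedy_action = np.argmax(Q[state])
        Pi[state] = np.zeros(Q.shape[1])
        Pi[state, greedy_action] = 1.0
        # If the behavior policy's action differs from the greedy one, all earlier
        # importance weights would be zero, so stop early
        if action != greedy_action:
            break
        W = W * (1.0 / Pi_behaviour[state, action])
    return Q, C, Pi

After such an update, the commit re-derives the behavior policy with self.Pi_behaviour = self.create_soft_policy(coverage_policy=self.Pi), so the epsilon-soft behavior policy keeps assigning nonzero probability to the greedy target action, which is the coverage condition the importance weights rely on.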