Updates
- DPAgent.py +19 -20
- MCAgent.py +1 -0
- Shared.py +2 -2
- dp_policy.npy +0 -0
- policies/DPAgent_CliffWalking-v0_e2500_s200_g0.99_e0.4_first_visit.npy +0 -0
- policies/DPAgent_CliffWalking-v0_i219_g0.9.npy +0 -0
- policies/DPAgent_FrozenLake-v1_e2500_s200_g0.99_e0.4_first_visit.npy +0 -0
- policies/DPAgent_FrozenLake-v1_i219_g0.9.npy +0 -0
- policies/{MonteCarloAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy → MCAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy} +0 -0
- policies/{MonteCarloAgent_FrozenLake-v1_e2500_s200_g1.0_e0.2_first_visit.npy → MCAgent_FrozenLake-v1_e2500_s200_g1.0_e0.2_first_visit.npy} +0 -0
- run.py +3 -9
DPAgent.py
CHANGED
@@ -4,20 +4,25 @@ from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 from matplotlib import pyplot as plt
 from tqdm import trange
 from Shared import Shared
+import warnings
 
 
 class DPAgent(Shared):
-    def __init__(self
+    def __init__(self,/,**kwargs):
         super().__init__(**kwargs)
-        self.theta = theta
+        self.theta = kwargs.get('theta', 1e-10)
+        print(self.theta)
         self.V = np.zeros(self.env.observation_space.n)
-        self.Pi =
+        self.Pi = np.zeros(self.env.observation_space.n, self.env.action_space.n)
+        if self.gamma >= 1.0:
+            warnings.warn("DP will never converge with a gamma value =1.0. Try 0.99?", UserWarning)
 
     def policy(self, state):
         return self.Pi[state]
 
-    def train(self, **kwargs):
+    def train(self, *args, **kwargs):
         i = 0
+        print(self.gamma)
         while True:
             delta = 0
             V_prev = np.copy(self.V)
@@ -38,10 +43,10 @@ class DPAgent(Shared):
             if delta < self.theta:
                 break
             i += 1
-
+            # if i % 100 == 0 and i != 0:
+            #     self.test()
             print(f"Iteration {i}: delta={delta}")
             # break
-
         # policy = [self.policy(state, return_value=True)[0] for state in range(self.env.observation_space.n)]
         self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
         for s in range(self.env.observation_space.n):
@@ -51,23 +56,18 @@ class DPAgent(Shared):
                 # if state == self.env.observation_space.n-1: reward = 1
                 expected_value += probability * (reward + self.gamma * self.V[next_state])
                 self.Pi[s,a] = expected_value
-
-        print(
+        idxs = np.argmax(self.Pi, axis=1)
+        print(idxs)
+        self.Pi = np.zeros((self.env.observation_space.n,self.env.action_space.n))
+        self.Pi[np.arange(self.env.observation_space.n),idxs] = 1
+
+        # print(self.Pi)
         # return self.V, self.Pi
 
 
 if __name__ == "__main__":
     # env = gym.make('FrozenLake-v1', render_mode='human')
-    dp = DPAgent("FrozenLake-v1"
-        "SFFFFFFF",
-        "FFFFFFFH",
-        "FFFHFFFF",
-        "FFFFFHFF",
-        "FFFHFFFF",
-        "FHHFFFHF",
-        "FHFFHFHF",
-        "FFFHFFFG",
-    ])
+    dp = DPAgent(env_name="FrozenLake-v1")
     dp.train()
     dp.save_policy('dp_policy.npy')
     env = gym.make('FrozenLake-v1', render_mode='human', is_slippery=False, desc=[
@@ -84,8 +84,7 @@ if __name__ == "__main__":
     state, _ = env.reset()
     done = False
     while not done:
-        action = dp.
-        action = np.argmax(action)
+        action = dp.choose_action(state)
         state, reward, done, _, _ = env.step(action)
         env.render()
 
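For reference, the policy-extraction step added at the end of train() reduces the table of expected action values to a deterministic greedy policy. Below is a minimal standalone sketch of the same NumPy pattern, using a hypothetical 4-state, 2-action table (note that np.zeros expects the shape as a single tuple, as the reassignment inside train() already does):

import numpy as np

# Hypothetical (n_states, n_actions) table of expected values, standing in for self.Pi
# just before the argmax step in DPAgent.train().
q = np.array([[0.1, 0.5],
              [0.7, 0.2],
              [0.3, 0.3],
              [0.0, 0.9]])
n_states, n_actions = q.shape

idxs = np.argmax(q, axis=1)            # greedy action for each state
pi = np.zeros((n_states, n_actions))   # shape passed as a tuple
pi[np.arange(n_states), idxs] = 1      # one-hot row per state
print(pi)
# [[0. 1.]
#  [1. 0.]
#  [1. 0.]
#  [0. 1.]]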
MCAgent.py
CHANGED
@@ -8,6 +8,7 @@ class MCAgent(Shared):
 
     def __init__(
         self,
+        /,
         env_name="CliffWalking-v0",
         gamma=0.99,
         epsilon=0.1,
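The / inserted into this signature (and into Shared.__init__ below) is Python's positional-only parameter marker (PEP 570): parameters before it, here just self, cannot be passed by keyword, while everything after it still can. A minimal sketch of the behaviour with a hypothetical function:

def make_agent(name, /, env_name="CliffWalking-v0", gamma=0.99, epsilon=0.1):
    # 'name' is positional-only; the parameters after '/' may still be given by keyword.
    return name, env_name, gamma, epsilon

print(make_agent("MCAgent", env_name="FrozenLake-v1", gamma=0.9))  # OK
# make_agent(name="MCAgent")  # TypeError: positional-only argument passed as keyword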
Shared.py
CHANGED
@@ -8,7 +8,7 @@ import wandb
 class Shared:
 
     def __init__(
-        self
+        self,/,
         env_name="CliffWalking-v0",
         gamma=0.99,
         epsilon=0.1,
@@ -24,7 +24,7 @@ class Shared:
         self.env_name = env_name
         self.epsilon, self.gamma = epsilon, gamma
 
-        self.env_kwargs = kwargs
+        self.env_kwargs = {k:v for k,v in kwargs.items() if k in ['render_mode']}
         if self.env_name == "FrozenLake-v1":
             # Can use defaults by defining map_name (4x4 or 8x8) or custom map by defining desc
             # self.env_kwargs["map_name"] = "8x8"
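The env_kwargs change stops forwarding every constructor keyword to the environment and instead whitelists the keys the environment constructor should see (only render_mode here). A minimal sketch of the same dict-comprehension filter, with hypothetical values:

kwargs = {"env_name": "FrozenLake-v1", "gamma": 0.99, "epsilon": 0.1, "render_mode": "human"}

# Keep only the keys meant for the environment, as in Shared.__init__.
env_kwargs = {k: v for k, v in kwargs.items() if k in ["render_mode"]}
print(env_kwargs)  # {'render_mode': 'human'}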
dp_policy.npy
ADDED
Binary file (2.18 kB)
policies/DPAgent_CliffWalking-v0_e2500_s200_g0.99_e0.4_first_visit.npy
ADDED
Binary file (1.66 kB)
policies/DPAgent_CliffWalking-v0_i219_g0.9.npy
DELETED
Binary file (512 Bytes)
policies/DPAgent_FrozenLake-v1_e2500_s200_g0.99_e0.4_first_visit.npy
ADDED
Binary file (1.66 kB)
policies/DPAgent_FrozenLake-v1_i219_g0.9.npy
DELETED
Binary file (640 Bytes)
policies/{MonteCarloAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy → MCAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy}
RENAMED
File without changes
policies/{MonteCarloAgent_FrozenLake-v1_e2500_s200_g1.0_e0.2_first_visit.npy → MCAgent_FrozenLake-v1_e2500_s200_g1.0_e0.2_first_visit.npy}
RENAMED
File without changes
run.py
CHANGED
@@ -5,7 +5,6 @@ from agents import AGENTS_MAP
 
 def main():
     parser = argparse.ArgumentParser()
-
     ### Train/Test parameters
     parser.add_argument(
         "--train",
@@ -77,7 +76,7 @@ def main():
     parser.add_argument(
         "--gamma",
         type=float,
-        default=
+        default=0.99,
         help="The value for the discount factor to use. (default: 1.0)",
     )
     parser.add_argument(
@@ -128,13 +127,8 @@ def main():
     )
 
     args = parser.parse_args()
-
-    agent = AGENTS_MAP[args.agent](
-        args.env,
-        gamma=args.gamma,
-        epsilon=args.epsilon,
-        render_mode=args.render_mode,
-    )
+    print(vars(args))
+    agent = AGENTS_MAP[args.agent](**dict(args._get_kwargs()))
 
     run_name = f"{agent.__class__.__name__}_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}_{args.update_type}"
     if args.wandb_run_name_suffix is not None:
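The new agent construction forwards every parsed command-line option to the agent as keyword arguments, which is why the agent constructors above now accept **kwargs. A minimal sketch of the same pattern with a hypothetical parser and agent factory; vars(args) gives the same dictionary as dict(args._get_kwargs()):

import argparse

def make_dp_agent(**kwargs):
    # Hypothetical stand-in for AGENTS_MAP[args.agent]; it must tolerate extra keys
    # such as 'agent' itself, hence the **kwargs.
    return kwargs

AGENTS_MAP = {"DPAgent": make_dp_agent}

parser = argparse.ArgumentParser()
parser.add_argument("--agent", default="DPAgent")
parser.add_argument("--env", default="FrozenLake-v1")
parser.add_argument("--gamma", type=float, default=0.99)
args = parser.parse_args([])

agent = AGENTS_MAP[args.agent](**vars(args))  # same effect as **dict(args._get_kwargs())
print(agent)  # {'agent': 'DPAgent', 'env': 'FrozenLake-v1', 'gamma': 0.99}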