Commit 46b0409 by Andrei Cozma (parent: ebd7110): Updates
Files changed:
- Shared.py → AgentBase.py  +1 -1
- DPAgent.py  +12 -6
- MCAgent.py  +2 -3
- README.md  +44 -36
- demo.py  +1 -1
- run.py  +32 -36
- run_tests_MC_CliffWalking-v0.py  +0 -34
- run_tests_MC_FrozenLake-v1.py  +0 -34
- run_tests_MC_Taxi-v3.py  +0 -34
- test_params.py  +76 -0
- agents.py → utils.py  +19 -8
Shared.py → AgentBase.py
RENAMED
@@ -4,7 +4,7 @@ import gymnasium as gym
 from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 
 
-class Shared:
+class AgentBase:
     def __init__(
         self,
         /,
DPAgent.py
CHANGED
@@ -3,11 +3,11 @@ import numpy as np
 from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 from matplotlib import pyplot as plt
 from tqdm import trange
-from Shared import Shared
+from AgentBase import AgentBase
 import warnings
 
 
-class DPAgent(Shared):
+class DPAgent(AgentBase):
     def __init__(self, /, **kwargs):
         super().__init__(run_name=self.__class__.__name__, **kwargs)
         self.theta = kwargs.get("theta", 1e-10)
@@ -36,7 +36,10 @@ class DPAgent(Shared):
                 for probability, next_state, reward, done in self.env.P[state][
                     action
                 ]:
-                    if …
+                    if (
+                        self.env_name == "CliffWalking-v0"
+                        and state == self.env.observation_space.n - 1
+                    ):
                         reward = 1
                     expected_value += probability * (
                         reward + self.gamma * self.V[next_state]
@@ -53,14 +56,17 @@ class DPAgent(Shared):
             # if i % 5 == 0 and i != 0:
             #     self.test(verbose=False)
             print(f"Iteration {i}: delta={delta}")
-
+
         self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
         for s in range(self.env.observation_space.n):
             for a in range(self.env.action_space.n):
                 expected_value = 0
                 for probability, next_state, reward, done in self.env.P[s][a]:
-                    if …
-                        …
+                    if (
+                        self.env_name == "CliffWalking-v0"
+                        and state == self.env.observation_space.n - 1
+                    ):
+                        reward = 1
                     expected_value += probability * (
                         reward + self.gamma * self.V[next_state]
                     )
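The change above adds a CliffWalking-v0 reward override inside the one-step lookahead that the dynamic-programming agent performs over the environment's transition model `self.env.P`. A minimal, self-contained sketch of that lookahead follows; it is not the repository's DPAgent (which wraps this in its own class), and `gamma` and the value table `V` are placeholders here:

```python
import gymnasium as gym
import numpy as np

env = gym.make("CliffWalking-v0")
env_name = "CliffWalking-v0"
gamma = 0.99
V = np.zeros(env.observation_space.n)  # current state-value estimates


def one_step_lookahead(state, action):
    """Expected value of taking `action` in `state` under the current V."""
    expected_value = 0.0
    # Toy-text envs expose the transition model as env.P[s][a]:
    # a list of (probability, next_state, reward, done) tuples.
    for probability, next_state, reward, done in env.unwrapped.P[state][action]:
        # Mirror of the override added in this commit: in CliffWalking-v0 the
        # state with index observation_space.n - 1 is treated as yielding
        # reward 1 instead of the environment's default reward.
        if env_name == "CliffWalking-v0" and state == env.observation_space.n - 1:
            reward = 1
        expected_value += probability * (reward + gamma * V[next_state])
    return expected_value
```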
MCAgent.py
CHANGED
@@ -1,11 +1,10 @@
 import numpy as np
 from tqdm import tqdm
-from Shared import Shared
 import wandb
-from …
+from AgentBase import AgentBase
 
 
-class MCAgent(Shared):
+class MCAgent(AgentBase):
     def __init__(self, /, **kwargs):
         super().__init__(run_name=self.__class__.__name__, **kwargs)
         self.reset()
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title: …
+title: CS581 Final Project Demo - Dynamic Programming & Monte-Carlo RL Methods
 emoji: 🧠
 colorFrom: yellow
 colorTo: orange
@@ -9,7 +9,7 @@ fullWidth: true
 pinned: true
 ---
 
-# CS581 Project - …
+# CS581 Final Project - Dynamic Programming & Monte-Carlo RL Methods
 
 [Google Slides](https://docs.google.com/presentation/d/1v4WwBQKoPnGiyCMXgUs-pCCJ8IwZqM3thUf-Ky00eTQ/edit?usp=sharing)
 
@@ -48,45 +48,51 @@ Running on local URL: http://127.0.0.1:7860
 
 TODO
 
-**DP Usage:**
-
-```bash
-TODO
-```
-
 ## 2.2. Monte-Carlo Agent
 
 This is the implementation of an On-Policy Monte-Carlo agent to solve several toy problems from the OpenAI Gymnasium.
 
 The agent starts with a randomly initialized epsilon-greedy policy and uses either the first-visit or every-visit Monte-Carlo update method to learn the optimal policy. Training is performed using a soft (epsilon-greedy) policy and testing uses the resulting greedy policy.
 
-…
+### Parameter testing results
 
+**CliffWalking-v0**
 
+<table>
+<tr>
+<td><img src="./plots/MC/MCAgent_CliffWalking-v0_gammas.png"/></td>
+<td><img src="./plots/MC/MCAgent_CliffWalking-v0_epsilons.png"/></td>
+</tr>
+</table>
+
+**FrozenLake-v1**
+<table>
+<tr>
+<td><img src="./plots/MC/MCAgent_FrozenLake-v1_gammas.png"/></td>
+<td><img src="./plots/MC/MCAgent_FrozenLake-v1_epsilons.png"/></td>
+</tr>
+</table>
+
+**Taxi-v3**
+<table>
+<tr>
+<td><img src="./plots/MC/MCAgent_Taxi-v3_gammas.png"/></td>
+<td><img src="./plots/MC/MCAgent_Taxi-v3_epsilons.png"/></td>
+</tr>
+</table>
+
+# 3. Run Script Usage
 
 ```bash
 # Training: Policy will be saved as a `.npy` file.
-python3 run.py --agent "MCAgent" --train
+❯ python3 run.py --agent "MCAgent" --train
 
 # Testing: Use the `--test` flag with the path to the policy file.
-python3 run.py --agent "MCAgent" --test "policies/…
-```
+❯ python3 run.py --agent "MCAgent" --test "./policies/[saved_policy_file].npy" --render_mode human
 
-…
-usage: MonteCarloAgent.py [-h] [--train] [--test TEST] [--n_train_episodes N_TRAIN_EPISODES] [--n_test_episodes N_TEST_EPISODES] [--test_every TEST_EVERY] [--max_steps MAX_STEPS] [--update_type {first_visit,every_visit}] [--save_dir SAVE_DIR] [--no_save]
-    [--gamma GAMMA] [--epsilon EPSILON] [--env {CliffWalking-v0,FrozenLake-v1,Taxi-v3}] [--render_mode RENDER_MODE] [--wandb_project WANDB_PROJECT] [--wandb_group WANDB_GROUP] [--wandb_job_type WANDB_JOB_TYPE]
-    [--wandb_run_name_suffix WANDB_RUN_NAME_SUFFIX]
+❯ python3 run.py --help
+usage: run.py [-h] [--train] [--test TEST] [--n_train_episodes N_TRAIN_EPISODES] [--n_test_episodes N_TEST_EPISODES] [--test_every TEST_EVERY] [--max_steps MAX_STEPS] --agent {MCAgent,DPAgent} [--gamma GAMMA] [--epsilon EPSILON] [--update_type {first_visit,every_visit}]
+              [--env {CliffWalking-v0,FrozenLake-v1,Taxi-v3}] [--seed SEED] [--size SIZE] [--render_mode RENDER_MODE] [--save_dir SAVE_DIR] [--no_save] [--run_name_suffix RUN_NAME_SUFFIX] [--wandb_project WANDB_PROJECT] [--wandb_job_type WANDB_JOB_TYPE]
 
 options:
   -h, --help show this help message and exit
@@ -100,22 +106,24 @@ options:
             During training, test the agent every n episodes. (default: 100)
   --max_steps MAX_STEPS
             The maximum number of steps per episode before the episode is forced to end. (default: 200)
-  --…
-  The…
-  --…
-  --no_save Use this flag to disable saving the policy.
-  --gamma GAMMA The value for the discount factor to use. (default: 1.0)
+  --agent {MCAgent,DPAgent}
+            The agent to use. Currently supports one of: ['MCAgent', 'DPAgent']
+  --gamma GAMMA The value for the discount factor to use. (default: 0.99)
   --epsilon EPSILON The value for the epsilon-greedy policy to use. (default: 0.4)
+  --update_type {first_visit,every_visit}
+            The type of update to use. Only supported by Monte-Carlo agent. (default: first_visit)
   --env {CliffWalking-v0,FrozenLake-v1,Taxi-v3}
             The Gymnasium environment to use. (default: CliffWalking-v0)
+  --seed SEED The seed to use when generating the FrozenLake environment. If not provided, a random seed is used. (default: None)
+  --size SIZE The size to use when generating the FrozenLake environment. (default: 8)
   --render_mode RENDER_MODE
             Render mode passed to the gym.make() function. Use 'human' to render the environment. (default: None)
+  --save_dir SAVE_DIR The directory to save the policy to. (default: policies)
+  --no_save Use this flag to disable saving the policy.
+  --run_name_suffix RUN_NAME_SUFFIX
+            Run name suffix for logging and policy checkpointing. (default: None)
   --wandb_project WANDB_PROJECT
             WandB project name for logging. If not provided, no logging is done. (default: None)
-  --wandb_group WANDB_GROUP
-            WandB group name for logging. (default: monte-carlo)
   --wandb_job_type WANDB_JOB_TYPE
             WandB job type for logging. (default: train)
-  --wandb_run_name_suffix WANDB_RUN_NAME_SUFFIX
-            WandB run name suffix for logging. (default: None)
 ```
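The README text above describes the Monte-Carlo agent in words: an epsilon-greedy behaviour policy during training, first-visit or every-visit return averaging, and a greedy policy at test time. The sketch below illustrates the first-visit variant it refers to; it is not the repository's MCAgent implementation, and every name in it is illustrative:

```python
import numpy as np


def epsilon_greedy_action(Q, state, epsilon):
    # Soft behaviour policy used during training; testing would use the greedy argmax.
    if np.random.random() < epsilon:
        return int(np.random.randint(Q.shape[1]))
    return int(np.argmax(Q[state]))


def first_visit_mc_update(episode, Q, returns_count, gamma):
    """Update Q from one finished episode given as (state, action, reward) tuples."""
    G = 0.0
    for t in reversed(range(len(episode))):
        state, action, reward = episode[t]
        G = gamma * G + reward  # discounted return from step t onwards
        # First-visit: only update on the earliest occurrence of (state, action).
        if all((s, a) != (state, action) for s, a, _ in episode[:t]):
            returns_count[state, action] += 1
            # Incremental average of the returns observed for this pair.
            Q[state, action] += (G - Q[state, action]) / returns_count[state, action]
    return Q
```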
demo.py
CHANGED
@@ -6,7 +6,7 @@ import gradio as gr
 import scipy.ndimage
 import cv2
 
-from agents import load_agent
+from utils import load_agent
 
 default_n_test_episodes = 10
 default_max_steps = 500
run.py
CHANGED
@@ -1,7 +1,7 @@
 import argparse
 import wandb
 
-from agents import AGENTS_MAP, load_agent
+from utils import AGENTS_MAP, load_agent
 
 
 def main():
@@ -36,7 +36,6 @@ def main():
         default=100,
         help="During training, test the agent every n episodes. (default: 100)",
     )
-
     parser.add_argument(
         "--max_steps",
         type=int,
@@ -44,41 +43,20 @@ def main():
         help="The maximum number of steps per episode before the episode is forced to end. (default: 200)",
     )
 
-    parser.add_argument(
-        "--update_type",
-        type=str,
-        choices=["first_visit", "every_visit"],
-        default="first_visit",
-        help="The type of update to use. Only supported by Monte-Carlo agent. (default: first_visit)",
-    )
-
-    parser.add_argument(
-        "--save_dir",
-        type=str,
-        default="policies",
-        help="The directory to save the policy to. (default: policies)",
-    )
-
-    parser.add_argument(
-        "--no_save",
-        action="store_true",
-        help="Use this flag to disable saving the policy.",
-    )
-
     ### Agent parameters
     parser.add_argument(
         "--agent",
         type=str,
         required=True,
         choices=AGENTS_MAP.keys(),
-        help=f"The agent to use. …
+        help=f"The agent to use. Currently supports one of: {list(AGENTS_MAP.keys())}",
     )
 
     parser.add_argument(
         "--gamma",
         type=float,
         default=0.99,
-        help="The value for the discount factor to use. (default: …
+        help="The value for the discount factor to use. (default: 0.99)",
    )
     parser.add_argument(
         "--epsilon",
@@ -87,6 +65,14 @@ def main():
         help="The value for the epsilon-greedy policy to use. (default: 0.4)",
     )
 
+    parser.add_argument(
+        "--update_type",
+        type=str,
+        choices=["first_visit", "every_visit"],
+        default="first_visit",
+        help="The type of update to use. Only supported by Monte-Carlo agent. (default: first_visit)",
+    )
+
     ### Environment parameters
     parser.add_argument(
         "--env",
@@ -95,27 +81,43 @@ def main():
         choices=["CliffWalking-v0", "FrozenLake-v1", "Taxi-v3"],
         help="The Gymnasium environment to use. (default: CliffWalking-v0)",
     )
-
     parser.add_argument(
         "--seed",
         type=int,
         default=None,
         help="The seed to use when generating the FrozenLake environment. If not provided, a random seed is used. (default: None)",
     )
-
     parser.add_argument(
         "--size",
         type=int,
         default=8,
         help="The size to use when generating the FrozenLake environment. (default: 8)",
     )
-
     parser.add_argument(
         "--render_mode",
         type=str,
         default=None,
         help="Render mode passed to the gym.make() function. Use 'human' to render the environment. (default: None)",
     )
+
+    # Logging and saving parameters
+    parser.add_argument(
+        "--save_dir",
+        type=str,
+        default="policies",
+        help="The directory to save the policy to. (default: policies)",
+    )
+    parser.add_argument(
+        "--no_save",
+        action="store_true",
+        help="Use this flag to disable saving the policy.",
+    )
+    parser.add_argument(
+        "--run_name_suffix",
+        type=str,
+        default=None,
+        help="Run name suffix for logging and policy checkpointing. (default: None)",
+    )
     parser.add_argument(
         "--wandb_project",
         type=str,
@@ -128,12 +130,6 @@ def main():
         default="train",
         help="WandB job type for logging. (default: train)",
     )
-    parser.add_argument(
-        "--wandb_run_name_suffix",
-        type=str,
-        default=None,
-        help="WandB run name suffix for logging. (default: None)",
-    )
 
     args = parser.parse_args()
     print(vars(args))
@@ -143,8 +139,8 @@ def main():
     )
 
     agent.run_name += f"_e{args.n_train_episodes}_s{args.max_steps}"
-    if args.…
-        agent.run_name += f"+{args.…
+    if args.run_name_suffix is not None:
+        agent.run_name += f"+{args.run_name_suffix}"
 
     try:
         if args.train:
run_tests_MC_CliffWalking-v0.py
DELETED
@@ -1,34 +0,0 @@
-import os
-import multiprocessing
-import random
-
-wandb_project = "cs581"
-
-env = "CliffWalking-v0"
-n_train_episodes = 2500
-max_steps = 200
-
-num_tests = 10
-
-vals_update_type = [
-    "first_visit"
-]  # Every visit takes too long due to this environment's reward structure
-vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
-vals_gamma = [1.0, 0.98, 0.96, 0.94]
-
-
-def run_test(args):
-    os.system(
-        f"python3 run.py --agent MCAgent --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
-    )
-
-
-with multiprocessing.Pool(8) as p:
-    tests = []
-    for update_type in vals_update_type:
-        for gamma in vals_gamma:
-            for eps in vals_epsilon:
-                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
-    random.shuffle(tests)
-
-    p.map(run_test, tests)
run_tests_MC_FrozenLake-v1.py
DELETED
@@ -1,34 +0,0 @@
-import os
-import multiprocessing
-import random
-
-wandb_project = "cs581"
-
-env = "FrozenLake-v1"
-n_train_episodes = 5000
-max_steps = 200
-
-num_tests = 10
-
-vals_update_type = [
-    "first_visit"
-]  # Every visit takes too long due to this environment's reward structure
-vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
-vals_gamma = [1.0, 0.98, 0.96, 0.94]
-
-
-def run_test(args):
-    os.system(
-        f"python3 run.py --agent MCAgent --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
-    )
-
-
-with multiprocessing.Pool(8) as p:
-    tests = []
-    for update_type in vals_update_type:
-        for gamma in vals_gamma:
-            for eps in vals_epsilon:
-                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
-    random.shuffle(tests)
-
-    p.map(run_test, tests)
run_tests_MC_Taxi-v3.py
DELETED
@@ -1,34 +0,0 @@
-import os
-import multiprocessing
-import random
-
-wandb_project = "cs581"
-
-env = "Taxi-v3"
-n_train_episodes = 10000
-max_steps = 500
-
-num_tests = 10
-
-vals_update_type = [
-    "first_visit"
-]  # Every visit takes too long due to this environment's reward structure
-vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
-vals_gamma = [1.0, 0.98, 0.96, 0.94]
-
-
-def run_test(args):
-    os.system(
-        f"python3 run.py --agent MCAgent --train --n_train_episodes {n_train_episodes} --max_steps {max_steps} --env {env} --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]} --wandb_project {wandb_project} --wandb_run_name_suffix {args[3]} --no_save"
-    )
-
-
-with multiprocessing.Pool(8) as p:
-    tests = []
-    for update_type in vals_update_type:
-        for gamma in vals_gamma:
-            for eps in vals_epsilon:
-                tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
-    random.shuffle(tests)
-
-    p.map(run_test, tests)
test_params.py
ADDED
@@ -0,0 +1,76 @@
+import argparse
+import os
+import multiprocessing
+import random
+
+
+def run(args):
+    env, num_tests, wandb_project = args.env, args.num_tests, args.wandb_project
+    agent = "MCAgent"
+
+    vals_update_type = [
+        "first_visit"
+    ]  # Note: Every visit takes too long due to these environment's reward structure
+    vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
+    vals_gamma = [1.0, 0.98, 0.96, 0.94]
+
+    if env == "Taxi-v3":
+        n_train_episodes = 10000
+        max_steps = 500
+    elif env == "FrozenLake-v1":
+        n_train_episodes = 5000
+        max_steps = 200
+    elif env == "CliffWalking-v0":
+        n_train_episodes = 2500
+        max_steps = 200
+    else:
+        raise ValueError(f"Unsupported environment: {env}")
+
+
+    def run_test(args):
+        command = f"python3 run.py --train --agent {agent} --env {env}"
+        command += f" --n_train_episodes {n_train_episodes} --max_steps {max_steps}"
+        command += f" --gamma {args[0]} --epsilon {args[1]} --update_type {args[2]}"
+        command += f" --run_name_suffix {args[3]}"
+        if wandb_project is not None:
+            command += f" --wandb_project {wandb_project}"
+        command += " --no_save"
+        os.system(command)
+
+    with multiprocessing.Pool(8) as p:
+        tests = []
+        for update_type in vals_update_type:
+            for gamma in vals_gamma:
+                for eps in vals_epsilon:
+                    tests.extend((gamma, eps, update_type, i) for i in range(num_tests))
+        random.shuffle(tests)
+
+        p.map(run_test, tests)
+
+
+def main():
+    # argument parsing
+    parser = argparse.ArgumentParser(description="Run parameter tests for MC agent")
+    parser.add_argument(
+        "--env",
+        type=str,
+        default="Taxi-v3",
+        help="environment to run",
+    )
+    parser.add_argument(
+        "--num_tests",
+        type=int,
+        default=10,
+        help="number of tests to run for each parameter combination",
+    )
+    parser.add_argument(
+        "--wandb_project",
+        type=str,
+        default=None,
+        help="wandb project name to log to",
+    )
+
+    args = parser.parse_args()
+
+    run(args)
+
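test_params.py consolidates the three deleted per-environment scripts into one parameterized sweep. For orientation, the grid it enumerates is 4 gammas by 5 epsilons by 1 update type, repeated num_tests times, i.e. 200 runs per environment with the defaults. A quick check of that count, using only the values copied from the file (nothing else from the repo is assumed):

```python
from itertools import product

vals_update_type = ["first_visit"]
vals_epsilon = [0.1, 0.2, 0.3, 0.4, 0.5]
vals_gamma = [1.0, 0.98, 0.96, 0.94]
num_tests = 10  # default of --num_tests

# Same expansion as the nested loops in test_params.py, without launching any runs.
tests = [
    (gamma, eps, update_type, i)
    for update_type, gamma, eps in product(vals_update_type, vals_gamma, vals_epsilon)
    for i in range(num_tests)
]
print(len(tests))  # 200
```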
agents.py → utils.py
RENAMED
@@ -8,36 +8,47 @@ AGENTS_MAP = {"MCAgent": MCAgent, "DPAgent": DPAgent}
 
 
 def load_agent(agent_key, **kwargs):
+    """
+    Loads an agent from a file or from the AGENTS_MAP.
+    :param agent_key: Which agent to load. Can be a key in AGENTS_MAP or a path to a policy file ending with ".npy".
+    If a policy file is provided, the agent name, environment name, and other parameters will be parsed from the file name.
+    :param kwargs: Additional arguments to pass to the agent constructor. If loading from a policy file, any conflicting arguments will be overwritten.
+    """
     agent_policy_file = agent_key if agent_key.endswith(".npy") else None
+    # if loading from a policy file, parse the agent key, environment key, and other parameters from the file name
    if agent_policy_file is not None:
         props = os.path.basename(agent_key).split("_")
         try:
+            # Parsing arguments from file name
             agent_key, env_key = props[0], props[1]
-
+            parsed_args = {}
             for prop in props[2:]:
                 props_split = prop.split(":")
                 if len(props_split) == 2:
-                    …
+                    parsed_args[props_split[0]] = props_split[1]
                 else:
                     warnings.warn(
                         f"Skipping property {prop} as it does not have the format 'key:value'.",
                         UserWarning,
                     )
-
-
-            kwargs…
+            # Overwrite any conflicting arguments with those from the file name
+            parsed_args["env"] = env_key
+            kwargs |= parsed_args
             print("agent_args:", kwargs)
-        except IndexError:
+        except IndexError as e:
             raise ValueError(
-                …
-            )
+                "ERROR: Could not parse agent properties. Must be of the format 'AgentName_EnvName_key:value_key:value...'."
+            ) from e
 
+    # Check if agent key is valid
     if agent_key not in AGENTS_MAP:
         raise ValueError(
             f"ERROR: Agent '{agent_key}' not valid. Must be one of: {AGENTS_MAP.keys()}"
         )
 
+    # Load agent based on key and arguments
     agent = AGENTS_MAP[agent_key](**kwargs)
+    # If loading from a policy file, load the policy into the agent
     if agent_policy_file is not None:
         agent.load_policy(agent_policy_file)
 
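The docstring added to load_agent documents the checkpoint naming convention 'AgentName_EnvName_key:value_key:value...'. A small illustration of that parsing, using a hypothetical filename that is not from the repository (the extension is stripped first here for clarity; the repo's code works on the raw basename):

```python
import os

# Hypothetical checkpoint name following the documented convention.
policy_path = "policies/MCAgent_CliffWalking-v0_gamma:0.99_epsilon:0.4.npy"

props = os.path.splitext(os.path.basename(policy_path))[0].split("_")
agent_key, env_key = props[0], props[1]  # "MCAgent", "CliffWalking-v0"

parsed_args = {}
for prop in props[2:]:
    props_split = prop.split(":")
    if len(props_split) == 2:
        parsed_args[props_split[0]] = props_split[1]
parsed_args["env"] = env_key

print(agent_key, parsed_args)
# MCAgent {'gamma': '0.99', 'epsilon': '0.4', 'env': 'CliffWalking-v0'}
```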