lharri73 committed
Commit e0c3c75 · 1 Parent(s): 361fc9e
Files changed (5)
  1. DPAgent.py +49 -27
  2. creategif.py +21 -14
  3. qtable_policy.gif +0 -0
  4. scripts/dp_exp.py +52 -0
  5. scripts/speedtest.py +83 -0
DPAgent.py CHANGED
@@ -1,10 +1,13 @@
+ import warnings
+
  import gymnasium as gym
  import numpy as np
  from gymnasium.envs.toy_text.frozen_lake import generate_random_map
  from matplotlib import pyplot as plt
+ from PIL import Image
  from tqdm import trange
+
  from AgentBase import AgentBase
- import warnings


  class DPAgent(AgentBase):
@@ -23,6 +26,7 @@ class DPAgent(AgentBase):
          return self.Pi[state]

      def train(self, *args, **kwargs):
+         success_rate = []
          i = 0
          print(self.gamma)
          while True:
@@ -50,13 +54,35 @@ class DPAgent(AgentBase):
                  # update the state-value function
                  self.V[state] = value
                  delta = max(delta, abs(V_prev[state] - self.V[state]))
-             if delta < self.theta:
+             self.make_pi()
+             suc = self.test(verbose=False, greedy=True)
+             success_rate.append(suc)
+             if delta < self.theta and self.theta < 1:
+                 print(f"breaking at {delta}, {self.theta}")
+                 break
+             elif i > self.theta and self.theta > 1:
+                 print(f"breaking at {i}, {self.theta}")
                  break
              i += 1
-             # if i % 5 == 0 and i != 0:
-             #     self.test(verbose=False)
              print(f"Iteration {i}: delta={delta}")

+         # self.write_v(0)
+         return success_rate
+
+     def write_v(self, i):
+         v_cop = np.copy(self.V).reshape((12, 4))
+         print(v_cop)
+         v_cop -= np.min(v_cop)
+         v_cop /= np.max(v_cop)
+         print(np.min(v_cop), np.max(v_cop))
+         img = Image.fromarray(np.uint8(v_cop * 255), "L")
+         img = img.resize(
+             (v_cop.shape[0] * 100, v_cop.shape[1] * 100),
+             resample=Image.Resampling.NEAREST,
+         )
+         img.save(f"imgs/{i}.png")
+
+     def make_pi(self):
          self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
          for s in range(self.env.observation_space.n):
              for a in range(self.env.action_space.n):
@@ -64,7 +90,7 @@ class DPAgent(AgentBase):
                  for probability, next_state, reward, done in self.env.P[s][a]:
                      if (
                          self.env_name == "CliffWalking-v0"
-                         and state == self.env.observation_space.n - 1
+                         and s == self.env.observation_space.n - 1
                      ):
                          reward = 1
                      expected_value += probability * (
@@ -72,39 +98,35 @@ class DPAgent(AgentBase):
                      )
                  self.Pi[s, a] = expected_value
          idxs = np.argmax(self.Pi, axis=1)
-         print(idxs)
          self.Pi = np.zeros((self.env.observation_space.n, self.env.action_space.n))
          self.Pi[np.arange(self.env.observation_space.n), idxs] = 1
-         # print(self.Pi)
-         # return self.V, self.Pi


  if __name__ == "__main__":
-     # env = gym.make('FrozenLake-v1', render_mode='human')
-     dp = DPAgent(env="FrozenLake-v1", gamma=0.99)
-     dp.train()
-     dp.save_policy("dp_policy.npy")
      env = gym.make(
          "FrozenLake-v1",
-         render_mode="human",
+         render_mode="ansi",
+         desc=generate_random_map(8, seed=24),
          is_slippery=False,
-         desc=[
-             "SFFFFFFF",
-             "FFFFFFFH",
-             "FFFHFFFF",
-             "FFFFFHFF",
-             "FFFHFFFF",
-             "FHHFFFHF",
-             "FHFFHFHF",
-             "FFFHFFFG",
-         ],
      )
+     dp = DPAgent(env="FrozenLake-v1", gamma=0.99)
+     dp.env = env
+     dp.env_name = "FrozenLake-v1"
+     dp.V = np.zeros(dp.env.observation_space.n)
+     dp.Pi = np.zeros((dp.env.observation_space.n, dp.env.action_space.n))
+     dp.n_states, dp.n_actions = (
+         dp.env.observation_space.n,
+         dp.env.action_space.n,
+     )
+     dp.train()
+
+     print(dp.test())

      state, _ = env.reset()
      done = False
      while not done:
-         action = dp.choose_action(state)
+         action = dp.choose_action(state, greedy=True)
          state, reward, done, _, _ = env.step(action)
-         env.render()
-
-         # plt.savefig(f"imgs/{0}.png")
+         s = env.render()
+         print(s)
+         plt.savefig(f"imgs/{0}.png")
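
Note on the reworked stopping rule in train(): the single self.theta attribute now does double duty, acting as a convergence threshold on delta when it is below 1 and as an iteration budget when it is above 1, and train() now returns the list of per-sweep greedy success rates collected via self.test(). A minimal usage sketch, assuming DPAgent's constructor accepts a theta keyword (the constructor is not part of this diff):

    # Hypothetical calls; the theta keyword is assumed, not shown in this commit.
    dp = DPAgent(env="FrozenLake-v1", gamma=0.99, theta=1e-8)   # stop once delta < 1e-8
    success_rate = dp.train()                                   # per-sweep success rates

    dp_budget = DPAgent(env="FrozenLake-v1", gamma=0.99, theta=200)  # stop after 200 sweeps
    dp_budget.train()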
creategif.py CHANGED
@@ -5,21 +5,29 @@ from PIL import Image
  from PIL.Image import Transpose, Resampling


- api = wandb.Api()
- run = api.run("acozma/cs581/5ttfkav8")
+ # api = wandb.Api()
+ # run = api.run("acozma/cs581/5ttfkav8")

- print(run.summary)
- print("Downloading images...")
+ # print(run.summary)
+ # print("Downloading images...")

- for file in run.files():
-     if file.name.endswith(".png"):
-         file.download(exist_ok=True)
+ # for file in run.files():
+ #     if file.name.endswith(".png"):
+ #         file.download(exist_ok=True)

- print("Finished downloading images")
+ # print("Finished downloading images")
+
+
+ folder_path = "./imgs/"
+ policy_file_prefix = "Pi"
+ q_file_prefix = "Q"
+ out_name = "qtable_policy"
+ sort_lambda = lambda x: int(x.split("_")[1].split(".")[0])  # key used to sort image filenames


  def process_images(image_fnames, upscale=20):
-     image_fnames.sort(key=lambda x: int(x.split("_")[-2]))
+     print(image_fnames)
+     image_fnames.sort(key=sort_lambda)
      frames = [Image.open(image) for image in image_fnames]
      frames = [frame.transpose(Transpose.ROTATE_90) for frame in frames]
      frames = [
@@ -46,14 +54,13 @@ def images_to_gif(frames, fname, duration=500):
      )


- folder_path = "./media/images"
  all_fnames = [os.path.join(folder_path, f) for f in os.listdir(folder_path)]
+ print(all_fnames)

-
- fnames_policy = [f for f in all_fnames if os.path.basename(f).startswith("Policy")]
+ fnames_policy = [f for f in all_fnames if os.path.basename(f).startswith(policy_file_prefix)]
  policy_frames = process_images(fnames_policy)

- fnames_qtable = [f for f in all_fnames if os.path.basename(f).startswith("Q-table")]
+ fnames_qtable = [f for f in all_fnames if os.path.basename(f).startswith(q_file_prefix)]
  qtable_frames = process_images(fnames_qtable)

  spacing_factor = 1 / 2
@@ -67,4 +74,4 @@ for i, (qtable, policy) in enumerate(zip(qtable_frames, policy_frames)):
      new_frame.paste(policy, (0, height + int(height * spacing_factor)))
      final_frames.append(new_frame)

- images_to_gif(final_frames, "qtable_policy")
+ images_to_gif(final_frames, out_name)
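
The new sort_lambda key assumes frame filenames of the form <prefix>_<index>.png under ./imgs/ (and that the folder path itself contains no underscore), for example:

    # Assumed filename layout, e.g. Pi_0.png, Pi_1.png, Q_12.png
    sort_lambda("./imgs/Pi_12.png")  # -> 12
    sort_lambda("./imgs/Q_3.png")    # -> 3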
qtable_policy.gif DELETED
Binary file (583 kB)
 
scripts/dp_exp.py ADDED
@@ -0,0 +1,52 @@
+ import multiprocessing
+ import time
+
+ import gymnasium as gym
+ import numpy as np
+ from gymnasium.envs.toy_text.frozen_lake import generate_random_map
+
+ import wandb
+ from DPAgent import DPAgent
+ from MCAgent import MCAgent
+
+ env_ver = "FrozenLake-v1"
+
+
+ def test_dp(gamma=0.99):
+     env = gym.make(
+         env_ver,
+         render_mode="ansi",
+         # desc=generate_random_map(8, seed=3141),
+         # is_slippery=False,
+     )
+     dp = DPAgent(env=env_ver, gamma=0.99)
+     dp.env = env
+     dp.env_name = env_ver
+     dp.V = np.zeros(dp.env.observation_space.n)
+     dp.Pi = np.zeros((dp.env.observation_space.n, dp.env.action_space.n))
+     dp.n_states, dp.n_actions = (
+         dp.env.observation_space.n,
+         dp.env.action_space.n,
+     )
+     times = dp.train()
+
+     # np.save(f"times_{gamma}.npy", times)
+     s = env.render()
+     print(s)
+
+
+ def main():
+     wandb.init(
+         project="cs581",
+         # job_type=args.wandb_job_type,
+         # config=dict(args._get_kwargs()),
+     )
+     np.set_printoptions(linewidth=500, precision=3)
+     # with multiprocessing.Pool(8) as p:
+     #     gamma = [0.99, 0.95, 0.9, 0.8, 0.7, 0.6, 0.5, 0.1]
+     #     p.map(test_dp, gamma)
+     test_dp(0.99)
+
+
+ if __name__ == "__main__":
+     main()
scripts/speedtest.py ADDED
@@ -0,0 +1,83 @@
+ import multiprocessing
+ import time
+
+ import gymnasium as gym
+ import numpy as np
+ from gymnasium.envs.toy_text.frozen_lake import generate_random_map
+
+ from DPAgent import DPAgent
+ from MCAgent import MCAgent
+
+ env_ver = "FrozenLake-v1"
+
+
+ def test_mc(i, seed):
+     env = gym.make(
+         env_ver, desc=generate_random_map(size=i, p=0.4, seed=seed), is_slippery=False
+     )
+     agent = MCAgent(env=env_ver, gamma=1.0, epsilon=0.4)
+     agent.env = env
+     agent.n_states, agent.n_actions = (
+         agent.env.observation_space.n,
+         agent.env.action_space.n,
+     )
+     agent.initialize()
+     tic = time.perf_counter()
+     trained = agent.train(
+         max_steps=int((i**2) * 3),
+         n_train_episodes=10_000,
+         save_best=False,
+         early_stopping=True,
+         update_type="every_visit",
+     )
+     toc = time.perf_counter()
+     return trained, toc - tic
+
+
+ def test_dp(i, seed, gamma=0.99):
+     env = gym.make(env_ver, desc=generate_random_map(i, seed=seed), is_slippery=False)
+     agent = DPAgent(env=env_ver, gamma=gamma)
+     agent.env = env
+     agent.V = np.zeros(agent.env.observation_space.n)
+     agent.Pi = np.zeros((agent.env.observation_space.n, agent.env.action_space.n))
+     agent.n_states, agent.n_actions = (
+         agent.env.observation_space.n,
+         agent.env.action_space.n,
+     )
+
+     return agent.train()
+
+
+ def run_test(i):
+     mc_trained = False
+     seed = 0
+     mc_time = 0
+     dp_time = 0
+     while not mc_trained:
+         seed = np.random.randint(0, 100000)
+         mc_trained, train_time = test_mc(i, seed)
+         mc_time = train_time
+         dp_time = test_dp(i, seed)
+
+     return mc_time, dp_time
+
+
+ def run_exp(gamma):
+     times = []
+     for i in range(8, 512, 8):
+         # mc_time, dp_time = run_test(i)
+         dp_time = test_dp(i, 0, gamma=gamma)
+         times.append((i, dp_time))
+     times = np.array(times)
+     np.save(f"times_{gamma}.npy", times)
+     return
+
+
+ def main():
+     with multiprocessing.Pool(8) as p:
+         gamma = [0.95, 0.9, 0.8, 0.7, 0.6, 0.5, 0.1]
+         p.map(run_exp, gamma)
+
+
+ if __name__ == "__main__":
+     main()