Spaces:

acozma
/

CS581-Algos-Demo

Sleeping

App Files Files Community

lharri73 committed on Apr 25, 2023

Commit

e0c3c75

1 Parent(s): 361fc9e

add exps

Browse files

Files changed (5) hide show

DPAgent.py +49 -27
creategif.py +21 -14
qtable_policy.gif +0 -0
scripts/dp_exp.py +52 -0
scripts/speedtest.py +83 -0

DPAgent.py CHANGED Viewed

@@ -1,10 +1,13 @@
 import gymnasium as gym
 import numpy as np
 from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 from matplotlib import pyplot as plt
 from tqdm import trange
 from AgentBase import AgentBase
-import warnings
 class DPAgent(AgentBase):
@@ -23,6 +26,7 @@ class DPAgent(AgentBase):
         return self.Pi[state]
     def train(self, *args, **kwargs):
         i = 0
         print(self.gamma)
         while True:
@@ -50,13 +54,35 @@ class DPAgent(AgentBase):
                 # update the state-value function
                 self.V[state] = value
                 delta = max(delta, abs(V_prev[state] - self.V[state]))
-            if delta < self.theta:
                 break
             i += 1
-            # if i % 5 == 0 and i != 0:
-            #     self.test(verbose=False)
             print(f"Iteration {i}: delta={delta}")
         self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
         for s in range(self.env.observation_space.n):
             for a in range(self.env.action_space.n):
@@ -64,7 +90,7 @@ class DPAgent(AgentBase):
                 for probability, next_state, reward, done in self.env.P[s][a]:
                     if (
                         self.env_name == "CliffWalking-v0"
-                        and state == self.env.observation_space.n - 1
                     ):
                         reward = 1
                     expected_value += probability * (
@@ -72,39 +98,35 @@ class DPAgent(AgentBase):
                     )
                 self.Pi[s, a] = expected_value
         idxs = np.argmax(self.Pi, axis=1)
-        print(idxs)
         self.Pi = np.zeros((self.env.observation_space.n, self.env.action_space.n))
         self.Pi[np.arange(self.env.observation_space.n), idxs] = 1
-        # print(self.Pi)
-        # return self.V, self.Pi
 if __name__ == "__main__":
-    # env = gym.make('FrozenLake-v1', render_mode='human')
-    dp = DPAgent(env="FrozenLake-v1", gamma=0.99)
-    dp.train()
-    dp.save_policy("dp_policy.npy")
     env = gym.make(
         "FrozenLake-v1",
-        render_mode="human",
         is_slippery=False,
-        desc=[
-            "SFFFFFFF",
-            "FFFFFFFH",
-            "FFFHFFFF",
-            "FFFFFHFF",
-            "FFFHFFFF",
-            "FHHFFFHF",
-            "FHFFHFHF",
-            "FFFHFFFG",
-        ],
     )
     state, _ = env.reset()
     done = False
     while not done:
-        action = dp.choose_action(state)
         state, reward, done, _, _ = env.step(action)
-        env.render()
-    # plt.savefig(f"imgs/{0}.png")

+import warnings
 import gymnasium as gym
 import numpy as np
 from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 from matplotlib import pyplot as plt
+from PIL import Image
 from tqdm import trange
 from AgentBase import AgentBase
 class DPAgent(AgentBase):
         return self.Pi[state]
     def train(self, *args, **kwargs):
+        success_rate = []
         i = 0
         print(self.gamma)
         while True:
                 # update the state-value function
                 self.V[state] = value
                 delta = max(delta, abs(V_prev[state] - self.V[state]))
+            self.make_pi()
+            suc = self.test(verbose=False, greedy=True)
+            success_rate.append(suc)
+            if delta < self.theta and self.theta < 1:
+                print(f"breaking at {delta}, {self.theta}")
+                break
+            elif i > self.theta and self.theta > 1:
+                print(f"breaking at {i}, {self.theta}")
                 break
             i += 1
             print(f"Iteration {i}: delta={delta}")
+        # self.write_v(0)
+        return success_rate
+    def write_v(self, i):
+        v_cop = np.copy(self.V).reshape((12, 4))
+        print(v_cop)
+        v_cop -= np.min(v_cop)
+        v_cop /= np.max(v_cop)
+        print(np.min(v_cop), np.max(v_cop))
+        img = Image.fromarray(np.uint8(v_cop * 255), "L")
+        img = img.resize(
+            (v_cop.shape[0] * 100, v_cop.shape[1] * 100),
+            resample=Image.Resampling.NEAREST,
+        )
+        img.save(f"imgs/{i}.png")
+    def make_pi(self):
         self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
         for s in range(self.env.observation_space.n):
             for a in range(self.env.action_space.n):
                 for probability, next_state, reward, done in self.env.P[s][a]:
                     if (
                         self.env_name == "CliffWalking-v0"
+                        and s == self.env.observation_space.n - 1
                     ):
                         reward = 1
                     expected_value += probability * (
                     )
                 self.Pi[s, a] = expected_value
         idxs = np.argmax(self.Pi, axis=1)
         self.Pi = np.zeros((self.env.observation_space.n, self.env.action_space.n))
         self.Pi[np.arange(self.env.observation_space.n), idxs] = 1
 if __name__ == "__main__":
     env = gym.make(
         "FrozenLake-v1",
+        render_mode="ansi",
+        desc=generate_random_map(8, seed=24),
         is_slippery=False,
     )
+    dp = DPAgent(env="FrozenLake-v1", gamma=0.99)
+    dp.env = env
+    dp.env_name = "FrozenLake-v1"
+    dp.V = np.zeros(dp.env.observation_space.n)
+    dp.Pi = np.zeros(dp.env.observation_space.n, dp.env.action_space.n)
+    dp.n_states, dp.n_actions = (
+        dp.env.observation_space.n,
+        dp.env.action_space.n,
+    )
+    dp.train()
+    print(dp.test())
     state, _ = env.reset()
     done = False
     while not done:
+        action = dp.choose_action(state, greedy=True)
         state, reward, done, _, _ = env.step(action)
+        s = env.render()
+        print(s)
+    plt.savefig(f"imgs/{0}.png")

creategif.py CHANGED Viewed

@@ -5,21 +5,29 @@ from PIL import Image
 from PIL.Image import Transpose, Resampling
-api = wandb.Api()
-run = api.run("acozma/cs581/5ttfkav8")
-print(run.summary)
-print("Downloading images...")
-for file in run.files():
-    if file.name.endswith(".png"):
-        file.download(exist_ok=True)
-print("Finished downloading images")
 def process_images(image_fnames, upscale=20):
-    image_fnames.sort(key=lambda x: int(x.split("_")[-2]))
     frames = [Image.open(image) for image in image_fnames]
     frames = [frame.transpose(Transpose.ROTATE_90) for frame in frames]
     frames = [
@@ -46,14 +54,13 @@ def images_to_gif(frames, fname, duration=500):
     )
-folder_path = "./media/images"
 all_fnames = [os.path.join(folder_path, f) for f in os.listdir(folder_path)]
-fnames_policy = [f for f in all_fnames if os.path.basename(f).startswith("Policy")]
 policy_frames = process_images(fnames_policy)
-fnames_qtable = [f for f in all_fnames if os.path.basename(f).startswith("Q-table")]
 qtable_frames = process_images(fnames_qtable)
 spacing_factor = 1 / 2
@@ -67,4 +74,4 @@ for i, (qtable, policy) in enumerate(zip(qtable_frames, policy_frames)):
     new_frame.paste(policy, (0, height + int(height * spacing_factor)))
     final_frames.append(new_frame)
-images_to_gif(final_frames, "qtable_policy")

 from PIL.Image import Transpose, Resampling
+# api = wandb.Api()
+# run = api.run("acozma/cs581/5ttfkav8")
+# print(run.summary)
+# print("Downloading images...")
+# for file in run.files():
+#     if file.name.endswith(".png"):
+#         file.download(exist_ok=True)
+# print("Finished downloading images")
+folder_path = "./imgs/"
+policy_file_prefix = "Pi"
+q_file_prefix = "Q"
+out_name = "qtable_policy"
+sort_lambda = lambda x: int(x.split("_")[1].split(".")[0])  # key used to sort image filenames
 def process_images(image_fnames, upscale=20):
+    print(image_fnames)
+    image_fnames.sort(key=sort_lambda)
     frames = [Image.open(image) for image in image_fnames]
     frames = [frame.transpose(Transpose.ROTATE_90) for frame in frames]
     frames = [
     )
 all_fnames = [os.path.join(folder_path, f) for f in os.listdir(folder_path)]
+print(all_fnames)
+fnames_policy = [f for f in all_fnames if os.path.basename(f).startswith(policy_file_prefix)]
 policy_frames = process_images(fnames_policy)
+fnames_qtable = [f for f in all_fnames if os.path.basename(f).startswith(q_file_prefix)]
 qtable_frames = process_images(fnames_qtable)
 spacing_factor = 1 / 2
     new_frame.paste(policy, (0, height + int(height * spacing_factor)))
     final_frames.append(new_frame)
+images_to_gif(final_frames, out_name)

qtable_policy.gif DELETED Viewed

Binary file (583 kB)

scripts/dp_exp.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import multiprocessing
+import time
+import gymnasium as gym
+import numpy as np
+from gymnasium.envs.toy_text.frozen_lake import generate_random_map
+import wandb
+from DPAgent import DPAgent
+from MCAgent import MCAgent
+env_ver = "FrozenLake-v1"
+def test_dp(gamma=0.99):
+    env = gym.make(
+        env_ver,
+        render_mode="ansi",
+        # desc=generate_random_map(8, seed=3141),
+        # is_slippery=False,
+    )
+    dp = DPAgent(env=env_ver, gamma=0.99)
+    dp.env = env
+    dp.env_name = env_ver
+    dp.V = np.zeros(dp.env.observation_space.n)
+    dp.Pi = np.zeros(dp.env.observation_space.n, dp.env.action_space.n)
+    dp.n_states, dp.n_actions = (
+        dp.env.observation_space.n,
+        dp.env.action_space.n,
+    )
+    times = dp.train()
+    # np.save(f"times_{gamma}.npy", times)
+    s = env.render()
+    print(s)
+def main():
+    wandb.init(
+        project="cs581",
+        # job_type=args.wandb_job_type,
+        # config=dict(args._get_kwargs()),
+    )
+    np.set_printoptions(linewidth=500, precision=3)
+    # with multiprocessing.Pool(8) as p:
+    #     gamma = [0.99, 0.95, 0.9, 0.8, 0.7, 0.6, 0.5, 0.1]
+    #     p.map(test_dp, gamma)
+    test_dp(0.99)
+if __name__ == "__main__":
+    main()

scripts/speedtest.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import multiprocessing
+import time
+import gymnasium as gym
+import numpy as np
+from gymnasium.envs.toy_text.frozen_lake import generate_random_map
+from DPAgent import DPAgent
+from MCAgent import MCAgent
+env_ver = "FrozenLake-v1"
+def test_mc(i, seed):
+    env = gym.make(
+        env_ver, desc=generate_random_map(size=i, p=0.4, seed=seed), is_slippery=False
+    )
+    agent = MCAgent(env=env_ver, gamma=1.0, epsilon=0.4)
+    agent.env = env
+    agent.n_states, agent.n_actions = (
+        agent.env.observation_space.n,
+        agent.env.action_space.n,
+    )
+    agent.initialize()
+    tic = time.perf_counter()
+    trained = agent.train(
+        max_steps=int((i**2) * 3),
+        n_train_episodes=10_000,
+        save_best=False,
+        early_stopping=True,
+        update_type="every_visit",
+    )
+    toc = time.perf_counter()
+    return trained, toc - tic
+def test_dp(i, seed, gamma=0.99):
+    env = gym.make(env_ver, desc=generate_random_map(i, seed=seed), is_slippery=False)
+    agent = DPAgent(env=env_ver, gamma=gamma)
+    agent.env = env
+    agent.V = np.zeros(agent.env.observation_space.n)
+    agent.Pi = np.zeros(agent.env.observation_space.n, agent.env.action_space.n)
+    agent.n_states, agent.n_actions = (
+        agent.env.observation_space.n,
+        agent.env.action_space.n,
+    )
+    return agent.train()
+def run_test(i):
+    mc_trained = False
+    seed = 0
+    mc_time = 0
+    dp_time = 0
+    while not mc_trained:
+        seed = np.random.randint(0, 100000)
+        mc_trained, train_time = test_mc(i, seed)
+    mc_time = train_time
+    dp_time = test_dp(i, seed)
+    return mc_time, dp_time
+def run_exp(gamma):
+    times = []
+    for i in range(8, 512, 8):
+        # mc_time, dp_time = run_test(i)
+        dp_time = test_dp(i, 0, gamma=gamma)
+        times.append((i, dp_time))
+    times = np.array(times)
+    np.save(f"times_{gamma}.npy", times)
+    return
+def main():
+    with multiprocessing.Pool(8) as p:
+        gamma = [0.95, 0.9, 0.8, 0.7, 0.6, 0.5, 0.1]
+        p.map(run_exp, gamma)
+if __name__ == "__main__":
+    main()