lharri73 committed
Commit 8ae24a2 · 1 Parent(s): 22ea33b
DPAgent.py CHANGED
@@ -4,20 +4,25 @@ from gymnasium.envs.toy_text.frozen_lake import generate_random_map
 from matplotlib import pyplot as plt
 from tqdm import trange
 from Shared import Shared
+import warnings


 class DPAgent(Shared):
-    def __init__(self, theta=1e-10, **kwargs):
+    def __init__(self,/,**kwargs):
         super().__init__(**kwargs)
-        self.theta = theta
+        self.theta = kwargs.get('theta', 1e-10)
+        print(self.theta)
         self.V = np.zeros(self.env.observation_space.n)
-        self.Pi = None
+        self.Pi = np.zeros(self.env.observation_space.n, self.env.action_space.n)
+        if self.gamma >= 1.0:
+            warnings.warn("DP will never converge with a gamma value =1.0. Try 0.99?", UserWarning)

     def policy(self, state):
         return self.Pi[state]

-    def train(self, **kwargs):
+    def train(self, *args, **kwargs):
         i = 0
+        print(self.gamma)
         while True:
             delta = 0
             V_prev = np.copy(self.V)
@@ -38,10 +43,10 @@ class DPAgent(Shared):
             if delta < self.theta:
                 break
             i += 1
-            self.test()
+            # if i % 100 == 0 and i != 0:
+            #     self.test()
             print(f"Iteration {i}: delta={delta}")
             # break
-
         # policy = [self.policy(state, return_value=True)[0] for state in range(self.env.observation_space.n)]
         self.Pi = np.empty((self.env.observation_space.n, self.env.action_space.n))
         for s in range(self.env.observation_space.n):
@@ -51,23 +56,18 @@ class DPAgent(Shared):
                     # if state == self.env.observation_space.n-1: reward = 1
                     expected_value += probability * (reward + self.gamma * self.V[next_state])
                 self.Pi[s,a] = expected_value
-        self.Pi = np.argmax(self.Pi, axis=1)
-        print(self.Pi)
+        idxs = np.argmax(self.Pi, axis=1)
+        print(idxs)
+        self.Pi = np.zeros((self.env.observation_space.n,self.env.action_space.n))
+        self.Pi[np.arange(self.env.observation_space.n),idxs] = 1
+
+        # print(self.Pi)
         # return self.V, self.Pi


 if __name__ == "__main__":
     # env = gym.make('FrozenLake-v1', render_mode='human')
-    dp = DPAgent("FrozenLake-v1", is_slippery=False, desc=[
-        "SFFFFFFF",
-        "FFFFFFFH",
-        "FFFHFFFF",
-        "FFFFFHFF",
-        "FFFHFFFF",
-        "FHHFFFHF",
-        "FHFFHFHF",
-        "FFFHFFFG",
-    ])
+    dp = DPAgent(env_name="FrozenLake-v1")
     dp.train()
     dp.save_policy('dp_policy.npy')
     env = gym.make('FrozenLake-v1', render_mode='human', is_slippery=False, desc=[
@@ -84,8 +84,7 @@ if __name__ == "__main__":
     state, _ = env.reset()
     done = False
     while not done:
-        action = dp.policy(state)
-        action = np.argmax(action)
+        action = dp.choose_action(state)
         state, reward, done, _, _ = env.step(action)
         env.render()

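Note on the policy-extraction change above: Pi is now kept as a one-hot matrix (one row per state, a 1 in the greedy action's column) instead of a flat vector of action indices, so policy(state) returns an action distribution; the __main__ loop then calls dp.choose_action(state) (defined in Shared, not shown in this diff), which presumably reads that row. A minimal standalone sketch of the argmax-to-one-hot step, with made-up shapes rather than the repository's environment:

import numpy as np

n_states, n_actions = 16, 4
q = np.random.rand(n_states, n_actions)            # stand-in for the expected-value table built above

idxs = np.argmax(q, axis=1)                         # greedy action index per state
pi = np.zeros((n_states, n_actions))                # one-hot policy matrix
pi[np.arange(n_states), idxs] = 1

assert np.allclose(pi.sum(axis=1), 1)               # exactly one action selected per state
assert np.array_equal(np.argmax(pi, axis=1), idxs)  # argmax recovers the same greedy actions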
 
MCAgent.py CHANGED
@@ -8,6 +8,7 @@ class MCAgent(Shared):

     def __init__(
         self,
+        /,
         env_name="CliffWalking-v0",
         gamma=0.99,
         epsilon=0.1,
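The only change here is the bare / in the signature. For reference, / (Python 3.8+) makes the parameters to its left positional-only; parameters to its right remain usable both positionally and by keyword. A small illustrative sketch with made-up names, not code from this repository:

def configure(run_id, /, env_name="CliffWalking-v0", gamma=0.99):
    return run_id, env_name, gamma

configure(7, env_name="FrozenLake-v1")   # fine: run_id positional, env_name by keyword
configure(7, "FrozenLake-v1", 0.9)       # fine: parameters after '/' may still be positional
# configure(run_id=7)                    # would raise TypeError: run_id is positional-only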
Shared.py CHANGED
@@ -8,7 +8,7 @@ import wandb
 class Shared:

     def __init__(
-        self,
+        self,/,
         env_name="CliffWalking-v0",
         gamma=0.99,
         epsilon=0.1,
@@ -24,7 +24,7 @@
         self.env_name = env_name
         self.epsilon, self.gamma = epsilon, gamma

-        self.env_kwargs = kwargs
+        self.env_kwargs = {k:v for k,v in kwargs.items() if k in ['render_mode']}
         if self.env_name == "FrozenLake-v1":
             # Can use defaults by defining map_name (4x4 or 8x8) or custom map by defining desc
             # self.env_kwargs["map_name"] = "8x8"
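The env_kwargs change above switches from forwarding every constructor keyword to the environment to an explicit allow-list, so that agent-only options (theta, gamma, CLI flags, ...) are not passed to gym.make. A tiny sketch of the same filtering pattern; the names here are illustrative, not the repository's API:

ENV_KEYS = ['render_mode']

def split_kwargs(**kwargs):
    env_kwargs = {k: v for k, v in kwargs.items() if k in ENV_KEYS}       # forwarded to the env
    rest = {k: v for k, v in kwargs.items() if k not in ENV_KEYS}         # kept for the agent
    return env_kwargs, rest

env_kwargs, rest = split_kwargs(render_mode="human", gamma=0.99, theta=1e-10)
print(env_kwargs)  # {'render_mode': 'human'}
print(rest)        # {'gamma': 0.99, 'theta': 1e-10}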
dp_policy.npy ADDED
Binary file (2.18 kB)
 
policies/DPAgent_CliffWalking-v0_e2500_s200_g0.99_e0.4_first_visit.npy ADDED
Binary file (1.66 kB)
 
policies/DPAgent_CliffWalking-v0_i219_g0.9.npy DELETED
Binary file (512 Bytes)
 
policies/DPAgent_FrozenLake-v1_e2500_s200_g0.99_e0.4_first_visit.npy ADDED
Binary file (1.66 kB)
 
policies/DPAgent_FrozenLake-v1_i219_g0.9.npy DELETED
Binary file (640 Bytes)
 
policies/{MonteCarloAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy → MCAgent_CliffWalking-v0_e2500_s200_g1.0_e0.4_first_visit.npy} RENAMED
File without changes
policies/{MonteCarloAgent_FrozenLake-v1_e2500_s200_g1.0_e0.2_first_visit.npy → MCAgent_FrozenLake-v1_e2500_s200_g1.0_e0.2_first_visit.npy} RENAMED
File without changes
run.py CHANGED
@@ -5,7 +5,6 @@ from agents import AGENTS_MAP

 def main():
     parser = argparse.ArgumentParser()
-
     ### Train/Test parameters
     parser.add_argument(
         "--train",
@@ -77,7 +76,7 @@
     parser.add_argument(
         "--gamma",
         type=float,
-        default=1.0,
+        default=0.99,
         help="The value for the discount factor to use. (default: 1.0)",
     )
     parser.add_argument(
@@ -128,13 +127,8 @@
     )

     args = parser.parse_args()
-
-    agent = AGENTS_MAP[args.agent](
-        args.env,
-        gamma=args.gamma,
-        epsilon=args.epsilon,
-        render_mode=args.render_mode,
-    )
+    print(vars(args))
+    agent = AGENTS_MAP[args.agent](**dict(args._get_kwargs()))

     run_name = f"{agent.__class__.__name__}_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}_{args.update_type}"
     if args.wandb_run_name_suffix is not None:
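On the construction change above: argparse's Namespace._get_kwargs() returns the parsed options as (name, value) pairs, so dict(args._get_kwargs()) is the same mapping as the public vars(args); every CLI option is then forwarded to the agent constructor as a keyword argument, which is why the agents now accept **kwargs. A short sketch under that assumption (DummyAgent is a stand-in, not AGENTS_MAP):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--env", default="FrozenLake-v1")
parser.add_argument("--gamma", type=float, default=0.99)
args = parser.parse_args([])

print(dict(args._get_kwargs()))   # {'env': 'FrozenLake-v1', 'gamma': 0.99}
print(vars(args))                 # same mapping, via the public API

class DummyAgent:                 # stand-in for AGENTS_MAP[args.agent]
    def __init__(self, /, **kwargs):
        self.gamma = kwargs.get("gamma", 1.0)

agent = DummyAgent(**vars(args))
print(agent.gamma)                # 0.99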