Andrei Cozma committed
Commit 01901c5 · 1 Parent(s): 8f61dac

Added monte carlo training and testing scripts for cliff walking gym env
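A likely way to run these scripts (not stated in the commit; both scripts reference policy.npy by a relative path, so running from inside mc/ keeps the paths consistent):

    cd mc
    python mc_train.py   # trains for 2000 episodes and writes policy.npy
    python mc_test.py    # loads policy.npy and renders 10 test episodes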

Files changed (3)
  1. mc/mc_test.py +43 -0
  2. mc/mc_train.py +109 -0
  3. mc/policy.npy +0 -0
mc/mc_test.py ADDED
@@ -0,0 +1,43 @@
import numpy as np
import gymnasium as gym
from tqdm import tqdm

policy_file = "policy.npy"
n_steps = 500
n_test_episodes = 10


def main():
    print("=" * 80)
    print("# Cliff Walking - Monte Carlo Test")
    print("=" * 80)
    # Load the trained policy from disk
    print(f"Loading policy from file: '{policy_file}'...")
    Pi = np.load(policy_file)
    print("Policy:")
    print(Pi)
    print(f"shape: {Pi.shape}")
    _, n_actions = Pi.shape

    print("=" * 80)
    print(f"Testing policy for {n_test_episodes} episodes...")
    env = gym.make("CliffWalking-v0", render_mode="human")
    for e in range(n_test_episodes):
        print(f"Test #{e + 1}:", end=" ")

        state, _ = env.reset()
        for _ in range(n_steps):
            # Sample an action from the stored per-state probabilities
            action = np.random.choice(n_actions, p=Pi[state])
            next_state, reward, done, _, _ = env.step(action)
            state = next_state
            if done:
                print("Success!")
                break
        else:
            print("Failed!")

    env.close()


if __name__ == "__main__":
    main()
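For reference, the test loop above acts by sampling from the stored per-state distribution. A minimal sketch (not part of the commit) of the two obvious ways to act from such a policy array, assuming Pi has shape (n_states, n_actions) with rows summing to 1:

    import numpy as np

    Pi = np.load("policy.npy")
    state = 36  # start state of CliffWalking-v0 (bottom-left corner of the 4x12 grid)
    stochastic_action = np.random.choice(Pi.shape[1], p=Pi[state])  # as in mc_test.py
    greedy_action = int(np.argmax(Pi[state]))  # deterministic alternative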
mc/mc_train.py ADDED
@@ -0,0 +1,109 @@
import numpy as np
import gymnasium as gym
from tqdm import tqdm


def main():
    print("# Cliff Walking - Monte Carlo Train")
    env = gym.make("CliffWalking-v0")

    # Training parameters
    gamma, epsilon = 0.99, 0.1
    n_train_episodes, n_test_episodes, n_max_steps = 2000, 10, 500
    n_states, n_actions = env.observation_space.n, env.action_space.n
    print("=" * 80)
    print(f"gamma: {gamma}")
    print(f"epsilon: {epsilon}")
    print(f"n_episodes: {n_train_episodes}")
    print(f"n_steps: {n_max_steps}")
    print(f"n_states: {n_states}")
    print(f"n_actions: {n_actions}")
    print("=" * 80)

    # An arbitrary e-greedy policy
    Pi = np.full((n_states, n_actions), epsilon / n_actions)
    Pi[np.arange(n_states), np.random.randint(n_actions, size=n_states)] = (
        1 - epsilon + epsilon / n_actions
    )
    print("=" * 80)
    print("Initial policy:")
    print(Pi)
    print("=" * 80)
    Q = np.zeros((n_states, n_actions))
    R = [[[] for _ in range(n_actions)] for _ in range(n_states)]

    successes = []
    tqrange = tqdm(range(n_train_episodes))
    for i in tqrange:
        tqrange.set_description(f"Episode {i + 1:>4}")
        state, _ = env.reset()
        # Generate an episode following the current policy
        episode = []
        for _ in range(n_max_steps):
            # Randomly choose an action from the e-greedy policy
            action = np.random.choice(n_actions, p=Pi[state])
            # Take the action and observe the reward and next state
            next_state, reward, done, _, _ = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            # The episode terminates only when the agent reaches the goal;
            # stepping off the cliff simply respawns the agent at the start
            # position without ending the episode.
            if done:
                successes.append(1)
                break
        else:
            successes.append(0)

        G = 0
        # For each step of the episode, in reverse order
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            # Update the expected return
            G = gamma * G + reward
            # If this state-action pair was not visited earlier in the episode,
            # update the Q-table and policy (first-visit MC method)
            if (state, action) not in [(x[0], x[1]) for x in episode[:t]]:
                R[state][action].append(G)
                Q[state, action] = np.mean(R[state][action])
                # e-greedy policy update
                Pi[state] = np.full(n_actions, epsilon / n_actions)
                # the greedy action is the one with the highest Q-value
                Pi[state, np.argmax(Q[state])] = 1 - epsilon + epsilon / n_actions

        success_rate_100 = np.mean(successes[-100:])
        success_rate_250 = np.mean(successes[-250:])
        success_rate_500 = np.mean(successes[-500:])
        tqrange.set_postfix(
            success_rate_100=f"{success_rate_100:.3f}",
            success_rate_250=f"{success_rate_250:.3f}",
            success_rate_500=f"{success_rate_500:.3f}",
        )

    print("Final policy:")
    print(Pi)
    np.save("policy.npy", Pi)

    print("=" * 80)
    print(f"Testing policy for {n_test_episodes} episodes...")
    # Test the policy for a few episodes
    env = gym.make("CliffWalking-v0", render_mode="human")
    for e in range(n_test_episodes):
        print(f"Test #{e + 1}:", end=" ")

        state, _ = env.reset()
        for _ in range(n_max_steps):
            action = np.random.choice(n_actions, p=Pi[state])
            next_state, reward, done, _, _ = env.step(action)
            state = next_state
            if done:
                print("Success!")
                break
        else:
            print("Failed!")

    # Close the environment
    env.close()


if __name__ == "__main__":
    main()
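To make the update loop concrete, here is a small self-contained sketch (toy numbers, not output from the environment) of the backward return computation G = gamma * G + reward and the first-visit check used above:

    gamma = 0.99
    # Toy episode of (state, action, reward) triples; (24, 1) occurs twice
    episode = [(36, 0, -1), (24, 1, -1), (25, 1, -1), (24, 1, -1), (13, 2, -1)]
    G = 0.0
    first_visit_returns = {}
    for t in range(len(episode) - 1, -1, -1):
        state, action, reward = episode[t]
        G = gamma * G + reward
        # First-visit: keep G only for the earliest occurrence of (state, action)
        if (state, action) not in [(s, a) for s, a, _ in episode[:t]]:
            first_visit_returns[(state, action)] = G
    # Only the return from the first occurrence of (24, 1), at t=1, is recorded
    print(first_visit_returns)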
mc/policy.npy ADDED
Binary file (1.66 kB).
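A quick way to inspect the committed policy (a sketch; it assumes policy.npy holds the (n_states, n_actions) probability array saved by mc_train.py, and CliffWalking-v0 has 48 states on a 4x12 grid with 4 actions):

    import numpy as np

    Pi = np.load("mc/policy.npy")
    print(Pi.shape)  # expected: (48, 4)
    # Greedy action per state, laid out on the 4x12 grid
    # (actions: 0 = up, 1 = right, 2 = down, 3 = left)
    print(np.argmax(Pi, axis=1).reshape(4, 12))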