Andrei Cozma committed
Commit: 01901c5
Parent(s): 8f61dac
Added Monte Carlo training and testing scripts for the CliffWalking gym env
- mc/mc_test.py +43 -0
- mc/mc_train.py +109 -0
- mc/policy.npy +0 -0
mc/mc_test.py
ADDED
@@ -0,0 +1,43 @@
+import numpy as np
+import gymnasium as gym
+from tqdm import tqdm
+
+policy_file = "policy.npy"
+n_steps = 500
+n_test_episodes = 10
+
+
+def main():
+    print("=" * 80)
+    print("# Cliff Walking - Monte Carlo Test")
+    print("=" * 80)
+    # Load the previously trained policy
+    print(f"Loading policy from file: '{policy_file}'...")
+    Pi = np.load(policy_file)
+    print("Policy:")
+    print(Pi)
+    print(f"shape: {Pi.shape}")
+    _, n_actions = Pi.shape
+
+    print("=" * 80)
+    print(f"Testing policy for {n_test_episodes} episodes...")
+    env = gym.make("CliffWalking-v0", render_mode="human")
+    for e in range(n_test_episodes):
+        print(f"Test #{e + 1}:", end=" ")
+
+        state, _ = env.reset()
+        for _ in range(n_steps):
+            action = np.random.choice(n_actions, p=Pi[state])
+            next_state, reward, done, _, _ = env.step(action)
+            state = next_state
+            if done:
+                print("Success!")
+                break
+        else:
+            print("Failed!")
+
+    env.close()
+
+
+if __name__ == "__main__":
+    main()
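Note: the test loop above keeps sampling actions from the stochastic e-greedy distribution stored in policy.npy. As a rough sketch of a possible variant (not part of this commit; the greedy_policy name is made up for illustration), the saved array could instead be collapsed to one deterministic greedy action per state:

import numpy as np

Pi = np.load("policy.npy")             # (n_states, n_actions) action probabilities
greedy_policy = np.argmax(Pi, axis=1)  # hypothetical: pick the most probable action per state
# In the rollout, `action = greedy_policy[state]` would then replace np.random.choice(...).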
mc/mc_train.py
ADDED
@@ -0,0 +1,109 @@
+import numpy as np
+import gymnasium as gym
+from tqdm import tqdm
+
+
+def main():
+    print("# Cliff Walking - Monte Carlo Train")
+    env = gym.make("CliffWalking-v0")
+
+    # Training parameters
+    gamma, epsilon = 0.99, 0.1
+    n_train_episodes, n_test_episodes, n_max_steps = 2000, 10, 500
+    n_states, n_actions = env.observation_space.n, env.action_space.n
+    print("=" * 80)
+    print(f"gamma: {gamma}")
+    print(f"epsilon: {epsilon}")
+    print(f"n_episodes: {n_train_episodes}")
+    print(f"n_steps: {n_max_steps}")
+    print(f"n_states: {n_states}")
+    print(f"n_actions: {n_actions}")
+    print("=" * 80)
+
+    # An arbitrary e-greedy policy: one randomly chosen greedy action per state
+    Pi = np.full((n_states, n_actions), epsilon / n_actions)
+    Pi[np.arange(n_states), np.random.randint(n_actions, size=n_states)] = (
+        1 - epsilon + epsilon / n_actions
+    )
+    print("=" * 80)
+    print("Initial policy:")
+    print(Pi)
+    print("=" * 80)
+    Q = np.zeros((n_states, n_actions))
+    R = [[[] for _ in range(n_actions)] for _ in range(n_states)]
+
+    successes = []
+    tqrange = tqdm(range(n_train_episodes))
+    for i in tqrange:
+        tqrange.set_description(f"Episode {i + 1:>4}")
+        state, _ = env.reset()
+        # Generate an episode following the current policy
+        episode = []
+        for _ in range(n_max_steps):
+            # Randomly choose an action from the e-greedy policy
+            action = np.random.choice(n_actions, p=Pi[state])
+            # Take the action and observe the reward and next state
+            next_state, reward, done, _, _ = env.step(action)
+            episode.append((state, action, reward))
+            state = next_state
+            # The episode only terminates when the agent reaches the goal.
+            # If the agent steps off the cliff, it is simply respawned at the start position without terminating.
+            if done:
+                successes.append(1)
+                break
+        else:
+            successes.append(0)
+
+        G = 0
+        # For each step of the episode, in reverse order
+        for t in range(len(episode) - 1, -1, -1):
+            state, action, reward = episode[t]
+            # Update the expected return
+            G = gamma * G + reward
+            # Only update the Q-table and policy if this state-action pair was not visited earlier in the episode.
+            # This is the first-visit MC method.
+            if (state, action) not in [(x[0], x[1]) for x in episode[:t]]:
+                R[state][action].append(G)
+                Q[state, action] = np.mean(R[state][action])
+                # e-greedy policy update
+                Pi[state] = np.full(n_actions, epsilon / n_actions)
+                # the greedy action is the one with the highest Q-value
+                Pi[state, np.argmax(Q[state])] = 1 - epsilon + epsilon / n_actions
+
+        success_rate_100 = np.mean(successes[-100:])
+        success_rate_250 = np.mean(successes[-250:])
+        success_rate_500 = np.mean(successes[-500:])
+        tqrange.set_postfix(
+            success_rate_100=f"{success_rate_100:.3f}",
+            success_rate_250=f"{success_rate_250:.3f}",
+            success_rate_500=f"{success_rate_500:.3f}",
+        )
+
+    print("Final policy:")
+    print(Pi)
+    np.save("policy.npy", Pi)
+
+    print("=" * 80)
+    print(f"Testing policy for {n_test_episodes} episodes...")
+    # Test the policy for a few episodes
+    env = gym.make("CliffWalking-v0", render_mode="human")
+    for e in range(n_test_episodes):
+        print(f"Test #{e + 1}:", end=" ")
+
+        state, _ = env.reset()
+        for _ in range(n_max_steps):
+            action = np.random.choice(n_actions, p=Pi[state])
+            next_state, reward, done, _, _ = env.step(action)
+            state = next_state
+            if done:
+                print("Success!")
+                break
+        else:
+            print("Failed!")
+
+    # Close the environment
+    env.close()
+
+
+if __name__ == "__main__":
+    main()
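For reference, the first-visit update above appends every observed return to R and re-averages it with np.mean on each visit. A minimal sketch of an equivalent, lower-memory formulation (an assumption about a possible refactor, not what this commit does; the N counter and update_first_visit helper are hypothetical names) keeps a visit count per state-action pair and updates the mean incrementally:

import numpy as np

n_states, n_actions = 48, 4           # CliffWalking-v0: 4x12 grid, 4 actions
Q = np.zeros((n_states, n_actions))
N = np.zeros((n_states, n_actions))   # hypothetical visit counts, replacing the R lists

def update_first_visit(state, action, G):
    # Running mean: same result as R[state][action].append(G); Q[...] = np.mean(R[state][action])
    N[state, action] += 1
    Q[state, action] += (G - Q[state, action]) / N[state, action]

This keeps Q identical to the averaged-returns version while avoiding the per-pair Python lists.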
mc/policy.npy
ADDED
Binary file (1.66 kB)