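"""Cliff Walking: first-visit Monte Carlo control with an epsilon-greedy policy.

Trains a tabular policy on Gymnasium's CliffWalking-v0, saves it to policy.npy,
and finally runs a few rendered test episodes.
"""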
import numpy as np
import gymnasium as gym
from tqdm import tqdm


def main():
    print("# Cliff Walking - Monte Carlo Train")
    env = gym.make("CliffWalking-v0")

    # Training parameters
    gamma, epsilon = 0.99, 0.1
    n_train_episodes, n_test_episodes, n_max_steps = 2000, 10, 500
    n_states, n_actions = env.observation_space.n, env.action_space.n
    print("=" * 80)
    print(f"gamma: {gamma}")
    print(f"epsilon: {epsilon}")
    print(f"n_episodes: {n_train_episodes}")
    print(f"n_steps: {n_max_steps}")
    print(f"n_states: {n_states}")
    print(f"n_actions: {n_actions}")
    print("=" * 80)

    # Initialize an arbitrary e-greedy policy: in each state, one randomly
    # chosen action gets probability 1 - epsilon + epsilon / n_actions,
    # every other action gets epsilon / n_actions
    Pi = np.full((n_states, n_actions), epsilon / n_actions)
    Pi[np.arange(n_states), np.random.randint(n_actions, size=n_states)] = (
        1 - epsilon + epsilon / n_actions
    )
    print("=" * 80)
    print("Initial policy:")
    print(Pi)
    print("=" * 80)
    Q = np.zeros((n_states, n_actions))
    R = [[[] for _ in range(n_actions)] for _ in range(n_states)]

    # 1 if the episode reached the goal within n_max_steps, 0 otherwise
    successes = []
    tqrange = tqdm(range(n_train_episodes))
    for i in tqrange:
        tqrange.set_description(f"Episode {i + 1:>4}")
        state, _ = env.reset()
        # Generate an episode following the current policy
        episode = []
        for _ in range(n_max_steps):
            # Randomly choose an action from the e-greedy policy
            action = np.random.choice(n_actions, p=Pi[state])
            # Take the action and observe the reward and next state
            next_state, reward, done, _, _ = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            # The episode terminates only when the agent reaches the goal.
            # Stepping into the cliff does not end the episode: the agent gets a
            # -100 reward and is sent back to the start position.
            if done:
                successes.append(1)
                break
        else:
            successes.append(0)

        # Monte Carlo update: walk the episode backwards, accumulating the
        # discounted return G at each step
        G = 0
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            # Discounted return from this step to the end of the episode
            G = gamma * G + reward
            # First-visit MC: update the Q-table and policy only if this is the
            # first occurrence of this state-action pair in the episode
            if (state, action) not in [(x[0], x[1]) for x in episode[:t]]:
                R[state][action].append(G)
                Q[state, action] = np.mean(R[state][action])
                # e-greedy policy update
                Pi[state] = np.full(n_actions, epsilon / n_actions)
                # the greedy action is the one with the highest Q-value
                Pi[state, np.argmax(Q[state])] = 1 - epsilon + epsilon / n_actions

        success_rate_100 = np.mean(successes[-100:])
        success_rate_250 = np.mean(successes[-250:])
        success_rate_500 = np.mean(successes[-500:])
        tqrange.set_postfix(
            success_rate_100=f"{success_rate_100:.3f}",
            success_rate_250=f"{success_rate_250:.3f}",
            success_rate_500=f"{success_rate_500:.3f}",
        )

    print("Final policy:")
    print(Pi)
    np.save("policy.npy", Pi)
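    # The saved policy can later be reloaded with np.load("policy.npy")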

    print("=" * 80)
    print(f"Testing policy for {n_test_episodes} episodes...")
    # Test the policy for a few episodes
    env = gym.make("CliffWalking-v0", render_mode="human")
    for e in range(n_test_episodes):
        print(f"Test #{e + 1}:", end=" ")

        state, _ = env.reset()
        for _ in range(n_max_steps):
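            # Actions are still sampled from the stochastic e-greedy policy
            # rather than taken greedily with respect to Q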
            action = np.random.choice(n_actions, p=Pi[state])
            next_state, reward, done, _, _ = env.step(action)
            state = next_state
            if done:
                print("Success!")
                break
        else:
            print("Failed!")

    # Close the environment
    env.close()


if __name__ == "__main__":
    main()