# CS581-Algos-Demo / mc/mc_train.py
# Author: Andrei Cozma
# Monte Carlo training script for the CliffWalking Gymnasium environment.

import numpy as np
import gymnasium as gym
from tqdm import tqdm


def main():
    print("# Cliff Walking - Monte Carlo Train")
    env = gym.make("CliffWalking-v0")

    # Training parameters
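    # gamma: discount factor; epsilon: exploration probability of the e-greedy
    # policy; n_max_steps: cap on the number of steps per episode.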
    gamma, epsilon = 0.99, 0.1
    n_train_episodes, n_test_episodes, n_max_steps = 2000, 10, 500
    n_states, n_actions = env.observation_space.n, env.action_space.n
print("=" * 80)
print(f"gamma: {gamma}")
print(f"epsilon: {epsilon}")
print(f"n_episodes: {n_train_episodes}")
print(f"n_steps: {n_max_steps}")
print(f"n_states: {n_states}")
print(f"n_actions: {n_actions}")
print("=" * 80)

    # An arbitrary initial e-greedy policy: every action gets probability
    # epsilon / n_actions, and one randomly chosen greedy action per state
    # gets the remaining 1 - epsilon on top of that.
    Pi = np.full((n_states, n_actions), epsilon / n_actions)
    Pi[np.arange(n_states), np.random.randint(n_actions, size=n_states)] = (
        1 - epsilon + epsilon / n_actions
    )
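    # Each row of Pi is a valid distribution over actions:
    # (n_actions - 1) * (epsilon / n_actions) + (1 - epsilon + epsilon / n_actions) = 1.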
print("=" * 80)
print("Initial policy:")
print(Pi)
print("=" * 80)
    Q = np.zeros((n_states, n_actions))
    R = [[[] for _ in range(n_actions)] for _ in range(n_states)]
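    # successes[i] is 1 if episode i reached the goal within n_max_steps, else 0.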
    successes = []

    tqrange = tqdm(range(n_train_episodes))
    for i in tqrange:
        tqrange.set_description(f"Episode {i + 1:>4}")
        state, _ = env.reset()

        # Generate an episode following the current policy
        episode = []
        for _ in range(n_max_steps):
            # Randomly choose an action from the e-greedy policy
            action = np.random.choice(n_actions, p=Pi[state])
            # Take the action and observe the reward and next state
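            # Gymnasium's step() returns (obs, reward, terminated, truncated, info);
            # only the terminated flag is used here and truncation is ignored.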
            next_state, reward, done, _, _ = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            # The episode terminates only when the agent reaches the goal.
            # If the agent steps off the cliff, it is simply sent back to the
            # start position without the episode terminating.
            if done:
                successes.append(1)
                break
        else:
            # The step limit was hit without reaching the goal.
            successes.append(0)
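
        # On-policy first-visit Monte Carlo control: walk the episode backwards,
        # accumulate the discounted return, and improve the same e-greedy policy
        # that generated the episode.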
        G = 0
        # For each step of the episode, in reverse order
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            # Update the expected return: G_t = r_{t+1} + gamma * G_{t+1}
            G = gamma * G + reward
            # First-visit MC: update the Q-table and policy only if this is the
            # first occurrence of this state-action pair in the episode
            if (state, action) not in [(x[0], x[1]) for x in episode[:t]]:
                R[state][action].append(G)
                Q[state, action] = np.mean(R[state][action])
                # e-greedy policy update
                Pi[state] = np.full(n_actions, epsilon / n_actions)
                # the greedy action is the one with the highest Q-value
                Pi[state, np.argmax(Q[state])] = 1 - epsilon + epsilon / n_actions
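
        # Rolling success rates over the most recent 100/250/500 episodes,
        # shown in the tqdm progress bar.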
        success_rate_100 = np.mean(successes[-100:])
        success_rate_250 = np.mean(successes[-250:])
        success_rate_500 = np.mean(successes[-500:])
        tqrange.set_postfix(
            success_rate_100=f"{success_rate_100:.3f}",
            success_rate_250=f"{success_rate_250:.3f}",
            success_rate_500=f"{success_rate_500:.3f}",
        )
print("Final policy:")
print(Pi)
np.save("policy.npy", Pi)
print("=" * 80)
print(f"Testing policy for {n_test_episodes} episodes...")
# Test the policy for a few episodes
env = gym.make("CliffWalking-v0", render_mode="human")
    for e in range(n_test_episodes):
        print(f"Test #{e + 1}:", end=" ")
        state, _ = env.reset()
        for _ in range(n_max_steps):
            action = np.random.choice(n_actions, p=Pi[state])
            next_state, reward, done, _, _ = env.step(action)
            state = next_state
            if done:
                print("Success!")
                break
        else:
            print("Failed!")

    # Close the environment
    env.close()
if __name__ == "__main__":
main()