Spaces:

acozma
/

CS581-Algos-Demo

Sleeping

CS581-Algos-Demo / MonteCarloAgent.py

Andrei Cozma

Updates

8ceccef about 2 years ago

14.4 kB

	import os
	import numpy as np
	import gymnasium as gym
	from tqdm import tqdm
	import argparse

	import wandb


	class MonteCarloAgent:
	    """On-policy Monte Carlo control agent with an epsilon-greedy tabular policy.

	    The agent keeps three tables indexed by state and action:

	    * ``Q``  - action-value estimates (mean of the returns recorded so far),
	    * ``R``  - the list of returns recorded for every state-action pair,
	    * ``Pi`` - the epsilon-greedy policy, one probability row per state.

	    The environment is created with ``gym.make``; its observation and action
	    spaces must both expose ``.n`` (i.e. be discrete).
	    """

	    def __init__(self, env_name="CliffWalking-v0", gamma=0.99, epsilon=0.1, **kwargs):
	        """Create the environment and initialise all tables.

	        Args:
	            env_name: Environment id passed to ``gym.make``.
	            gamma: Discount factor applied to future rewards.
	            epsilon: Exploration rate of the epsilon-greedy policy.
	            **kwargs: Forwarded verbatim to ``gym.make`` (e.g. ``render_mode``).
	        """
	        print("=" * 80)
	        print(f"# MonteCarloAgent - {env_name}")
	        print(f"- epsilon: {epsilon}")
	        print(f"- gamma: {gamma}")
	        self.env = gym.make(env_name, **kwargs)
	        self.epsilon, self.gamma = epsilon, gamma
	        self.n_states = self.env.observation_space.n
	        self.n_actions = self.env.action_space.n
	        print(f"- n_states: {self.n_states}")
	        print(f"- n_actions: {self.n_actions}")
	        self.reset()

	    def reset(self):
	        """Reinitialise ``Q``, ``R`` and ``Pi``, discarding everything learned."""
	        print("Resetting all state variables...")
	        self.Q = np.zeros((self.n_states, self.n_actions))
	        self.R = [[[] for _ in range(self.n_actions)] for _ in range(self.n_states)]

	        # An arbitrary e-greedy policy: every action starts with the exploration
	        # share, then one randomly picked action per state gets the greedy share.
	        self.Pi = np.full(
	            (self.n_states, self.n_actions), self.epsilon / self.n_actions
	        )
	        random_greedy_actions = np.random.randint(self.n_actions, size=self.n_states)
	        self.Pi[np.arange(self.n_states), random_greedy_actions] = (
	            1 - self.epsilon + self.epsilon / self.n_actions
	        )
	        print("=" * 80)
	        print("Initial policy:")
	        print(self.Pi)
	        print("=" * 80)

	    def choose_action(self, state):
	        """Sample an action for ``state`` from the current policy row ``Pi[state]``."""
	        return np.random.choice(self.n_actions, p=self.Pi[state])

	    def generate_episode(self, max_steps=500, render=False, **kwargs):
	        """Play one episode with the current policy, yielding progress after each step.

	        Args:
	            max_steps: Upper bound on the number of environment steps.
	            render: When true, ``env.render()`` is called before every step and
	                once more after the episode ends.
	            **kwargs: Accepted and ignored, so callers can forward shared options.

	        Yields:
	            ``(episode_hist, solved, rgb_array)`` after every step and one final
	            time when the episode is over. ``episode_hist`` is the same list of
	            ``(state, action, reward)`` tuples, extended in place between yields.
	            ``solved`` only becomes true in the final yield, and only when the
	            environment reported termination (``done``).
	        """
	        state, _ = self.env.reset()
	        episode_hist, solved, rgb_array = [], False, None

	        # Generate an episode following the current policy
	        for _ in range(max_steps):
	            rgb_array = self.env.render() if render else None
	            # Sample an action from the policy
	            action = self.choose_action(state)
	            # Take the action and observe the reward and next state
	            next_state, reward, done, truncated, _ = self.env.step(action)
	            # Keeping track of the trajectory
	            episode_hist.append((state, action, reward))
	            state = next_state

	            yield episode_hist, solved, rgb_array

	            # Termination is where the agent got to the goal. Jumping off the
	            # cliff simply respawns the agent at the start without termination.
	            if done:
	                solved = True
	                break
	            # A truncated episode was cut short by the environment; the goal
	            # was not reached, so it must not be counted as a success.
	            if truncated:
	                break

	        rgb_array = self.env.render() if render else None

	        yield episode_hist, solved, rgb_array

	    def run_episode(self, max_steps=500, render=False, **kwargs):
	        """Run ``generate_episode`` to completion and return its final yield.

	        Returns:
	            ``(episode_hist, solved, rgb_array)`` as produced by the last yield
	            of ``generate_episode``.
	        """
	        episode_hist, solved, rgb_array = None, False, None
	        for episode_hist, solved, rgb_array in self.generate_episode(
	            max_steps, render, **kwargs
	        ):
	            pass
	        return episode_hist, solved, rgb_array

	    def _record_return(self, state, action, G):
	        """Record return ``G`` for ``(state, action)`` and improve the policy at ``state``."""
	        self.R[state][action].append(G)
	        self.Q[state, action] = np.mean(self.R[state][action])
	        # Epsilon-greedy policy update
	        self.Pi[state] = np.full(self.n_actions, self.epsilon / self.n_actions)
	        # the greedy action is the one with the highest Q-value
	        self.Pi[state, np.argmax(self.Q[state])] = (
	            1 - self.epsilon + self.epsilon / self.n_actions
	        )

	    def update_first_visit(self, episode_hist):
	        """First-visit MC update: use only the first occurrence of each pair.

	        Args:
	            episode_hist: Sequence of ``(state, action, reward)`` tuples in the
	                order they were experienced.
	        """
	        # Index of the first occurrence of every state-action pair, computed in
	        # one pass instead of rescanning the episode prefix at every step.
	        first_visit = {}
	        for t, (state, action, _) in enumerate(episode_hist):
	            first_visit.setdefault((state, action), t)

	        G = 0
	        # For each step of the episode, in reverse order
	        for t in range(len(episode_hist) - 1, -1, -1):
	            state, action, reward = episode_hist[t]
	            # Update the expected return
	            G = self.gamma * G + reward
	            # Only the earliest visit of a pair contributes its return
	            if first_visit[(state, action)] == t:
	                self._record_return(state, action, G)

	    def update_every_visit(self, episode_hist):
	        """Every-visit MC update: every occurrence of a pair contributes a return.

	        Args:
	            episode_hist: Sequence of ``(state, action, reward)`` tuples in the
	                order they were experienced.
	        """
	        G = 0
	        # For each step of the episode, in reverse order
	        for state, action, reward in reversed(episode_hist):
	            # Update the expected return
	            G = self.gamma * G + reward
	            # Repeated visits are recorded too; this is the every-visit method
	            self._record_return(state, action, G)

	    def train(
	        self,
	        n_train_episodes=2000,
	        test_every=100,
	        update_type="first_visit",
	        log_wandb=False,
	        **kwargs,
	    ):
	        """Train the agent, periodically evaluating it.

	        Args:
	            n_train_episodes: Number of training episodes to run.
	            test_every: Evaluate with ``test`` every this many episodes. A falsy
	                value (``0`` or ``None``) disables periodic evaluation.
	            update_type: Suffix of the update method to use; the method
	                ``update_<update_type>`` is looked up on the agent.
	            log_wandb: When true, stats and table images are sent to ``wandb``.
	            **kwargs: Forwarded to both ``run_episode`` and ``test``.
	        """
	        print(f"Training agent for {n_train_episodes} episodes...")

	        train_running_success_rate, test_success_rate = 0.0, 0.0
	        stats = {
	            "train_running_success_rate": train_running_success_rate,
	            "test_success_rate": test_success_rate,
	        }

	        update_func = getattr(self, f"update_{update_type}")

	        tqrange = tqdm(range(n_train_episodes))
	        tqrange.set_description("Training")

	        if log_wandb:
	            self.wandb_log_img(episode=None)

	        for e in tqrange:
	            episode_hist, solved, _ = self.run_episode(**kwargs)
	            rewards = [x[2] for x in episode_hist]
	            total_reward = sum(rewards)
	            # An episode with no steps (max_steps=0) has no rewards to average
	            avg_reward = np.mean(rewards) if rewards else 0.0
	            # Exponential moving average of the per-episode success flag
	            train_running_success_rate = (
	                0.99 * train_running_success_rate + 0.01 * solved
	            )
	            update_func(episode_hist)

	            stats = {
	                "train_running_success_rate": train_running_success_rate,
	                "test_success_rate": test_success_rate,
	                "total_reward": total_reward,
	                "avg_reward": avg_reward,
	            }
	            tqrange.set_postfix(stats)

	            # Guard against test_every == 0, which would otherwise divide by zero
	            if test_every and e % test_every == 0:
	                test_success_rate = self.test(verbose=False, **kwargs)
	                if log_wandb:
	                    self.wandb_log_img(episode=e)

	            stats["test_success_rate"] = test_success_rate
	            tqrange.set_postfix(stats)

	            if log_wandb:
	                wandb.log(stats)

	    def test(self, n_test_episodes=100, verbose=True, **kwargs):
	        """Run evaluation episodes with the current policy.

	        Args:
	            n_test_episodes: Number of episodes to run.
	            verbose: When true, print a line per episode and a summary.
	            **kwargs: Forwarded to ``run_episode``.

	        Returns:
	            Fraction of episodes in which the agent solved the task, or ``0.0``
	            when no episodes were requested.
	        """
	        if verbose:
	            print(f"Testing agent for {n_test_episodes} episodes...")
	        num_successes = 0
	        for e in range(n_test_episodes):
	            _, solved, _ = self.run_episode(**kwargs)
	            num_successes += solved
	            if verbose:
	                word = "reached" if solved else "did not reach"
	                emoji = "🏁" if solved else "🚫"
	                print(
	                    f"({e + 1:>{len(str(n_test_episodes))}}/{n_test_episodes}) - Agent {word} the goal {emoji}"
	                )

	        # Zero requested episodes means a success rate of zero, not a crash
	        success_rate = num_successes / n_test_episodes if n_test_episodes > 0 else 0.0
	        if verbose:
	            print(
	                f"Agent reached the goal in {num_successes}/{n_test_episodes} episodes ({success_rate * 100:.2f}%)"
	            )
	        return success_rate

	    def save_policy(self, fname="policy.npy", save_dir=None):
	        """Save ``Pi`` with ``np.save``.

	        Args:
	            fname: File name of the saved policy.
	            save_dir: Optional directory; created if missing and prepended to
	                ``fname``.
	        """
	        if save_dir is not None:
	            os.makedirs(save_dir, exist_ok=True)
	            fname = os.path.join(save_dir, fname)
	        print(f"Saving policy to: {fname}")
	        np.save(fname, self.Pi)

	    def load_policy(self, fname="policy.npy"):
	        """Replace ``Pi`` with the array stored in ``fname`` (read via ``np.load``)."""
	        print(f"Loading policy from: {fname}")
	        self.Pi = np.load(fname)

	    def wandb_log_img(self, episode=None):
	        """Log the current ``Q`` and ``Pi`` tables to ``wandb`` as images.

	        Args:
	            episode: Episode number used in the captions; ``None`` labels the
	                images as the initial state.
	        """
	        caption_suffix = "Initial" if episode is None else f"After Episode {episode}"
	        wandb.log(
	            {
	                "Q-table": wandb.Image(
	                    self.Q,
	                    caption=f"Q-table - {caption_suffix}",
	                ),
	                "Policy": wandb.Image(
	                    self.Pi,
	                    caption=f"Policy - {caption_suffix}",
	                ),
	            }
	        )


	def _build_arg_parser():
	    """Build the command-line parser for training and testing the agent."""
	    parser = argparse.ArgumentParser()

	    ### Train/Test parameters
	    parser.add_argument(
	        "--train",
	        action="store_true",
	        help="Use this flag to train the agent.",
	    )
	    parser.add_argument(
	        "--test",
	        type=str,
	        default=None,
	        help="Use this flag to test the agent. Provide the path to the policy file.",
	    )
	    parser.add_argument(
	        "--n_train_episodes",
	        type=int,
	        default=2000,
	        help="The number of episodes to train for. (default: 2000)",
	    )
	    parser.add_argument(
	        "--n_test_episodes",
	        type=int,
	        default=100,
	        help="The number of episodes to test for. (default: 100)",
	    )
	    parser.add_argument(
	        "--test_every",
	        type=int,
	        default=100,
	        help="During training, test the agent every n episodes. (default: 100)",
	    )

	    parser.add_argument(
	        "--max_steps",
	        type=int,
	        default=500,
	        help="The maximum number of steps per episode before the episode is forced to end. (default: 500)",
	    )

	    parser.add_argument(
	        "--update_type",
	        type=str,
	        choices=["first_visit", "every_visit"],
	        default="first_visit",
	        help="The type of update to use. (default: first_visit)",
	    )

	    parser.add_argument(
	        "--save_dir",
	        type=str,
	        default="policies",
	        help="The directory to save the policy to. (default: policies)",
	    )

	    parser.add_argument(
	        "--no_save",
	        action="store_true",
	        help="Use this flag to disable saving the policy.",
	    )

	    ### Agent parameters
	    parser.add_argument(
	        "--gamma",
	        type=float,
	        default=0.99,
	        help="The value for the discount factor to use. (default: 0.99)",
	    )
	    parser.add_argument(
	        "--epsilon",
	        type=float,
	        default=0.5,
	        # The help text previously advertised 0.1 while the default is 0.5.
	        help="The value for the epsilon-greedy policy to use. (default: 0.5)",
	    )

	    ### Environment parameters
	    parser.add_argument(
	        "--env",
	        type=str,
	        default="CliffWalking-v0",
	        help="The Gymnasium environment to use. (default: CliffWalking-v0)",
	    )
	    parser.add_argument(
	        "--render_mode",
	        type=str,
	        default=None,
	        help="Render mode passed to the gym.make() function. Use 'human' to render the environment. (default: None)",
	    )
	    parser.add_argument(
	        "--wandb_project",
	        type=str,
	        default=None,
	        help="WandB project name for logging. If not provided, no logging is done. (default: None)",
	    )
	    parser.add_argument(
	        "--wandb_group",
	        type=str,
	        default="monte-carlo",
	        help="WandB group name for logging. (default: monte-carlo)",
	    )
	    parser.add_argument(
	        "--wandb_job_type",
	        type=str,
	        default="train",
	        help="WandB job type for logging. (default: train)",
	    )
	    parser.add_argument(
	        "--wandb_run_name_suffix",
	        type=str,
	        default=None,
	        help="WandB run name suffix for logging. (default: None)",
	    )
	    return parser


	def main():
	    """Command-line entry point: train a new agent or test a saved policy.

	    With ``--train`` the agent is trained (optionally logging to WandB when
	    ``--wandb_project`` is given) and its policy is saved unless ``--no_save``
	    is set. Otherwise, with ``--test PATH`` the policy at ``PATH`` is loaded
	    (``.npy`` is appended when missing) and evaluated. With neither flag an
	    error message is printed. ``KeyboardInterrupt`` is caught so Ctrl-C exits
	    quietly.
	    """
	    args = _build_arg_parser().parse_args()

	    agent = MonteCarloAgent(
	        args.env,
	        gamma=args.gamma,
	        epsilon=args.epsilon,
	        render_mode=args.render_mode,
	    )

	    run_name = f"{agent.__class__.__name__}_{args.env}_e{args.n_train_episodes}_s{args.max_steps}_g{args.gamma}_e{args.epsilon}"
	    if args.wandb_run_name_suffix is not None:
	        run_name += f"+{args.wandb_run_name_suffix}"

	    use_wandb = args.wandb_project is not None

	    try:
	        if args.train:
	            # Log to WandB
	            if use_wandb:
	                wandb.init(
	                    project=args.wandb_project,
	                    name=run_name,
	                    group=args.wandb_group,
	                    job_type=args.wandb_job_type,
	                    # vars() is the public way to read a Namespace as a dict
	                    config=dict(vars(args)),
	                )

	            agent.train(
	                n_train_episodes=args.n_train_episodes,
	                test_every=args.test_every,
	                n_test_episodes=args.n_test_episodes,
	                max_steps=args.max_steps,
	                update_type=args.update_type,
	                log_wandb=use_wandb,
	            )
	            if not args.no_save:
	                agent.save_policy(
	                    fname=f"{run_name}.npy",
	                    save_dir=args.save_dir,
	                )
	        elif args.test is not None:
	            # Accept the policy path with or without its ".npy" extension
	            policy_path = args.test
	            if not policy_path.endswith(".npy"):
	                policy_path += ".npy"
	            agent.load_policy(policy_path)
	            agent.test(
	                n_test_episodes=args.n_test_episodes,
	                max_steps=args.max_steps,
	            )
	        else:
	            print("ERROR: Please provide either --train or --test.")
	    except KeyboardInterrupt:
	        print("Exiting...")


	# Script entry point: run the command-line interface only when this file is
	# executed directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()