!pip install gym

# Import necessary libraries
import random
import warnings

import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from gym import spaces

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Set random seeds for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Define the device for computation (use CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Custom grid environment with fixed size and traps
class CustomGridEnv:
    def __init__(self, size=6, num_traps=3):
        # Fixed complexity: 6x6 grid with 3 traps
        self.size = size
        self.num_traps = num_traps
        self.observation_space = spaces.Discrete(self.size * self.size)
        self.action_space = spaces.Discrete(4)  # 4 actions: Up, Down, Left, Right
        self.reset()

    def reset(self):
        self.agent_pos = [0, 0]
        self.goal_pos = [self.size - 1, self.size - 1]
        self._generate_traps()
        return self._get_obs()

    def _generate_traps(self):
        self.traps = []
        while len(self.traps) < self.num_traps:
            trap = [np.random.randint(self.size), np.random.randint(self.size)]
            # Avoid the start cell, the goal cell, and duplicate trap positions
            if trap != self.agent_pos and trap != self.goal_pos and trap not in self.traps:
                self.traps.append(trap)

    def step(self, action):
        # Move the agent, staying inside the grid bounds
        if action == 0 and self.agent_pos[0] > 0:  # Up
            self.agent_pos[0] -= 1
        elif action == 1 and self.agent_pos[0] < self.size - 1:  # Down
            self.agent_pos[0] += 1
        elif action == 2 and self.agent_pos[1] > 0:  # Left
            self.agent_pos[1] -= 1
        elif action == 3 and self.agent_pos[1] < self.size - 1:  # Right
            self.agent_pos[1] += 1

        done = False
        if self.agent_pos == self.goal_pos:
            reward = 10  # Reached the goal
            done = True
        elif self.agent_pos in self.traps:
            reward = -5  # Stepped on a trap
            done = True
        else:
            reward = -1  # Step penalty to encourage short paths

        return self._get_obs(), reward, done, {}

    def _get_obs(self):
        # Flatten the (row, col) position into a single state index
        return self.agent_pos[0] * self.size + self.agent_pos[1]
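
# Illustrative example of the observation encoding (comment only, so the run
# stays reproducible): on the 6x6 grid, position (2, 3) maps to state index
# 2 * 6 + 3 = 15, and reset() always returns index 0 since the agent starts at (0, 0).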

# Neural network for the policy (a simple Q-network over one-hot states)
class PolicyNet(nn.Module):
    def __init__(self, input_size, num_actions):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)   # Hidden layer over the one-hot state input
        self.fc2 = nn.Linear(64, num_actions)  # Output layer: one Q-value per action

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        return self.fc2(x)
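
# Illustrative shape check (comment only, to avoid consuming RNG state before
# training): with 36 states and 4 actions,
#   net = PolicyNet(input_size=36, num_actions=4)
#   net(torch.zeros(1, 36)).shape  # -> torch.Size([1, 4])
# i.e., a batch of one state in, one Q-value per action out.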

# Helper function to convert a state index to a one-hot representation
def state_to_one_hot(state, num_states):
    one_hot = np.zeros(num_states, dtype=np.float32)
    one_hot[state] = 1.0
    # unsqueeze(0) adds the batch dimension expected by PolicyNet
    return torch.from_numpy(one_hot).unsqueeze(0).to(device)
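
# Example (illustrative): state_to_one_hot(7, 36) yields a 1x36 tensor that is
# all zeros except for a single 1.0 at index 7.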

# Epsilon-greedy action selection
def select_action(network, state, epsilon, num_actions):
    if np.random.uniform(0, 1) < epsilon:
        # Explore: with probability epsilon, pick a uniformly random action
        return np.random.choice(num_actions)
    else:
        # Exploit: otherwise pick the action with the largest Q-value
        with torch.no_grad():
            q_values = network(state)
        return torch.argmax(q_values).item()  # e.g., for Q-values [100, 277.7, 69.1], selects action 1
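
# Illustrative behavior: with epsilon=1.0 the policy is uniformly random; with
# epsilon=0.0 it is fully greedy. During training, epsilon starts at
# epsilon_start and decays geometrically toward a floor of 0.1 (see below).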

# Version 5
# Validate the df_goal_rows that appear in the visualization, and compare the
# success rate derived from goal visits with the success rate tracked during
# training (a cross-check is printed in the __main__ block below).

# Single-complexity meta-training process with success-rate tracking.
# The intrinsic-reward analysis is computed inside meta_train_fixed_complexity.
def meta_train_fixed_complexity(meta_learning_rate, epsilon_start, epsilon_decay, num_iterations, num_inner_steps, eta=0.1, epsilon=1e-5):
    num_states = 6 * 6      # Fixed grid size of 6x6
    num_actions = 4         # Up, Down, Left, Right
    discount_factor = 0.99  # Gamma

    # Initialize policy network
    policy_net = PolicyNet(input_size=num_states, num_actions=num_actions).to(device)
    optimizer = optim.Adam(policy_net.parameters(), lr=meta_learning_rate)

    epsilon_greedy = epsilon_start
    meta_losses, meta_rewards, success_rates = [], [], []

    env = CustomGridEnv(size=6, num_traps=3)  # Fixed complexity level: 6x6 grid with 3 traps

    # The intrinsic-reward analysis is captured by three quantities:
    # 1. state_visitation_counts: how often each state has been visited
    # 2. intrinsic_reward: a count-based exploration bonus, eta / sqrt(count + epsilon)
    # 3. step_reward: extrinsic reward plus intrinsic bonus, used for the TD target
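
    # Worked example (illustrative, with eta=0.1 and a negligible epsilon): the
    # first visit to a state gives a bonus of about 0.1 / sqrt(1) = 0.1; after
    # 100 visits the bonus shrinks to about 0.1 / sqrt(100) = 0.01, so
    # frequently visited states become less attractive over time.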

    # State visitation counts (quantity 1 above)
    state_visitation_counts = np.zeros(num_states)

    # Per-step records for the intrinsic-reward analysis
    intrinsic_analysis = []

    for iteration in range(num_iterations):
        print(f"Iteration {iteration + 1}/{num_iterations}")

        total_loss = 0
        total_reward = 0
        successes = 0

        for task in range(10):  # Fixed number of tasks per iteration
            state = env.reset()
            state = state_to_one_hot(state, num_states)

            for step in range(num_inner_steps):
                action = select_action(policy_net, state, epsilon_greedy, num_actions)
                next_state, reward_ext, done, _ = env.step(action)
                next_state = state_to_one_hot(next_state, num_states)

                state_idx = state.argmax().item()

                # 1. Update state visitation count
                state_visitation_counts[state_idx] += 1

                # 2. Count-based intrinsic reward: eta / sqrt(visit count + epsilon)
                intrinsic_reward = eta * (1 / np.sqrt(state_visitation_counts[state_idx] + epsilon))

                # 3. Combined per-step reward (extrinsic + intrinsic) for the TD target.
                # A separate name avoids shadowing the episode accumulator total_reward.
                step_reward = reward_ext + intrinsic_reward

                # Convert state index to 2D grid coordinates
                state_2d = (state_idx // 6, state_idx % 6)

                # Record per-step intrinsic-reward analysis data
                intrinsic_analysis.append({
                    'State_2D': state_2d,
                    'Intrinsic Reward': intrinsic_reward,
                    'Total Reward': step_reward,
                    'Extrinsic Reward': reward_ext
                })

                # TD target; do not bootstrap past terminal states
                with torch.no_grad():
                    target = step_reward + discount_factor * torch.max(policy_net(next_state)) * (0.0 if done else 1.0)

                prediction = policy_net(state)[0][action]
                loss = nn.functional.smooth_l1_loss(prediction, target)
                optimizer.zero_grad()  # Clear gradients from the previous step
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

                state = next_state
                total_reward += reward_ext
                if done:
                    if reward_ext == 10:  # Success is defined as reaching the goal
                        successes += 1
                    break

        meta_losses.append(total_loss / 10)
        meta_rewards.append(total_reward / 10)
        success_rates.append(successes / 10)
        epsilon_greedy = max(0.1, epsilon_greedy * epsilon_decay)  # Decay epsilon, floor at 0.1

    # Convert the intrinsic analysis records to a DataFrame and save to CSV
    df_intrinsic_analysis = pd.DataFrame(intrinsic_analysis)
    df_intrinsic_analysis.to_csv('intrinsic_analysis.csv', index=False)

    # Find all rows where the step reached the goal (Extrinsic Reward == 10).
    # Note that State_2D records the state *before* the step, so these rows hold
    # the positions from which the agent stepped onto the goal.
    df_goal_rows = df_intrinsic_analysis[df_intrinsic_analysis['Extrinsic Reward'] == 10]
    print(f"Shape of df_goal_rows: {df_goal_rows.shape}")
    print("Rows associated with reaching the goal:")
    print(df_goal_rows)

    # Filter for positions (4,5) and (5,4), the only cells from which the
    # goal (5,5) can be entered. isin() is used because direct '==' comparison
    # between a Series of tuples and a tuple is ambiguous in pandas.
    df_goal_rows_4_5 = df_goal_rows[df_goal_rows['State_2D'].isin([(4, 5)])]
    df_goal_rows_5_4 = df_goal_rows[df_goal_rows['State_2D'].isin([(5, 4)])]
    print("Rows associated with position (4,5):")
    print(df_goal_rows_4_5)
    print("Rows associated with position (5,4):")
    print(df_goal_rows_5_4)

    # Filter for any other positions (there should be none)
    df_goal_rows_other = df_goal_rows[~df_goal_rows['State_2D'].isin([(4, 5), (5, 4)])]
    print("Rows associated with any other positions:")
    print(df_goal_rows_other)

    # Success rate based on goal visits: goal-reaching steps per task attempted
    total_tasks = num_iterations * 10
    success_rate_goal_visits = len(df_goal_rows) / total_tasks * 100
    print(f"Success Rate based on Goal Visits: {success_rate_goal_visits:.2f}%")

    # Visualize state visitation counts
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 3, 1)
    plt.imshow(state_visitation_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('State Visitation Counts')
    plt.colorbar()

    # Visualize the intrinsic reward distribution
    intrinsic_rewards = df_intrinsic_analysis['Intrinsic Reward'].values
    plt.subplot(1, 3, 2)
    plt.hist(intrinsic_rewards, bins=50, color='blue', alpha=0.7)
    plt.title('Intrinsic Reward Distribution')
    plt.xlabel('Intrinsic Reward')
    plt.ylabel('Frequency')

    # Visualize the states from which the goal was reached
    goal_positions = df_goal_rows['State_2D'].apply(lambda x: x[0] * 6 + x[1]).values
    goal_counts = np.zeros(num_states)
    for pos in goal_positions:
        goal_counts[pos] += 1
    plt.subplot(1, 3, 3)
    plt.imshow(goal_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('Pre-Goal State Visitation Counts')
    plt.colorbar()

    plt.tight_layout()
    plt.show()

    return meta_losses, meta_rewards, success_rates

# Plot meta-loss, average reward, and success rate with white background and markers
def plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates, window_size=10):
    smoothed_losses = moving_average(meta_losses, window_size)
    smoothed_rewards = moving_average(meta_rewards, window_size)
    smoothed_success_rates = moving_average(success_rates, window_size)

    # Create the figure and axes with a white background
    fig, ax1 = plt.subplots(figsize=(14, 7), facecolor='white')
    ax1.set_facecolor('white')

    color = 'tab:red'
    ax1.set_xlabel('Meta-Iteration')
    ax1.set_ylabel('Meta-Loss', color=color)
    ax1.plot(meta_losses, color=color, alpha=0.1, label='Meta-Loss', marker='o', markersize=5)
    ax1.plot(range(window_size - 1, len(meta_losses)), smoothed_losses, color=color, label=f'Smoothed Meta-Loss (window={window_size})', marker='o', markersize=3)
    ax1.tick_params(axis='y', labelcolor=color)

    # Second y-axis (shared x-axis) for Average Reward
    ax2 = ax1.twinx()
    ax2.set_facecolor('white')
    color = 'tab:blue'
    ax2.set_ylabel('Average Reward', color=color)
    ax2.plot(meta_rewards, color=color, alpha=0.1, label='Average Reward', marker='s', markersize=5)
    ax2.plot(range(window_size - 1, len(meta_rewards)), smoothed_rewards, color=color, label=f'Smoothed Average Reward (window={window_size})', marker='s', markersize=3)
    ax2.tick_params(axis='y', labelcolor=color)

    # Third y-axis for Success Rate, offset outward to avoid overlapping ax2
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))
    ax3.set_facecolor('white')
    color = 'tab:green'
    ax3.set_ylabel('Success Rate', color=color)
    ax3.plot(success_rates, color=color, alpha=0.1, label='Success Rate', marker='^', markersize=5)
    ax3.plot(range(window_size - 1, len(success_rates)), smoothed_success_rates, color=color, label=f'Smoothed Success Rate (window={window_size})', marker='^', markersize=3)
    ax3.tick_params(axis='y', labelcolor=color)

    # Title and grid
    plt.title("Meta-Loss, Average Reward, and Success Rate Progress")
    fig.tight_layout()  # Adjust layout to prevent label clipping
    plt.grid(True)
    plt.show()

# Calculate the moving average of a sequence with a sliding window
def moving_average(data, window_size=30):
    return np.convolve(data, np.ones(window_size) / window_size, mode='valid')
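
# Example (illustrative): moving_average([1, 2, 3, 4], window_size=2) returns
# array([1.5, 2.5, 3.5]) -- each entry is the mean of a length-2 window, and
# 'valid' mode shortens the output by window_size - 1 elements.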

# Simplified run with a single complexity level and success-rate tracking
if __name__ == "__main__":
    meta_learning_rate = 1e-3
    epsilon_start = 0.9    # Initial exploration rate
    epsilon_decay = 0.99   # Multiplicative decay per iteration
    num_iterations = 500
    num_inner_steps = 50
    eta = 0.1              # Intrinsic reward scale
    epsilon = 1e-5         # Small constant inside the intrinsic-reward sqrt

    meta_losses, meta_rewards, success_rates = meta_train_fixed_complexity(
        meta_learning_rate=meta_learning_rate,
        epsilon_start=epsilon_start,
        epsilon_decay=epsilon_decay,
        num_iterations=num_iterations,
        num_inner_steps=num_inner_steps,
        eta=eta,
        epsilon=epsilon
    )
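
    # Cross-check (per the Version 5 note above): the mean of success_rates
    # from training should match the goal-visit success rate printed inside
    # meta_train_fixed_complexity, since both count episodes ending with
    # reward_ext == 10.
    print(f"Mean training success rate: {np.mean(success_rates) * 100:.2f}%")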

    # Plot results
    plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates)