!pip install gym

# Import necessary libraries
import random
import warnings

import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from gym import spaces

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Set random seeds for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Define the device for computation (use CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Custom grid environment with fixed size and traps
class CustomGridEnv:
    def __init__(self, size=6, num_traps=3):
        # Fixed complexity: 6x6 grid with 3 traps
        self.size = size
        self.num_traps = num_traps
        self.observation_space = spaces.Discrete(self.size * self.size)
        self.action_space = spaces.Discrete(4)  # 4 actions: Up, Down, Left, Right
        self.reset()

    def reset(self):
        self.agent_pos = [0, 0]
        self.goal_pos = [self.size - 1, self.size - 1]
        self._generate_traps()
        return self._get_obs()

    def _generate_traps(self):
        self.traps = []
        while len(self.traps) < self.num_traps:
            trap = [np.random.randint(self.size), np.random.randint(self.size)]
            # Avoid the start cell, the goal cell, and duplicate trap positions
            if trap != self.agent_pos and trap != self.goal_pos and trap not in self.traps:
                self.traps.append(trap)

    def step(self, action):
        # Move the agent, staying inside the grid bounds
        if action == 0 and self.agent_pos[0] > 0:  # Up
            self.agent_pos[0] -= 1
        elif action == 1 and self.agent_pos[0] < self.size - 1:  # Down
            self.agent_pos[0] += 1
        elif action == 2 and self.agent_pos[1] > 0:  # Left
            self.agent_pos[1] -= 1
        elif action == 3 and self.agent_pos[1] < self.size - 1:  # Right
            self.agent_pos[1] += 1

        done = False
        if self.agent_pos == self.goal_pos:
            reward = 10  # Reached the goal
            done = True
        elif self.agent_pos in self.traps:
            reward = -5  # Stepped on a trap
            done = True
        else:
            reward = -1  # Step penalty to encourage short paths

        return self._get_obs(), reward, done, {}

    def _get_obs(self):
        # Flatten the (row, col) position into a single state index
        return self.agent_pos[0] * self.size + self.agent_pos[1]
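
# Illustrative example of the observation encoding (comment only, so the run
# stays reproducible): on the 6x6 grid, position (2, 3) maps to state index
# 2 * 6 + 3 = 15, and reset() always returns index 0 since the agent starts at (0, 0).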

# Neural network for the policy (a simple Q-network over one-hot states)
class PolicyNet(nn.Module):
    def __init__(self, input_size, num_actions):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)   # Hidden layer over the one-hot state input
        self.fc2 = nn.Linear(64, num_actions)  # Output layer: one Q-value per action

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        return self.fc2(x)
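
# Illustrative shape check (comment only, to avoid consuming RNG state before
# training): with 36 states and 4 actions,
#   net = PolicyNet(input_size=36, num_actions=4)
#   net(torch.zeros(1, 36)).shape  # -> torch.Size([1, 4])
# i.e., a batch of one state in, one Q-value per action out.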

# Helper function to convert a state index to a one-hot representation
def state_to_one_hot(state, num_states):
    one_hot = np.zeros(num_states, dtype=np.float32)
    one_hot[state] = 1.0
    # unsqueeze(0) adds the batch dimension expected by PolicyNet
    return torch.from_numpy(one_hot).unsqueeze(0).to(device)
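
# Example (illustrative): state_to_one_hot(7, 36) yields a 1x36 tensor that is
# all zeros except for a single 1.0 at index 7.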

# Epsilon-greedy action selection
def select_action(network, state, epsilon, num_actions):
    if np.random.uniform(0, 1) < epsilon:
        # Explore: with probability epsilon, pick a uniformly random action
        return np.random.choice(num_actions)
    else:
        # Exploit: otherwise pick the action with the largest Q-value
        with torch.no_grad():
            q_values = network(state)
        return torch.argmax(q_values).item()  # e.g., for Q-values [100, 277.7, 69.1], selects action 1
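
# Illustrative behavior: with epsilon=1.0 the policy is uniformly random; with
# epsilon=0.0 it is fully greedy. During training, epsilon starts at
# epsilon_start and decays geometrically toward a floor of 0.1 (see below).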

# Version 5
# Validate the df_goal_rows that appear in the visualization, and compare the
# success rate derived from goal visits with the success rate tracked during
# training (a cross-check is printed in the __main__ block below).

# Single-complexity meta-training process with success-rate tracking.
# The intrinsic-reward analysis is computed inside meta_train_fixed_complexity.
def meta_train_fixed_complexity(meta_learning_rate, epsilon_start, epsilon_decay, num_iterations, num_inner_steps, eta=0.1, epsilon=1e-5):
    num_states = 6 * 6      # Fixed grid size of 6x6
    num_actions = 4         # Up, Down, Left, Right
    discount_factor = 0.99  # Gamma

    # Initialize policy network
    policy_net = PolicyNet(input_size=num_states, num_actions=num_actions).to(device)
    optimizer = optim.Adam(policy_net.parameters(), lr=meta_learning_rate)

    epsilon_greedy = epsilon_start
    meta_losses, meta_rewards, success_rates = [], [], []

    env = CustomGridEnv(size=6, num_traps=3)  # Fixed complexity level: 6x6 grid with 3 traps

    # The intrinsic-reward analysis is captured by three quantities:
    # 1. state_visitation_counts: how often each state has been visited
    # 2. intrinsic_reward: a count-based exploration bonus, eta / sqrt(count + epsilon)
    # 3. step_reward: extrinsic reward plus intrinsic bonus, used for the TD target
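
    # Worked example (illustrative, with eta=0.1 and a negligible epsilon): the
    # first visit to a state gives a bonus of about 0.1 / sqrt(1) = 0.1; after
    # 100 visits the bonus shrinks to about 0.1 / sqrt(100) = 0.01, so
    # frequently visited states become less attractive over time.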

    # State visitation counts (quantity 1 above)
    state_visitation_counts = np.zeros(num_states)

    # Per-step records for the intrinsic-reward analysis
    intrinsic_analysis = []

    for iteration in range(num_iterations):
        print(f"Iteration {iteration + 1}/{num_iterations}")

        total_loss = 0
        total_reward = 0
        successes = 0

        for task in range(10):  # Fixed number of tasks per iteration
            state = env.reset()
            state = state_to_one_hot(state, num_states)

            for step in range(num_inner_steps):
                action = select_action(policy_net, state, epsilon_greedy, num_actions)
                next_state, reward_ext, done, _ = env.step(action)
                next_state = state_to_one_hot(next_state, num_states)

                state_idx = state.argmax().item()

                # 1. Update state visitation count
                state_visitation_counts[state_idx] += 1

                # 2. Count-based intrinsic reward: eta / sqrt(visit count + epsilon)
                intrinsic_reward = eta * (1 / np.sqrt(state_visitation_counts[state_idx] + epsilon))

                # 3. Combined per-step reward (extrinsic + intrinsic) for the TD target.
                # A separate name avoids shadowing the episode accumulator total_reward.
                step_reward = reward_ext + intrinsic_reward

                # Convert state index to 2D grid coordinates
                state_2d = (state_idx // 6, state_idx % 6)

                # Record per-step intrinsic-reward analysis data
                intrinsic_analysis.append({
                    'State_2D': state_2d,
                    'Intrinsic Reward': intrinsic_reward,
                    'Total Reward': step_reward,
                    'Extrinsic Reward': reward_ext
                })

                # TD target; do not bootstrap past terminal states
                with torch.no_grad():
                    target = step_reward + discount_factor * torch.max(policy_net(next_state)) * (0.0 if done else 1.0)

                prediction = policy_net(state)[0][action]
                loss = nn.functional.smooth_l1_loss(prediction, target)
                optimizer.zero_grad()  # Clear gradients from the previous step
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

                state = next_state
                total_reward += reward_ext
                if done:
                    if reward_ext == 10:  # Success is defined as reaching the goal
                        successes += 1
                    break

        meta_losses.append(total_loss / 10)
        meta_rewards.append(total_reward / 10)
        success_rates.append(successes / 10)
        epsilon_greedy = max(0.1, epsilon_greedy * epsilon_decay)  # Decay epsilon, floor at 0.1

    # Convert the intrinsic analysis records to a DataFrame and save to CSV
    df_intrinsic_analysis = pd.DataFrame(intrinsic_analysis)
    df_intrinsic_analysis.to_csv('intrinsic_analysis.csv', index=False)

    # Find all rows where the step reached the goal (Extrinsic Reward == 10).
    # Note that State_2D records the state *before* the step, so these rows hold
    # the positions from which the agent stepped onto the goal.
    df_goal_rows = df_intrinsic_analysis[df_intrinsic_analysis['Extrinsic Reward'] == 10]
    print(f"Shape of df_goal_rows: {df_goal_rows.shape}")
    print("Rows associated with reaching the goal:")
    print(df_goal_rows)

    # Filter for positions (4,5) and (5,4), the only cells from which the
    # goal (5,5) can be entered. isin() is used because direct '==' comparison
    # between a Series of tuples and a tuple is ambiguous in pandas.
    df_goal_rows_4_5 = df_goal_rows[df_goal_rows['State_2D'].isin([(4, 5)])]
    df_goal_rows_5_4 = df_goal_rows[df_goal_rows['State_2D'].isin([(5, 4)])]
    print("Rows associated with position (4,5):")
    print(df_goal_rows_4_5)
    print("Rows associated with position (5,4):")
    print(df_goal_rows_5_4)

    # Filter for any other positions (there should be none)
    df_goal_rows_other = df_goal_rows[~df_goal_rows['State_2D'].isin([(4, 5), (5, 4)])]
    print("Rows associated with any other positions:")
    print(df_goal_rows_other)

    # Success rate based on goal visits: goal-reaching steps per task attempted
    total_tasks = num_iterations * 10
    success_rate_goal_visits = len(df_goal_rows) / total_tasks * 100
    print(f"Success Rate based on Goal Visits: {success_rate_goal_visits:.2f}%")

    # Visualize state visitation counts
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 3, 1)
    plt.imshow(state_visitation_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('State Visitation Counts')
    plt.colorbar()

    # Visualize the intrinsic reward distribution
    intrinsic_rewards = df_intrinsic_analysis['Intrinsic Reward'].values
    plt.subplot(1, 3, 2)
    plt.hist(intrinsic_rewards, bins=50, color='blue', alpha=0.7)
    plt.title('Intrinsic Reward Distribution')
    plt.xlabel('Intrinsic Reward')
    plt.ylabel('Frequency')

    # Visualize the states from which the goal was reached
    goal_positions = df_goal_rows['State_2D'].apply(lambda x: x[0] * 6 + x[1]).values
    goal_counts = np.zeros(num_states)
    for pos in goal_positions:
        goal_counts[pos] += 1
    plt.subplot(1, 3, 3)
    plt.imshow(goal_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('Pre-Goal State Visitation Counts')
    plt.colorbar()

    plt.tight_layout()
    plt.show()

    return meta_losses, meta_rewards, success_rates

# Plot meta-loss, average reward, and success rate with white background and markers
def plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates, window_size=10):
    smoothed_losses = moving_average(meta_losses, window_size)
    smoothed_rewards = moving_average(meta_rewards, window_size)
    smoothed_success_rates = moving_average(success_rates, window_size)

    # Create the figure and axes with a white background
    fig, ax1 = plt.subplots(figsize=(14, 7), facecolor='white')
    ax1.set_facecolor('white')

    color = 'tab:red'
    ax1.set_xlabel('Meta-Iteration')
    ax1.set_ylabel('Meta-Loss', color=color)
    ax1.plot(meta_losses, color=color, alpha=0.1, label='Meta-Loss', marker='o', markersize=5)
    ax1.plot(range(window_size - 1, len(meta_losses)), smoothed_losses, color=color, label=f'Smoothed Meta-Loss (window={window_size})', marker='o', markersize=3)
    ax1.tick_params(axis='y', labelcolor=color)

    # Second y-axis (shared x-axis) for Average Reward
    ax2 = ax1.twinx()
    ax2.set_facecolor('white')
    color = 'tab:blue'
    ax2.set_ylabel('Average Reward', color=color)
    ax2.plot(meta_rewards, color=color, alpha=0.1, label='Average Reward', marker='s', markersize=5)
    ax2.plot(range(window_size - 1, len(meta_rewards)), smoothed_rewards, color=color, label=f'Smoothed Average Reward (window={window_size})', marker='s', markersize=3)
    ax2.tick_params(axis='y', labelcolor=color)

    # Third y-axis for Success Rate, offset outward to avoid overlapping ax2
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))
    ax3.set_facecolor('white')
    color = 'tab:green'
    ax3.set_ylabel('Success Rate', color=color)
    ax3.plot(success_rates, color=color, alpha=0.1, label='Success Rate', marker='^', markersize=5)
    ax3.plot(range(window_size - 1, len(success_rates)), smoothed_success_rates, color=color, label=f'Smoothed Success Rate (window={window_size})', marker='^', markersize=3)
    ax3.tick_params(axis='y', labelcolor=color)

    # Title and grid
    plt.title("Meta-Loss, Average Reward, and Success Rate Progress")
    fig.tight_layout()  # Adjust layout to prevent label clipping
    plt.grid(True)
    plt.show()

# Calculate the moving average of a sequence with a sliding window
def moving_average(data, window_size=30):
    return np.convolve(data, np.ones(window_size) / window_size, mode='valid')
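
# Example (illustrative): moving_average([1, 2, 3, 4], window_size=2) returns
# array([1.5, 2.5, 3.5]) -- each entry is the mean of a length-2 window, and
# 'valid' mode shortens the output by window_size - 1 elements.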

# Simplified run with a single complexity level and success-rate tracking
if __name__ == "__main__":
    meta_learning_rate = 1e-3
    epsilon_start = 0.9    # Initial exploration rate
    epsilon_decay = 0.99   # Multiplicative decay per iteration
    num_iterations = 500
    num_inner_steps = 50
    eta = 0.1              # Intrinsic reward scale
    epsilon = 1e-5         # Small constant inside the intrinsic-reward sqrt

    meta_losses, meta_rewards, success_rates = meta_train_fixed_complexity(
        meta_learning_rate=meta_learning_rate,
        epsilon_start=epsilon_start,
        epsilon_decay=epsilon_decay,
        num_iterations=num_iterations,
        num_inner_steps=num_inner_steps,
        eta=eta,
        epsilon=epsilon
    )
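
    # Cross-check (per the Version 5 note above): the mean of success_rates
    # from training should match the goal-visit success rate printed inside
    # meta_train_fixed_complexity, since both count episodes ending with
    # reward_ext == 10.
    print(f"Mean training success rate: {np.mean(success_rates) * 100:.2f}%")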

    # Plot results
    plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates)