# Install the Gym dependency if needed: run `pip install gym` from a shell, or `!pip install gym` in a notebook cell
# Import necessary libraries
import torch
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import numpy as np
import random
import warnings
import gym
from gym import spaces
import matplotlib.pyplot as plt
# Suppress possible warnings for cleaner output
warnings.filterwarnings("ignore")
# Set random seeds for reproducibility
def set_seed(seed):
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
set_seed(42)
# Define the device for computation (use CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Custom grid environment with fixed size and traps
class CustomGridEnv:
def __init__(self, size=6, num_traps=3):
# Fixed complexity: grid 6x6 with 3 traps
self.size = size
self.num_traps = num_traps
        self.observation_space = spaces.Discrete(self.size * self.size)
self.action_space = spaces.Discrete(4) # 4 actions: Up, Down, Left, Right
self.reset()
def reset(self):
self.agent_pos = [0, 0]
self.goal_pos = [self.size - 1, self.size - 1]
self._generate_traps()
return self._get_obs()
    def _generate_traps(self):
        self.traps = []
        while len(self.traps) < self.num_traps:
            trap = [np.random.randint(self.size), np.random.randint(self.size)]
            # Keep the trap only if it does not overlap the start, the goal, or an existing trap
            if trap != self.agent_pos and trap != self.goal_pos and trap not in self.traps:
                self.traps.append(trap)
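    # Note: _generate_traps is called from reset(), so the trap layout is re-sampled every episode
    # while the start cell (0, 0) and the goal cell (size - 1, size - 1) stay fixed.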
def step(self, action):
if action == 0 and self.agent_pos[0] > 0:
# Up
self.agent_pos[0] -= 1
elif action == 1 and self.agent_pos[0] < self.size - 1:
# Down
self.agent_pos[0] += 1
elif action == 2 and self.agent_pos[1] > 0:
# Left
self.agent_pos[1] -= 1
elif action == 3 and self.agent_pos[1] < self.size - 1:
# Right
self.agent_pos[1] += 1
done = False
if self.agent_pos == self.goal_pos:
reward = 10
done = True
elif self.agent_pos in self.traps:
reward = -5
done = True
else:
reward = -1
return self._get_obs(), reward, done, {}
def _get_obs(self):
return self.agent_pos[0] * self.size + self.agent_pos[1]
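# A minimal smoke test (not called anywhere in this script) sketching how CustomGridEnv is expected
# to behave: reset() places the agent at (0, 0), observations are the flattened index row * size + col,
# and a random rollout ends on the goal (+10), a trap (-5), or when the step budget runs out.
# The helper name _smoke_test_env and its max_steps argument are illustrative, not part of the original code.
def _smoke_test_env(max_steps=20):
    demo_env = CustomGridEnv(size=6, num_traps=3)
    obs = demo_env.reset()
    assert obs == 0  # the start cell (0, 0) flattens to index 0
    reward, done = 0, False
    for _ in range(max_steps):
        obs, reward, done, _ = demo_env.step(demo_env.action_space.sample())
        if done:
            break
    return obs, reward, done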
# Neural network for the policy
class PolicyNet(nn.Module):
def __init__(self, input_size, num_actions):
super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)   # Hidden layer: one-hot state vector -> 64 hidden units
        self.fc2 = nn.Linear(64, num_actions)  # Output layer: 64 hidden units -> one Q-value per action
def forward(self, state):
x = torch.relu(self.fc1(state))
return self.fc2(x)
# Helper function to convert a state to one-hot representation
def state_to_one_hot(state, num_states):
one_hot = np.zeros(num_states)
one_hot[state] = 1
return torch.FloatTensor([one_hot]).to(device)
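# Worked example (illustrative values): state_to_one_hot(7, 36) returns a 1 x 36 tensor that is all
# zeros except for a 1.0 at index 7, which corresponds to grid cell (row=1, col=1) on the 6x6 grid.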
# Epsilon-greedy action selection
def select_action(network, state, epsilon, num_actions):
    if np.random.uniform(0, 1) < epsilon:  # Explore: with probability epsilon, take a random action
        return np.random.choice(num_actions)
    else:  # Exploit: otherwise take the action with the largest Q-value
        with torch.no_grad():
            q_values = network(state)
        return torch.argmax(q_values).item()  # e.g., q_values [100, 277.7123, 69.1234567] -> index 1
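# Illustrative behaviour (assumed numbers): with epsilon = 0.9 roughly 90% of actions are sampled
# uniformly at random, and once the decay floor of 0.1 is reached (see the training loop below)
# roughly 90% of actions are instead the greedy argmax over the network's Q-values.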
# Version 5
# Validate the df_goal_rows that appear in the visualization.
# Compare the success rate based on goal visits with the success rate tracked during training.
# Single-complexity meta-training loop with success-rate tracking.
# The intrinsic reward analysis is performed inside meta_train_fixed_complexity.
def meta_train_fixed_complexity(meta_learning_rate, epsilon_start, epsilon_decay, num_iterations, num_inner_steps, eta=0.1, epsilon=1e-5):
num_states = 6 * 6 # Fixed grid size of 6x6
num_actions = 4 # Up, Down, Left, Right
discount_factor = 0.99 # Gamma
# Initialize policy network
policy_net = PolicyNet(input_size=num_states, num_actions=num_actions).to(device)
optimizer = optim.Adam(policy_net.parameters(), lr=meta_learning_rate)
epsilon_greedy = epsilon_start
meta_losses, meta_rewards, success_rates = [], [], []
env = CustomGridEnv(size=6, num_traps=3) # Fixed complexity level: grid 6x6 with 3 traps
    # Intrinsic Reward Analysis data are captured by three variables:
    # 1. state_visitation_counts
    # 2. intrinsic_reward
    # 3. combined_reward (the extrinsic plus intrinsic reward for each step)
# 1. state_visitation_counts
# State visitation counts
state_visitation_counts = np.zeros(num_states)
# Initialize intrinsic analysis list
intrinsic_analysis = []
for iteration in range(num_iterations):
print(f"Iteration {iteration + 1}/{num_iterations}")
total_loss = 0
total_reward = 0
successes = 0
for task in range(10): # Fixed number of tasks for each iteration
state = env.reset()
state = state_to_one_hot(state, num_states)
optimizer.zero_grad()
for step in range(num_inner_steps):
action = select_action(policy_net, state, epsilon_greedy, num_actions)
next_state, reward_ext, done, _ = env.step(action)
next_state = state_to_one_hot(next_state, num_states)
# 1. state_visitation_counts
# Update state visitation count
state_visitation_counts[state.argmax().item()] += 1
# 2. intrinsic_reward
# Calculate intrinsic reward
intrinsic_reward = eta * (1 / np.sqrt(state_visitation_counts[state.argmax().item()] + epsilon))
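                    # Worked example of the count-based bonus (assumed numbers): with eta = 0.1 and
                    # epsilon = 1e-5, the first visit to a state (count = 1) earns roughly
                    # 0.1 / sqrt(1) ~= 0.1, while the 100th visit earns only 0.1 / sqrt(100) = 0.01,
                    # so the bonus fades as a state becomes familiar and novelty is rewarded early on.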
                    # 3. combined_reward
                    # Combine the extrinsic and intrinsic rewards for this step (kept separate from
                    # total_reward, which accumulates the extrinsic return over the iteration)
                    combined_reward = reward_ext + intrinsic_reward
# Convert state index to 2D grid representation
state_2d = (state.argmax().item() // 6, state.argmax().item() % 6)
# Append intrinsic analysis data
intrinsic_analysis.append({
'State_2D': state_2d,
'Intrinsic Reward': intrinsic_reward,
                        'Total Reward': combined_reward,
'Extrinsic Reward': reward_ext
})
with torch.no_grad():
                        target = combined_reward + discount_factor * torch.max(policy_net(next_state))
prediction = policy_net(state)[0][action]
loss = nn.functional.smooth_l1_loss(prediction, target)
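                    # The update above is a one-step Q-learning-style target: the current estimate
                    # Q(s, a) is regressed toward combined_reward + discount_factor * max_a' Q(s', a')
                    # using a smooth L1 (Huber) loss. Note that the target also bootstraps from the
                    # next state on terminal transitions; many DQN-style implementations instead mask
                    # the bootstrap term with (1 - done).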
                    optimizer.zero_grad()  # Clear accumulated gradients so each step's update uses only its own loss
                    loss.backward()
total_loss += loss.item()
optimizer.step()
state = next_state
                    total_reward += reward_ext  # Accumulate the extrinsic return across this iteration's tasks
if done:
if reward_ext == 10: # Success is defined as reaching the goal
successes += 1
break
meta_losses.append(total_loss / 10)
meta_rewards.append(total_reward / 10)
success_rates.append(successes / 10)
epsilon_greedy = max(0.1, epsilon_greedy * epsilon_decay)
# Convert intrinsic analysis list to DataFrame and save to CSV
df_intrinsic_analysis = pd.DataFrame(intrinsic_analysis)
df_intrinsic_analysis.to_csv('intrinsic_analysis.csv', index=False)
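    # The saved CSV holds one row per environment step, with columns State_2D (the pre-step cell),
    # Intrinsic Reward, Total Reward (extrinsic + intrinsic), and Extrinsic Reward.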
    # Find all rows where the goal was reached (Extrinsic Reward == 10); State_2D records the cell the agent stepped from
df_goal_rows = df_intrinsic_analysis[df_intrinsic_analysis['Extrinsic Reward'] == 10]
print(f"Shape of df_goal_rows: {df_goal_rows.shape}")
print("Rows associated with the goal position:")
print(df_goal_rows)
    # Filter for positions (4, 5) and (5, 4), the two cells from which the goal at (5, 5) can be entered
df_goal_rows_4_5 = df_goal_rows[df_goal_rows['State_2D'] == (4, 5)]
df_goal_rows_5_4 = df_goal_rows[df_goal_rows['State_2D'] == (5, 4)]
print("Rows associated with position (4,5):")
print(df_goal_rows_4_5)
print("Rows associated with position (5,4):")
print(df_goal_rows_5_4)
# Filter for any other positions
df_goal_rows_other = df_goal_rows[~df_goal_rows['State_2D'].isin([(4, 5), (5, 4)])]
print("Rows associated with any other positions:")
print(df_goal_rows_other)
# Calculate success rate based on goal visits
total_tasks = num_iterations * 10
success_rate_goal_visits = len(df_goal_rows) / total_tasks * 100
print(f"Success Rate based on Goal Visits: {success_rate_goal_visits:.2f}%")
# Visualize state visitation counts
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.imshow(state_visitation_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
plt.title('State Visitation Counts')
plt.colorbar()
# Visualize intrinsic rewards
intrinsic_rewards = df_intrinsic_analysis['Intrinsic Reward'].values
plt.subplot(1, 3, 2)
plt.hist(intrinsic_rewards, bins=50, color='blue', alpha=0.7)
plt.title('Intrinsic Reward Distribution')
plt.xlabel('Intrinsic Reward')
plt.ylabel('Frequency')
# Visualize goal position visits
goal_positions = df_goal_rows['State_2D'].apply(lambda x: x[0] * 6 + x[1]).values
goal_counts = np.zeros(num_states)
for pos in goal_positions:
goal_counts[pos] += 1
plt.subplot(1, 3, 3)
plt.imshow(goal_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
plt.title('Goal Position Visitation Counts')
plt.colorbar()
plt.tight_layout()
plt.show()
return meta_losses, meta_rewards, success_rates
# Plot function for meta-loss, average reward, and success rate with white background and markers
def plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates, window_size=10):
smoothed_losses = moving_average(meta_losses, window_size)
smoothed_rewards = moving_average(meta_rewards, window_size)
smoothed_success_rates = moving_average(success_rates, window_size)
# Create the figure and axes with white background
fig, ax1 = plt.subplots(figsize=(14, 7), facecolor='white')
# Set axes background color to white
ax1.set_facecolor('white')
color = 'tab:red'
ax1.set_xlabel('Meta-Iteration')
ax1.set_ylabel('Meta-Loss', color=color)
ax1.plot(meta_losses, color=color, alpha=0.1, label='Meta-Loss', marker='o', markersize=5)
ax1.plot(range(window_size - 1, len(meta_losses)), smoothed_losses, color=color, label=f'Smoothed Meta-Loss (window={window_size})', marker='o', markersize=3)
ax1.tick_params(axis='y', labelcolor=color)
# Twin x-axis for Average Reward
ax2 = ax1.twinx()
ax2.set_facecolor('white') # Set the background color of the second axis to white
color = 'tab:blue'
ax2.set_ylabel('Average Reward', color=color)
ax2.plot(meta_rewards, color=color, alpha=0.1, label='Average Reward', marker='s', markersize=5)
ax2.plot(range(window_size - 1, len(meta_rewards)), smoothed_rewards, color=color, label=f'Smoothed Average Reward (window={window_size})', marker='s', markersize=3)
ax2.tick_params(axis='y', labelcolor=color)
# Third axis for Success Rate
ax3 = ax1.twinx()
ax3.spines['right'].set_position(('outward', 60))
ax3.set_facecolor('white') # Set the background color of the third axis to white
color = 'tab:green'
ax3.set_ylabel('Success Rate', color=color)
ax3.plot(success_rates, color=color, alpha=0.1, label='Success Rate', marker='^', markersize=5)
ax3.plot(range(window_size - 1, len(success_rates)), smoothed_success_rates, color=color, label=f'Smoothed Success Rate (window={window_size})', marker='^', markersize=3)
ax3.tick_params(axis='y', labelcolor=color)
# Title and grid
plt.title("Meta-Loss, Average Reward, and Success Rate Progress")
fig.tight_layout() # Adjust layout to prevent label clipping
plt.grid(True)
# Show the plot
plt.show()
# Function to calculate moving average
def moving_average(data, window_size=30):
return np.convolve(data, np.ones(window_size) / window_size, mode='valid')
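# Worked example (illustrative values): moving_average([1, 2, 3, 4], window_size=2) returns
# array([1.5, 2.5, 3.5]). The output is shorter than the input by window_size - 1, which is why the
# smoothed curves above are plotted against range(window_size - 1, len(data)).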
# Simplified run with single complexity and success rate tracking
if __name__ == "__main__":
meta_learning_rate = 1e-3
epsilon_start = 0.9
epsilon_decay = 0.99
num_iterations = 500
num_inner_steps = 50
eta = 0.1
epsilon = 1e-5
meta_losses, meta_rewards, success_rates = meta_train_fixed_complexity(
meta_learning_rate=meta_learning_rate,
epsilon_start=epsilon_start,
epsilon_decay=epsilon_decay,
num_iterations=num_iterations,
num_inner_steps=num_inner_steps,
eta=eta,
epsilon=epsilon
)
# Plot results
plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates)