TroglodyteDerivations
committed on
Commit 682f2b2
1 Parent(s): 5b42e60
Create algo2.txt
algo2.txt
ADDED
@@ -0,0 +1,348 @@
!pip install gym

# Import necessary libraries
import torch
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import numpy as np
import random
import warnings
import gym
from gym import spaces
import matplotlib.pyplot as plt

# Suppress possible warnings for cleaner output
warnings.filterwarnings("ignore")

# Set random seeds for reproducibility
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Define the device for computation (use CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Custom grid environment with fixed size and traps
class CustomGridEnv:
    def __init__(self, size=6, num_traps=3):
        # Fixed complexity: grid 6x6 with 3 traps
        self.size = size
        self.num_traps = num_traps
        self.observation_space = spaces.Discrete(self.size * self.size)
        self.action_space = spaces.Discrete(4)  # 4 actions: Up, Down, Left, Right
        self.reset()

    def reset(self):
        self.agent_pos = [0, 0]
        self.goal_pos = [self.size - 1, self.size - 1]
        self._generate_traps()
        return self._get_obs()

    def _generate_traps(self):
        self.traps = []
        while len(self.traps) < self.num_traps:
            trap = [np.random.randint(self.size), np.random.randint(self.size)]
            if trap != self.agent_pos and trap != self.goal_pos:
                self.traps.append(trap)

    def step(self, action):
        if action == 0 and self.agent_pos[0] > 0:
            # Up
            self.agent_pos[0] -= 1
        elif action == 1 and self.agent_pos[0] < self.size - 1:
            # Down
            self.agent_pos[0] += 1
        elif action == 2 and self.agent_pos[1] > 0:
            # Left
            self.agent_pos[1] -= 1
        elif action == 3 and self.agent_pos[1] < self.size - 1:
            # Right
            self.agent_pos[1] += 1

        done = False

        if self.agent_pos == self.goal_pos:
            # Reaching the goal ends the episode with a positive reward
            reward = 10
            done = True
        elif self.agent_pos in self.traps:
            # Stepping onto a trap ends the episode with a penalty
            reward = -5
            done = True
        else:
            # Small per-step penalty encourages shorter paths
            reward = -1

        return self._get_obs(), reward, done, {}

    def _get_obs(self):
        # Flatten the 2D agent position into a single discrete state index
        return self.agent_pos[0] * self.size + self.agent_pos[1]

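# Illustrative walkthrough of the environment mechanics (comments only, not executed):
# starting from agent_pos = [0, 0], action 1 (Down) moves the agent to [1, 0];
# _get_obs() then returns 1 * 6 + 0 = 6, and the reward is -1 unless a trap happens
# to occupy [1, 0], in which case the reward is -5 and the episode ends.
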
# Neural network for the policy
class PolicyNet(nn.Module):
    def __init__(self, input_size, num_actions):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)       # Fully-connected layer 1 [input]
        self.fc2 = nn.Linear(64, num_actions)      # Fully-connected layer 2 [one Q-value per action]

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        return self.fc2(x)

# Helper function to convert a state index to a one-hot representation
def state_to_one_hot(state, num_states):
    one_hot = np.zeros(num_states)
    one_hot[state] = 1
    return torch.FloatTensor([one_hot]).to(device)

# Epsilon-greedy action selection
def select_action(network, state, epsilon, num_actions):
    if np.random.uniform(0, 1) < epsilon:
        # Explore: with probability epsilon, pick a random action
        return np.random.choice(num_actions)
    else:
        # Exploit: otherwise pick the action with the largest Q-value
        with torch.no_grad():
            q_values = network(state)
            return torch.argmax(q_values).item()  # e.g., Q = [100.0, 277.7, 69.1] selects index 1

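# Illustrative example (comments only, not executed): for the 6x6 grid, state index 8
# corresponds to cell (1, 2), and state_to_one_hot(8, 36) returns a 1x36 float tensor
# whose only nonzero entry is at index 8. With epsilon_start = 0.9, select_action
# initially explores at random roughly 90% of the time and acts greedily otherwise.
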
# Version 5
# Validate the df_goal_rows that appear in the visualization
# Compare the success rate based on goal visits with the success rate calculated during the training process

# Single-complexity meta-training process with success rate tracking
# The Intrinsic Reward Analysis is produced inside meta_train_fixed_complexity
def meta_train_fixed_complexity(meta_learning_rate, epsilon_start, epsilon_decay, num_iterations, num_inner_steps, eta=0.1, epsilon=1e-5):
    num_states = 6 * 6       # Fixed grid size of 6x6
    num_actions = 4          # Up, Down, Left, Right
    discount_factor = 0.99   # Gamma

    # Initialize policy network
    policy_net = PolicyNet(input_size=num_states, num_actions=num_actions).to(device)
    optimizer = optim.Adam(policy_net.parameters(), lr=meta_learning_rate)

    epsilon_greedy = epsilon_start
    meta_losses, meta_rewards, success_rates = [], [], []

    env = CustomGridEnv(size=6, num_traps=3)  # Fixed complexity level: grid 6x6 with 3 traps

    # Intrinsic Reward Analysis data capture is built from 3 quantities:
    # 1. state_visitation_counts
    # 2. intrinsic_reward (count-based exploration bonus; see the worked example below)
    # 3. total_reward (extrinsic + intrinsic reward for the current step)

    # 1. state_visitation_counts
    state_visitation_counts = np.zeros(num_states)

    # Initialize intrinsic analysis list
    intrinsic_analysis = []

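    # Worked example of the count-based bonus used below (illustrative, with the defaults
    # eta = 0.1 and epsilon = 1e-5):
    #   1st visit to a state:   N(s) = 1   -> bonus = 0.1 / sqrt(1 + 1e-5)   ~ 0.100
    #   25th visit to a state:  N(s) = 25  -> bonus = 0.1 / sqrt(25 + 1e-5)  ~ 0.020
    #   400th visit to a state: N(s) = 400 -> bonus = 0.1 / sqrt(400 + 1e-5) ~ 0.005
    # Frequently visited states therefore earn a progressively smaller exploration bonus.
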
    for iteration in range(num_iterations):
        print(f"Iteration {iteration + 1}/{num_iterations}")

        total_loss = 0
        iteration_reward = 0  # Accumulates extrinsic reward over the 10 tasks of this iteration
        successes = 0

        for task in range(10):  # Fixed number of tasks for each iteration
            state = env.reset()
            state = state_to_one_hot(state, num_states)
            optimizer.zero_grad()

            for step in range(num_inner_steps):
                action = select_action(policy_net, state, epsilon_greedy, num_actions)
                next_state, reward_ext, done, _ = env.step(action)
                next_state = state_to_one_hot(next_state, num_states)

                # 1. state_visitation_counts
                # Update state visitation count
                state_visitation_counts[state.argmax().item()] += 1

                # 2. intrinsic_reward
                # Count-based exploration bonus: eta / sqrt(N(s) + epsilon)
                intrinsic_reward = eta * (1 / np.sqrt(state_visitation_counts[state.argmax().item()] + epsilon))

                # 3. total_reward
                # Combined reward for this step (kept separate from iteration_reward so the
                # running per-iteration reward is not overwritten inside the step loop)
                total_reward = reward_ext + intrinsic_reward

                # Convert state index to 2D grid representation
                state_2d = (state.argmax().item() // 6, state.argmax().item() % 6)

                # Append intrinsic analysis data
                intrinsic_analysis.append({
                    'State_2D': state_2d,
                    'Intrinsic Reward': intrinsic_reward,
                    'Total Reward': total_reward,
                    'Extrinsic Reward': reward_ext
                })

                with torch.no_grad():
                    target = total_reward + discount_factor * torch.max(policy_net(next_state))

                prediction = policy_net(state)[0][action]
                loss = nn.functional.smooth_l1_loss(prediction, target)
                loss.backward()
                total_loss += loss.item()

                optimizer.step()
                state = next_state
                iteration_reward += reward_ext
                if done:
                    if reward_ext == 10:  # Success is defined as reaching the goal
                        successes += 1
                    break

        meta_losses.append(total_loss / 10)
        meta_rewards.append(iteration_reward / 10)
        success_rates.append(successes / 10)
        epsilon_greedy = max(0.1, epsilon_greedy * epsilon_decay)

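    # Note on the two success-rate measures (reasoning only, no extra computation):
    # success_rates above records successes / 10 per iteration during training, while
    # success_rate_goal_visits below counts CSV rows with Extrinsic Reward == 10 and divides
    # by num_iterations * 10. Because each task terminates the first time reward_ext == 10,
    # every success contributes exactly one goal row, so the two measures should agree.
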
    # Convert intrinsic analysis list to DataFrame and save to CSV
    df_intrinsic_analysis = pd.DataFrame(intrinsic_analysis)
    df_intrinsic_analysis.to_csv('intrinsic_analysis.csv', index=False)

    # Find all rows associated with the goal position (Extrinsic Reward == 10)
    df_goal_rows = df_intrinsic_analysis[df_intrinsic_analysis['Extrinsic Reward'] == 10]
    print(f"Shape of df_goal_rows: {df_goal_rows.shape}")
    print("Rows associated with the goal position:")
    print(df_goal_rows)

    # Filter for positions (4,5) and (5,4), the two cells from which the goal can be entered
    # (comparing a Series of tuples with == can raise a length-mismatch error, so use apply)
    df_goal_rows_4_5 = df_goal_rows[df_goal_rows['State_2D'].apply(lambda pos: pos == (4, 5))]
    df_goal_rows_5_4 = df_goal_rows[df_goal_rows['State_2D'].apply(lambda pos: pos == (5, 4))]
    print("Rows associated with position (4,5):")
    print(df_goal_rows_4_5)
    print("Rows associated with position (5,4):")
    print(df_goal_rows_5_4)

    # Filter for any other positions
    df_goal_rows_other = df_goal_rows[~df_goal_rows['State_2D'].isin([(4, 5), (5, 4)])]
    print("Rows associated with any other positions:")
    print(df_goal_rows_other)

    # Calculate success rate based on goal visits
    total_tasks = num_iterations * 10
    success_rate_goal_visits = len(df_goal_rows) / total_tasks * 100
    print(f"Success Rate based on Goal Visits: {success_rate_goal_visits:.2f}%")

    # Visualize state visitation counts
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 3, 1)
    plt.imshow(state_visitation_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('State Visitation Counts')
    plt.colorbar()

    # Visualize intrinsic rewards
    intrinsic_rewards = df_intrinsic_analysis['Intrinsic Reward'].values
    plt.subplot(1, 3, 2)
    plt.hist(intrinsic_rewards, bins=50, color='blue', alpha=0.7)
    plt.title('Intrinsic Reward Distribution')
    plt.xlabel('Intrinsic Reward')
    plt.ylabel('Frequency')

    # Visualize goal position visits
    goal_positions = df_goal_rows['State_2D'].apply(lambda x: x[0] * 6 + x[1]).values
    goal_counts = np.zeros(num_states)
    for pos in goal_positions:
        goal_counts[pos] += 1
    plt.subplot(1, 3, 3)
    plt.imshow(goal_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('Goal Position Visitation Counts')
    plt.colorbar()

    plt.tight_layout()
    plt.show()

    return meta_losses, meta_rewards, success_rates

# Plot function for meta-loss, average reward, and success rate with white background and markers
def plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates, window_size=10):
    smoothed_losses = moving_average(meta_losses, window_size)
    smoothed_rewards = moving_average(meta_rewards, window_size)
    smoothed_success_rates = moving_average(success_rates, window_size)

    # Create the figure and axes with white background
    fig, ax1 = plt.subplots(figsize=(14, 7), facecolor='white')

    # Set axes background color to white
    ax1.set_facecolor('white')

    color = 'tab:red'
    ax1.set_xlabel('Meta-Iteration')
    ax1.set_ylabel('Meta-Loss', color=color)
    ax1.plot(meta_losses, color=color, alpha=0.1, label='Meta-Loss', marker='o', markersize=5)
    ax1.plot(range(window_size - 1, len(meta_losses)), smoothed_losses, color=color, label=f'Smoothed Meta-Loss (window={window_size})', marker='o', markersize=3)
    ax1.tick_params(axis='y', labelcolor=color)

    # Twin axis for Average Reward
    ax2 = ax1.twinx()
    ax2.set_facecolor('white')  # Set the background color of the second axis to white
    color = 'tab:blue'
    ax2.set_ylabel('Average Reward', color=color)
    ax2.plot(meta_rewards, color=color, alpha=0.1, label='Average Reward', marker='s', markersize=5)
    ax2.plot(range(window_size - 1, len(meta_rewards)), smoothed_rewards, color=color, label=f'Smoothed Average Reward (window={window_size})', marker='s', markersize=3)
    ax2.tick_params(axis='y', labelcolor=color)

    # Third axis for Success Rate
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))
    ax3.set_facecolor('white')  # Set the background color of the third axis to white
    color = 'tab:green'
    ax3.set_ylabel('Success Rate', color=color)
    ax3.plot(success_rates, color=color, alpha=0.1, label='Success Rate', marker='^', markersize=5)
    ax3.plot(range(window_size - 1, len(success_rates)), smoothed_success_rates, color=color, label=f'Smoothed Success Rate (window={window_size})', marker='^', markersize=3)
    ax3.tick_params(axis='y', labelcolor=color)

    # Title and grid
    plt.title("Meta-Loss, Average Reward, and Success Rate Progress")
    fig.tight_layout()  # Adjust layout to prevent label clipping
    plt.grid(True)

    # Show the plot
    plt.show()

# Function to calculate moving average
def moving_average(data, window_size=30):
    return np.convolve(data, np.ones(window_size) / window_size, mode='valid')

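# Illustrative check of the moving average (comments only, not executed):
# np.convolve([1, 2, 3, 4], np.ones(2) / 2, mode='valid') -> [1.5, 2.5, 3.5],
# i.e. len(data) - window_size + 1 points, which is why the smoothed curves above are
# plotted against range(window_size - 1, len(data)).
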
# Simplified run with single complexity and success rate tracking
if __name__ == "__main__":
    meta_learning_rate = 1e-3
    epsilon_start = 0.9
    epsilon_decay = 0.99
    num_iterations = 500
    num_inner_steps = 50
    eta = 0.1
    epsilon = 1e-5

    meta_losses, meta_rewards, success_rates = meta_train_fixed_complexity(
        meta_learning_rate=meta_learning_rate,
        epsilon_start=epsilon_start,
        epsilon_decay=epsilon_decay,
        num_iterations=num_iterations,
        num_inner_steps=num_inner_steps,
        eta=eta,
        epsilon=epsilon
    )

    # Plot results
    plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates)
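
    # Optional sanity check (a minimal sketch added here, assuming intrinsic_analysis.csv was
    # just written by meta_train_fixed_complexity above): recompute the goal-visit success rate
    # from the saved CSV and confirm it matches the value printed during training.
    df_check = pd.read_csv('intrinsic_analysis.csv')
    goal_visits = (df_check['Extrinsic Reward'] == 10).sum()
    print(f"Recomputed goal-visit success rate: {goal_visits / (num_iterations * 10) * 100:.2f}%")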