!pip install gym

# Import necessary libraries
import torch
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import numpy as np
import random
import warnings
import gym
from gym import spaces
import matplotlib.pyplot as plt

# Suppress possible warnings for cleaner output
warnings.filterwarnings("ignore")

# Set random seeds for reproducibility
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Define the device for computation (use CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Custom grid environment with fixed size and traps
class CustomGridEnv:
    def __init__(self, size=6, num_traps=3):
        # Fixed complexity: 6x6 grid with 3 traps
        self.size = size
        self.num_traps = num_traps
        self.observation_space = spaces.Discrete(self.size * self.size)
        self.action_space = spaces.Discrete(4)  # 4 actions: Up, Down, Left, Right
        self.reset()

    def reset(self):
        self.agent_pos = [0, 0]
        self.goal_pos = [self.size - 1, self.size - 1]
        self._generate_traps()
        return self._get_obs()

    def _generate_traps(self):
        # Sample trap cells, avoiding the start and goal positions
        self.traps = []
        while len(self.traps) < self.num_traps:
            trap = [np.random.randint(self.size), np.random.randint(self.size)]
            if trap != self.agent_pos and trap != self.goal_pos:
                self.traps.append(trap)

    def step(self, action):
        if action == 0 and self.agent_pos[0] > 0:  # Up
            self.agent_pos[0] -= 1
        elif action == 1 and self.agent_pos[0] < self.size - 1:  # Down
            self.agent_pos[0] += 1
        elif action == 2 and self.agent_pos[1] > 0:  # Left
            self.agent_pos[1] -= 1
        elif action == 3 and self.agent_pos[1] < self.size - 1:  # Right
            self.agent_pos[1] += 1

        done = False
        if self.agent_pos == self.goal_pos:
            reward = 10   # Reaching the goal ends the episode with a large positive reward
            done = True
        elif self.agent_pos in self.traps:
            reward = -5   # Stepping on a trap ends the episode with a penalty
            done = True
        else:
            reward = -1   # Small step penalty encourages shorter paths

        return self._get_obs(), reward, done, {}

    def _get_obs(self):
        # Encode the (row, col) position as a single discrete state index
        return self.agent_pos[0] * self.size + self.agent_pos[1]


# Neural network for the policy (outputs one Q-value per action)
class PolicyNet(nn.Module):
    def __init__(self, input_size, num_actions):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)   # Fully-connected layer 1 [input -> hidden]
        self.fc2 = nn.Linear(64, num_actions)  # Fully-connected layer 2 [hidden -> per-action values]

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        return self.fc2(x)


# Helper function to convert a state index to a one-hot representation
def state_to_one_hot(state, num_states):
    one_hot = np.zeros(num_states)
    one_hot[state] = 1
    return torch.tensor(one_hot, dtype=torch.float32, device=device).unsqueeze(0)


# Epsilon-greedy action selection
def select_action(network, state, epsilon, num_actions):
    if np.random.uniform(0, 1) < epsilon:
        # Explore: with probability epsilon, pick a random action
        return np.random.choice(num_actions)
    else:
        # Exploit: otherwise pick the action with the largest Q-value
        with torch.no_grad():
            q_values = network(state)
        # e.g., q_values [100.0, 277.7, 69.1, ...] -> argmax selects index 1
        return torch.argmax(q_values).item()


# Version 5
# - Validate the df_goal_rows that appear in the visualization.
# - Compare the success rate based on goal visits with the success rate tracked during training.
# Single-complexity meta-training process with success rate tracking.
# The intrinsic reward analysis is computed inside meta_train_fixed_complexity.
def meta_train_fixed_complexity(meta_learning_rate, epsilon_start, epsilon_decay, num_iterations, num_inner_steps,
                                eta=0.1, epsilon=1e-5):
    num_states = 6 * 6       # Fixed grid size of 6x6
    num_actions = 4          # Up, Down, Left, Right
    discount_factor = 0.99   # Gamma

    # Initialize policy network and optimizer
    policy_net = PolicyNet(input_size=num_states, num_actions=num_actions).to(device)
    optimizer = optim.Adam(policy_net.parameters(), lr=meta_learning_rate)
    epsilon_greedy = epsilon_start
    meta_losses, meta_rewards, success_rates = [], [], []
    env = CustomGridEnv(size=6, num_traps=3)  # Fixed complexity level: 6x6 grid with 3 traps

    # Intrinsic Reward Analysis data capture, encapsulated by 3 quantities:
    # 1. state_visitation_counts
    # 2. intrinsic_reward
    # 3. combined_reward (extrinsic + intrinsic)

    # 1. state_visitation_counts
    state_visitation_counts = np.zeros(num_states)

    # Per-step records for the intrinsic reward analysis
    intrinsic_analysis = []

    for iteration in range(num_iterations):
        print(f"Iteration {iteration + 1}/{num_iterations}")
        total_loss = 0
        total_reward = 0
        successes = 0

        for task in range(10):  # Fixed number of tasks for each iteration
            state = env.reset()
            state = state_to_one_hot(state, num_states)

            for step in range(num_inner_steps):
                action = select_action(policy_net, state, epsilon_greedy, num_actions)
                next_state, reward_ext, done, _ = env.step(action)
                next_state = state_to_one_hot(next_state, num_states)

                # 1. state_visitation_counts
                # Update the visitation count of the current state
                state_idx = state.argmax().item()
                state_visitation_counts[state_idx] += 1

                # 2. intrinsic_reward
                # Count-based exploration bonus: eta / sqrt(N(s) + epsilon)
                intrinsic_reward = eta * (1 / np.sqrt(state_visitation_counts[state_idx] + epsilon))

                # 3. combined_reward
                # Per-step combined reward (kept separate from the per-iteration accumulator total_reward)
                combined_reward = reward_ext + intrinsic_reward

                # Convert state index to 2D grid representation
                state_2d = (state_idx // 6, state_idx % 6)

                # Append intrinsic analysis data
                # (State_2D is the state the agent acted from, so goal-reaching rows
                #  record the cell adjacent to the goal)
                intrinsic_analysis.append({
                    'State_2D': state_2d,
                    'Intrinsic Reward': intrinsic_reward,
                    'Total Reward': combined_reward,
                    'Extrinsic Reward': reward_ext
                })

                # One-step TD target using the combined reward
                with torch.no_grad():
                    target = combined_reward + discount_factor * torch.max(policy_net(next_state))

                prediction = policy_net(state)[0][action]
                loss = nn.functional.smooth_l1_loss(prediction, target)

                optimizer.zero_grad()  # Clear gradients before each update
                loss.backward()
                total_loss += loss.item()
                optimizer.step()

                state = next_state
                total_reward += reward_ext

                if done:
                    if reward_ext == 10:  # Success is defined as reaching the goal
                        successes += 1
                    break

        meta_losses.append(total_loss / 10)
        meta_rewards.append(total_reward / 10)
        success_rates.append(successes / 10)
        epsilon_greedy = max(0.1, epsilon_greedy * epsilon_decay)

    # Convert intrinsic analysis list to DataFrame and save to CSV
    df_intrinsic_analysis = pd.DataFrame(intrinsic_analysis)
    df_intrinsic_analysis.to_csv('intrinsic_analysis.csv', index=False)

    # Find all rows where the step reached the goal (Extrinsic Reward == 10)
    df_goal_rows = df_intrinsic_analysis[df_intrinsic_analysis['Extrinsic Reward'] == 10]
    print(f"Shape of df_goal_rows: {df_goal_rows.shape}")
    print("Rows associated with the goal position:")
    print(df_goal_rows)

    # Filter for positions (4,5) and (5,4), the two cells adjacent to the goal
    df_goal_rows_4_5 = df_goal_rows[df_goal_rows['State_2D'] == (4, 5)]
    df_goal_rows_5_4 = df_goal_rows[df_goal_rows['State_2D'] == (5, 4)]
    print("Rows associated with position (4,5):")
    print(df_goal_rows_4_5)
    print("Rows associated with position (5,4):")
    print(df_goal_rows_5_4)

    # Filter for any other positions
    df_goal_rows_other = df_goal_rows[~df_goal_rows['State_2D'].isin([(4, 5), (5, 4)])]
    print("Rows associated with any other positions:")
    print(df_goal_rows_other)

    # Calculate success rate based on goal visits
    total_tasks = num_iterations * 10
    success_rate_goal_visits = len(df_goal_rows) / total_tasks * 100
    print(f"Success Rate based on Goal Visits: {success_rate_goal_visits:.2f}%")

    # Visualize state visitation counts
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 3, 1)
    plt.imshow(state_visitation_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('State Visitation Counts')
    plt.colorbar()

    # Visualize intrinsic rewards
    intrinsic_rewards = df_intrinsic_analysis['Intrinsic Reward'].values
    plt.subplot(1, 3, 2)
    plt.hist(intrinsic_rewards, bins=50, color='blue', alpha=0.7)
    plt.title('Intrinsic Reward Distribution')
    plt.xlabel('Intrinsic Reward')
    plt.ylabel('Frequency')

    # Visualize goal position visits
    goal_positions = df_goal_rows['State_2D'].apply(lambda x: x[0] * 6 + x[1]).values
    goal_counts = np.zeros(num_states)
    for pos in goal_positions:
        goal_counts[pos] += 1
    plt.subplot(1, 3, 3)
    plt.imshow(goal_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('Goal Position Visitation Counts')
    plt.colorbar()

    plt.tight_layout()
    plt.show()

    return meta_losses, meta_rewards, success_rates


# Plot function for meta-loss, average reward, and success rate with white background and markers
def plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates, window_size=10):
    smoothed_losses = moving_average(meta_losses, window_size)
    smoothed_rewards = moving_average(meta_rewards, window_size)
    smoothed_success_rates = moving_average(success_rates, window_size)

    # Create the figure and axes with white background
    fig, ax1 = plt.subplots(figsize=(14, 7), facecolor='white')

    # Set axes background color to white
    ax1.set_facecolor('white')

    color = 'tab:red'
    ax1.set_xlabel('Meta-Iteration')
    ax1.set_ylabel('Meta-Loss', color=color)
    ax1.plot(meta_losses, color=color, alpha=0.1, label='Meta-Loss', marker='o', markersize=5)
    ax1.plot(range(window_size - 1, len(meta_losses)), smoothed_losses, color=color,
             label=f'Smoothed Meta-Loss (window={window_size})', marker='o', markersize=3)
    ax1.tick_params(axis='y', labelcolor=color)

    # Twin x-axis for Average Reward
    ax2 = ax1.twinx()
    ax2.set_facecolor('white')  # Set the background color of the second axis to white
    color = 'tab:blue'
    ax2.set_ylabel('Average Reward', color=color)
    ax2.plot(meta_rewards, color=color, alpha=0.1, label='Average Reward', marker='s', markersize=5)
    ax2.plot(range(window_size - 1, len(meta_rewards)), smoothed_rewards, color=color,
             label=f'Smoothed Average Reward (window={window_size})', marker='s', markersize=3)
    ax2.tick_params(axis='y', labelcolor=color)

    # Third axis for Success Rate
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))
    ax3.set_facecolor('white')  # Set the background color of the third axis to white
    color = 'tab:green'
    ax3.set_ylabel('Success Rate', color=color)
    ax3.plot(success_rates, color=color, alpha=0.1, label='Success Rate', marker='^', markersize=5)
    ax3.plot(range(window_size - 1, len(success_rates)), smoothed_success_rates, color=color,
             label=f'Smoothed Success Rate (window={window_size})', marker='^', markersize=3)
    ax3.tick_params(axis='y', labelcolor=color)

    # Title and grid
    plt.title("Meta-Loss, Average Reward, and Success Rate Progress")
    fig.tight_layout()  # Adjust layout to prevent label clipping
    plt.grid(True)

    # Show the plot
    plt.show()


# Function to calculate moving average
def moving_average(data, window_size=30):
    return np.convolve(data, np.ones(window_size) / window_size, mode='valid')


# Simplified run with single complexity and success rate tracking
if __name__ == "__main__":
    meta_learning_rate = 1e-3
    epsilon_start = 0.9
    epsilon_decay = 0.99
    num_iterations = 500
    num_inner_steps = 50
    eta = 0.1       # Scale of the count-based intrinsic reward
    epsilon = 1e-5  # Small constant inside the square root for numerical stability

    meta_losses, meta_rewards, success_rates = meta_train_fixed_complexity(
        meta_learning_rate=meta_learning_rate,
        epsilon_start=epsilon_start,
        epsilon_decay=epsilon_decay,
        num_iterations=num_iterations,
        num_inner_steps=num_inner_steps,
        eta=eta,
        epsilon=epsilon
    )

    # Plot results
    plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates)
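
    # Optional sanity check (an illustrative sketch, not part of the original pipeline):
    # the count-based bonus eta / sqrt(N(s) + epsilon) used during training should
    # shrink as a state is revisited. The names example_counts and example_bonuses
    # are ad hoc, introduced only for this illustration.
    example_counts = np.arange(1, 6)
    example_bonuses = eta / np.sqrt(example_counts + epsilon)
    print("Intrinsic bonus for visit counts 1-5:", np.round(example_bonuses, 4))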