!pip install gym

# Import necessary libraries
import torch
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import numpy as np
import random
import warnings
import gym
from gym import spaces
import matplotlib.pyplot as plt

# Suppress possible warnings for cleaner output
warnings.filterwarnings("ignore")

# Set random seeds for reproducibility
def set_seed(seed):
  np.random.seed(seed)
  torch.manual_seed(seed)
  random.seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

# Define the device for computation (use CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Custom grid environment with fixed size and traps
class CustomGridEnv:
  def __init__(self, size=6, num_traps=3):
    # Fixed complexity: grid 6x6 with 3 traps

    self.size = size
    self.num_traps = num_traps
    self.observation_space = spaces.Discrete(self.size * self.size)
    self.action_space = spaces.Discrete(4) # 4 actions: Up, Down, Left, Right
    self.reset()

  def reset(self):
    self.agent_pos = [0, 0]
    self.goal_pos = [self.size - 1, self.size - 1]
    self._generate_traps()
    return self._get_obs()

  def _generate_traps(self):
    # Sample num_traps distinct trap cells, avoiding the start and goal positions
    self.traps = []
    while len(self.traps) < self.num_traps:
      trap = [np.random.randint(self.size), np.random.randint(self.size)]
      if trap != self.agent_pos and trap != self.goal_pos and trap not in self.traps:
        self.traps.append(trap)

  def step(self, action):
    # Apply the action, keeping the agent inside the grid bounds
    if action == 0 and self.agent_pos[0] > 0:                  # Up
      self.agent_pos[0] -= 1
    elif action == 1 and self.agent_pos[0] < self.size - 1:    # Down
      self.agent_pos[0] += 1
    elif action == 2 and self.agent_pos[1] > 0:                # Left
      self.agent_pos[1] -= 1
    elif action == 3 and self.agent_pos[1] < self.size - 1:    # Right
      self.agent_pos[1] += 1

    # Reward structure: +10 for reaching the goal, -5 for stepping on a trap, -1 per move otherwise
    done = False
    if self.agent_pos == self.goal_pos:
      reward = 10
      done = True
    elif self.agent_pos in self.traps:
      reward = -5
      done = True
    else:
      reward = -1

    return self._get_obs(), reward, done, {}

  def _get_obs(self):
    return self.agent_pos[0] * self.size + self.agent_pos[1]
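
# The observation returned by the environment is the flattened grid index, obs = row * size + col.
# For example, an agent at row 2, column 3 of the 6x6 grid maps to observation 2 * 6 + 3 = 15.
#
# Optional sanity check (illustrative helper, not part of the training pipeline below): roll out
# one episode with uniformly random actions to confirm the environment terminates and that
# observations stay inside [0, size * size).
def random_rollout_demo(max_steps=100):
  demo_env = CustomGridEnv(size=6, num_traps=3)
  obs = demo_env.reset()
  reward, done = 0, False
  for _ in range(max_steps):
    obs, reward, done, _ = demo_env.step(demo_env.action_space.sample())
    assert 0 <= obs < demo_env.size * demo_env.size
    if done:
      break
  return obs, reward, done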

# Neural network for the policy
class PolicyNet(nn.Module):
  def __init__(self, input_size, num_actions):
    super(PolicyNet, self).__init__()
    self.fc1 = nn.Linear(input_size, 64)      # Hidden layer: one-hot state -> 64 features
    self.fc2 = nn.Linear(64, num_actions)     # Output layer: one Q-value per action

  def forward(self, state):
    x = torch.relu(self.fc1(state))
    return self.fc2(x)
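
# The network maps a one-hot state of shape (1, num_states) to one Q-value per action. It is
# trained below with single-step TD updates; note that no target network or replay buffer is
# used, so this is a simplified online variant of DQN-style Q-learning.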

# Helper function to convert a discrete state index into a one-hot tensor of shape (1, num_states)
def state_to_one_hot(state, num_states):
  one_hot = np.zeros(num_states, dtype=np.float32)
  one_hot[state] = 1
  return torch.from_numpy(one_hot).unsqueeze(0).to(device)


# Epsilon-greedy action selection
def select_action(network, state, epsilon, num_actions):
  if np.random.uniform(0, 1) < epsilon:
    # Explore: with probability epsilon, pick a uniformly random action
    return np.random.choice(num_actions)
  else:
    # Exploit: otherwise pick the action with the largest predicted Q-value
    with torch.no_grad():
      q_values = network(state)
      return torch.argmax(q_values).item()  # e.g., Q-values [100.0, 277.7, 69.1, 3.2] -> action 1

# Version 5
# Validate the df_goal_rows shown in the visualization and compare the success rate derived
# from goal visits with the success rate tracked during training.


# Single-complexity meta-training loop with success-rate tracking.
# The intrinsic reward analysis is carried out inside meta_train_fixed_complexity.
# Note: `eta` scales the count-based intrinsic reward and `epsilon` here is a small numerical
# constant for that bonus, distinct from the epsilon-greedy exploration rate (epsilon_start).
def meta_train_fixed_complexity(meta_learning_rate, epsilon_start, epsilon_decay, num_iterations, num_inner_steps, eta=0.1, epsilon=1e-5):
    num_states = 6 * 6  # Fixed grid size of 6x6
    num_actions = 4  # Up, Down, Left, Right
    discount_factor = 0.99  # Gamma

    # Initialize policy network
    policy_net = PolicyNet(input_size=num_states, num_actions=num_actions).to(device)
    optimizer = optim.Adam(policy_net.parameters(), lr=meta_learning_rate)

    epsilon_greedy = epsilon_start
    meta_losses, meta_rewards, success_rates = [], [], []

    env = CustomGridEnv(size=6, num_traps=3)  # Fixed complexity level: grid 6x6 with 3 traps

    # Intrinsic reward analysis captures three quantities:
    #   1. state_visitation_counts - how often each state has been visited
    #   2. intrinsic_reward        - a count-based exploration bonus per step
    #   3. step_reward             - the combined (extrinsic + intrinsic) reward per step

    # State visitation counts, one counter per grid cell
    state_visitation_counts = np.zeros(num_states)

    # Per-step records for the intrinsic reward analysis
    intrinsic_analysis = []
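    # As an illustration: with eta = 0.1, the first visit to a state earns a bonus of roughly
    # 0.1 / sqrt(1) = 0.1, while the hundredth visit earns roughly 0.1 / sqrt(100) = 0.01,
    # so the bonus steers the agent toward rarely visited cells early in training.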

    for iteration in range(num_iterations):
        print(f"Iteration {iteration + 1}/{num_iterations}")

        total_loss = 0
        total_reward = 0
        successes = 0

        for task in range(10):  # Fixed number of tasks for each iteration
            state = env.reset()
            state = state_to_one_hot(state, num_states)
            optimizer.zero_grad()

            for step in range(num_inner_steps):
                action = select_action(policy_net, state, epsilon_greedy, num_actions)
                next_state, reward_ext, done, _ = env.step(action)
                next_state = state_to_one_hot(next_state, num_states)

                # Update the visitation count for the current state
                state_idx = state.argmax().item()
                state_visitation_counts[state_idx] += 1

                # Count-based intrinsic reward: eta / sqrt(N(s) + epsilon), which decays as the
                # state is revisited
                intrinsic_reward = eta * (1 / np.sqrt(state_visitation_counts[state_idx] + epsilon))

                # Per-step reward used for the TD target: extrinsic + intrinsic. This is kept
                # separate from total_reward, which accumulates reward across the whole iteration.
                step_reward = reward_ext + intrinsic_reward

                # Convert the flat state index to 2D grid coordinates (row, col)
                state_2d = (state_idx // 6, state_idx % 6)

                # Record the intrinsic analysis data for this step
                intrinsic_analysis.append({
                    'State_2D': state_2d,
                    'Intrinsic Reward': intrinsic_reward,
                    'Total Reward': step_reward,
                    'Extrinsic Reward': reward_ext
                })

                # One-step TD target built from the combined reward
                with torch.no_grad():
                    target = step_reward + discount_factor * torch.max(policy_net(next_state))

                prediction = policy_net(state)[0][action]
                loss = nn.functional.smooth_l1_loss(prediction, target)
                optimizer.zero_grad()  # Clear gradients for this single-step update
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

                state = next_state
                total_reward += reward_ext  # Accumulate extrinsic reward across steps and tasks
                if done:
                    if reward_ext == 10:  # Success is defined as reaching the goal
                        successes += 1
                    break

        meta_losses.append(total_loss / 10)
        meta_rewards.append(total_reward / 10)
        success_rates.append(successes / 10)
        epsilon_greedy = max(0.1, epsilon_greedy * epsilon_decay)
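        # With epsilon_start = 0.9 and epsilon_decay = 0.99 (the defaults used in __main__ below),
        # the exploration rate reaches the 0.1 floor after roughly 220 meta-iterations,
        # since 0.9 * 0.99**n <= 0.1 once n >= ~219.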

    # Convert intrinsic analysis list to DataFrame and save to CSV
    df_intrinsic_analysis = pd.DataFrame(intrinsic_analysis)
    df_intrinsic_analysis.to_csv('intrinsic_analysis.csv', index=False)

    # Rows logged on the step that reached the goal (Extrinsic Reward == 10). State_2D is the
    # state the agent moved from, so these rows should all lie next to the goal: (4, 5) or (5, 4).
    df_goal_rows = df_intrinsic_analysis[df_intrinsic_analysis['Extrinsic Reward'] == 10]
    print(f"Shape of df_goal_rows: {df_goal_rows.shape}")
    print("Rows associated with the goal position:")
    print(df_goal_rows)

    # Filter for positions (4,5) and (5,4)
    df_goal_rows_4_5 = df_goal_rows[df_goal_rows['State_2D'] == (4, 5)]
    df_goal_rows_5_4 = df_goal_rows[df_goal_rows['State_2D'] == (5, 4)]
    print("Rows associated with position (4,5):")
    print(df_goal_rows_4_5)
    print("Rows associated with position (5,4):")
    print(df_goal_rows_5_4)

    # Filter for any other positions
    df_goal_rows_other = df_goal_rows[~df_goal_rows['State_2D'].isin([(4, 5), (5, 4)])]
    print("Rows associated with any other positions:")
    print(df_goal_rows_other)

    # Calculate success rate based on goal visits
    total_tasks = num_iterations * 10
    success_rate_goal_visits = len(df_goal_rows) / total_tasks * 100
    print(f"Success Rate based on Goal Visits: {success_rate_goal_visits:.2f}%")

    # Visualize state visitation counts
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 3, 1)
    plt.imshow(state_visitation_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('State Visitation Counts')
    plt.colorbar()

    # Visualize intrinsic rewards
    intrinsic_rewards = df_intrinsic_analysis['Intrinsic Reward'].values
    plt.subplot(1, 3, 2)
    plt.hist(intrinsic_rewards, bins=50, color='blue', alpha=0.7)
    plt.title('Intrinsic Reward Distribution')
    plt.xlabel('Intrinsic Reward')
    plt.ylabel('Frequency')

    # Visualize the states from which the goal was reached (the cell occupied just before entering the goal)
    goal_positions = df_goal_rows['State_2D'].apply(lambda x: x[0] * 6 + x[1]).values
    goal_counts = np.zeros(num_states)
    for pos in goal_positions:
        goal_counts[pos] += 1
    plt.subplot(1, 3, 3)
    plt.imshow(goal_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('States Visited Just Before Reaching the Goal')
    plt.colorbar()

    plt.tight_layout()
    plt.show()

    return meta_losses, meta_rewards, success_rates

# Plot function for meta-loss, average reward, and success rate with white background and markers
def plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates, window_size=10):
    smoothed_losses = moving_average(meta_losses, window_size)
    smoothed_rewards = moving_average(meta_rewards, window_size)
    smoothed_success_rates = moving_average(success_rates, window_size)

    # Create the figure and axes with white background
    fig, ax1 = plt.subplots(figsize=(14, 7), facecolor='white')

    # Set axes background color to white
    ax1.set_facecolor('white')

    color = 'tab:red'
    ax1.set_xlabel('Meta-Iteration')
    ax1.set_ylabel('Meta-Loss', color=color)
    ax1.plot(meta_losses, color=color, alpha=0.1, label='Meta-Loss', marker='o', markersize=5)
    ax1.plot(range(window_size - 1, len(meta_losses)), smoothed_losses, color=color, label=f'Smoothed Meta-Loss (window={window_size})', marker='o', markersize=3)
    ax1.tick_params(axis='y', labelcolor=color)

    # Twin x-axis for Average Reward
    ax2 = ax1.twinx()
    ax2.set_facecolor('white')  # Set the background color of the second axis to white
    color = 'tab:blue'
    ax2.set_ylabel('Average Reward', color=color)
    ax2.plot(meta_rewards, color=color, alpha=0.1, label='Average Reward', marker='s', markersize=5)
    ax2.plot(range(window_size - 1, len(meta_rewards)), smoothed_rewards, color=color, label=f'Smoothed Average Reward (window={window_size})', marker='s', markersize=3)
    ax2.tick_params(axis='y', labelcolor=color)

    # Third axis for Success Rate
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))
    ax3.set_facecolor('white')  # Set the background color of the third axis to white
    color = 'tab:green'
    ax3.set_ylabel('Success Rate', color=color)
    ax3.plot(success_rates, color=color, alpha=0.1, label='Success Rate', marker='^', markersize=5)
    ax3.plot(range(window_size - 1, len(success_rates)), smoothed_success_rates, color=color, label=f'Smoothed Success Rate (window={window_size})', marker='^', markersize=3)
    ax3.tick_params(axis='y', labelcolor=color)

    # Title and grid
    plt.title("Meta-Loss, Average Reward, and Success Rate Progress")
    fig.tight_layout()  # Adjust layout to prevent label clipping
    plt.grid(True)

    # Show the plot
    plt.show()

# Function to calculate moving average
def moving_average(data, window_size=30):
    return np.convolve(data, np.ones(window_size) / window_size, mode='valid')
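
# For example, moving_average([1, 2, 3, 4, 5], window_size=2) returns [1.5, 2.5, 3.5, 4.5]; the
# smoothed series is window_size - 1 elements shorter, which is why the plots above offset the
# x-range by window_size - 1.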

# Simplified run with single complexity and success rate tracking
if __name__ == "__main__":
    meta_learning_rate = 1e-3
    epsilon_start = 0.9
    epsilon_decay = 0.99
    num_iterations = 500
    num_inner_steps = 50
    eta = 0.1
    epsilon = 1e-5

    meta_losses, meta_rewards, success_rates = meta_train_fixed_complexity(
        meta_learning_rate=meta_learning_rate,
        epsilon_start=epsilon_start,
        epsilon_decay=epsilon_decay,
        num_iterations=num_iterations,
        num_inner_steps=num_inner_steps,
        eta=eta,
        epsilon=epsilon
    )

    # Plot results
    plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates)