TroglodyteDerivations committed on
Commit
682f2b2
1 Parent(s): 5b42e60

Create algo2.txt

Files changed (1)
  1. algo2.txt +348 -0
algo2.txt ADDED
@@ -0,0 +1,348 @@
# Notebook cell: installs gym (run `pip install gym` from a terminal if using this as a plain script)
!pip install gym

# Import necessary libraries
import torch
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import numpy as np
import random
import warnings
import gym
from gym import spaces
import matplotlib.pyplot as plt

# Suppress possible warnings for cleaner output
warnings.filterwarnings("ignore")

# Set random seeds for reproducibility
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Define the device for computation (use CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Custom grid environment with fixed size and traps
class CustomGridEnv:
    def __init__(self, size=6, num_traps=3):
        # Fixed complexity: grid 6x6 with 3 traps
        self.size = size
        self.num_traps = num_traps
        self.observation_space = spaces.Discrete(self.size * self.size)
        self.action_space = spaces.Discrete(4)  # 4 actions: Up, Down, Left, Right
        self.reset()

    def reset(self):
        self.agent_pos = [0, 0]
        self.goal_pos = [self.size - 1, self.size - 1]
        self._generate_traps()
        return self._get_obs()

    def _generate_traps(self):
        self.traps = []
        while len(self.traps) < self.num_traps:
            trap = [np.random.randint(self.size), np.random.randint(self.size)]
            if trap != self.agent_pos and trap != self.goal_pos:
                self.traps.append(trap)

    def step(self, action):
        if action == 0 and self.agent_pos[0] > 0:
            # Up
            self.agent_pos[0] -= 1
        elif action == 1 and self.agent_pos[0] < self.size - 1:
            # Down
            self.agent_pos[0] += 1
        elif action == 2 and self.agent_pos[1] > 0:
            # Left
            self.agent_pos[1] -= 1
        elif action == 3 and self.agent_pos[1] < self.size - 1:
            # Right
            self.agent_pos[1] += 1

        done = False
        if self.agent_pos == self.goal_pos:
            reward = 10
            done = True
        elif self.agent_pos in self.traps:
            reward = -5
            done = True
        else:
            reward = -1

        return self._get_obs(), reward, done, {}

    def _get_obs(self):
        return self.agent_pos[0] * self.size + self.agent_pos[1]
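
# Worked example of the state encoding above (illustrative note, not executed):
# _get_obs() flattens (row, col) to row * size + col, so on the 6x6 grid the start
# [0, 0] maps to index 0, a cell at row 2, col 3 maps to 2 * 6 + 3 = 15, and the
# goal [5, 5] maps to 5 * 6 + 5 = 35, the last of the 36 discrete states.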

# Neural network for the policy
class PolicyNet(nn.Module):
    def __init__(self, input_size, num_actions):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)   # Fully-connected layer 1 [input features -> hidden]
        self.fc2 = nn.Linear(64, num_actions)  # Fully-connected layer 2 [hidden -> one Q-value per action]

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        return self.fc2(x)
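
# Size check for the network above (illustrative note): with the 6x6 grid the input is a
# 36-dimensional one-hot vector, so fc1 holds 36*64 + 64 = 2,368 parameters and fc2 holds
# 64*4 + 4 = 260, giving 2,628 trainable parameters in total, a deliberately tiny Q-network.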

# Helper function to convert a state index to a one-hot tensor of shape (1, num_states)
def state_to_one_hot(state, num_states):
    one_hot = np.zeros(num_states)
    one_hot[state] = 1
    return torch.FloatTensor(one_hot).unsqueeze(0).to(device)
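
# Minimal sanity check of the helper above (illustrative, safe to remove): the encoding of
# state 15 should be a (1, 36) tensor with a single 1 at index 15.
assert state_to_one_hot(15, 36).shape == (1, 36)
assert state_to_one_hot(15, 36)[0, 15].item() == 1.0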

# Epsilon-greedy action selection
def select_action(network, state, epsilon, num_actions):
    if np.random.uniform(0, 1) < epsilon:  # Explore: the uniform draw fell below epsilon
        return np.random.choice(num_actions)
    else:  # Exploit: otherwise pick the action with the largest predicted Q-value
        with torch.no_grad():
            q_values = network(state)
        return torch.argmax(q_values).item()  # e.g., q_values [100, 277.7123, 69.1234567] -> action index 1
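
# How the exploration rate plays out (illustrative note): the random branch samples uniformly
# over all 4 actions, greedy one included, so with epsilon = 0.2 each non-greedy action is
# chosen with probability 0.2 / 4 = 0.05 and the greedy action with 0.8 + 0.05 = 0.85.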

# Version 5
# Validate the df_goal_rows that appear in the visualization
# Compare the success rate based on goal visits with the success rate calculated during the training process

# Single-complexity meta-training process with success rate tracking
# The Intrinsic Reward Analysis is computed inside meta_train_fixed_complexity
def meta_train_fixed_complexity(meta_learning_rate, epsilon_start, epsilon_decay, num_iterations, num_inner_steps, eta=0.1, epsilon=1e-5):
    # Note: `epsilon` here is the small stability constant in the count-based bonus,
    # not the exploration rate (that is `epsilon_greedy` below).
    num_states = 6 * 6       # Fixed grid size of 6x6
    num_actions = 4          # Up, Down, Left, Right
    discount_factor = 0.99   # Gamma

    # Initialize policy network
    policy_net = PolicyNet(input_size=num_states, num_actions=num_actions).to(device)
    optimizer = optim.Adam(policy_net.parameters(), lr=meta_learning_rate)

    epsilon_greedy = epsilon_start
    meta_losses, meta_rewards, success_rates = [], [], []

    env = CustomGridEnv(size=6, num_traps=3)  # Fixed complexity level: grid 6x6 with 3 traps

    # Intrinsic Reward Analysis data capture encapsulated by 3 variables:
    # 1. state_visitation_counts
    # 2. intrinsic_reward
    # 3. combined_reward (extrinsic + intrinsic, used for the TD target)

    # 1. state_visitation_counts
    state_visitation_counts = np.zeros(num_states)

    # Initialize intrinsic analysis list
    intrinsic_analysis = []

    for iteration in range(num_iterations):
        print(f"Iteration {iteration + 1}/{num_iterations}")

        total_loss = 0
        total_reward = 0
        successes = 0

        for task in range(10):  # Fixed number of tasks for each iteration
            state = env.reset()
            state = state_to_one_hot(state, num_states)

            for step in range(num_inner_steps):
                action = select_action(policy_net, state, epsilon_greedy, num_actions)
                next_state, reward_ext, done, _ = env.step(action)
                next_state = state_to_one_hot(next_state, num_states)

                # 1. state_visitation_counts
                # Update state visitation count
                state_visitation_counts[state.argmax().item()] += 1

                # 2. intrinsic_reward
                # Count-based exploration bonus for the current state
                intrinsic_reward = eta * (1 / np.sqrt(state_visitation_counts[state.argmax().item()] + epsilon))
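                # Worked values for the bonus above (illustrative, assuming the default eta=0.1
                # and ignoring the tiny epsilon=1e-5): the 1st visit to a state earns about
                # 0.1/sqrt(1) = 0.100, the 4th visit 0.1/sqrt(4) = 0.050, and the 100th visit
                # 0.1/sqrt(100) = 0.010, so the bonus fades as states become familiar.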

                # 3. combined_reward
                # Per-step reward for the TD target: extrinsic + intrinsic. Kept separate from
                # total_reward, which accumulates extrinsic reward over the whole iteration.
                combined_reward = reward_ext + intrinsic_reward

                # Convert state index to 2D grid representation
                state_2d = (state.argmax().item() // 6, state.argmax().item() % 6)

                # Append intrinsic analysis data
                intrinsic_analysis.append({
                    'State_2D': state_2d,
                    'Intrinsic Reward': intrinsic_reward,
                    'Total Reward': combined_reward,
                    'Extrinsic Reward': reward_ext
                })

                # TD target (note: this bootstraps from next_state even on terminal steps;
                # a stricter Q-learning target would skip the bootstrap when done is True)
                with torch.no_grad():
                    target = combined_reward + discount_factor * torch.max(policy_net(next_state))

                optimizer.zero_grad()  # Reset gradients before each per-step update
                prediction = policy_net(state)[0][action]
                loss = nn.functional.smooth_l1_loss(prediction, target)
                loss.backward()
                total_loss += loss.item()

                optimizer.step()
                state = next_state
                total_reward += reward_ext
                if done:
                    if reward_ext == 10:  # Success is defined as reaching the goal
                        successes += 1
                    break

        meta_losses.append(total_loss / 10)
        meta_rewards.append(total_reward / 10)
        success_rates.append(successes / 10)
        epsilon_greedy = max(0.1, epsilon_greedy * epsilon_decay)

    # Convert intrinsic analysis list to DataFrame and save to CSV
    df_intrinsic_analysis = pd.DataFrame(intrinsic_analysis)
    df_intrinsic_analysis.to_csv('intrinsic_analysis.csv', index=False)

    # Find all rows associated with reaching the goal (Extrinsic Reward == 10)
    df_goal_rows = df_intrinsic_analysis[df_intrinsic_analysis['Extrinsic Reward'] == 10]
    print(f"Shape of df_goal_rows: {df_goal_rows.shape}")
    print("Rows associated with the goal position:")
    print(df_goal_rows)

    # Filter for positions (4,5) and (5,4), the only cells from which the goal can be entered
    df_goal_rows_4_5 = df_goal_rows[df_goal_rows['State_2D'].isin([(4, 5)])]
    df_goal_rows_5_4 = df_goal_rows[df_goal_rows['State_2D'].isin([(5, 4)])]
    print("Rows associated with position (4,5):")
    print(df_goal_rows_4_5)
    print("Rows associated with position (5,4):")
    print(df_goal_rows_5_4)

    # Filter for any other positions (should be empty if the goal rows are valid)
    df_goal_rows_other = df_goal_rows[~df_goal_rows['State_2D'].isin([(4, 5), (5, 4)])]
    print("Rows associated with any other positions:")
    print(df_goal_rows_other)

    # Calculate success rate based on goal visits
    total_tasks = num_iterations * 10
    success_rate_goal_visits = len(df_goal_rows) / total_tasks * 100
    print(f"Success Rate based on Goal Visits: {success_rate_goal_visits:.2f}%")

    # Visualize state visitation counts
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 3, 1)
    plt.imshow(state_visitation_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('State Visitation Counts')
    plt.colorbar()

    # Visualize intrinsic rewards
    intrinsic_rewards = df_intrinsic_analysis['Intrinsic Reward'].values
    plt.subplot(1, 3, 2)
    plt.hist(intrinsic_rewards, bins=50, color='blue', alpha=0.7)
    plt.title('Intrinsic Reward Distribution')
    plt.xlabel('Intrinsic Reward')
    plt.ylabel('Frequency')

    # Visualize the cells from which the goal was reached (State_2D records the pre-step state)
    goal_positions = df_goal_rows['State_2D'].apply(lambda x: x[0] * 6 + x[1]).values
    goal_counts = np.zeros(num_states)
    for pos in goal_positions:
        goal_counts[pos] += 1
    plt.subplot(1, 3, 3)
    plt.imshow(goal_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('Goal Position Visitation Counts')
    plt.colorbar()

    plt.tight_layout()
    plt.show()

    return meta_losses, meta_rewards, success_rates
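
# Validation note for the "Version 5" check above (illustrative): every successful task appends
# exactly one row with Extrinsic Reward == 10 before the episode breaks, so len(df_goal_rows)
# equals the total number of successes. The goal-visit success rate printed inside the function
# should therefore match np.mean(success_rates) * 100 from the per-iteration tracking exactly.
# State_2D in those rows is the cell the agent acted from, which on this grid can only be
# (4, 5) or (5, 4).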

# Plot function for meta-loss, average reward, and success rate with white background and markers
def plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates, window_size=10):
    smoothed_losses = moving_average(meta_losses, window_size)
    smoothed_rewards = moving_average(meta_rewards, window_size)
    smoothed_success_rates = moving_average(success_rates, window_size)

    # Create the figure and axes with white background
    fig, ax1 = plt.subplots(figsize=(14, 7), facecolor='white')

    # Set axes background color to white
    ax1.set_facecolor('white')

    color = 'tab:red'
    ax1.set_xlabel('Meta-Iteration')
    ax1.set_ylabel('Meta-Loss', color=color)
    ax1.plot(meta_losses, color=color, alpha=0.1, label='Meta-Loss', marker='o', markersize=5)
    ax1.plot(range(window_size - 1, len(meta_losses)), smoothed_losses, color=color, label=f'Smoothed Meta-Loss (window={window_size})', marker='o', markersize=3)
    ax1.tick_params(axis='y', labelcolor=color)

    # Twin axis (shared x) for Average Reward
    ax2 = ax1.twinx()
    ax2.set_facecolor('white')  # Set the background color of the second axis to white
    color = 'tab:blue'
    ax2.set_ylabel('Average Reward', color=color)
    ax2.plot(meta_rewards, color=color, alpha=0.1, label='Average Reward', marker='s', markersize=5)
    ax2.plot(range(window_size - 1, len(meta_rewards)), smoothed_rewards, color=color, label=f'Smoothed Average Reward (window={window_size})', marker='s', markersize=3)
    ax2.tick_params(axis='y', labelcolor=color)

    # Third axis for Success Rate, offset to the right
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))
    ax3.set_facecolor('white')  # Set the background color of the third axis to white
    color = 'tab:green'
    ax3.set_ylabel('Success Rate', color=color)
    ax3.plot(success_rates, color=color, alpha=0.1, label='Success Rate', marker='^', markersize=5)
    ax3.plot(range(window_size - 1, len(success_rates)), smoothed_success_rates, color=color, label=f'Smoothed Success Rate (window={window_size})', marker='^', markersize=3)
    ax3.tick_params(axis='y', labelcolor=color)

    # Title and grid
    plt.title("Meta-Loss, Average Reward, and Success Rate Progress")
    fig.tight_layout()  # Adjust layout to prevent label clipping
    plt.grid(True)

    # Show the plot
    plt.show()

# Function to calculate moving average
def moving_average(data, window_size=30):
    return np.convolve(data, np.ones(window_size) / window_size, mode='valid')
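
# Tiny self-check of the smoothing helper (illustrative, safe to remove): a window of 2 over
# [1, 2, 3, 4] yields [1.5, 2.5, 3.5], i.e. len(data) - window_size + 1 points, which is why
# the plots above start the smoothed curves at index window_size - 1.
assert np.allclose(moving_average([1, 2, 3, 4], window_size=2), [1.5, 2.5, 3.5])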

# Simplified run with single complexity and success rate tracking
if __name__ == "__main__":
    meta_learning_rate = 1e-3
    epsilon_start = 0.9
    epsilon_decay = 0.99
    num_iterations = 500
    num_inner_steps = 50
    eta = 0.1
    epsilon = 1e-5

    meta_losses, meta_rewards, success_rates = meta_train_fixed_complexity(
        meta_learning_rate=meta_learning_rate,
        epsilon_start=epsilon_start,
        epsilon_decay=epsilon_decay,
        num_iterations=num_iterations,
        num_inner_steps=num_inner_steps,
        eta=eta,
        epsilon=epsilon
    )

    # Plot results
    plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates)
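
# Back-of-the-envelope note on the exploration schedule (illustrative, assuming the defaults
# above: epsilon_start=0.9, epsilon_decay=0.99, floor 0.1): 0.9 * 0.99**n falls below 0.1 after
# about ln(9) / ln(1/0.99) ≈ 219 decays, so roughly the last 280 of the 500 meta-iterations
# run at the fixed 10% exploration floor.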