TroglodyteDerivations
committed on
Commit 682f2b2
1 Parent(s): 5b42e60
Create algo2.txt
algo2.txt
ADDED
@@ -0,0 +1,348 @@
!pip install gym

# Import necessary libraries
import torch
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import numpy as np
import random
import warnings
import gym
from gym import spaces
import matplotlib.pyplot as plt

# Suppress possible warnings for cleaner output
warnings.filterwarnings("ignore")

# Set random seeds for reproducibility
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Define the device for computation (use CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Custom grid environment with fixed size and traps
class CustomGridEnv:
    def __init__(self, size=6, num_traps=3):
        # Fixed complexity: grid 6x6 with 3 traps
        self.size = size
        self.num_traps = num_traps
        self.observation_space = spaces.Discrete(self.size * self.size)
        self.action_space = spaces.Discrete(4)  # 4 actions: Up, Down, Left, Right
        self.reset()

    def reset(self):
        self.agent_pos = [0, 0]
        self.goal_pos = [self.size - 1, self.size - 1]
        self._generate_traps()
        return self._get_obs()

    def _generate_traps(self):
        self.traps = []
        while len(self.traps) < self.num_traps:
            trap = [np.random.randint(self.size), np.random.randint(self.size)]
            if trap != self.agent_pos and trap != self.goal_pos:
                self.traps.append(trap)

    def step(self, action):
        if action == 0 and self.agent_pos[0] > 0:
            # Up
            self.agent_pos[0] -= 1
        elif action == 1 and self.agent_pos[0] < self.size - 1:
            # Down
            self.agent_pos[0] += 1
        elif action == 2 and self.agent_pos[1] > 0:
            # Left
            self.agent_pos[1] -= 1
        elif action == 3 and self.agent_pos[1] < self.size - 1:
            # Right
            self.agent_pos[1] += 1

        done = False

        if self.agent_pos == self.goal_pos:
            # Reaching the goal ends the episode with a positive reward
            reward = 10
            done = True
        elif self.agent_pos in self.traps:
            # Stepping onto a trap ends the episode with a penalty
            reward = -5
            done = True
        else:
            # Small per-step penalty encourages shorter paths
            reward = -1

        return self._get_obs(), reward, done, {}

    def _get_obs(self):
        # Flatten the 2D agent position into a single discrete state index
        return self.agent_pos[0] * self.size + self.agent_pos[1]

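# Illustrative walkthrough of the environment mechanics (comments only, not executed):
# starting from agent_pos = [0, 0], action 1 (Down) moves the agent to [1, 0];
# _get_obs() then returns 1 * 6 + 0 = 6, and the reward is -1 unless a trap happens
# to occupy [1, 0], in which case the reward is -5 and the episode ends.
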
# Neural network for the policy
class PolicyNet(nn.Module):
    def __init__(self, input_size, num_actions):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)       # Fully-connected layer 1 [input]
        self.fc2 = nn.Linear(64, num_actions)      # Fully-connected layer 2 [one Q-value per action]

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        return self.fc2(x)

# Helper function to convert a state index to a one-hot representation
def state_to_one_hot(state, num_states):
    one_hot = np.zeros(num_states)
    one_hot[state] = 1
    return torch.FloatTensor([one_hot]).to(device)

# Epsilon-greedy action selection
def select_action(network, state, epsilon, num_actions):
    if np.random.uniform(0, 1) < epsilon:
        # Explore: with probability epsilon, pick a random action
        return np.random.choice(num_actions)
    else:
        # Exploit: otherwise pick the action with the largest Q-value
        with torch.no_grad():
            q_values = network(state)
            return torch.argmax(q_values).item()  # e.g., Q = [100.0, 277.7, 69.1] selects index 1

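# Illustrative example (comments only, not executed): for the 6x6 grid, state index 8
# corresponds to cell (1, 2), and state_to_one_hot(8, 36) returns a 1x36 float tensor
# whose only nonzero entry is at index 8. With epsilon_start = 0.9, select_action
# initially explores at random roughly 90% of the time and acts greedily otherwise.
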
# Version 5
# Validate the df_goal_rows that appear in the visualization
# Compare the success rate based on goal visits with the success rate calculated during the training process

# Single-complexity meta-training process with success rate tracking
# The Intrinsic Reward Analysis is produced inside meta_train_fixed_complexity
def meta_train_fixed_complexity(meta_learning_rate, epsilon_start, epsilon_decay, num_iterations, num_inner_steps, eta=0.1, epsilon=1e-5):
    num_states = 6 * 6       # Fixed grid size of 6x6
    num_actions = 4          # Up, Down, Left, Right
    discount_factor = 0.99   # Gamma

    # Initialize policy network
    policy_net = PolicyNet(input_size=num_states, num_actions=num_actions).to(device)
    optimizer = optim.Adam(policy_net.parameters(), lr=meta_learning_rate)

    epsilon_greedy = epsilon_start
    meta_losses, meta_rewards, success_rates = [], [], []

    env = CustomGridEnv(size=6, num_traps=3)  # Fixed complexity level: grid 6x6 with 3 traps

    # Intrinsic Reward Analysis data capture is built from 3 quantities:
    # 1. state_visitation_counts
    # 2. intrinsic_reward (count-based exploration bonus; see the worked example below)
    # 3. total_reward (extrinsic + intrinsic reward for the current step)

    # 1. state_visitation_counts
    state_visitation_counts = np.zeros(num_states)

    # Initialize intrinsic analysis list
    intrinsic_analysis = []

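    # Worked example of the count-based bonus used below (illustrative, with the defaults
    # eta = 0.1 and epsilon = 1e-5):
    #   1st visit to a state:   N(s) = 1   -> bonus = 0.1 / sqrt(1 + 1e-5)   ~ 0.100
    #   25th visit to a state:  N(s) = 25  -> bonus = 0.1 / sqrt(25 + 1e-5)  ~ 0.020
    #   400th visit to a state: N(s) = 400 -> bonus = 0.1 / sqrt(400 + 1e-5) ~ 0.005
    # Frequently visited states therefore earn a progressively smaller exploration bonus.
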
    for iteration in range(num_iterations):
        print(f"Iteration {iteration + 1}/{num_iterations}")

        total_loss = 0
        iteration_reward = 0  # Accumulates extrinsic reward over the 10 tasks of this iteration
        successes = 0

        for task in range(10):  # Fixed number of tasks for each iteration
            state = env.reset()
            state = state_to_one_hot(state, num_states)
            optimizer.zero_grad()

            for step in range(num_inner_steps):
                action = select_action(policy_net, state, epsilon_greedy, num_actions)
                next_state, reward_ext, done, _ = env.step(action)
                next_state = state_to_one_hot(next_state, num_states)

                # 1. state_visitation_counts
                # Update state visitation count
                state_visitation_counts[state.argmax().item()] += 1

                # 2. intrinsic_reward
                # Count-based exploration bonus: eta / sqrt(N(s) + epsilon)
                intrinsic_reward = eta * (1 / np.sqrt(state_visitation_counts[state.argmax().item()] + epsilon))

                # 3. total_reward
                # Combined reward for this step (kept separate from iteration_reward so the
                # running per-iteration reward is not overwritten inside the step loop)
                total_reward = reward_ext + intrinsic_reward

                # Convert state index to 2D grid representation
                state_2d = (state.argmax().item() // 6, state.argmax().item() % 6)

                # Append intrinsic analysis data
                intrinsic_analysis.append({
                    'State_2D': state_2d,
                    'Intrinsic Reward': intrinsic_reward,
                    'Total Reward': total_reward,
                    'Extrinsic Reward': reward_ext
                })

                with torch.no_grad():
                    target = total_reward + discount_factor * torch.max(policy_net(next_state))

                prediction = policy_net(state)[0][action]
                loss = nn.functional.smooth_l1_loss(prediction, target)
                loss.backward()
                total_loss += loss.item()

                optimizer.step()
                state = next_state
                iteration_reward += reward_ext
                if done:
                    if reward_ext == 10:  # Success is defined as reaching the goal
                        successes += 1
                    break

        meta_losses.append(total_loss / 10)
        meta_rewards.append(iteration_reward / 10)
        success_rates.append(successes / 10)
        epsilon_greedy = max(0.1, epsilon_greedy * epsilon_decay)

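    # Note on the two success-rate measures (reasoning only, no extra computation):
    # success_rates above records successes / 10 per iteration during training, while
    # success_rate_goal_visits below counts CSV rows with Extrinsic Reward == 10 and divides
    # by num_iterations * 10. Because each task terminates the first time reward_ext == 10,
    # every success contributes exactly one goal row, so the two measures should agree.
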
    # Convert intrinsic analysis list to DataFrame and save to CSV
    df_intrinsic_analysis = pd.DataFrame(intrinsic_analysis)
    df_intrinsic_analysis.to_csv('intrinsic_analysis.csv', index=False)

    # Find all rows associated with the goal position (Extrinsic Reward == 10)
    df_goal_rows = df_intrinsic_analysis[df_intrinsic_analysis['Extrinsic Reward'] == 10]
    print(f"Shape of df_goal_rows: {df_goal_rows.shape}")
    print("Rows associated with the goal position:")
    print(df_goal_rows)

    # Filter for positions (4,5) and (5,4), the two cells from which the goal can be entered
    # (comparing a Series of tuples with == can raise a length-mismatch error, so use apply)
    df_goal_rows_4_5 = df_goal_rows[df_goal_rows['State_2D'].apply(lambda pos: pos == (4, 5))]
    df_goal_rows_5_4 = df_goal_rows[df_goal_rows['State_2D'].apply(lambda pos: pos == (5, 4))]
    print("Rows associated with position (4,5):")
    print(df_goal_rows_4_5)
    print("Rows associated with position (5,4):")
    print(df_goal_rows_5_4)

    # Filter for any other positions
    df_goal_rows_other = df_goal_rows[~df_goal_rows['State_2D'].isin([(4, 5), (5, 4)])]
    print("Rows associated with any other positions:")
    print(df_goal_rows_other)

    # Calculate success rate based on goal visits
    total_tasks = num_iterations * 10
    success_rate_goal_visits = len(df_goal_rows) / total_tasks * 100
    print(f"Success Rate based on Goal Visits: {success_rate_goal_visits:.2f}%")

    # Visualize state visitation counts
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 3, 1)
    plt.imshow(state_visitation_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('State Visitation Counts')
    plt.colorbar()

    # Visualize intrinsic rewards
    intrinsic_rewards = df_intrinsic_analysis['Intrinsic Reward'].values
    plt.subplot(1, 3, 2)
    plt.hist(intrinsic_rewards, bins=50, color='blue', alpha=0.7)
    plt.title('Intrinsic Reward Distribution')
    plt.xlabel('Intrinsic Reward')
    plt.ylabel('Frequency')

    # Visualize goal position visits
    goal_positions = df_goal_rows['State_2D'].apply(lambda x: x[0] * 6 + x[1]).values
    goal_counts = np.zeros(num_states)
    for pos in goal_positions:
        goal_counts[pos] += 1
    plt.subplot(1, 3, 3)
    plt.imshow(goal_counts.reshape(6, 6), cmap='hot', interpolation='nearest')
    plt.title('Goal Position Visitation Counts')
    plt.colorbar()

    plt.tight_layout()
    plt.show()

    return meta_losses, meta_rewards, success_rates

# Plot function for meta-loss, average reward, and success rate with white background and markers
def plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates, window_size=10):
    smoothed_losses = moving_average(meta_losses, window_size)
    smoothed_rewards = moving_average(meta_rewards, window_size)
    smoothed_success_rates = moving_average(success_rates, window_size)

    # Create the figure and axes with white background
    fig, ax1 = plt.subplots(figsize=(14, 7), facecolor='white')

    # Set axes background color to white
    ax1.set_facecolor('white')

    color = 'tab:red'
    ax1.set_xlabel('Meta-Iteration')
    ax1.set_ylabel('Meta-Loss', color=color)
    ax1.plot(meta_losses, color=color, alpha=0.1, label='Meta-Loss', marker='o', markersize=5)
    ax1.plot(range(window_size - 1, len(meta_losses)), smoothed_losses, color=color, label=f'Smoothed Meta-Loss (window={window_size})', marker='o', markersize=3)
    ax1.tick_params(axis='y', labelcolor=color)

    # Twin axis for Average Reward
    ax2 = ax1.twinx()
    ax2.set_facecolor('white')  # Set the background color of the second axis to white
    color = 'tab:blue'
    ax2.set_ylabel('Average Reward', color=color)
    ax2.plot(meta_rewards, color=color, alpha=0.1, label='Average Reward', marker='s', markersize=5)
    ax2.plot(range(window_size - 1, len(meta_rewards)), smoothed_rewards, color=color, label=f'Smoothed Average Reward (window={window_size})', marker='s', markersize=3)
    ax2.tick_params(axis='y', labelcolor=color)

    # Third axis for Success Rate
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))
    ax3.set_facecolor('white')  # Set the background color of the third axis to white
    color = 'tab:green'
    ax3.set_ylabel('Success Rate', color=color)
    ax3.plot(success_rates, color=color, alpha=0.1, label='Success Rate', marker='^', markersize=5)
    ax3.plot(range(window_size - 1, len(success_rates)), smoothed_success_rates, color=color, label=f'Smoothed Success Rate (window={window_size})', marker='^', markersize=3)
    ax3.tick_params(axis='y', labelcolor=color)

    # Title and grid
    plt.title("Meta-Loss, Average Reward, and Success Rate Progress")
    fig.tight_layout()  # Adjust layout to prevent label clipping
    plt.grid(True)

    # Show the plot
    plt.show()

# Function to calculate moving average
def moving_average(data, window_size=30):
    return np.convolve(data, np.ones(window_size) / window_size, mode='valid')

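# Illustrative check of the moving average (comments only, not executed):
# np.convolve([1, 2, 3, 4], np.ones(2) / 2, mode='valid') -> [1.5, 2.5, 3.5],
# i.e. len(data) - window_size + 1 points, which is why the smoothed curves above are
# plotted against range(window_size - 1, len(data)).
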
# Simplified run with single complexity and success rate tracking
if __name__ == "__main__":
    meta_learning_rate = 1e-3
    epsilon_start = 0.9
    epsilon_decay = 0.99
    num_iterations = 500
    num_inner_steps = 50
    eta = 0.1
    epsilon = 1e-5

    meta_losses, meta_rewards, success_rates = meta_train_fixed_complexity(
        meta_learning_rate=meta_learning_rate,
        epsilon_start=epsilon_start,
        epsilon_decay=epsilon_decay,
        num_iterations=num_iterations,
        num_inner_steps=num_inner_steps,
        eta=eta,
        epsilon=epsilon
    )

    # Plot results
    plot_meta_losses_rewards_success(meta_losses, meta_rewards, success_rates)
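
    # Optional sanity check (a minimal sketch added here, assuming intrinsic_analysis.csv was
    # just written by meta_train_fixed_complexity above): recompute the goal-visit success rate
    # from the saved CSV and confirm it matches the value printed during training.
    df_check = pd.read_csv('intrinsic_analysis.csv')
    goal_visits = (df_check['Extrinsic Reward'] == 10).sum()
    print(f"Recomputed goal-visit success rate: {goal_visits / (num_iterations * 10) * 100:.2f}%")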