00BER committed on
Commit 85e4824 · 1 Parent(s): 4c6ae5f

Added lunar lander files

Files changed (3)
  1. agent.py +849 -0
  2. lunar_lander.py +332 -0
  3. params.py +12 -0
agent.py ADDED
@@ -0,0 +1,849 @@
1
+ import torch
2
+ import numpy as np
3
+ import random
4
+ import torch.nn as nn
5
+ import copy
6
+ import time, datetime
7
+ import matplotlib.pyplot as plt
8
+ from collections import deque
9
+ from torch.utils.tensorboard import SummaryWriter
10
+
11
+
12
+ class DQNet(nn.Module):
13
+ """Small fully connected Q-network with online and target copies."""
14
+
15
+ def __init__(self, input_dim, output_dim):
16
+ super().__init__()
17
+
18
+ self.online = nn.Sequential(
19
+ nn.Linear(input_dim, 100),
20
+ nn.ReLU(),
21
+ nn.Linear(100, 120),
22
+ nn.ReLU(),
23
+ nn.Linear(120, output_dim),
24
+ )
25
+
26
+
27
+ self.target = copy.deepcopy(self.online)
28
+
29
+ # Q_target parameters are frozen.
30
+ for p in self.target.parameters():
31
+ p.requires_grad = False
32
+
33
+ def forward(self, input, model):
34
+ if model == "online":
35
+ return self.online(input)
36
+ elif model == "target":
37
+ return self.target(input)
38
+
39
+
40
+
41
+ class MetricLogger:
42
+ def __init__(self, save_dir):
43
+ self.writer = SummaryWriter(log_dir=save_dir)
44
+ self.save_log = save_dir / "log"
45
+ with open(self.save_log, "w") as f:
46
+ f.write(
47
+ f"{'Episode':>8}{'Step':>8}{'Epsilon':>10}{'MeanReward':>15}"
48
+ f"{'MeanLength':>15}{'MeanLoss':>15}{'MeanQValue':>15}"
49
+ f"{'TimeDelta':>15}{'Time':>20}\n"
50
+ )
51
+ self.ep_rewards_plot = save_dir / "reward_plot.jpg"
52
+ self.ep_lengths_plot = save_dir / "length_plot.jpg"
53
+ self.ep_avg_losses_plot = save_dir / "loss_plot.jpg"
54
+ self.ep_avg_qs_plot = save_dir / "q_plot.jpg"
55
+
56
+ # History metrics
57
+ self.ep_rewards = []
58
+ self.ep_lengths = []
59
+ self.ep_avg_losses = []
60
+ self.ep_avg_qs = []
61
+
62
+ # Moving averages, added for every call to record()
63
+ self.moving_avg_ep_rewards = []
64
+ self.moving_avg_ep_lengths = []
65
+ self.moving_avg_ep_avg_losses = []
66
+ self.moving_avg_ep_avg_qs = []
67
+
68
+ # Current episode metric
69
+ self.init_episode()
70
+
71
+ # Timing
72
+ self.record_time = time.time()
73
+
74
+ def log_step(self, reward, loss, q):
75
+ self.curr_ep_reward += reward
76
+ self.curr_ep_length += 1
77
+ if loss is not None:
78
+ self.curr_ep_loss += loss
79
+ self.curr_ep_q += q
80
+ self.curr_ep_loss_length += 1
81
+
82
+ def log_episode(self, episode_number):
83
+ "Mark end of episode"
84
+ self.ep_rewards.append(self.curr_ep_reward)
85
+ self.ep_lengths.append(self.curr_ep_length)
86
+ if self.curr_ep_loss_length == 0:
87
+ ep_avg_loss = 0
88
+ ep_avg_q = 0
89
+ else:
90
+ ep_avg_loss = np.round(self.curr_ep_loss / self.curr_ep_loss_length, 5)
91
+ ep_avg_q = np.round(self.curr_ep_q / self.curr_ep_loss_length, 5)
92
+ self.ep_avg_losses.append(ep_avg_loss)
93
+ self.ep_avg_qs.append(ep_avg_q)
94
+ self.writer.add_scalar("Avg Loss for episode", ep_avg_loss, episode_number)
95
+ self.writer.add_scalar("Avg Q value for episode", ep_avg_q, episode_number)
96
+ self.writer.flush()
97
+ self.init_episode()
98
+
99
+ def init_episode(self):
100
+ self.curr_ep_reward = 0.0
101
+ self.curr_ep_length = 0
102
+ self.curr_ep_loss = 0.0
103
+ self.curr_ep_q = 0.0
104
+ self.curr_ep_loss_length = 0
105
+
106
+ def record(self, episode, epsilon, step):
107
+ mean_ep_reward = np.round(np.mean(self.ep_rewards[-100:]), 3)
108
+ mean_ep_length = np.round(np.mean(self.ep_lengths[-100:]), 3)
109
+ mean_ep_loss = np.round(np.mean(self.ep_avg_losses[-100:]), 3)
110
+ mean_ep_q = np.round(np.mean(self.ep_avg_qs[-100:]), 3)
111
+ self.moving_avg_ep_rewards.append(mean_ep_reward)
112
+ self.moving_avg_ep_lengths.append(mean_ep_length)
113
+ self.moving_avg_ep_avg_losses.append(mean_ep_loss)
114
+ self.moving_avg_ep_avg_qs.append(mean_ep_q)
115
+
116
+ last_record_time = self.record_time
117
+ self.record_time = time.time()
118
+ time_since_last_record = np.round(self.record_time - last_record_time, 3)
119
+
120
+ print(
121
+ f"Episode {episode} - "
122
+ f"Step {step} - "
123
+ f"Epsilon {epsilon} - "
124
+ f"Mean Reward {mean_ep_reward} - "
125
+ f"Mean Length {mean_ep_length} - "
126
+ f"Mean Loss {mean_ep_loss} - "
127
+ f"Mean Q Value {mean_ep_q} - "
128
+ f"Time Delta {time_since_last_record} - "
129
+ f"Time {datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}"
130
+ )
131
+ self.writer.add_scalar("Mean reward last 100 episodes", mean_ep_reward, episode)
132
+ self.writer.add_scalar("Mean length last 100 episodes", mean_ep_length, episode)
133
+ self.writer.add_scalar("Mean loss last 100 episodes", mean_ep_loss, episode)
135
+ self.writer.add_scalar("Epsilon value", epsilon, episode)
136
+ self.writer.add_scalar("Mean Q Value last 100 episodes", mean_ep_q, episode)
137
+ self.writer.flush()
138
+ with open(self.save_log, "a") as f:
139
+ f.write(
140
+ f"{episode:8d}{step:8d}{epsilon:10.3f}"
141
+ f"{mean_ep_reward:15.3f}{mean_ep_length:15.3f}{mean_ep_loss:15.3f}{mean_ep_q:15.3f}"
142
+ f"{time_since_last_record:15.3f}"
143
+ f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'):>20}\n"
144
+ )
145
+
146
+ for metric in ["ep_rewards", "ep_lengths", "ep_avg_losses", "ep_avg_qs"]:
147
+ plt.plot(getattr(self, f"moving_avg_{metric}"))
148
+ plt.savefig(getattr(self, f"{metric}_plot"))
149
+ plt.clf()
150
+
151
+
152
+ class DQNAgent:
153
+ def __init__(self,
154
+ state_dim,
155
+ action_dim,
156
+ save_dir,
157
+ checkpoint=None,
158
+ learning_rate=0.00025,
159
+ max_memory_size=100000,
160
+ batch_size=32,
161
+ exploration_rate=1,
162
+ exploration_rate_decay=0.9999999,
163
+ exploration_rate_min=0.1,
164
+ training_frequency=1,
165
+ learning_starts=1000,
166
+ target_network_sync_frequency=500,
167
+ reset_exploration_rate=False,
168
+ save_frequency=100000,
169
+ gamma=0.9,
170
+ load_replay_buffer=True):
171
+ self.state_dim = state_dim
172
+ self.action_dim = action_dim
173
+ self.max_memory_size = max_memory_size
174
+ self.memory = deque(maxlen=max_memory_size)
175
+ self.batch_size = batch_size
176
+
177
+ self.exploration_rate = exploration_rate
178
+ self.exploration_rate_decay = exploration_rate_decay
179
+ self.exploration_rate_min = exploration_rate_min
180
+ self.gamma = gamma
181
+
182
+ self.curr_step = 0
183
+ self.learning_starts = learning_starts # min. experiences before training
184
+
185
+ self.training_frequency = training_frequency # no. of experiences between updates to Q_online
186
+ self.target_network_sync_frequency = target_network_sync_frequency # no. of experiences between Q_target & Q_online sync
187
+
188
+ self.save_every = save_frequency # no. of experiences between saving the network
189
+ self.save_dir = save_dir
190
+
191
+ self.use_cuda = torch.cuda.is_available()
192
+
193
+ self.net = DQNet(self.state_dim, self.action_dim).float()
194
+ if self.use_cuda:
195
+ self.net = self.net.to(device='cuda')
196
+ if checkpoint:
197
+ self.load(checkpoint, reset_exploration_rate, load_replay_buffer)
198
+
199
+ self.optimizer = torch.optim.AdamW(self.net.parameters(), lr=learning_rate, amsgrad=True)
200
+ self.loss_fn = torch.nn.SmoothL1Loss()
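+ # AdamW (with amsgrad) and Huber (SmoothL1) loss; the commented-out lines below are the plain Adam + MSE alternative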
201
+ # self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate)
202
+ # self.loss_fn = torch.nn.MSELoss()
203
+
204
+
205
+ def act(self, state):
206
+ """
207
+ Given a state, choose an epsilon-greedy action and update value of step.
208
+
209
+ Inputs:
210
+ state(LazyFrame): A single observation of the current state, dimension is (state_dim)
211
+ Outputs:
212
+ action_idx (int): An integer representing which action the agent will perform
213
+ """
214
+ # EXPLORE
215
+ if np.random.rand() < self.exploration_rate:
216
+ action_idx = np.random.randint(self.action_dim)
217
+
218
+ # EXPLOIT
219
+ else:
220
+ state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
221
+ state = state.unsqueeze(0)
222
+ action_values = self.net(state, model='online')
223
+ action_idx = torch.argmax(action_values, axis=1).item()
224
+
225
+ # decrease exploration_rate
226
+
227
+ self.exploration_rate *= self.exploration_rate_decay
228
+ self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)
229
+
230
+ # increment step
231
+ self.curr_step += 1
232
+ return action_idx
233
+
234
+ def cache(self, state, next_state, action, reward, done):
235
+ """
236
+ Store the experience to self.memory (replay buffer)
237
+
238
+ Inputs:
239
+ state (LazyFrame),
240
+ next_state (LazyFrame),
241
+ action (int),
242
+ reward (float),
243
+ done(bool))
244
+ """
245
+ state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
246
+ next_state = torch.FloatTensor(next_state).cuda() if self.use_cuda else torch.FloatTensor(next_state)
247
+ action = torch.LongTensor([action]).cuda() if self.use_cuda else torch.LongTensor([action])
248
+ reward = torch.DoubleTensor([reward]).cuda() if self.use_cuda else torch.DoubleTensor([reward])
249
+ done = torch.BoolTensor([done]).cuda() if self.use_cuda else torch.BoolTensor([done])
250
+
251
+ self.memory.append( (state, next_state, action, reward, done,) )
252
+
253
+
254
+ def recall(self):
255
+ """
256
+ Retrieve a batch of experiences from memory
257
+ """
258
+ batch = random.sample(self.memory, self.batch_size)
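+ # zip(*batch) regroups the sampled transitions by field; torch.stack then builds one batched tensor per field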
259
+ state, next_state, action, reward, done = map(torch.stack, zip(*batch))
260
+ return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()
261
+
262
+
263
+ def td_estimate(self, states, actions):
264
+ actions = actions.reshape(-1, 1)
265
+ predicted_qs = self.net(states, model='online')# Q_online(s,a)
266
+ predicted_qs = predicted_qs.gather(1, actions)
267
+ return predicted_qs
268
+
269
+
270
+ @torch.no_grad()
271
+ def td_target(self, rewards, next_states, dones):
272
+ rewards = rewards.reshape(-1, 1)
273
+ dones = dones.reshape(-1, 1)
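+ # Standard DQN target: r + gamma * max_a' Q_target(s', a'), with the bootstrap term zeroed on terminal transitions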
274
+ target_qs = self.net(next_states, model='target')
275
+ target_qs = torch.max(target_qs, dim=1).values
276
+ target_qs = target_qs.reshape(-1, 1)
277
+ target_qs[dones] = 0.0
278
+ return (rewards + (self.gamma * target_qs))
279
+
280
+ def update_Q_online(self, td_estimate, td_target):
281
+ loss = self.loss_fn(td_estimate.float(), td_target.float())
282
+ self.optimizer.zero_grad()
283
+ loss.backward()
284
+ self.optimizer.step()
285
+ return loss.item()
286
+
287
+
288
+ def sync_Q_target(self):
289
+ self.net.target.load_state_dict(self.net.online.state_dict())
290
+
291
+
292
+ def learn(self):
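+ # Sync the target network and save checkpoints on their own schedules, then train Q_online every training_frequency steps once learning_starts experiences have been collected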
293
+ if self.curr_step % self.target_network_sync_frequency == 0:
294
+ self.sync_Q_target()
295
+
296
+ if self.curr_step % self.save_every == 0:
297
+ self.save()
298
+
299
+ if self.curr_step < self.learning_starts:
300
+ return None, None
301
+
302
+ if self.curr_step % self.training_frequency != 0:
303
+ return None, None
304
+
305
+ # Sample from memory
306
+ state, next_state, action, reward, done = self.recall()
307
+
308
+ # Get TD Estimate
309
+ td_est = self.td_estimate(state, action)
310
+
311
+ # Get TD Target
312
+ td_tgt = self.td_target(reward, next_state, done)
313
+
314
+ # Backpropagate loss through Q_online
315
+
316
+ loss = self.update_Q_online(td_est, td_tgt)
317
+
318
+ return (td_est.mean().item(), loss)
319
+
320
+
321
+ def save(self):
322
+ save_path = self.save_dir / f"airstriker_net_{int(self.curr_step // self.save_every)}.chkpt"
323
+ torch.save(
324
+ dict(
325
+ model=self.net.state_dict(),
326
+ exploration_rate=self.exploration_rate,
327
+ replay_memory=self.memory
328
+ ),
329
+ save_path
330
+ )
331
+
332
+ print(f"Airstriker model saved to {save_path} at step {self.curr_step}")
333
+
334
+
335
+ def load(self, load_path, reset_exploration_rate, load_replay_buffer):
336
+ if not load_path.exists():
337
+ raise ValueError(f"{load_path} does not exist")
338
+
339
+ ckp = torch.load(load_path, map_location=('cuda' if self.use_cuda else 'cpu'))
340
+ exploration_rate = ckp.get('exploration_rate')
341
+ state_dict = ckp.get('model')
342
+
343
+
344
+ print(f"Loading model at {load_path} with exploration rate {exploration_rate}")
345
+ self.net.load_state_dict(state_dict)
346
+
347
+ if load_replay_buffer:
348
+ replay_memory = ckp.get('replay_memory')
349
+ print(f"Loading replay memory. Len {len(replay_memory)}" if replay_memory else "Saved replay memory not found. Not restoring replay memory.")
350
+ self.memory = replay_memory if replay_memory else self.memory
351
+
352
+ if reset_exploration_rate:
353
+ print(f"Reset exploration rate option specified. Not restoring saved exploration rate {exploration_rate}. The current exploration rate is {self.exploration_rate}")
354
+ else:
355
+ print(f"Restoring saved exploration rate {exploration_rate}.")
356
+ self.exploration_rate = exploration_rate
357
+
358
+
359
+ class DDQNAgent(DQNAgent):
360
+ @torch.no_grad()
361
+ def td_target(self, rewards, next_states, dones):
362
+ rewards = rewards.reshape(-1, 1)
363
+ dones = dones.reshape(-1, 1)
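+ # Double DQN target: the online network selects the greedy next action and the target network evaluates it, which reduces Q-value overestimation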
364
+ q_vals = self.net(next_states, model='online')
365
+ target_actions = torch.argmax(q_vals, axis=1)
366
+ target_actions = target_actions.reshape(-1, 1)
367
+
368
+ target_qs = self.net(next_states, model='target')
369
+ target_qs = target_qs.gather(1, target_actions)
370
+ target_qs = target_qs.reshape(-1, 1)
371
+ target_qs[dones] = 0.0
372
+ return (rewards + (self.gamma * target_qs))
373
+
374
+
375
+ class DuelingDQNet(nn.Module):
376
+ def __init__(self, input_dim, output_dim):
377
+ super().__init__()
378
+ self.feature_layer = nn.Sequential(
379
+ nn.Linear(input_dim, 150),
380
+ nn.ReLU(),
381
+ nn.Linear(150, 120),
382
+ nn.ReLU()
383
+ )
384
+
385
+ self.value_layer = nn.Sequential(
386
+ nn.Linear(120, 120),
387
+ nn.ReLU(),
388
+ nn.Linear(120, 1)
389
+ )
390
+
391
+ self.advantage_layer = nn.Sequential(
392
+ nn.Linear(120, 120),
393
+ nn.ReLU(),
394
+ nn.Linear(120, output_dim)
395
+ )
396
+
397
+ def forward(self, state):
398
+ feature_output = self.feature_layer(state)
399
+ # feature_output = feature_output.view(feature_output.size(0), -1)
400
+ value = self.value_layer(feature_output)
401
+ advantage = self.advantage_layer(feature_output)
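+ # Dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)); subtracting the per-state mean keeps V and A identifiable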
402
+ q_value = value + (advantage - advantage.mean(dim=1, keepdim=True))
403
+
404
+ return q_value
405
+
406
+
407
+ class DuelingDQNAgent:
408
+ def __init__(self,
409
+ state_dim,
410
+ action_dim,
411
+ save_dir,
412
+ checkpoint=None,
413
+ learning_rate=0.00025,
414
+ max_memory_size=100000,
415
+ batch_size=32,
416
+ exploration_rate=1,
417
+ exploration_rate_decay=0.9999999,
418
+ exploration_rate_min=0.1,
419
+ training_frequency=1,
420
+ learning_starts=1000,
421
+ target_network_sync_frequency=500,
422
+ reset_exploration_rate=False,
423
+ save_frequency=100000,
424
+ gamma=0.9,
425
+ load_replay_buffer=True):
426
+ self.state_dim = state_dim
427
+ self.action_dim = action_dim
428
+ self.max_memory_size = max_memory_size
429
+ self.memory = deque(maxlen=max_memory_size)
430
+ self.batch_size = batch_size
431
+
432
+ self.exploration_rate = exploration_rate
433
+ self.exploration_rate_decay = exploration_rate_decay
434
+ self.exploration_rate_min = exploration_rate_min
435
+ self.gamma = gamma
436
+
437
+ self.curr_step = 0
438
+ self.learning_starts = learning_starts # min. experiences before training
439
+
440
+ self.training_frequency = training_frequency # no. of experiences between updates to Q_online
441
+ self.target_network_sync_frequency = target_network_sync_frequency # no. of experiences between Q_target & Q_online sync
442
+
443
+ self.save_every = save_frequency # no. of experiences between saving the network
444
+ self.save_dir = save_dir
445
+
446
+ self.use_cuda = torch.cuda.is_available()
447
+
448
+
449
+ self.online_net = DuelingDQNet(self.state_dim, self.action_dim).float()
450
+ self.target_net = copy.deepcopy(self.online_net)
451
+ # Q_target parameters are frozen.
452
+ for p in self.target_net.parameters():
453
+ p.requires_grad = False
454
+
455
+ if self.use_cuda:
456
+ self.online_net = self.online_net.to(device='cuda')
457
+ self.target_net = self.target_net.to(device='cuda')
458
+ if checkpoint:
459
+ self.load(checkpoint, reset_exploration_rate, load_replay_buffer)
460
+
461
+ self.optimizer = torch.optim.AdamW(self.online_net.parameters(), lr=learning_rate, amsgrad=True)
462
+ self.loss_fn = torch.nn.SmoothL1Loss()
463
+ # self.optimizer = torch.optim.Adam(self.online_net.parameters(), lr=learning_rate)
464
+ # self.loss_fn = torch.nn.MSELoss()
465
+
466
+
467
+ def act(self, state):
468
+ """
469
+ Given a state, choose an epsilon-greedy action and update value of step.
470
+
471
+ Inputs:
472
+ state(LazyFrame): A single observation of the current state, dimension is (state_dim)
473
+ Outputs:
474
+ action_idx (int): An integer representing which action the agent will perform
475
+ """
476
+ # EXPLORE
477
+ if np.random.rand() < self.exploration_rate:
478
+ action_idx = np.random.randint(self.action_dim)
479
+
480
+ # EXPLOIT
481
+ else:
482
+ state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
483
+ state = state.unsqueeze(0)
484
+ action_values = self.online_net(state)
485
+ action_idx = torch.argmax(action_values, axis=1).item()
486
+
487
+ # decrease exploration_rate
488
+ self.exploration_rate *= self.exploration_rate_decay
489
+ self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)
490
+
491
+ # increment step
492
+ self.curr_step += 1
493
+ return action_idx
494
+
495
+ def cache(self, state, next_state, action, reward, done):
496
+ """
497
+ Store the experience to self.memory (replay buffer)
498
+
499
+ Inputs:
500
+ state (LazyFrame),
501
+ next_state (LazyFrame),
502
+ action (int),
503
+ reward (float),
504
+ done(bool))
505
+ """
508
+ state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
509
+ next_state = torch.FloatTensor(next_state).cuda() if self.use_cuda else torch.FloatTensor(next_state)
510
+ action = torch.LongTensor([action]).cuda() if self.use_cuda else torch.LongTensor([action])
511
+ reward = torch.DoubleTensor([reward]).cuda() if self.use_cuda else torch.DoubleTensor([reward])
512
+ done = torch.BoolTensor([done]).cuda() if self.use_cuda else torch.BoolTensor([done])
513
+
514
+ self.memory.append( (state, next_state, action, reward, done,) )
515
+
516
+
517
+ def recall(self):
518
+ """
519
+ Retrieve a batch of experiences from memory
520
+ """
521
+ batch = random.sample(self.memory, self.batch_size)
522
+ state, next_state, action, reward, done = map(torch.stack, zip(*batch))
523
+ return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()
524
+
525
+
526
+ def td_estimate(self, states, actions):
527
+ actions = actions.reshape(-1, 1)
528
+ predicted_qs = self.online_net(states)# Q_online(s,a)
529
+ predicted_qs = predicted_qs.gather(1, actions)
530
+ return predicted_qs
531
+
532
+
533
+ @torch.no_grad()
534
+ def td_target(self, rewards, next_states, dones):
535
+ rewards = rewards.reshape(-1, 1)
536
+ dones = dones.reshape(-1, 1)
537
+ target_qs = self.target_net.forward(next_states)
538
+ target_qs = torch.max(target_qs, dim=1).values
539
+ target_qs = target_qs.reshape(-1, 1)
540
+ target_qs[dones] = 0.0
541
+ return (rewards + (self.gamma * target_qs))
542
+
543
+ def update_Q_online(self, td_estimate, td_target):
544
+ loss = self.loss_fn(td_estimate.float(), td_target.float())
545
+ self.optimizer.zero_grad()
546
+ loss.backward()
547
+ self.optimizer.step()
548
+ return loss.item()
549
+
550
+
551
+ def sync_Q_target(self):
552
+ self.target_net.load_state_dict(self.online_net.state_dict())
553
+
554
+
555
+ def learn(self):
556
+ if self.curr_step % self.target_network_sync_frequency == 0:
557
+ self.sync_Q_target()
558
+
559
+ if self.curr_step % self.save_every == 0:
560
+ self.save()
561
+
562
+ if self.curr_step < self.learning_starts:
563
+ return None, None
564
+
565
+ if self.curr_step % self.training_frequency != 0:
566
+ return None, None
567
+
568
+ # Sample from memory
569
+ state, next_state, action, reward, done = self.recall()
570
+
571
+ # Get TD Estimate
572
+ td_est = self.td_estimate(state, action)
573
+
574
+ # Get TD Target
575
+ td_tgt = self.td_target(reward, next_state, done)
576
+
577
+ # Backpropagate loss through Q_online
578
+ loss = self.update_Q_online(td_est, td_tgt)
579
+
580
+ return (td_est.mean().item(), loss)
581
+
582
+
583
+ def save(self):
584
+ save_path = self.save_dir / f"airstriker_net_{int(self.curr_step // self.save_every)}.chkpt"
585
+ torch.save(
586
+ dict(
587
+ model=self.online_net.state_dict(),
588
+ exploration_rate=self.exploration_rate,
589
+ replay_memory=self.memory
590
+ ),
591
+ save_path
592
+ )
593
+
594
+ print(f"Airstriker model saved to {save_path} at step {self.curr_step}")
595
+
596
+
597
+ def load(self, load_path, reset_exploration_rate, load_replay_buffer):
598
+ if not load_path.exists():
599
+ raise ValueError(f"{load_path} does not exist")
600
+
601
+ ckp = torch.load(load_path, map_location=('cuda' if self.use_cuda else 'cpu'))
602
+ exploration_rate = ckp.get('exploration_rate')
603
+ state_dict = ckp.get('model')
604
+
605
+
606
+ print(f"Loading model at {load_path} with exploration rate {exploration_rate}")
607
+ self.online_net.load_state_dict(state_dict)
608
+ self.target_net = copy.deepcopy(self.online_net)
609
+ self.sync_Q_target()
610
+
611
+ if load_replay_buffer:
612
+ replay_memory = ckp.get('replay_memory')
613
+ print(f"Loading replay memory. Len {len(replay_memory)}" if replay_memory else "Saved replay memory not found. Not restoring replay memory.")
614
+ self.memory = replay_memory if replay_memory else self.memory
615
+
616
+ if reset_exploration_rate:
617
+ print(f"Reset exploration rate option specified. Not restoring saved exploration rate {exploration_rate}. The current exploration rate is {self.exploration_rate}")
618
+ else:
619
+ print(f"Restoring saved exploration rate {exploration_rate}.")
620
+ self.exploration_rate = exploration_rate
621
+
622
+
623
+
624
+
625
+ class DuelingDDQNAgent(DuelingDQNAgent):
626
+ @torch.no_grad()
627
+ def td_target(self, rewards, next_states, dones):
628
+ rewards = rewards.reshape(-1, 1)
629
+ dones = dones.reshape(-1, 1)
630
+ q_vals = self.online_net.forward(next_states)
631
+ target_actions = torch.argmax(q_vals, axis=1)
632
+ target_actions = target_actions.reshape(-1, 1)
633
+
634
+ target_qs = self.target_net.forward(next_states)
635
+ target_qs = target_qs.gather(1, target_actions)
636
+ target_qs = target_qs.reshape(-1, 1)
637
+ target_qs[dones] = 0.0
638
+ return (rewards + (self.gamma * target_qs))
639
+
640
+
641
+
642
+ class DQNAgentWithStepDecay:
643
+ def __init__(self,
644
+ state_dim,
645
+ action_dim,
646
+ save_dir,
647
+ checkpoint=None,
648
+ learning_rate=0.00025,
649
+ max_memory_size=100000,
650
+ batch_size=32,
651
+ exploration_rate=1,
652
+ exploration_rate_decay=0.9999999,
653
+ exploration_rate_min=0.1,
654
+ training_frequency=1,
655
+ learning_starts=1000,
656
+ target_network_sync_frequency=500,
657
+ reset_exploration_rate=False,
658
+ save_frequency=100000,
659
+ gamma=0.9,
660
+ load_replay_buffer=True):
661
+ self.state_dim = state_dim
662
+ self.action_dim = action_dim
663
+ self.max_memory_size = max_memory_size
664
+ self.memory = deque(maxlen=max_memory_size)
665
+ self.batch_size = batch_size
666
+
667
+ self.exploration_rate = exploration_rate
668
+ self.exploration_rate_decay = exploration_rate_decay
669
+ self.exploration_rate_min = exploration_rate_min
670
+ self.gamma = gamma
671
+
672
+ self.curr_step = 0
673
+ self.learning_starts = learning_starts # min. experiences before training
674
+
675
+ self.training_frequency = training_frequency # no. of experiences between updates to Q_online
676
+ self.target_network_sync_frequency = target_network_sync_frequency # no. of experiences between Q_target & Q_online sync
677
+
678
+ self.save_every = save_frequency # no. of experiences between saving the network
679
+ self.save_dir = save_dir
680
+
681
+ self.use_cuda = torch.cuda.is_available()
682
+
683
+ self.net = DQNet(self.state_dim, self.action_dim).float()
684
+ if self.use_cuda:
685
+ self.net = self.net.to(device='cuda')
686
+ if checkpoint:
687
+ self.load(checkpoint, reset_exploration_rate, load_replay_buffer)
688
+
689
+ self.optimizer = torch.optim.AdamW(self.net.parameters(), lr=learning_rate, amsgrad=True)
690
+ self.loss_fn = torch.nn.SmoothL1Loss()
691
+ # self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate)
692
+ # self.loss_fn = torch.nn.MSELoss()
693
+
694
+
695
+ def act(self, state):
696
+ """
697
+ Given a state, choose an epsilon-greedy action and update value of step.
698
+
699
+ Inputs:
700
+ state(LazyFrame): A single observation of the current state, dimension is (state_dim)
701
+ Outputs:
702
+ action_idx (int): An integer representing which action the agent will perform
703
+ """
704
+ # EXPLORE
705
+ if np.random.rand() < self.exploration_rate:
706
+ action_idx = np.random.randint(self.action_dim)
707
+
708
+ # EXPLOIT
709
+ else:
710
+ state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
711
+ state = state.unsqueeze(0)
712
+ action_values = self.net(state, model='online')
713
+ action_idx = torch.argmax(action_values, axis=1).item()
714
+
715
+ # decrease exploration_rate
716
+
717
+ self.exploration_rate *= self.exploration_rate_decay
718
+ self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)
719
+
720
+ # increment step
721
+ self.curr_step += 1
722
+ return action_idx
723
+
724
+ def cache(self, state, next_state, action, reward, done):
725
+ """
726
+ Store the experience to self.memory (replay buffer)
727
+
728
+ Inputs:
729
+ state (LazyFrame),
730
+ next_state (LazyFrame),
731
+ action (int),
732
+ reward (float),
733
+ done(bool))
734
+ """
735
+ state = torch.FloatTensor(state).cuda() if self.use_cuda else torch.FloatTensor(state)
736
+ next_state = torch.FloatTensor(next_state).cuda() if self.use_cuda else torch.FloatTensor(next_state)
737
+ action = torch.LongTensor([action]).cuda() if self.use_cuda else torch.LongTensor([action])
738
+ reward = torch.DoubleTensor([reward]).cuda() if self.use_cuda else torch.DoubleTensor([reward])
739
+ done = torch.BoolTensor([done]).cuda() if self.use_cuda else torch.BoolTensor([done])
740
+
741
+ self.memory.append( (state, next_state, action, reward, done) )
742
+
743
+
744
+ def recall(self):
745
+ """
746
+ Retrieve a batch of experiences from memory
747
+ """
748
+ batch = random.sample(self.memory, self.batch_size)
749
+ state, next_state, action, reward, done = map(torch.stack, zip(*batch))
750
+ return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()
751
+
752
+
753
+ def td_estimate(self, states, actions):
754
+ actions = actions.reshape(-1, 1)
755
+ predicted_qs = self.net(states, model='online')# Q_online(s,a)
756
+ predicted_qs = predicted_qs.gather(1, actions)
757
+ return predicted_qs
758
+
759
+
760
+ @torch.no_grad()
761
+ def td_target(self, rewards, next_states, dones):
762
+ rewards = rewards.reshape(-1, 1)
763
+ dones = dones.reshape(-1, 1)
764
+ target_qs = self.net(next_states, model='target')
765
+ target_qs = torch.max(target_qs, dim=1).values
766
+ target_qs = target_qs.reshape(-1, 1)
767
+ target_qs[dones] = 0.0
768
+ val = self.gamma * target_qs
769
+ return (rewards + val)
770
+
771
+ def update_Q_online(self, td_estimate, td_target):
772
+ loss = self.loss_fn(td_estimate.float(), td_target.float())
773
+ self.optimizer.zero_grad()
774
+ loss.backward()
775
+ self.optimizer.step()
776
+ return loss.item()
777
+
778
+
779
+ def sync_Q_target(self):
780
+ self.net.target.load_state_dict(self.net.online.state_dict())
781
+
782
+
783
+ def learn(self):
784
+ if self.curr_step % self.target_network_sync_frequency == 0:
785
+ self.sync_Q_target()
786
+
787
+ if self.curr_step % self.save_every == 0:
788
+ self.save()
789
+
790
+ if self.curr_step < self.learning_starts:
791
+ return None, None
792
+
793
+ if self.curr_step % self.training_frequency != 0:
794
+ return None, None
795
+
796
+ # Sample from memory
797
+ state, next_state, action, reward, done = self.recall()
798
+
799
+ # Get TD Estimate
800
+ td_est = self.td_estimate(state, action)
801
+
802
+ # Get TD Target
803
+ td_tgt = self.td_target(reward, next_state, done)
804
+
805
+ # Backpropagate loss through Q_online
806
+
807
+ loss = self.update_Q_online(td_est, td_tgt)
808
+
809
+ return (td_est.mean().item(), loss)
810
+
811
+
812
+ def save(self):
813
+ save_path = self.save_dir / f"airstriker_net_{int(self.curr_step // self.save_every)}.chkpt"
814
+ torch.save(
815
+ dict(
816
+ model=self.net.state_dict(),
817
+ exploration_rate=self.exploration_rate,
818
+ replay_memory=self.memory
819
+ ),
820
+ save_path
821
+ )
822
+
823
+ print(f"Airstriker model saved to {save_path} at step {self.curr_step}")
824
+
825
+
826
+ def load(self, load_path, reset_exploration_rate, load_replay_buffer):
827
+ if not load_path.exists():
828
+ raise ValueError(f"{load_path} does not exist")
829
+
830
+ ckp = torch.load(load_path, map_location=('cuda' if self.use_cuda else 'cpu'))
831
+ exploration_rate = ckp.get('exploration_rate')
832
+ state_dict = ckp.get('model')
833
+
834
+
835
+ print(f"Loading model at {load_path} with exploration rate {exploration_rate}")
836
+ self.net.load_state_dict(state_dict)
837
+
838
+ if load_replay_buffer:
839
+ replay_memory = ckp.get('replay_memory')
840
+ print(f"Loading replay memory. Len {len(replay_memory)}" if replay_memory else "Saved replay memory not found. Not restoring replay memory.")
841
+ self.memory = replay_memory if replay_memory else self.memory
842
+
843
+ if reset_exploration_rate:
844
+ print(f"Reset exploration rate option specified. Not restoring saved exploration rate {exploration_rate}. The current exploration rate is {self.exploration_rate}")
845
+ else:
846
+ print(f"Restoring saved exploration rate {exploration_rate}.")
847
+ self.exploration_rate = exploration_rate
848
+
849
+
lunar_lander.py ADDED
@@ -0,0 +1,332 @@
1
+ # Copyright 2022 The HuggingFace Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # File inspired by source: https://github.com/openai/gym/blob/master/gym/envs/box2d/lunar_lander.py
16
+
17
+ import argparse
18
+ import time
19
+ import os
20
+ import numpy as np
21
+
22
+ import simulate as sm
24
+ from pathlib import Path
25
+ from agent import DuelingDQNAgent, MetricLogger
26
+ from params import hyperparams
27
+
28
+ # This example reimplements the famous lunar lander reinforcement learning environment.
29
+
30
+ # CONSTANTS From source
31
+ # TODO implement scaling
32
+ SCALE = 30.0 # affects how fast-paced the game is, forces should be adjusted as well
33
+
34
+ # TODO integrate random initial forces
35
+ INITIAL_RANDOM = 1000.0 # Set 1500 to make game harder
36
+
37
+ # Lander construction
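+ # Hull vertices are reversed with [::-1] so the polygon normal faces the right direction (same trick as LAND_POLY below) and divided by SCALE into world units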
38
+ LANDER_POLY = np.array([(-17, -10, 0), (-17, 0, 0), (-14, 17, 0), (14, 17, 0), (17, 0, 0), (17, -10, 0)])[::-1] / SCALE
39
+ LEG_AWAY = 20
40
+ LEG_DOWN = -7
41
+ LEG_ANGLE = 0.25 # radians
42
+ LEG_W, LEG_H = 2, 8
43
+
44
+ LEG_RIGHT_POLY = (
45
+ np.array(
46
+ [
47
+ (LEG_AWAY, LEG_DOWN, 0),
48
+ (LEG_AWAY + LEG_H * np.sin(LEG_ANGLE), LEG_DOWN - LEG_H * np.cos(LEG_ANGLE), 0),
49
+ (
50
+ LEG_AWAY + LEG_H * np.sin(LEG_ANGLE) + LEG_W * np.sin(np.pi / 2 - LEG_ANGLE),
51
+ LEG_DOWN - LEG_H * np.cos(LEG_ANGLE) + LEG_W * np.cos(np.pi / 2 - LEG_ANGLE),
52
+ 0,
53
+ ),
54
+ (LEG_AWAY + LEG_W * np.sin(np.pi / 2 - LEG_ANGLE), LEG_DOWN + LEG_W * np.cos(np.pi / 2 - LEG_ANGLE), 0),
55
+ ]
56
+ )
57
+ / SCALE
58
+ )
59
+
60
+ LEG_LEFT_POLY = [[-x, y, z] for x, y, z in LEG_RIGHT_POLY][::-1]
61
+ LANDER_COLOR = [128 / 255, 102 / 255, 230 / 255]
62
+
63
+ # terrain construction
64
+ VIEWPORT_W = 600 # TODO integrate camera with these exact dimensions
65
+ VIEWPORT_H = 400
66
+
67
+ W = VIEWPORT_W / SCALE
68
+ H = VIEWPORT_H / SCALE
69
+
70
+ CHUNKS = 11
71
+ HEIGHTS = np.random.uniform(0, H / 2, size=(CHUNKS + 1,))
72
+ CHUNK_X = [W / (CHUNKS - 1) * i for i in range(CHUNKS)]
73
+ HELIPAD_x1 = CHUNK_X[CHUNKS // 2 - 1]
74
+ HELIPAD_x2 = CHUNK_X[CHUNKS // 2 + 1]
75
+ HELIPAD_y = H / 4
76
+ HEIGHTS[CHUNKS // 2 - 2] = HELIPAD_y
77
+ HEIGHTS[CHUNKS // 2 - 1] = HELIPAD_y
78
+ HEIGHTS[CHUNKS // 2 + 0] = HELIPAD_y
79
+ HEIGHTS[CHUNKS // 2 + 1] = HELIPAD_y
80
+ HEIGHTS[CHUNKS // 2 + 2] = HELIPAD_y
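+ # smooth the random terrain with a 3-point moving average so adjacent chunks join gradually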
81
+ SMOOTH_Y = [0.33 * (HEIGHTS[i - 1] + HEIGHTS[i + 0] + HEIGHTS[i + 1]) for i in range(CHUNKS)]
82
+
83
+ # advanced features
84
+ MAIN_ENGINE_POWER = 13.0 # TODO integrate specific forces
85
+ SIDE_ENGINE_POWER = 0.6 # TODO integrate specific forces
86
+ LEG_SPRING_TORQUE = 40 # TODO integrate specific forces
87
+ SIDE_ENGINE_HEIGHT = 14.0 # TODO integrate specific forces
88
+ SIDE_ENGINE_AWAY = 12.0 # TODO integrate specific forces
89
+
90
+ LAND_POLY = (
91
+ [[CHUNK_X[0], SMOOTH_Y[0] - 3, 0]]
92
+ + [[x, y, 0] for x, y in zip(CHUNK_X, SMOOTH_Y)]
93
+ + [[CHUNK_X[-1], SMOOTH_Y[0] - 3, 0]]
94
+ )
95
+
96
+
97
+ def make_lander(engine="unity", engine_exe=""):
98
+ # Add sm scene
99
+ sc = sm.Scene(engine=engine, engine_exe=engine_exe)
100
+
101
+ # initial lander position sampling
102
+ lander_init_pos = (10, 15, 0) + np.random.uniform(2, 4, 3)
103
+ lander_init_pos[2] = 0.0 # z axis is always 0, for 2D
104
+
105
+ lander_material = sm.Material(base_color=LANDER_COLOR)
106
+
107
+ # create the lander polygons
108
+
109
+ # first, the main lander body
110
+ lander = sm.Polygon(
111
+ points=LANDER_POLY,
112
+ material=lander_material,
113
+ position=lander_init_pos,
114
+ name="lunar_lander",
115
+ is_actor=True,
116
+ physics_component=sm.RigidBodyComponent(
117
+ use_gravity=True,
118
+ constraints=["freeze_rotation_x", "freeze_rotation_y", "freeze_position_z"],
119
+ mass=1,
120
+ ),
121
+ )
122
+
123
+ # extrude to make 3D visually.
124
+ lander.mesh.extrude((0, 0, -1), capping=True, inplace=True)
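+ # Three discrete actions mapped to forces on the lander: +x and -x with amplitude 5, and +y with amplitude 2.5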
125
+ lander.actuator = sm.Actuator(
126
+ mapping=[
127
+ sm.ActionMapping("add_force", axis=[1, 0, 0], amplitude=5),
128
+ sm.ActionMapping("add_force", axis=[1, 0, 0], amplitude=-5),
129
+ sm.ActionMapping("add_force", axis=[0, 1, 0], amplitude=2.5),
130
+ ],
131
+ n=3,
132
+ )
133
+
134
+ # add an invisible box as collider until convex meshes are completed
135
+ lander += sm.Box(
136
+ position=[0, np.min(LEG_RIGHT_POLY, axis=0)[1], -0.5],
137
+ bounds=[0.1, 2 * np.max(LEG_RIGHT_POLY, axis=0)[0], 1],
138
+ material=sm.Material.TRANSPARENT,
139
+ rotation=[0, 0, 90],
140
+ with_collider=True,
141
+ name="lander_collider_box_bottom",
142
+ )
143
+ lander += sm.Box(
144
+ position=[-0.6, 0, -0.5],
145
+ bounds=[0.1, 26 / SCALE, 1],
146
+ material=sm.Material.TRANSPARENT,
147
+ rotation=[0, 0, -15],
148
+ with_collider=True,
149
+ name="lander_collider_box_right",
150
+ )
151
+ lander += sm.Box(
152
+ position=[0.6, 0, -0.5],
153
+ bounds=[0.1, 26 / SCALE, 1],
154
+ material=sm.Material.TRANSPARENT,
155
+ rotation=[0, 0, 15],
156
+ with_collider=True,
157
+ name="lander_collider_box_left",
158
+ )
159
+
160
+ # add legs as children objects (they take positions as local coordinates!)
161
+ r_leg = sm.Polygon(
162
+ points=LEG_RIGHT_POLY,
163
+ material=lander_material,
164
+ parent=lander,
165
+ name="lander_r_leg",
166
+ # with_collider=True, # TODO can use this when convex colliders is added
167
+ )
168
+ r_leg.mesh.extrude((0, 0, -1), capping=True, inplace=True)
169
+
170
+ l_leg = sm.Polygon(
171
+ points=LEG_LEFT_POLY,
172
+ material=lander_material,
173
+ parent=lander,
174
+ name="lander_l_leg",
175
+ # with_collider=True, # TODO can use this when convex colliders is added
176
+ )
177
+ l_leg.mesh.extrude((0, 0, -1), capping=True, inplace=True)
178
+
179
+ # Create land object
180
+ land = sm.Polygon(
181
+ points=LAND_POLY[::-1], # Reversing vertex order so the normal faces the right direction
182
+ material=sm.Material.GRAY,
183
+ name="Moon",
184
+ )
185
+ land.mesh.extrude((0, 0, -1), capping=True, inplace=True)
186
+
187
+ # Create collider blocks for the land (non-convex meshes are TODO)
188
+ for i in range(len(CHUNK_X) - 1):
189
+ x1, x2 = CHUNK_X[i], CHUNK_X[i + 1]
190
+ y1, y2 = SMOOTH_Y[i], SMOOTH_Y[i + 1]
191
+
192
+ # compute rotation from generated coordinates
193
+ rotation = [0, 0, +90 + np.degrees(np.arctan2(y2 - (y1 + y2) / 2, (x2 - x1) / 2))]
194
+ block_i = sm.Box(
195
+ position=[(x1 + x2) / 2, (y1 + y2) / 2, -0.5],
196
+ bounds=[0.2, 1.025 * np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2), 1], # adjustment for better colliders
197
+ material=sm.Material.GRAY,
198
+ rotation=rotation,
199
+ with_collider=True,
200
+ name="land_collider_" + str(i),
201
+ )
202
+ sc += block_i
203
+
204
+ # add target triangle / cone for reward
205
+ sc += sm.Cone(
206
+ position=[(HELIPAD_x1 + HELIPAD_x2) / 2, HELIPAD_y, -0.5],
207
+ height=10 / SCALE,
208
+ radius=10 / SCALE,
209
+ material=sm.Material.YELLOW,
210
+ name="target",
211
+ )
212
+
213
+ # TODO add lander state sensors for state-based RL
214
+ sc += sm.StateSensor(
215
+ target_entity=sc.target,
216
+ reference_entity=lander,
217
+ properties=["position", "rotation", "distance"],
218
+ name="goal_sense",
219
+ )
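+ # position (3) + rotation (3) + distance (1) gives a 7-dimensional observation, matching state_dim=7 passed to the agent below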
220
+
221
+ # create Euclidean distance reward, scalar changes the reward to a cost
222
+ cost = sm.RewardFunction(
223
+ type="dense", entity_a=lander, entity_b=sc.target, scalar=-1
224
+ ) # By default a dense reward equal to the distance between 2 entities
225
+ lander += cost
226
+
227
+ sc += lander
228
+ sc += land
229
+
230
+ return sc
231
+
232
+
233
+ def get_values(state):
234
+ return state.get("StateSensor")
235
+
236
+ def train(agent, env, logger):
237
+ episodes = 20000
238
+ for e in range(episodes):
239
+
240
+ state = env.reset()
241
+ # Play the game!
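+ # each episode is capped at 100 environment steps and ends early when the environment returns done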
242
+ for i in range(100):
243
+
244
+ # Run agent on the state
245
+ action = agent.act(get_values(state))
246
+ # env.render()
247
+ # Agent performs action
248
+ next_state, reward, done, info = env.step(action)
249
+
253
+
254
+ # Remember
255
+ agent.cache(get_values(state), get_values(next_state), action, reward, done)
256
+
257
+ # Learn
258
+ q, loss = agent.learn()
259
+
260
+ # Logging
261
+ logger.log_step(reward, loss, q)
262
+
263
+ # Update state
264
+ state = next_state
265
+
266
+ # Check if end of game
267
+ if done:
268
+ break
269
+
270
+ logger.log_episode(e)
271
+
272
+ if e % 20 == 0:
273
+ logger.record(episode=e, epsilon=agent.exploration_rate, step=agent.curr_step)
274
+
275
+
276
+ if __name__ == "__main__":
277
+ parser = argparse.ArgumentParser()
278
+ parser.add_argument("--build_exe", default="", type=str, required=False, help="Pre-built unity app for simulate")
279
+ parser.add_argument(
280
+ "--num_steps", default=100, type=int, required=False, help="number of steps to run the simulator"
281
+ )
282
+ args = parser.parse_args()
283
+
284
+ sc = make_lander(engine="unity", engine_exe=args.build_exe)
285
+ sc += sm.LightSun()
286
+
287
+ env = sm.RLEnv(sc, frame_skip=1)
288
+ env.reset()
289
+
290
+ # for i in range(500):
291
+ # print(sc.observation_space.sample())
292
+ # action = [sc.action_space.sample()]
293
+ # print("###############")
294
+ # print(action)
295
+ # obs, reward, done, info = env.step(action)
296
+ # print(obs)
297
+ # print(f"step {i}, reward {reward[0]}")
298
+ # time.sleep(0.1)
299
+
300
+ # env.close()
301
+
302
+ checkpoint = None
303
+ # checkpoint = Path('checkpoints/latest/airstriker_net_3.chkpt')
304
+
305
+ path = "checkpoints/lunar-lander-dueling-dqn-rc"
306
+ save_dir = Path(path)
307
+
308
+ os.makedirs(path, exist_ok=True)
311
+
312
+ logger = MetricLogger(save_dir)
313
+
314
+ print("Training Dueling DQN Agent with step decay!")
315
+ agent = DuelingDQNAgent(
316
+ state_dim=7,
317
+ action_dim=env.action_space.n,
318
+ save_dir=save_dir,
319
+ checkpoint=checkpoint,
320
+ **hyperparams
321
+ )
322
+ # print("Training Dueling DQN Agent!")
323
+ # agent = DuelingDQNAgent(
324
+ # state_dim=8,
325
+ # action_dim=env.action_space.n,
326
+ # save_dir=save_dir,
327
+ # checkpoint=checkpoint,
328
+ # **hyperparams
329
+ # )
330
+
331
+ # fill_memory(agent, env, 5000)
332
+ train(agent, env, logger)
params.py ADDED
@@ -0,0 +1,12 @@
1
+ hyperparams = dict(
2
+ batch_size=128,
3
+ exploration_rate=1,
4
+ exploration_rate_decay=0.99999,
5
+ exploration_rate_min=0.01,
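+ # at a multiplicative decay of 0.99999 per step, epsilon falls from 1.0 to the 0.01 floor after roughly ln(0.01)/ln(0.99999), about 460k steps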
6
+ training_frequency=1,
7
+ target_network_sync_frequency=20,
8
+ max_memory_size=1000000,
9
+ learning_rate=0.001,
10
+ learning_starts=128,
11
+ save_frequency=100000
12
+ )