asataura commited on
Commit
7439a65
·
1 Parent(s): e508771

removing all irrelevant files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. DDQN_FNN.py +103 -0
  2. agents/Base_Agent.py +0 -394
  3. agents/DQN_agents/DDQN.py +0 -18
  4. agents/DQN_agents/DDQN_With_Prioritised_Experience_Replay.py +0 -37
  5. agents/DQN_agents/DQN.py +0 -135
  6. agents/DQN_agents/DQN_HER.py +0 -30
  7. agents/DQN_agents/DQN_With_Fixed_Q_Targets.py +0 -23
  8. agents/DQN_agents/Dueling_DDQN.py +0 -64
  9. agents/DQN_agents/__init__.py +0 -1
  10. agents/DQN_agents/__pycache__/DDQN.cpython-310.pyc +0 -0
  11. agents/DQN_agents/__pycache__/DDQN.cpython-38.pyc +0 -0
  12. agents/DQN_agents/__pycache__/DDQN.cpython-39.pyc +0 -0
  13. agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-310.pyc +0 -0
  14. agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-39.pyc +0 -0
  15. agents/DQN_agents/__pycache__/DQN.cpython-310.pyc +0 -0
  16. agents/DQN_agents/__pycache__/DQN.cpython-39.pyc +0 -0
  17. agents/DQN_agents/__pycache__/DQN_HER.cpython-310.pyc +0 -0
  18. agents/DQN_agents/__pycache__/DQN_HER.cpython-39.pyc +0 -0
  19. agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-310.pyc +0 -0
  20. agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-38.pyc +0 -0
  21. agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-39.pyc +0 -0
  22. agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-310.pyc +0 -0
  23. agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-39.pyc +0 -0
  24. agents/DQN_agents/__pycache__/__init__.cpython-310.pyc +0 -0
  25. agents/DQN_agents/__pycache__/__init__.cpython-38.pyc +0 -0
  26. agents/DQN_agents/__pycache__/__init__.cpython-39.pyc +0 -0
  27. agents/HER_Base.py +0 -100
  28. agents/Trainer.py +0 -304
  29. agents/__init__.py +0 -1
  30. agents/__pycache__/Base_Agent.cpython-310.pyc +0 -0
  31. agents/__pycache__/Base_Agent.cpython-38.pyc +0 -0
  32. agents/__pycache__/Base_Agent.cpython-39.pyc +0 -0
  33. agents/__pycache__/HER_Base.cpython-310.pyc +0 -0
  34. agents/__pycache__/HER_Base.cpython-39.pyc +0 -0
  35. agents/__pycache__/Trainer.cpython-310.pyc +0 -0
  36. agents/__pycache__/Trainer.cpython-39.pyc +0 -0
  37. agents/__pycache__/__init__.cpython-310.pyc +0 -0
  38. agents/__pycache__/__init__.cpython-38.pyc +0 -0
  39. agents/__pycache__/__init__.cpython-39.pyc +0 -0
  40. agents/actor_critic_agents/A2C.py +0 -25
  41. agents/actor_critic_agents/A3C.py +0 -229
  42. agents/actor_critic_agents/DDPG.py +0 -115
  43. agents/actor_critic_agents/DDPG_HER.py +0 -38
  44. agents/actor_critic_agents/SAC.py +0 -211
  45. agents/actor_critic_agents/SAC_Discrete.py +0 -94
  46. agents/actor_critic_agents/TD3.py +0 -54
  47. agents/actor_critic_agents/__pycache__/A2C.cpython-39.pyc +0 -0
  48. agents/actor_critic_agents/__pycache__/A3C.cpython-39.pyc +0 -0
  49. agents/actor_critic_agents/__pycache__/DDPG.cpython-39.pyc +0 -0
  50. agents/actor_critic_agents/__pycache__/SAC.cpython-310.pyc +0 -0
DDQN_FNN.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from collections import deque
3
+ from tensorflow import keras
4
+ import random
5
+
6
+
7
+ class DoubleDeepQNetwork:
8
+ def __init__(self, states, actions, history, alpha, gamma, epsilon, epsilon_min, epsilon_decay):
9
+ self.nS = states
10
+ self.nA = actions
11
+ self.history = history
12
+ self.memory = deque([], maxlen=2500)
13
+ self.alpha = alpha
14
+ self.gamma = gamma
15
+ # Explore/Exploit
16
+ self.epsilon = epsilon
17
+ self.epsilon_min = epsilon_min
18
+ self.epsilon_decay = epsilon_decay
19
+ self.model = self.build_model()
20
+ self.model_target = self.build_model() # Second (target) neural network
21
+ self.update_target_from_model() # Update weights
22
+ self.loss = []
23
+
24
+ def build_model(self):
25
+ model = keras.Sequential() # linear stack of layers https://keras.io/models/sequential/
26
+ model.add(keras.layers.Dense(24, input_dim=self.history * self.nS, activation='relu')) # [Input] -> Layer 1
27
+ # Dense: Densely connected layer https://keras.io/layers/core/
28
+ # 24: Number of neurons
29
+ # input_dim: Number of input variables
30
+ # activation: Rectified Linear Unit (relu) ranges >= 0
31
+ model.add(keras.layers.Dense(24, activation='relu')) # Layer 2 -> 3
32
+ model.add(keras.layers.Dense(self.nA, activation='linear')) # Layer 3 -> [output]
33
+ # Size has to match the output (different actions)
34
+ # Linear activation on the last layer
35
+ model.compile(loss='mean_squared_error', # Loss function: Mean Squared Error
36
+ optimizer=keras.optimizers.Adam(
37
+ lr=self.alpha)) # Optimizer: Adam (Feel free to check other options)
38
+ return model
39
+
40
+ def update_target_from_model(self):
41
+ # Update the target model from the base model
42
+ self.model_target.set_weights(self.model.get_weights())
43
+
44
+ def action(self, state):
45
+ if np.random.rand() <= self.epsilon:
46
+ return random.randrange(self.nA) # Explore
47
+ action_vals = self.model.predict(state) # Exploit: Use the NN to predict the correct action from this state
48
+ return np.argmax(action_vals[0])
49
+
50
+ def test_action(self, state): # Exploit
51
+ action_vals = self.model.predict(state)
52
+ return np.argmax(action_vals[0])
53
+
54
+ def store(self, state, action, reward, next_state, done):
55
+ # Store the experience in memory
56
+ self.memory.append((state, action, reward, next_state, done))
57
+
58
+ def save_model(self, agentName):
59
+ # Save the agent model weights in a file
60
+ self.model.save(agentName)
61
+
62
+ def experience_replay(self, batch_size):
63
+ # Execute the experience replay
64
+ minibatch = random.sample(self.memory, batch_size) # Randomly sample from memory
65
+
66
+ # Convert to numpy for speed by vectorization
67
+ x = []
68
+ y = []
69
+ np_array = np.array(minibatch)
70
+ st = np.zeros((0, self.history*self.nS)) # States
71
+ nst = np.zeros((0, self.history*self.nS)) # Next States
72
+ for i in range(len(np_array)): # Creating the state and next state np arrays
73
+ st = np.append(st, np_array[i, 0], axis=0)
74
+ nst = np.append(nst, np_array[i, 3], axis=0)
75
+ st_predict = self.model.predict(st) # Here is the speedup! I can predict on the ENTIRE batch
76
+ nst_predict = self.model.predict(nst)
77
+ nst_predict_target = self.model_target.predict(nst) # Predict from the TARGET
78
+ index = 0
79
+ for state, action, reward, next_state, done in minibatch:
80
+ x.append(state)
81
+ # Predict from state
82
+ nst_action_predict_target = nst_predict_target[index]
83
+ nst_action_predict_model = nst_predict[index]
84
+ if done: # Terminal: Just assign reward much like {* (not done) - QB[state][action]}
85
+ target = reward
86
+ else: # Non-terminal
87
+ target = reward + self.gamma * nst_action_predict_target[
88
+ np.argmax(nst_action_predict_model)] # Using Q to get T is Double DQN
89
+ target_f = st_predict[index]
90
+ target_f[action] = target
91
+ y.append(target_f)
92
+ index += 1
93
+ # Reshape for Keras Fit
94
+ x_reshape = np.array(x).reshape(batch_size, self.history * self.nS)
95
+ y_reshape = np.array(y)
96
+ epoch_count = 1
97
+ hist = self.model.fit(x_reshape, y_reshape, epochs=epoch_count, verbose=0)
98
+ # Graph Losses
99
+ for i in range(epoch_count):
100
+ self.loss.append(hist.history['loss'][i])
101
+ # Decay Epsilon
102
+ if self.epsilon > self.epsilon_min:
103
+ self.epsilon *= self.epsilon_decay
agents/Base_Agent.py DELETED
@@ -1,394 +0,0 @@
1
- import logging
2
- import os
3
- import sys
4
- import gym
5
- import random
6
- import numpy as np
7
- import torch
8
- import time
9
- # import tensorflow as tf
10
- from nn_builder.pytorch.NN import NN
11
- # from tensorboardX import SummaryWriter
12
- from torch.optim import optimizer
13
-
14
-
15
- class Base_Agent(object):
16
-
17
- def __init__(self, config):
18
- self.logger = self.setup_logger()
19
- self.debug_mode = config.debug_mode
20
- # if self.debug_mode: self.tensorboard = SummaryWriter()
21
- self.config = config
22
- self.set_random_seeds(config.seed)
23
- self.environment = config.environment
24
- self.environment_title = self.get_environment_title()
25
- self.action_types = "DISCRETE" if self.environment.action_space.dtype == np.int64 else "CONTINUOUS"
26
- self.action_size = int(self.get_action_size())
27
- self.config.action_size = self.action_size
28
-
29
- self.lowest_possible_episode_score = self.get_lowest_possible_episode_score()
30
-
31
- self.state_size = int(self.get_state_size())
32
- self.hyperparameters = config.hyperparameters
33
- self.average_score_required_to_win = self.get_score_required_to_win()
34
- self.rolling_score_window = self.get_trials()
35
- # self.max_steps_per_episode = self.environment.spec.max_episode_steps
36
- self.total_episode_score_so_far = 0
37
- self.game_full_episode_scores = []
38
- self.game_full_episode_signals = []
39
- self.rolling_results = []
40
- self.max_rolling_score_seen = float("-inf")
41
- self.max_episode_score_seen = float("-inf")
42
- self.episode_number = 0
43
- self.device = "cuda:0" if config.use_GPU else "cpu"
44
- self.visualise_results_boolean = config.visualise_individual_results
45
- self.global_step_number = 0
46
- self.turn_off_exploration = False if config.training else True
47
- gym.logger.set_level(40) # stops it from printing an unnecessary warning
48
- self.log_game_info()
49
-
50
- def step(self):
51
- """Takes a step in the game. This method must be overriden by any agent"""
52
- raise ValueError("Step needs to be implemented by the agent")
53
-
54
- def get_environment_title(self):
55
- """Extracts name of environment from it"""
56
- try:
57
- name = self.environment.unwrapped.id
58
- except AttributeError:
59
- try:
60
- if str(self.environment.unwrapped)[1:11] == "FetchReach":
61
- return "FetchReach"
62
- elif str(self.environment.unwrapped)[1:8] == "AntMaze":
63
- return "AntMaze"
64
- elif str(self.environment.unwrapped)[1:7] == "Hopper":
65
- return "Hopper"
66
- elif str(self.environment.unwrapped)[1:9] == "Walker2d":
67
- return "Walker2d"
68
- else:
69
- name = self.environment.spec.id.split("-")[0]
70
- except AttributeError:
71
- name = str(self.environment.env)
72
- if name[0:10] == "TimeLimit<": name = name[10:]
73
- name = name.split(" ")[0]
74
- if name[0] == "<": name = name[1:]
75
- if name[-3:] == "Env": name = name[:-3]
76
- return name
77
-
78
- def get_lowest_possible_episode_score(self):
79
- """Returns the lowest possible episode score you can get in an environment"""
80
- if self.environment_title == "Taxi": return -800
81
- return None
82
-
83
- def get_action_size(self):
84
- """Gets the action_size for the gym env into the correct shape for a neural network"""
85
- if "overwrite_action_size" in self.config.__dict__: return self.config.overwrite_action_size
86
- if "action_size" in self.environment.__dict__: return self.environment.action_size
87
- if self.action_types == "DISCRETE":
88
- return self.environment.action_space.n
89
- else:
90
- return self.environment.action_space.shape[0]
91
-
92
- def get_state_size(self):
93
- """Gets the state_size for the gym env into the correct shape for a neural network"""
94
- random_state = self.environment.reset()
95
- if isinstance(random_state, dict):
96
- state_size = random_state["observation"].shape[0] + random_state["desired_goal"].shape[0]
97
- return state_size
98
- else:
99
- return random_state.size
100
-
101
- def get_score_required_to_win(self):
102
- """Gets average score required to win game"""
103
- print("TITLE ", self.environment_title)
104
- if self.environment_title == "FetchReach": return -5
105
- if self.environment_title in ["AntMaze", "Hopper", "Walker2d"]:
106
- print("Score required to win set to infinity therefore no learning rate annealing will happen")
107
- return float("inf")
108
- try:
109
- return self.environment.unwrapped.reward_threshold
110
- except AttributeError:
111
- try:
112
- return self.environment.spec.reward_threshold
113
- except AttributeError:
114
- return self.environment.unwrapped.spec.reward_threshold
115
-
116
- def get_trials(self):
117
- """Gets the number of trials to average a score over"""
118
- if self.environment_title in ["AntMaze", "FetchReach", "Hopper", "Walker2d", "CartPole"]: return 100
119
- try:
120
- return self.environment.unwrapped.trials
121
- except AttributeError:
122
- return self.environment.spec.trials
123
-
124
- def setup_logger(self):
125
- """Sets up the logger"""
126
- filename = "Training.log"
127
- try:
128
- if os.path.isfile(filename):
129
- os.remove(filename)
130
- except:
131
- pass
132
-
133
- logger = logging.getLogger(__name__)
134
- logger.setLevel(logging.INFO)
135
- # create a file handler
136
- handler = logging.FileHandler(filename)
137
- handler.setLevel(logging.INFO)
138
- # create a logging format
139
- formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
140
- handler.setFormatter(formatter)
141
- # add the handlers to the logger
142
- logger.addHandler(handler)
143
- return logger
144
-
145
- def log_game_info(self):
146
- """Logs info relating to the game"""
147
- for ix, param in enumerate(
148
- [self.environment_title, self.action_types, self.action_size, self.lowest_possible_episode_score,
149
- self.state_size, self.hyperparameters, self.average_score_required_to_win, self.rolling_score_window,
150
- self.device]):
151
- self.logger.info("{} -- {}".format(ix, param))
152
-
153
- def set_random_seeds(self, random_seed):
154
- """Sets all possible random seeds so results can be reproduced"""
155
- os.environ['PYTHONHASHSEED'] = str(random_seed)
156
- torch.backends.cudnn.deterministic = True
157
- torch.backends.cudnn.benchmark = False
158
- torch.manual_seed(random_seed)
159
- # tf.set_random_seed(random_seed)
160
- random.seed(random_seed)
161
- np.random.seed(random_seed)
162
- if torch.cuda.is_available():
163
- torch.cuda.manual_seed_all(random_seed)
164
- torch.cuda.manual_seed(random_seed)
165
- if hasattr(gym.spaces, 'prng'):
166
- gym.spaces.prng.seed(random_seed)
167
-
168
- def reset_game(self):
169
- """Resets the game information so we are ready to play a new episode"""
170
- self.environment.seed(self.config.seed)
171
- self.state = self.environment.reset()
172
- self.next_state = None
173
- self.action = None
174
- self.reward = None
175
- self.signal = None
176
- self.done = False
177
- self.total_episode_score_so_far = 0
178
- self.total_episode_signal_so_far = 0
179
- self.episode_states = []
180
- self.episode_rewards = []
181
- self.episode_signals = []
182
- self.episode_actions = []
183
- self.episode_next_states = []
184
- self.episode_dones = []
185
- self.episode_desired_goals = []
186
- self.episode_achieved_goals = []
187
- self.episode_observations = []
188
- if "exploration_strategy" in self.__dict__.keys(): self.exploration_strategy.reset()
189
- self.logger.info("Reseting game -- New start state {}".format(self.state))
190
-
191
- def track_episodes_data(self):
192
- """Saves the data from the recent episodes"""
193
- self.episode_states.append(self.state)
194
- self.episode_actions.append(self.action)
195
- self.episode_rewards.append(self.reward)
196
- self.episode_signals.append(self.signal)
197
- self.episode_next_states.append(self.next_state)
198
- self.episode_dones.append(self.done)
199
-
200
- def run_n_episodes(self, num_episodes=None, show_whether_achieved_goal=True, save_and_print_results=True):
201
- """Runs game to completion n times and then summarises results and saves model (if asked to)"""
202
- if num_episodes is None: num_episodes = self.config.num_episodes_to_run
203
- start = time.time()
204
- while self.episode_number < num_episodes:
205
- self.reset_game()
206
- self.step()
207
- if save_and_print_results: self.save_and_print_result()
208
- time_taken = time.time() - start
209
- if show_whether_achieved_goal: self.show_whether_achieved_goal()
210
- if self.config.save_model: self.locally_save_policy()
211
- return self.game_full_episode_scores, self.rolling_results, time_taken, self.game_full_episode_signals
212
-
213
- def conduct_action(self, action):
214
- """Conducts an action in the environment"""
215
- self.next_state, self.reward, self.done, self.signal = self.environment.step(action)
216
- self.total_episode_score_so_far += self.reward
217
- self.total_episode_signal_so_far += self.signal
218
- if self.hyperparameters["clip_rewards"]: self.reward = max(min(self.reward, 1.0), -1.0)
219
-
220
- def save_and_print_result(self):
221
- """Saves and prints results of the game"""
222
- self.save_result()
223
- self.print_rolling_result()
224
-
225
- def save_result(self):
226
- """Saves the result of an episode of the game"""
227
- self.game_full_episode_scores.append(self.total_episode_score_so_far)
228
- self.game_full_episode_signals.append(self.total_episode_signal_so_far)
229
- self.rolling_results.append(np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:]))
230
- self.save_max_result_seen()
231
-
232
- def save_max_result_seen(self):
233
- """Updates the best episode result seen so far"""
234
- if self.game_full_episode_scores[-1] > self.max_episode_score_seen:
235
- self.max_episode_score_seen = self.game_full_episode_scores[-1]
236
-
237
- if self.rolling_results[-1] > self.max_rolling_score_seen:
238
- if len(self.rolling_results) > self.rolling_score_window:
239
- self.max_rolling_score_seen = self.rolling_results[-1]
240
-
241
- def print_rolling_result(self):
242
- """Prints out the latest episode results"""
243
- text = """"\r Episode {0}, Score: {3: .2f}, Max score seen: {4: .2f}, Rolling score: {1: .2f}, Max rolling score seen: {2: .2f}"""
244
- sys.stdout.write(
245
- text.format(len(self.game_full_episode_scores), self.rolling_results[-1], self.max_rolling_score_seen,
246
- self.game_full_episode_scores[-1], self.max_episode_score_seen))
247
- sys.stdout.flush()
248
-
249
- def show_whether_achieved_goal(self):
250
- """Prints out whether the agent achieved the environment target goal"""
251
- index_achieved_goal = self.achieved_required_score_at_index()
252
- print(" ")
253
- if index_achieved_goal == -1: # this means agent never achieved goal
254
- print("\033[91m" + "\033[1m" +
255
- "{} did not achieve required score \n".format(self.agent_name) +
256
- "\033[0m" + "\033[0m")
257
- else:
258
- print("\033[92m" + "\033[1m" +
259
- "{} achieved required score at episode {} \n".format(self.agent_name, index_achieved_goal) +
260
- "\033[0m" + "\033[0m")
261
-
262
- def achieved_required_score_at_index(self):
263
- """Returns the episode at which agent achieved goal or -1 if it never achieved it"""
264
- for ix, score in enumerate(self.rolling_results):
265
- if score > self.average_score_required_to_win:
266
- return ix
267
- return -1
268
-
269
- def update_learning_rate(self, starting_lr, optimizer):
270
- """Lowers the learning rate according to how close we are to the solution"""
271
- if len(self.rolling_results) > 0:
272
- last_rolling_score = self.rolling_results[-1]
273
- if last_rolling_score > 0.75 * self.average_score_required_to_win:
274
- new_lr = starting_lr / 100.0
275
- elif last_rolling_score > 0.6 * self.average_score_required_to_win:
276
- new_lr = starting_lr / 20.0
277
- elif last_rolling_score > 0.5 * self.average_score_required_to_win:
278
- new_lr = starting_lr / 10.0
279
- elif last_rolling_score > 0.25 * self.average_score_required_to_win:
280
- new_lr = starting_lr / 2.0
281
- else:
282
- new_lr = starting_lr
283
- for g in optimizer.param_groups:
284
- g['lr'] = new_lr
285
- if random.random() < 0.001: self.logger.info("Learning rate {}".format(new_lr))
286
-
287
- def enough_experiences_to_learn_from(self):
288
- """Boolean indicated whether there are enough experiences in the memory buffer to learn from"""
289
- return len(self.memory) > self.hyperparameters["batch_size"]
290
-
291
- def save_experience(self, memory=None, experience=None):
292
- """Saves the recent experience to the memory buffer"""
293
- if memory is None: memory = self.memory
294
- if experience is None: experience = self.state, self.action, self.reward, self.next_state, self.done
295
- memory.add_experience(*experience)
296
-
297
- def take_optimisation_step(self, optimizer, network, loss, clipping_norm=None, retain_graph=False):
298
- """Takes an optimisation step by calculating gradients given the loss and then updating the parameters"""
299
- if not isinstance(network, list): network = [network]
300
- optimizer.zero_grad() # reset gradients to 0
301
- loss.backward(retain_graph=retain_graph) # this calculates the gradients
302
- self.logger.info("Loss -- {}".format(loss.item()))
303
- if self.debug_mode: self.log_gradient_and_weight_information(network, optimizer)
304
- if clipping_norm is not None:
305
- for net in network:
306
- torch.nn.utils.clip_grad_norm_(net.parameters(),
307
- clipping_norm) # clip gradients to help stabilise training
308
- optimizer.step() # this applies the gradients
309
-
310
- def log_gradient_and_weight_information(self, network, optimizer):
311
-
312
- # log weight information
313
- total_norm = 0
314
- for name, param in network.named_parameters():
315
- param_norm = param.grad.data.norm(2)
316
- total_norm += param_norm.item() ** 2
317
- total_norm = total_norm ** (1. / 2)
318
- self.logger.info("Gradient Norm {}".format(total_norm))
319
-
320
- for g in optimizer.param_groups:
321
- learning_rate = g['lr']
322
- break
323
- self.logger.info("Learning Rate {}".format(learning_rate))
324
-
325
- def soft_update_of_target_network(self, local_model, target_model, tau):
326
- """Updates the target network in the direction of the local network but by taking a step size
327
- less than one so the target network's parameter values trail the local networks. This helps stabilise training"""
328
- for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
329
- target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
330
-
331
- def create_NN(self, input_dim, output_dim, key_to_use=None, override_seed=None, hyperparameters=None):
332
- """Creates a neural network for the agents to use"""
333
- if hyperparameters is None: hyperparameters = self.hyperparameters
334
- if key_to_use: hyperparameters = hyperparameters[key_to_use]
335
- if override_seed:
336
- seed = override_seed
337
- else:
338
- seed = self.config.seed
339
-
340
- default_hyperparameter_choices = {"output_activation": None, "hidden_activations": "relu", "dropout": 0.0,
341
- "initialiser": "default", "batch_norm": False,
342
- "columns_of_data_to_be_embedded": [],
343
- "embedding_dimensions": [], "y_range": ()}
344
-
345
- for key in default_hyperparameter_choices:
346
- if key not in hyperparameters.keys():
347
- hyperparameters[key] = default_hyperparameter_choices[key]
348
-
349
- return NN(input_dim=input_dim, layers_info=hyperparameters["linear_hidden_units"] + [output_dim],
350
- output_activation=hyperparameters["final_layer_activation"],
351
- batch_norm=hyperparameters["batch_norm"], dropout=hyperparameters["dropout"],
352
- hidden_activations=hyperparameters["hidden_activations"], initialiser=hyperparameters["initialiser"],
353
- columns_of_data_to_be_embedded=hyperparameters["columns_of_data_to_be_embedded"],
354
- embedding_dimensions=hyperparameters["embedding_dimensions"], y_range=hyperparameters["y_range"],
355
- random_seed=seed).to(self.device)
356
-
357
- def turn_on_any_epsilon_greedy_exploration(self):
358
- """Turns off all exploration with respect to the epsilon greedy exploration strategy"""
359
- print("Turning on epsilon greedy exploration")
360
- self.turn_off_exploration = False
361
-
362
- def turn_off_any_epsilon_greedy_exploration(self):
363
- """Turns off all exploration with respect to the epsilon greedy exploration strategy"""
364
- print("Turning off epsilon greedy exploration")
365
- self.turn_off_exploration = True
366
-
367
- def freeze_all_but_output_layers(self, network):
368
- """Freezes all layers except the output layer of a network"""
369
- print("Freezing hidden layers")
370
- for param in network.named_parameters():
371
- param_name = param[0]
372
- assert "hidden" in param_name or "output" in param_name or "embedding" in param_name, "Name {} of network layers not understood".format(
373
- param_name)
374
- if "output" not in param_name:
375
- param[1].requires_grad = False
376
-
377
- def unfreeze_all_layers(self, network):
378
- """Unfreezes all layers of a network"""
379
- print("Unfreezing all layers")
380
- for param in network.parameters():
381
- param.requires_grad = True
382
-
383
- @staticmethod
384
- def move_gradients_one_model_to_another(from_model, to_model, set_from_gradients_to_zero=False):
385
- """Copies gradients from from_model to to_model"""
386
- for from_model, to_model in zip(from_model.parameters(), to_model.parameters()):
387
- to_model._grad = from_model.grad.clone()
388
- if set_from_gradients_to_zero: from_model._grad = None
389
-
390
- @staticmethod
391
- def copy_model_over(from_model, to_model):
392
- """Copies model parameters from from_model to to_model"""
393
- for to_model, from_model in zip(to_model.parameters(), from_model.parameters()):
394
- to_model.data.copy_(from_model.data.clone())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agents/DQN_agents/DDQN.py DELETED
@@ -1,18 +0,0 @@
1
- from agents.DQN_agents.DQN_With_Fixed_Q_Targets import DQN_With_Fixed_Q_Targets
2
-
3
- class DDQN(DQN_With_Fixed_Q_Targets):
4
- """A double DQN agent"""
5
- agent_name = "DDQN"
6
-
7
- def __init__(self, config):
8
- DQN_With_Fixed_Q_Targets.__init__(self, config)
9
-
10
- def compute_q_values_for_next_states(self, next_states):
11
- """Computes the q_values for next state we will use to create the loss to train the Q network. Double DQN
12
- uses the local index to pick the maximum q_value action and then the target network to calculate the q_value.
13
- The reasoning behind this is that it will help stop the network from overestimating q values"""
14
- max_action_indexes = self.q_network_local(next_states).detach().argmax(1)
15
- Q_targets_next = self.q_network_target(next_states).gather(1, max_action_indexes.unsqueeze(1))
16
- return Q_targets_next
17
-
18
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agents/DQN_agents/DDQN_With_Prioritised_Experience_Replay.py DELETED
@@ -1,37 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from agents.DQN_agents.DDQN import DDQN
4
- from utilities.data_structures.Prioritised_Replay_Buffer import Prioritised_Replay_Buffer
5
-
6
- class DDQN_With_Prioritised_Experience_Replay(DDQN):
7
- """A DQN agent with prioritised experience replay"""
8
- agent_name = "DDQN with Prioritised Replay"
9
-
10
- def __init__(self, config):
11
- DDQN.__init__(self, config)
12
- self.memory = Prioritised_Replay_Buffer(self.hyperparameters, config.seed)
13
-
14
- def learn(self):
15
- """Runs a learning iteration for the Q network after sampling from the replay buffer in a prioritised way"""
16
- sampled_experiences, importance_sampling_weights = self.memory.sample()
17
- states, actions, rewards, next_states, dones = sampled_experiences
18
- loss, td_errors = self.compute_loss_and_td_errors(states, next_states, rewards, actions, dones, importance_sampling_weights)
19
- self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss, self.hyperparameters["gradient_clipping_norm"])
20
- self.soft_update_of_target_network(self.q_network_local, self.q_network_target, self.hyperparameters["tau"])
21
- self.memory.update_td_errors(td_errors.squeeze(1))
22
-
23
- def save_experience(self):
24
- """Saves the latest experience including the td_error"""
25
- max_td_error_in_experiences = self.memory.give_max_td_error() + 1e-9
26
- self.memory.add_experience(max_td_error_in_experiences, self.state, self.action, self.reward, self.next_state, self.done)
27
-
28
- def compute_loss_and_td_errors(self, states, next_states, rewards, actions, dones, importance_sampling_weights):
29
- """Calculates the loss for the local Q network. It weighs each observations loss according to the importance
30
- sampling weights which come from the prioritised replay buffer"""
31
- Q_targets = self.compute_q_targets(next_states, rewards, dones)
32
- Q_expected = self.compute_expected_q_values(states, actions)
33
- loss = F.mse_loss(Q_expected, Q_targets)
34
- loss = loss * importance_sampling_weights
35
- loss = torch.mean(loss)
36
- td_errors = Q_targets.data.cpu().numpy() - Q_expected.data.cpu().numpy()
37
- return loss, td_errors
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agents/DQN_agents/DQN.py DELETED
@@ -1,135 +0,0 @@
1
- from collections import Counter
2
-
3
- import torch
4
- import random
5
- import torch.optim as optim
6
- import torch.nn.functional as F
7
- import numpy as np
8
- from agents.Base_Agent import Base_Agent
9
- from exploration_strategies.Epsilon_Greedy_Exploration import Epsilon_Greedy_Exploration
10
- from utilities.data_structures.Replay_Buffer import Replay_Buffer
11
-
12
-
13
- class DQN(Base_Agent):
14
- """A deep Q learning agent"""
15
- agent_name = "DQN"
16
-
17
- def __init__(self, config):
18
- Base_Agent.__init__(self, config)
19
- self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"],
20
- config.seed, self.device)
21
- self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
22
- self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
23
- lr=self.hyperparameters["learning_rate"], eps=1e-4)
24
- self.exploration_strategy = Epsilon_Greedy_Exploration(config)
25
-
26
- def reset_game(self):
27
- super(DQN, self).reset_game()
28
- self.update_learning_rate(self.hyperparameters["learning_rate"], self.q_network_optimizer)
29
-
30
- def step(self):
31
- """Runs a step within a game including a learning step if required"""
32
- while not self.done:
33
- self.action = self.pick_action()
34
- self.conduct_action(self.action)
35
- # If we are in training mode
36
- if self.config.training:
37
- if self.time_for_q_network_to_learn():
38
- for _ in range(self.hyperparameters["learning_iterations"]):
39
- self.learn()
40
- self.save_experience()
41
- self.state = self.next_state # this is to set the state for the next iteration
42
- self.global_step_number += 1
43
- self.episode_number += 1
44
-
45
- def pick_action(self, state=None):
46
- """Uses the local Q network and an epsilon greedy policy to pick an action"""
47
- # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add
48
- # a "fake" dimension to make it a mini-batch rather than a single observation
49
- if state is None: state = self.state
50
- if isinstance(state, np.int64) or isinstance(state, int): state = np.array([state])
51
- state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
52
- if len(state.shape) < 2: state = state.unsqueeze(0)
53
- if not self.config.training:
54
- self.q_network_local = self.locally_load_policy()
55
- self.q_network_local.eval() # puts network in evaluation mode
56
- with torch.no_grad():
57
- action_values = self.q_network_local(state)
58
- if self.config.training:
59
- self.q_network_local.train() # puts network back in training mode
60
- action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action_values": action_values,
61
- "turn_off_exploration": self.turn_off_exploration,
62
- "episode_number": self.episode_number})
63
- self.logger.info("Q values {} -- Action chosen {}".format(action_values, action))
64
- return action
65
-
66
- def learn(self, experiences=None):
67
- """Runs a learning iteration for the Q network"""
68
- if experiences is None:
69
- states, actions, rewards, next_states, dones = self.sample_experiences() # Sample experiences
70
- else:
71
- states, actions, rewards, next_states, dones = experiences
72
- loss = self.compute_loss(states, next_states, rewards, actions, dones)
73
-
74
- actions_list = [action_X.item() for action_X in actions]
75
-
76
- self.logger.info("Action counts {}".format(Counter(actions_list)))
77
- self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss,
78
- self.hyperparameters["gradient_clipping_norm"])
79
-
80
- def compute_loss(self, states, next_states, rewards, actions, dones):
81
- """Computes the loss required to train the Q network"""
82
- with torch.no_grad():
83
- Q_targets = self.compute_q_targets(next_states, rewards, dones)
84
- Q_expected = self.compute_expected_q_values(states, actions)
85
- loss = F.mse_loss(Q_expected, Q_targets)
86
- return loss
87
-
88
- def compute_q_targets(self, next_states, rewards, dones):
89
- """Computes the q_targets we will compare to predicted q values to create the loss to train the Q network"""
90
- Q_targets_next = self.compute_q_values_for_next_states(next_states)
91
- Q_targets = self.compute_q_values_for_current_states(rewards, Q_targets_next, dones)
92
- return Q_targets
93
-
94
- def compute_q_values_for_next_states(self, next_states):
95
- """Computes the q_values for next state we will use to create the loss to train the Q network"""
96
- Q_targets_next = self.q_network_local(next_states).detach().max(1)[0].unsqueeze(1)
97
- return Q_targets_next
98
-
99
- def compute_q_values_for_current_states(self, rewards, Q_targets_next, dones):
100
- """Computes the q_values for current state we will use to create the loss to train the Q network"""
101
- Q_targets_current = rewards + (self.hyperparameters["discount_rate"] * Q_targets_next * (1 - dones))
102
- return Q_targets_current
103
-
104
- def compute_expected_q_values(self, states, actions):
105
- """Computes the expected q_values we will use to create the loss to train the Q network"""
106
- Q_expected = self.q_network_local(states).gather(1,
107
- actions.long()) # must convert actions to long so can be used as index
108
- return Q_expected
109
-
110
- def locally_save_policy(self):
111
- """Saves the policy"""
112
- torch.save(self.q_network_local.state_dict(),
113
- "{}/{}_network.pt".format(self.config.models_dir, self.agent_name))
114
-
115
- def locally_load_policy(self):
116
- """loads the policy"""
117
- filename = f'{self.config.models_dir}/{self.agent_name}_network.pt'
118
- saved_q_network_local = self.q_network_local
119
- saved_q_network_local.load_state_dict(torch.load(filename))
120
- return saved_q_network_local
121
-
122
- def time_for_q_network_to_learn(self):
123
- """Returns boolean indicating whether enough steps have been taken for learning to begin and there are
124
- enough experiences in the replay buffer to learn from"""
125
- return self.right_amount_of_steps_taken() and self.enough_experiences_to_learn_from()
126
-
127
- def right_amount_of_steps_taken(self):
128
- """Returns boolean indicating whether enough steps have been taken for learning to begin"""
129
- return self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0
130
-
131
- def sample_experiences(self):
132
- """Draws a random sample of experience from the memory buffer"""
133
- experiences = self.memory.sample()
134
- states, actions, rewards, next_states, dones = experiences
135
- return states, actions, rewards, next_states, dones
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agents/DQN_agents/DQN_HER.py DELETED
@@ -1,30 +0,0 @@
1
- from agents.DQN_agents.DQN import DQN
2
- from agents.HER_Base import HER_Base
3
-
4
- class DQN_HER(HER_Base, DQN):
5
- """DQN algorithm with hindsight experience replay"""
6
- agent_name = "DQN-HER"
7
- def __init__(self, config):
8
- DQN.__init__(self, config)
9
- HER_Base.__init__(self, self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"],
10
- self.hyperparameters["HER_sample_proportion"])
11
-
12
- def step(self):
13
- """Runs a step within a game including a learning step if required"""
14
- while not self.done:
15
- self.action = self.pick_action()
16
- self.conduct_action_in_changeable_goal_envs(self.action)
17
- if self.time_for_q_network_to_learn():
18
- for _ in range(self.hyperparameters["learning_iterations"]):
19
- self.learn(experiences=self.sample_from_HER_and_Ordinary_Buffer())
20
- self.track_changeable_goal_episodes_data()
21
- self.save_experience()
22
- if self.done: self.save_alternative_experience()
23
- self.state_dict = self.next_state_dict # this is to set the state for the next iteration
24
- self.state = self.next_state
25
- self.global_step_number += 1
26
- self.episode_number += 1
27
-
28
- def enough_experiences_to_learn_from(self):
29
- """Returns booleans indicating whether there are enough experiences in the two replay buffers to learn from"""
30
- return len(self.memory) > self.ordinary_buffer_batch_size and len(self.HER_memory) > self.HER_buffer_batch_size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agents/DQN_agents/DQN_With_Fixed_Q_Targets.py DELETED
@@ -1,23 +0,0 @@
1
- import copy
2
-
3
- from agents.Base_Agent import Base_Agent
4
- from agents.DQN_agents.DQN import DQN
5
-
6
- class DQN_With_Fixed_Q_Targets(DQN):
7
- """A DQN agent that uses an older version of the q_network as the target network"""
8
- agent_name = "DQN with Fixed Q Targets"
9
- def __init__(self, config):
10
- DQN.__init__(self, config)
11
- self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
12
- Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
13
-
14
- def learn(self, experiences=None):
15
- """Runs a learning iteration for the Q network"""
16
- super(DQN_With_Fixed_Q_Targets, self).learn(experiences=experiences)
17
- self.soft_update_of_target_network(self.q_network_local, self.q_network_target,
18
- self.hyperparameters["tau"]) # Update the target network
19
-
20
- def compute_q_values_for_next_states(self, next_states):
21
- """Computes the q_values for next state we will use to create the loss to train the Q network"""
22
- Q_targets_next = self.q_network_target(next_states).detach().max(1)[0].unsqueeze(1)
23
- return Q_targets_next
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agents/DQN_agents/Dueling_DDQN.py DELETED
@@ -1,64 +0,0 @@
1
- import torch
2
- from torch import optim
3
- from agents.Base_Agent import Base_Agent
4
- from agents.DQN_agents.DDQN import DDQN
5
-
6
- class Dueling_DDQN(DDQN):
7
- """A dueling double DQN agent as described in the paper http://proceedings.mlr.press/v48/wangf16.pdf"""
8
- agent_name = "Dueling DDQN"
9
-
10
- def __init__(self, config):
11
- DDQN.__init__(self, config)
12
- self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
13
- self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
14
- self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
15
- Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
16
-
17
- def pick_action(self, state=None):
18
- """Uses the local Q network and an epsilon greedy policy to pick an action"""
19
- # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add
20
- # a "fake" dimension to make it a mini-batch rather than a single observation
21
- if state is None: state = self.state
22
- state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
23
- if len(state.shape) < 2: state = state.unsqueeze(0)
24
- self.q_network_local.eval()
25
- with torch.no_grad():
26
- action_values = self.q_network_local(state)
27
- action_values = action_values[:, :-1] #because we treat the last output element as state-value and rest as advantages
28
- self.q_network_local.train()
29
- action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action_values": action_values,
30
- "turn_off_exploration": self.turn_off_exploration,
31
- "episode_number": self.episode_number})
32
- return action
33
-
34
- def compute_q_values_for_next_states(self, next_states):
35
- """Computes the q_values for next state we will use to create the loss to train the Q network. Double DQN
36
- uses the local index to pick the maximum q_value action and then the target network to calculate the q_value.
37
- The reasoning behind this is that it will help stop the network from overestimating q values"""
38
- max_action_indexes = self.q_network_local(next_states)[:, :-1].detach().argmax(1)
39
- duelling_network_output = self.q_network_target(next_states)
40
- q_values = self.calculate_duelling_q_values(duelling_network_output)
41
- Q_targets_next = q_values.gather(1, max_action_indexes.unsqueeze(1))
42
- return Q_targets_next
43
-
44
- def calculate_duelling_q_values(self, duelling_q_network_output):
45
- """Calculates the q_values using the duelling network architecture. This is equation (9) in the paper
46
- referenced at the top of the class"""
47
- state_value = duelling_q_network_output[:, -1]
48
- avg_advantage = torch.mean(duelling_q_network_output[:, :-1], dim=1)
49
- q_values = state_value.unsqueeze(1) + (duelling_q_network_output[:, :-1] - avg_advantage.unsqueeze(1))
50
- return q_values
51
-
52
- def compute_expected_q_values(self, states, actions):
53
- """Computes the expected q_values we will use to create the loss to train the Q network"""
54
- duelling_network_output = self.q_network_local(states)
55
- q_values = self.calculate_duelling_q_values(duelling_network_output)
56
- Q_expected = q_values.gather(1, actions.long())
57
- return Q_expected
58
-
59
-
60
-
61
-
62
-
63
-
64
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agents/DQN_agents/__init__.py DELETED
@@ -1 +0,0 @@
1
- import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 
 
agents/DQN_agents/__pycache__/DDQN.cpython-310.pyc DELETED
Binary file (1.25 kB)
 
agents/DQN_agents/__pycache__/DDQN.cpython-38.pyc DELETED
Binary file (1.31 kB)
 
agents/DQN_agents/__pycache__/DDQN.cpython-39.pyc DELETED
Binary file (1.24 kB)
 
agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-310.pyc DELETED
Binary file (2.63 kB)
 
agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-39.pyc DELETED
Binary file (2.69 kB)
 
agents/DQN_agents/__pycache__/DQN.cpython-310.pyc DELETED
Binary file (6.52 kB)
 
agents/DQN_agents/__pycache__/DQN.cpython-39.pyc DELETED
Binary file (6.18 kB)
 
agents/DQN_agents/__pycache__/DQN_HER.cpython-310.pyc DELETED
Binary file (1.84 kB)
 
agents/DQN_agents/__pycache__/DQN_HER.cpython-39.pyc DELETED
Binary file (1.9 kB)
 
agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-310.pyc DELETED
Binary file (1.67 kB)
 
agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-38.pyc DELETED
Binary file (1.73 kB)
 
agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-39.pyc DELETED
Binary file (1.67 kB)
 
agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-310.pyc DELETED
Binary file (3.32 kB)
 
agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-39.pyc DELETED
Binary file (3.39 kB)
 
agents/DQN_agents/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (251 Bytes)
 
agents/DQN_agents/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (318 Bytes)
 
agents/DQN_agents/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (249 Bytes)
 
agents/HER_Base.py DELETED
@@ -1,100 +0,0 @@
1
- import torch
2
- import numpy as np
3
- from utilities.data_structures.Replay_Buffer import Replay_Buffer
4
- from utilities.Utility_Functions import abstract
5
-
6
- @abstract
7
- class HER_Base(object):
8
- """Contains methods needed to turn an algorithm into a hindsight experience replay (HER) algorithm"""
9
- def __init__(self, buffer_size, batch_size, HER_sample_proportion):
10
- self.HER_memory = Replay_Buffer(buffer_size, batch_size, self.config.seed)
11
- self.ordinary_buffer_batch_size = int(batch_size * (1.0 - HER_sample_proportion))
12
- self.HER_buffer_batch_size = batch_size - self.ordinary_buffer_batch_size
13
-
14
- def reset_game(self):
15
- """Resets the game information so we are ready to play a new episode"""
16
- self.state_dict = self.environment.reset()
17
- self.observation = self.state_dict["observation"]
18
- self.desired_goal = self.state_dict["desired_goal"]
19
- self.achieved_goal = self.state_dict["achieved_goal"]
20
-
21
- self.state = self.create_state_from_observation_and_desired_goal(self.observation, self.desired_goal)
22
- self.next_state = None
23
- self.action = None
24
- self.reward = None
25
- self.done = False
26
-
27
- self.episode_states = []
28
- self.episode_rewards = []
29
- self.episode_actions = []
30
- self.episode_next_states = []
31
- self.episode_dones = []
32
-
33
- self.episode_desired_goals = []
34
- self.episode_achieved_goals = []
35
- self.episode_observations = []
36
-
37
- self.episode_next_desired_goals = []
38
- self.episode_next_achieved_goals = []
39
- self.episode_next_observations = []
40
-
41
- self.total_episode_score_so_far = 0
42
-
43
- def track_changeable_goal_episodes_data(self):
44
- """Saves the data from the recent episodes in a way compatible with changeable goal environments"""
45
- self.episode_rewards.append(self.reward)
46
- self.episode_actions.append(self.action)
47
- self.episode_dones.append(self.done)
48
-
49
- self.episode_states.append(self.state)
50
- self.episode_next_states.append(self.next_state)
51
-
52
- self.episode_desired_goals.append(self.state_dict["desired_goal"])
53
- self.episode_achieved_goals.append(self.state_dict["achieved_goal"])
54
- self.episode_observations.append(self.state_dict["observation"])
55
-
56
- self.episode_next_desired_goals.append(self.next_state_dict["desired_goal"])
57
- self.episode_next_achieved_goals.append(self.next_state_dict["achieved_goal"])
58
- self.episode_next_observations.append(self.next_state_dict["observation"])
59
-
60
- def conduct_action_in_changeable_goal_envs(self, action):
61
- """Adapts conduct_action from base agent so that can handle changeable goal environments"""
62
- self.next_state_dict, self.reward, self.done, _ = self.environment.step(action)
63
- self.total_episode_score_so_far += self.reward
64
- if self.hyperparameters["clip_rewards"]:
65
- self.reward = max(min(self.reward, 1.0), -1.0)
66
- self.observation = self.next_state_dict["observation"]
67
- self.desired_goal = self.next_state_dict["desired_goal"]
68
- self.achieved_goal = self.next_state_dict["achieved_goal"]
69
- self.next_state = self.create_state_from_observation_and_desired_goal(self.observation, self.desired_goal)
70
-
71
-
72
- def create_state_from_observation_and_desired_goal(self, observation, desired_goal):
73
- return np.concatenate((observation, desired_goal))
74
-
75
- def save_alternative_experience(self):
76
- """Saves the experiences as if the final state visited in the episode was the goal state"""
77
- new_goal = self.achieved_goal
78
- new_states = [self.create_state_from_observation_and_desired_goal(observation, new_goal) for observation in self.episode_observations]
79
- new_next_states = [self.create_state_from_observation_and_desired_goal(observation, new_goal) for observation in
80
- self.episode_next_observations]
81
- new_rewards = [self.environment.compute_reward(next_achieved_goal, new_goal, None) for next_achieved_goal in self.episode_next_achieved_goals]
82
-
83
- if self.hyperparameters["clip_rewards"]:
84
- new_rewards = [max(min(reward, 1.0), -1.0) for reward in new_rewards]
85
-
86
- self.HER_memory.add_experience(new_states, self.episode_actions, new_rewards, new_next_states, self.episode_dones)
87
-
88
- def sample_from_HER_and_Ordinary_Buffer(self):
89
- """Samples from the ordinary replay buffer and HER replay buffer according to a proportion specified in config"""
90
- states, actions, rewards, next_states, dones = self.memory.sample(self.ordinary_buffer_batch_size)
91
- HER_states, HER_actions, HER_rewards, HER_next_states, HER_dones = self.HER_memory.sample(self.HER_buffer_batch_size)
92
-
93
- states = torch.cat((states, HER_states))
94
- actions = torch.cat((actions, HER_actions))
95
- rewards = torch.cat((rewards, HER_rewards))
96
- next_states = torch.cat((next_states, HER_next_states))
97
- dones = torch.cat((dones, HER_dones))
98
- return states, actions, rewards, next_states, dones
99
-
100
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agents/Trainer.py DELETED
@@ -1,304 +0,0 @@
1
- import copy
2
- import random
3
- import pickle
4
- import os
5
- import gym
6
- from gym import wrappers
7
- import numpy as np
8
- import matplotlib.pyplot as plt
9
-
10
- class Trainer(object):
11
- """Runs games for given agents. Optionally will visualise and save the results"""
12
- def __init__(self, config, agents):
13
- self.config = config
14
- self.agents = agents
15
- self.agent_to_agent_group = self.create_agent_to_agent_group_dictionary()
16
- self.agent_to_color_group = self.create_agent_to_color_dictionary()
17
- self.results = None
18
- self.signals_result = None
19
- self.colors = ["red", "blue", "green", "orange", "yellow", "purple"]
20
- self.colour_ix = 0
21
- self.y_limits = None
22
-
23
- def create_agent_to_agent_group_dictionary(self):
24
- """Creates a dictionary that maps an agent to their wider agent group"""
25
- agent_to_agent_group_dictionary = {
26
- "DQN": "DQN_Agents",
27
- "DQN-HER": "DQN_Agents",
28
- "DDQN": "DQN_Agents",
29
- "DDQN with Prioritised Replay": "DQN_Agents",
30
- "DQN with Fixed Q Targets": "DQN_Agents",
31
- "Duelling DQN": "DQN_Agents",
32
- "PPO": "Policy_Gradient_Agents",
33
- "REINFORCE": "Policy_Gradient_Agents",
34
- "Genetic_Agent": "Stochastic_Policy_Search_Agents",
35
- "Hill Climbing": "Stochastic_Policy_Search_Agents",
36
- "DDPG": "Actor_Critic_Agents",
37
- "DDPG-HER": "Actor_Critic_Agents",
38
- "TD3": "Actor_Critic_Agents",
39
- "A2C": "Actor_Critic_Agents",
40
- "A3C": "Actor_Critic_Agents",
41
- "h-DQN": "h_DQN",
42
- "SNN-HRL": "SNN_HRL",
43
- "HIRO": "HIRO",
44
- "SAC": "Actor_Critic_Agents",
45
- "HRL": "HRL",
46
- "Model_HRL": "HRL",
47
- "DIAYN": "DIAYN",
48
- "Dueling DDQN": "DQN_Agents"
49
- }
50
- return agent_to_agent_group_dictionary
51
-
52
- def create_agent_to_color_dictionary(self):
53
- """Creates a dictionary that maps an agent to a hex color (for plotting purposes)
54
- See https://en.wikipedia.org/wiki/Web_colors and https://htmlcolorcodes.com/ for hex colors"""
55
- agent_to_color_dictionary = {
56
- "DQN": "#0000FF",
57
- "DQN with Fixed Q Targets": "#1F618D",
58
- "DDQN": "#2980B9",
59
- "DDQN with Prioritised Replay": "#7FB3D5",
60
- "Dueling DDQN": "#22DAF3",
61
- "PPO": "#5B2C6F",
62
- "DDPG": "#800000",
63
- "DQN-HER": "#008000",
64
- "DDPG-HER": "#008000",
65
- "TD3": "#E74C3C",
66
- "h-DQN": "#D35400",
67
- "SNN-HRL": "#800000",
68
- "A3C": "#E74C3C",
69
- "A2C": "#F1948A",
70
- "SAC": "#1C2833",
71
- "DIAYN": "#F322CD",
72
- "HRL": "#0E0F0F"
73
- }
74
- return agent_to_color_dictionary
75
-
76
- def run_games_for_agents(self):
77
- """Run a set of games for each agent. Optionally visualising and/or saving the results"""
78
- self.results = self.create_object_to_store_results()
79
- self.signals_result = self.create_object_to_store_results()
80
- for agent_number, agent_class in enumerate(self.agents):
81
- agent_name = agent_class.agent_name
82
- self.run_games_for_agent(agent_number + 1, agent_class)
83
- if self.config.visualise_overall_agent_results:
84
- agent_rolling_score_results = [results[1] for results in self.results[agent_name]]
85
- self.visualise_overall_agent_results(agent_rolling_score_results, agent_name, show_mean_and_std_range=True, y_limits=self.y_limits)
86
- if self.config.file_to_save_data_results: self.save_obj(self.results, self.config.file_to_save_data_results)
87
- if self.config.file_to_save_results_graph: plt.savefig(self.config.file_to_save_results_graph, bbox_inches="tight")
88
- plt.show()
89
- return self.results
90
-
91
- def create_object_to_store_results(self):
92
- """Creates a dictionary that we will store the results in if it doesn't exist, otherwise it loads it up"""
93
- if self.config.overwrite_existing_results_file or not self.config.file_to_save_data_results or not os.path.isfile(self.config.file_to_save_data_results):
94
- results = {}
95
- else: results = self.load_obj(self.config.file_to_save_data_results)
96
- return results
97
-
98
- def run_games_for_agent(self, agent_number, agent_class):
99
- """Runs a set of games for a given agent, saving the results in self.results"""
100
- agent_results = []
101
- agent_name = agent_class.agent_name
102
- agent_group = self.agent_to_agent_group[agent_name]
103
- agent_round = 1
104
- for run in range(self.config.runs_per_agent):
105
- agent_config = copy.deepcopy(self.config)
106
-
107
- if self.environment_has_changeable_goals(agent_config.environment) and self.agent_cant_handle_changeable_goals_without_flattening(agent_name):
108
- print("Flattening changeable-goal environment for agent {}".format(agent_name))
109
- agent_config.environment = gym.wrappers.FlattenDictWrapper(agent_config.environment,
110
- dict_keys=["observation", "desired_goal"])
111
-
112
- if self.config.randomise_random_seed: agent_config.seed = random.randint(0, 2**32 - 2)
113
- agent_config.hyperparameters = agent_config.hyperparameters[agent_group]
114
- print("AGENT NAME: {}".format(agent_name))
115
- print("\033[1m" + "{}.{}: {}".format(agent_number, agent_round, agent_name) + "\033[0m", flush=True)
116
- agent = agent_class(agent_config)
117
- self.environment_name = agent.environment_title
118
- print(agent.hyperparameters)
119
- print("RANDOM SEED " , agent_config.seed)
120
- game_scores, rolling_scores, time_taken, game_signals = agent.run_n_episodes()
121
- print("Time taken: {}".format(time_taken), flush=True)
122
- self.print_two_empty_lines()
123
- agent_results.append([game_scores, rolling_scores, len(rolling_scores), -1 * max(rolling_scores), time_taken, game_signals])
124
- if self.config.visualise_individual_results:
125
- self.visualise_overall_agent_results([rolling_scores], agent_name, show_each_run=True, y_limits=self.y_limits)
126
- plt.show()
127
- agent_round += 1
128
- self.results[agent_name] = agent_results
129
-
130
- def environment_has_changeable_goals(self, env):
131
- """Determines whether environment is such that for each episode there is a different goal or not"""
132
- return isinstance(env.reset(), dict)
133
-
134
- def agent_cant_handle_changeable_goals_without_flattening(self, agent_name):
135
- """Boolean indicating whether the agent is set up to handle changeable goals"""
136
- return "HER" not in agent_name
137
-
138
- def visualise_overall_agent_results(self, agent_results, agent_name, show_mean_and_std_range=False, show_each_run=False,
139
- color=None, ax=None, title=None, y_limits=None):
140
- """Visualises the results for one agent"""
141
- assert isinstance(agent_results, list), "agent_results must be a list of lists, 1 set of results per list"
142
- assert isinstance(agent_results[0], list), "agent_results must be a list of lists, 1 set of results per list"
143
- assert bool(show_mean_and_std_range) ^ bool(show_each_run), "either show_mean_and_std_range or show_each_run must be true"
144
- if not ax: ax = plt.gca()
145
- if not color: color = self.agent_to_color_group[agent_name]
146
- if show_mean_and_std_range:
147
- mean_minus_x_std, mean_results, mean_plus_x_std = self.get_mean_and_standard_deviation_difference_results(agent_results)
148
- x_vals = list(range(len(mean_results)))
149
- ax.plot(x_vals, mean_results, label=agent_name, color=color)
150
- ax.plot(x_vals, mean_plus_x_std, color=color, alpha=0.1)
151
- ax.plot(x_vals, mean_minus_x_std, color=color, alpha=0.1)
152
- ax.fill_between(x_vals, y1=mean_minus_x_std, y2=mean_plus_x_std, alpha=0.1, color=color)
153
- else:
154
- for ix, result in enumerate(agent_results):
155
- x_vals = list(range(len(agent_results[0])))
156
- plt.plot(x_vals, result, label=agent_name + "_{}".format(ix+1), color=color)
157
- color = self.get_next_color()
158
-
159
- ax.set_facecolor('xkcd:white')
160
-
161
- # Shrink current axis's height by 10% on the bottom
162
- box = ax.get_position()
163
- ax.set_position([box.x0, box.y0 + box.height * 0.05,
164
- box.width, box.height * 0.95])
165
-
166
- # Put a legend below current axis
167
- ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15),
168
- fancybox=True, shadow=True, ncol=3)
169
-
170
- if not title: title = self.environment_name
171
-
172
- ax.set_title(title, fontsize=15, fontweight='bold')
173
- ax.set_ylabel('Rolling Episode Scores')
174
- ax.set_xlabel('Episode Number')
175
- self.hide_spines(ax, ['right', 'top'])
176
- ax.set_xlim([0, x_vals[-1]])
177
-
178
- if y_limits is None: y_min, y_max = self.get_y_limits(agent_results)
179
- else: y_min, y_max = y_limits
180
-
181
- ax.set_ylim([y_min, y_max])
182
-
183
- if self.config.show_solution_score:
184
- self.draw_horizontal_line_with_label(ax, y_value=self.config.environment.get_score_to_win(), x_min=0,
185
- x_max=self.config.num_episodes_to_run * 1.02, label="Target \n score")
186
-
187
- def get_y_limits(self, results):
188
- """Extracts the minimum and maximum seen y_values from a set of results"""
189
- min_result = float("inf")
190
- max_result = float("-inf")
191
- for result in results:
192
- temp_max = np.max(result)
193
- temp_min = np.min(result)
194
- if temp_max > max_result:
195
- max_result = temp_max
196
- if temp_min < min_result:
197
- min_result = temp_min
198
- return min_result, max_result
199
-
200
- def get_next_color(self):
201
- """Gets the next color in list self.colors. If it gets to the end then it starts from beginning"""
202
- self.colour_ix += 1
203
- if self.colour_ix >= len(self.colors): self.colour_ix = 0
204
- color = self.colors[self.colour_ix]
205
- return color
206
-
207
- def get_mean_and_standard_deviation_difference_results(self, results):
208
- """From a list of lists of agent results it extracts the mean results and the mean results plus or minus
209
- some multiple of the standard deviation"""
210
- def get_results_at_a_time_step(results, timestep):
211
- results_at_a_time_step = [result[timestep] for result in results]
212
- return results_at_a_time_step
213
- def get_standard_deviation_at_time_step(results, timestep):
214
- results_at_a_time_step = [result[timestep] for result in results]
215
- return np.std(results_at_a_time_step)
216
- mean_results = [np.mean(get_results_at_a_time_step(results, timestep)) for timestep in range(len(results[0]))]
217
- mean_minus_x_std = [mean_val - self.config.standard_deviation_results * get_standard_deviation_at_time_step(results, timestep) for
218
- timestep, mean_val in enumerate(mean_results)]
219
- mean_plus_x_std = [mean_val + self.config.standard_deviation_results * get_standard_deviation_at_time_step(results, timestep) for
220
- timestep, mean_val in enumerate(mean_results)]
221
- return mean_minus_x_std, mean_results, mean_plus_x_std
222
-
223
- def hide_spines(self, ax, spines_to_hide):
224
- """Hides splines on a matplotlib image"""
225
- for spine in spines_to_hide:
226
- ax.spines[spine].set_visible(False)
227
-
228
- def ignore_points_after_game_solved(self, mean_minus_x_std, mean_results, mean_plus_x_std):
229
- """Removes the datapoints after the mean result achieves the score required to solve the game"""
230
- for ix in range(len(mean_results)):
231
- if mean_results[ix] >= self.config.environment.get_score_to_win():
232
- break
233
- return mean_minus_x_std[:ix], mean_results[:ix], mean_plus_x_std[:ix]
234
-
235
- def draw_horizontal_line_with_label(self, ax, y_value, x_min, x_max, label):
236
- """Draws a dotted horizontal line on the given image at the given point and with the given label"""
237
- ax.hlines(y=y_value, xmin=x_min, xmax=x_max,
238
- linewidth=2, color='k', linestyles='dotted', alpha=0.5)
239
- ax.text(x_max, y_value * 0.965, label)
240
-
241
- def print_two_empty_lines(self):
242
- print("-----------------------------------------------------------------------------------")
243
- print("-----------------------------------------------------------------------------------")
244
- print(" ")
245
-
246
- def save_obj(self, obj, name):
247
- """Saves given object as a pickle file"""
248
- if name[-4:] != ".pkl":
249
- name += ".pkl"
250
- with open(name, 'wb') as f:
251
- pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
252
-
253
- def load_obj(self, name):
254
- """Loads a pickle file object"""
255
- with open(name, 'rb') as f:
256
- return pickle.load(f)
257
-
258
- def visualise_preexisting_results(self, save_image_path=None, data_path=None, colors=None, show_image=True, ax=None,
259
- title=None, y_limits=None):
260
- """Visualises saved data results and then optionally saves the image"""
261
- if not data_path: preexisting_results = self.create_object_to_store_results()
262
- else: preexisting_results = self.load_obj(data_path)
263
- for ix, agent in enumerate(list(preexisting_results.keys())):
264
- agent_rolling_score_results = [results[1] for results in preexisting_results[agent]]
265
- if colors: color = colors[ix]
266
- else: color = None
267
- self.visualise_overall_agent_results(agent_rolling_score_results, agent, show_mean_and_std_range=True,
268
- color=color, ax=ax, title=title, y_limits=y_limits)
269
- if save_image_path: plt.savefig(save_image_path, bbox_inches="tight")
270
- if show_image: plt.show()
271
-
272
- def visualise_set_of_preexisting_results(self, results_data_paths, save_image_path=None, show_image=True, plot_titles=None,
273
- y_limits=[None,None]):
274
- """Visualises a set of preexisting results on 1 plot by making subplots"""
275
- assert isinstance(results_data_paths, list), "all_results must be a list of data paths"
276
-
277
- num_figures = len(results_data_paths)
278
- col_width = 15
279
- row_height = 6
280
-
281
- if num_figures <= 2:
282
- fig, axes = plt.subplots(1, num_figures, figsize=(col_width, row_height ))
283
- elif num_figures <= 4:
284
- fig, axes = plt.subplots(2, num_figures, figsize=(row_height, col_width))
285
- else:
286
- raise ValueError("Need to tell this method how to deal with more than 4 plots")
287
- for ax_ix in range(len(results_data_paths)):
288
- self.visualise_preexisting_results(show_image=False, data_path=results_data_paths[ax_ix], ax=axes[ax_ix],
289
- title=plot_titles[ax_ix], y_limits=y_limits[ax_ix])
290
- fig.tight_layout()
291
- fig.subplots_adjust(bottom=0.25)
292
-
293
- if save_image_path: plt.savefig(save_image_path) #, bbox_inches="tight")
294
- if show_image: plt.show()
295
-
296
- # ax.imshow(z, aspect="auto")
297
-
298
-
299
-
300
-
301
-
302
-
303
-
304
-
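
The band plotted by visualise_overall_agent_results above is simply the per-episode mean across runs plus or minus config.standard_deviation_results standard deviations. A minimal NumPy sketch of that calculation (the function and variable names below are illustrative, not part of Trainer.py):

import numpy as np

def mean_and_band(results, num_std=1.0):
    """results: list of equal-length score lists, one per run.
    Returns (mean - num_std*std, mean, mean + num_std*std) per episode index."""
    scores = np.asarray(results, dtype=float)   # shape (num_runs, num_episodes)
    mean = scores.mean(axis=0)
    std = scores.std(axis=0)
    return mean - num_std * std, mean, mean + num_std * std

# e.g. three runs of five episodes each
lower, mid, upper = mean_and_band([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [0, 1, 2, 3, 4]])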
 
agents/__init__.py DELETED
@@ -1 +0,0 @@
1
- import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
 
 
agents/__pycache__/Base_Agent.cpython-310.pyc DELETED
Binary file (15.5 kB)
 
agents/__pycache__/Base_Agent.cpython-38.pyc DELETED
Binary file (15.4 kB)
 
agents/__pycache__/Base_Agent.cpython-39.pyc DELETED
Binary file (15.3 kB)
 
agents/__pycache__/HER_Base.cpython-310.pyc DELETED
Binary file (4.65 kB)
 
agents/__pycache__/HER_Base.cpython-39.pyc DELETED
Binary file (4.73 kB)
 
agents/__pycache__/Trainer.cpython-310.pyc DELETED
Binary file (13.5 kB)
 
agents/__pycache__/Trainer.cpython-39.pyc DELETED
Binary file (13.3 kB)
 
agents/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (240 Bytes)
 
agents/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (307 Bytes)
 
agents/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (238 Bytes)
 
agents/actor_critic_agents/A2C.py DELETED
@@ -1,25 +0,0 @@
1
- from agents.actor_critic_agents.A3C import A3C
2
-
3
- class A2C(A3C):
4
- """Synchronous version of A2C algorithm from deepmind paper https://arxiv.org/pdf/1602.01783.pdf. The only
5
- difference between this and the A3C is that gradient updates get done in a batch rather than 1 by 1 as the gradients
6
- come in"""
7
- agent_name = "A2C"
8
- def __init__(self, config):
9
- super(A2C, self).__init__(config)
10
-
11
- def update_shared_model(self, gradient_updates_queue):
12
- """Worker that updates the shared model with gradients as they get put into the queue"""
13
- while True:
14
- gradients_seen = 0
15
- while gradients_seen < self.worker_processes:
16
- if gradients_seen == 0:
17
- gradients = gradient_updates_queue.get()
18
- else:
19
- new_grads = gradient_updates_queue.get()
20
- gradients = [grad + new_grad for grad, new_grad in zip(gradients, new_grads)]
21
- gradients_seen += 1
22
- self.actor_critic_optimizer.zero_grad()
23
- for grads, params in zip(gradients, self.actor_critic.parameters()):
24
- params._grad = grads
25
- self.actor_critic_optimizer.step()
 
 
agents/actor_critic_agents/A3C.py DELETED
@@ -1,229 +0,0 @@
1
- import copy
2
- import random
3
- import time
4
- import numpy as np
5
- import torch
6
- from torch import multiprocessing
7
- from torch.multiprocessing import Queue
8
- from torch.optim import Adam
9
- from agents.Base_Agent import Base_Agent
10
- from utilities.Utility_Functions import create_actor_distribution, SharedAdam
11
-
12
- class A3C(Base_Agent):
13
- """Actor critic A3C algorithm from deepmind paper https://arxiv.org/pdf/1602.01783.pdf"""
14
- agent_name = "A3C"
15
- def __init__(self, config):
16
- super(A3C, self).__init__(config)
17
- self.num_processes = multiprocessing.cpu_count()
18
- self.worker_processes = max(1, self.num_processes - 2)
19
- self.actor_critic = self.create_NN(input_dim=self.state_size, output_dim=[self.action_size, 1])
20
- self.actor_critic_optimizer = SharedAdam(self.actor_critic.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
21
-
22
- def run_n_episodes(self):
23
- """Runs game to completion n times and then summarises results and saves model (if asked to)"""
24
- start = time.time()
25
- results_queue = Queue()
26
- gradient_updates_queue = Queue()
27
- episode_number = multiprocessing.Value('i', 0)
28
- self.optimizer_lock = multiprocessing.Lock()
29
- episodes_per_process = int(self.config.num_episodes_to_run / self.worker_processes) + 1
30
- processes = []
31
- self.actor_critic.share_memory()
32
- self.actor_critic_optimizer.share_memory()
33
-
34
- optimizer_worker = multiprocessing.Process(target=self.update_shared_model, args=(gradient_updates_queue,))
35
- optimizer_worker.start()
36
-
37
- for process_num in range(self.worker_processes):
38
- worker = Actor_Critic_Worker(process_num, copy.deepcopy(self.environment), self.actor_critic, episode_number, self.optimizer_lock,
39
- self.actor_critic_optimizer, self.config, episodes_per_process,
40
- self.hyperparameters["epsilon_decay_rate_denominator"],
41
- self.action_size, self.action_types,
42
- results_queue, copy.deepcopy(self.actor_critic), gradient_updates_queue)
43
- worker.start()
44
- processes.append(worker)
45
- self.print_results(episode_number, results_queue)
46
- for worker in processes:
47
- worker.join()
48
- optimizer_worker.kill()
49
-
50
- time_taken = time.time() - start
51
- return self.game_full_episode_scores, self.rolling_results, time_taken
52
-
53
- def print_results(self, episode_number, results_queue):
54
- """Worker that prints out results as they get put into a queue"""
55
- while True:
56
- with episode_number.get_lock():
57
- carry_on = episode_number.value < self.config.num_episodes_to_run
58
- if carry_on:
59
- if not results_queue.empty():
60
- self.total_episode_score_so_far = results_queue.get()
61
- self.save_and_print_result()
62
- else: break
63
-
64
- def update_shared_model(self, gradient_updates_queue):
65
- """Worker that updates the shared model with gradients as they get put into the queue"""
66
- while True:
67
- gradients = gradient_updates_queue.get()
68
- with self.optimizer_lock:
69
- self.actor_critic_optimizer.zero_grad()
70
- for grads, params in zip(gradients, self.actor_critic.parameters()):
71
- params._grad = grads # maybe need to do grads.clone()
72
- self.actor_critic_optimizer.step()
73
-
74
- class Actor_Critic_Worker(torch.multiprocessing.Process):
75
- """Actor critic worker that will play the game for the designated number of episodes """
76
- def __init__(self, worker_num, environment, shared_model, counter, optimizer_lock, shared_optimizer,
77
- config, episodes_to_run, epsilon_decay_denominator, action_size, action_types, results_queue,
78
- local_model, gradient_updates_queue):
79
- super(Actor_Critic_Worker, self).__init__()
80
- self.environment = environment
81
- self.config = config
82
- self.worker_num = worker_num
83
-
84
- self.gradient_clipping_norm = self.config.hyperparameters["gradient_clipping_norm"]
85
- self.discount_rate = self.config.hyperparameters["discount_rate"]
86
- self.normalise_rewards = self.config.hyperparameters["normalise_rewards"]
87
-
88
- self.action_size = action_size
89
- self.set_seeds(self.worker_num)
90
- self.shared_model = shared_model
91
- self.local_model = local_model
92
- self.local_optimizer = Adam(self.local_model.parameters(), lr=0.0, eps=1e-4)
93
- self.counter = counter
94
- self.optimizer_lock = optimizer_lock
95
- self.shared_optimizer = shared_optimizer
96
- self.episodes_to_run = episodes_to_run
97
- self.epsilon_decay_denominator = epsilon_decay_denominator
98
- self.exploration_worker_difference = self.config.hyperparameters["exploration_worker_difference"]
99
- self.action_types = action_types
100
- self.results_queue = results_queue
101
- self.episode_number = 0
102
-
103
- self.gradient_updates_queue = gradient_updates_queue
104
-
105
- def set_seeds(self, worker_num):
106
- """Sets random seeds for this worker"""
107
- torch.manual_seed(self.config.seed + worker_num)
108
- self.environment.seed(self.config.seed + worker_num)
109
-
110
- def run(self):
111
- """Starts the worker"""
112
- torch.set_num_threads(1)
113
- for ep_ix in range(self.episodes_to_run):
114
- with self.optimizer_lock:
115
- Base_Agent.copy_model_over(self.shared_model, self.local_model)
116
- epsilon_exploration = self.calculate_new_exploration()
117
- state = self.reset_game_for_worker()
118
- done = False
119
- self.episode_states = []
120
- self.episode_actions = []
121
- self.episode_rewards = []
122
- self.episode_log_action_probabilities = []
123
- self.critic_outputs = []
124
-
125
- while not done:
126
- action, action_log_prob, critic_outputs = self.pick_action_and_get_critic_values(self.local_model, state, epsilon_exploration)
127
- next_state, reward, done, _ = self.environment.step(action)
128
- self.episode_states.append(state)
129
- self.episode_actions.append(action)
130
- self.episode_rewards.append(reward)
131
- self.episode_log_action_probabilities.append(action_log_prob)
132
- self.critic_outputs.append(critic_outputs)
133
- state = next_state
134
-
135
- total_loss = self.calculate_total_loss()
136
- self.put_gradients_in_queue(total_loss)
137
- self.episode_number += 1
138
- with self.counter.get_lock():
139
- self.counter.value += 1
140
- self.results_queue.put(np.sum(self.episode_rewards))
141
-
142
- def calculate_new_exploration(self):
143
- """Calculates the new exploration parameter epsilon. It picks a random point within 3X above and below the
144
- current epsilon"""
145
- with self.counter.get_lock():
146
- epsilon = 1.0 / (1.0 + (self.counter.value / self.epsilon_decay_denominator))
147
- epsilon = max(0.0, random.uniform(epsilon / self.exploration_worker_difference, epsilon * self.exploration_worker_difference))
148
- return epsilon
149
-
150
- def reset_game_for_worker(self):
151
- """Resets the game environment so it is ready to play a new episode"""
152
- state = self.environment.reset()
153
- if self.action_types == "CONTINUOUS": self.noise.reset()
154
- return state
155
-
156
- def pick_action_and_get_critic_values(self, policy, state, epsilon_exploration=None):
157
- """Picks an action using the policy"""
158
- state = torch.from_numpy(state).float().unsqueeze(0)
159
- model_output = policy.forward(state)
160
- actor_output = model_output[:, list(range(self.action_size))] #we only use first set of columns to decide action, last column is state-value
161
- critic_output = model_output[:, -1]
162
- action_distribution = create_actor_distribution(self.action_types, actor_output, self.action_size)
163
- action = action_distribution.sample().cpu().numpy()
164
- if self.action_types == "CONTINUOUS": action += self.noise.sample()
165
- if self.action_types == "DISCRETE":
166
- if random.random() <= epsilon_exploration:
167
- action = random.randint(0, self.action_size - 1)
168
- else:
169
- action = action[0]
170
- action_log_prob = self.calculate_log_action_probability(action, action_distribution)
171
- return action, action_log_prob, critic_output
172
-
173
- def calculate_log_action_probability(self, actions, action_distribution):
174
- """Calculates the log probability of the chosen action"""
175
- policy_distribution_log_prob = action_distribution.log_prob(torch.Tensor([actions]))
176
- return policy_distribution_log_prob
177
-
178
- def calculate_total_loss(self):
179
- """Calculates the actor loss + critic loss"""
180
- discounted_returns = self.calculate_discounted_returns()
181
- if self.normalise_rewards:
182
- discounted_returns = self.normalise_discounted_returns(discounted_returns)
183
- critic_loss, advantages = self.calculate_critic_loss_and_advantages(discounted_returns)
184
- actor_loss = self.calculate_actor_loss(advantages)
185
- total_loss = actor_loss + critic_loss
186
- return total_loss
187
-
188
- def calculate_discounted_returns(self):
189
- """Calculates the cumulative discounted return for an episode which we will then use in a learning iteration"""
190
- discounted_returns = [0]
191
- for ix in range(len(self.episode_states)):
192
- return_value = self.episode_rewards[-(ix + 1)] + self.discount_rate*discounted_returns[-1]
193
- discounted_returns.append(return_value)
194
- discounted_returns = discounted_returns[1:]
195
- discounted_returns = discounted_returns[::-1]
196
- return discounted_returns
197
-
198
- def normalise_discounted_returns(self, discounted_returns):
199
- """Normalises the discounted returns by dividing by mean and std of returns that episode"""
200
- mean = np.mean(discounted_returns)
201
- std = np.std(discounted_returns)
202
- discounted_returns -= mean
203
- discounted_returns /= (std + 1e-5)
204
- return discounted_returns
205
-
206
- def calculate_critic_loss_and_advantages(self, all_discounted_returns):
207
- """Calculates the critic's loss and the advantages"""
208
- critic_values = torch.cat(self.critic_outputs)
209
- advantages = torch.Tensor(all_discounted_returns) - critic_values
210
- advantages = advantages.detach()
211
- critic_loss = (torch.Tensor(all_discounted_returns) - critic_values)**2
212
- critic_loss = critic_loss.mean()
213
- return critic_loss, advantages
214
-
215
- def calculate_actor_loss(self, advantages):
216
- """Calculates the loss for the actor"""
217
- action_log_probabilities_for_all_episodes = torch.cat(self.episode_log_action_probabilities)
218
- actor_loss = -1.0 * action_log_probabilities_for_all_episodes * advantages
219
- actor_loss = actor_loss.mean()
220
- return actor_loss
221
-
222
- def put_gradients_in_queue(self, total_loss):
223
- """Puts gradients in a queue for the optimisation process to use to update the shared model"""
224
- self.local_optimizer.zero_grad()
225
- total_loss.backward()
226
- torch.nn.utils.clip_grad_norm_(self.local_model.parameters(), self.gradient_clipping_norm)
227
- gradients = [param.grad.clone() for param in self.local_model.parameters()]
228
- self.gradient_updates_queue.put(gradients)
229
-
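
For reference, calculate_discounted_returns above builds G_t = r_t + discount_rate * G_{t+1} by walking the episode's rewards backwards. An equivalent standalone sketch (names are illustrative, not part of A3C.py):

def discounted_returns(rewards, discount_rate):
    """Cumulative discounted return G_t = r_t + discount_rate * G_{t+1}, returned oldest-first."""
    returns = []
    running = 0.0
    for reward in reversed(rewards):
        running = reward + discount_rate * running
        returns.append(running)
    return returns[::-1]

# e.g. discounted_returns([1.0, 0.0, 1.0], 0.9) -> [1.81, 0.9, 1.0] (up to float rounding)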
 
 
agents/actor_critic_agents/DDPG.py DELETED
@@ -1,115 +0,0 @@
1
- import torch
2
- import torch.nn.functional as functional
3
- from torch import optim
4
- from agents.Base_Agent import Base_Agent
5
- from utilities.data_structures.Replay_Buffer import Replay_Buffer
6
- from exploration_strategies.OU_Noise_Exploration import OU_Noise_Exploration
7
-
8
- class DDPG(Base_Agent):
9
- """A DDPG Agent"""
10
- agent_name = "DDPG"
11
-
12
- def __init__(self, config):
13
- Base_Agent.__init__(self, config)
14
- self.hyperparameters = config.hyperparameters
15
- self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
16
- self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
17
- Base_Agent.copy_model_over(self.critic_local, self.critic_target)
18
-
19
- self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
20
- lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
21
- self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
22
- self.config.seed)
23
- self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
24
- self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
25
- Base_Agent.copy_model_over(self.actor_local, self.actor_target)
26
-
27
- self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
28
- lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
29
- self.exploration_strategy = OU_Noise_Exploration(self.config)
30
-
31
- def step(self):
32
- """Runs a step in the game"""
33
- while not self.done:
34
- # print("State ", self.state.shape)
35
- self.action = self.pick_action()
36
- self.conduct_action(self.action)
37
- if self.time_for_critic_and_actor_to_learn():
38
- for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
39
- states, actions, rewards, next_states, dones = self.sample_experiences()
40
- self.critic_learn(states, actions, rewards, next_states, dones)
41
- self.actor_learn(states)
42
- self.save_experience()
43
- self.state = self.next_state #this is to set the state for the next iteration
44
- self.global_step_number += 1
45
- self.episode_number += 1
46
-
47
- def sample_experiences(self):
48
- return self.memory.sample()
49
-
50
- def pick_action(self, state=None):
51
- """Picks an action using the actor network and then adds some noise to it to ensure exploration"""
52
- if state is None: state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device)
53
- self.actor_local.eval()
54
- with torch.no_grad():
55
- action = self.actor_local(state).cpu().data.numpy()
56
- self.actor_local.train()
57
- action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action": action})
58
- return action.squeeze(0)
59
-
60
- def critic_learn(self, states, actions, rewards, next_states, dones):
61
- """Runs a learning iteration for the critic"""
62
- loss = self.compute_loss(states, next_states, rewards, actions, dones)
63
- self.take_optimisation_step(self.critic_optimizer, self.critic_local, loss, self.hyperparameters["Critic"]["gradient_clipping_norm"])
64
- self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"])
65
-
66
- def compute_loss(self, states, next_states, rewards, actions, dones):
67
- """Computes the loss for the critic"""
68
- with torch.no_grad():
69
- critic_targets = self.compute_critic_targets(next_states, rewards, dones)
70
- critic_expected = self.compute_expected_critic_values(states, actions)
71
- loss = functional.mse_loss(critic_expected, critic_targets)
72
- return loss
73
-
74
- def compute_critic_targets(self, next_states, rewards, dones):
75
- """Computes the critic target values to be used in the loss for the critic"""
76
- critic_targets_next = self.compute_critic_values_for_next_states(next_states)
77
- critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones)
78
- return critic_targets
79
-
80
- def compute_critic_values_for_next_states(self, next_states):
81
- """Computes the critic values for next states to be used in the loss for the critic"""
82
- with torch.no_grad():
83
- actions_next = self.actor_target(next_states)
84
- critic_targets_next = self.critic_target(torch.cat((next_states, actions_next), 1))
85
- return critic_targets_next
86
-
87
- def compute_critic_values_for_current_states(self, rewards, critic_targets_next, dones):
88
- """Computes the critic values for current states to be used in the loss for the critic"""
89
- critic_targets_current = rewards + (self.hyperparameters["discount_rate"] * critic_targets_next * (1.0 - dones))
90
- return critic_targets_current
91
-
92
- def compute_expected_critic_values(self, states, actions):
93
- """Computes the expected critic values to be used in the loss for the critic"""
94
- critic_expected = self.critic_local(torch.cat((states, actions), 1))
95
- return critic_expected
96
-
97
- def time_for_critic_and_actor_to_learn(self):
98
- """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
99
- actor and critic"""
100
- return self.enough_experiences_to_learn_from() and self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0
101
-
102
- def actor_learn(self, states):
103
- """Runs a learning iteration for the actor"""
104
- if self.done: #we only update the learning rate at end of each episode
105
- self.update_learning_rate(self.hyperparameters["Actor"]["learning_rate"], self.actor_optimizer)
106
- actor_loss = self.calculate_actor_loss(states)
107
- self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
108
- self.hyperparameters["Actor"]["gradient_clipping_norm"])
109
- self.soft_update_of_target_network(self.actor_local, self.actor_target, self.hyperparameters["Actor"]["tau"])
110
-
111
- def calculate_actor_loss(self, states):
112
- """Calculates the loss for the actor"""
113
- actions_pred = self.actor_local(states)
114
- actor_loss = -self.critic_local(torch.cat((states, actions_pred), 1)).mean()
115
- return actor_loss
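
DDPG's target networks are refreshed through Base_Agent.soft_update_of_target_network, which is not shown in this diff; presumably it is the usual Polyak averaging controlled by tau. A hedged sketch of that update (assumed behaviour, not the original implementation):

import torch

def soft_update(local_model, target_model, tau):
    """Polyak averaging: theta_target <- tau * theta_local + (1 - tau) * theta_target."""
    with torch.no_grad():
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)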
 
 
agents/actor_critic_agents/DDPG_HER.py DELETED
@@ -1,38 +0,0 @@
1
- from agents.actor_critic_agents.DDPG import DDPG
2
- from agents.HER_Base import HER_Base
3
-
4
- class DDPG_HER(HER_Base, DDPG):
5
- """DDPG algorithm with hindsight experience replay"""
6
- agent_name = "DDPG-HER"
7
-
8
- def __init__(self, config):
9
- DDPG.__init__(self, config)
10
- HER_Base.__init__(self, self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
11
- self.hyperparameters["HER_sample_proportion"])
12
-
13
- def step(self):
14
- """Runs a step within a game including a learning step if required"""
15
- while not self.done:
16
- self.action = self.pick_action()
17
- self.conduct_action_in_changeable_goal_envs(self.action)
18
- if self.time_for_critic_and_actor_to_learn():
19
- for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
20
- states, actions, rewards, next_states, dones = self.sample_from_HER_and_Ordinary_Buffer() # Samples experiences from buffer
21
- self.critic_learn(states, actions, rewards, next_states, dones)
22
- self.actor_learn(states)
23
- self.track_changeable_goal_episodes_data()
24
- self.save_experience()
25
- if self.done: self.save_alternative_experience()
26
- self.state_dict = self.next_state_dict # this is to set the state for the next iteration
27
- self.state = self.next_state
28
- self.global_step_number += 1
29
- self.episode_number += 1
30
-
31
- def enough_experiences_to_learn_from(self):
32
- """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn"""
33
- return len(self.memory) > self.ordinary_buffer_batch_size and len(self.HER_memory) > self.HER_buffer_batch_size
34
-
35
-
36
-
37
-
38
-
 
 
agents/actor_critic_agents/SAC.py DELETED
@@ -1,211 +0,0 @@
1
- from agents.Base_Agent import Base_Agent
2
- from utilities.OU_Noise import OU_Noise
3
- from utilities.data_structures.Replay_Buffer import Replay_Buffer
4
- from torch.optim import Adam
5
- import torch
6
- import torch.nn.functional as F
7
- from torch.distributions import Normal
8
- import numpy as np
9
-
10
- LOG_SIG_MAX = 2
11
- LOG_SIG_MIN = -20
12
- TRAINING_EPISODES_PER_EVAL_EPISODE = 10
13
- EPSILON = 1e-6
14
-
15
- class SAC(Base_Agent):
16
- """Soft Actor-Critic model based on the 2018 paper https://arxiv.org/abs/1812.05905 and on this github implementation
17
- https://github.com/pranz24/pytorch-soft-actor-critic. It is an actor-critic algorithm where the agent is also trained
18
- to maximise the entropy of their actions as well as their cumulative reward"""
19
- agent_name = "SAC"
20
- def __init__(self, config):
21
- Base_Agent.__init__(self, config)
22
- assert self.action_types == "CONTINUOUS", "Action types must be continuous. Use SAC Discrete instead for discrete actions"
23
- assert self.config.hyperparameters["Actor"]["final_layer_activation"] != "Softmax", "Final actor layer must not be softmax"
24
- self.hyperparameters = config.hyperparameters
25
- self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
26
- self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
27
- key_to_use="Critic", override_seed=self.config.seed + 1)
28
- self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
29
- lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
30
- self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
31
- lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
32
- self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
33
- key_to_use="Critic")
34
- self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
35
- key_to_use="Critic")
36
- Base_Agent.copy_model_over(self.critic_local, self.critic_target)
37
- Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
38
- self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
39
- self.config.seed)
40
- self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size * 2, key_to_use="Actor")
41
- self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
42
- lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
43
- self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
44
- if self.automatic_entropy_tuning:
45
- self.target_entropy = -torch.prod(torch.Tensor(self.environment.action_space.shape).to(self.device)).item() # heuristic value from the paper
46
- self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
47
- self.alpha = self.log_alpha.exp()
48
- self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
49
- else:
50
- self.alpha = self.hyperparameters["entropy_term_weight"]
51
-
52
- self.add_extra_noise = self.hyperparameters["add_extra_noise"]
53
- if self.add_extra_noise:
54
- self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"],
55
- self.hyperparameters["theta"], self.hyperparameters["sigma"])
56
-
57
- self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]
58
-
59
- def save_result(self):
60
- """Saves the result of an episode of the game. Overriding the method in Base Agent that does this because we only
61
- want to keep track of the results during the evaluation episodes"""
62
- if self.episode_number == 1 or not self.do_evaluation_iterations:
63
- self.game_full_episode_scores.extend([self.total_episode_score_so_far])
64
- self.rolling_results.append(np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:]))
65
- self.save_max_result_seen()
66
-
67
- elif (self.episode_number - 1) % TRAINING_EPISODES_PER_EVAL_EPISODE == 0:
68
- self.game_full_episode_scores.extend([self.total_episode_score_so_far for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)])
69
- self.rolling_results.extend([np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:]) for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)])
70
- self.save_max_result_seen()
71
-
72
- def reset_game(self):
73
- """Resets the game information so we are ready to play a new episode"""
74
- Base_Agent.reset_game(self)
75
- if self.add_extra_noise: self.noise.reset()
76
-
77
- def step(self):
78
- """Runs an episode on the game, saving the experience and running a learning step if appropriate"""
79
- eval_ep = self.episode_number % TRAINING_EPISODES_PER_EVAL_EPISODE == 0 and self.do_evaluation_iterations
80
- self.episode_step_number_val = 0
81
- while not self.done:
82
- self.episode_step_number_val += 1
83
- self.action = self.pick_action(eval_ep)
84
- self.conduct_action(self.action)
85
- if self.time_for_critic_and_actor_to_learn():
86
- for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
87
- self.learn()
88
- mask = False if self.episode_step_number_val >= self.environment._max_episode_steps else self.done
89
- if not eval_ep: self.save_experience(experience=(self.state, self.action, self.reward, self.next_state, mask))
90
- self.state = self.next_state
91
- self.global_step_number += 1
92
- print(self.total_episode_score_so_far)
93
- if eval_ep: self.print_summary_of_latest_evaluation_episode()
94
- self.episode_number += 1
95
-
96
- def pick_action(self, eval_ep, state=None):
97
- """Picks an action using one of three methods: 1) Randomly if we haven't passed a certain number of steps,
98
- 2) Using the actor in evaluation mode if eval_ep is True 3) Using the actor in training mode if eval_ep is False.
99
- The difference between evaluation and training mode is that training mode does more exploration"""
100
- if state is None: state = self.state
101
- if eval_ep: action = self.actor_pick_action(state=state, eval=True)
102
- elif self.global_step_number < self.hyperparameters["min_steps_before_learning"]:
103
- action = self.environment.action_space.sample()
104
- print("Picking random action ", action)
105
- else: action = self.actor_pick_action(state=state)
106
- if self.add_extra_noise:
107
- action += self.noise.sample()
108
- return action
109
-
110
- def actor_pick_action(self, state=None, eval=False):
111
- """Uses actor to pick an action in one of two ways: 1) If eval = False and we aren't in eval mode then it picks
112
- an action that has partly been randomly sampled 2) If eval = True then we pick the action that comes directly
113
- from the network and so did not involve any random sampling"""
114
- if state is None: state = self.state
115
- state = torch.FloatTensor([state]).to(self.device)
116
- if len(state.shape) == 1: state = state.unsqueeze(0)
117
- if eval == False: action, _, _ = self.produce_action_and_action_info(state)
118
- else:
119
- with torch.no_grad():
120
- _, z, action = self.produce_action_and_action_info(state)
121
- action = action.detach().cpu().numpy()
122
- return action[0]
123
-
124
- def produce_action_and_action_info(self, state):
125
- """Given the state, produces an action, the log probability of the action, and the tanh of the mean action"""
126
- actor_output = self.actor_local(state)
127
- mean, log_std = actor_output[:, :self.action_size], actor_output[:, self.action_size:]
128
- std = log_std.exp()
129
- normal = Normal(mean, std)
130
- x_t = normal.rsample() #rsample means it is sampled using reparameterisation trick
131
- action = torch.tanh(x_t)
132
- log_prob = normal.log_prob(x_t)
133
- log_prob -= torch.log(1 - action.pow(2) + EPSILON)
134
- log_prob = log_prob.sum(1, keepdim=True)
135
- return action, log_prob, torch.tanh(mean)
136
-
137
- def time_for_critic_and_actor_to_learn(self):
138
- """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
139
- actor and critic"""
140
- return self.global_step_number > self.hyperparameters["min_steps_before_learning"] and \
141
- self.enough_experiences_to_learn_from() and self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0
142
-
143
- def learn(self):
144
- """Runs a learning iteration for the actor, both critics and (if specified) the temperature parameter"""
145
- state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.sample_experiences()
146
- qf1_loss, qf2_loss = self.calculate_critic_losses(state_batch, action_batch, reward_batch, next_state_batch, mask_batch)
147
- self.update_critic_parameters(qf1_loss, qf2_loss)
148
-
149
- policy_loss, log_pi = self.calculate_actor_loss(state_batch)
150
- if self.automatic_entropy_tuning: alpha_loss = self.calculate_entropy_tuning_loss(log_pi)
151
- else: alpha_loss = None
152
- self.update_actor_parameters(policy_loss, alpha_loss)
153
-
154
- def sample_experiences(self):
155
- return self.memory.sample()
156
-
157
- def calculate_critic_losses(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch):
158
- """Calculates the losses for the two critics. This is the ordinary Q-learning loss except the additional entropy
159
- term is taken into account"""
160
- with torch.no_grad():
161
- next_state_action, next_state_log_pi, _ = self.produce_action_and_action_info(next_state_batch)
162
- qf1_next_target = self.critic_target(torch.cat((next_state_batch, next_state_action), 1))
163
- qf2_next_target = self.critic_target_2(torch.cat((next_state_batch, next_state_action), 1))
164
- min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
165
- next_q_value = reward_batch + (1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (min_qf_next_target)
166
- qf1 = self.critic_local(torch.cat((state_batch, action_batch), 1))
167
- qf2 = self.critic_local_2(torch.cat((state_batch, action_batch), 1))
168
- qf1_loss = F.mse_loss(qf1, next_q_value)
169
- qf2_loss = F.mse_loss(qf2, next_q_value)
170
- return qf1_loss, qf2_loss
171
-
172
- def calculate_actor_loss(self, state_batch):
173
- """Calculates the loss for the actor. This loss includes the additional entropy term"""
174
- action, log_pi, _ = self.produce_action_and_action_info(state_batch)
175
- qf1_pi = self.critic_local(torch.cat((state_batch, action), 1))
176
- qf2_pi = self.critic_local_2(torch.cat((state_batch, action), 1))
177
- min_qf_pi = torch.min(qf1_pi, qf2_pi)
178
- policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()
179
- return policy_loss, log_pi
180
-
181
- def calculate_entropy_tuning_loss(self, log_pi):
182
- """Calculates the loss for the entropy temperature parameter. This is only relevant if self.automatic_entropy_tuning
183
- is True."""
184
- alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
185
- return alpha_loss
186
-
187
- def update_critic_parameters(self, critic_loss_1, critic_loss_2):
188
- """Updates the parameters for both critics"""
189
- self.take_optimisation_step(self.critic_optimizer, self.critic_local, critic_loss_1,
190
- self.hyperparameters["Critic"]["gradient_clipping_norm"])
191
- self.take_optimisation_step(self.critic_optimizer_2, self.critic_local_2, critic_loss_2,
192
- self.hyperparameters["Critic"]["gradient_clipping_norm"])
193
- self.soft_update_of_target_network(self.critic_local, self.critic_target,
194
- self.hyperparameters["Critic"]["tau"])
195
- self.soft_update_of_target_network(self.critic_local_2, self.critic_target_2,
196
- self.hyperparameters["Critic"]["tau"])
197
-
198
- def update_actor_parameters(self, actor_loss, alpha_loss):
199
- """Updates the parameters for the actor and (if specified) the temperature parameter"""
200
- self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
201
- self.hyperparameters["Actor"]["gradient_clipping_norm"])
202
- if alpha_loss is not None:
203
- self.take_optimisation_step(self.alpha_optim, None, alpha_loss, None)
204
- self.alpha = self.log_alpha.exp()
205
-
206
- def print_summary_of_latest_evaluation_episode(self):
207
- """Prints a summary of the latest episode"""
208
- print(" ")
209
- print("----------------------------")
210
- print("Episode score {} ".format(self.total_episode_score_so_far))
211
- print("----------------------------")
 
 
agents/actor_critic_agents/SAC_Discrete.py DELETED
@@ -1,94 +0,0 @@
1
- import torch
2
- from torch.optim import Adam
3
- import torch.nn.functional as F
4
- import numpy as np
5
- from agents.Base_Agent import Base_Agent
6
- from utilities.data_structures.Replay_Buffer import Replay_Buffer
7
- from agents.actor_critic_agents.SAC import SAC
8
- from utilities.Utility_Functions import create_actor_distribution
9
-
10
- class SAC_Discrete(SAC):
11
- """The Soft Actor Critic for discrete actions. It inherits from SAC for continuous actions and only changes a few
12
- methods."""
13
- agent_name = "SAC"
14
- def __init__(self, config):
15
- Base_Agent.__init__(self, config)
16
- assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions"
17
- assert self.config.hyperparameters["Actor"]["final_layer_activation"] == "Softmax", "Final actor layer must be softmax"
18
- self.hyperparameters = config.hyperparameters
19
- self.critic_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic")
20
- self.critic_local_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
21
- key_to_use="Critic", override_seed=self.config.seed + 1)
22
- self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
23
- lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
24
- self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
25
- lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
26
- self.critic_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
27
- key_to_use="Critic")
28
- self.critic_target_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
29
- key_to_use="Critic")
30
- Base_Agent.copy_model_over(self.critic_local, self.critic_target)
31
- Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
32
- self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
33
- self.config.seed, device=self.device)
34
-
35
- self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
36
- self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
37
- lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
38
- self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
39
- if self.automatic_entropy_tuning:
40
- # we set the max possible entropy as the target entropy
41
- self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98
42
- self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
43
- self.alpha = self.log_alpha.exp()
44
- self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
45
- else:
46
- self.alpha = self.hyperparameters["entropy_term_weight"]
47
- assert not self.hyperparameters["add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at moment"
48
- self.add_extra_noise = False
49
- self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]
50
-
51
- def produce_action_and_action_info(self, state):
52
- """Given the state, produces an action, the probability of the action, the log probability of the action, and
53
- the argmax action"""
54
- action_probabilities = self.actor_local(state)
55
- max_probability_action = torch.argmax(action_probabilities, dim=-1)
56
- action_distribution = create_actor_distribution(self.action_types, action_probabilities, self.action_size)
57
- action = action_distribution.sample().cpu()
58
- # Have to deal with situation of 0.0 probabilities because we can't do log 0
59
- z = action_probabilities == 0.0
60
- z = z.float() * 1e-8
61
- log_action_probabilities = torch.log(action_probabilities + z)
62
- return action, (action_probabilities, log_action_probabilities), max_probability_action
63
-
64
- def calculate_critic_losses(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch):
65
- """Calculates the losses for the two critics. This is the ordinary Q-learning loss except the additional entropy
66
- term is taken into account"""
67
- with torch.no_grad():
68
- next_state_action, (action_probabilities, log_action_probabilities), _ = self.produce_action_and_action_info(next_state_batch)
69
- qf1_next_target = self.critic_target(next_state_batch)
70
- qf2_next_target = self.critic_target_2(next_state_batch)
71
- min_qf_next_target = action_probabilities * (torch.min(qf1_next_target, qf2_next_target) - self.alpha * log_action_probabilities)
72
- min_qf_next_target = min_qf_next_target.sum(dim=1).unsqueeze(-1)
73
- next_q_value = reward_batch + (1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (min_qf_next_target)
74
-
75
- qf1 = self.critic_local(state_batch).gather(1, action_batch.long())
76
- qf2 = self.critic_local_2(state_batch).gather(1, action_batch.long())
77
- qf1_loss = F.mse_loss(qf1, next_q_value)
78
- qf2_loss = F.mse_loss(qf2, next_q_value)
79
- return qf1_loss, qf2_loss
80
-
81
- def calculate_actor_loss(self, state_batch):
82
- """Calculates the loss for the actor. This loss includes the additional entropy term"""
83
- action, (action_probabilities, log_action_probabilities), _ = self.produce_action_and_action_info(state_batch)
84
- qf1_pi = self.critic_local(state_batch)
85
- qf2_pi = self.critic_local_2(state_batch)
86
- min_qf_pi = torch.min(qf1_pi, qf2_pi)
87
- inside_term = self.alpha * log_action_probabilities - min_qf_pi
88
- policy_loss = (action_probabilities * inside_term).sum(dim=1).mean()
89
- log_action_probabilities = torch.sum(log_action_probabilities * action_probabilities, dim=1)
90
- return policy_loss, log_action_probabilities
91
-
92
- def locally_save_policy(self):
93
- """Saves the policy"""
94
- torch.save(self.actor_local.state_dict(), "{}/{}_network.pt".format(self.config.models_dir, self.agent_name))
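
In the discrete variant the entropy-regularised Bellman target is an expectation over the policy's action probabilities rather than a sampled action. A condensed sketch of the target computed inside calculate_critic_losses above (illustrative names):

import torch

def discrete_soft_q_target(rewards, dones, discount, alpha,
                           next_action_probs, next_log_probs, q1_next, q2_next):
    """Soft Q target: r + gamma * (1 - done) * E_pi[ min(Q1, Q2) - alpha * log pi ]."""
    soft_state_value = (next_action_probs *
                        (torch.min(q1_next, q2_next) - alpha * next_log_probs)).sum(dim=1, keepdim=True)
    return rewards + (1.0 - dones) * discount * soft_state_value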
 
 
agents/actor_critic_agents/TD3.py DELETED
@@ -1,54 +0,0 @@
1
- import torch
2
- import torch.nn.functional as functional
3
- from torch import optim
4
- from agents.Base_Agent import Base_Agent
5
- from .DDPG import DDPG
6
- from exploration_strategies.Gaussian_Exploration import Gaussian_Exploration
7
-
8
- class TD3(DDPG):
9
- """A TD3 Agent from the paper Addressing Function Approximation Error in Actor-Critic Methods (Fujimoto et al. 2018)
10
- https://arxiv.org/abs/1802.09477"""
11
- agent_name = "TD3"
12
-
13
- def __init__(self, config):
14
- DDPG.__init__(self, config)
15
- self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
16
- key_to_use="Critic", override_seed=self.config.seed + 1)
17
- self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
18
- key_to_use="Critic")
19
- Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
20
- self.critic_optimizer_2 = optim.Adam(self.critic_local_2.parameters(),
21
- lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
22
- self.exploration_strategy_critic = Gaussian_Exploration(self.config)
23
-
24
- def compute_critic_values_for_next_states(self, next_states):
25
- """Computes the critic values for next states to be used in the loss for the critic"""
26
- with torch.no_grad():
27
- actions_next = self.actor_target(next_states)
28
- actions_next_with_noise = self.exploration_strategy_critic.perturb_action_for_exploration_purposes({"action": actions_next})
29
- critic_targets_next_1 = self.critic_target(torch.cat((next_states, actions_next_with_noise), 1))
30
- critic_targets_next_2 = self.critic_target_2(torch.cat((next_states, actions_next_with_noise), 1))
31
- critic_targets_next = torch.min(torch.cat((critic_targets_next_1, critic_targets_next_2),1), dim=1)[0].unsqueeze(-1)
32
- return critic_targets_next
33
-
34
- def critic_learn(self, states, actions, rewards, next_states, dones):
35
- """Runs a learning iteration for both the critics"""
36
- critic_targets_next = self.compute_critic_values_for_next_states(next_states)
37
- critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones)
38
-
39
- critic_expected_1 = self.critic_local(torch.cat((states, actions), 1))
40
- critic_expected_2 = self.critic_local_2(torch.cat((states, actions), 1))
41
-
42
- critic_loss_1 = functional.mse_loss(critic_expected_1, critic_targets)
43
- critic_loss_2 = functional.mse_loss(critic_expected_2, critic_targets)
44
-
45
- self.take_optimisation_step(self.critic_optimizer, self.critic_local, critic_loss_1, self.hyperparameters["Critic"]["gradient_clipping_norm"])
46
- self.take_optimisation_step(self.critic_optimizer_2, self.critic_local_2, critic_loss_2,
47
- self.hyperparameters["Critic"]["gradient_clipping_norm"])
48
-
49
- self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"])
50
- self.soft_update_of_target_network(self.critic_local_2, self.critic_target_2, self.hyperparameters["Critic"]["tau"])
51
-
52
-
53
-
54
-
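
TD3's key change to the critic target is visible in compute_critic_values_for_next_states above: the target actor's action is perturbed with Gaussian noise and the smaller of the two target critics' values is used. A compact sketch of that clipped double-Q step (illustrative names):

import torch

def clipped_double_q_target(critic_target_1, critic_target_2, next_states, noisy_next_actions):
    """Takes the element-wise minimum of the two target critics' Q-values."""
    with torch.no_grad():
        q1 = critic_target_1(torch.cat((next_states, noisy_next_actions), 1))
        q2 = critic_target_2(torch.cat((next_states, noisy_next_actions), 1))
        return torch.min(q1, q2)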
 
 
agents/actor_critic_agents/__pycache__/A2C.cpython-39.pyc DELETED
Binary file (1.61 kB)
 
agents/actor_critic_agents/__pycache__/A3C.cpython-39.pyc DELETED
Binary file (9.51 kB)
 
agents/actor_critic_agents/__pycache__/DDPG.cpython-39.pyc DELETED
Binary file (5.81 kB)
 
agents/actor_critic_agents/__pycache__/SAC.cpython-310.pyc DELETED
Binary file (10.6 kB)