diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..5ee0a015d631f032aa906fe6913b100370fad343 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Abubakar Sani Ali + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6a4e6eeb54d178a33487939171cf3ae27ba611cd --- /dev/null +++ b/README.md @@ -0,0 +1,94 @@ +--- +title: Anti Jam +emoji: 😻 +colorFrom: yellow +colorTo: yellow +sdk: streamlit +sdk_version: 1.25.0 +app_file: app.py +pinned: false +license: apache-2.0 +--- + +# Beyond the Anti-Jam: LLM for Zero Touch Networks + +[![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/yourusername/yourrepository/issues) + +![LLM](utilities/LLM_image.png) ![PyTorch](utilities/PyTorch-logo-2.jpg) + +This project explores the integration of Large Language Models (LLMs) with Deep Reinforcement Learning (DRL) to enhance the transparency and interpretability of anti-jamming strategies in Zero Touch Networks (ZTNs). The goal is to provide human-readable explanations for DRL-based decisions, making complex strategies intuitive for network administrators. The project leverages LLMs to generate natural language descriptions for DRL actions based on observed state vectors and rewards. + +## Getting Started + +Follow these instructions to set up and run the project on your local machine for development and testing. + +### Prerequisites + +- Python 3.7 or higher +- PyTorch +- OpenAI Gym +- Matplotlib +- Numpy +- Pandas +- StreamLit + +For specific library versions, please refer to the `requirements.txt` file. + +### Installation + +1. Clone the repository to your local machine. +2. Install the required packages using pip: + + ```bash + pip install -r requirements.txt + + ``` +3. Execute the script: + + ```bash + python3 app.py + ``` + +### Usage + +The primary script trains different DQN agent variants for a specified number of episodes. After training, the agent's performance is evaluated and plotted. Relevant data, such as agent behavior, rewards, throughput, and channel switching times, are saved for further analysis. 
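+
+For example, the results dictionary written by the `Trainer` is a pickle file, so it can be reloaded for further analysis. The snippet below is an illustrative sketch only: `results/data.pkl` is a placeholder path, use whatever path you configure via `file_to_save_data_results`.
+
+```python
+import pickle
+
+# Load the results saved by Trainer.save_obj: one entry per agent name, and for each
+# run a list of [episode scores, rolling scores, episode count, -max rolling score,
+# time taken, episode signals].
+with open("results/data.pkl", "rb") as f:  # placeholder path
+    results = pickle.load(f)
+
+for agent_name, runs in results.items():
+    episode_scores, rolling_scores = runs[0][0], runs[0][1]
+    print(f"{agent_name}: final rolling score {rolling_scores[-1]:.2f} "
+          f"over {len(episode_scores)} episodes")
+```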
+
+#### Repository Structure
+
+The repository is organized as follows:
+
+- **agents**: This directory contains the agent implementations, grouped into actor-critic, DQN, policy gradient, and stochastic policy search agents.
+
+- **environments**: This directory houses the implementation of the RFSpectrum environment in which the agent operates and learns.
+
+- **results**: This directory stores the data and graphs generated during training and evaluation. The `Anti_Jam.py` script is the main entry point for running the training and evaluation process.
+
+- **tests**: This directory can be used to write and execute tests for the codebase.
+
+- **utilities**: This directory contains utility files, including data structures and visual assets.
+
+## License
+
+This project is licensed under the MIT License - see the LICENSE file for details.
+
+## Acknowledgements
+
+This project builds on the following:
+
+- [Deep Reinforcement Learning Algorithms with PyTorch](https://github.com/p-christ/Deep-Reinforcement-Learning-Algorithms-with-PyTorch): This repository provides PyTorch implementations of deep reinforcement learning algorithms and environments.
+
+- **Research Paper**: The implementation is based on the research paper "Beyond the Anti-Jam: Unraveling DRL-based Anti-Jamming Strategy in Zero Touch Networks through Large Language Models", which serves as the theoretical foundation for the project and can be accessed [here](https://arxiv.org/abs/2307.06796).
+
+- **Hugging Face Transformers Library**: This library provides tools for integrating and fine-tuning large language models, enabling natural language understanding and generation.
+
+## Contributing
+
+Contributions to this project are welcome! If you'd like to contribute, please follow these steps:
+
+1. Fork the repository.
+2. Create a new branch for your feature/fix.
+3. Make your changes and commit them with clear messages.
+4. Push your changes to your forked repository.
+5. Submit a pull request, detailing the changes you made and why they should be merged.
+
+Let's work together to improve this project and make it even more effective in countering jamming attacks!
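+
+## Example: Generating an Explanation with an LLM
+
+As described in the introduction, the project uses an LLM to turn DRL decisions into human-readable explanations for network administrators. The exact prompt format and model used by `app.py` are not shown in this patch, so the snippet below is only an illustrative sketch using the Hugging Face `pipeline` API with a placeholder model and made-up state, action, and reward values:
+
+```python
+from transformers import pipeline
+
+# Illustrative only: the actual model and prompt used by app.py are not part of this diff.
+explainer = pipeline("text-generation", model="gpt2")  # placeholder model
+
+state = [0.2, 0.8, 0.1, 0.5]   # example spectrum observation (hypothetical values)
+action = 2                     # channel chosen by the DQN agent
+reward = 1.0                   # reward returned by the environment
+
+prompt = (
+    "The anti-jamming agent observed the state vector "
+    f"{state}, switched to channel {action} and received a reward of {reward}. "
+    "Explain this decision to a network administrator in plain language:"
+)
+
+# Generate a short natural-language explanation from the state/action/reward summary.
+explanation = explainer(prompt, max_new_tokens=60)[0]["generated_text"]
+print(explanation)
+```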
diff --git a/agents/Base_Agent.py b/agents/Base_Agent.py new file mode 100644 index 0000000000000000000000000000000000000000..3a3e143e8307d6b4e497ce3990147971ab966e72 --- /dev/null +++ b/agents/Base_Agent.py @@ -0,0 +1,394 @@ +import logging +import os +import sys +import gym +import random +import numpy as np +import torch +import time +# import tensorflow as tf +from nn_builder.pytorch.NN import NN +# from tensorboardX import SummaryWriter +from torch.optim import optimizer + + +class Base_Agent(object): + + def __init__(self, config): + self.logger = self.setup_logger() + self.debug_mode = config.debug_mode + # if self.debug_mode: self.tensorboard = SummaryWriter() + self.config = config + self.set_random_seeds(config.seed) + self.environment = config.environment + self.environment_title = self.get_environment_title() + self.action_types = "DISCRETE" if self.environment.action_space.dtype == np.int64 else "CONTINUOUS" + self.action_size = int(self.get_action_size()) + self.config.action_size = self.action_size + + self.lowest_possible_episode_score = self.get_lowest_possible_episode_score() + + self.state_size = int(self.get_state_size()) + self.hyperparameters = config.hyperparameters + self.average_score_required_to_win = self.get_score_required_to_win() + self.rolling_score_window = self.get_trials() + # self.max_steps_per_episode = self.environment.spec.max_episode_steps + self.total_episode_score_so_far = 0 + self.game_full_episode_scores = [] + self.game_full_episode_signals = [] + self.rolling_results = [] + self.max_rolling_score_seen = float("-inf") + self.max_episode_score_seen = float("-inf") + self.episode_number = 0 + self.device = "cuda:0" if config.use_GPU else "cpu" + self.visualise_results_boolean = config.visualise_individual_results + self.global_step_number = 0 + self.turn_off_exploration = False if config.training else True + gym.logger.set_level(40) # stops it from printing an unnecessary warning + self.log_game_info() + + def step(self): + """Takes a step in the game. 
This method must be overriden by any agent""" + raise ValueError("Step needs to be implemented by the agent") + + def get_environment_title(self): + """Extracts name of environment from it""" + try: + name = self.environment.unwrapped.id + except AttributeError: + try: + if str(self.environment.unwrapped)[1:11] == "FetchReach": + return "FetchReach" + elif str(self.environment.unwrapped)[1:8] == "AntMaze": + return "AntMaze" + elif str(self.environment.unwrapped)[1:7] == "Hopper": + return "Hopper" + elif str(self.environment.unwrapped)[1:9] == "Walker2d": + return "Walker2d" + else: + name = self.environment.spec.id.split("-")[0] + except AttributeError: + name = str(self.environment.env) + if name[0:10] == "TimeLimit<": name = name[10:] + name = name.split(" ")[0] + if name[0] == "<": name = name[1:] + if name[-3:] == "Env": name = name[:-3] + return name + + def get_lowest_possible_episode_score(self): + """Returns the lowest possible episode score you can get in an environment""" + if self.environment_title == "Taxi": return -800 + return None + + def get_action_size(self): + """Gets the action_size for the gym env into the correct shape for a neural network""" + if "overwrite_action_size" in self.config.__dict__: return self.config.overwrite_action_size + if "action_size" in self.environment.__dict__: return self.environment.action_size + if self.action_types == "DISCRETE": + return self.environment.action_space.n + else: + return self.environment.action_space.shape[0] + + def get_state_size(self): + """Gets the state_size for the gym env into the correct shape for a neural network""" + random_state = self.environment.reset() + if isinstance(random_state, dict): + state_size = random_state["observation"].shape[0] + random_state["desired_goal"].shape[0] + return state_size + else: + return random_state.size + + def get_score_required_to_win(self): + """Gets average score required to win game""" + print("TITLE ", self.environment_title) + if self.environment_title == "FetchReach": return -5 + if self.environment_title in ["AntMaze", "Hopper", "Walker2d"]: + print("Score required to win set to infinity therefore no learning rate annealing will happen") + return float("inf") + try: + return self.environment.unwrapped.reward_threshold + except AttributeError: + try: + return self.environment.spec.reward_threshold + except AttributeError: + return self.environment.unwrapped.spec.reward_threshold + + def get_trials(self): + """Gets the number of trials to average a score over""" + if self.environment_title in ["AntMaze", "FetchReach", "Hopper", "Walker2d", "CartPole"]: return 100 + try: + return self.environment.unwrapped.trials + except AttributeError: + return self.environment.spec.trials + + def setup_logger(self): + """Sets up the logger""" + filename = "Training.log" + try: + if os.path.isfile(filename): + os.remove(filename) + except: + pass + + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + # create a file handler + handler = logging.FileHandler(filename) + handler.setLevel(logging.INFO) + # create a logging format + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + # add the handlers to the logger + logger.addHandler(handler) + return logger + + def log_game_info(self): + """Logs info relating to the game""" + for ix, param in enumerate( + [self.environment_title, self.action_types, self.action_size, self.lowest_possible_episode_score, + self.state_size, self.hyperparameters, 
self.average_score_required_to_win, self.rolling_score_window, + self.device]): + self.logger.info("{} -- {}".format(ix, param)) + + def set_random_seeds(self, random_seed): + """Sets all possible random seeds so results can be reproduced""" + os.environ['PYTHONHASHSEED'] = str(random_seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + torch.manual_seed(random_seed) + # tf.set_random_seed(random_seed) + random.seed(random_seed) + np.random.seed(random_seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(random_seed) + torch.cuda.manual_seed(random_seed) + if hasattr(gym.spaces, 'prng'): + gym.spaces.prng.seed(random_seed) + + def reset_game(self): + """Resets the game information so we are ready to play a new episode""" + self.environment.seed(self.config.seed) + self.state = self.environment.reset() + self.next_state = None + self.action = None + self.reward = None + self.signal = None + self.done = False + self.total_episode_score_so_far = 0 + self.total_episode_signal_so_far = 0 + self.episode_states = [] + self.episode_rewards = [] + self.episode_signals = [] + self.episode_actions = [] + self.episode_next_states = [] + self.episode_dones = [] + self.episode_desired_goals = [] + self.episode_achieved_goals = [] + self.episode_observations = [] + if "exploration_strategy" in self.__dict__.keys(): self.exploration_strategy.reset() + self.logger.info("Reseting game -- New start state {}".format(self.state)) + + def track_episodes_data(self): + """Saves the data from the recent episodes""" + self.episode_states.append(self.state) + self.episode_actions.append(self.action) + self.episode_rewards.append(self.reward) + self.episode_signals.append(self.signal) + self.episode_next_states.append(self.next_state) + self.episode_dones.append(self.done) + + def run_n_episodes(self, num_episodes=None, show_whether_achieved_goal=True, save_and_print_results=True): + """Runs game to completion n times and then summarises results and saves model (if asked to)""" + if num_episodes is None: num_episodes = self.config.num_episodes_to_run + start = time.time() + while self.episode_number < num_episodes: + self.reset_game() + self.step() + if save_and_print_results: self.save_and_print_result() + time_taken = time.time() - start + if show_whether_achieved_goal: self.show_whether_achieved_goal() + if self.config.save_model: self.locally_save_policy() + return self.game_full_episode_scores, self.rolling_results, time_taken, self.game_full_episode_signals + + def conduct_action(self, action): + """Conducts an action in the environment""" + self.next_state, self.reward, self.done, self.signal = self.environment.step(action) + self.total_episode_score_so_far += self.reward + self.total_episode_signal_so_far += self.signal + if self.hyperparameters["clip_rewards"]: self.reward = max(min(self.reward, 1.0), -1.0) + + def save_and_print_result(self): + """Saves and prints results of the game""" + self.save_result() + self.print_rolling_result() + + def save_result(self): + """Saves the result of an episode of the game""" + self.game_full_episode_scores.append(self.total_episode_score_so_far) + self.game_full_episode_signals.append(self.total_episode_signal_so_far) + self.rolling_results.append(np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:])) + self.save_max_result_seen() + + def save_max_result_seen(self): + """Updates the best episode result seen so far""" + if self.game_full_episode_scores[-1] > self.max_episode_score_seen: + 
self.max_episode_score_seen = self.game_full_episode_scores[-1] + + if self.rolling_results[-1] > self.max_rolling_score_seen: + if len(self.rolling_results) > self.rolling_score_window: + self.max_rolling_score_seen = self.rolling_results[-1] + + def print_rolling_result(self): + """Prints out the latest episode results""" + text = """"\r Episode {0}, Score: {3: .2f}, Max score seen: {4: .2f}, Rolling score: {1: .2f}, Max rolling score seen: {2: .2f}""" + sys.stdout.write( + text.format(len(self.game_full_episode_scores), self.rolling_results[-1], self.max_rolling_score_seen, + self.game_full_episode_scores[-1], self.max_episode_score_seen)) + sys.stdout.flush() + + def show_whether_achieved_goal(self): + """Prints out whether the agent achieved the environment target goal""" + index_achieved_goal = self.achieved_required_score_at_index() + print(" ") + if index_achieved_goal == -1: # this means agent never achieved goal + print("\033[91m" + "\033[1m" + + "{} did not achieve required score \n".format(self.agent_name) + + "\033[0m" + "\033[0m") + else: + print("\033[92m" + "\033[1m" + + "{} achieved required score at episode {} \n".format(self.agent_name, index_achieved_goal) + + "\033[0m" + "\033[0m") + + def achieved_required_score_at_index(self): + """Returns the episode at which agent achieved goal or -1 if it never achieved it""" + for ix, score in enumerate(self.rolling_results): + if score > self.average_score_required_to_win: + return ix + return -1 + + def update_learning_rate(self, starting_lr, optimizer): + """Lowers the learning rate according to how close we are to the solution""" + if len(self.rolling_results) > 0: + last_rolling_score = self.rolling_results[-1] + if last_rolling_score > 0.75 * self.average_score_required_to_win: + new_lr = starting_lr / 100.0 + elif last_rolling_score > 0.6 * self.average_score_required_to_win: + new_lr = starting_lr / 20.0 + elif last_rolling_score > 0.5 * self.average_score_required_to_win: + new_lr = starting_lr / 10.0 + elif last_rolling_score > 0.25 * self.average_score_required_to_win: + new_lr = starting_lr / 2.0 + else: + new_lr = starting_lr + for g in optimizer.param_groups: + g['lr'] = new_lr + if random.random() < 0.001: self.logger.info("Learning rate {}".format(new_lr)) + + def enough_experiences_to_learn_from(self): + """Boolean indicated whether there are enough experiences in the memory buffer to learn from""" + return len(self.memory) > self.hyperparameters["batch_size"] + + def save_experience(self, memory=None, experience=None): + """Saves the recent experience to the memory buffer""" + if memory is None: memory = self.memory + if experience is None: experience = self.state, self.action, self.reward, self.next_state, self.done + memory.add_experience(*experience) + + def take_optimisation_step(self, optimizer, network, loss, clipping_norm=None, retain_graph=False): + """Takes an optimisation step by calculating gradients given the loss and then updating the parameters""" + if not isinstance(network, list): network = [network] + optimizer.zero_grad() # reset gradients to 0 + loss.backward(retain_graph=retain_graph) # this calculates the gradients + self.logger.info("Loss -- {}".format(loss.item())) + if self.debug_mode: self.log_gradient_and_weight_information(network, optimizer) + if clipping_norm is not None: + for net in network: + torch.nn.utils.clip_grad_norm_(net.parameters(), + clipping_norm) # clip gradients to help stabilise training + optimizer.step() # this applies the gradients + + def 
log_gradient_and_weight_information(self, network, optimizer): + + # log weight information + total_norm = 0 + for name, param in network.named_parameters(): + param_norm = param.grad.data.norm(2) + total_norm += param_norm.item() ** 2 + total_norm = total_norm ** (1. / 2) + self.logger.info("Gradient Norm {}".format(total_norm)) + + for g in optimizer.param_groups: + learning_rate = g['lr'] + break + self.logger.info("Learning Rate {}".format(learning_rate)) + + def soft_update_of_target_network(self, local_model, target_model, tau): + """Updates the target network in the direction of the local network but by taking a step size + less than one so the target network's parameter values trail the local networks. This helps stabilise training""" + for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): + target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) + + def create_NN(self, input_dim, output_dim, key_to_use=None, override_seed=None, hyperparameters=None): + """Creates a neural network for the agents to use""" + if hyperparameters is None: hyperparameters = self.hyperparameters + if key_to_use: hyperparameters = hyperparameters[key_to_use] + if override_seed: + seed = override_seed + else: + seed = self.config.seed + + default_hyperparameter_choices = {"output_activation": None, "hidden_activations": "relu", "dropout": 0.0, + "initialiser": "default", "batch_norm": False, + "columns_of_data_to_be_embedded": [], + "embedding_dimensions": [], "y_range": ()} + + for key in default_hyperparameter_choices: + if key not in hyperparameters.keys(): + hyperparameters[key] = default_hyperparameter_choices[key] + + return NN(input_dim=input_dim, layers_info=hyperparameters["linear_hidden_units"] + [output_dim], + output_activation=hyperparameters["final_layer_activation"], + batch_norm=hyperparameters["batch_norm"], dropout=hyperparameters["dropout"], + hidden_activations=hyperparameters["hidden_activations"], initialiser=hyperparameters["initialiser"], + columns_of_data_to_be_embedded=hyperparameters["columns_of_data_to_be_embedded"], + embedding_dimensions=hyperparameters["embedding_dimensions"], y_range=hyperparameters["y_range"], + random_seed=seed).to(self.device) + + def turn_on_any_epsilon_greedy_exploration(self): + """Turns off all exploration with respect to the epsilon greedy exploration strategy""" + print("Turning on epsilon greedy exploration") + self.turn_off_exploration = False + + def turn_off_any_epsilon_greedy_exploration(self): + """Turns off all exploration with respect to the epsilon greedy exploration strategy""" + print("Turning off epsilon greedy exploration") + self.turn_off_exploration = True + + def freeze_all_but_output_layers(self, network): + """Freezes all layers except the output layer of a network""" + print("Freezing hidden layers") + for param in network.named_parameters(): + param_name = param[0] + assert "hidden" in param_name or "output" in param_name or "embedding" in param_name, "Name {} of network layers not understood".format( + param_name) + if "output" not in param_name: + param[1].requires_grad = False + + def unfreeze_all_layers(self, network): + """Unfreezes all layers of a network""" + print("Unfreezing all layers") + for param in network.parameters(): + param.requires_grad = True + + @staticmethod + def move_gradients_one_model_to_another(from_model, to_model, set_from_gradients_to_zero=False): + """Copies gradients from from_model to to_model""" + for from_model, to_model in 
zip(from_model.parameters(), to_model.parameters()): + to_model._grad = from_model.grad.clone() + if set_from_gradients_to_zero: from_model._grad = None + + @staticmethod + def copy_model_over(from_model, to_model): + """Copies model parameters from from_model to to_model""" + for to_model, from_model in zip(to_model.parameters(), from_model.parameters()): + to_model.data.copy_(from_model.data.clone()) diff --git a/agents/DQN_agents/DDQN.py b/agents/DQN_agents/DDQN.py new file mode 100644 index 0000000000000000000000000000000000000000..a0c36e58d72e8e5a53071ce313e9eba28d0bb517 --- /dev/null +++ b/agents/DQN_agents/DDQN.py @@ -0,0 +1,18 @@ +from agents.DQN_agents.DQN_With_Fixed_Q_Targets import DQN_With_Fixed_Q_Targets + +class DDQN(DQN_With_Fixed_Q_Targets): + """A double DQN agent""" + agent_name = "DDQN" + + def __init__(self, config): + DQN_With_Fixed_Q_Targets.__init__(self, config) + + def compute_q_values_for_next_states(self, next_states): + """Computes the q_values for next state we will use to create the loss to train the Q network. Double DQN + uses the local index to pick the maximum q_value action and then the target network to calculate the q_value. + The reasoning behind this is that it will help stop the network from overestimating q values""" + max_action_indexes = self.q_network_local(next_states).detach().argmax(1) + Q_targets_next = self.q_network_target(next_states).gather(1, max_action_indexes.unsqueeze(1)) + return Q_targets_next + + diff --git a/agents/DQN_agents/DDQN_With_Prioritised_Experience_Replay.py b/agents/DQN_agents/DDQN_With_Prioritised_Experience_Replay.py new file mode 100644 index 0000000000000000000000000000000000000000..ccdd6d627c6e74d8956b41693186b13692772568 --- /dev/null +++ b/agents/DQN_agents/DDQN_With_Prioritised_Experience_Replay.py @@ -0,0 +1,37 @@ +import torch +import torch.nn.functional as F +from agents.DQN_agents.DDQN import DDQN +from utilities.data_structures.Prioritised_Replay_Buffer import Prioritised_Replay_Buffer + +class DDQN_With_Prioritised_Experience_Replay(DDQN): + """A DQN agent with prioritised experience replay""" + agent_name = "DDQN with Prioritised Replay" + + def __init__(self, config): + DDQN.__init__(self, config) + self.memory = Prioritised_Replay_Buffer(self.hyperparameters, config.seed) + + def learn(self): + """Runs a learning iteration for the Q network after sampling from the replay buffer in a prioritised way""" + sampled_experiences, importance_sampling_weights = self.memory.sample() + states, actions, rewards, next_states, dones = sampled_experiences + loss, td_errors = self.compute_loss_and_td_errors(states, next_states, rewards, actions, dones, importance_sampling_weights) + self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss, self.hyperparameters["gradient_clipping_norm"]) + self.soft_update_of_target_network(self.q_network_local, self.q_network_target, self.hyperparameters["tau"]) + self.memory.update_td_errors(td_errors.squeeze(1)) + + def save_experience(self): + """Saves the latest experience including the td_error""" + max_td_error_in_experiences = self.memory.give_max_td_error() + 1e-9 + self.memory.add_experience(max_td_error_in_experiences, self.state, self.action, self.reward, self.next_state, self.done) + + def compute_loss_and_td_errors(self, states, next_states, rewards, actions, dones, importance_sampling_weights): + """Calculates the loss for the local Q network. 
It weighs each observations loss according to the importance + sampling weights which come from the prioritised replay buffer""" + Q_targets = self.compute_q_targets(next_states, rewards, dones) + Q_expected = self.compute_expected_q_values(states, actions) + loss = F.mse_loss(Q_expected, Q_targets) + loss = loss * importance_sampling_weights + loss = torch.mean(loss) + td_errors = Q_targets.data.cpu().numpy() - Q_expected.data.cpu().numpy() + return loss, td_errors \ No newline at end of file diff --git a/agents/DQN_agents/DQN.py b/agents/DQN_agents/DQN.py new file mode 100644 index 0000000000000000000000000000000000000000..42a14f4a53913581709848b8fbcff7f154543752 --- /dev/null +++ b/agents/DQN_agents/DQN.py @@ -0,0 +1,135 @@ +from collections import Counter + +import torch +import random +import torch.optim as optim +import torch.nn.functional as F +import numpy as np +from agents.Base_Agent import Base_Agent +from exploration_strategies.Epsilon_Greedy_Exploration import Epsilon_Greedy_Exploration +from utilities.data_structures.Replay_Buffer import Replay_Buffer + + +class DQN(Base_Agent): + """A deep Q learning agent""" + agent_name = "DQN" + + def __init__(self, config): + Base_Agent.__init__(self, config) + self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], + config.seed, self.device) + self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size) + self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), + lr=self.hyperparameters["learning_rate"], eps=1e-4) + self.exploration_strategy = Epsilon_Greedy_Exploration(config) + + def reset_game(self): + super(DQN, self).reset_game() + self.update_learning_rate(self.hyperparameters["learning_rate"], self.q_network_optimizer) + + def step(self): + """Runs a step within a game including a learning step if required""" + while not self.done: + self.action = self.pick_action() + self.conduct_action(self.action) + # If we are in training mode + if self.config.training: + if self.time_for_q_network_to_learn(): + for _ in range(self.hyperparameters["learning_iterations"]): + self.learn() + self.save_experience() + self.state = self.next_state # this is to set the state for the next iteration + self.global_step_number += 1 + self.episode_number += 1 + + def pick_action(self, state=None): + """Uses the local Q network and an epsilon greedy policy to pick an action""" + # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add + # a "fake" dimension to make it a mini-batch rather than a single observation + if state is None: state = self.state + if isinstance(state, np.int64) or isinstance(state, int): state = np.array([state]) + state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) + if len(state.shape) < 2: state = state.unsqueeze(0) + if not self.config.training: + self.q_network_local = self.locally_load_policy() + self.q_network_local.eval() # puts network in evaluation mode + with torch.no_grad(): + action_values = self.q_network_local(state) + if self.config.training: + self.q_network_local.train() # puts network back in training mode + action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action_values": action_values, + "turn_off_exploration": self.turn_off_exploration, + "episode_number": self.episode_number}) + self.logger.info("Q values {} -- Action chosen {}".format(action_values, action)) + return action + + def learn(self, experiences=None): + """Runs a 
learning iteration for the Q network""" + if experiences is None: + states, actions, rewards, next_states, dones = self.sample_experiences() # Sample experiences + else: + states, actions, rewards, next_states, dones = experiences + loss = self.compute_loss(states, next_states, rewards, actions, dones) + + actions_list = [action_X.item() for action_X in actions] + + self.logger.info("Action counts {}".format(Counter(actions_list))) + self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss, + self.hyperparameters["gradient_clipping_norm"]) + + def compute_loss(self, states, next_states, rewards, actions, dones): + """Computes the loss required to train the Q network""" + with torch.no_grad(): + Q_targets = self.compute_q_targets(next_states, rewards, dones) + Q_expected = self.compute_expected_q_values(states, actions) + loss = F.mse_loss(Q_expected, Q_targets) + return loss + + def compute_q_targets(self, next_states, rewards, dones): + """Computes the q_targets we will compare to predicted q values to create the loss to train the Q network""" + Q_targets_next = self.compute_q_values_for_next_states(next_states) + Q_targets = self.compute_q_values_for_current_states(rewards, Q_targets_next, dones) + return Q_targets + + def compute_q_values_for_next_states(self, next_states): + """Computes the q_values for next state we will use to create the loss to train the Q network""" + Q_targets_next = self.q_network_local(next_states).detach().max(1)[0].unsqueeze(1) + return Q_targets_next + + def compute_q_values_for_current_states(self, rewards, Q_targets_next, dones): + """Computes the q_values for current state we will use to create the loss to train the Q network""" + Q_targets_current = rewards + (self.hyperparameters["discount_rate"] * Q_targets_next * (1 - dones)) + return Q_targets_current + + def compute_expected_q_values(self, states, actions): + """Computes the expected q_values we will use to create the loss to train the Q network""" + Q_expected = self.q_network_local(states).gather(1, + actions.long()) # must convert actions to long so can be used as index + return Q_expected + + def locally_save_policy(self): + """Saves the policy""" + torch.save(self.q_network_local.state_dict(), + "{}/{}_network.pt".format(self.config.models_dir, self.agent_name)) + + def locally_load_policy(self): + """loads the policy""" + filename = f'{self.config.models_dir}/{self.agent_name}_network.pt' + saved_q_network_local = self.q_network_local + saved_q_network_local.load_state_dict(torch.load(filename)) + return saved_q_network_local + + def time_for_q_network_to_learn(self): + """Returns boolean indicating whether enough steps have been taken for learning to begin and there are + enough experiences in the replay buffer to learn from""" + return self.right_amount_of_steps_taken() and self.enough_experiences_to_learn_from() + + def right_amount_of_steps_taken(self): + """Returns boolean indicating whether enough steps have been taken for learning to begin""" + return self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0 + + def sample_experiences(self): + """Draws a random sample of experience from the memory buffer""" + experiences = self.memory.sample() + states, actions, rewards, next_states, dones = experiences + return states, actions, rewards, next_states, dones diff --git a/agents/DQN_agents/DQN_HER.py b/agents/DQN_agents/DQN_HER.py new file mode 100644 index 0000000000000000000000000000000000000000..9110a385d9f9da61216d8a853a8af12749d57cee --- 
/dev/null +++ b/agents/DQN_agents/DQN_HER.py @@ -0,0 +1,30 @@ +from agents.DQN_agents.DQN import DQN +from agents.HER_Base import HER_Base + +class DQN_HER(HER_Base, DQN): + """DQN algorithm with hindsight experience replay""" + agent_name = "DQN-HER" + def __init__(self, config): + DQN.__init__(self, config) + HER_Base.__init__(self, self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], + self.hyperparameters["HER_sample_proportion"]) + + def step(self): + """Runs a step within a game including a learning step if required""" + while not self.done: + self.action = self.pick_action() + self.conduct_action_in_changeable_goal_envs(self.action) + if self.time_for_q_network_to_learn(): + for _ in range(self.hyperparameters["learning_iterations"]): + self.learn(experiences=self.sample_from_HER_and_Ordinary_Buffer()) + self.track_changeable_goal_episodes_data() + self.save_experience() + if self.done: self.save_alternative_experience() + self.state_dict = self.next_state_dict # this is to set the state for the next iteration + self.state = self.next_state + self.global_step_number += 1 + self.episode_number += 1 + + def enough_experiences_to_learn_from(self): + """Returns booleans indicating whether there are enough experiences in the two replay buffers to learn from""" + return len(self.memory) > self.ordinary_buffer_batch_size and len(self.HER_memory) > self.HER_buffer_batch_size \ No newline at end of file diff --git a/agents/DQN_agents/DQN_With_Fixed_Q_Targets.py b/agents/DQN_agents/DQN_With_Fixed_Q_Targets.py new file mode 100644 index 0000000000000000000000000000000000000000..7cdd492189aab9473ccaae6fe6aca21c8a848a0c --- /dev/null +++ b/agents/DQN_agents/DQN_With_Fixed_Q_Targets.py @@ -0,0 +1,23 @@ +import copy + +from agents.Base_Agent import Base_Agent +from agents.DQN_agents.DQN import DQN + +class DQN_With_Fixed_Q_Targets(DQN): + """A DQN agent that uses an older version of the q_network as the target network""" + agent_name = "DQN with Fixed Q Targets" + def __init__(self, config): + DQN.__init__(self, config) + self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size) + Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target) + + def learn(self, experiences=None): + """Runs a learning iteration for the Q network""" + super(DQN_With_Fixed_Q_Targets, self).learn(experiences=experiences) + self.soft_update_of_target_network(self.q_network_local, self.q_network_target, + self.hyperparameters["tau"]) # Update the target network + + def compute_q_values_for_next_states(self, next_states): + """Computes the q_values for next state we will use to create the loss to train the Q network""" + Q_targets_next = self.q_network_target(next_states).detach().max(1)[0].unsqueeze(1) + return Q_targets_next \ No newline at end of file diff --git a/agents/DQN_agents/Dueling_DDQN.py b/agents/DQN_agents/Dueling_DDQN.py new file mode 100644 index 0000000000000000000000000000000000000000..793ccba92570de0be1ea6d27831638145d3bf8b9 --- /dev/null +++ b/agents/DQN_agents/Dueling_DDQN.py @@ -0,0 +1,64 @@ +import torch +from torch import optim +from agents.Base_Agent import Base_Agent +from agents.DQN_agents.DDQN import DDQN + +class Dueling_DDQN(DDQN): + """A dueling double DQN agent as described in the paper http://proceedings.mlr.press/v48/wangf16.pdf""" + agent_name = "Dueling DDQN" + + def __init__(self, config): + DDQN.__init__(self, config) + self.q_network_local = self.create_NN(input_dim=self.state_size, 
output_dim=self.action_size + 1) + self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4) + self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1) + Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target) + + def pick_action(self, state=None): + """Uses the local Q network and an epsilon greedy policy to pick an action""" + # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add + # a "fake" dimension to make it a mini-batch rather than a single observation + if state is None: state = self.state + state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) + if len(state.shape) < 2: state = state.unsqueeze(0) + self.q_network_local.eval() + with torch.no_grad(): + action_values = self.q_network_local(state) + action_values = action_values[:, :-1] #because we treat the last output element as state-value and rest as advantages + self.q_network_local.train() + action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action_values": action_values, + "turn_off_exploration": self.turn_off_exploration, + "episode_number": self.episode_number}) + return action + + def compute_q_values_for_next_states(self, next_states): + """Computes the q_values for next state we will use to create the loss to train the Q network. Double DQN + uses the local index to pick the maximum q_value action and then the target network to calculate the q_value. + The reasoning behind this is that it will help stop the network from overestimating q values""" + max_action_indexes = self.q_network_local(next_states)[:, :-1].detach().argmax(1) + duelling_network_output = self.q_network_target(next_states) + q_values = self.calculate_duelling_q_values(duelling_network_output) + Q_targets_next = q_values.gather(1, max_action_indexes.unsqueeze(1)) + return Q_targets_next + + def calculate_duelling_q_values(self, duelling_q_network_output): + """Calculates the q_values using the duelling network architecture. 
This is equation (9) in the paper + referenced at the top of the class""" + state_value = duelling_q_network_output[:, -1] + avg_advantage = torch.mean(duelling_q_network_output[:, :-1], dim=1) + q_values = state_value.unsqueeze(1) + (duelling_q_network_output[:, :-1] - avg_advantage.unsqueeze(1)) + return q_values + + def compute_expected_q_values(self, states, actions): + """Computes the expected q_values we will use to create the loss to train the Q network""" + duelling_network_output = self.q_network_local(states) + q_values = self.calculate_duelling_q_values(duelling_network_output) + Q_expected = q_values.gather(1, actions.long()) + return Q_expected + + + + + + + diff --git a/agents/DQN_agents/__init__.py b/agents/DQN_agents/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c7df41cea8f9e72cc7e67b84e34436626f9b5875 --- /dev/null +++ b/agents/DQN_agents/__init__.py @@ -0,0 +1 @@ +import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__))) \ No newline at end of file diff --git a/agents/DQN_agents/__pycache__/DDQN.cpython-310.pyc b/agents/DQN_agents/__pycache__/DDQN.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6f9ed9115b85f14fa5a4276216ecbe574b3a894 Binary files /dev/null and b/agents/DQN_agents/__pycache__/DDQN.cpython-310.pyc differ diff --git a/agents/DQN_agents/__pycache__/DDQN.cpython-38.pyc b/agents/DQN_agents/__pycache__/DDQN.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee18712cc33c41df248277bbc0d82b9d1dd43468 Binary files /dev/null and b/agents/DQN_agents/__pycache__/DDQN.cpython-38.pyc differ diff --git a/agents/DQN_agents/__pycache__/DDQN.cpython-39.pyc b/agents/DQN_agents/__pycache__/DDQN.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82fc5b3144db3043a942fa053f58edff0970aa0e Binary files /dev/null and b/agents/DQN_agents/__pycache__/DDQN.cpython-39.pyc differ diff --git a/agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-310.pyc b/agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9d151469bb24d1c762d1f5d08b49817f295428c Binary files /dev/null and b/agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-310.pyc differ diff --git a/agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-39.pyc b/agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1919f0b6c6ed2d271e57516f86ceb3573148642 Binary files /dev/null and b/agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-39.pyc differ diff --git a/agents/DQN_agents/__pycache__/DQN.cpython-310.pyc b/agents/DQN_agents/__pycache__/DQN.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4dec91f1c1eb1d3102b538f4aedb1100ce0d9db5 Binary files /dev/null and b/agents/DQN_agents/__pycache__/DQN.cpython-310.pyc differ diff --git a/agents/DQN_agents/__pycache__/DQN.cpython-39.pyc b/agents/DQN_agents/__pycache__/DQN.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3d00fa11df9efe5c1b022aa417ad28e91f18341 Binary files /dev/null and b/agents/DQN_agents/__pycache__/DQN.cpython-39.pyc differ diff --git a/agents/DQN_agents/__pycache__/DQN_HER.cpython-310.pyc b/agents/DQN_agents/__pycache__/DQN_HER.cpython-310.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..300b859fd6eb005b9bbcfab0b213e0ed9fbac2c8 Binary files /dev/null and b/agents/DQN_agents/__pycache__/DQN_HER.cpython-310.pyc differ diff --git a/agents/DQN_agents/__pycache__/DQN_HER.cpython-39.pyc b/agents/DQN_agents/__pycache__/DQN_HER.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..356b2570c72409ff52cec8f6129f7faf54ce490b Binary files /dev/null and b/agents/DQN_agents/__pycache__/DQN_HER.cpython-39.pyc differ diff --git a/agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-310.pyc b/agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79bfe6f815a8db64db78a51f8a7631dd3e2dc02b Binary files /dev/null and b/agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-310.pyc differ diff --git a/agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-38.pyc b/agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..925f9d836942ebc5976a5ed7d45cbea099674ff0 Binary files /dev/null and b/agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-38.pyc differ diff --git a/agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-39.pyc b/agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7261847d33c284b8077e86e8140a2489cc1df93 Binary files /dev/null and b/agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-39.pyc differ diff --git a/agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-310.pyc b/agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..856c7eab40e1df46bd5fef6d7628c1e0c1dd722a Binary files /dev/null and b/agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-310.pyc differ diff --git a/agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-39.pyc b/agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be5f0bdfdad29674fa27769e4fe127b02a0553c3 Binary files /dev/null and b/agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-39.pyc differ diff --git a/agents/DQN_agents/__pycache__/__init__.cpython-310.pyc b/agents/DQN_agents/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdc109ac604a86c895ed32ec4246c6848c6a0b6f Binary files /dev/null and b/agents/DQN_agents/__pycache__/__init__.cpython-310.pyc differ diff --git a/agents/DQN_agents/__pycache__/__init__.cpython-38.pyc b/agents/DQN_agents/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18974bbde85c27bec71d0f514d9d9e5316720648 Binary files /dev/null and b/agents/DQN_agents/__pycache__/__init__.cpython-38.pyc differ diff --git a/agents/DQN_agents/__pycache__/__init__.cpython-39.pyc b/agents/DQN_agents/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd6717db4458a69814c11ad8d8a26ac77e2c89f4 Binary files /dev/null and b/agents/DQN_agents/__pycache__/__init__.cpython-39.pyc differ diff --git a/agents/HER_Base.py b/agents/HER_Base.py new file mode 100644 index 0000000000000000000000000000000000000000..f7de66c2a7e61ac057af894f891b3ef8651ffb06 --- /dev/null +++ b/agents/HER_Base.py @@ -0,0 +1,100 @@ +import torch +import numpy as np +from 
utilities.data_structures.Replay_Buffer import Replay_Buffer +from utilities.Utility_Functions import abstract + +@abstract +class HER_Base(object): + """Contains methods needed to turn an algorithm into a hindsight experience replay (HER) algorithm""" + def __init__(self, buffer_size, batch_size, HER_sample_proportion): + self.HER_memory = Replay_Buffer(buffer_size, batch_size, self.config.seed) + self.ordinary_buffer_batch_size = int(batch_size * (1.0 - HER_sample_proportion)) + self.HER_buffer_batch_size = batch_size - self.ordinary_buffer_batch_size + + def reset_game(self): + """Resets the game information so we are ready to play a new episode""" + self.state_dict = self.environment.reset() + self.observation = self.state_dict["observation"] + self.desired_goal = self.state_dict["desired_goal"] + self.achieved_goal = self.state_dict["achieved_goal"] + + self.state = self.create_state_from_observation_and_desired_goal(self.observation, self.desired_goal) + self.next_state = None + self.action = None + self.reward = None + self.done = False + + self.episode_states = [] + self.episode_rewards = [] + self.episode_actions = [] + self.episode_next_states = [] + self.episode_dones = [] + + self.episode_desired_goals = [] + self.episode_achieved_goals = [] + self.episode_observations = [] + + self.episode_next_desired_goals = [] + self.episode_next_achieved_goals = [] + self.episode_next_observations = [] + + self.total_episode_score_so_far = 0 + + def track_changeable_goal_episodes_data(self): + """Saves the data from the recent episodes in a way compatible with changeable goal environments""" + self.episode_rewards.append(self.reward) + self.episode_actions.append(self.action) + self.episode_dones.append(self.done) + + self.episode_states.append(self.state) + self.episode_next_states.append(self.next_state) + + self.episode_desired_goals.append(self.state_dict["desired_goal"]) + self.episode_achieved_goals.append(self.state_dict["achieved_goal"]) + self.episode_observations.append(self.state_dict["observation"]) + + self.episode_next_desired_goals.append(self.next_state_dict["desired_goal"]) + self.episode_next_achieved_goals.append(self.next_state_dict["achieved_goal"]) + self.episode_next_observations.append(self.next_state_dict["observation"]) + + def conduct_action_in_changeable_goal_envs(self, action): + """Adapts conduct_action from base agent so that can handle changeable goal environments""" + self.next_state_dict, self.reward, self.done, _ = self.environment.step(action) + self.total_episode_score_so_far += self.reward + if self.hyperparameters["clip_rewards"]: + self.reward = max(min(self.reward, 1.0), -1.0) + self.observation = self.next_state_dict["observation"] + self.desired_goal = self.next_state_dict["desired_goal"] + self.achieved_goal = self.next_state_dict["achieved_goal"] + self.next_state = self.create_state_from_observation_and_desired_goal(self.observation, self.desired_goal) + + + def create_state_from_observation_and_desired_goal(self, observation, desired_goal): + return np.concatenate((observation, desired_goal)) + + def save_alternative_experience(self): + """Saves the experiences as if the final state visited in the episode was the goal state""" + new_goal = self.achieved_goal + new_states = [self.create_state_from_observation_and_desired_goal(observation, new_goal) for observation in self.episode_observations] + new_next_states = [self.create_state_from_observation_and_desired_goal(observation, new_goal) for observation in + self.episode_next_observations] + 
new_rewards = [self.environment.compute_reward(next_achieved_goal, new_goal, None) for next_achieved_goal in self.episode_next_achieved_goals] + + if self.hyperparameters["clip_rewards"]: + new_rewards = [max(min(reward, 1.0), -1.0) for reward in new_rewards] + + self.HER_memory.add_experience(new_states, self.episode_actions, new_rewards, new_next_states, self.episode_dones) + + def sample_from_HER_and_Ordinary_Buffer(self): + """Samples from the ordinary replay buffer and HER replay buffer according to a proportion specified in config""" + states, actions, rewards, next_states, dones = self.memory.sample(self.ordinary_buffer_batch_size) + HER_states, HER_actions, HER_rewards, HER_next_states, HER_dones = self.HER_memory.sample(self.HER_buffer_batch_size) + + states = torch.cat((states, HER_states)) + actions = torch.cat((actions, HER_actions)) + rewards = torch.cat((rewards, HER_rewards)) + next_states = torch.cat((next_states, HER_next_states)) + dones = torch.cat((dones, HER_dones)) + return states, actions, rewards, next_states, dones + + diff --git a/agents/Trainer.py b/agents/Trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..56d7f252d2540ac385cb24130ea70f9ab8ae580e --- /dev/null +++ b/agents/Trainer.py @@ -0,0 +1,304 @@ +import copy +import random +import pickle +import os +import gym +from gym import wrappers +import numpy as np +import matplotlib.pyplot as plt + +class Trainer(object): + """Runs games for given agents. Optionally will visualise and save the results""" + def __init__(self, config, agents): + self.config = config + self.agents = agents + self.agent_to_agent_group = self.create_agent_to_agent_group_dictionary() + self.agent_to_color_group = self.create_agent_to_color_dictionary() + self.results = None + self.signals_result = None + self.colors = ["red", "blue", "green", "orange", "yellow", "purple"] + self.colour_ix = 0 + self.y_limits = None + + def create_agent_to_agent_group_dictionary(self): + """Creates a dictionary that maps an agent to their wider agent group""" + agent_to_agent_group_dictionary = { + "DQN": "DQN_Agents", + "DQN-HER": "DQN_Agents", + "DDQN": "DQN_Agents", + "DDQN with Prioritised Replay": "DQN_Agents", + "DQN with Fixed Q Targets": "DQN_Agents", + "Duelling DQN": "DQN_Agents", + "PPO": "Policy_Gradient_Agents", + "REINFORCE": "Policy_Gradient_Agents", + "Genetic_Agent": "Stochastic_Policy_Search_Agents", + "Hill Climbing": "Stochastic_Policy_Search_Agents", + "DDPG": "Actor_Critic_Agents", + "DDPG-HER": "Actor_Critic_Agents", + "TD3": "Actor_Critic_Agents", + "A2C": "Actor_Critic_Agents", + "A3C": "Actor_Critic_Agents", + "h-DQN": "h_DQN", + "SNN-HRL": "SNN_HRL", + "HIRO": "HIRO", + "SAC": "Actor_Critic_Agents", + "HRL": "HRL", + "Model_HRL": "HRL", + "DIAYN": "DIAYN", + "Dueling DDQN": "DQN_Agents" + } + return agent_to_agent_group_dictionary + + def create_agent_to_color_dictionary(self): + """Creates a dictionary that maps an agent to a hex color (for plotting purposes) + See https://en.wikipedia.org/wiki/Web_colors and https://htmlcolorcodes.com/ for hex colors""" + agent_to_color_dictionary = { + "DQN": "#0000FF", + "DQN with Fixed Q Targets": "#1F618D", + "DDQN": "#2980B9", + "DDQN with Prioritised Replay": "#7FB3D5", + "Dueling DDQN": "#22DAF3", + "PPO": "#5B2C6F", + "DDPG": "#800000", + "DQN-HER": "#008000", + "DDPG-HER": "#008000", + "TD3": "#E74C3C", + "h-DQN": "#D35400", + "SNN-HRL": "#800000", + "A3C": "#E74C3C", + "A2C": "#F1948A", + "SAC": "#1C2833", + "DIAYN": "#F322CD", + "HRL": "#0E0F0F" + } + 
return agent_to_color_dictionary + + def run_games_for_agents(self): + """Run a set of games for each agent. Optionally visualising and/or saving the results""" + self.results = self.create_object_to_store_results() + self.signals_result = self.create_object_to_store_results() + for agent_number, agent_class in enumerate(self.agents): + agent_name = agent_class.agent_name + self.run_games_for_agent(agent_number + 1, agent_class) + if self.config.visualise_overall_agent_results: + agent_rolling_score_results = [results[1] for results in self.results[agent_name]] + self.visualise_overall_agent_results(agent_rolling_score_results, agent_name, show_mean_and_std_range=True, y_limits=self.y_limits) + if self.config.file_to_save_data_results: self.save_obj(self.results, self.config.file_to_save_data_results) + if self.config.file_to_save_results_graph: plt.savefig(self.config.file_to_save_results_graph, bbox_inches="tight") + plt.show() + return self.results + + def create_object_to_store_results(self): + """Creates a dictionary that we will store the results in if it doesn't exist, otherwise it loads it up""" + if self.config.overwrite_existing_results_file or not self.config.file_to_save_data_results or not os.path.isfile(self.config.file_to_save_data_results): + results = {} + else: results = self.load_obj(self.config.file_to_save_data_results) + return results + + def run_games_for_agent(self, agent_number, agent_class): + """Runs a set of games for a given agent, saving the results in self.results""" + agent_results = [] + agent_name = agent_class.agent_name + agent_group = self.agent_to_agent_group[agent_name] + agent_round = 1 + for run in range(self.config.runs_per_agent): + agent_config = copy.deepcopy(self.config) + + if self.environment_has_changeable_goals(agent_config.environment) and self.agent_cant_handle_changeable_goals_without_flattening(agent_name): + print("Flattening changeable-goal environment for agent {}".format(agent_name)) + agent_config.environment = gym.wrappers.FlattenDictWrapper(agent_config.environment, + dict_keys=["observation", "desired_goal"]) + + if self.config.randomise_random_seed: agent_config.seed = random.randint(0, 2**32 - 2) + agent_config.hyperparameters = agent_config.hyperparameters[agent_group] + print("AGENT NAME: {}".format(agent_name)) + print("\033[1m" + "{}.{}: {}".format(agent_number, agent_round, agent_name) + "\033[0m", flush=True) + agent = agent_class(agent_config) + self.environment_name = agent.environment_title + print(agent.hyperparameters) + print("RANDOM SEED " , agent_config.seed) + game_scores, rolling_scores, time_taken, game_signals = agent.run_n_episodes() + print("Time taken: {}".format(time_taken), flush=True) + self.print_two_empty_lines() + agent_results.append([game_scores, rolling_scores, len(rolling_scores), -1 * max(rolling_scores), time_taken, game_signals]) + if self.config.visualise_individual_results: + self.visualise_overall_agent_results([rolling_scores], agent_name, show_each_run=True, y_limits=self.y_limits) + plt.show() + agent_round += 1 + self.results[agent_name] = agent_results + + def environment_has_changeable_goals(self, env): + """Determines whether environment is such that for each episode there is a different goal or not""" + return isinstance(env.reset(), dict) + + def agent_cant_handle_changeable_goals_without_flattening(self, agent_name): + """Boolean indicating whether the agent is set up to handle changeable goals""" + return "HER" not in agent_name + + def visualise_overall_agent_results(self, 
agent_results, agent_name, show_mean_and_std_range=False, show_each_run=False, + color=None, ax=None, title=None, y_limits=None): + """Visualises the results for one agent""" + assert isinstance(agent_results, list), "agent_results must be a list of lists, 1 set of results per list" + assert isinstance(agent_results[0], list), "agent_results must be a list of lists, 1 set of results per list" + assert bool(show_mean_and_std_range) ^ bool(show_each_run), "either show_mean_and_std_range or show_each_run must be true" + if not ax: ax = plt.gca() + if not color: color = self.agent_to_color_group[agent_name] + if show_mean_and_std_range: + mean_minus_x_std, mean_results, mean_plus_x_std = self.get_mean_and_standard_deviation_difference_results(agent_results) + x_vals = list(range(len(mean_results))) + ax.plot(x_vals, mean_results, label=agent_name, color=color) + ax.plot(x_vals, mean_plus_x_std, color=color, alpha=0.1) + ax.plot(x_vals, mean_minus_x_std, color=color, alpha=0.1) + ax.fill_between(x_vals, y1=mean_minus_x_std, y2=mean_plus_x_std, alpha=0.1, color=color) + else: + for ix, result in enumerate(agent_results): + x_vals = list(range(len(agent_results[0]))) + plt.plot(x_vals, result, label=agent_name + "_{}".format(ix+1), color=color) + color = self.get_next_color() + + ax.set_facecolor('xkcd:white') + + # Shrink current axis's height by 10% on the bottom + box = ax.get_position() + ax.set_position([box.x0, box.y0 + box.height * 0.05, + box.width, box.height * 0.95]) + + # Put a legend below current axis + ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), + fancybox=True, shadow=True, ncol=3) + + if not title: title = self.environment_name + + ax.set_title(title, fontsize=15, fontweight='bold') + ax.set_ylabel('Rolling Episode Scores') + ax.set_xlabel('Episode Number') + self.hide_spines(ax, ['right', 'top']) + ax.set_xlim([0, x_vals[-1]]) + + if y_limits is None: y_min, y_max = self.get_y_limits(agent_results) + else: y_min, y_max = y_limits + + ax.set_ylim([y_min, y_max]) + + if self.config.show_solution_score: + self.draw_horizontal_line_with_label(ax, y_value=self.config.environment.get_score_to_win(), x_min=0, + x_max=self.config.num_episodes_to_run * 1.02, label="Target \n score") + + def get_y_limits(self, results): + """Extracts the minimum and maximum seen y_values from a set of results""" + min_result = float("inf") + max_result = float("-inf") + for result in results: + temp_max = np.max(result) + temp_min = np.min(result) + if temp_max > max_result: + max_result = temp_max + if temp_min < min_result: + min_result = temp_min + return min_result, max_result + + def get_next_color(self): + """Gets the next color in list self.colors. 
If it gets to the end then it starts from beginning""" + self.colour_ix += 1 + if self.colour_ix >= len(self.colors): self.colour_ix = 0 + color = self.colors[self.colour_ix] + return color + + def get_mean_and_standard_deviation_difference_results(self, results): + """From a list of lists of agent results it extracts the mean results and the mean results plus or minus + some multiple of the standard deviation""" + def get_results_at_a_time_step(results, timestep): + results_at_a_time_step = [result[timestep] for result in results] + return results_at_a_time_step + def get_standard_deviation_at_time_step(results, timestep): + results_at_a_time_step = [result[timestep] for result in results] + return np.std(results_at_a_time_step) + mean_results = [np.mean(get_results_at_a_time_step(results, timestep)) for timestep in range(len(results[0]))] + mean_minus_x_std = [mean_val - self.config.standard_deviation_results * get_standard_deviation_at_time_step(results, timestep) for + timestep, mean_val in enumerate(mean_results)] + mean_plus_x_std = [mean_val + self.config.standard_deviation_results * get_standard_deviation_at_time_step(results, timestep) for + timestep, mean_val in enumerate(mean_results)] + return mean_minus_x_std, mean_results, mean_plus_x_std + + def hide_spines(self, ax, spines_to_hide): + """Hides spines on a matplotlib axis""" + for spine in spines_to_hide: + ax.spines[spine].set_visible(False) + + def ignore_points_after_game_solved(self, mean_minus_x_std, mean_results, mean_plus_x_std): + """Removes the datapoints after the mean result achieves the score required to solve the game""" + for ix in range(len(mean_results)): + if mean_results[ix] >= self.config.environment.get_score_to_win(): + break + return mean_minus_x_std[:ix], mean_results[:ix], mean_plus_x_std[:ix] + + def draw_horizontal_line_with_label(self, ax, y_value, x_min, x_max, label): + """Draws a dotted horizontal line on the given image at the given point and with the given label""" + ax.hlines(y=y_value, xmin=x_min, xmax=x_max, + linewidth=2, color='k', linestyles='dotted', alpha=0.5) + ax.text(x_max, y_value * 0.965, label) + + def print_two_empty_lines(self): + print("-----------------------------------------------------------------------------------") + print("-----------------------------------------------------------------------------------") + print(" ") + + def save_obj(self, obj, name): + """Saves given object as a pickle file""" + if name[-4:] != ".pkl": + name += ".pkl" + with open(name, 'wb') as f: + pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) + + def load_obj(self, name): + """Loads a pickle file object""" + with open(name, 'rb') as f: + return pickle.load(f) + + def visualise_preexisting_results(self, save_image_path=None, data_path=None, colors=None, show_image=True, ax=None, + title=None, y_limits=None): + """Visualises saved data results and then optionally saves the image""" + if not data_path: preexisting_results = self.create_object_to_store_results() + else: preexisting_results = self.load_obj(data_path) + for ix, agent in enumerate(list(preexisting_results.keys())): + agent_rolling_score_results = [results[1] for results in preexisting_results[agent]] + if colors: color = colors[ix] + else: color = None + self.visualise_overall_agent_results(agent_rolling_score_results, agent, show_mean_and_std_range=True, + color=color, ax=ax, title=title, y_limits=y_limits) + if save_image_path: plt.savefig(save_image_path, bbox_inches="tight") + if show_image: plt.show() + + def 
visualise_set_of_preexisting_results(self, results_data_paths, save_image_path=None, show_image=True, plot_titles=None, + y_limits=[None,None]): + """Visualises a set of preexisting results on 1 plot by making subplots""" + assert isinstance(results_data_paths, list), "all_results must be a list of data paths" + + num_figures = len(results_data_paths) + col_width = 15 + row_height = 6 + + if num_figures <= 2: + fig, axes = plt.subplots(1, num_figures, figsize=(col_width, row_height )) + elif num_figures <= 4: + fig, axes = plt.subplots(2, num_figures, figsize=(row_height, col_width)) + else: + raise ValueError("Need to tell this method how to deal with more than 4 plots") + for ax_ix in range(len(results_data_paths)): + self.visualise_preexisting_results(show_image=False, data_path=results_data_paths[ax_ix], ax=axes[ax_ix], + title=plot_titles[ax_ix], y_limits=y_limits[ax_ix]) + fig.tight_layout() + fig.subplots_adjust(bottom=0.25) + + if save_image_path: plt.savefig(save_image_path) #, bbox_inches="tight") + if show_image: plt.show() + + # ax.imshow(z, aspect="auto") + + + + + + + + diff --git a/agents/__init__.py b/agents/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c7df41cea8f9e72cc7e67b84e34436626f9b5875 --- /dev/null +++ b/agents/__init__.py @@ -0,0 +1 @@ +import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__))) \ No newline at end of file diff --git a/agents/__pycache__/Base_Agent.cpython-310.pyc b/agents/__pycache__/Base_Agent.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a898a8a38a02bf46199849c1f6f3dfd08ab373bd Binary files /dev/null and b/agents/__pycache__/Base_Agent.cpython-310.pyc differ diff --git a/agents/__pycache__/Base_Agent.cpython-38.pyc b/agents/__pycache__/Base_Agent.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d8ddf73e6d6144f852f5187bef49408d1c58e83 Binary files /dev/null and b/agents/__pycache__/Base_Agent.cpython-38.pyc differ diff --git a/agents/__pycache__/Base_Agent.cpython-39.pyc b/agents/__pycache__/Base_Agent.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a22ca47022133db818b197243c93ee896eda7740 Binary files /dev/null and b/agents/__pycache__/Base_Agent.cpython-39.pyc differ diff --git a/agents/__pycache__/HER_Base.cpython-310.pyc b/agents/__pycache__/HER_Base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df50f4af285706e7f34bbd093c1e5832499eeaca Binary files /dev/null and b/agents/__pycache__/HER_Base.cpython-310.pyc differ diff --git a/agents/__pycache__/HER_Base.cpython-39.pyc b/agents/__pycache__/HER_Base.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26e4805586d5ee609dca8237c1ccceb076e73a51 Binary files /dev/null and b/agents/__pycache__/HER_Base.cpython-39.pyc differ diff --git a/agents/__pycache__/Trainer.cpython-310.pyc b/agents/__pycache__/Trainer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3deb25119ccd90ddffb77173dd5c4f43f026b02 Binary files /dev/null and b/agents/__pycache__/Trainer.cpython-310.pyc differ diff --git a/agents/__pycache__/Trainer.cpython-39.pyc b/agents/__pycache__/Trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60a56d5f33bc5ee88a3642bc9b2afdaa079cfa3a Binary files /dev/null and b/agents/__pycache__/Trainer.cpython-39.pyc differ diff --git a/agents/__pycache__/__init__.cpython-310.pyc 
b/agents/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2188918c014a02be37e6afedccb7f9e8a2e0da8 Binary files /dev/null and b/agents/__pycache__/__init__.cpython-310.pyc differ diff --git a/agents/__pycache__/__init__.cpython-38.pyc b/agents/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9915ce21a539284c94d5b6bc8ab24663ed4bae9 Binary files /dev/null and b/agents/__pycache__/__init__.cpython-38.pyc differ diff --git a/agents/__pycache__/__init__.cpython-39.pyc b/agents/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c99960406ec57b5ac1f5fa1af4062ca3c081ef57 Binary files /dev/null and b/agents/__pycache__/__init__.cpython-39.pyc differ diff --git a/agents/actor_critic_agents/A2C.py b/agents/actor_critic_agents/A2C.py new file mode 100644 index 0000000000000000000000000000000000000000..24f1515a39f3993d3a9b11665fc00d97a3f2a712 --- /dev/null +++ b/agents/actor_critic_agents/A2C.py @@ -0,0 +1,25 @@ +from agents.actor_critic_agents.A3C import A3C + +class A2C(A3C): + """Synchronous version of A2C algorithm from deepmind paper https://arxiv.org/pdf/1602.01783.pdf. The only + difference between this and the A3C is that gradient updates get done in a batch rather than 1 by 1 as the gradients + come in""" + agent_name = "A2C" + def __init__(self, config): + super(A2C, self).__init__(config) + + def update_shared_model(self, gradient_updates_queue): + """Worker that updates the shared model with gradients as they get put into the queue""" + while True: + gradients_seen = 0 + while gradients_seen < self.worker_processes: + if gradients_seen == 0: + gradients = gradient_updates_queue.get() + else: + new_grads = gradient_updates_queue.get() + gradients = [grad + new_grad for grad, new_grad in zip(gradients, new_grads)] + gradients_seen += 1 + self.actor_critic_optimizer.zero_grad() + for grads, params in zip(gradients, self.actor_critic.parameters()): + params._grad = grads + self.actor_critic_optimizer.step() \ No newline at end of file diff --git a/agents/actor_critic_agents/A3C.py b/agents/actor_critic_agents/A3C.py new file mode 100644 index 0000000000000000000000000000000000000000..966b6cf34fde7701657214c354b463760f3ebc45 --- /dev/null +++ b/agents/actor_critic_agents/A3C.py @@ -0,0 +1,229 @@ +import copy +import random +import time +import numpy as np +import torch +from torch import multiprocessing +from torch.multiprocessing import Queue +from torch.optim import Adam +from agents.Base_Agent import Base_Agent +from utilities.Utility_Functions import create_actor_distribution, SharedAdam + +class A3C(Base_Agent): + """Actor critic A3C algorithm from deepmind paper https://arxiv.org/pdf/1602.01783.pdf""" + agent_name = "A3C" + def __init__(self, config): + super(A3C, self).__init__(config) + self.num_processes = multiprocessing.cpu_count() + self.worker_processes = max(1, self.num_processes - 2) + self.actor_critic = self.create_NN(input_dim=self.state_size, output_dim=[self.action_size, 1]) + self.actor_critic_optimizer = SharedAdam(self.actor_critic.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4) + + def run_n_episodes(self): + """Runs game to completion n times and then summarises results and saves model (if asked to)""" + start = time.time() + results_queue = Queue() + gradient_updates_queue = Queue() + episode_number = multiprocessing.Value('i', 0) + self.optimizer_lock = multiprocessing.Lock() + 
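# Descriptive note on the process layout below: each Actor_Critic_Worker rolls out
+        # episodes with its own copy of the network and pushes gradients onto
+        # gradient_updates_queue, while update_shared_model runs as a separate process and
+        # applies those gradients to the shared actor_critic under optimizer_lock. +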
episodes_per_process = int(self.config.num_episodes_to_run / self.worker_processes) + 1 + processes = [] + self.actor_critic.share_memory() + self.actor_critic_optimizer.share_memory() + + optimizer_worker = multiprocessing.Process(target=self.update_shared_model, args=(gradient_updates_queue,)) + optimizer_worker.start() + + for process_num in range(self.worker_processes): + worker = Actor_Critic_Worker(process_num, copy.deepcopy(self.environment), self.actor_critic, episode_number, self.optimizer_lock, + self.actor_critic_optimizer, self.config, episodes_per_process, + self.hyperparameters["epsilon_decay_rate_denominator"], + self.action_size, self.action_types, + results_queue, copy.deepcopy(self.actor_critic), gradient_updates_queue) + worker.start() + processes.append(worker) + self.print_results(episode_number, results_queue) + for worker in processes: + worker.join() + optimizer_worker.kill() + + time_taken = time.time() - start + return self.game_full_episode_scores, self.rolling_results, time_taken + + def print_results(self, episode_number, results_queue): + """Worker that prints out results as they get put into a queue""" + while True: + with episode_number.get_lock(): + carry_on = episode_number.value < self.config.num_episodes_to_run + if carry_on: + if not results_queue.empty(): + self.total_episode_score_so_far = results_queue.get() + self.save_and_print_result() + else: break + + def update_shared_model(self, gradient_updates_queue): + """Worker that updates the shared model with gradients as they get put into the queue""" + while True: + gradients = gradient_updates_queue.get() + with self.optimizer_lock: + self.actor_critic_optimizer.zero_grad() + for grads, params in zip(gradients, self.actor_critic.parameters()): + params._grad = grads # maybe need to do grads.clone() + self.actor_critic_optimizer.step() + +class Actor_Critic_Worker(torch.multiprocessing.Process): + """Actor critic worker that will play the game for the designated number of episodes """ + def __init__(self, worker_num, environment, shared_model, counter, optimizer_lock, shared_optimizer, + config, episodes_to_run, epsilon_decay_denominator, action_size, action_types, results_queue, + local_model, gradient_updates_queue): + super(Actor_Critic_Worker, self).__init__() + self.environment = environment + self.config = config + self.worker_num = worker_num + + self.gradient_clipping_norm = self.config.hyperparameters["gradient_clipping_norm"] + self.discount_rate = self.config.hyperparameters["discount_rate"] + self.normalise_rewards = self.config.hyperparameters["normalise_rewards"] + + self.action_size = action_size + self.set_seeds(self.worker_num) + self.shared_model = shared_model + self.local_model = local_model + self.local_optimizer = Adam(self.local_model.parameters(), lr=0.0, eps=1e-4) + self.counter = counter + self.optimizer_lock = optimizer_lock + self.shared_optimizer = shared_optimizer + self.episodes_to_run = episodes_to_run + self.epsilon_decay_denominator = epsilon_decay_denominator + self.exploration_worker_difference = self.config.hyperparameters["exploration_worker_difference"] + self.action_types = action_types + self.results_queue = results_queue + self.episode_number = 0 + + self.gradient_updates_queue = gradient_updates_queue + + def set_seeds(self, worker_num): + """Sets random seeds for this worker""" + torch.manual_seed(self.config.seed + worker_num) + self.environment.seed(self.config.seed + worker_num) + + def run(self): + """Starts the worker""" + torch.set_num_threads(1) + for 
ep_ix in range(self.episodes_to_run): + with self.optimizer_lock: + Base_Agent.copy_model_over(self.shared_model, self.local_model) + epsilon_exploration = self.calculate_new_exploration() + state = self.reset_game_for_worker() + done = False + self.episode_states = [] + self.episode_actions = [] + self.episode_rewards = [] + self.episode_log_action_probabilities = [] + self.critic_outputs = [] + + while not done: + action, action_log_prob, critic_outputs = self.pick_action_and_get_critic_values(self.local_model, state, epsilon_exploration) + next_state, reward, done, _ = self.environment.step(action) + self.episode_states.append(state) + self.episode_actions.append(action) + self.episode_rewards.append(reward) + self.episode_log_action_probabilities.append(action_log_prob) + self.critic_outputs.append(critic_outputs) + state = next_state + + total_loss = self.calculate_total_loss() + self.put_gradients_in_queue(total_loss) + self.episode_number += 1 + with self.counter.get_lock(): + self.counter.value += 1 + self.results_queue.put(np.sum(self.episode_rewards)) + + def calculate_new_exploration(self): + """Calculates the new exploration parameter epsilon. It picks a random point within 3X above and below the + current epsilon""" + with self.counter.get_lock(): + epsilon = 1.0 / (1.0 + (self.counter.value / self.epsilon_decay_denominator)) + epsilon = max(0.0, random.uniform(epsilon / self.exploration_worker_difference, epsilon * self.exploration_worker_difference)) + return epsilon + + def reset_game_for_worker(self): + """Resets the game environment so it is ready to play a new episode""" + state = self.environment.reset() + if self.action_types == "CONTINUOUS": self.noise.reset() + return state + + def pick_action_and_get_critic_values(self, policy, state, epsilon_exploration=None): + """Picks an action using the policy""" + state = torch.from_numpy(state).float().unsqueeze(0) + model_output = policy.forward(state) + actor_output = model_output[:, list(range(self.action_size))] #we only use first set of columns to decide action, last column is state-value + critic_output = model_output[:, -1] + action_distribution = create_actor_distribution(self.action_types, actor_output, self.action_size) + action = action_distribution.sample().cpu().numpy() + if self.action_types == "CONTINUOUS": action += self.noise.sample() + if self.action_types == "DISCRETE": + if random.random() <= epsilon_exploration: + action = random.randint(0, self.action_size - 1) + else: + action = action[0] + action_log_prob = self.calculate_log_action_probability(action, action_distribution) + return action, action_log_prob, critic_output + + def calculate_log_action_probability(self, actions, action_distribution): + """Calculates the log probability of the chosen action""" + policy_distribution_log_prob = action_distribution.log_prob(torch.Tensor([actions])) + return policy_distribution_log_prob + + def calculate_total_loss(self): + """Calculates the actor loss + critic loss""" + discounted_returns = self.calculate_discounted_returns() + if self.normalise_rewards: + discounted_returns = self.normalise_discounted_returns(discounted_returns) + critic_loss, advantages = self.calculate_critic_loss_and_advantages(discounted_returns) + actor_loss = self.calculate_actor_loss(advantages) + total_loss = actor_loss + critic_loss + return total_loss + + def calculate_discounted_returns(self): + """Calculates the cumulative discounted return for an episode which we will then use in a learning iteration""" + discounted_returns = [0] 
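+        # Rewards are consumed from the end of the episode, so each return is built as
+        # G_t = r_t + discount_rate * G_{t+1}; the leading 0 only seeds the recursion and is
+        # dropped below before the list is reversed back into chronological order.
+        # Illustrative values: rewards [1, 2, 3] with discount_rate 0.9 would yield
+        # [1 + 0.9 * 4.7, 2 + 0.9 * 3, 3] = [5.23, 4.7, 3].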
+ for ix in range(len(self.episode_states)): + return_value = self.episode_rewards[-(ix + 1)] + self.discount_rate*discounted_returns[-1] + discounted_returns.append(return_value) + discounted_returns = discounted_returns[1:] + discounted_returns = discounted_returns[::-1] + return discounted_returns + + def normalise_discounted_returns(self, discounted_returns): + """Normalises the discounted returns by dividing by mean and std of returns that episode""" + mean = np.mean(discounted_returns) + std = np.std(discounted_returns) + discounted_returns -= mean + discounted_returns /= (std + 1e-5) + return discounted_returns + + def calculate_critic_loss_and_advantages(self, all_discounted_returns): + """Calculates the critic's loss and the advantages""" + critic_values = torch.cat(self.critic_outputs) + advantages = torch.Tensor(all_discounted_returns) - critic_values + advantages = advantages.detach() + critic_loss = (torch.Tensor(all_discounted_returns) - critic_values)**2 + critic_loss = critic_loss.mean() + return critic_loss, advantages + + def calculate_actor_loss(self, advantages): + """Calculates the loss for the actor""" + action_log_probabilities_for_all_episodes = torch.cat(self.episode_log_action_probabilities) + actor_loss = -1.0 * action_log_probabilities_for_all_episodes * advantages + actor_loss = actor_loss.mean() + return actor_loss + + def put_gradients_in_queue(self, total_loss): + """Puts gradients in a queue for the optimisation process to use to update the shared model""" + self.local_optimizer.zero_grad() + total_loss.backward() + torch.nn.utils.clip_grad_norm_(self.local_model.parameters(), self.gradient_clipping_norm) + gradients = [param.grad.clone() for param in self.local_model.parameters()] + self.gradient_updates_queue.put(gradients) + diff --git a/agents/actor_critic_agents/DDPG.py b/agents/actor_critic_agents/DDPG.py new file mode 100644 index 0000000000000000000000000000000000000000..8c3e5fde434f2903efe0432e86dd035e9a8db5e8 --- /dev/null +++ b/agents/actor_critic_agents/DDPG.py @@ -0,0 +1,115 @@ +import torch +import torch.nn.functional as functional +from torch import optim +from agents.Base_Agent import Base_Agent +from utilities.data_structures.Replay_Buffer import Replay_Buffer +from exploration_strategies.OU_Noise_Exploration import OU_Noise_Exploration + +class DDPG(Base_Agent): + """A DDPG Agent""" + agent_name = "DDPG" + + def __init__(self, config): + Base_Agent.__init__(self, config) + self.hyperparameters = config.hyperparameters + self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") + self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") + Base_Agent.copy_model_over(self.critic_local, self.critic_target) + + self.critic_optimizer = optim.Adam(self.critic_local.parameters(), + lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) + self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], + self.config.seed) + self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") + self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") + Base_Agent.copy_model_over(self.actor_local, self.actor_target) + + self.actor_optimizer = optim.Adam(self.actor_local.parameters(), + lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) + self.exploration_strategy = 
OU_Noise_Exploration(self.config) + + def step(self): + """Runs a step in the game""" + while not self.done: + # print("State ", self.state.shape) + self.action = self.pick_action() + self.conduct_action(self.action) + if self.time_for_critic_and_actor_to_learn(): + for _ in range(self.hyperparameters["learning_updates_per_learning_session"]): + states, actions, rewards, next_states, dones = self.sample_experiences() + self.critic_learn(states, actions, rewards, next_states, dones) + self.actor_learn(states) + self.save_experience() + self.state = self.next_state #this is to set the state for the next iteration + self.global_step_number += 1 + self.episode_number += 1 + + def sample_experiences(self): + return self.memory.sample() + + def pick_action(self, state=None): + """Picks an action using the actor network and then adds some noise to it to ensure exploration""" + if state is None: state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device) + self.actor_local.eval() + with torch.no_grad(): + action = self.actor_local(state).cpu().data.numpy() + self.actor_local.train() + action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action": action}) + return action.squeeze(0) + + def critic_learn(self, states, actions, rewards, next_states, dones): + """Runs a learning iteration for the critic""" + loss = self.compute_loss(states, next_states, rewards, actions, dones) + self.take_optimisation_step(self.critic_optimizer, self.critic_local, loss, self.hyperparameters["Critic"]["gradient_clipping_norm"]) + self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"]) + + def compute_loss(self, states, next_states, rewards, actions, dones): + """Computes the loss for the critic""" + with torch.no_grad(): + critic_targets = self.compute_critic_targets(next_states, rewards, dones) + critic_expected = self.compute_expected_critic_values(states, actions) + loss = functional.mse_loss(critic_expected, critic_targets) + return loss + + def compute_critic_targets(self, next_states, rewards, dones): + """Computes the critic target values to be used in the loss for the critic""" + critic_targets_next = self.compute_critic_values_for_next_states(next_states) + critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones) + return critic_targets + + def compute_critic_values_for_next_states(self, next_states): + """Computes the critic values for next states to be used in the loss for the critic""" + with torch.no_grad(): + actions_next = self.actor_target(next_states) + critic_targets_next = self.critic_target(torch.cat((next_states, actions_next), 1)) + return critic_targets_next + + def compute_critic_values_for_current_states(self, rewards, critic_targets_next, dones): + """Computes the critic values for current states to be used in the loss for the critic""" + critic_targets_current = rewards + (self.hyperparameters["discount_rate"] * critic_targets_next * (1.0 - dones)) + return critic_targets_current + + def compute_expected_critic_values(self, states, actions): + """Computes the expected critic values to be used in the loss for the critic""" + critic_expected = self.critic_local(torch.cat((states, actions), 1)) + return critic_expected + + def time_for_critic_and_actor_to_learn(self): + """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the + actor and critic""" + return self.enough_experiences_to_learn_from() and 
self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0 + + def actor_learn(self, states): + """Runs a learning iteration for the actor""" + if self.done: #we only update the learning rate at end of each episode + self.update_learning_rate(self.hyperparameters["Actor"]["learning_rate"], self.actor_optimizer) + actor_loss = self.calculate_actor_loss(states) + self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss, + self.hyperparameters["Actor"]["gradient_clipping_norm"]) + self.soft_update_of_target_network(self.actor_local, self.actor_target, self.hyperparameters["Actor"]["tau"]) + + def calculate_actor_loss(self, states): + """Calculates the loss for the actor""" + actions_pred = self.actor_local(states) + actor_loss = -self.critic_local(torch.cat((states, actions_pred), 1)).mean() + return actor_loss \ No newline at end of file diff --git a/agents/actor_critic_agents/DDPG_HER.py b/agents/actor_critic_agents/DDPG_HER.py new file mode 100644 index 0000000000000000000000000000000000000000..cff29b220f7d6ceb3182876e5db9dea841b4b4f3 --- /dev/null +++ b/agents/actor_critic_agents/DDPG_HER.py @@ -0,0 +1,38 @@ +from agents.actor_critic_agents.DDPG import DDPG +from agents.HER_Base import HER_Base + +class DDPG_HER(HER_Base, DDPG): + """DDPG algorithm with hindsight experience replay""" + agent_name = "DDPG-HER" + + def __init__(self, config): + DDPG.__init__(self, config) + HER_Base.__init__(self, self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], + self.hyperparameters["HER_sample_proportion"]) + + def step(self): + """Runs a step within a game including a learning step if required""" + while not self.done: + self.action = self.pick_action() + self.conduct_action_in_changeable_goal_envs(self.action) + if self.time_for_critic_and_actor_to_learn(): + for _ in range(self.hyperparameters["learning_updates_per_learning_session"]): + states, actions, rewards, next_states, dones = self.sample_from_HER_and_Ordinary_Buffer() # Samples experiences from buffer + self.critic_learn(states, actions, rewards, next_states, dones) + self.actor_learn(states) + self.track_changeable_goal_episodes_data() + self.save_experience() + if self.done: self.save_alternative_experience() + self.state_dict = self.next_state_dict # this is to set the state for the next iteration + self.state = self.next_state + self.global_step_number += 1 + self.episode_number += 1 + + def enough_experiences_to_learn_from(self): + """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn""" + return len(self.memory) > self.ordinary_buffer_batch_size and len(self.HER_memory) > self.HER_buffer_batch_size + + + + + diff --git a/agents/actor_critic_agents/SAC.py b/agents/actor_critic_agents/SAC.py new file mode 100644 index 0000000000000000000000000000000000000000..b997fe97d9b83cc6e5153a27a8ba8715c8b5c940 --- /dev/null +++ b/agents/actor_critic_agents/SAC.py @@ -0,0 +1,211 @@ +from agents.Base_Agent import Base_Agent +from utilities.OU_Noise import OU_Noise +from utilities.data_structures.Replay_Buffer import Replay_Buffer +from torch.optim import Adam +import torch +import torch.nn.functional as F +from torch.distributions import Normal +import numpy as np + +LOG_SIG_MAX = 2 +LOG_SIG_MIN = -20 +TRAINING_EPISODES_PER_EVAL_EPISODE = 10 +EPSILON = 1e-6 + +class SAC(Base_Agent): + """Soft Actor-Critic model based on the 2018 paper https://arxiv.org/abs/1812.05905 and on this github implementation + 
https://github.com/pranz24/pytorch-soft-actor-critic. It is an actor-critic algorithm where the agent is also trained + to maximise the entropy of their actions as well as their cumulative reward""" + agent_name = "SAC" + def __init__(self, config): + Base_Agent.__init__(self, config) + assert self.action_types == "CONTINUOUS", "Action types must be continuous. Use SAC Discrete instead for discrete actions" + assert self.config.hyperparameters["Actor"]["final_layer_activation"] != "Softmax", "Final actor layer must not be softmax" + self.hyperparameters = config.hyperparameters + self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic") + self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, + key_to_use="Critic", override_seed=self.config.seed + 1) + self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(), + lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) + self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(), + lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) + self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, + key_to_use="Critic") + self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, + key_to_use="Critic") + Base_Agent.copy_model_over(self.critic_local, self.critic_target) + Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2) + self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], + self.config.seed) + self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size * 2, key_to_use="Actor") + self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), + lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) + self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"] + if self.automatic_entropy_tuning: + self.target_entropy = -torch.prod(torch.Tensor(self.environment.action_space.shape).to(self.device)).item() # heuristic value from the paper + self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) + self.alpha = self.log_alpha.exp() + self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) + else: + self.alpha = self.hyperparameters["entropy_term_weight"] + + self.add_extra_noise = self.hyperparameters["add_extra_noise"] + if self.add_extra_noise: + self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"], + self.hyperparameters["theta"], self.hyperparameters["sigma"]) + + self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"] + + def save_result(self): + """Saves the result of an episode of the game. 
Overriding the method in Base Agent that does this because we only + want to keep track of the results during the evaluation episodes""" + if self.episode_number == 1 or not self.do_evaluation_iterations: + self.game_full_episode_scores.extend([self.total_episode_score_so_far]) + self.rolling_results.append(np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:])) + self.save_max_result_seen() + + elif (self.episode_number - 1) % TRAINING_EPISODES_PER_EVAL_EPISODE == 0: + self.game_full_episode_scores.extend([self.total_episode_score_so_far for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)]) + self.rolling_results.extend([np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:]) for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)]) + self.save_max_result_seen() + + def reset_game(self): + """Resets the game information so we are ready to play a new episode""" + Base_Agent.reset_game(self) + if self.add_extra_noise: self.noise.reset() + + def step(self): + """Runs an episode on the game, saving the experience and running a learning step if appropriate""" + eval_ep = self.episode_number % TRAINING_EPISODES_PER_EVAL_EPISODE == 0 and self.do_evaluation_iterations + self.episode_step_number_val = 0 + while not self.done: + self.episode_step_number_val += 1 + self.action = self.pick_action(eval_ep) + self.conduct_action(self.action) + if self.time_for_critic_and_actor_to_learn(): + for _ in range(self.hyperparameters["learning_updates_per_learning_session"]): + self.learn() + mask = False if self.episode_step_number_val >= self.environment._max_episode_steps else self.done + if not eval_ep: self.save_experience(experience=(self.state, self.action, self.reward, self.next_state, mask)) + self.state = self.next_state + self.global_step_number += 1 + print(self.total_episode_score_so_far) + if eval_ep: self.print_summary_of_latest_evaluation_episode() + self.episode_number += 1 + + def pick_action(self, eval_ep, state=None): + """Picks an action using one of three methods: 1) Randomly if we haven't passed a certain number of steps, + 2) Using the actor in evaluation mode if eval_ep is True 3) Using the actor in training mode if eval_ep is False. 
+ The difference between evaluation and training mode is that training mode does more exploration""" + if state is None: state = self.state + if eval_ep: action = self.actor_pick_action(state=state, eval=True) + elif self.global_step_number < self.hyperparameters["min_steps_before_learning"]: + action = self.environment.action_space.sample() + print("Picking random action ", action) + else: action = self.actor_pick_action(state=state) + if self.add_extra_noise: + action += self.noise.sample() + return action + + def actor_pick_action(self, state=None, eval=False): + """Uses actor to pick an action in one of two ways: 1) If eval = False and we aren't in eval mode then it picks + an action that has partly been randomly sampled 2) If eval = True then we pick the action that comes directly + from the network and so did not involve any random sampling""" + if state is None: state = self.state + state = torch.FloatTensor([state]).to(self.device) + if len(state.shape) == 1: state = state.unsqueeze(0) + if eval == False: action, _, _ = self.produce_action_and_action_info(state) + else: + with torch.no_grad(): + _, z, action = self.produce_action_and_action_info(state) + action = action.detach().cpu().numpy() + return action[0] + + def produce_action_and_action_info(self, state): + """Given the state, produces an action, the log probability of the action, and the tanh of the mean action""" + actor_output = self.actor_local(state) + mean, log_std = actor_output[:, :self.action_size], actor_output[:, self.action_size:] + std = log_std.exp() + normal = Normal(mean, std) + x_t = normal.rsample() #rsample means it is sampled using reparameterisation trick + action = torch.tanh(x_t) + log_prob = normal.log_prob(x_t) + log_prob -= torch.log(1 - action.pow(2) + EPSILON) + log_prob = log_prob.sum(1, keepdim=True) + return action, log_prob, torch.tanh(mean) + + def time_for_critic_and_actor_to_learn(self): + """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the + actor and critic""" + return self.global_step_number > self.hyperparameters["min_steps_before_learning"] and \ + self.enough_experiences_to_learn_from() and self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0 + + def learn(self): + """Runs a learning iteration for the actor, both critics and (if specified) the temperature parameter""" + state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.sample_experiences() + qf1_loss, qf2_loss = self.calculate_critic_losses(state_batch, action_batch, reward_batch, next_state_batch, mask_batch) + self.update_critic_parameters(qf1_loss, qf2_loss) + + policy_loss, log_pi = self.calculate_actor_loss(state_batch) + if self.automatic_entropy_tuning: alpha_loss = self.calculate_entropy_tuning_loss(log_pi) + else: alpha_loss = None + self.update_actor_parameters(policy_loss, alpha_loss) + + def sample_experiences(self): + return self.memory.sample() + + def calculate_critic_losses(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch): + """Calculates the losses for the two critics. 
This is the ordinary Q-learning loss except the additional entropy + term is taken into account""" + with torch.no_grad(): + next_state_action, next_state_log_pi, _ = self.produce_action_and_action_info(next_state_batch) + qf1_next_target = self.critic_target(torch.cat((next_state_batch, next_state_action), 1)) + qf2_next_target = self.critic_target_2(torch.cat((next_state_batch, next_state_action), 1)) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi + next_q_value = reward_batch + (1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (min_qf_next_target) + qf1 = self.critic_local(torch.cat((state_batch, action_batch), 1)) + qf2 = self.critic_local_2(torch.cat((state_batch, action_batch), 1)) + qf1_loss = F.mse_loss(qf1, next_q_value) + qf2_loss = F.mse_loss(qf2, next_q_value) + return qf1_loss, qf2_loss + + def calculate_actor_loss(self, state_batch): + """Calculates the loss for the actor. This loss includes the additional entropy term""" + action, log_pi, _ = self.produce_action_and_action_info(state_batch) + qf1_pi = self.critic_local(torch.cat((state_batch, action), 1)) + qf2_pi = self.critic_local_2(torch.cat((state_batch, action), 1)) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean() + return policy_loss, log_pi + + def calculate_entropy_tuning_loss(self, log_pi): + """Calculates the loss for the entropy temperature parameter. This is only relevant if self.automatic_entropy_tuning + is True.""" + alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() + return alpha_loss + + def update_critic_parameters(self, critic_loss_1, critic_loss_2): + """Updates the parameters for both critics""" + self.take_optimisation_step(self.critic_optimizer, self.critic_local, critic_loss_1, + self.hyperparameters["Critic"]["gradient_clipping_norm"]) + self.take_optimisation_step(self.critic_optimizer_2, self.critic_local_2, critic_loss_2, + self.hyperparameters["Critic"]["gradient_clipping_norm"]) + self.soft_update_of_target_network(self.critic_local, self.critic_target, + self.hyperparameters["Critic"]["tau"]) + self.soft_update_of_target_network(self.critic_local_2, self.critic_target_2, + self.hyperparameters["Critic"]["tau"]) + + def update_actor_parameters(self, actor_loss, alpha_loss): + """Updates the parameters for the actor and (if specified) the temperature parameter""" + self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss, + self.hyperparameters["Actor"]["gradient_clipping_norm"]) + if alpha_loss is not None: + self.take_optimisation_step(self.alpha_optim, None, alpha_loss, None) + self.alpha = self.log_alpha.exp() + + def print_summary_of_latest_evaluation_episode(self): + """Prints a summary of the latest episode""" + print(" ") + print("----------------------------") + print("Episode score {} ".format(self.total_episode_score_so_far)) + print("----------------------------") \ No newline at end of file diff --git a/agents/actor_critic_agents/SAC_Discrete.py b/agents/actor_critic_agents/SAC_Discrete.py new file mode 100644 index 0000000000000000000000000000000000000000..c4ebe0582133b21b2521b73ccb5ae2dbdeff6d78 --- /dev/null +++ b/agents/actor_critic_agents/SAC_Discrete.py @@ -0,0 +1,94 @@ +import torch +from torch.optim import Adam +import torch.nn.functional as F +import numpy as np +from agents.Base_Agent import Base_Agent +from utilities.data_structures.Replay_Buffer import Replay_Buffer +from agents.actor_critic_agents.SAC 
import SAC +from utilities.Utility_Functions import create_actor_distribution + +class SAC_Discrete(SAC): + """The Soft Actor Critic for discrete actions. It inherits from SAC for continuous actions and only changes a few + methods.""" + agent_name = "SAC" + def __init__(self, config): + Base_Agent.__init__(self, config) + assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions" + assert self.config.hyperparameters["Actor"]["final_layer_activation"] == "Softmax", "Final actor layer must be softmax" + self.hyperparameters = config.hyperparameters + self.critic_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic") + self.critic_local_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, + key_to_use="Critic", override_seed=self.config.seed + 1) + self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(), + lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) + self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(), + lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) + self.critic_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, + key_to_use="Critic") + self.critic_target_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, + key_to_use="Critic") + Base_Agent.copy_model_over(self.critic_local, self.critic_target) + Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2) + self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"], + self.config.seed, device=self.device) + + self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor") + self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), + lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) + self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"] + if self.automatic_entropy_tuning: + # we set the max possible entropy as the target entropy + self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98 + self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) + self.alpha = self.log_alpha.exp() + self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4) + else: + self.alpha = self.hyperparameters["entropy_term_weight"] + assert not self.hyperparameters["add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at moment" + self.add_extra_noise = False + self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"] + + def produce_action_and_action_info(self, state): + """Given the state, produces an action, the probability of the action, the log probability of the action, and + the argmax action""" + action_probabilities = self.actor_local(state) + max_probability_action = torch.argmax(action_probabilities, dim=-1) + action_distribution = create_actor_distribution(self.action_types, action_probabilities, self.action_size) + action = action_distribution.sample().cpu() + # Have to deal with situation of 0.0 probabilities because we can't do log 0 + z = action_probabilities == 0.0 + z = z.float() * 1e-8 + log_action_probabilities = torch.log(action_probabilities + z) + return action, (action_probabilities, log_action_probabilities), max_probability_action + + def calculate_critic_losses(self, state_batch, action_batch, 
reward_batch, next_state_batch, mask_batch): + """Calculates the losses for the two critics. This is the ordinary Q-learning loss except the additional entropy + term is taken into account""" + with torch.no_grad(): + next_state_action, (action_probabilities, log_action_probabilities), _ = self.produce_action_and_action_info(next_state_batch) + qf1_next_target = self.critic_target(next_state_batch) + qf2_next_target = self.critic_target_2(next_state_batch) + min_qf_next_target = action_probabilities * (torch.min(qf1_next_target, qf2_next_target) - self.alpha * log_action_probabilities) + min_qf_next_target = min_qf_next_target.sum(dim=1).unsqueeze(-1) + next_q_value = reward_batch + (1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (min_qf_next_target) + + qf1 = self.critic_local(state_batch).gather(1, action_batch.long()) + qf2 = self.critic_local_2(state_batch).gather(1, action_batch.long()) + qf1_loss = F.mse_loss(qf1, next_q_value) + qf2_loss = F.mse_loss(qf2, next_q_value) + return qf1_loss, qf2_loss + + def calculate_actor_loss(self, state_batch): + """Calculates the loss for the actor. This loss includes the additional entropy term""" + action, (action_probabilities, log_action_probabilities), _ = self.produce_action_and_action_info(state_batch) + qf1_pi = self.critic_local(state_batch) + qf2_pi = self.critic_local_2(state_batch) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + inside_term = self.alpha * log_action_probabilities - min_qf_pi + policy_loss = (action_probabilities * inside_term).sum(dim=1).mean() + log_action_probabilities = torch.sum(log_action_probabilities * action_probabilities, dim=1) + return policy_loss, log_action_probabilities + + def locally_save_policy(self): + """Saves the policy""" + torch.save(self.actor_local.state_dict(), "{}/{}_network.pt".format(self.config.models_dir, self.agent_name)) diff --git a/agents/actor_critic_agents/TD3.py b/agents/actor_critic_agents/TD3.py new file mode 100644 index 0000000000000000000000000000000000000000..512482ea3451311f0b7e5b26c4a0c96f4d34af7b --- /dev/null +++ b/agents/actor_critic_agents/TD3.py @@ -0,0 +1,54 @@ +import torch +import torch.nn.functional as functional +from torch import optim +from agents.Base_Agent import Base_Agent +from .DDPG import DDPG +from exploration_strategies.Gaussian_Exploration import Gaussian_Exploration + +class TD3(DDPG): + """A TD3 Agent from the paper Addressing Function Approximation Error in Actor-Critic Methods (Fujimoto et al. 
2018) + https://arxiv.org/abs/1802.09477""" + agent_name = "TD3" + + def __init__(self, config): + DDPG.__init__(self, config) + self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, + key_to_use="Critic", override_seed=self.config.seed + 1) + self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, + key_to_use="Critic") + Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2) + self.critic_optimizer_2 = optim.Adam(self.critic_local_2.parameters(), + lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4) + self.exploration_strategy_critic = Gaussian_Exploration(self.config) + + def compute_critic_values_for_next_states(self, next_states): + """Computes the critic values for next states to be used in the loss for the critic""" + with torch.no_grad(): + actions_next = self.actor_target(next_states) + actions_next_with_noise = self.exploration_strategy_critic.perturb_action_for_exploration_purposes({"action": actions_next}) + critic_targets_next_1 = self.critic_target(torch.cat((next_states, actions_next_with_noise), 1)) + critic_targets_next_2 = self.critic_target_2(torch.cat((next_states, actions_next_with_noise), 1)) + critic_targets_next = torch.min(torch.cat((critic_targets_next_1, critic_targets_next_2),1), dim=1)[0].unsqueeze(-1) + return critic_targets_next + + def critic_learn(self, states, actions, rewards, next_states, dones): + """Runs a learning iteration for both the critics""" + critic_targets_next = self.compute_critic_values_for_next_states(next_states) + critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones) + + critic_expected_1 = self.critic_local(torch.cat((states, actions), 1)) + critic_expected_2 = self.critic_local_2(torch.cat((states, actions), 1)) + + critic_loss_1 = functional.mse_loss(critic_expected_1, critic_targets) + critic_loss_2 = functional.mse_loss(critic_expected_2, critic_targets) + + self.take_optimisation_step(self.critic_optimizer, self.critic_local, critic_loss_1, self.hyperparameters["Critic"]["gradient_clipping_norm"]) + self.take_optimisation_step(self.critic_optimizer_2, self.critic_local_2, critic_loss_2, + self.hyperparameters["Critic"]["gradient_clipping_norm"]) + + self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"]) + self.soft_update_of_target_network(self.critic_local_2, self.critic_target_2, self.hyperparameters["Critic"]["tau"]) + + + + diff --git a/agents/actor_critic_agents/__pycache__/A2C.cpython-39.pyc b/agents/actor_critic_agents/__pycache__/A2C.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42c70fb0e8012f59da32c66d12fae4e2ffdacc89 Binary files /dev/null and b/agents/actor_critic_agents/__pycache__/A2C.cpython-39.pyc differ diff --git a/agents/actor_critic_agents/__pycache__/A3C.cpython-39.pyc b/agents/actor_critic_agents/__pycache__/A3C.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c76a1fe0078a711343dcf99464168f32d6c2a66 Binary files /dev/null and b/agents/actor_critic_agents/__pycache__/A3C.cpython-39.pyc differ diff --git a/agents/actor_critic_agents/__pycache__/DDPG.cpython-39.pyc b/agents/actor_critic_agents/__pycache__/DDPG.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e81d516174254b4e6932316cbe4e0f71c83e6f9d Binary files /dev/null and b/agents/actor_critic_agents/__pycache__/DDPG.cpython-39.pyc differ diff 
--git a/agents/actor_critic_agents/__pycache__/SAC.cpython-310.pyc b/agents/actor_critic_agents/__pycache__/SAC.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8340c1af03d92941e6ba013f843ae544be1ec0a6 Binary files /dev/null and b/agents/actor_critic_agents/__pycache__/SAC.cpython-310.pyc differ diff --git a/agents/actor_critic_agents/__pycache__/SAC.cpython-39.pyc b/agents/actor_critic_agents/__pycache__/SAC.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b64dc1d5730e63df4220d1f998be5e43b83905dc Binary files /dev/null and b/agents/actor_critic_agents/__pycache__/SAC.cpython-39.pyc differ diff --git a/agents/actor_critic_agents/__pycache__/SAC_Discrete.cpython-310.pyc b/agents/actor_critic_agents/__pycache__/SAC_Discrete.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78dc28c5b412aa5e9e66b003218d7d73693e06f8 Binary files /dev/null and b/agents/actor_critic_agents/__pycache__/SAC_Discrete.cpython-310.pyc differ diff --git a/agents/actor_critic_agents/__pycache__/SAC_Discrete.cpython-39.pyc b/agents/actor_critic_agents/__pycache__/SAC_Discrete.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41b7446733875ddd2aed27dd7bbb134d284a070a Binary files /dev/null and b/agents/actor_critic_agents/__pycache__/SAC_Discrete.cpython-39.pyc differ diff --git a/agents/hierarchical_agents/DIAYN.py b/agents/hierarchical_agents/DIAYN.py new file mode 100644 index 0000000000000000000000000000000000000000..a44be6425433acedff285a2872b332b168ececca --- /dev/null +++ b/agents/hierarchical_agents/DIAYN.py @@ -0,0 +1,128 @@ +import torch +from gym import Wrapper, spaces +from torch import optim, nn +import numpy as np +import random +import time +import copy +import torch.nn.functional as F +from agents.Base_Agent import Base_Agent +from agents.DQN_agents.DDQN import DDQN +from agents.actor_critic_agents.SAC import SAC + +# NOTE: DIAYN calculates diversity of states penalty each timestep but it might be better to only base it on where the +# agent got to in the last timestep, or after X timesteps +# NOTE another problem with this is that the discriminator is trained from online data as it comes in which isn't iid +# so we could probably make it perform better by maintaining a replay buffer and using that to train the discriminator instead + +class DIAYN(Base_Agent): + """Hierarchical RL agent based on the paper Diversity is all you need (2018) - https://arxiv.org/pdf/1802.06070.pdf. 
+ Works in two stages: + 1) First it trains an agent that tries to reach different states depending on which skill number is + inputted + 2) Then it trains an agent to maximise reward using its choice of skill for the lower level agent""" + agent_name = "DIAYN" + def __init__(self, config): + super().__init__(config) + self.training_mode = True + self.num_skills = config.hyperparameters["num_skills"] + self.unsupervised_episodes = config.hyperparameters["num_unsupservised_episodes"] + self.supervised_episodes = config.num_episodes_to_run - self.unsupervised_episodes + + assert self.hyperparameters["DISCRIMINATOR"]["final_layer_activation"] == None, "Final layer activation for disciminator should be None" + self.discriminator = self.create_NN(self.state_size, self.num_skills, key_to_use="DISCRIMINATOR") + self.discriminator_optimizer = optim.Adam(self.discriminator.parameters(), + lr=self.hyperparameters["DISCRIMINATOR"]["learning_rate"]) + self.agent_config = copy.deepcopy(config) + self.agent_config.environment = DIAYN_Skill_Wrapper(copy.deepcopy(self.environment), self.num_skills, self) + self.agent_config.hyperparameters = self.agent_config.hyperparameters["AGENT"] + self.agent_config.hyperparameters["do_evaluation_iterations"] = False + self.agent = SAC(self.agent_config) #We have to use SAC because it involves maximising the policy's entropy over actions which is also a part of DIAYN + + self.timesteps_to_give_up_control_for = self.hyperparameters["MANAGER"]["timesteps_to_give_up_control_for"] + self.manager_agent_config = copy.deepcopy(config) + self.manager_agent_config.environment = DIAYN_Manager_Agent_Wrapper(copy.deepcopy(self.environment), self.agent, + self.timesteps_to_give_up_control_for, self.num_skills) + self.manager_agent_config.hyperparameters = self.manager_agent_config.hyperparameters["MANAGER"] + self.manager_agent = DDQN(self.manager_agent_config) + + def run_n_episodes(self, num_episodes=None, show_whether_achieved_goal=True, save_and_print_results=True): + start = time.time() + self.agent.run_n_episodes(num_episodes=self.unsupervised_episodes, show_whether_achieved_goal=False) + game_full_episode_scores, rolling_results, _ = self.manager_agent.run_n_episodes(num_episodes=self.supervised_episodes) + time_taken = time.time() - start + pretraining_results = [np.min(self.agent.game_full_episode_scores)]*self.unsupervised_episodes + return pretraining_results + game_full_episode_scores, pretraining_results + rolling_results, time_taken + + def disciminator_learn(self, skill, discriminator_outputs): + if not self.training_mode: return + assert isinstance(skill, int) + assert discriminator_outputs.shape[0] == 1 + assert discriminator_outputs.shape[1] == self.num_skills + loss = nn.CrossEntropyLoss()(discriminator_outputs, torch.Tensor([skill]).long()) + self.take_optimisation_step(self.discriminator_optimizer, self.discriminator, loss, + self.hyperparameters["DISCRIMINATOR"]["gradient_clipping_norm"]) + + def get_predicted_probability_of_skill(self, skill, next_state): + """Gets the probability that the disciminator gives to the correct skill""" + predicted_probabilities_unnormalised = self.discriminator(torch.Tensor(next_state).unsqueeze(0)) + probability_of_correct_skill = F.softmax(predicted_probabilities_unnormalised)[:, skill] + return probability_of_correct_skill.item(), predicted_probabilities_unnormalised + +class DIAYN_Skill_Wrapper(Wrapper): + """Open AI gym wrapper to help create a pretraining environment in which to train diverse skills according to the + 
specification in the Diversity is all you need (2018) paper """ + def __init__(self, env, num_skills, meta_agent): + Wrapper.__init__(self, env) + self.num_skills = num_skills + self.meta_agent = meta_agent + self.prior_probability_of_skill = 1.0 / self.num_skills #Each skill equally likely to be chosen + self._max_episode_steps = self.env._max_episode_steps + + def reset(self, **kwargs): + observation = self.env.reset(**kwargs) + self.skill = random.randint(0, self.num_skills - 1) + return self.observation(observation) + + def observation(self, observation): + return np.concatenate((np.array(observation), np.array([self.skill]))) + + def step(self, action): + next_state, _, done, _ = self.env.step(action) + new_reward, discriminator_outputs = self.calculate_new_reward(next_state) + self.meta_agent.disciminator_learn(self.skill, discriminator_outputs) + return self.observation(next_state), new_reward, done, _ + + def calculate_new_reward(self, next_state): + """Calculates an intrinsic reward that encourages maximum exploration. It also keeps track of the discriminator + outputs so they can be used for training""" + probability_correct_skill, disciminator_outputs = self.meta_agent.get_predicted_probability_of_skill(self.skill, next_state) + new_reward = np.log(probability_correct_skill + 1e-8) - np.log(self.prior_probability_of_skill) + return new_reward, disciminator_outputs + + +class DIAYN_Manager_Agent_Wrapper(Wrapper): + """Environment wrapper for the meta agent. The meta agent uses this environment to take in the state, decide on a skill + and then grant over control to the lower-level skill for a set number of timesteps""" + def __init__(self, env, lower_level_agent, timesteps_to_give_up_control_for, num_skills): + Wrapper.__init__(self, env) + self.action_space = spaces.Discrete(num_skills) + self.lower_level_agent = lower_level_agent + self.timesteps_to_give_up_control_for = timesteps_to_give_up_control_for + + def reset(self, **kwargs): + self.state = self.env.reset(**kwargs) + return self.state + + def step(self, skill_chosen): + """Runs a step in the game from the perspective of the manager agent. 
This involves giving up control to the + lower-level agent for a set number of steps""" + cumulative_reward = 0 + for _ in range(self.timesteps_to_give_up_control_for): + combined_state = np.concatenate((np.array(self.state), np.array([skill_chosen]))) + action = self.lower_level_agent.pick_action(eval_ep=True, state=combined_state) + next_state, reward, done, _ = self.env.step(action) + cumulative_reward += reward + self.state = next_state + if done: break + return next_state, cumulative_reward, done, _ diff --git a/agents/hierarchical_agents/HIRO.py b/agents/hierarchical_agents/HIRO.py new file mode 100644 index 0000000000000000000000000000000000000000..558a0f91adb8c0107c37ccd77e9465840c57363b --- /dev/null +++ b/agents/hierarchical_agents/HIRO.py @@ -0,0 +1,267 @@ +import copy +import torch +import numpy as np +from gym import Wrapper +from agents.Base_Agent import Base_Agent +from agents.actor_critic_agents.DDPG import DDPG +from agents.Trainer import Trainer + + +class HIRO(Base_Agent): + agent_name = "HIRO" + + def __init__(self, config): + super().__init__(config) + self.max_sub_policy_timesteps = config.hyperparameters["LOWER_LEVEL"]["max_lower_level_timesteps"] + self.config.hyperparameters = self.config.hyperparameters + + self.higher_level_state = None #true state of environment + self.higher_level_next_state = None + + self.higher_level_reward = None + self.lower_level_reward = None + + self.higher_level_done = False + self.lower_level_done = False + + self.goal = None + + self.lower_level_state = None #state of environment with goal appended + self.lower_level_next_state = None + + self.lower_level_agent_config = copy.deepcopy(config) + self.lower_level_agent_config.hyperparameters = self.lower_level_agent_config.hyperparameters["LOWER_LEVEL"] + + self.lower_level_agent_config.environment = Lower_Level_Agent_Environment_Wrapper(self.environment, self, self.max_sub_policy_timesteps) + self.lower_level_agent = DDPG(self.lower_level_agent_config) + + self.lower_level_agent.average_score_required_to_win = float("inf") + + self.higher_level_agent_config = copy.deepcopy(config) + self.higher_level_agent_config.hyperparameters = self.higher_level_agent_config.hyperparameters["HIGHER_LEVEL"] + self.higher_level_agent_config.environment = Higher_Level_Agent_Environment_Wrapper(self.environment, self) + self.higher_level_agent = HIRO_Higher_Level_DDPG_Agent(self.higher_level_agent_config, self.lower_level_agent.actor_local) + + self.step_lower_level_states = [] + self.step_lower_level_action_seen = [] + + + def run_n_episodes(self): + """Runs game to completion n times and then summarises results and saves model (if asked to)""" + self.higher_level_agent.run_n_episodes(self.config.num_episodes_to_run) + + @staticmethod + def goal_transition(state, goal, next_state): + """Provides updated goal according to the goal transition function in the HIRO paper""" + return state + goal - next_state + + def save_higher_level_experience(self): + self.higher_level_agent.step_lower_level_states = self.step_lower_level_states + self.higher_level_agent.step_lower_level_action_seen = self.step_lower_level_action_seen + +class HIRO_Higher_Level_DDPG_Agent(DDPG): + """Extends DDPG so that it can function as the higher level agent in the HIRO hierarchical RL algorithm. 
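The fixed goal-transition rule defined in `HIRO.goal_transition` above can be sanity-checked in a few lines (a minimal sketch with made-up vectors; it simply re-expresses the goal relative to the new state so that `state + goal` stays constant):

```python
import numpy as np

# Hypothetical 2-D state, goal and next state.
state      = np.array([0.0, 1.0])
goal       = np.array([2.0, -1.0])
next_state = np.array([0.5, 0.8])

# Same rule as HIRO.goal_transition(state, goal, next_state).
new_goal = state + goal - next_state
assert np.allclose(state + goal, next_state + new_goal)  # the targeted absolute state is unchanged
print(new_goal)  # [ 1.5 -0.8]
```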
This only involves + changing how the agent saves experiences and samples them for learning""" + + def __init__(self, config, lower_level_policy): + super(HIRO_Higher_Level_DDPG_Agent, self).__init__(config) + self.lower_level_policy = lower_level_policy + self.number_goal_candidates = config.hyperparameters["number_goal_candidates"] + + def save_experience(self, memory=None, experience=None): + """Saves the recent experience to the memory buffer. Adapted from normal DDPG so that it saves the sequence of + states, goals and actions that we saw whilst control was given to the lower level""" + if memory is None: memory = self.memory + if experience is None: experience = self.step_lower_level_states, self.step_lower_level_action_seen, self.reward, self.next_state, self.done + memory.add_experience(*experience) + + def sample_experiences(self): + experiences = self.memory.produce_action_and_action_info(separate_out_data_types=False) + assert len(experiences[0].state) == self.hyperparameters["max_lower_level_timesteps"] or experiences[0].done + assert experiences[0].state[0].shape[0] == self.state_size * 2 + assert len(experiences[0].action) == self.hyperparameters["max_lower_level_timesteps"] or experiences[0].done + + states = [] + actions = [] + rewards = [] + next_states = [] + dones = [] + + for ix, experience in enumerate(experiences): + state, action, reward, next_state, done = self.transform_goal_to_one_most_likely_to_have_induced_actions(experience) + states.append(state) + actions.append(action) + rewards.append(reward) + next_states.append(next_state) + dones.append(done) + + states = torch.from_numpy(np.vstack([state for state in states])).float().to(self.device) + actions = torch.from_numpy(np.vstack([action for action in actions])).float().to(self.device) + rewards = torch.from_numpy(np.vstack([reward for reward in rewards])).float().to(self.device) + next_states = torch.from_numpy(np.vstack([next_state for next_state in next_states])).float().to(self.device) + dones = torch.from_numpy(np.vstack([int(done) for done in dones])).float().to(self.device) + + return states, actions, rewards, next_states, dones + + def transform_goal_to_one_most_likely_to_have_induced_actions(self, experience): + """Transforms the goal in an experience to the goal that would have been most likely to induce the actions chosen + by the lower level agent in the experience""" + goal_candidate_state_change = [experience.state[-1][:self.state_size] - experience.state[0][:self.state_size]] + goal_candidate_actual_goal = [experience.state[0][self.state_size:]] + goal_candidate_state_change_random_iterations = [np.random.normal(goal_candidate_state_change[0]) for _ in range(self.number_goal_candidates - 2)] + goal_candidates = goal_candidate_state_change + goal_candidate_actual_goal + goal_candidate_state_change_random_iterations + + max = float("-inf") + timesteps_in_experience = len(experience.state) + + for goal_ix, goal in enumerate(goal_candidates): + log_probability_total = 0 + for state_ix in range(timesteps_in_experience): + state_obs = experience.state[state_ix][:self.state_size] + action = experience.action[state_ix] + log_probability= self.log_probability_lower_level_picks_action(state_obs, goal, action) + log_probability_total += log_probability + if state_ix != timesteps_in_experience - 1: + next_state = experience.state[state_ix+1][:self.state_size] + goal = HIRO.goal_transition(state_obs, goal, next_state) + if log_probability_total >= max: + max = log_probability_total + best_goal_ix = goal_ix + 
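+ # The goal candidate with the highest total log-likelihood under the current lower-level
+ # policy is kept; the experience is relabelled with that goal below (it is stored in the
+ # action slot) before being used for the higher-level DDPG update.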
+ state = experience.state[0][:self.state_size] + next_state = experience.next_state + reward = experience.reward + action = goal_candidates[best_goal_ix] + done = experience.done + + assert next_state.shape[0] == self.state_size + + return state, action, reward, next_state, done + + + def log_probability_lower_level_picks_action(self, state, goal, action): + """Calculates the log probability that the lower level agent would have chosen this action given the state + and goal as inputs""" + state_and_goal = torch.from_numpy(np.concatenate((state, goal))).float().unsqueeze(0).to(self.device) + action_would_have_taken = self.lower_level_policy(state_and_goal).detach() + return -0.5 * torch.norm(action - action_would_have_taken, 2)**2 + + +class Higher_Level_Agent_Environment_Wrapper(Wrapper): + """Adapts the game environment so that it is compatible with the higher level agent which sets goals for the lower + level agent""" + def __init__(self, env, HIRO_agent): + Wrapper.__init__(self, env) + self.env = env + self.HIRO_agent = HIRO_agent + self.action_space = self.observation_space + + + def reset(self, **kwargs): + self.HIRO_agent.higher_level_state = self.env.reset(**kwargs) + return self.HIRO_agent.higher_level_state + + def step(self, goal): + self.HIRO_agent.higher_level_reward = 0 + self.HIRO_agent.step_lower_level_states = [] + self.HIRO_agent.step_lower_level_action_seen = [] + + self.HIRO_agent.goal = goal + self.HIRO_agent.lower_level_agent.episode_number = 0 #must reset lower level agent to 0 episodes completed otherwise won't run more episodes + self.HIRO_agent.lower_level_agent.run_n_episodes(num_episodes=1, show_whether_achieved_goal=False, save_and_print_results=False) + + self.HIRO_agent.save_higher_level_experience() + + return self.HIRO_agent.higher_level_next_state, self.HIRO_agent.higher_level_reward, self.HIRO_agent.higher_level_done, {} + +class Lower_Level_Agent_Environment_Wrapper(Wrapper): + """Open AI gym wrapper to help create an environment where a goal from a higher-level agent is treated as part + of the environment state""" + def __init__(self, env, HIRO_agent, max_sub_policy_timesteps): + Wrapper.__init__(self, env) + self.env = env + self.meta_agent = HIRO_agent + self.max_sub_policy_timesteps = max_sub_policy_timesteps + + self.track_intrinsic_rewards = [] + + def reset(self, **kwargs): + if self.meta_agent.higher_level_state is not None: state = self.meta_agent.higher_level_state + else: + print("INITIATION ONLY") + state = self.env.reset() + + if self.meta_agent.goal is not None: goal = self.meta_agent.goal + else: + print("INITIATION ONLY") + goal = state + + self.lower_level_timesteps = 0 + self.meta_agent.lower_level_done = False + + self.meta_agent.lower_level_state = self.turn_internal_state_to_external_state(state, goal) + + return self.meta_agent.lower_level_state + + def turn_internal_state_to_external_state(self, internal_state, goal): + return np.concatenate((np.array(internal_state), goal)) + + def step(self, action): + import random + if random.random() < 0.008: + print("Rolling intrinsic rewards {}".format(np.mean(self.track_intrinsic_rewards[-100:]))) + + + self.meta_agent.step_lower_level_states.append(self.meta_agent.lower_level_state) + self.meta_agent.step_lower_level_action_seen.append(action) + + self.lower_level_timesteps += 1 + next_state, extrinsic_reward, done, _ = self.env.step(action) + + + + self.update_rewards(extrinsic_reward, next_state) + self.update_goal(next_state) + self.update_state_and_next_state(next_state) + 
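+ # update_done (below) ends the lower-level episode either when the real episode terminates
+ # or once the sub-policy has used up its max_sub_policy_timesteps budget.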
self.update_done(done) + + return self.meta_agent.lower_level_next_state, self.meta_agent.lower_level_reward, self.meta_agent.lower_level_done, _ + + def update_rewards(self, extrinsic_reward, next_state): + self.meta_agent.higher_level_reward += extrinsic_reward + self.meta_agent.lower_level_reward = self.calculate_intrinsic_reward(self.meta_agent.higher_level_state, + next_state, + self.meta_agent.goal) + def update_goal(self, next_state): + + self.meta_agent.goal = HIRO.goal_transition(self.meta_agent.higher_level_state, self.meta_agent.goal, + next_state) + + def update_state_and_next_state(self, next_state): + self.meta_agent.higher_level_next_state = next_state + self.meta_agent.lower_level_next_state = self.turn_internal_state_to_external_state(next_state, + self.meta_agent.goal) + self.meta_agent.higher_level_state = self.meta_agent.higher_level_next_state + self.meta_agent.lower_level_state = self.meta_agent.lower_level_next_state + + def update_done(self, done): + self.meta_agent.higher_level_done = done + self.meta_agent.lower_level_done = done or self.lower_level_timesteps >= self.max_sub_policy_timesteps + + + def calculate_intrinsic_reward(self, internal_state, internal_next_state, goal): + """Calculates the intrinsic reward for the agent according to whether it has made progress towards the goal + or not since the last timestep""" + desired_next_state = internal_state + goal + error = desired_next_state - internal_next_state + intrinsic_reward = -(np.dot(error, error))**0.5 + + self.track_intrinsic_rewards.append(intrinsic_reward) + + return intrinsic_reward + + + + + + diff --git a/agents/hierarchical_agents/SNN_HRL.py b/agents/hierarchical_agents/SNN_HRL.py new file mode 100644 index 0000000000000000000000000000000000000000..53a381967aa7609a353622a8185f9252b4c9c3ab --- /dev/null +++ b/agents/hierarchical_agents/SNN_HRL.py @@ -0,0 +1,146 @@ +import copy +import random +import time +import numpy as np +import torch +from gym import Wrapper, spaces +from agents.Base_Agent import Base_Agent +from agents.policy_gradient_agents.PPO import PPO +from agents.DQN_agents.DDQN import DDQN + + +class SNN_HRL(Base_Agent): + """Implements the hierarchical RL agent that uses stochastic neural networks (SNN) from the paper Florensa et al. 2017 + https://arxiv.org/pdf/1704.03012.pdf + Works by: + 1) Creating a pre-training environment within which the skill_agent can learn for some period of time + 2) Then skill_agent is frozen + 3) Then we train a manager agent that chooses which of the pre-trained skills to let act for it for some period of time + Note that it only works with discrete states at the moment. 
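The lower-level intrinsic reward defined in `calculate_intrinsic_reward` above is the negative Euclidean distance between the state the goal asked for and the state actually reached. A minimal sketch with made-up vectors:

```python
import numpy as np

# Hypothetical internal state, goal and reached next state.
state      = np.array([0.0, 1.0])
goal       = np.array([2.0, -1.0])
next_state = np.array([1.5, 0.2])

# Same computation as Lower_Level_Agent_Environment_Wrapper.calculate_intrinsic_reward.
error = (state + goal) - next_state
intrinsic_reward = -np.sqrt(np.dot(error, error))
print(round(intrinsic_reward, 3))  # -0.539; approaches 0 as the agent reaches state + goal
```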
+ + Note that this agent will not work well in environments where it is beneficial to end the game as quickly as possible + because then there isn't enough incentive for the skills to learn to explore different parts of the state space + """ + agent_name = "SNN-HRL" + + def __init__(self, config): + Base_Agent.__init__(self, config) + assert isinstance(self.environment.reset(), int) or isinstance(self.environment.reset(), np.int64) or self.environment.reset().dtype == np.int64, "only works for discrete states currently" + self.num_skills = self.hyperparameters["SKILL_AGENT"]["num_skills"] + self.episodes_for_pretraining = self.hyperparameters["SKILL_AGENT"]["episodes_for_pretraining"] + self.timesteps_before_changing_skill = self.hyperparameters["MANAGER"]["timesteps_before_changing_skill"] + + self.skill_agent_config = copy.deepcopy(config) + self.skill_agent_config.hyperparameters = self.skill_agent_config.hyperparameters["SKILL_AGENT"] + self.skill_agent_config.num_episodes_to_run = self.episodes_for_pretraining + + self.manager_config = copy.deepcopy(config) + self.manager_config.hyperparameters = self.manager_config.hyperparameters["MANAGER"] + self.manager_config.num_episodes_to_run = self.config.num_episodes_to_run - self.skill_agent_config.num_episodes_to_run + + def run_n_episodes(self): + """Runs game to completion n times and then summarises results and saves model (if asked to)""" + start = time.time() + + skill_agent = self.create_skill_training_agent() + skill_agent.run_n_episodes() + self.skill_agent_config.environment.print_state_distribution() + skill_agent.turn_off_any_epsilon_greedy_exploration() + + manager_agent = self.create_manager_agent(skill_agent) + manager_agent.run_n_episodes() + + time_taken = time.time() - start + pretraining_results = [np.min(manager_agent.game_full_episode_scores)]*self.episodes_for_pretraining + return pretraining_results + manager_agent.game_full_episode_scores, pretraining_results + manager_agent.rolling_results, time_taken + + def create_skill_training_agent(self): + """Creates and instantiates a pre-training environment for the agent to learn skills in and then instantiates + and agent to learn in this environment""" + self.skill_agent_config.environment = Skill_Wrapper(copy.deepcopy(self.environment), self.environment.observation_space.n, + self.num_skills, + self.skill_agent_config.hyperparameters[ + "regularisation_weight"], self.skill_agent_config.hyperparameters["visitations_decay"]) + return DDQN(self.skill_agent_config) + + def create_manager_agent(self, skill_agent): + """Instantiates a manager agent""" + self.manager_config.environment = Manager_Frozen_Worker_Wrapper(copy.deepcopy(self.environment), self.num_skills, + self.timesteps_before_changing_skill, skill_agent) + return DDQN(self.manager_config) + + +class Skill_Wrapper(Wrapper): + """Open AI gym wrapper to help create a pretraining environment in which to train skills""" + def __init__(self, env, num_states, num_skills, regularisation_weight, visitations_decay): + Wrapper.__init__(self, env) + self.num_skills = num_skills + self.num_states = num_states + self.state_visitations = [[0 for _ in range(num_states)] for _ in range(num_skills)] + self.regularisation_weight = regularisation_weight + self.visitations_decay = visitations_decay + + def reset(self, **kwargs): + observation = self.env.reset(**kwargs) + self.skill = random.randint(0, self.num_skills - 1) + return self.observation(observation) + + def observation(self, observation): + return 
np.concatenate((np.array(observation), np.array([self.skill]))) + + def step(self, action): + next_state, reward, done, _ = self.env.step(action) + new_reward = self.calculate_new_reward(reward, next_state) + return self.observation(next_state), new_reward, done, _ + + def calculate_new_reward(self, reward, next_state): + self.update_state_visitations(next_state) + probability_correct_skill = self.calculate_probability_correct_skill(next_state) + new_reward = reward + self.regularisation_weight * np.log(probability_correct_skill) + return new_reward + + def update_state_visitations(self, next_state): + """Updates table keeping track of number of times each state visited under each skill""" + self.state_visitations = [[val * self.visitations_decay for val in sublist] for sublist in + self.state_visitations] + self.state_visitations[self.skill][next_state[0]] += 1 + + def calculate_probability_correct_skill(self, next_state): + """Calculates the probability that being in a state implies a certain skill""" + visitations_correct_skill = self.state_visitations[self.skill][next_state[0]] + visitations_any_skill = np.sum([visit[next_state[0]] for visit in self.state_visitations]) + probability = float(visitations_correct_skill) / float(visitations_any_skill) + return probability + + def print_state_distribution(self): + """Prints the observed probability of skills depending on the state we are in""" + print(self.state_visitations) + state_count = {k: 0 for k in range(self.num_states)} + for skill in range(len(self.state_visitations)): + for state in range(len(self.state_visitations[0])): + state_count[state] += self.state_visitations[skill][state] + probability_visitations = [[row[ix] / max(1.0, state_count[ix]) for ix in range(len(row))] for row in + self.state_visitations] + print(" ") + print(probability_visitations) + print(" ") + +class Manager_Frozen_Worker_Wrapper(Wrapper): + """Open AI gym wrapper to help create an environment where manager learns to act by instructing a frozen worker""" + def __init__(self, env, num_skills, timesteps_before_changing_skill, skills_agent): + Wrapper.__init__(self, env) + self.action_space = spaces.Discrete(num_skills) + self.timesteps_before_changing_skill = timesteps_before_changing_skill + self.skills_agent = skills_agent + + def step(self, action): + """Moves a step in manager environment which involves committing to using a skill for a set number of timesteps""" + next_state = self.env.unwrapped.s + cumulative_reward = 0 + for _ in range(self.timesteps_before_changing_skill): + with torch.no_grad(): + skill_action = self.skills_agent.pick_action(np.array([next_state[0], action])) + next_state, reward, done, _ = self.env.step(skill_action) + cumulative_reward += reward + if done: break + return next_state, cumulative_reward, done, _ \ No newline at end of file diff --git a/agents/hierarchical_agents/__pycache__/DIAYN.cpython-39.pyc b/agents/hierarchical_agents/__pycache__/DIAYN.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..01f99e8dbda8aadd750e93bb0167d6ca24664333 Binary files /dev/null and b/agents/hierarchical_agents/__pycache__/DIAYN.cpython-39.pyc differ diff --git a/agents/hierarchical_agents/__pycache__/HIRO.cpython-39.pyc b/agents/hierarchical_agents/__pycache__/HIRO.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..caf6b1fb8d3b80b0dbfd24cb30fcaabbb09ab580 Binary files /dev/null and b/agents/hierarchical_agents/__pycache__/HIRO.cpython-39.pyc differ diff --git 
a/agents/hierarchical_agents/__pycache__/SNN_HRL.cpython-39.pyc b/agents/hierarchical_agents/__pycache__/SNN_HRL.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d91419a055d4dbcd68d35245cac311ee8c574d4f Binary files /dev/null and b/agents/hierarchical_agents/__pycache__/SNN_HRL.cpython-39.pyc differ diff --git a/agents/hierarchical_agents/__pycache__/h_DQN.cpython-39.pyc b/agents/hierarchical_agents/__pycache__/h_DQN.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04a1597a4fdcdbc378d5f69697383af8be08d32c Binary files /dev/null and b/agents/hierarchical_agents/__pycache__/h_DQN.cpython-39.pyc differ diff --git a/agents/hierarchical_agents/h_DQN.py b/agents/hierarchical_agents/h_DQN.py new file mode 100644 index 0000000000000000000000000000000000000000..a9fcc75cf6a4c22a634dc209128b0e2d35634dd8 --- /dev/null +++ b/agents/hierarchical_agents/h_DQN.py @@ -0,0 +1,121 @@ +import copy +import numpy as np +from agents.Base_Agent import Base_Agent +from agents.DQN_agents.DDQN import DDQN + +class h_DQN(Base_Agent): + """Implements hierarchical RL agent h-DQN from paper Kulkarni et al. (2016) https://arxiv.org/abs/1604.06057?context=stat + Note also that this algorithm only works when we have discrete states and discrete actions currently because otherwise + it is not clear what it means to achieve a subgoal state designated by the meta-controller""" + agent_name = "h-DQN" + + def __init__(self, config): + Base_Agent.__init__(self, config) + self.controller_config = copy.deepcopy(config) + self.controller_config.hyperparameters = self.controller_config.hyperparameters["CONTROLLER"] + self.controller = DDQN(self.controller_config) + self.controller.q_network_local = self.create_NN(input_dim=self.state_size*2, output_dim=self.action_size, + key_to_use="CONTROLLER") + self.meta_controller_config = copy.deepcopy(config) + self.meta_controller_config.hyperparameters = self.meta_controller_config.hyperparameters["META_CONTROLLER"] + self.meta_controller = DDQN(self.meta_controller_config) + self.meta_controller.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.state_size, + key_to_use="META_CONTROLLER") + self.rolling_intrinsic_rewards = [] + self.goals_seen = [] + self.controller_learnt_enough = False + self.controller_actions = [] + + def reset_game(self): + """Resets the game information so we are ready to play a new episode""" + self.state = self.environment.reset() + self.next_state = None + self.action = None + self.reward = None + self.done = False + self.cumulative_meta_controller_reward = 0 + self.episode_over = False + self.subgoal_achieved = False + self.total_episode_score_so_far = 0 + self.meta_controller_steps = 0 + self.update_learning_rate(self.controller_config.hyperparameters["learning_rate"], self.controller.q_network_optimizer) + self.update_learning_rate(self.meta_controller_config.hyperparameters["learning_rate"], self.meta_controller.q_network_optimizer) + + def step(self): + + self.episode_steps = 0 + + while not self.episode_over: + episode_intrinsic_rewards = [] + self.meta_controller_state = self.environment.state + self.subgoal = self.meta_controller.pick_action(state=self.meta_controller_state) + self.goals_seen.append(self.subgoal) + self.subgoal_achieved = False + self.state = np.concatenate((self.environment.state, np.array([self.subgoal]))) + self.cumulative_meta_controller_reward = 0 + + while not (self.episode_over or self.subgoal_achieved): + 
self.pick_and_conduct_controller_action() + self.update_data() + if self.time_to_learn(self.controller.memory, self.global_step_number, "CONTROLLER"): #means it is time to train controller + for _ in range(self.hyperparameters["CONTROLLER"]["learning_iterations"]): + self.controller.learn() + self.save_experience(memory=self.controller.memory, experience=(self.state, self.action, self.reward, self.next_state, self.done)) + self.state = self.next_state #this is to set the state for the next iteration + self.global_step_number += 1 + episode_intrinsic_rewards.append(self.reward) + + if self.time_to_learn(self.meta_controller.memory, self.meta_controller_steps, "META_CONTROLLER"): + for _ in range(self.hyperparameters["META_CONTROLLER"]["learning_iterations"]): + self.meta_controller.learn() + + self.save_experience(memory=self.meta_controller.memory, + experience=(self.meta_controller_state, self.subgoal, self.cumulative_meta_controller_reward, + self.meta_controller_next_state, self.episode_over)) + self.meta_controller_steps += 1 + self.episode_steps += 1 + + self.rolling_intrinsic_rewards.append(np.sum(episode_intrinsic_rewards)) + if self.episode_number % 100 == 0: + print(" ") + print("Most common goal -- {} -- ".format( max(set(self.goals_seen[-100:]), key=self.goals_seen[-100:].count) )) + print("Intrinsic Rewards -- {} -- ".format(np.mean(self.rolling_intrinsic_rewards[-100:]))) + print("Average controller action -- {} ".format(np.mean(self.controller_actions[-100:]))) + print("Latest subgoal -- {}".format(self.goals_seen[-1])) + self.episode_number += 1 + self.controller.episode_number += 1 + self.meta_controller.episode_number += 1 + + def pick_and_conduct_controller_action(self): + """Picks and conducts an action for controller""" + self.action = self.controller.pick_action(state=self.state) + self.controller_actions.append(self.action) + self.conduct_action() + + def update_data(self): + """Updates stored data for controller and meta-controller. 
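To make the controller's inputs and its intrinsic signal concrete, here is a minimal sketch (the state and subgoal values are hypothetical) of how the goal-augmented controller state and the binary subgoal reward are formed, mirroring `update_controller_data` below:

```python
import numpy as np

# Hypothetical discrete environment state and a subgoal chosen by the meta-controller.
environment_state = np.array([3])
subgoal = 7

# Controller input: the environment state with the current subgoal appended.
controller_state = np.concatenate((environment_state, np.array([subgoal])))  # array([3, 7])

# After the environment step, the controller earns 1.0 only if it landed on the subgoal state.
environment_next_state = np.array([7])
subgoal_achieved = environment_next_state[0] == subgoal
intrinsic_reward = 1.0 * subgoal_achieved
print(controller_state, intrinsic_reward)  # [3 7] 1.0
```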
It must occur in the order shown""" + self.episode_over = self.environment.get_done() + self.update_controller_data() + self.update_meta_controller_data() + + def update_controller_data(self): + """Gets the next state, reward and done information from the environment""" + environment_next_state = self.environment.get_next_state() + assert environment_next_state.shape[0] == 1 + self.next_state = np.concatenate((environment_next_state, np.array([self.subgoal]))) + self.subgoal_achieved = environment_next_state[0] == self.subgoal + self.reward = 1.0 * self.subgoal_achieved + self.done = self.subgoal_achieved or self.episode_over + + def update_meta_controller_data(self): + """Updates data relating to meta controller""" + self.cumulative_meta_controller_reward += self.environment.get_reward() + self.total_episode_score_so_far += self.environment.get_reward() + if self.done: + self.meta_controller_next_state = self.environment.get_next_state() + + def time_to_learn(self, memory, steps_taken, controller_name): + """Boolean indicating whether it is time for meta-controller or controller to learn""" + enough_experiences = len(memory) > self.hyperparameters[controller_name]["batch_size"] + enough_steps_taken = steps_taken % self.hyperparameters[controller_name]["update_every_n_steps"] == 0 + return enough_experiences and enough_steps_taken diff --git a/agents/policy_gradient_agents/PPO.py b/agents/policy_gradient_agents/PPO.py new file mode 100644 index 0000000000000000000000000000000000000000..a1910101edd121ec84ea9e52652ec37e6abf9f87 --- /dev/null +++ b/agents/policy_gradient_agents/PPO.py @@ -0,0 +1,132 @@ +import copy +import sys +import torch +import numpy as np +from torch import optim +from agents.Base_Agent import Base_Agent +from exploration_strategies.Epsilon_Greedy_Exploration import Epsilon_Greedy_Exploration +from utilities.Parallel_Experience_Generator import Parallel_Experience_Generator +from utilities.Utility_Functions import normalise_rewards, create_actor_distribution + +class PPO(Base_Agent): + """Proximal Policy Optimization agent""" + agent_name = "PPO" + + def __init__(self, config): + Base_Agent.__init__(self, config) + self.policy_output_size = self.calculate_policy_output_size() + self.policy_new = self.create_NN(input_dim=self.state_size, output_dim=self.policy_output_size) + self.policy_old = self.create_NN(input_dim=self.state_size, output_dim=self.policy_output_size) + self.policy_old.load_state_dict(copy.deepcopy(self.policy_new.state_dict())) + self.policy_new_optimizer = optim.Adam(self.policy_new.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4) + self.episode_number = 0 + self.many_episode_states = [] + self.many_episode_actions = [] + self.many_episode_rewards = [] + self.experience_generator = Parallel_Experience_Generator(self.environment, self.policy_new, self.config.seed, + self.hyperparameters, self.action_size) + self.exploration_strategy = Epsilon_Greedy_Exploration(self.config) + + def calculate_policy_output_size(self): + """Initialises the policies""" + if self.action_types == "DISCRETE": + return self.action_size + elif self.action_types == "CONTINUOUS": + return self.action_size * 2 #Because we need 1 parameter for mean and 1 for std of distribution + + def step(self): + """Runs a step for the PPO agent""" + exploration_epsilon = self.exploration_strategy.get_updated_epsilon_exploration({"episode_number": self.episode_number}) + self.many_episode_states, self.many_episode_actions, self.many_episode_rewards = 
self.experience_generator.play_n_episodes( + self.hyperparameters["episodes_per_learning_round"], exploration_epsilon) + self.episode_number += self.hyperparameters["episodes_per_learning_round"] + self.policy_learn() + self.update_learning_rate(self.hyperparameters["learning_rate"], self.policy_new_optimizer) + self.equalise_policies() + + def policy_learn(self): + """A learning iteration for the policy""" + all_discounted_returns = self.calculate_all_discounted_returns() + if self.hyperparameters["normalise_rewards"]: + all_discounted_returns = normalise_rewards(all_discounted_returns) + for _ in range(self.hyperparameters["learning_iterations_per_round"]): + all_ratio_of_policy_probabilities = self.calculate_all_ratio_of_policy_probabilities() + loss = self.calculate_loss([all_ratio_of_policy_probabilities], all_discounted_returns) + self.take_policy_new_optimisation_step(loss) + + def calculate_all_discounted_returns(self): + """Calculates the cumulative discounted return for each episode which we will then use in a learning iteration""" + all_discounted_returns = [] + for episode in range(len(self.many_episode_states)): + discounted_returns = [0] + for ix in range(len(self.many_episode_states[episode])): + return_value = self.many_episode_rewards[episode][-(ix + 1)] + self.hyperparameters["discount_rate"]*discounted_returns[-1] + discounted_returns.append(return_value) + discounted_returns = discounted_returns[1:] + all_discounted_returns.extend(discounted_returns[::-1]) + return all_discounted_returns + + def calculate_all_ratio_of_policy_probabilities(self): + """For each action calculates the ratio of the probability that the new policy would have picked the action vs. + the probability the old policy would have picked it. This will then be used to inform the loss""" + all_states = [state for states in self.many_episode_states for state in states] + all_actions = [[action] if self.action_types == "DISCRETE" else action for actions in self.many_episode_actions for action in actions ] + all_states = torch.stack([torch.Tensor(states).float().to(self.device) for states in all_states]) + + all_actions = torch.stack([torch.Tensor(actions).float().to(self.device) for actions in all_actions]) + all_actions = all_actions.view(-1, len(all_states)) + + new_policy_distribution_log_prob = self.calculate_log_probability_of_actions(self.policy_new, all_states, all_actions) + old_policy_distribution_log_prob = self.calculate_log_probability_of_actions(self.policy_old, all_states, all_actions) + ratio_of_policy_probabilities = torch.exp(new_policy_distribution_log_prob) / (torch.exp(old_policy_distribution_log_prob) + 1e-8) + return ratio_of_policy_probabilities + + def calculate_log_probability_of_actions(self, policy, states, actions): + """Calculates the log probability of an action occuring given a policy and starting state""" + policy_output = policy.forward(states).to(self.device) + policy_distribution = create_actor_distribution(self.action_types, policy_output, self.action_size) + policy_distribution_log_prob = policy_distribution.log_prob(actions) + return policy_distribution_log_prob + + def calculate_loss(self, all_ratio_of_policy_probabilities, all_discounted_returns): + """Calculates the PPO loss""" + all_ratio_of_policy_probabilities = torch.squeeze(torch.stack(all_ratio_of_policy_probabilities)) + all_ratio_of_policy_probabilities = torch.clamp(input=all_ratio_of_policy_probabilities, + min = -sys.maxsize, + max = sys.maxsize) + all_discounted_returns = 
torch.tensor(all_discounted_returns).to(all_ratio_of_policy_probabilities) + potential_loss_value_1 = all_discounted_returns * all_ratio_of_policy_probabilities + potential_loss_value_2 = all_discounted_returns * self.clamp_probability_ratio(all_ratio_of_policy_probabilities) + loss = torch.min(potential_loss_value_1, potential_loss_value_2) + loss = -torch.mean(loss) + return loss + + def clamp_probability_ratio(self, value): + """Clamps a value between a certain range determined by hyperparameter clip epsilon""" + return torch.clamp(input=value, min=1.0 - self.hyperparameters["clip_epsilon"], + max=1.0 + self.hyperparameters["clip_epsilon"]) + + def take_policy_new_optimisation_step(self, loss): + """Takes an optimisation step for the new policy""" + self.policy_new_optimizer.zero_grad() # reset gradients to 0 + loss.backward() # this calculates the gradients + torch.nn.utils.clip_grad_norm_(self.policy_new.parameters(), self.hyperparameters[ + "gradient_clipping_norm"]) # clip gradients to help stabilise training + self.policy_new_optimizer.step() # this applies the gradients + + def equalise_policies(self): + """Sets the old policy's parameters equal to the new policy's parameters""" + for old_param, new_param in zip(self.policy_old.parameters(), self.policy_new.parameters()): + old_param.data.copy_(new_param.data) + + def save_result(self): + """Save the results seen by the agent in the most recent experiences""" + for ep in range(len(self.many_episode_rewards)): + total_reward = np.sum(self.many_episode_rewards[ep]) + self.game_full_episode_scores.append(total_reward) + self.rolling_results.append(np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:])) + self.save_max_result_seen() + + def locally_save_policy(self): + """Saves the policy""" + torch.save(self.policy_new.state_dict(), "{}/{}_network.pt".format(self.config.models_dir, self.agent_name)) diff --git a/agents/policy_gradient_agents/REINFORCE.py b/agents/policy_gradient_agents/REINFORCE.py new file mode 100644 index 0000000000000000000000000000000000000000..2b15bedf35124f0fee7e36d0c3f4345e1a99d011 --- /dev/null +++ b/agents/policy_gradient_agents/REINFORCE.py @@ -0,0 +1,95 @@ +import numpy as np +import torch +import torch.optim as optim +from torch.distributions import Categorical +from agents.Base_Agent import Base_Agent + +class REINFORCE(Base_Agent): + agent_name = "REINFORCE" + def __init__(self, config): + Base_Agent.__init__(self, config) + self.policy = self.create_NN(input_dim=self.state_size, output_dim=self.action_size) + self.optimizer = optim.Adam(self.policy.parameters(), lr=self.hyperparameters["learning_rate"]) + self.episode_rewards = [] + self.episode_log_probabilities = [] + + def reset_game(self): + """Resets the game information so we are ready to play a new episode""" + self.state = self.environment.reset() + self.next_state = None + self.action = None + self.reward = None + self.done = False + self.total_episode_score_so_far = 0 + self.episode_rewards = [] + self.episode_log_probabilities = [] + self.episode_step_number = 0 + + def step(self): + """Runs a step within a game including a learning step if required""" + while not self.done: + self.pick_and_conduct_action_and_save_log_probabilities() + self.update_next_state_reward_done_and_score() + self.store_reward() + if self.time_to_learn(): + self.actor_learn() + self.state = self.next_state #this is to set the state for the next iteration + self.episode_step_number += 1 + self.episode_number += 1 + + def 
pick_and_conduct_action_and_save_log_probabilities(self): + """Picks and then conducts actions. Then saves the log probabilities of the actions it conducted to be used for + learning later""" + action, log_probabilities = self.pick_action_and_get_log_probabilities() + self.store_log_probabilities(log_probabilities) + self.store_action(action) + self.conduct_action(action) + + def pick_action_and_get_log_probabilities(self): + """Picks actions and then calculates the log probabilities of the actions it picked given the policy""" + # PyTorch only accepts mini-batches and not individual observations so we have to add + # a "fake" dimension to our observation using unsqueeze + state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device) + action_probabilities = self.policy.forward(state).cpu() + action_distribution = Categorical(action_probabilities) # this creates a distribution to sample from + action = action_distribution.sample() + return action.item(), action_distribution.log_prob(action) + + def store_log_probabilities(self, log_probabilities): + """Stores the log probabilities of picked actions to be used for learning later""" + self.episode_log_probabilities.append(log_probabilities) + + def store_action(self, action): + """Stores the action picked""" + self.action = action + + def store_reward(self): + """Stores the reward picked""" + self.episode_rewards.append(self.reward) + + def actor_learn(self): + """Runs a learning iteration for the policy""" + total_discounted_reward = self.calculate_episode_discounted_reward() + policy_loss = self.calculate_policy_loss_on_episode(total_discounted_reward) + self.optimizer.zero_grad() + policy_loss.backward() + self.optimizer.step() + + def calculate_episode_discounted_reward(self): + """Calculates the cumulative discounted return for the episode""" + discounts = self.hyperparameters["discount_rate"] ** np.arange(len(self.episode_rewards)) + total_discounted_reward = np.dot(discounts, self.episode_rewards) + return total_discounted_reward + + def calculate_policy_loss_on_episode(self, total_discounted_reward): + """Calculates the loss from an episode""" + policy_loss = [] + for log_prob in self.episode_log_probabilities: + policy_loss.append(-log_prob * total_discounted_reward) + policy_loss = torch.cat(policy_loss).sum() # We need to add up the losses across the mini-batch to get 1 overall loss + return policy_loss + + def time_to_learn(self): + """Tells us whether it is time for the algorithm to learn. 
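As a worked illustration of `calculate_episode_discounted_reward` and `calculate_policy_loss_on_episode` above (a minimal sketch; the rewards, discount rate and action probabilities are invented for the example):

```python
import numpy as np
import torch

# Hypothetical episode: three rewards and the log-probabilities of the actions taken.
rewards = [1.0, 0.0, 1.0]
discount_rate = 0.99
log_probabilities = [torch.log(torch.tensor([p])) for p in (0.6, 0.3, 0.8)]

# Single scalar return for the whole episode, as in calculate_episode_discounted_reward.
discounts = discount_rate ** np.arange(len(rewards))
total_discounted_reward = np.dot(discounts, rewards)  # 1.0 + 0.0 + 0.99**2 = 1.9801

# Loss: every log-probability is weighted by the same episode return, then summed.
policy_loss = torch.cat([-lp * total_discounted_reward for lp in log_probabilities]).sum()
print(round(total_discounted_reward, 4), round(policy_loss.item(), 4))
```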
With REINFORCE we only learn at the end of every + episode so this just returns whether the episode is over""" + return self.done diff --git a/agents/policy_gradient_agents/__pycache__/PPO.cpython-310.pyc b/agents/policy_gradient_agents/__pycache__/PPO.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c0d43590e8c325928eaccdca5a52c2bdee5acd2 Binary files /dev/null and b/agents/policy_gradient_agents/__pycache__/PPO.cpython-310.pyc differ diff --git a/agents/policy_gradient_agents/__pycache__/PPO.cpython-39.pyc b/agents/policy_gradient_agents/__pycache__/PPO.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9fc84f63185a2fc529e8f23acf0a4074c14e0a6 Binary files /dev/null and b/agents/policy_gradient_agents/__pycache__/PPO.cpython-39.pyc differ diff --git a/agents/policy_gradient_agents/__pycache__/REINFORCE.cpython-39.pyc b/agents/policy_gradient_agents/__pycache__/REINFORCE.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee601f14b3785691bfac3c9a6e54149a81182409 Binary files /dev/null and b/agents/policy_gradient_agents/__pycache__/REINFORCE.cpython-39.pyc differ diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..820e21e8f194acb34fec68727114346dd6c3e704 --- /dev/null +++ b/app.py @@ -0,0 +1,22 @@ +import streamlit as st + +def main(): + st.title("Anti-Jamming Configuration App") + mode = st.radio("Choose Mode", ["Auto", "Manual"]) + + if mode == "Auto": + jammer_type = "dynamic" + agent_type = "DQN with prioritized replay memory" + channel_switching_cost = 0.1 + else: + jammer_type = st.selectbox("Select Jammer Type", ["constant", "sweeping", "random", "dynamic"]) + agent_type = st.selectbox("Select Agent Type", ["DQN", "DQN with fixed targets", "DDQN", "Dueling DDQN", "DQN with prioritized replay memory"]) + channel_switching_cost = st.selectbox("Select Channel Switching Cost", [0, 0.05, 0.1, 0.15, 0.2]) + + st.write("Configuration:") + st.write(f"Jammer Type: {jammer_type}") + st.write(f"Agent Type: {agent_type}") + st.write(f"Channel Switching Cost: {channel_switching_cost}") + +if __name__ == "__main__": + main() diff --git a/data/__init__.py b/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/data/__pycache__/__init__.cpython-310.pyc b/data/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f0507b5ae1f10dfc554d188a9acee216ef0db4a Binary files /dev/null and b/data/__pycache__/__init__.cpython-310.pyc differ diff --git a/data/__pycache__/__init__.cpython-39.pyc b/data/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a47b7ca23df42b3053cc7521634923266947ab28 Binary files /dev/null and b/data/__pycache__/__init__.cpython-39.pyc differ diff --git a/data/__pycache__/dataset.cpython-310.pyc b/data/__pycache__/dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbf7bb704d276417fefb79096ba21e8c4dc262eb Binary files /dev/null and b/data/__pycache__/dataset.cpython-310.pyc differ diff --git a/data/__pycache__/dataset.cpython-39.pyc b/data/__pycache__/dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3fa61b51f6ff0e04e8e5fe34d37b63a3e54e6c51 Binary files /dev/null and b/data/__pycache__/dataset.cpython-39.pyc differ diff --git a/data/dataset.py b/data/dataset.py new file 
mode 100644 index 0000000000000000000000000000000000000000..91b12ac8886fa9a37a5fb11e0758a45972fda853 --- /dev/null +++ b/data/dataset.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python +# coding:utf-8 +""" +name : dataset.py, +version : 1.0.0, +url : https://github.com/abubakar-sani/RFF, +license : MIT License, +copyright : Copyright 2021 by Abubakar Sani Ali, Khalifa University, +author : Abubakar Sani Ali, +email : engrabubakarsani@gmail.com, +date : 9/5/2022, +description : Dataset module that returns the processed data, +""" + +import math +import os +import numpy as np +import pandas as pd + +import json + +base_dir = os.path.abspath('') +raw_dir = f'{base_dir}/datasets/raw/spectral_scans_QC9880_ht20_background' +processed_dir = f'{base_dir}/datasets/processed/spectral_scans_QC9880_ht20_background' + + +def load_spectral_scans(jammer, jammed_freq, jammer_dist, jamming_power, channels, n_features, n_scans=10): + if jammer == 'combined': + scenario = f'samples_chamber_{jammer_dist}cm_{jamming_power}dBm' + elif jammer == 'none': + interference = np.random.choice(['high', 'medium', 'low'], p=[0.4, 0.4, 0.2]) + if interference == 'high': + scenario = f'samples_office_None' + elif interference == 'low': + scenario = f'samples_lab_None' + else: + scenario = f'samples_chamber_None' + else: + scenario = f'samples_chamber_{jammed_freq}MHz_{jammer_dist}cm_{jamming_power}dBm' + # Process the dataset and generate the raw numpy (to processing it every run) + if not os.path.isfile(f'{processed_dir}/{scenario}.npy'): + spectral_data = [] + for channel in channels: + channel_data = np.empty((0, n_features), int) + for scan in range(n_scans): + if jammer == 'combined': + scenario = f'samples_chamber_{channel}MHz_{jammer_dist}cm_{jamming_power}dBm' + + df = pd.read_csv(f'{raw_dir}/{scenario}_{scan}.csv') + temp_data = df.where(df['freq1'] == channel).dropna() + temp_data = temp_data.to_numpy() + channel_data = np.append(channel_data, temp_data, axis=0) + + spectral_data.append(channel_data) + + if not os.path.exists(processed_dir): + os.makedirs(processed_dir) + if jammer == 'combined': + scenario = f'samples_chamber_{jammer_dist}cm_{jamming_power}dBm' + np.save(f'{processed_dir}/{scenario}', np.array(spectral_data, dtype='object')) + + spectral_data = np.load(f'{processed_dir}/{scenario}.npy', allow_pickle=True) + + return spectral_data + + +def get_feats_dirs(scenario, scan=1): + spectral_df = pd.read_csv(f'{raw_dir}/{scenario}_{scan}.csv') + features = spectral_df.columns + n_features = len(features) + return features, n_features + + +def process_data(data, channels, length, stride, time_step, mode=0): + if mode == 0: + min_len = len(data[0]) + for i in range(len(data)): + min_len = len(data[i]) if len(data[i]) < min_len else min_len + t_samples = min_len + else: + t_samples = len(data) + n_samples = math.floor((t_samples - length) / stride) + if mode == 1: + # Breaking data into batches + data = data[0:t_samples] + batches = [] + for sample in range(n_samples): + batch = [] + for i in range(length): + batch_sample = data[(sample * stride) + i] + batch.append(batch_sample) + batches.append(batch) + + else: + batches = [] + for sample in range(n_samples): + batch = [] + for i in range(len(channels)): + channel_data = data[i] + channel_samples = channel_data[(sample * stride):(length + (sample * stride)), :] + batch.append(channel_samples) + batches.append(batch) + + processed_data = np.array(batches[time_step % len(batches)]) + + return processed_data diff --git a/environments/RF_spectrum.py 
b/environments/RF_spectrum.py new file mode 100644 index 0000000000000000000000000000000000000000..7461e8165326970334f4919f54dd28d0cca43048 --- /dev/null +++ b/environments/RF_spectrum.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python +# coding:utf-8 +""" +name : RF_spectrum.py, +version : 1.0.0, +url : https://github.com/abubakar-sani/Anti-Jam, +license : MIT License, +copyright : Copyright 2022 by Abubakar Sani Ali, +author : Abubakar Sani Ali, +email : engrabubakarsani@gmail.com, +date : 9/9/2022, +description : Environment class for the anti-jamming problem, +""" + +# %% Loading libraries +import os +import numpy as np +import pandas as pd +import gym +from gym import spaces +from gym.utils import seeding +from random import randint +from matplotlib import pyplot +import matplotlib as mpl +import copy +import random +from data.dataset import load_spectral_scans, process_data, get_feats_dirs + + +class RfEnvironment(gym.Env): + environment_name = "Anti Jamming" + + def __init__(self, jammers, jammer_dist, jamming_power, csc, channels, length, stride, n_scans=10): + self.selected_freq_prob = None + self.reward = None + self.jammed_freq = None + self.jammer = None + self.sweep_counter = None + self.channel_data = None + self.spectral_data = None + self.state = None + self.cstime = 0 + self.csc = csc + self.interference_types = ['high', 'medium', 'low'] + self.jammers = jammers + self.jammer_dists = [20] + self.jammer_dist = jammer_dist + self.jamming_powers = [10] + self.jamming_power = jamming_power + self.jammer_bandwidth = 20 # MHz + self.n_scans = n_scans + self.length = length + self.time_step = 1 + self.stride = stride # if self.mode == 1 else self.length + self.channels = channels + self.freq = self.channels[0] + self.previous_action = 0 + self.scenario = f'samples_chamber_{self.freq}MHz_{self.jammer_dist}cm_{self.jamming_power}dBm' + self.features, self.n_features = get_feats_dirs(self.scenario) + self.stat_features = ['mean'] + + self.average_rssi = 0 + + self.action_space = spaces.Discrete(len(self.channels)) + + self.observation_size = len(self.channels) # * (1 + (self.n_features - 1) * len(self.stat_features)) + self.observation_space = spaces.Box(low=np.ones(self.observation_size) * -100, + high=np.ones(self.observation_size) * 100) + # self.seed() + self.reward_RF = 1 + self.n_collisions = 0 + self.trials = 5 + self._max_episode_steps = 5 + self.reward_threshold = 0.95 * self._max_episode_steps + self.id = "Dynamic Anti-jamming" if len(jammers) > 1 else f"{self.jammers[0]} Anti-jamming" + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def reset(self): + """ + Here we reshuffle our dataset and then obtain the RF spectrum scans at the beginning of an episode + + """ + # Getting data + self.step_count = 0 + self.jammer = np.random.choice(self.jammers) + self.jammer_dist = np.random.choice(self.jammer_dists) + self.jamming_power = np.random.choice(self.jamming_powers) + + if self.jammer == 'constant': + self.jammed_freq = np.random.choice(self.channels) + elif self.jammer == 'sweeping': + self.sweep_counter = 0 + self.jammed_freq = self.freq + elif self.jammer == 'random': + self.jammed_freq = np.random.choice(self.channels) + else: + self.jammed_freq = self.channels[0] + + self.time_step = np.random.randint(100) # Introducing randomisation at the start + + self.spectral_data = load_spectral_scans(self.jammer, self.jammed_freq, self.jammer_dist, self.jamming_power, + self.channels, self.n_features) + self.state = self.get_state( + 
process_data(self.spectral_data, self.channels, self.length, self.stride, self.time_step)) + + # self.state = np.eye(len(self.channels))[self.channels.index(self.jammed_freq)] + + return self.state.flatten() + + def step(self, action): + # Get reward of previous action taken (r_t) + self.step_count += 1 + self.freq = self.channels[action] + if action != self.previous_action: self.cstime = 1 + + # When the agent transmits and no jammer + if self.jammer == 'none': + self.reward = self.get_reward(action) + + if self.freq > (self.jammed_freq + self.jammer_bandwidth/2) or self.freq < (self.jammed_freq - self.jammer_bandwidth/2): + self.reward = self.get_reward(action) + else: + # There is collision + self.reward = 0 + self.n_collisions += 1 + + self.previous_action = action + + # Go to next state (s_t+1) + if self.jammer == 'sweeping': + self.sweep_counter = self.sweep_counter + 1 + sweep_slot = self.sweep_counter % len(self.channels) + self.jammed_freq = self.channels[sweep_slot] + elif self.jammer == 'random': + self.jammed_freq = np.random.choice(self.channels) + + self.spectral_data = load_spectral_scans(self.jammer, self.jammed_freq, self.jammer_dist, self.jamming_power, + self.channels, self.n_features) + + self.state = self.get_state( + process_data(self.spectral_data, self.channels, self.length, self.stride, self.time_step)) + + self.time_step = self.time_step + 1 + # self.state = np.eye(len(self.channels))[self.channels.index(self.jammed_freq)] + if self.step_count >= self._max_episode_steps: + self.done = True + else: + self.done = False + + return self.state.flatten(), self.reward, self.done, self.cstime + + def get_state(self, processed_channel_data): + state = np.zeros((len(self.channels), 1)) + for channel in range(len(self.channels)): + channel_state = self.construct_state(processed_channel_data[channel]) + # freq = self.channels[channel] + # if freq > (self.jammed_freq + self.jammer_bandwidth/2) or freq < (self.jammed_freq - self.jammer_bandwidth/2): + # inf_data = self.get_interference_data(channel) + # channel_state = self.construct_state(inf_data) + # else: + # channel_state = self.construct_state(processed_channel_data[channel]) + state[channel, :] = channel_state + return state + + def construct_state(self, processed_channel_data): + df_channel = pd.DataFrame(processed_channel_data, columns=self.features) + df_channel[df_channel['snr'] < -100] = np.NaN + df_channel[df_channel['snr'] > 100] = np.NaN + df_channel.fillna(df_channel['snr'].mean(), inplace=True) + state = df_channel['snr'].mean() + return state + + def get_interference_data(self, channel): + jammer = 'none' + inf_spectral_data = load_spectral_scans(jammer, self.jammed_freq, self.jammer_dist, self.jamming_power, + self.channels, self.n_features) + data_index = np.where(np.array(self.channels) == np.array(self.channels)[channel]) + inf_channel_data = inf_spectral_data[data_index[0][0]] + inf_data = process_data(inf_channel_data, self.channels, self.length, self.stride, self.time_step, 1) + return inf_data + + def get_reward(self, action): + # Penalize agent for switching channel if the channel is not jammed + return self.reward_RF * (1 - self.csc) if action != self.previous_action else self.reward_RF + + def get_score_to_win(self): + return self.reward_threshold diff --git a/environments/__pycache__/RF_spectrum.cpython-310.pyc b/environments/__pycache__/RF_spectrum.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..670378a3bd0eb94173e9af970efd63852f9ea7cd Binary files /dev/null 
and b/environments/__pycache__/RF_spectrum.cpython-310.pyc differ diff --git a/environments/__pycache__/RF_spectrum.cpython-39.pyc b/environments/__pycache__/RF_spectrum.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..671d17316b8cca86d8e9e8962da7c41e4cf26dd8 Binary files /dev/null and b/environments/__pycache__/RF_spectrum.cpython-39.pyc differ diff --git a/exploration_strategies/Base_Exploration_Strategy.py b/exploration_strategies/Base_Exploration_Strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..a8041d3ba21c327480b26808956f0358a93e7b3d --- /dev/null +++ b/exploration_strategies/Base_Exploration_Strategy.py @@ -0,0 +1,19 @@
+
+
+class Base_Exploration_Strategy(object):
+    """Base abstract class for agent exploration strategies. Every exploration strategy must inherit from this class
+    and implement the methods perturb_action_for_exploration_purposes and add_exploration_rewards"""
+    def __init__(self, config):
+        self.config = config
+
+    def perturb_action_for_exploration_purposes(self, action_info):
+        """Perturbs the action of the agent to encourage exploration"""
+        raise ValueError("Must be implemented")
+
+    def add_exploration_rewards(self, reward_info):
+        """Adds intrinsic rewards to encourage exploration"""
+        raise ValueError("Must be implemented")
+
+    def reset(self):
+        """Resets the noise process"""
+        raise ValueError("Must be implemented")
\ No newline at end of file
diff --git a/exploration_strategies/Epsilon_Greedy_Exploration.py b/exploration_strategies/Epsilon_Greedy_Exploration.py new file mode 100644 index 0000000000000000000000000000000000000000..98316c5e29a1be51fa370bd7a34da301a6e5f667 --- /dev/null +++ b/exploration_strategies/Epsilon_Greedy_Exploration.py @@ -0,0 +1,68 @@
+from exploration_strategies.Base_Exploration_Strategy import Base_Exploration_Strategy
+import numpy as np
+import random
+import torch
+
+class Epsilon_Greedy_Exploration(Base_Exploration_Strategy):
+    """Implements an epsilon greedy exploration strategy"""
+    def __init__(self, config):
+        super().__init__(config)
+        self.notified_that_exploration_turned_off = False
+        if "exploration_cycle_episodes_length" in self.config.hyperparameters.keys():
+            print("Using a cyclical exploration strategy")
+            self.exploration_cycle_episodes_length = self.config.hyperparameters["exploration_cycle_episodes_length"]
+        else:
+            self.exploration_cycle_episodes_length = None
+
+        if "random_episodes_to_run" in self.config.hyperparameters.keys():
+            self.random_episodes_to_run = self.config.hyperparameters["random_episodes_to_run"]
+            print("Running {} random episodes".format(self.random_episodes_to_run))
+        else:
+            self.random_episodes_to_run = 0
+
+    def perturb_action_for_exploration_purposes(self, action_info):
+        """Perturbs the action of the agent to encourage exploration"""
+        action_values = action_info["action_values"]
+        turn_off_exploration = action_info["turn_off_exploration"]
+        episode_number = action_info["episode_number"]
+        if turn_off_exploration and not self.notified_that_exploration_turned_off:
+            print(" ")
+            print("Exploration has been turned OFF")
+            print(" ")
+            self.notified_that_exploration_turned_off = True
+        epsilon = self.get_updated_epsilon_exploration(action_info)
+
+
+        if (random.random() > epsilon or turn_off_exploration) and (episode_number >= self.random_episodes_to_run):
+            return torch.argmax(action_values).item()
+        return np.random.randint(0, action_values.shape[1])
+
+    def get_updated_epsilon_exploration(self, action_info, epsilon=1.0):
+        """Gets the probability that we just pick a random action. This probability decays the more episodes we have seen"""
+        episode_number = action_info["episode_number"]
+        epsilon_decay_denominator = self.config.hyperparameters["epsilon_decay_rate_denominator"]
+
+        if self.exploration_cycle_episodes_length is None:
+            epsilon = epsilon / (1.0 + (episode_number / epsilon_decay_denominator))
+        else:
+            epsilon = self.calculate_epsilon_with_cyclical_strategy(episode_number)
+        return epsilon
+
+    def calculate_epsilon_with_cyclical_strategy(self, episode_number):
+        """Calculates epsilon according to a cyclical strategy"""
+        max_epsilon = 0.5
+        min_epsilon = 0.001
+        increment = (max_epsilon - min_epsilon) / float(self.exploration_cycle_episodes_length / 2)
+        cycle = [ix for ix in range(int(self.exploration_cycle_episodes_length / 2))] + [ix for ix in range(
+            int(self.exploration_cycle_episodes_length / 2), 0, -1)]
+        cycle_ix = episode_number % self.exploration_cycle_episodes_length
+        epsilon = max_epsilon - cycle[cycle_ix] * increment
+        return epsilon
+
+    def add_exploration_rewards(self, reward_info):
+        """Adds intrinsic rewards to encourage exploration"""
+        return reward_info["reward"]
+
+    def reset(self):
+        """Resets the noise process"""
+        pass
diff --git a/exploration_strategies/Gaussian_Exploration.py b/exploration_strategies/Gaussian_Exploration.py new file mode 100644 index 0000000000000000000000000000000000000000..c8186ee977232c227f941866ea2e8efff93e973c --- /dev/null +++ b/exploration_strategies/Gaussian_Exploration.py @@ -0,0 +1,32 @@
+from exploration_strategies.Base_Exploration_Strategy import Base_Exploration_Strategy
+import torch
+from torch.distributions.normal import Normal
+
+class Gaussian_Exploration(Base_Exploration_Strategy):
+
+    """Gaussian noise exploration strategy"""
+    def __init__(self, config):
+        super().__init__(config)
+        self.action_noise_std = self.config.hyperparameters["action_noise_std"]
+        self.action_noise_distribution = Normal(torch.Tensor([0.0]), torch.Tensor([self.action_noise_std]))
+        self.action_noise_clipping_range = self.config.hyperparameters["action_noise_clipping_range"]
+
+
+    def perturb_action_for_exploration_purposes(self, action_info):
+        """Perturbs the action of the agent to encourage exploration"""
+        action = action_info["action"]
+        action_noise = self.action_noise_distribution.sample(sample_shape=action.shape)
+        action_noise = action_noise.squeeze(-1)
+        clipped_action_noise = torch.clamp(action_noise, min=-self.action_noise_clipping_range,
+                                           max=self.action_noise_clipping_range)
+        action += clipped_action_noise
+        return action
+
+    def add_exploration_rewards(self, reward_info):
+        """Adds intrinsic rewards to encourage exploration"""
+        raise ValueError("Must be implemented")
+
+    def reset(self):
+        """Resets the noise process"""
+        pass
+
diff --git a/exploration_strategies/OU_Noise_Exploration.py b/exploration_strategies/OU_Noise_Exploration.py new file mode 100644 index 0000000000000000000000000000000000000000..26a55b00f4614fb087be1f16ec6f2cb10c1850a1 --- /dev/null +++ b/exploration_strategies/OU_Noise_Exploration.py @@ -0,0 +1,23 @@
+from utilities.OU_Noise import OU_Noise
+from exploration_strategies.Base_Exploration_Strategy import Base_Exploration_Strategy
+
+class OU_Noise_Exploration(Base_Exploration_Strategy):
+    """Ornstein-Uhlenbeck noise process exploration strategy"""
+    def __init__(self, config):
+        super().__init__(config)
+        self.noise = OU_Noise(self.config.action_size, self.config.seed, self.config.hyperparameters["mu"],
+                              self.config.hyperparameters["theta"], self.config.hyperparameters["sigma"])
+
+    def perturb_action_for_exploration_purposes(self, action_info):
+        """Perturbs the action of the agent to encourage exploration"""
+        action = action_info["action"]
+        action += self.noise.sample()
+        return action
+
+    def add_exploration_rewards(self, reward_info):
+        """Adds intrinsic rewards to encourage exploration"""
+        raise ValueError("Must be implemented")
+
+    def reset(self):
+        """Resets the noise process"""
+        self.noise.reset()
\ No newline at end of file
diff --git a/exploration_strategies/__init__.py b/exploration_strategies/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/exploration_strategies/__pycache__/Base_Exploration_Strategy.cpython-310.pyc b/exploration_strategies/__pycache__/Base_Exploration_Strategy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e9d2bfb1323b86b747d0814471b915fafafb1e0 Binary files /dev/null and b/exploration_strategies/__pycache__/Base_Exploration_Strategy.cpython-310.pyc differ diff --git a/exploration_strategies/__pycache__/Base_Exploration_Strategy.cpython-39.pyc b/exploration_strategies/__pycache__/Base_Exploration_Strategy.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8613abcbe4546e9463ddddcf857b5b34b24018d8 Binary files /dev/null and b/exploration_strategies/__pycache__/Base_Exploration_Strategy.cpython-39.pyc differ diff --git a/exploration_strategies/__pycache__/Epsilon_Greedy_Exploration.cpython-310.pyc b/exploration_strategies/__pycache__/Epsilon_Greedy_Exploration.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ee44f457938953cce75c02bc6c9dfaec37751a2 Binary files /dev/null and b/exploration_strategies/__pycache__/Epsilon_Greedy_Exploration.cpython-310.pyc differ diff --git a/exploration_strategies/__pycache__/Epsilon_Greedy_Exploration.cpython-39.pyc b/exploration_strategies/__pycache__/Epsilon_Greedy_Exploration.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9f381ade691b9b2418d837cf61bb464268533b7 Binary files /dev/null and b/exploration_strategies/__pycache__/Epsilon_Greedy_Exploration.cpython-39.pyc differ diff --git a/exploration_strategies/__pycache__/OU_Noise_Exploration.cpython-39.pyc b/exploration_strategies/__pycache__/OU_Noise_Exploration.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b3f24d3a6d6a1fcd2119149ca8eaa48751682ec Binary files /dev/null and b/exploration_strategies/__pycache__/OU_Noise_Exploration.cpython-39.pyc differ diff --git a/exploration_strategies/__pycache__/__init__.cpython-310.pyc b/exploration_strategies/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af6ed67120230fe80b24de58e23fb29be09a1263 Binary files /dev/null and b/exploration_strategies/__pycache__/__init__.cpython-310.pyc differ diff --git a/exploration_strategies/__pycache__/__init__.cpython-39.pyc b/exploration_strategies/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..942cbb86cccd1f678dd2714e9c69dd15b3700998 Binary files /dev/null and b/exploration_strategies/__pycache__/__init__.cpython-39.pyc differ diff --git a/models/exp_0.05_constant/DDQN with Prioritised Replay_network.pt b/models/exp_0.05_constant/DDQN with Prioritised Replay_network.pt new file
mode 100644 index 0000000000000000000000000000000000000000..c5791d022a3a7a479b1e57554023955518dabef3 Binary files /dev/null and b/models/exp_0.05_constant/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.05_constant/DDQN_network.pt b/models/exp_0.05_constant/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..143c66bb3f788f8ef01d050b4105c1c23649024b Binary files /dev/null and b/models/exp_0.05_constant/DDQN_network.pt differ diff --git a/models/exp_0.05_constant/DQN with Fixed Q Targets_network.pt b/models/exp_0.05_constant/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..38603d33efec057ed32ea7c4c10a556c03e92548 Binary files /dev/null and b/models/exp_0.05_constant/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.05_constant/DQN_network.pt b/models/exp_0.05_constant/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..faeac15a06eba5e8d7cfbf242c7f0116176a84cc Binary files /dev/null and b/models/exp_0.05_constant/DQN_network.pt differ diff --git a/models/exp_0.05_constant/Dueling DDQN_network.pt b/models/exp_0.05_constant/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef8692ce10eaa4d966f9a43829397fd1b250c641 Binary files /dev/null and b/models/exp_0.05_constant/Dueling DDQN_network.pt differ diff --git a/models/exp_0.05_dynamic/DDQN with Prioritised Replay_network.pt b/models/exp_0.05_dynamic/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..731b2842d6699b5eb8bbce52e58de8a3a29262e5 Binary files /dev/null and b/models/exp_0.05_dynamic/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.05_dynamic/DDQN_network.pt b/models/exp_0.05_dynamic/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e276b321dcf0adec24da1a620c22355b3d14be2 Binary files /dev/null and b/models/exp_0.05_dynamic/DDQN_network.pt differ diff --git a/models/exp_0.05_dynamic/DQN with Fixed Q Targets_network.pt b/models/exp_0.05_dynamic/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9f5644ddb38054b9e3125fd82b8fa3021c50691 Binary files /dev/null and b/models/exp_0.05_dynamic/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.05_dynamic/DQN_network.pt b/models/exp_0.05_dynamic/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd641301cb0f440999d7da19f8daedadf963b64f Binary files /dev/null and b/models/exp_0.05_dynamic/DQN_network.pt differ diff --git a/models/exp_0.05_dynamic/Dueling DDQN_network.pt b/models/exp_0.05_dynamic/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a7ca55d2c7ad74654d0e0e14e6a041444fdae61 Binary files /dev/null and b/models/exp_0.05_dynamic/Dueling DDQN_network.pt differ diff --git a/models/exp_0.05_random/DDQN with Prioritised Replay_network.pt b/models/exp_0.05_random/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ac643430123bdf9b50c3b0012fbdfc7104bc17d Binary files /dev/null and b/models/exp_0.05_random/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.05_random/DDQN_network.pt b/models/exp_0.05_random/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd54b168c15800c22d1f983e58660ba5876af9f1 Binary files /dev/null 
and b/models/exp_0.05_random/DDQN_network.pt differ diff --git a/models/exp_0.05_random/DQN with Fixed Q Targets_network.pt b/models/exp_0.05_random/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..43a0ad624fe71128413ef205b1b783d2c480385d Binary files /dev/null and b/models/exp_0.05_random/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.05_random/DQN_network.pt b/models/exp_0.05_random/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..4572f6064cb8b2455acf81644adfeceb98911084 Binary files /dev/null and b/models/exp_0.05_random/DQN_network.pt differ diff --git a/models/exp_0.05_random/Dueling DDQN_network.pt b/models/exp_0.05_random/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6efbdb53aa98433377cc8a2a46fdff057907e62 Binary files /dev/null and b/models/exp_0.05_random/Dueling DDQN_network.pt differ diff --git a/models/exp_0.05_sweeping/DDQN with Prioritised Replay_network.pt b/models/exp_0.05_sweeping/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..570dfe0c23323991218be4f1aeafab13e5caaa20 Binary files /dev/null and b/models/exp_0.05_sweeping/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.05_sweeping/DDQN_network.pt b/models/exp_0.05_sweeping/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6676582b228686347464fc7f48caf85971e9c83 Binary files /dev/null and b/models/exp_0.05_sweeping/DDQN_network.pt differ diff --git a/models/exp_0.05_sweeping/DQN with Fixed Q Targets_network.pt b/models/exp_0.05_sweeping/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5297e7f477564d287a768021b0d457605c7308a Binary files /dev/null and b/models/exp_0.05_sweeping/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.05_sweeping/DQN_network.pt b/models/exp_0.05_sweeping/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c91c9658acf63b92a1613c011355bdab204d825 Binary files /dev/null and b/models/exp_0.05_sweeping/DQN_network.pt differ diff --git a/models/exp_0.05_sweeping/Dueling DDQN_network.pt b/models/exp_0.05_sweeping/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b9d6711aab8f88d892c4bf068f9d3478f8dc936 Binary files /dev/null and b/models/exp_0.05_sweeping/Dueling DDQN_network.pt differ diff --git a/models/exp_0.0_constant/DDQN with Prioritised Replay_network.pt b/models/exp_0.0_constant/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..763d4e95f79d4004d8c7530a6f0a73d712fea102 Binary files /dev/null and b/models/exp_0.0_constant/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.0_constant/DDQN_network.pt b/models/exp_0.0_constant/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce1bfa294a0531386cde63554e0d1f844f0b75f4 Binary files /dev/null and b/models/exp_0.0_constant/DDQN_network.pt differ diff --git a/models/exp_0.0_constant/DQN with Fixed Q Targets_network.pt b/models/exp_0.0_constant/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..968b2c53218bb76d44f4f88b52374533799fb545 Binary files /dev/null and b/models/exp_0.0_constant/DQN with Fixed Q Targets_network.pt differ diff --git 
a/models/exp_0.0_constant/DQN_network.pt b/models/exp_0.0_constant/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe37cd04e9bccbfbdf1eb040d414b3e31690926b Binary files /dev/null and b/models/exp_0.0_constant/DQN_network.pt differ diff --git a/models/exp_0.0_constant/Dueling DDQN_network.pt b/models/exp_0.0_constant/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..2de0b1bec5f5d43da69a657cfb29de391e40f713 Binary files /dev/null and b/models/exp_0.0_constant/Dueling DDQN_network.pt differ diff --git a/models/exp_0.0_dynamic/DDQN with Prioritised Replay_network.pt b/models/exp_0.0_dynamic/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..04737609e350203f5bd01eef6f0fe2c0076cc760 Binary files /dev/null and b/models/exp_0.0_dynamic/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.0_dynamic/DDQN_network.pt b/models/exp_0.0_dynamic/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..89aec13794edcea2cc893e54e9e5d81e908948fd Binary files /dev/null and b/models/exp_0.0_dynamic/DDQN_network.pt differ diff --git a/models/exp_0.0_dynamic/DQN with Fixed Q Targets_network.pt b/models/exp_0.0_dynamic/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f7b29869dd2bb1dc4a6e2d0dbb94b371c12bc5e Binary files /dev/null and b/models/exp_0.0_dynamic/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.0_dynamic/DQN_network.pt b/models/exp_0.0_dynamic/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..af61a14b96dbd4b24c26cddf0eb89c784cfaa9c4 Binary files /dev/null and b/models/exp_0.0_dynamic/DQN_network.pt differ diff --git a/models/exp_0.0_dynamic/Dueling DDQN_network.pt b/models/exp_0.0_dynamic/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..af5df4010ae89a8b7f027d831fc034cc82d6e1b7 Binary files /dev/null and b/models/exp_0.0_dynamic/Dueling DDQN_network.pt differ diff --git a/models/exp_0.0_random/DDQN with Prioritised Replay_network.pt b/models/exp_0.0_random/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..c40c9a105602d98cb24e51685cbf30eccb5c6702 Binary files /dev/null and b/models/exp_0.0_random/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.0_random/DDQN_network.pt b/models/exp_0.0_random/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcef471246b7748f00c2ff1f6be58db67e19294d Binary files /dev/null and b/models/exp_0.0_random/DDQN_network.pt differ diff --git a/models/exp_0.0_random/DQN with Fixed Q Targets_network.pt b/models/exp_0.0_random/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f4ab2944e305360774be745f6c6e9f939ae9cc2 Binary files /dev/null and b/models/exp_0.0_random/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.0_random/DQN_network.pt b/models/exp_0.0_random/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..c703a27cfabd1e8e4bb1c0f77149ac4c99aa5ef0 Binary files /dev/null and b/models/exp_0.0_random/DQN_network.pt differ diff --git a/models/exp_0.0_random/Dueling DDQN_network.pt b/models/exp_0.0_random/Dueling DDQN_network.pt new file mode 100644 index 
0000000000000000000000000000000000000000..c0487b5ec8b2eb3de9dbf98fe1ed86547cc4c892 Binary files /dev/null and b/models/exp_0.0_random/Dueling DDQN_network.pt differ diff --git a/models/exp_0.0_sweeping/DDQN with Prioritised Replay_network.pt b/models/exp_0.0_sweeping/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b0252b5e0515cff87fdf9b00d3107fea517a47c Binary files /dev/null and b/models/exp_0.0_sweeping/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.0_sweeping/DDQN_network.pt b/models/exp_0.0_sweeping/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..b25a001888e05d367c583191511e53bf979c80ae Binary files /dev/null and b/models/exp_0.0_sweeping/DDQN_network.pt differ diff --git a/models/exp_0.0_sweeping/DQN with Fixed Q Targets_network.pt b/models/exp_0.0_sweeping/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..71ab1bc42bd3cf2a55b2ec60d9536f8560ee202b Binary files /dev/null and b/models/exp_0.0_sweeping/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.0_sweeping/DQN_network.pt b/models/exp_0.0_sweeping/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..5129875a03ff95def3a01e5d278ecda7034c388a Binary files /dev/null and b/models/exp_0.0_sweeping/DQN_network.pt differ diff --git a/models/exp_0.0_sweeping/Dueling DDQN_network.pt b/models/exp_0.0_sweeping/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..35835ff7b2ed88d4a0ac0b2cc3768cf4826302ed Binary files /dev/null and b/models/exp_0.0_sweeping/Dueling DDQN_network.pt differ diff --git a/models/exp_0.15_constant/DDQN with Prioritised Replay_network.pt b/models/exp_0.15_constant/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9683f7b09fca2320f2ff66ec9fde1f786ae5cac Binary files /dev/null and b/models/exp_0.15_constant/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.15_constant/DDQN_network.pt b/models/exp_0.15_constant/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed393e9ff6982d5d34a0102d05c4141fe239c649 Binary files /dev/null and b/models/exp_0.15_constant/DDQN_network.pt differ diff --git a/models/exp_0.15_constant/DQN with Fixed Q Targets_network.pt b/models/exp_0.15_constant/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..250b35c1e300ed44b25e876b3fd8bddc209954a2 Binary files /dev/null and b/models/exp_0.15_constant/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.15_constant/DQN_network.pt b/models/exp_0.15_constant/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..a07171da0acd5373a23987356a678d9207de4d40 Binary files /dev/null and b/models/exp_0.15_constant/DQN_network.pt differ diff --git a/models/exp_0.15_constant/Dueling DDQN_network.pt b/models/exp_0.15_constant/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fbec003d58989bb562b7c0b74a876f8a668ea25 Binary files /dev/null and b/models/exp_0.15_constant/Dueling DDQN_network.pt differ diff --git a/models/exp_0.15_dynamic/DDQN with Prioritised Replay_network.pt b/models/exp_0.15_dynamic/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..d728047388aa8679b36f247982d5877205152e85 
Binary files /dev/null and b/models/exp_0.15_dynamic/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.15_dynamic/DDQN_network.pt b/models/exp_0.15_dynamic/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..e98582684d695dfbd64939f8146d2ee635945e8c Binary files /dev/null and b/models/exp_0.15_dynamic/DDQN_network.pt differ diff --git a/models/exp_0.15_dynamic/DQN with Fixed Q Targets_network.pt b/models/exp_0.15_dynamic/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..93d5909ec3db118e145c137119d67ad18d1fa941 Binary files /dev/null and b/models/exp_0.15_dynamic/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.15_dynamic/DQN_network.pt b/models/exp_0.15_dynamic/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c42e5516fb2fb31e505e11f983b64f603479b14 Binary files /dev/null and b/models/exp_0.15_dynamic/DQN_network.pt differ diff --git a/models/exp_0.15_dynamic/Dueling DDQN_network.pt b/models/exp_0.15_dynamic/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..639516a932ca8d2b57542557f52ba71ae284b56c Binary files /dev/null and b/models/exp_0.15_dynamic/Dueling DDQN_network.pt differ diff --git a/models/exp_0.15_random/DDQN with Prioritised Replay_network.pt b/models/exp_0.15_random/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..af399b7ca246b2f0f3c8f3b6a3e4f3b53816cbe6 Binary files /dev/null and b/models/exp_0.15_random/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.15_random/DDQN_network.pt b/models/exp_0.15_random/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..7fc467b32cc772578fcc5c24f4e1c9ad099fe1c3 Binary files /dev/null and b/models/exp_0.15_random/DDQN_network.pt differ diff --git a/models/exp_0.15_random/DQN with Fixed Q Targets_network.pt b/models/exp_0.15_random/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd4efb06d4f4c7d4a4eea3bf3645b2b22ac43368 Binary files /dev/null and b/models/exp_0.15_random/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.15_random/DQN_network.pt b/models/exp_0.15_random/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea6c206e33a87db6a3084640a9ca98815342d02b Binary files /dev/null and b/models/exp_0.15_random/DQN_network.pt differ diff --git a/models/exp_0.15_random/Dueling DDQN_network.pt b/models/exp_0.15_random/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa955af86f1f2e5f252caf574d9765851a0d189e Binary files /dev/null and b/models/exp_0.15_random/Dueling DDQN_network.pt differ diff --git a/models/exp_0.15_sweeping/DDQN with Prioritised Replay_network.pt b/models/exp_0.15_sweeping/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..26bc2349f91939049c12e0061e04be5d3cd8b8bc Binary files /dev/null and b/models/exp_0.15_sweeping/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.15_sweeping/DDQN_network.pt b/models/exp_0.15_sweeping/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..0906d0863a876079986e202eae90fdd9d2ebc97d Binary files /dev/null and b/models/exp_0.15_sweeping/DDQN_network.pt differ diff --git a/models/exp_0.15_sweeping/DQN with Fixed Q 
Targets_network.pt b/models/exp_0.15_sweeping/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae5a0b7aa5b5255ae6857dbc25c045139cd332c9 Binary files /dev/null and b/models/exp_0.15_sweeping/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.15_sweeping/DQN_network.pt b/models/exp_0.15_sweeping/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..97f551e043609030538617adb940b4767d2fa6a8 Binary files /dev/null and b/models/exp_0.15_sweeping/DQN_network.pt differ diff --git a/models/exp_0.15_sweeping/Dueling DDQN_network.pt b/models/exp_0.15_sweeping/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2399da1ee4ff8b6b2a0f70589c5ac7a94e6a666 Binary files /dev/null and b/models/exp_0.15_sweeping/Dueling DDQN_network.pt differ diff --git a/models/exp_0.1_constant/DDQN with Prioritised Replay_network.pt b/models/exp_0.1_constant/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9faac06101c67cfa0f9546f8ba8cfdfc31c0013 Binary files /dev/null and b/models/exp_0.1_constant/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.1_constant/DDQN_network.pt b/models/exp_0.1_constant/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..82a632fe6c7993b30fbb64e6db54e97f8a87b748 Binary files /dev/null and b/models/exp_0.1_constant/DDQN_network.pt differ diff --git a/models/exp_0.1_constant/DQN with Fixed Q Targets_network.pt b/models/exp_0.1_constant/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c9befb524b2ff1247a0c7db2a4ce704978d6e6b Binary files /dev/null and b/models/exp_0.1_constant/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.1_constant/DQN_network.pt b/models/exp_0.1_constant/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..f401ca15697c3cee46534d2545b68a372a7ecc0c Binary files /dev/null and b/models/exp_0.1_constant/DQN_network.pt differ diff --git a/models/exp_0.1_constant/Dueling DDQN_network.pt b/models/exp_0.1_constant/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..04556260ad707135145e8ddbd4cfc4c5c59f0297 Binary files /dev/null and b/models/exp_0.1_constant/Dueling DDQN_network.pt differ diff --git a/models/exp_0.1_dynamic/DDQN with Prioritised Replay_network.pt b/models/exp_0.1_dynamic/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb7177088340e1f71d632d4aae6564c6f44656bd Binary files /dev/null and b/models/exp_0.1_dynamic/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.1_dynamic/DDQN_network.pt b/models/exp_0.1_dynamic/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..600f530aa254f7a6bfc4ee1205fdf057462c5fbb Binary files /dev/null and b/models/exp_0.1_dynamic/DDQN_network.pt differ diff --git a/models/exp_0.1_dynamic/DQN with Fixed Q Targets_network.pt b/models/exp_0.1_dynamic/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc1773e7975e95c05b161a46b1342abf74f14397 Binary files /dev/null and b/models/exp_0.1_dynamic/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.1_dynamic/DQN_network.pt b/models/exp_0.1_dynamic/DQN_network.pt new file mode 100644 index 
0000000000000000000000000000000000000000..ad20a5d5dd07f85518990c3a7bde531f2eaee2a9 Binary files /dev/null and b/models/exp_0.1_dynamic/DQN_network.pt differ diff --git a/models/exp_0.1_dynamic/Dueling DDQN_network.pt b/models/exp_0.1_dynamic/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c98c01250b69017788ea6143d2742a376a0ab61 Binary files /dev/null and b/models/exp_0.1_dynamic/Dueling DDQN_network.pt differ diff --git a/models/exp_0.1_random/DDQN with Prioritised Replay_network.pt b/models/exp_0.1_random/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d9d90a143c1f9b5ae8690da22380d7ca1443630 Binary files /dev/null and b/models/exp_0.1_random/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.1_random/DDQN_network.pt b/models/exp_0.1_random/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..079c6aa62c62a8bdcec9a421b1fdf0e3776397c4 Binary files /dev/null and b/models/exp_0.1_random/DDQN_network.pt differ diff --git a/models/exp_0.1_random/DQN with Fixed Q Targets_network.pt b/models/exp_0.1_random/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..270eb746817abc3575cfe0b665de46f08841ba8e Binary files /dev/null and b/models/exp_0.1_random/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.1_random/DQN_network.pt b/models/exp_0.1_random/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..f207d0f6bb3267afab7b9789487186679985a7c8 Binary files /dev/null and b/models/exp_0.1_random/DQN_network.pt differ diff --git a/models/exp_0.1_random/Dueling DDQN_network.pt b/models/exp_0.1_random/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..241fe3f255225c8a730dd916d4f62468edff66d5 Binary files /dev/null and b/models/exp_0.1_random/Dueling DDQN_network.pt differ diff --git a/models/exp_0.1_sweeping/DDQN with Prioritised Replay_network.pt b/models/exp_0.1_sweeping/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ccaeb9d661e1af5f639bca61ad280b9c6445917 Binary files /dev/null and b/models/exp_0.1_sweeping/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.1_sweeping/DDQN_network.pt b/models/exp_0.1_sweeping/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..23f8592ababca7bab5ce68f4667a84473c5c48c5 Binary files /dev/null and b/models/exp_0.1_sweeping/DDQN_network.pt differ diff --git a/models/exp_0.1_sweeping/DQN with Fixed Q Targets_network.pt b/models/exp_0.1_sweeping/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..43fdcf2d14bc9f50c7b01ac8297da8482c886174 Binary files /dev/null and b/models/exp_0.1_sweeping/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.1_sweeping/DQN_network.pt b/models/exp_0.1_sweeping/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..462a9908cef65a44217972c6d92cad37e82ced79 Binary files /dev/null and b/models/exp_0.1_sweeping/DQN_network.pt differ diff --git a/models/exp_0.1_sweeping/Dueling DDQN_network.pt b/models/exp_0.1_sweeping/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d45e531baca28264525fc5d8fca4e19e6537ecb Binary files /dev/null and b/models/exp_0.1_sweeping/Dueling DDQN_network.pt differ diff 
--git a/models/exp_0.2_constant/DDQN with Prioritised Replay_network.pt b/models/exp_0.2_constant/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c7905af3526a4de444c544a614958c554fa2fd9 Binary files /dev/null and b/models/exp_0.2_constant/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.2_constant/DDQN_network.pt b/models/exp_0.2_constant/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..05c6efae8c18adbc9c5461845ff2f7dc1b449ccb Binary files /dev/null and b/models/exp_0.2_constant/DDQN_network.pt differ diff --git a/models/exp_0.2_constant/DQN with Fixed Q Targets_network.pt b/models/exp_0.2_constant/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b86b588645b0e862e046602ef296517cc996ffa Binary files /dev/null and b/models/exp_0.2_constant/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.2_constant/DQN_network.pt b/models/exp_0.2_constant/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bfb6cb176f489b6de0869ad95a64c6a4779b385 Binary files /dev/null and b/models/exp_0.2_constant/DQN_network.pt differ diff --git a/models/exp_0.2_constant/Dueling DDQN_network.pt b/models/exp_0.2_constant/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..acfd3227b3348d9dd061af44a5d0627d00b3fdb7 Binary files /dev/null and b/models/exp_0.2_constant/Dueling DDQN_network.pt differ diff --git a/models/exp_0.2_dynamic/DDQN with Prioritised Replay_network.pt b/models/exp_0.2_dynamic/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bb0f557a69717fcd7c7d45f7de86aae09d2276f Binary files /dev/null and b/models/exp_0.2_dynamic/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.2_dynamic/DDQN_network.pt b/models/exp_0.2_dynamic/DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad8c97bf839345a527ed170ab765f083cae38235 Binary files /dev/null and b/models/exp_0.2_dynamic/DDQN_network.pt differ diff --git a/models/exp_0.2_dynamic/DQN with Fixed Q Targets_network.pt b/models/exp_0.2_dynamic/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..974afee2fe1c02c5ecb5c866e06d668432b08307 Binary files /dev/null and b/models/exp_0.2_dynamic/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.2_dynamic/DQN_network.pt b/models/exp_0.2_dynamic/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..87b1096646e03259be42ea054a5ca1c5f5161888 Binary files /dev/null and b/models/exp_0.2_dynamic/DQN_network.pt differ diff --git a/models/exp_0.2_dynamic/Dueling DDQN_network.pt b/models/exp_0.2_dynamic/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..36ae72887a65339760505fefecc2425ee370c187 Binary files /dev/null and b/models/exp_0.2_dynamic/Dueling DDQN_network.pt differ diff --git a/models/exp_0.2_sweeping/DDQN with Prioritised Replay_network.pt b/models/exp_0.2_sweeping/DDQN with Prioritised Replay_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..5efbed791fccf46818a1e172f0fa2f0e8b4fcd77 Binary files /dev/null and b/models/exp_0.2_sweeping/DDQN with Prioritised Replay_network.pt differ diff --git a/models/exp_0.2_sweeping/DDQN_network.pt b/models/exp_0.2_sweeping/DDQN_network.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..c11f1809da20126e9f76c859b0b07c5319c73dcf Binary files /dev/null and b/models/exp_0.2_sweeping/DDQN_network.pt differ diff --git a/models/exp_0.2_sweeping/DQN with Fixed Q Targets_network.pt b/models/exp_0.2_sweeping/DQN with Fixed Q Targets_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ad8152fe972d41cd1566189a861dc10bdeab74f Binary files /dev/null and b/models/exp_0.2_sweeping/DQN with Fixed Q Targets_network.pt differ diff --git a/models/exp_0.2_sweeping/DQN_network.pt b/models/exp_0.2_sweeping/DQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..b80f836ab15caabca99840496286690e0428ff4e Binary files /dev/null and b/models/exp_0.2_sweeping/DQN_network.pt differ diff --git a/models/exp_0.2_sweeping/Dueling DDQN_network.pt b/models/exp_0.2_sweeping/Dueling DDQN_network.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab163897b318a3bd7eb82253eca63a07c5de6064 Binary files /dev/null and b/models/exp_0.2_sweeping/Dueling DDQN_network.pt differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..86fd5849e158e178bc94f5b3fb0dd79189b06e56 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@
+numpy==1.15.2
+torch==1.12.0
+matplotlib==3.0.0
+PyVirtualDisplay==0.2.1
+gym==0.10.9
+nn_builder
+streamlit
+pandas
diff --git a/results/Anti_Jam.py b/results/Anti_Jam.py new file mode 100644 index 0000000000000000000000000000000000000000..438ab0210caa0c1768fd7f2678f128d64fc5ebfd --- /dev/null +++ b/results/Anti_Jam.py @@ -0,0 +1,174 @@
+import os
+import sys
+from os.path import dirname, abspath
+from pathlib import Path
+
+sys.path.append(dirname(dirname(abspath(__file__))))
+
+import warnings
+
+warnings.filterwarnings("ignore", category=UserWarning)
+
+import numpy as np
+from utilities.data_structures.Config import Config
+from agents.DQN_agents.DQN import DQN
+from agents.DQN_agents.DQN_With_Fixed_Q_Targets import DQN_With_Fixed_Q_Targets
+from agents.DQN_agents.DDQN import DDQN
+from agents.DQN_agents.DDQN_With_Prioritised_Experience_Replay import DDQN_With_Prioritised_Experience_Replay
+from agents.DQN_agents.Dueling_DDQN import Dueling_DDQN
+from agents.Trainer import Trainer
+from environments.RF_spectrum import RfEnvironment
+
+config = Config()
+config.seed = 1
+# Setup environment
+jammers = ['constant']
+jammer = 'dynamic' if len(jammers) > 1 else jammers[0]
+env_settings = sys.argv
+jammer_distance = 20  # cm
+jamming_power = 10  # dBm
+all_channels = [list(range(5180, 5340, 20))]
+all_channels = [freq for band in range(len(all_channels)) for freq in all_channels[band]]
+length = 32  # int(sample_rate * sensing_window)
+stride = 1
+csc = 0.2
+env = RfEnvironment(jammers, jammer_distance, jamming_power, csc, all_channels, length, stride)
+ob_space = env.features
+ac_space = env.action_space
+a_size = len(all_channels)
+s_size = env.observation_size
+config.environment = env
+
+config.base_dir = os.path.abspath('')
+config.models_dir = f'{config.base_dir}/models/exp_{csc}_{jammer}'
+if not os.path.exists(config.models_dir):
+    os.makedirs(config.models_dir)
+config.file_to_save_data_results = f"{config.base_dir}/results/data_and_graphs/Anti_Jam_training_{csc}_{jammer}.pkl"
+config.file_to_save_results_graph = f"{config.base_dir}/results/data_and_graphs/Anti_Jam_training_{csc}_{jammer}.png"
+
+config.num_episodes_to_run = 1
+config.runs_per_agent = 3
+config.show_solution_score = False
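# ---------------------------------------------------------------------------
# Reviewer note (illustrative): the flags below control visualisation and
# result persistence, and config.hyperparameters configures every agent family
# even though only the DQN variants are trained here. With
# "epsilon_decay_rate_denominator": 1, the epsilon-greedy schedule in
# exploration_strategies/Epsilon_Greedy_Exploration.py decays as
#     epsilon = 1.0 / (1.0 + episode_number)
# i.e. 1.0 at episode 0, 0.1 at episode 9 and 0.01 at episode 99, so the short
# num_episodes_to_run used here keeps exploration high throughout a run.
# Because config.save_model is True, trained weights are expected under
# config.models_dir (see the models/exp_* files in this commit); a minimal
# reload sketch, assuming each agent saves its Q-network as a state_dict:
#     state_dict = torch.load(f"{config.models_dir}/DQN_network.pt")
# The exact save format is defined by the agent/Trainer code, so treat that
# line as an assumption rather than a documented API.
# ---------------------------------------------------------------------------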
+config.visualise_individual_results = False
+config.visualise_overall_agent_results = True
+config.standard_deviation_results = 1.0
+config.use_GPU = False
+config.overwrite_existing_results_file = True
+config.randomise_random_seed = True
+config.save_model = True
+config.training = True
+
+config.hyperparameters = {
+    "DQN_Agents": {
+        "learning_rate": 0.001,
+        "batch_size": 64,
+        "buffer_size": 100000,
+        "epsilon": 1.0,
+        "epsilon_decay_rate_denominator": 1,
+        "discount_rate": 0.99,
+        "tau": 0.01,
+        "alpha_prioritised_replay": 0.6,
+        "beta_prioritised_replay": 0.1,
+        "incremental_td_error": 1e-8,
+        "update_every_n_steps": 1,
+        "linear_hidden_units": [256, 256],
+        "final_layer_activation": "None",
+        "batch_norm": False,
+        "gradient_clipping_norm": 0.7,
+        "learning_iterations": 1,
+        "clip_rewards": False,
+        "HER_sample_proportion": 0.8,
+        "y_range": (-1, 14)
+    },
+    "Stochastic_Policy_Search_Agents": {
+        "policy_network_type": "Linear",
+        "noise_scale_start": 1e-2,
+        "noise_scale_min": 1e-3,
+        "noise_scale_max": 2.0,
+        "noise_scale_growth_factor": 2.0,
+        "stochastic_action_decision": False,
+        "num_policies": 10,
+        "episodes_per_policy": 1,
+        "num_policies_to_keep": 5,
+        "clip_rewards": False
+    },
+    "Policy_Gradient_Agents": {
+        "learning_rate": 0.05,
+        "linear_hidden_units": [256, 256],
+        "final_layer_activation": "SOFTMAX",
+        "learning_iterations_per_round": 5,
+        "discount_rate": 0.99,
+        "batch_norm": False,
+        "clip_epsilon": 0.1,
+        "episodes_per_learning_round": 4,
+        "normalise_rewards": True,
+        "gradient_clipping_norm": 7.0,
+        "mu": 0.0,  # only required for continuous action games
+        "theta": 0.0,  # only required for continuous action games
+        "sigma": 0.0,  # only required for continuous action games
+        "epsilon_decay_rate_denominator": 1.0,
+        "clip_rewards": False
+    },
+
+    "Actor_Critic_Agents": {
+
+        "learning_rate": 0.005,
+        "linear_hidden_units": [256, 256],
+        "final_layer_activation": ["SOFTMAX", None],
+        "gradient_clipping_norm": 5.0,
+        "discount_rate": 0.99,
+        "epsilon_decay_rate_denominator": 1.0,
+        "normalise_rewards": True,
+        "exploration_worker_difference": 2.0,
+        "clip_rewards": False,
+
+        "Actor": {
+            "learning_rate": 0.0003,
+            "linear_hidden_units": [256, 256],
+            "final_layer_activation": "Softmax",
+            "batch_norm": False,
+            "tau": 0.005,
+            "gradient_clipping_norm": 5,
+            "initialiser": "Xavier"
+        },
+
+        "Critic": {
+            "learning_rate": 0.0003,
+            "linear_hidden_units": [256, 256],
+            "final_layer_activation": None,
+            "batch_norm": False,
+            "buffer_size": 1000000,
+            "tau": 0.005,
+            "gradient_clipping_norm": 5,
+            "initialiser": "Xavier"
+        },
+
+        "min_steps_before_learning": 400,
+        "batch_size": 64,
+        "discount_rate": 0.99,
+        "mu": 0.0,  # for OU (Ornstein-Uhlenbeck) noise
+        "theta": 0.15,  # for OU (Ornstein-Uhlenbeck) noise
+        "sigma": 0.25,  # for OU (Ornstein-Uhlenbeck) noise
+        "action_noise_std": 0.2,  # for TD3
+        "action_noise_clipping_range": 0.5,  # for TD3
+        "update_every_n_steps": 1,
+        "learning_updates_per_learning_session": 1,
+        "automatically_tune_entropy_hyperparameter": True,
+        "entropy_term_weight": None,
+        "add_extra_noise": False,
+        "do_evaluation_iterations": True
+    }
+}
+
+if __name__ == "__main__":
+    # Training
+    AGENTS = [DQN, DQN_With_Fixed_Q_Targets, DDQN, Dueling_DDQN, DDQN_With_Prioritised_Experience_Replay]
+    trainer = Trainer(config, AGENTS)
+    trainer.run_games_for_agents()
+    # Testing
+    config.training = False
+    config.runs_per_agent = 1
+    config.file_to_save_data_results = f"{config.base_dir}/results/data_and_graphs/Anti_Jam_testing_{csc}_{jammer}.pkl"
+    config.file_to_save_results_graph = f"{config.base_dir}/results/data_and_graphs/Anti_Jam_testing_{csc}_{jammer}.png"
+    trainer = Trainer(config, AGENTS)
+    trainer.run_games_for_agents()
diff --git a/results/data_and_graphs/Anti_Jam_testing_0.05_constant.pkl b/results/data_and_graphs/Anti_Jam_testing_0.05_constant.pkl new file mode 100644 index 0000000000000000000000000000000000000000..60a9d60c252d25e930adba41111616a3c2c9313c Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.05_constant.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.05_constant.png b/results/data_and_graphs/Anti_Jam_testing_0.05_constant.png new file mode 100644 index 0000000000000000000000000000000000000000..56c43131f4763909a8982eb4b80892a2116ec6b8 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.05_constant.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.05_dynamic.pkl b/results/data_and_graphs/Anti_Jam_testing_0.05_dynamic.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3a8ea4fa3b6ddb8476873e9c169723e6b526e43d Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.05_dynamic.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.05_dynamic.png b/results/data_and_graphs/Anti_Jam_testing_0.05_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..e92227b6d56aacc21ce0fdf08191fbd48c38bd9d Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.05_dynamic.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.05_random.pkl b/results/data_and_graphs/Anti_Jam_testing_0.05_random.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ed547e5cdef53b411246c26fe8ce6d7bb956ac42 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.05_random.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.05_random.png b/results/data_and_graphs/Anti_Jam_testing_0.05_random.png new file mode 100644 index 0000000000000000000000000000000000000000..2eb54d92ce03f49ef6a7a5a030ac8666297dc2d3 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.05_random.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.05_sweeping.pkl b/results/data_and_graphs/Anti_Jam_testing_0.05_sweeping.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5514fbcab76662e66bec17d81251458d475bb7b8 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.05_sweeping.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.05_sweeping.png b/results/data_and_graphs/Anti_Jam_testing_0.05_sweeping.png new file mode 100644 index 0000000000000000000000000000000000000000..7e945858153a7b4b13ea88647dcc11d56d2e21d2 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.05_sweeping.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.0_constant.pkl b/results/data_and_graphs/Anti_Jam_testing_0.0_constant.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cebb7e58330258611379541d73bf05e1dbc0d48a Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.0_constant.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.0_constant.png b/results/data_and_graphs/Anti_Jam_testing_0.0_constant.png new file mode 100644 index 0000000000000000000000000000000000000000..fa42193f135cc33a74af4b4f9caa632e83ddf843 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.0_constant.png differ diff --git
a/results/data_and_graphs/Anti_Jam_testing_0.0_dynamic.pkl b/results/data_and_graphs/Anti_Jam_testing_0.0_dynamic.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cfea9c89122ebb23a002ab7b9c05f2c57b5fe269 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.0_dynamic.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.0_dynamic.png b/results/data_and_graphs/Anti_Jam_testing_0.0_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..1234539a5fc5d8b0bcdd4db4cfdf0bb92fc4930b Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.0_dynamic.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.0_random.pkl b/results/data_and_graphs/Anti_Jam_testing_0.0_random.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ee120ac41f349074fb29aa9207c574c28297694b Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.0_random.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.0_random.png b/results/data_and_graphs/Anti_Jam_testing_0.0_random.png new file mode 100644 index 0000000000000000000000000000000000000000..97b123005533c87733381e104413836eab6d041a Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.0_random.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.0_sweeping.pkl b/results/data_and_graphs/Anti_Jam_testing_0.0_sweeping.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7687d9e9a8c6538e5238727e38a002f3ab847eef Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.0_sweeping.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.0_sweeping.png b/results/data_and_graphs/Anti_Jam_testing_0.0_sweeping.png new file mode 100644 index 0000000000000000000000000000000000000000..a23c193a2580dd8b86b4bb88f318ef511c21b775 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.0_sweeping.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.15_constant.pkl b/results/data_and_graphs/Anti_Jam_testing_0.15_constant.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9eb220e2f749789abf6efa9eda4a77b8fa8fc8f5 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.15_constant.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.15_constant.png b/results/data_and_graphs/Anti_Jam_testing_0.15_constant.png new file mode 100644 index 0000000000000000000000000000000000000000..3d81100120846766bf654dc32f5c2736aa618498 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.15_constant.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.15_dynamic.pkl b/results/data_and_graphs/Anti_Jam_testing_0.15_dynamic.pkl new file mode 100644 index 0000000000000000000000000000000000000000..526b25022356678fdfdb67b3269b0408b79bc8aa Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.15_dynamic.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.15_dynamic.png b/results/data_and_graphs/Anti_Jam_testing_0.15_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..7fd0217cfbd98c6a720c090e725c2d5b413e908b Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.15_dynamic.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.15_random.pkl b/results/data_and_graphs/Anti_Jam_testing_0.15_random.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..e29e779bd826fee2563d2b7a1f919f488ed16da7 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.15_random.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.15_random.png b/results/data_and_graphs/Anti_Jam_testing_0.15_random.png new file mode 100644 index 0000000000000000000000000000000000000000..e8c90009830ba17a4d9dd42f361dbb673a10983a Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.15_random.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.15_sweeping.pkl b/results/data_and_graphs/Anti_Jam_testing_0.15_sweeping.pkl new file mode 100644 index 0000000000000000000000000000000000000000..040bb77d4ee8e48ed02e4910acedbfdf415ca74f Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.15_sweeping.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.15_sweeping.png b/results/data_and_graphs/Anti_Jam_testing_0.15_sweeping.png new file mode 100644 index 0000000000000000000000000000000000000000..f15438023c3b5a68beb33e083df24272146a033c Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.15_sweeping.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.1_constant.pkl b/results/data_and_graphs/Anti_Jam_testing_0.1_constant.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d31aafa6869d78ed27ceb1065ae069844334cfa1 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.1_constant.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.1_constant.png b/results/data_and_graphs/Anti_Jam_testing_0.1_constant.png new file mode 100644 index 0000000000000000000000000000000000000000..3f33c46f0efdd3cd41a9ba9c0a5fc2cd00268955 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.1_constant.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.1_dynamic.pkl b/results/data_and_graphs/Anti_Jam_testing_0.1_dynamic.pkl new file mode 100644 index 0000000000000000000000000000000000000000..06606c82d2114c31a7fddfe3174abd4f1d6031a1 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.1_dynamic.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.1_dynamic.png b/results/data_and_graphs/Anti_Jam_testing_0.1_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..c68f1e9fa8e19dffba28759c98f8c67263960aa9 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.1_dynamic.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.1_random.pkl b/results/data_and_graphs/Anti_Jam_testing_0.1_random.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2ad8e047c097a3c3116d3f05519475470862bf3f Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.1_random.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.1_random.png b/results/data_and_graphs/Anti_Jam_testing_0.1_random.png new file mode 100644 index 0000000000000000000000000000000000000000..bde88e476ce47c01f00c4fe8e8bb96a76d372071 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.1_random.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.1_sweeping.pkl b/results/data_and_graphs/Anti_Jam_testing_0.1_sweeping.pkl new file mode 100644 index 0000000000000000000000000000000000000000..243d63f4bd8edeea85ccc8cf6d4e4e19d25386dd Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.1_sweeping.pkl differ diff --git 
a/results/data_and_graphs/Anti_Jam_testing_0.1_sweeping.png b/results/data_and_graphs/Anti_Jam_testing_0.1_sweeping.png new file mode 100644 index 0000000000000000000000000000000000000000..e3f71751f5a0a9adc8511bec698f6beb83a7fa87 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.1_sweeping.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.2_constant.pkl b/results/data_and_graphs/Anti_Jam_testing_0.2_constant.pkl new file mode 100644 index 0000000000000000000000000000000000000000..654f62658830462ea63922a20d47b1bdf831a43a Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.2_constant.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.2_constant.png b/results/data_and_graphs/Anti_Jam_testing_0.2_constant.png new file mode 100644 index 0000000000000000000000000000000000000000..09d5f1fcb1857dd3583e2046a10d0a54d63cfc1e Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.2_constant.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.2_dynamic.pkl b/results/data_and_graphs/Anti_Jam_testing_0.2_dynamic.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cd54c8f57e91ee85e6707a3d079720d36683ea2d Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.2_dynamic.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.2_dynamic.png b/results/data_and_graphs/Anti_Jam_testing_0.2_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..40005264be04b71f4d4c8734275ee68c03e2152f Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.2_dynamic.png differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.2_sweeping.pkl b/results/data_and_graphs/Anti_Jam_testing_0.2_sweeping.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5513ddec932047e7f1bf3558636f16dbd6efab1a Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.2_sweeping.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_testing_0.2_sweeping.png b/results/data_and_graphs/Anti_Jam_testing_0.2_sweeping.png new file mode 100644 index 0000000000000000000000000000000000000000..cd412523320a8195717b30b60ecacfc25cbae7dd Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_testing_0.2_sweeping.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.05_constant.pkl b/results/data_and_graphs/Anti_Jam_training_0.05_constant.pkl new file mode 100644 index 0000000000000000000000000000000000000000..21174e3f5998053f7049260749d69e1ecb7a3578 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.05_constant.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.05_constant.png b/results/data_and_graphs/Anti_Jam_training_0.05_constant.png new file mode 100644 index 0000000000000000000000000000000000000000..3fa5a68b4d8a95ad68c55cde5dd5b9c3dc68f8b9 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.05_constant.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.05_dynamic.pkl b/results/data_and_graphs/Anti_Jam_training_0.05_dynamic.pkl new file mode 100644 index 0000000000000000000000000000000000000000..385ae40f33b81216528bdf945a5c21bc05900f1d Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.05_dynamic.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.05_dynamic.png b/results/data_and_graphs/Anti_Jam_training_0.05_dynamic.png new file mode 100644 index 
0000000000000000000000000000000000000000..5e7320eff7cc4feeee2559df0cba4927eda72d0d Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.05_dynamic.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.05_random.pkl b/results/data_and_graphs/Anti_Jam_training_0.05_random.pkl new file mode 100644 index 0000000000000000000000000000000000000000..58913607c2e38db3ea93dcac348e77f885a6e79f Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.05_random.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.05_random.png b/results/data_and_graphs/Anti_Jam_training_0.05_random.png new file mode 100644 index 0000000000000000000000000000000000000000..587d503e8c07a571c93551d949a0b7c0ea9369e8 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.05_random.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.05_sweeping.pkl b/results/data_and_graphs/Anti_Jam_training_0.05_sweeping.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7365a4e8ef611e57d8af83e2885822356b652c8c Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.05_sweeping.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.05_sweeping.png b/results/data_and_graphs/Anti_Jam_training_0.05_sweeping.png new file mode 100644 index 0000000000000000000000000000000000000000..573a8315c3571c5422992c4a0eb2fcb134e27e75 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.05_sweeping.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.0_constant.pkl b/results/data_and_graphs/Anti_Jam_training_0.0_constant.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c6721186712e3b242de1a4b5e3aac1e74c5cab10 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.0_constant.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.0_constant.png b/results/data_and_graphs/Anti_Jam_training_0.0_constant.png new file mode 100644 index 0000000000000000000000000000000000000000..10e8efab9fb199ececec37149a2326438a93bb62 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.0_constant.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.0_constant1.png b/results/data_and_graphs/Anti_Jam_training_0.0_constant1.png new file mode 100644 index 0000000000000000000000000000000000000000..9cebadf79d32605478a5cf58fb1c5dd29a67b5e7 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.0_constant1.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.0_dynamic.pkl b/results/data_and_graphs/Anti_Jam_training_0.0_dynamic.pkl new file mode 100644 index 0000000000000000000000000000000000000000..81bbe2cc2dd5bebaa4794d51cc0d601fcd63fdd3 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.0_dynamic.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.0_dynamic.png b/results/data_and_graphs/Anti_Jam_training_0.0_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..4cb61dc93fa8b486d336f88381da9e53ce381949 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.0_dynamic.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.0_dynamic1.png b/results/data_and_graphs/Anti_Jam_training_0.0_dynamic1.png new file mode 100644 index 0000000000000000000000000000000000000000..95588375a9cbeae83f5fafc6a636c0948dcb31a2 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.0_dynamic1.png differ 
diff --git a/results/data_and_graphs/Anti_Jam_training_0.0_random.pkl b/results/data_and_graphs/Anti_Jam_training_0.0_random.pkl new file mode 100644 index 0000000000000000000000000000000000000000..dfc4e150f1d292fb3701c3687f65740a9fed74a4 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.0_random.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.0_random.png b/results/data_and_graphs/Anti_Jam_training_0.0_random.png new file mode 100644 index 0000000000000000000000000000000000000000..703a489a268fb2d925a04a6607327efc6006c9f8 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.0_random.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.0_random1.png b/results/data_and_graphs/Anti_Jam_training_0.0_random1.png new file mode 100644 index 0000000000000000000000000000000000000000..ee99abdb51857b5710390f7266d3693fbeef245a Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.0_random1.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.0_sweeping.pkl b/results/data_and_graphs/Anti_Jam_training_0.0_sweeping.pkl new file mode 100644 index 0000000000000000000000000000000000000000..685af5124dd53b04ffbcda1faafbc3bcd80c20cf Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.0_sweeping.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.0_sweeping.png b/results/data_and_graphs/Anti_Jam_training_0.0_sweeping.png new file mode 100644 index 0000000000000000000000000000000000000000..64a3ba7374f74cbca393448c07724aff053682df Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.0_sweeping.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.0_sweeping1.png b/results/data_and_graphs/Anti_Jam_training_0.0_sweeping1.png new file mode 100644 index 0000000000000000000000000000000000000000..be51993c02aa47f09132a6e75c6a716fc73a57ba Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.0_sweeping1.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.15_constant.pkl b/results/data_and_graphs/Anti_Jam_training_0.15_constant.pkl new file mode 100644 index 0000000000000000000000000000000000000000..72f7314dba115a11364eb46a0f069e7bd188a635 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.15_constant.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.15_constant.png b/results/data_and_graphs/Anti_Jam_training_0.15_constant.png new file mode 100644 index 0000000000000000000000000000000000000000..1611b1a5a59a5cda5d7bc6ca8954f27d71f50cb5 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.15_constant.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.15_dynamic.pkl b/results/data_and_graphs/Anti_Jam_training_0.15_dynamic.pkl new file mode 100644 index 0000000000000000000000000000000000000000..df3cf85531a1d1cbc90efa4bedce7325a31d3c61 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.15_dynamic.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.15_dynamic.png b/results/data_and_graphs/Anti_Jam_training_0.15_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..d18e405bbe2d0dc15b2631e8363c26917f7cb0a8 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.15_dynamic.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.15_random.pkl b/results/data_and_graphs/Anti_Jam_training_0.15_random.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..98ec476995021a7bd6d84c2d8e2c16b2e2d86e83 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.15_random.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.15_random.png b/results/data_and_graphs/Anti_Jam_training_0.15_random.png new file mode 100644 index 0000000000000000000000000000000000000000..743743bacc5c3ba4d531f39a795f4f18726781ff Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.15_random.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.15_sweeping.pkl b/results/data_and_graphs/Anti_Jam_training_0.15_sweeping.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0d591ae9f9a5ab916364284769e7e23da77e8b2a Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.15_sweeping.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.15_sweeping.png b/results/data_and_graphs/Anti_Jam_training_0.15_sweeping.png new file mode 100644 index 0000000000000000000000000000000000000000..c0b6dc28bcc9cc3d879dbd3b7b3d32a992e74d4d Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.15_sweeping.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.1_constant.pkl b/results/data_and_graphs/Anti_Jam_training_0.1_constant.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8e8391c52dff83095f4a99fe11a97f137421b7a7 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.1_constant.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.1_constant.png b/results/data_and_graphs/Anti_Jam_training_0.1_constant.png new file mode 100644 index 0000000000000000000000000000000000000000..902e546061ec25ca53fcb38115e6ae9e88b4d658 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.1_constant.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.1_dynamic.pkl b/results/data_and_graphs/Anti_Jam_training_0.1_dynamic.pkl new file mode 100644 index 0000000000000000000000000000000000000000..edfd7500b04aea1a52fabbce5af08bf0140c0c0a Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.1_dynamic.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.1_dynamic.png b/results/data_and_graphs/Anti_Jam_training_0.1_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..593a0139280d838136fc0ba3a8af07ee978ae581 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.1_dynamic.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.1_random.pkl b/results/data_and_graphs/Anti_Jam_training_0.1_random.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ccf9e12645deeb871a0173a9d3dc2d697473bb75 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.1_random.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.1_random.png b/results/data_and_graphs/Anti_Jam_training_0.1_random.png new file mode 100644 index 0000000000000000000000000000000000000000..829b92de13cb9839481e62230edfd947abce4c32 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.1_random.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.1_sweeping.pkl b/results/data_and_graphs/Anti_Jam_training_0.1_sweeping.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4714d3a0ec006733b5f4a32719bb8c5ac0bd0499 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.1_sweeping.pkl differ diff --git 
a/results/data_and_graphs/Anti_Jam_training_0.1_sweeping.png b/results/data_and_graphs/Anti_Jam_training_0.1_sweeping.png new file mode 100644 index 0000000000000000000000000000000000000000..6e3dfdded5f841e89c4ca58f621f1f164f862a84 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.1_sweeping.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.2_constant.pkl b/results/data_and_graphs/Anti_Jam_training_0.2_constant.pkl new file mode 100644 index 0000000000000000000000000000000000000000..479e0d843428e84162a5b4d8e4563ac791bc640f Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.2_constant.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.2_constant.png b/results/data_and_graphs/Anti_Jam_training_0.2_constant.png new file mode 100644 index 0000000000000000000000000000000000000000..f69888a2be8c2ed8df8a6873388200fde4fab00c Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.2_constant.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.2_dynamic.pkl b/results/data_and_graphs/Anti_Jam_training_0.2_dynamic.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3a0a3b4dfbb070ebe4c6d70ff2fffaca73508b70 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.2_dynamic.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.2_dynamic.png b/results/data_and_graphs/Anti_Jam_training_0.2_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..50c4f8a74e29a955578b931ee8f4619d57f0b8cd Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.2_dynamic.png differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.2_sweeping.pkl b/results/data_and_graphs/Anti_Jam_training_0.2_sweeping.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f2949d62c51540283d94e6fafcb0edebcbcd83b4 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.2_sweeping.pkl differ diff --git a/results/data_and_graphs/Anti_Jam_training_0.2_sweeping.png b/results/data_and_graphs/Anti_Jam_training_0.2_sweeping.png new file mode 100644 index 0000000000000000000000000000000000000000..d5ee58f11c5f347377b5f3fcf5b5b8162c58fbd1 Binary files /dev/null and b/results/data_and_graphs/Anti_Jam_training_0.2_sweeping.png differ diff --git a/results/data_and_graphs/cst_dynamic.pdf b/results/data_and_graphs/cst_dynamic.pdf new file mode 100644 index 0000000000000000000000000000000000000000..0e400d1271b4c7f0bc70c4d0fa7b8b23f75ab176 Binary files /dev/null and b/results/data_and_graphs/cst_dynamic.pdf differ diff --git a/results/data_and_graphs/cst_dynamic.png b/results/data_and_graphs/cst_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..5df12c9648f17c808b25b9bce89b7f1388d14381 Binary files /dev/null and b/results/data_and_graphs/cst_dynamic.png differ diff --git a/results/data_and_graphs/plots.py b/results/data_and_graphs/plots.py new file mode 100644 index 0000000000000000000000000000000000000000..7a4bf611520af4a435304eac13f1b990c6664340 --- /dev/null +++ b/results/data_and_graphs/plots.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# +# SPDX-License-Identifier: GPL-3.0 +# +# GNU Radio Python Flow Graph +# Title: Not titled yet +# Author: Abubakar Sani Ali +# GNU Radio version: 3.8.1.0 + +################################################################################### +# Importing Libraries 
+################################################################################### +import json +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns +import pickle +import pandas as pd + +sns.set_palette('colorblind') + +# jammers = ['constant', 'sweeping', 'random', 'dynamic'] +jammers = ['dynamic'] +agents = ['DQN', 'DQN with Fixed Q Targets', 'DDQN', 'Dueling DDQN', 'DDQN with Prioritised Replay'] +network = 'FNN' +history = 1 +cscs = [0.0, 0.05, 0.1, 0.15] +max_env_steps = 100 +Episodes = 100 +tot_iterations = max_env_steps * Episodes + +# experiment 1: Convergence time +filename = f'Anti_Jam_training_{cscs[0]}_{jammers[-1]}.pkl' +file = open(filename, 'rb') +object_file = pickle.load(file) +file.close() +n_runs = 3 +meanConvergenceTime = [] +stdConvergenceTime = [] +for agent in agents: + convergenceTime = [] + for run in range(n_runs): + agentName = object_file[f'{agent}'] + time = agentName[run][4] + convergenceTime.append(time) + meanConvergenceTime.append(np.array(convergenceTime).mean()) + stdConvergenceTime.append(np.array(convergenceTime).std()) +print(f'The convergence times are:') +print(meanConvergenceTime) +print(stdConvergenceTime) + +# experiment 2: Inference time +# meanInferenceTime = [] +# stdInferenceTime = [] +# for agent in agents: +# inferenceTime = [] +# for csc in cscs: +# filename = f'Anti_Jam_testing_{csc}_{jammer}.pkl' +# file = open(filename, 'rb') +# object_file = pickle.load(file) +# file.close() +# agentName = object_file[f'{agent}'] +# time = agentName[0][4] +# inferenceTime.append(time) +# meanInferenceTime.append(np.array(inferenceTime).mean()) +# stdInferenceTime.append(np.array(inferenceTime).std()) +# print(f'The inference times are:') +# print(np.array(meanInferenceTime)/tot_iterations) +# print(np.array(stdInferenceTime)/tot_iterations) +# print(f'The inference speeds are:') +# print(tot_iterations/np.array(meanInferenceTime)) +# print(tot_iterations/np.array(stdInferenceTime)) + +# experiment 3 plots: rewards +for csc in cscs: + rolling_rewards = np.empty((len(agents), n_runs, Episodes)) + filename = f'Anti_Jam_training_{csc}_{jammers[0]}.pkl' + file = open(filename, 'rb') + object_file = pickle.load(file) + file.close() + for agent_idx in range(len(agents)): + agent = agents[agent_idx] + for run in range(n_runs): + agentName = object_file[f'{agent}'] + rollingReward = agentName[run][1] + rolling_rewards[agent_idx][run] = rollingReward + + # Compute the mean and standard deviation of the rolling rewards + mean_rewards = np.mean(rolling_rewards, axis=1) + std_rewards = np.std(rolling_rewards, axis=1) + + # Plot the mean rolling rewards and the shaded standard deviation area + plotName = f'rolling_reward_{csc}_{jammers[0]}.pdf' + fig, ax = plt.subplots() + fig.set_figwidth(6) + fig.set_figheight(5) + for agent_idx in range(len(agents)): + ax.plot(mean_rewards[agent_idx], label=f'{agents[agent_idx]}') + ax.fill_between(range(Episodes), mean_rewards[agent_idx] - std_rewards[agent_idx], + mean_rewards[agent_idx] + std_rewards[agent_idx], alpha=0.3) + + ax.set_xlabel('Episode') + ax.set_ylabel('Rolling Average Reward') + + # Updated legend position + ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2) + + # Adjust the bottom margin to create more space for the legend + plt.subplots_adjust(bottom=0.25) + + plt.savefig(plotName, bbox_inches='tight') + plt.show() + +# experiment 4 plots: Throughput +for jammer in jammers: + throughput = [] + for csc in cscs: + filename = f'Anti_Jam_testing_{csc}_{jammer}.pkl' + file 
= open(filename, 'rb') + object_file = pickle.load(file) + file.close() + agentsThroughputs = [] + for agent in agents: + agentName = object_file[f'{agent}'] + episodeThroughputs = agentName[0][1] + meanEpisodeThroughput = np.array(episodeThroughputs).mean() + agentsThroughputs.append(meanEpisodeThroughput) + throughput.append(agentsThroughputs) + normalizedThroughput = np.transpose(np.array(throughput) / Episodes) + X_axis = np.arange(len(cscs)) + plotName = f'throughput_{jammer}.pdf' + fig, ax = plt.subplots() + fig.set_figwidth(10) + fig.set_figheight(4) + plt.bar(X_axis - 0.3, normalizedThroughput[0], 0.15, label=agents[0]) + plt.bar(X_axis - 0.15, normalizedThroughput[1], 0.15, label=agents[1]) + plt.bar(X_axis + 0, normalizedThroughput[2], 0.15, label=agents[2]) + plt.bar(X_axis + 0.15, normalizedThroughput[3], 0.15, label=agents[3]) + plt.bar(X_axis + 0.3, normalizedThroughput[4], 0.15, label=agents[4]) + + plt.ylim((0.6, 1)) + plt.xticks(X_axis, cscs) + plt.xlabel('Channel switching cost (CSC)') + plt.ylabel('Normalized Throughput') + + # Updated legend position + plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=len(agents)) + + # Adjust the bottom margin to create more space for the legend + plt.subplots_adjust(bottom=0.25) + + plt.savefig(plotName, bbox_inches='tight') + plt.show() + +# experiment 5 plots: Channel Switching Times +for jammer in jammers: + cstime = [] + for csc in cscs: + filename = f'Anti_Jam_testing_{csc}_{jammer}.pkl' + file = open(filename, 'rb') + object_file = pickle.load(file) + file.close() + agentsCstimes = [] + for agent in agents: + agentName = object_file[f'{agent}'] + episodeCstimes = agentName[0][-1] + meanEpisodeCstime = np.array(episodeCstimes).mean() + agentsCstimes.append(meanEpisodeCstime) + cstime.append(agentsCstimes) + normalizedCstime = np.transpose(np.array(cstime) / Episodes) + X_axis = np.arange(len(cscs)) + plotName = f'cst_{jammer}.pdf' + fig, ax = plt.subplots() + fig.set_figwidth(10) + fig.set_figheight(4) + plt.bar(X_axis - 0.3, normalizedCstime[0], 0.15, label=agents[0]) + plt.bar(X_axis - 0.15, normalizedCstime[1], 0.15, label=agents[1]) + plt.bar(X_axis + 0, normalizedCstime[2], 0.15, label=agents[2]) + plt.bar(X_axis + 0.15, normalizedCstime[3], 0.15, label=agents[3]) + plt.bar(X_axis + 0.3, normalizedCstime[4], 0.15, label=agents[4]) + + plt.ylim((0, 1)) + plt.xticks(X_axis, cscs) + plt.xlabel('Channel switching cost (CSC)') + plt.ylabel('Normalized Channel Switiching Frequency') + + # Updated legend position + plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=len(agents)) + + # Adjust the bottom margin to create more space for the legend + plt.subplots_adjust(bottom=0.25) + + plt.savefig(plotName, bbox_inches='tight') + plt.show() diff --git a/results/data_and_graphs/rolling_reward_0.05_dynamic.pdf b/results/data_and_graphs/rolling_reward_0.05_dynamic.pdf new file mode 100644 index 0000000000000000000000000000000000000000..d05f14e8d32167885d34558568feae7775c6ac95 Binary files /dev/null and b/results/data_and_graphs/rolling_reward_0.05_dynamic.pdf differ diff --git a/results/data_and_graphs/rolling_reward_0.05_dynamic.png b/results/data_and_graphs/rolling_reward_0.05_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..82b41788210ac534edfcc97615d617a99096273b Binary files /dev/null and b/results/data_and_graphs/rolling_reward_0.05_dynamic.png differ diff --git a/results/data_and_graphs/rolling_reward_0.0_dynamic.pdf 
b/results/data_and_graphs/rolling_reward_0.0_dynamic.pdf new file mode 100644 index 0000000000000000000000000000000000000000..8097038bc3e9852136c20d02b10583a27938d402 Binary files /dev/null and b/results/data_and_graphs/rolling_reward_0.0_dynamic.pdf differ diff --git a/results/data_and_graphs/rolling_reward_0.0_dynamic.png b/results/data_and_graphs/rolling_reward_0.0_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..73683c457792b119baef586e3380319fb291f1d3 Binary files /dev/null and b/results/data_and_graphs/rolling_reward_0.0_dynamic.png differ diff --git a/results/data_and_graphs/rolling_reward_0.15_dynamic.pdf b/results/data_and_graphs/rolling_reward_0.15_dynamic.pdf new file mode 100644 index 0000000000000000000000000000000000000000..faaafc3a5d90d706b8fcd93769a6daab93078016 Binary files /dev/null and b/results/data_and_graphs/rolling_reward_0.15_dynamic.pdf differ diff --git a/results/data_and_graphs/rolling_reward_0.15_dynamic.png b/results/data_and_graphs/rolling_reward_0.15_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..d347fa573cd6b3e4aab5ac630bc24d1a91c3428c Binary files /dev/null and b/results/data_and_graphs/rolling_reward_0.15_dynamic.png differ diff --git a/results/data_and_graphs/rolling_reward_0.1_dynamic.pdf b/results/data_and_graphs/rolling_reward_0.1_dynamic.pdf new file mode 100644 index 0000000000000000000000000000000000000000..147078b10e00c7685826222434e616a1c5f14f51 Binary files /dev/null and b/results/data_and_graphs/rolling_reward_0.1_dynamic.pdf differ diff --git a/results/data_and_graphs/rolling_reward_0.1_dynamic.png b/results/data_and_graphs/rolling_reward_0.1_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..e74ba88763ff04e71bd5718568820ca26ed2d13a Binary files /dev/null and b/results/data_and_graphs/rolling_reward_0.1_dynamic.png differ diff --git a/results/data_and_graphs/throughput_dynamic.pdf b/results/data_and_graphs/throughput_dynamic.pdf new file mode 100644 index 0000000000000000000000000000000000000000..606e02fd5ba396adedcd990c03191ec43f1d7f9e Binary files /dev/null and b/results/data_and_graphs/throughput_dynamic.pdf differ diff --git a/results/data_and_graphs/throughput_dynamic.png b/results/data_and_graphs/throughput_dynamic.png new file mode 100644 index 0000000000000000000000000000000000000000..975a2e6d58f2d1c1a6b72546a4daa63c566768ca Binary files /dev/null and b/results/data_and_graphs/throughput_dynamic.png differ diff --git a/run_experiments.sh b/run_experiments.sh new file mode 100644 index 0000000000000000000000000000000000000000..91a60a4d1be558a5571091eed0a66b796b054d41 --- /dev/null +++ b/run_experiments.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Results bash script +# author: abubakar sani ali + + +# Experiment 1: wideband, 5GHz, csc = 0 +python3 results/Anti_Jam.py 0 5 0 + +# Experiment 2: wideband, 5GHz, csc = 0.2 +python3 results/Anti_Jam.py 0 5 0.2 + +# Experiment 3: wideband, 2.4GHz, csc = 0 +python3 results/Anti_Jam.py 0 2.4 0 + +# Experiment 4: wideband, 2.4GHz, csc = 0.2 +python3 results/Anti_Jam.py 0 2.4 0.2 + +# Experiment 5: broadband, 5GHz, csc = 0 +python3 results/Anti_Jam.py 1 5 0 + +# Experiment 6: broadband, 5GHz, csc = 0.2 +python3 results/Anti_Jam.py 1 5 0.2 + +# Experiment 7: broadband, 2.4GHz, csc = 0 +python3 results/Anti_Jam.py 1 2.4 0 + +# Experiment 8: broadband, 2.4GHz, csc = 0.2 +python3 results/Anti_Jam.py 1 2.4 0.2 + +echo All Done! 
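`run_experiments.sh` above drives `results/Anti_Jam.py` with three positional arguments: a band-mode flag (0 = wideband, 1 = broadband, per the script's comments), the carrier band in GHz, and the channel-switching cost. Since `Anti_Jam.py` itself is not shown in this part of the diff, the following argument-parsing sketch only illustrates how those values might be consumed; all names are hypothetical:

```python
import sys

def parse_cli(argv):
    """Parse the positional arguments passed by run_experiments.sh,
    e.g. `python3 results/Anti_Jam.py 0 5 0.2`. Names are illustrative;
    the real Anti_Jam.py is not shown in this diff."""
    band_mode = int(argv[1])    # 0 = wideband, 1 = broadband (per the script's comments)
    freq_ghz = float(argv[2])   # carrier band: 5 or 2.4 GHz
    csc = float(argv[3])        # channel switching cost
    return band_mode, freq_ghz, csc

if __name__ == '__main__':
    print(parse_cli(sys.argv))
```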
diff --git a/utilities/Deepmind_RMS_Prop.py b/utilities/Deepmind_RMS_Prop.py new file mode 100644 index 0000000000000000000000000000000000000000..e143d12e40f60b314cfcedbab239c9e154e47b17 --- /dev/null +++ b/utilities/Deepmind_RMS_Prop.py @@ -0,0 +1,76 @@ +import torch +from torch.optim import Optimizer + + +class DM_RMSprop(Optimizer): + """Implements the form of RMSProp used in DM 2015 Atari paper. + Inspired by https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/updates.py""" + + def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= momentum: + raise ValueError("Invalid momentum value: {}".format(momentum)) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + if not 0.0 <= alpha: + raise ValueError("Invalid alpha value: {}".format(alpha)) + + defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered, weight_decay=weight_decay) + super(DM_RMSprop, self).__init__(params, defaults) + + def __setstate__(self, state): + super(DM_RMSprop, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('momentum', 0) + group.setdefault('centered', False) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + for group in self.param_groups: + momentum = group['momentum'] + sq_momentum = group['alpha'] + epsilon = group['eps'] + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('RMSprop does not support sparse gradients') + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + state['square_avg'] = torch.zeros_like(p.data) + if momentum > 0: + state['momentum_buffer'] = torch.zeros_like(p.data) + + mom_buffer = state['momentum_buffer'] + square_avg = state['square_avg'] + + + state['step'] += 1 + + mom_buffer.mul_(momentum) + mom_buffer.add_((1 - momentum) * grad) + + square_avg.mul_(sq_momentum).addcmul_(1 - sq_momentum, grad, grad) + + avg = (square_avg - mom_buffer**2 + epsilon).sqrt() + + p.data.addcdiv_(-group['lr'], grad, avg) + + return loss + diff --git a/utilities/LLM_image.png b/utilities/LLM_image.png new file mode 100644 index 0000000000000000000000000000000000000000..2ad4e63b4fefba40619803cc9ebc191d681b8d05 Binary files /dev/null and b/utilities/LLM_image.png differ diff --git a/utilities/Memory_Shaper.py b/utilities/Memory_Shaper.py new file mode 100644 index 0000000000000000000000000000000000000000..54e11098a7786634d79e3ff1a82c1106c41762e4 --- /dev/null +++ b/utilities/Memory_Shaper.py @@ -0,0 +1,109 @@ +# NOT FINISHED +from .data_structures.Action_Balanced_Replay_Buffer import Action_Balanced_Replay_Buffer +from .data_structures.Replay_Buffer import Replay_Buffer +import numpy as np +import random + +class Memory_Shaper(object): + """Takes in the experience of full episodes and reshapes it according to macro-actions you define. 
Then it provides + a replay buffer with this reshaped data to learn from""" + def __init__(self, buffer_size, batch_size, seed, new_reward_fn, action_balanced_replay_buffer=True): + self.reset() + self.buffer_size = buffer_size + self.batch_size = batch_size + self.seed = seed + self.new_reward_fn = new_reward_fn + self.action_balanced_replay_buffer = action_balanced_replay_buffer + + def put_adapted_experiences_in_a_replay_buffer(self, action_id_to_actions): + """Adds experiences to the replay buffer after re-imagining that the actions taken were macro-actions according to + action_rules as well as primitive actions. + + NOTE that we want to put both primitive actions and macro-actions into replay buffer so that it can learn that + its better to do a macro-action rather than the same primitive actions (which we will enforce with reward penalty) + """ + + actions_to_action_id = {v: k for k, v in action_id_to_actions.items()} + + self.num_actions = len(action_id_to_actions) + + print(actions_to_action_id) + + for key in actions_to_action_id.keys(): + assert isinstance(key, tuple) + assert isinstance(actions_to_action_id[key], int) + + episodes = len(self.states) + for data_type in [self.states, self.next_states, self.rewards, self.actions, self.dones]: + assert len(data_type) == episodes + + max_action_length = self.calculate_max_action_length(actions_to_action_id) + + if self.action_balanced_replay_buffer: + print("Using action balanced replay buffer") + replay_buffer = Action_Balanced_Replay_Buffer(self.buffer_size, self.batch_size, self.seed, num_actions=self.num_actions) + else: + print("Using ordinary replay buffer") + replay_buffer = Replay_Buffer(self.buffer_size, self.batch_size, self.seed) + + for episode_ix in range(episodes): + self.add_adapted_experience_for_an_episode(episode_ix, actions_to_action_id, max_action_length, replay_buffer) + + return replay_buffer + + def calculate_max_action_length(self, actions_to_action_id): + """Calculates the max length of the provided macro-actions""" + max_length = 0 + for key in actions_to_action_id.keys(): + action_length = len(key) + if action_length > max_length: + max_length = action_length + return max_length + + + def add_adapted_experience_for_an_episode(self, episode_ix, action_rules, max_action_length, replay_buffer): + """Adds all the experiences we have been given to a replay buffer after adapting experiences that involved doing a + macro action""" + states = self.states[episode_ix] + next_states = self.next_states[episode_ix] + rewards = self.rewards[episode_ix] + actions = self.actions[episode_ix] + dones = self.dones[episode_ix] + + assert len(states) == len(next_states) == len(rewards) == len(dones) == len(actions), "{} {} {} {} {} = {}".format(len(states), len(next_states), len(rewards), len(dones), len(actions), actions) + steps = len(states) + for step in range(steps): + replay_buffer.add_experience(states[step], actions[step], rewards[step], next_states[step], dones[step]) + for action_length in range(2, max_action_length + 1): + if step < action_length - 1: continue + action_sequence = tuple(actions[step - action_length + 1 : step + 1]) + assert all([action in range(self.num_actions) for action in action_sequence]), "All actions should be primitive here" + if action_sequence in action_rules.keys(): + new_action = action_rules[action_sequence] + new_state = states[step - action_length + 1] + new_reward = np.sum(rewards[step - action_length + 1:step + 1]) + new_reward = self.new_reward_fn(new_reward, len(action_sequence)) + 
new_next_state = next_states[step] + new_dones = dones[step] + replay_buffer.add_experience(new_state, new_action, new_reward, new_next_state, new_dones) + + + def add_episode_experience(self, states, next_states, rewards, actions, dones): + """Adds in an episode of experience""" + self.states.append(states) + self.next_states.append(next_states) + self.rewards.append(rewards) + self.actions.append(actions) + self.dones.append(dones) + + def reset(self): + self.states = [] + self.next_states = [] + self.rewards = [] + self.actions = [] + self.dones = [] + + + + + diff --git a/utilities/OU_Noise.py b/utilities/OU_Noise.py new file mode 100644 index 0000000000000000000000000000000000000000..c5dde9fce3eef98fad8dcecd3d4b5baf24e4def7 --- /dev/null +++ b/utilities/OU_Noise.py @@ -0,0 +1,22 @@ +import numpy as np +import random +import copy + +class OU_Noise(object): + """Ornstein-Uhlenbeck process.""" + def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2): + self.mu = mu * np.ones(size) + self.theta = theta + self.sigma = sigma + self.seed = random.seed(seed) + self.reset() + + def reset(self): + """Reset the internal state (= noise) to mean (mu).""" + self.state = copy.copy(self.mu) + + def sample(self): + """Update internal state and return it as a noise sample.""" + dx = self.theta * (self.mu - self.state) + self.sigma * np.array([np.random.normal() for _ in range(len(self.state))]) + self.state += dx + return self.state \ No newline at end of file diff --git a/utilities/Parallel_Experience_Generator.py b/utilities/Parallel_Experience_Generator.py new file mode 100644 index 0000000000000000000000000000000000000000..e3ed74bc821197efb783eb80e207ce337963bf0a --- /dev/null +++ b/utilities/Parallel_Experience_Generator.py @@ -0,0 +1,84 @@ +import random +import torch +import sys +from contextlib import closing +# +# from pathos.multiprocessing import ProcessingPool as Pool + +from torch.multiprocessing import Pool +from random import randint + +from utilities.OU_Noise import OU_Noise +from utilities.Utility_Functions import create_actor_distribution + +class Parallel_Experience_Generator(object): + """ Plays n episode in parallel using a fixed agent. 
Only works for PPO or DDPG type agents at the moment, not Q-learning agents""" + def __init__(self, environment, policy, seed, hyperparameters, action_size, use_GPU=False, action_choice_output_columns=None): + self.use_GPU = use_GPU + self.environment = environment + self.action_types = "DISCRETE" if self.environment.action_space.dtype in [int, 'int64'] else "CONTINUOUS" + self.action_size = action_size + self.policy = policy + self.action_choice_output_columns = action_choice_output_columns + self.hyperparameters = hyperparameters + if self.action_types == "CONTINUOUS": self.noise = OU_Noise(self.action_size, seed, self.hyperparameters["mu"], + self.hyperparameters["theta"], self.hyperparameters["sigma"]) + + + def play_n_episodes(self, n, exploration_epsilon=None): + """Plays n episodes in parallel using the fixed policy and returns the data""" + self.exploration_epsilon = exploration_epsilon + with closing(Pool(processes=n)) as pool: + results = pool.map(self, range(n)) + pool.terminate() + states_for_all_episodes = [episode[0] for episode in results] + actions_for_all_episodes = [episode[1] for episode in results] + rewards_for_all_episodes = [episode[2] for episode in results] + return states_for_all_episodes, actions_for_all_episodes, rewards_for_all_episodes + + def __call__(self, n): + exploration = max(0.0, random.uniform(self.exploration_epsilon / 3.0, self.exploration_epsilon * 3.0)) + return self.play_1_episode(exploration) + + def play_1_episode(self, epsilon_exploration): + """Plays 1 episode using the fixed policy and returns the data""" + state = self.reset_game() + done = False + episode_states = [] + episode_actions = [] + episode_rewards = [] + while not done: + action = self.pick_action(self.policy, state, epsilon_exploration) + next_state, reward, done, _ = self.environment.step(action) + if self.hyperparameters["clip_rewards"]: reward = max(min(reward, 1.0), -1.0) + episode_states.append(state) + episode_actions.append(action) + episode_rewards.append(reward) + state = next_state + return episode_states, episode_actions, episode_rewards + + def reset_game(self): + """Resets the game environment so it is ready to play a new episode""" + seed = randint(0, sys.maxsize) + torch.manual_seed(seed) # Need to do this otherwise each worker generates same experience + state = self.environment.reset() + if self.action_types == "CONTINUOUS": self.noise.reset() + return state + + def pick_action(self, policy, state, epsilon_exploration=None): + """Picks an action using the policy""" + if self.action_types == "DISCRETE": + if random.random() <= epsilon_exploration: + action = random.randint(0, self.action_size - 1) + return action + + state = torch.from_numpy(state).float().unsqueeze(0) + actor_output = policy.forward(state) + if self.action_choice_output_columns is not None: + actor_output = actor_output[:, self.action_choice_output_columns] + action_distribution = create_actor_distribution(self.action_types, actor_output, self.action_size) + action = action_distribution.sample().cpu() + + if self.action_types == "CONTINUOUS": action += torch.Tensor(self.noise.sample()) + else: action = action.item() + return action \ No newline at end of file diff --git a/utilities/PyTorch-logo-2.jpg b/utilities/PyTorch-logo-2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..279075d3797046c3ab0259d8d8675cfcbd89a249 Binary files /dev/null and b/utilities/PyTorch-logo-2.jpg differ diff --git a/utilities/RL_image.jpeg b/utilities/RL_image.jpeg new file mode 100644 index 
0000000000000000000000000000000000000000..d93ac10f4fc2a4189cf8b29c568f8b7b916842d0 Binary files /dev/null and b/utilities/RL_image.jpeg differ diff --git a/utilities/Tensorboard.py b/utilities/Tensorboard.py new file mode 100644 index 0000000000000000000000000000000000000000..66002f34c759792112ce9a82d476c9b1391ce51a --- /dev/null +++ b/utilities/Tensorboard.py @@ -0,0 +1,73 @@ +# NOTE that this code is not mine and was taken from https://becominghuman.ai/logging-in-tensorboard-with-pytorch-or-any-other-library-c549163dee9e + + +import io +import numpy as np +from PIL import Image +import tensorflow as tf + +# run tensorboard --logdir="logs/" on command line to get up the tensorboard afterwards + +class Tensorboard: + def __init__(self, logdir): + self.writer = tf.summary.FileWriter(logdir) + + def close(self): + self.writer.close() + + def log_scalar(self, tag, value, global_step): + summary = tf.Summary() + summary.value.add(tag=tag, simple_value=value) + self.writer.add_summary(summary, global_step=global_step) + self.writer.flush() + + def log_histogram(self, tag, values, global_step, bins): + counts, bin_edges = np.histogram(values, bins=bins) + + hist = tf.HistogramProto() + hist.min = float(np.min(values)) + hist.max = float(np.max(values)) + hist.num = int(np.prod(values.shape)) + hist.sum = float(np.sum(values)) + hist.sum_squares = float(np.sum(values ** 2)) + + bin_edges = bin_edges[1:] + + for edge in bin_edges: + hist.bucket_limit.append(edge) + for c in counts: + hist.bucket.append(c) + + summary = tf.Summary() + summary.value.add(tag=tag, histo=hist) + self.writer.add_summary(summary, global_step=global_step) + self.writer.flush() + + def log_image(self, tag, img, global_step): + s = io.BytesIO() + Image.fromarray(img).save(s, format='png') + + img_summary = tf.Summary.Image(encoded_image_string=s.getvalue(), + height=img.shape[0], + width=img.shape[1]) + + summary = tf.Summary() + summary.value.add(tag=tag, image=img_summary) + self.writer.add_summary(summary, global_step=global_step) + self.writer.flush() + + def log_plot(self, tag, figure, global_step): + plot_buf = io.BytesIO() + figure.savefig(plot_buf, format='png') + plot_buf.seek(0) + img = Image.open(plot_buf) + img_ar = np.array(img) + + img_summary = tf.Summary.Image(encoded_image_string=plot_buf.getvalue(), + height=img_ar.shape[0], + width=img_ar.shape[1]) + + summary = tf.Summary() + summary.value.add(tag=tag, image=img_summary) + self.writer.add_summary(summary, global_step=global_step) + self.writer.flush() \ No newline at end of file diff --git a/utilities/Utility_Functions.py b/utilities/Utility_Functions.py new file mode 100644 index 0000000000000000000000000000000000000000..0870df0fa868cce097c5c2560e6a44138aaf8f4a --- /dev/null +++ b/utilities/Utility_Functions.py @@ -0,0 +1,122 @@ +import math + +import numpy as np +from abc import ABCMeta +import torch +from nn_builder.pytorch.NN import NN +from torch.distributions import Categorical, normal, MultivariateNormal + +def abstract(cls): + return ABCMeta(cls.__name__, cls.__bases__, dict(cls.__dict__)) + +def save_score_results(file_path, results): + """Saves results as a numpy file at given path""" + np.save(file_path, results) + +def normalise_rewards(rewards): + """Normalises rewards to mean 0 and standard deviation 1""" + mean_reward = np.mean(rewards) + std_reward = np.std(rewards) + return (rewards - mean_reward) / (std_reward + 1e-8) #1e-8 added for stability + +def create_actor_distribution(action_types, actor_output, action_size): + """Creates a 
distribution that the actor can then use to randomly draw actions""" + if action_types == "DISCRETE": + assert actor_output.size()[1] == action_size, "Actor output the wrong size" + action_distribution = Categorical(actor_output) # this creates a distribution to sample from + else: + assert actor_output.size()[1] == action_size * 2, "Actor output the wrong size" + means = actor_output[:, :action_size].squeeze(0) + stds = actor_output[:, action_size:].squeeze(0) + if len(means.shape) == 2: means = means.squeeze(-1) + if len(stds.shape) == 2: stds = stds.squeeze(-1) + if len(stds.shape) > 1 or len(means.shape) > 1: + raise ValueError("Wrong mean and std shapes - {} -- {}".format(stds.shape, means.shape)) + action_distribution = normal.Normal(means.squeeze(0), torch.abs(stds)) + return action_distribution + +class SharedAdam(torch.optim.Adam): + """Creates an adam optimizer object that is shareable between processes. Useful for algorithms like A3C. Code + taken from https://github.com/ikostrikov/pytorch-a3c/blob/master/my_optim.py""" + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False): + super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad) + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + state['step'] = torch.zeros(1) + state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() + state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() + + def share_memory(self): + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + state['step'].share_memory_() + state['exp_avg'].share_memory_() + state['exp_avg_sq'].share_memory_() + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + amsgrad = group['amsgrad'] + state = self.state[p] + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = state['max_exp_avg_sq'] + beta1, beta2 = group['betas'] + state['step'] += 1 + if group['weight_decay'] != 0: + grad = grad.add(group['weight_decay'], p.data) + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. 
of gradient + denom = max_exp_avg_sq.sqrt().add_(group['eps']) + else: + denom = exp_avg_sq.sqrt().add_(group['eps']) + bias_correction1 = 1 - beta1 ** state['step'].item() + bias_correction2 = 1 - beta2 ** state['step'].item() + step_size = group['lr'] * math.sqrt( + bias_correction2) / bias_correction1 + + p.data.addcdiv_(-step_size, exp_avg, denom) + return loss + +def flatten_action_id_to_actions(action_id_to_actions, global_action_id_to_primitive_action, num_primitive_actions): + """Converts the values in an action_id_to_actions dictionary back to the primitive actions they represent""" + flattened_action_id_to_actions = {} + for key in action_id_to_actions.keys(): + actions = action_id_to_actions[key] + raw_actions = backtrack_action_to_primitive_actions(actions, global_action_id_to_primitive_action, num_primitive_actions) + flattened_action_id_to_actions[key] = raw_actions + return flattened_action_id_to_actions + +def backtrack_action_to_primitive_actions(action_tuple, global_action_id_to_primitive_action, num_primitive_actions): + """Converts an action tuple back to the primitive actions it represents in a recursive way.""" + print("Recursing to backtrack on ", action_tuple) + primitive_actions = range(num_primitive_actions) + if all(action in primitive_actions for action in action_tuple): return action_tuple #base case + new_action_tuple = [] + for action in action_tuple: + if action in primitive_actions: new_action_tuple.append(action) + else: + converted_action = global_action_id_to_primitive_action[action] + print(new_action_tuple) + new_action_tuple.extend(converted_action) + print("Should have changed: ", new_action_tuple) + new_action_tuple = tuple(new_action_tuple) + return backtrack_action_to_primitive_actions(new_action_tuple) diff --git a/utilities/__pycache__/OU_Noise.cpython-310.pyc b/utilities/__pycache__/OU_Noise.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c14e3652a198d70ba0c7395b4cddbcc45702799 Binary files /dev/null and b/utilities/__pycache__/OU_Noise.cpython-310.pyc differ diff --git a/utilities/__pycache__/OU_Noise.cpython-39.pyc b/utilities/__pycache__/OU_Noise.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7189172f83e8120bd0b0424ba64a9d518f896d0f Binary files /dev/null and b/utilities/__pycache__/OU_Noise.cpython-39.pyc differ diff --git a/utilities/__pycache__/Parallel_Experience_Generator.cpython-310.pyc b/utilities/__pycache__/Parallel_Experience_Generator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb66dcc0411dbb8d7bd2a14feb77a22b8767291c Binary files /dev/null and b/utilities/__pycache__/Parallel_Experience_Generator.cpython-310.pyc differ diff --git a/utilities/__pycache__/Parallel_Experience_Generator.cpython-39.pyc b/utilities/__pycache__/Parallel_Experience_Generator.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0469141fbb29254a889c1df1bc68c383ab3028ad Binary files /dev/null and b/utilities/__pycache__/Parallel_Experience_Generator.cpython-39.pyc differ diff --git a/utilities/__pycache__/Utility_Functions.cpython-310.pyc b/utilities/__pycache__/Utility_Functions.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5df83de365ff951896cc822f7e4ec32f0ae7fd52 Binary files /dev/null and b/utilities/__pycache__/Utility_Functions.cpython-310.pyc differ diff --git a/utilities/__pycache__/Utility_Functions.cpython-39.pyc 
b/utilities/__pycache__/Utility_Functions.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2872ff2ae3d481583ec5541bd50df39faa08ee5 Binary files /dev/null and b/utilities/__pycache__/Utility_Functions.cpython-39.pyc differ diff --git a/utilities/data_structures/Action_Balanced_Replay_Buffer.py b/utilities/data_structures/Action_Balanced_Replay_Buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..f7f77fefbd8764a8d4b4d545241c85e4537d35fb --- /dev/null +++ b/utilities/data_structures/Action_Balanced_Replay_Buffer.py @@ -0,0 +1,104 @@ +import random +from collections import namedtuple, deque +import torch +import numpy as np +from .Replay_Buffer import Replay_Buffer + +class Action_Balanced_Replay_Buffer(Replay_Buffer): + """Replay buffer that provides sample of experiences that have an equal number of each action being conducted""" + def __init__(self, buffer_size, batch_size, seed, num_actions): + self.num_actions = num_actions + self.buffer_size_per_memory = int(buffer_size / self.num_actions) + + print("NUM ACTIONS ", self.num_actions) + self.memories = {action: deque(maxlen=self.buffer_size_per_memory) for action in range(self.num_actions)} + self.batch_size = batch_size + self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) + self.seed = random.seed(seed) + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + def add_experience(self, states, actions, rewards, next_states, dones): + """Adds experience or list of experiences into the replay buffer""" + if type(dones) == list: + assert type(dones[0]) != list, "A done shouldn't be a list" + experiences = [self.experience(state, action, reward, next_state, done) + for state, action, reward, next_state, done in + zip(states, actions, rewards, next_states, dones)] + for experience in experiences: + action = experience.action + self.memories[action].append(experience) + else: + experience = self.experience(states, actions, rewards, next_states, dones) + self.memories[actions].append(experience) + + def pick_experiences(self, num_experiences=None): + """Picks the experiences that the sample function will return as a random sample of experiences. 
It works by picking + an equal number of experiences that used each action (as far as possible)""" + if num_experiences: batch_size = num_experiences + else: batch_size = self.batch_size + batch_per_action = self.calculate_batch_sizes_per_action(batch_size) + samples_split_by_action = self.sample_each_action_equally(batch_per_action) + combined_sample = [] + for key in samples_split_by_action.keys(): + combined_sample.extend(samples_split_by_action[key]) + return combined_sample + + def calculate_batch_sizes_per_action(self, batch_size): + """Calculates the batch size we need to randomly draw from each action to make sure there is equal coverage + per action and that the batch gets filled up""" + min_batch_per_action = int(batch_size / self.num_actions) + batch_per_action = {k: min_batch_per_action for k in range(self.num_actions)} + current_batch_size = np.sum([batch_per_action[k] for k in range(self.num_actions)]) + remainder = batch_size - current_batch_size + give_remainder_to = random.sample(range(self.num_actions), remainder) + for action in give_remainder_to: + batch_per_action[action] += 1 + return batch_per_action + + def sample_each_action_equally(self, batch_per_action): + """Samples a number of experiences (determined by batch_per_action) from the memory buffer for each action""" + samples = {} + for action in range(self.num_actions): + memory = self.memories[action] + batch_size_for_action = batch_per_action[action] + action_memory_size = len(memory) + assert action_memory_size > 0, "Need at least 1 experience for each action" + if action_memory_size >= batch_size_for_action: + samples[action] = random.sample(memory, batch_size_for_action) + else: + print("Memory size {} vs. required batch size {}".format(action_memory_size, batch_size_for_action)) + samples_for_action = [] + while len(samples_for_action) < batch_per_action[action]: + remainder = batch_per_action[action] - len(samples_for_action) + sampled_experiences = random.sample(memory, min(remainder, action_memory_size)) + samples_for_action.extend(sampled_experiences) + samples[action] = samples_for_action + return samples + + def __len__(self): + return np.sum([len(memory) for memory in self.memories.values()]) + + def sample_experiences_with_certain_actions(self, allowed_actions, num_all_actions, required_batch_size): + """Samples a number of experiences where the action conducted was in the list of required actions""" + assert isinstance(allowed_actions, list) + assert len(allowed_actions) > 0 + + num_new_actions = len(allowed_actions) + experiences_to_sample = int(required_batch_size * float(num_all_actions) / float(num_new_actions)) + experiences = self.sample(num_experiences=experiences_to_sample) + states, actions, rewards, next_states, dones = experiences + matching_indexes = np.argwhere((np.in1d(actions.numpy(), allowed_actions))) + assert matching_indexes.shape[1] == 1 + + matching_indexes = matching_indexes[:, 0] + + states = states[matching_indexes] + actions = actions[matching_indexes] + rewards = rewards[matching_indexes] + next_states = next_states[matching_indexes] + dones = dones[matching_indexes] + + assert abs(states.shape[0] - required_batch_size) <= 0.05*required_batch_size, "{} vs. 
{}".format(states.shape[0], required_batch_size) + + + return (states, actions, rewards, next_states, dones) diff --git a/utilities/data_structures/Config.py b/utilities/data_structures/Config.py new file mode 100644 index 0000000000000000000000000000000000000000..8ddc01a3040a1fc62869abf65578badd00dd66fe --- /dev/null +++ b/utilities/data_structures/Config.py @@ -0,0 +1,22 @@ +class Config(object): + """Object to hold the config requirements for an agent/game""" + def __init__(self): + self.seed = None + self.environment = None + self.requirements_to_solve_game = None + self.num_episodes_to_run = None + self.file_to_save_data_results = None + self.file_to_save_results_graph = None + self.runs_per_agent = None + self.visualise_overall_results = None + self.visualise_individual_results = None + self.hyperparameters = None + self.use_GPU = None + self.overwrite_existing_results_file = None + self.save_model = False + self.standard_deviation_results = 1.0 + self.randomise_random_seed = True + self.show_solution_score = False + self.debug_mode = False + + diff --git a/utilities/data_structures/Deque.py b/utilities/data_structures/Deque.py new file mode 100644 index 0000000000000000000000000000000000000000..610d4d121511965afd1782659fc1aac22b9bf8cd --- /dev/null +++ b/utilities/data_structures/Deque.py @@ -0,0 +1,49 @@ +import numpy as np +from utilities.data_structures.Node import Node + +class Deque(object): + """Generic deque object""" + def __init__(self, max_size, dimension_of_value_attribute): + + self.max_size = max_size + self.dimension_of_value_attribute = dimension_of_value_attribute + self.deque = self.initialise_deque() + self.deque_index_to_overwrite_next = 0 + self.reached_max_capacity = False + self.number_experiences_in_deque = 0 + + def initialise_deque(self): + """Initialises a queue of Nodes of length self.max_size""" + deque = np.array([Node(0, tuple([None for _ in range(self.dimension_of_value_attribute)])) for _ in range(self.max_size)]) + return deque + + def add_element_to_deque(self, new_key, new_value): + """Adds an element to the deque and then updates the index of the next element to be overwritten and also the + amount of elements in the deque""" + self.update_deque_node_key_and_value(self.deque_index_to_overwrite_next, new_key, new_value) + self.update_number_experiences_in_deque() + self.update_deque_index_to_overwrite_next() + + def update_deque_node_key_and_value(self, index, new_key, new_value): + self.update_deque_node_key(index, new_key) + self.update_deque_node_value(index, new_value) + + def update_deque_node_key(self, index, new_key): + self.deque[index].update_key(new_key) + + def update_deque_node_value(self, index, new_value): + self.deque[index].update_value(new_value) + + def update_deque_index_to_overwrite_next(self): + """Updates the deque index that we should write over next. 
When the buffer gets full we begin writing over + older experiences""" + if self.deque_index_to_overwrite_next < self.max_size - 1: + self.deque_index_to_overwrite_next += 1 + else: + self.reached_max_capacity = True + self.deque_index_to_overwrite_next = 0 + + def update_number_experiences_in_deque(self): + """Keeps track of how many experiences there are in the buffer""" + if not self.reached_max_capacity: + self.number_experiences_in_deque += 1 \ No newline at end of file diff --git a/utilities/data_structures/Max_Heap.py b/utilities/data_structures/Max_Heap.py new file mode 100644 index 0000000000000000000000000000000000000000..150bea2fd6129a09d57192eaa76b1e70ee46d862 --- /dev/null +++ b/utilities/data_structures/Max_Heap.py @@ -0,0 +1,63 @@ +import numpy as np +from utilities.data_structures.Node import Node + +class Max_Heap(object): + """Generic max heap object""" + def __init__(self, max_size, dimension_of_value_attribute, default_key_to_use): + + self.max_size = max_size + self.dimension_of_value_attribute = dimension_of_value_attribute + self.default_key_to_use = default_key_to_use + self.heap = self.initialise_heap() + + def initialise_heap(self): + """Initialises a heap of Nodes of length self.max_size * 4 + 1""" + heap = np.array([Node(self.default_key_to_use, tuple([None for _ in range(self.dimension_of_value_attribute)])) for _ in range(self.max_size * 4 + 1)]) + + # We don't use the 0th element in a heap so we want it to have infinite value so it is never swapped with a lower node + heap[0] = Node(float("inf"), (None, None, None, None, None)) + return heap + + def update_element_and_reorganise_heap(self, heap_index_for_change, new_element): + self.update_heap_element(heap_index_for_change, new_element) + self.reorganise_heap(heap_index_for_change) + + def update_heap_element(self, heap_index, new_element): + self.heap[heap_index] = new_element + + def reorganise_heap(self, heap_index_changed): + """This reorganises the heap after a new value is added so as to keep the max value at the top of the heap which + is index position 1 in the array self.heap""" + + node_key = self.heap[heap_index_changed].key + parent_index = int(heap_index_changed / 2) + + if node_key > self.heap[parent_index].key: + self.swap_heap_elements(heap_index_changed, parent_index) + self.reorganise_heap(parent_index) + + else: + biggest_child_index = self.calculate_index_of_biggest_child(heap_index_changed) + if node_key < self.heap[biggest_child_index].key: + self.swap_heap_elements(heap_index_changed, biggest_child_index) + self.reorganise_heap(biggest_child_index) + + def swap_heap_elements(self, index1, index2): + """Swaps the position of two heap elements""" + self.heap[index1], self.heap[index2] = self.heap[index2], self.heap[index1] + + def calculate_index_of_biggest_child(self, heap_index_changed): + """Calculates the heap index of the node's child with the biggest td_error value""" + left_child = self.heap[int(heap_index_changed * 2)] + right_child = self.heap[int(heap_index_changed * 2) + 1] + + if left_child.key > right_child.key: + biggest_child_index = heap_index_changed * 2 + else: + biggest_child_index = heap_index_changed * 2 + 1 + + return biggest_child_index + + def give_max_key(self): + """Returns the maximum td error currently in the heap. 
Because it is a max heap this is the top element of the heap""" + return self.heap[1].key diff --git a/utilities/data_structures/Node.py b/utilities/data_structures/Node.py new file mode 100644 index 0000000000000000000000000000000000000000..bcb12c5bd94b8ec55dd7ad1ec039d4e30992a0da --- /dev/null +++ b/utilities/data_structures/Node.py @@ -0,0 +1,18 @@ +class Node(object): + """Generic Node class. Used in the implementation of a prioritised replay buffer""" + def __init__(self, key, value): + self.key = key + self.value = value + + def update_key_and_value(self, new_key, new_value): + self.update_key(new_key) + self.update_value(new_value) + + def update_key(self, new_key): + self.key = new_key + + def update_value(self, new_value): + self.value = new_value + + def __eq__(self, other): + return self.key == other.key and self.value == other.value \ No newline at end of file diff --git a/utilities/data_structures/Prioritised_Replay_Buffer.py b/utilities/data_structures/Prioritised_Replay_Buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..46c2b4d4c19f53b23be2675c48fbdccfd400e86d --- /dev/null +++ b/utilities/data_structures/Prioritised_Replay_Buffer.py @@ -0,0 +1,154 @@ +import numpy as np +import torch +from utilities.data_structures.Deque import Deque +from utilities.data_structures.Max_Heap import Max_Heap + +class Prioritised_Replay_Buffer(Max_Heap, Deque): + """Data structure that maintains a deque, a heap and an array. The deque keeps track of which experiences are the oldest and so + tells us which ones to delete once the buffer starts getting full. The heap lets us quickly retrieve the experience + with the max td_value. And the array lets us do quick random samples with probabilities equal to the proportional td errors. + We also keep track of the sum of the td values using a simple variable. + + NOTE that this implementation is not optimal in terms of speed. At some point I will make improvements to it. 
+ + """ + + def __init__(self, hyperparameters, seed=0): + Max_Heap.__init__(self, hyperparameters["buffer_size"], dimension_of_value_attribute=5, default_key_to_use=0) + Deque.__init__(self, hyperparameters["buffer_size"], dimension_of_value_attribute=5) + np.random.seed(seed) + + self.deques_td_errors = self.initialise_td_errors_array() + + self.heap_index_to_overwrite_next = 1 + self.number_experiences_in_deque = 0 + self.adapted_overall_sum_of_td_errors = 0 + + self.alpha = hyperparameters["alpha_prioritised_replay"] + self.beta = hyperparameters["beta_prioritised_replay"] + self.incremental_td_error = hyperparameters["incremental_td_error"] + self.batch_size = hyperparameters["batch_size"] + + self.heap_indexes_to_update_td_error_for = None + + self.indexes_in_node_value_tuple = { + "state": 0, + "action": 1, + "reward": 2, + "next_state": 3, + "done": 4 + } + # self.device = torch.device("cpu") + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + def initialise_td_errors_array(self): + """Initialises a deque of Nodes of length self.max_size""" + return np.zeros(self.max_size) + + def add_experience(self, raw_td_error, state, action, reward, next_state, done): + """Save an experience in the replay buffer""" + td_error = (abs(raw_td_error) + self.incremental_td_error) ** self.alpha + self.update_overall_sum(td_error, self.deque[self.deque_index_to_overwrite_next].key) + self.update_deque_and_deque_td_errors(td_error, state, action, reward, next_state, done) + self.update_heap_and_heap_index_to_overwrite() + self.update_number_experiences_in_deque() + self.update_deque_index_to_overwrite_next() + + def update_overall_sum(self, new_td_error, old_td_error): + """Updates the overall sum of td_values present in the buffer""" + self.adapted_overall_sum_of_td_errors += new_td_error - old_td_error + + def update_deque_and_deque_td_errors(self, td_error, state, action, reward, next_state, done): + """Updates the deque by overwriting the oldest experience with the experience provided""" + self.deques_td_errors[self.deque_index_to_overwrite_next] = td_error + self.add_element_to_deque(td_error, (state, action, reward, next_state, done)) + + def add_element_to_deque(self, new_key, new_value): + """Adds an element to the deque""" + self.update_deque_node_key_and_value(self.deque_index_to_overwrite_next, new_key, new_value) + + def update_heap_and_heap_index_to_overwrite(self): + """Updates the heap by rearranging it given the new experience that was just incorporated into it. If we haven't + reached max capacity then the new experience is added directly into the heap, otherwise a pointer on the heap has + changed to reflect the new experience so there's no need to add it in""" + if not self.reached_max_capacity: + self.update_heap_element(self.heap_index_to_overwrite_next, self.deque[self.deque_index_to_overwrite_next]) + self.deque[self.deque_index_to_overwrite_next].heap_index = self.heap_index_to_overwrite_next + self.update_heap_index_to_overwrite_next() + + heap_index_change = self.deque[self.deque_index_to_overwrite_next].heap_index + self.reorganise_heap(heap_index_change) + + def update_heap_index_to_overwrite_next(self): + """This updates the heap index to write over next. 
Once the buffer gets full we stop calling this function because + the nodes the heap points to start being changed directly rather than the pointers on the heap changing""" + self.heap_index_to_overwrite_next += 1 + + def swap_heap_elements(self, index1, index2): + """Swaps two position of two heap elements and then updates the heap_index stored in the two nodes. We have to override + this method from Max_Heap so that it also updates the heap_index variables""" + self.heap[index1], self.heap[index2] = self.heap[index2], self.heap[index1] + self.heap[index1].heap_index = index1 + self.heap[index2].heap_index = index2 + + def sample(self, rank_based=True): + """Randomly samples a batch from experiences giving a higher likelihood to experiences with a higher td error. It then + calculates an importance sampling weight for each sampled experience, you can read about this in the paper: + https://arxiv.org/pdf/1511.05952.pdf""" + experiences, deque_sample_indexes = self.pick_experiences_based_on_proportional_td_error() + states, actions, rewards, next_states, dones = self.separate_out_data_types(experiences) + self.deque_sample_indexes_to_update_td_error_for = deque_sample_indexes + importance_sampling_weights = self.calculate_importance_sampling_weights(experiences) + return (states, actions, rewards, next_states, dones), importance_sampling_weights + + def pick_experiences_based_on_proportional_td_error(self): + """Randomly picks a batch of experiences with probability equal to their proportional td_errors""" + probabilities = self.deques_td_errors / self.give_adapted_sum_of_td_errors() + deque_sample_indexes = np.random.choice(range(len(self.deques_td_errors)), size=self.batch_size, replace=False, p=probabilities) + experiences = self.deque[deque_sample_indexes] + return experiences, deque_sample_indexes + + def separate_out_data_types(self, experiences): + """Separates out experiences into their different parts and makes them tensors ready to be used in a pytorch model""" + states = torch.from_numpy(np.vstack([e.value[self.indexes_in_node_value_tuple["state"]] for e in experiences])).float().to(self.device) + actions = torch.from_numpy(np.vstack([e.value[self.indexes_in_node_value_tuple["action"]] for e in experiences])).float().to(self.device) + rewards = torch.from_numpy(np.vstack([e.value[self.indexes_in_node_value_tuple["reward"]] for e in experiences])).float().to(self.device) + next_states = torch.from_numpy(np.vstack([e.value[self.indexes_in_node_value_tuple["next_state"]] for e in experiences])).float().to( + self.device) + dones = torch.from_numpy(np.vstack([int(e.value[self.indexes_in_node_value_tuple["done"]]) for e in experiences])).float().to(self.device) + + return states, actions, rewards, next_states, dones + + def calculate_importance_sampling_weights(self, experiences): + """Calculates the importance sampling weight of each observation in the sample. 
The weight is inversely proportional to the td_error of the observation, which corrects the bias introduced by prioritised sampling; + see the paper here for more details: https://arxiv.org/pdf/1511.05952.pdf""" + td_errors = [experience.key for experience in experiences] + importance_sampling_weights = [((1.0 / self.number_experiences_in_deque) * (self.give_adapted_sum_of_td_errors() / td_error)) ** self.beta for td_error in td_errors] + sample_max_importance_weight = max(importance_sampling_weights) + importance_sampling_weights = [is_weight / sample_max_importance_weight for is_weight in importance_sampling_weights] + importance_sampling_weights = torch.tensor(importance_sampling_weights).float().to(self.device) + return importance_sampling_weights + + def update_td_errors(self, td_errors): + """Updates the td_errors of the experiences most recently returned by the sample method + (their deque indexes were recorded during sampling)""" + for raw_td_error, deque_index in zip(td_errors, self.deque_sample_indexes_to_update_td_error_for): + td_error = (abs(raw_td_error) + self.incremental_td_error) ** self.alpha + corresponding_heap_index = self.deque[deque_index].heap_index + self.update_overall_sum(td_error, self.heap[corresponding_heap_index].key) + self.heap[corresponding_heap_index].key = td_error + self.reorganise_heap(corresponding_heap_index) + self.deques_td_errors[deque_index] = td_error + + def give_max_td_error(self): + """Returns the maximum td error currently in the heap. Because it is a max heap this is the top element of the heap""" + return self.give_max_key() + + def give_adapted_sum_of_td_errors(self): + """Returns the sum of td errors of the experiences currently in the heap""" + return self.adapted_overall_sum_of_td_errors + + def __len__(self): + """Tells us how many experiences there are in the replay buffer.
This number will never exceed self.max_size""" + return self.number_experiences_in_deque + diff --git a/utilities/data_structures/Replay_Buffer.py b/utilities/data_structures/Replay_Buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..a1734a1c269c1abff8b6e518f3d04335a67ae1da --- /dev/null +++ b/utilities/data_structures/Replay_Buffer.py @@ -0,0 +1,57 @@ +from collections import namedtuple, deque +import random +import torch +import numpy as np + +class Replay_Buffer(object): + """Replay buffer to store past experiences that the agent can then use for training data""" + + def __init__(self, buffer_size, batch_size, seed, device=None): + + self.memory = deque(maxlen=buffer_size) + self.batch_size = batch_size + self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) + self.seed = random.seed(seed) + if device: + self.device = torch.device(device) + else: + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + def add_experience(self, states, actions, rewards, next_states, dones): + """Adds experience(s) into the replay buffer""" + if type(dones) == list: + assert type(dones[0]) != list, "A done shouldn't be a list" + experiences = [self.experience(state, action, reward, next_state, done) + for state, action, reward, next_state, done in + zip(states, actions, rewards, next_states, dones)] + self.memory.extend(experiences) + else: + experience = self.experience(states, actions, rewards, next_states, dones) + self.memory.append(experience) + + def sample(self, num_experiences=None, separate_out_data_types=True): + """Draws a random sample of experience from the replay buffer""" + experiences = self.pick_experiences(num_experiences) + if separate_out_data_types: + states, actions, rewards, next_states, dones = self.separate_out_data_types(experiences) + return states, actions, rewards, next_states, dones + else: + return experiences + + def separate_out_data_types(self, experiences): + """Puts the sampled experience into the correct format for a PyTorch neural network""" + states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(self.device) + actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(self.device) + rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device) + next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(self.device) + dones = torch.from_numpy(np.vstack([int(e.done) for e in experiences if e is not None])).float().to(self.device) + + return states, actions, rewards, next_states, dones + + def pick_experiences(self, num_experiences=None): + if num_experiences is not None: batch_size = num_experiences + else: batch_size = self.batch_size + return random.sample(self.memory, k=batch_size) + + def __len__(self): + return len(self.memory) diff --git a/utilities/data_structures/Tanh_Distribution.py b/utilities/data_structures/Tanh_Distribution.py new file mode 100644 index 0000000000000000000000000000000000000000..ca46055753b7b5a1f312e4a38b3c4dc39b1e54ff --- /dev/null +++ b/utilities/data_structures/Tanh_Distribution.py @@ -0,0 +1,78 @@ + +# NOTE that this is not my code. 
+# Taken from here: https://github.com/vitchyr/rlkit/blob/master/rlkit/torch/distributions.py + + +import torch +from torch.distributions import Distribution, Normal + + +class TanhNormal(Distribution): + """ + Represent distribution of X where + X ~ tanh(Z) + Z ~ N(mean, std) + Note: this is not very numerically stable. + """ + def __init__(self, normal_mean, normal_std, epsilon=1e-6): + """ + :param normal_mean: Mean of the normal distribution + :param normal_std: Std of the normal distribution + :param epsilon: Numerical stability epsilon when computing log-prob. + """ + self.normal_mean = normal_mean + self.normal_std = normal_std + self.normal = Normal(normal_mean, normal_std) + self.epsilon = epsilon + + def sample_n(self, n, return_pre_tanh_value=False): + z = self.normal.sample_n(n) + if return_pre_tanh_value: + return torch.tanh(z), z + else: + return torch.tanh(z) + + def log_prob(self, value, pre_tanh_value=None): + """ + :param value: some value, x + :param pre_tanh_value: arctanh(x) + :return: + """ + if pre_tanh_value is None: + pre_tanh_value = torch.log( + (1+value) / (1-value) + ) / 2 + return self.normal.log_prob(pre_tanh_value) - torch.log( + 1 - value * value + self.epsilon + ) + + def sample(self, return_pretanh_value=False): + """ + Gradients will and should *not* pass through this operation. + See https://github.com/pytorch/pytorch/issues/4620 for discussion. + """ + z = self.normal.sample().detach() + + if return_pretanh_value: + return torch.tanh(z), z + else: + return torch.tanh(z) + + def rsample(self, return_pretanh_value=False): + """ + Sampling in the reparameterization case. + """ + z = ( + self.normal_mean + + self.normal_std * + Normal( + torch.zeros(self.normal_mean.size()), + torch.ones(self.normal_std.size()) + ).sample() + ) + z.requires_grad_() + + if return_pretanh_value: + return torch.tanh(z), z + else: + return torch.tanh(z) \ No newline at end of file diff --git a/utilities/data_structures/__pycache__/Config.cpython-310.pyc b/utilities/data_structures/__pycache__/Config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bc53e7ffc659728b7a521c56db200ebc3ae14f70 Binary files /dev/null and b/utilities/data_structures/__pycache__/Config.cpython-310.pyc differ diff --git a/utilities/data_structures/__pycache__/Config.cpython-38.pyc b/utilities/data_structures/__pycache__/Config.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca7e004ba86a424cbb7c8ae5d5447c021844b22c Binary files /dev/null and b/utilities/data_structures/__pycache__/Config.cpython-38.pyc differ diff --git a/utilities/data_structures/__pycache__/Config.cpython-39.pyc b/utilities/data_structures/__pycache__/Config.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb896ed3ad949baede4b1fdb555600a2b18b1a4e Binary files /dev/null and b/utilities/data_structures/__pycache__/Config.cpython-39.pyc differ diff --git a/utilities/data_structures/__pycache__/Deque.cpython-310.pyc b/utilities/data_structures/__pycache__/Deque.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49899d8dca31399c49964f80e86383e154e6cf84 Binary files /dev/null and b/utilities/data_structures/__pycache__/Deque.cpython-310.pyc differ diff --git a/utilities/data_structures/__pycache__/Deque.cpython-39.pyc b/utilities/data_structures/__pycache__/Deque.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..146749dd0dd9ff979bcbab8432eea01c63ce5864 Binary files 
/dev/null and b/utilities/data_structures/__pycache__/Deque.cpython-39.pyc differ diff --git a/utilities/data_structures/__pycache__/Max_Heap.cpython-310.pyc b/utilities/data_structures/__pycache__/Max_Heap.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09e6462f32a511cf6b802a714b774320d261252d Binary files /dev/null and b/utilities/data_structures/__pycache__/Max_Heap.cpython-310.pyc differ diff --git a/utilities/data_structures/__pycache__/Max_Heap.cpython-39.pyc b/utilities/data_structures/__pycache__/Max_Heap.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..210cb1c39ddad9908bd4050f97f764e1588fe6d0 Binary files /dev/null and b/utilities/data_structures/__pycache__/Max_Heap.cpython-39.pyc differ diff --git a/utilities/data_structures/__pycache__/Node.cpython-310.pyc b/utilities/data_structures/__pycache__/Node.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9f0893951f86c839ee3dd6c3d03c2d36bddb53d Binary files /dev/null and b/utilities/data_structures/__pycache__/Node.cpython-310.pyc differ diff --git a/utilities/data_structures/__pycache__/Node.cpython-39.pyc b/utilities/data_structures/__pycache__/Node.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f43ea58c310ca5fef46e4e39e263442e8424a9f2 Binary files /dev/null and b/utilities/data_structures/__pycache__/Node.cpython-39.pyc differ diff --git a/utilities/data_structures/__pycache__/Prioritised_Replay_Buffer.cpython-310.pyc b/utilities/data_structures/__pycache__/Prioritised_Replay_Buffer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d41984866c088ca4759e0077d7c7f852a3a5b7b3 Binary files /dev/null and b/utilities/data_structures/__pycache__/Prioritised_Replay_Buffer.cpython-310.pyc differ diff --git a/utilities/data_structures/__pycache__/Prioritised_Replay_Buffer.cpython-39.pyc b/utilities/data_structures/__pycache__/Prioritised_Replay_Buffer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f6bbd854368505a7a7b598a92facae9e0a63f0e Binary files /dev/null and b/utilities/data_structures/__pycache__/Prioritised_Replay_Buffer.cpython-39.pyc differ diff --git a/utilities/data_structures/__pycache__/Replay_Buffer.cpython-310.pyc b/utilities/data_structures/__pycache__/Replay_Buffer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0337d1648b176b0e5ff859fc31dce3c32b528a6d Binary files /dev/null and b/utilities/data_structures/__pycache__/Replay_Buffer.cpython-310.pyc differ diff --git a/utilities/data_structures/__pycache__/Replay_Buffer.cpython-39.pyc b/utilities/data_structures/__pycache__/Replay_Buffer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7de1a334d661427275ab60a095f6e21b496332ab Binary files /dev/null and b/utilities/data_structures/__pycache__/Replay_Buffer.cpython-39.pyc differ diff --git a/utilities/grammar_algorithms/k_Sequitur.py b/utilities/grammar_algorithms/k_Sequitur.py new file mode 100644 index 0000000000000000000000000000000000000000..0de4ae671d77b4283ec986f7d20f28540755fa3a --- /dev/null +++ b/utilities/grammar_algorithms/k_Sequitur.py @@ -0,0 +1,166 @@ + +# My implementation of the k_Sequitur algorithm described in the papers: https://arxiv.org/pdf/cs/9709102.pdf +# and https://www.biorxiv.org/content/biorxiv/early/2018/03/13/281543.full.pdf +# The algorithm takes in a sequence and forms a grammar using two rules: +# 1) No pair of 
adjacent symbols appears more than k times in the grammar +# 2) Every rule in the grammar is used more than k times +# +# e.g. string "abddddeabde" with k=2 would turn to: +# "AdddBAB" +# R1: A --> ab +# R2: B --> de + +# TODO fix the fact that it sometimes provides rules that have end of episode symbol in them +# TODO add an option to return rules in terms of the amount of times they appear in a set of provided episodes + +from collections import defaultdict, Counter + + +class k_Sequitur(object): + + def __init__(self, k, end_of_episode_symbol="/"): + self.k = k + self.end_of_episode_symbol = end_of_episode_symbol + self.next_rule_name_ix = 0 + + def generate_action_grammar(self, actions): + """Generates a grammar given a list of actions""" + assert isinstance(actions, list), actions + assert not isinstance(actions[0], list), "Should be 1 long list of actions - {}".format(actions[0]) + assert len(actions) > 0, "Need to provide a list of at least 1 action" + assert isinstance(actions[0], int), "The actions should be integers" + new_actions, all_rules, rule_usage, rules_episode_appearance_count = self.discover_all_rules_and_new_actions_representation(actions) + action_usage = self.extract_action_usage_from_rule_usage(rule_usage, all_rules) + rules_episode_appearance_count = self.extract_action_usage_from_rule_usage(rules_episode_appearance_count, + all_rules) + return new_actions, all_rules, action_usage, rules_episode_appearance_count + + def discover_all_rules_and_new_actions_representation(self, actions): + """Takes in a list of actions and discovers all the rules present that get used more than self.k times and the + subsequent new actions list when all rules are applied recursively""" + all_rules = {} + current_actions = None + new_actions = actions + rule_usage = defaultdict(int) + num_episodes = Counter(actions)[self.end_of_episode_symbol] + rules_episode_appearance_tracker = {k: defaultdict(int) for k in range(num_episodes)} + + while new_actions != current_actions: + current_actions = new_actions + rules, reverse_rules = self.generate_1_layer_of_rules(current_actions) + all_rules.update(rules) + new_actions, rules_usage_count = self.convert_a_string_using_reverse_rules(current_actions, reverse_rules, + rules_episode_appearance_tracker) + for key in rules_usage_count.keys(): + rule_usage[key] += rules_usage_count[key] + + rules_episode_appearance_count = defaultdict(int) + + for episode in range(num_episodes): + rule_apperance_tracker = rules_episode_appearance_tracker[episode] + for key in rule_apperance_tracker.keys(): + if rule_apperance_tracker[key] == 1: + rules_episode_appearance_count[key] += 1 + + return new_actions, all_rules, rule_usage, rules_episode_appearance_count + + def generate_1_layer_of_rules(self, string): + """Generate dictionaries indicating the pair of symbols that appear next to each other more than self.k times""" + pairs_of_symbols = defaultdict(int) + last_pair = None + skip_next_symbol = False + rules = {} + + assert string[-1] == self.end_of_episode_symbol, "Final element of string must be self.end_of_episode_symbol {}".format(string) + + for ix in range(len(string) - 1): + # We skip the next symbol if it is already being used in a rule we just made + if skip_next_symbol: + skip_next_symbol = False + continue + + pair = (string[ix], string[ix+1]) + + # We don't count a pair if it was the previous pair (and therefore we have 3 of the same symbols in a row) + if pair != last_pair: + pairs_of_symbols[pair] += 1 + last_pair = pair + else: last_pair = None + 
if pairs_of_symbols[pair] >= self.k: + previous_pair = (string[ix-1], string[ix]) + pairs_of_symbols[previous_pair] -= 1 + skip_next_symbol = True + if pair not in rules.values() and self.end_of_episode_symbol not in pair: + rule_name = self.get_next_rule_name() + rules[rule_name] = pair + reverse_rules = {v: k for k, v in rules.items()} + return rules, reverse_rules + + def get_next_rule_name(self): + """Returns next rule name to use and increments count """ + next_rule_name = "R{}".format(self.next_rule_name_ix) + self.next_rule_name_ix += 1 + return next_rule_name + + def convert_symbol_to_raw_actions(self, symbol, rules): + """Converts a symbol back to the sequence of raw actions it represents""" + assert not isinstance(symbol, list) + assert isinstance(symbol, str) or isinstance(symbol, int) + symbol = [symbol] + finished = False + while not finished: + new_symbol = [] + for symbol_val in symbol: + if symbol_val in rules.keys(): + new_symbol.append(rules[symbol_val][0]) + new_symbol.append(rules[symbol_val][1]) + else: + new_symbol.append(symbol_val) + if new_symbol == symbol: finished = True + else: symbol = new_symbol + new_symbol = tuple(new_symbol) + return new_symbol + + def extract_action_usage_from_rule_usage(self, rule_usage, all_rules): + """Extracts the usage of each action (of 2 or more primitive actions) out from the usage of each rule""" + action_usage = {} + for key in rule_usage.keys(): + action_usage[self.convert_symbol_to_raw_actions(key, all_rules)] = rule_usage[key] + return action_usage + + def convert_a_string_using_reverse_rules(self, string, reverse_rules, rules_episode_appearance_tracker): + """Converts a string using the rules we have previously generated""" + new_string = [] + skip_next_element = False + rules_usage_count = defaultdict(int) + + episode = 0 + + rules_used_this_episode = [] + + for ix in range(len(string)): + if string[ix] == self.end_of_episode_symbol: + rules_used_this_episode = set(rules_used_this_episode) + for rule in rules_used_this_episode: + rules_episode_appearance_tracker[episode][rule] = 1 + rules_used_this_episode = [] + episode += 1 + + if skip_next_element: + skip_next_element = False + continue + # If is last element in string and wasn't just part of a pair then we add it to new string and finish + if ix == len(string) - 1: + new_string.append(string[ix]) + continue + pair = (string[ix], string[ix+1]) + if pair in reverse_rules.keys(): + result = reverse_rules[pair] + rules_usage_count[result] += 1 + rules_used_this_episode.append(result) + new_string.append(result) + skip_next_element = True + else: + new_string.append(string[ix]) + return new_string, rules_usage_count +
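+
+# Illustrative usage (an added sketch, not part of the original algorithm). The
+# trailing "/" is the end-of-episode symbol that generate_1_layer_of_rules expects:
+#
+#     grammar = k_Sequitur(k=2)
+#     new_actions, rules, action_usage, episode_counts = grammar.generate_action_grammar(
+#         [0, 1, 2, 3, 0, 1, 2, 3, "/"])
+#     # new_actions  -> e.g. ['R2', 'R2', '/']
+#     # rules        -> e.g. {'R0': (0, 1), 'R1': (2, 3), 'R2': ('R0', 'R1')}
+#     # action_usage -> usage counts keyed by the primitive-action tuple each rule expands to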