jam_shield_LLM_app/utilities/Parallel_Experience_Generator.py
import random
import torch
import sys
from contextlib import closing
# from pathos.multiprocessing import ProcessingPool as Pool
from torch.multiprocessing import Pool
from random import randint
from utilities.OU_Noise import OU_Noise
from utilities.Utility_Functions import create_actor_distribution


class Parallel_Experience_Generator(object):
    """Plays n episodes in parallel using a fixed agent. Only works for PPO or DDPG type agents at the moment, not Q-learning agents."""
def __init__(self, environment, policy, seed, hyperparameters, action_size, use_GPU=False, action_choice_output_columns=None):
self.use_GPU = use_GPU
self.environment = environment
        # Infer whether the action space is discrete or continuous from its dtype
        self.action_types = "DISCRETE" if self.environment.action_space.dtype in [int, 'int64'] else "CONTINUOUS"
self.action_size = action_size
self.policy = policy
self.action_choice_output_columns = action_choice_output_columns
self.hyperparameters = hyperparameters
        if self.action_types == "CONTINUOUS":
            self.noise = OU_Noise(self.action_size, seed, self.hyperparameters["mu"],
                                  self.hyperparameters["theta"], self.hyperparameters["sigma"])

def play_n_episodes(self, n, exploration_epsilon=None):
"""Plays n episodes in parallel using the fixed policy and returns the data"""
self.exploration_epsilon = exploration_epsilon
with closing(Pool(processes=n)) as pool:
results = pool.map(self, range(n))
pool.terminate()
states_for_all_episodes = [episode[0] for episode in results]
actions_for_all_episodes = [episode[1] for episode in results]
rewards_for_all_episodes = [episode[2] for episode in results]
return states_for_all_episodes, actions_for_all_episodes, rewards_for_all_episodes

    def __call__(self, n):
        """Invoked by each pool worker: jitters the exploration epsilon so workers explore at different
        levels, then plays one episode."""
        exploration = max(0.0, random.uniform(self.exploration_epsilon / 3.0, self.exploration_epsilon * 3.0))
        return self.play_1_episode(exploration)

def play_1_episode(self, epsilon_exploration):
"""Plays 1 episode using the fixed policy and returns the data"""
state = self.reset_game()
done = False
episode_states = []
episode_actions = []
episode_rewards = []
while not done:
action = self.pick_action(self.policy, state, epsilon_exploration)
next_state, reward, done, _ = self.environment.step(action)
            if self.hyperparameters["clip_rewards"]:
                reward = max(min(reward, 1.0), -1.0)  # clip rewards to [-1, 1]
episode_states.append(state)
episode_actions.append(action)
episode_rewards.append(reward)
state = next_state
        return episode_states, episode_actions, episode_rewards

def reset_game(self):
"""Resets the game environment so it is ready to play a new episode"""
seed = randint(0, sys.maxsize)
torch.manual_seed(seed) # Need to do this otherwise each worker generates same experience
state = self.environment.reset()
        if self.action_types == "CONTINUOUS":
            self.noise.reset()
        return state

def pick_action(self, policy, state, epsilon_exploration=None):
"""Picks an action using the policy"""
        if self.action_types == "DISCRETE":
            # Epsilon-greedy exploration: with probability epsilon take a uniformly random action
            if epsilon_exploration is not None and random.random() <= epsilon_exploration:
                return random.randint(0, self.action_size - 1)
state = torch.from_numpy(state).float().unsqueeze(0)
actor_output = policy.forward(state)
if self.action_choice_output_columns is not None:
actor_output = actor_output[:, self.action_choice_output_columns]
action_distribution = create_actor_distribution(self.action_types, actor_output, self.action_size)
action = action_distribution.sample().cpu()
        if self.action_types == "CONTINUOUS":
            action += torch.Tensor(self.noise.sample())
        else:
            action = action.item()
return action
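

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the module). It assumes
# a Gym-style environment with a discrete action space, an actor network
# `policy` whose forward pass returns a row of action probabilities, and a
# hyperparameters dict containing the keys this class reads ("mu", "theta",
# "sigma", "clip_rewards"). All names below are placeholders.
# ---------------------------------------------------------------------------
# import gym
#
# env = gym.make("CartPole-v1")
# policy = MyActorNetwork(env.observation_space.shape[0], env.action_space.n)  # hypothetical actor
# hyperparameters = {"mu": 0.0, "theta": 0.15, "sigma": 0.2, "clip_rewards": False}
#
# generator = Parallel_Experience_Generator(env, policy, seed=1,
#                                           hyperparameters=hyperparameters,
#                                           action_size=env.action_space.n)
# states, actions, rewards = generator.play_n_episodes(n=4, exploration_epsilon=0.1)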