Spaces:
Running
Running
File size: 47,813 Bytes
be5548b |
|
from abc import ABC, abstractmethod
import numpy as np
import torch
from torch_ac.format import default_preprocess_obss
from torch_ac.utils import DictList, ParallelEnv
from torch_ac.intrinsic_reward_models import *
from collections import Counter
class BaseAlgo(ABC):
"""The base class for RL algorithms."""
def __init__(self,
             envs,
             acmodel,
             device,
             num_frames_per_proc,
             discount,
             lr,
             gae_lambda,
             entropy_coef,
             value_loss_coef,
             max_grad_norm,
             recurrence,
             preprocess_obss,
             reshape_reward,
             exploration_bonus=False,
             exploration_bonus_params=None,
             exploration_bonus_tanh=None,
             expert_exploration_bonus=False,
             exploration_bonus_type="lang",
             episodic_exploration_bonus=True,
             utterance_moa_net=True,  # used for social influence
             clipped_rewards=False,
             # default is set to fit RND
             intrinsic_reward_loss_coef=0.1,  # also used for social influence
             intrinsic_reward_coef=0.1,  # also used for social influence
             intrinsic_reward_learning_rate=0.0001,
             intrinsic_reward_momentum=0,
             intrinsic_reward_epsilon=0.01,
             intrinsic_reward_alpha=0.99,
             intrinsic_reward_max_grad_norm=40,
             intrinsic_reward_forward_loss_coef=10,
             intrinsic_reward_inverse_loss_coef=0.1,
             reset_rnd_ride_at_phase=False,
             # social_influence
             balance_moa_training=False,
             moa_memory_dim=128,
             ):
    """
    Initializes a `BaseAlgo` instance.

    Parameters:
    ----------
    envs : list
        a list of environments that will be run in parallel
    acmodel : torch.Module
        the model
    device : torch.device or str
        the device the model and all rollout buffers are placed on
    num_frames_per_proc : int
        the number of frames collected by every process for an update
    discount : float
        the discount for future rewards
    lr : float
        the learning rate for optimizers
    gae_lambda : float
        the lambda coefficient in the GAE formula
        ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
    entropy_coef : float
        the weight of the entropy cost in the final objective
    value_loss_coef : float
        the weight of the value loss in the final objective
    max_grad_norm : float
        gradient will be clipped to be at most this value
    recurrence : int
        the number of steps the gradient is propagated back in time
    preprocess_obss : function
        a function that takes observations returned by the environment
        and converts them into the format that the model can handle;
        `default_preprocess_obss` is used when a falsy value is given
    reshape_reward : function
        a function that shapes the reward, takes an
        (observation, action, reward, done) tuple as an input
    exploration_bonus : bool
        whether the exploration-bonus machinery is set up
    exploration_bonus_params : sequence or None
        flat sequence holding two consecutive values (C, M) per entry of
        `exploration_bonus_type`; only read for the count-based types
        ("cell", "grid", "lang"), which fall back to (100, 50.)
    exploration_bonus_tanh : sequence or None
        one value per entry of `exploration_bonus_type` (count-based
        types only), or None to store None for every type
    expert_exploration_bonus : bool
        stored as-is
    exploration_bonus_type : str or sequence of str
        bonus types to set up; every entry must be one of "rnd", "ride",
        "soc_inf", "cell", "grid" or "lang". A single string is treated
        as a one-element list.
    episodic_exploration_bonus : bool
        count-based types keep one counter per process when True and a
        single shared counter otherwise; must be False for "rnd"
    utterance_moa_net : bool
        whether the social-influence MOA network also takes/predicts NPC
        utterance actions
    clipped_rewards : bool
        stored as-is
    intrinsic_reward_* : float
        hyper-parameters of the intrinsic reward models, stored as-is
    reset_rnd_ride_at_phase : bool
        stored as-is
    balance_moa_training : bool
        stored as-is
    moa_memory_dim : int
        `memory_dim` handed to `LSTMMoaNet`

    Raises
    ------
    ValueError
        if `exploration_bonus` is set and an unknown bonus type is given
    """
    # Store parameters
    self.env = ParallelEnv(envs)
    self.acmodel = acmodel
    self.device = device
    self.num_frames_per_proc = num_frames_per_proc
    self.discount = discount
    self.lr = lr
    self.gae_lambda = gae_lambda
    self.entropy_coef = entropy_coef
    self.value_loss_coef = value_loss_coef
    self.max_grad_norm = max_grad_norm
    self.recurrence = recurrence
    self.preprocess_obss = preprocess_obss or default_preprocess_obss
    self.reshape_reward = reshape_reward
    self.exploration_bonus = exploration_bonus
    self.expert_exploration_bonus = expert_exploration_bonus
    # A bare string would be enumerated character by character below, so
    # the default "lang" used to be rejected as the unknown type "l".
    if isinstance(exploration_bonus_type, str):
        exploration_bonus_type = [exploration_bonus_type]
    self.exploration_bonus_type = exploration_bonus_type
    self.episodic_exploration_bonus = episodic_exploration_bonus
    self.clipped_rewards = clipped_rewards
    self.update_epoch = 0
    self.utterance_moa_net = utterance_moa_net  # todo: as parameter
    self.reset_rnd_ride_at_phase = reset_rnd_ride_at_phase
    self.was_reset = False

    # Control parameters
    assert self.acmodel.recurrent or self.recurrence == 1
    assert self.num_frames_per_proc % self.recurrence == 0

    # Configure acmodel
    self.acmodel.to(self.device)
    self.acmodel.train()

    # Store helpers values
    self.num_procs = len(envs)
    self.num_frames = self.num_frames_per_proc * self.num_procs

    # Initialize experience values
    shape = (self.num_frames_per_proc, self.num_procs)
    self.obs = self.env.reset()
    self.obss = [None]*(shape[0])
    # `self.info` holds the previous step's info of every process, so it
    # has one (initially empty) entry per process, not per frame.
    self.info = [{}]*(shape[1])
    self.infos = [None]*(shape[0])
    if self.acmodel.recurrent:
        self.memory = torch.zeros(shape[1], self.acmodel.memory_size, device=self.device)
        self.memories = torch.zeros(*shape, self.acmodel.memory_size, device=self.device)
    self.mask = torch.ones(shape[1], device=self.device)
    self.masks = torch.zeros(*shape, device=self.device)
    self.next_masks = torch.zeros(*shape, device=self.device)
    self.values = torch.zeros(*shape, device=self.device)
    self.next_values = torch.zeros(*shape, device=self.device)
    self.rewards = torch.zeros(*shape, device=self.device)
    self.extrinsic_rewards = torch.zeros(*shape, device=self.device)
    self.advantages = torch.zeros(*shape, device=self.device)
    # as_shape = self.env.envs[0].action_space.shape
    as_shape = self.acmodel.model_raw_action_space.shape
    self.actions = torch.zeros(*(shape+as_shape), device=self.device, dtype=torch.int)
    self.log_probs = torch.zeros(*(shape+as_shape), device=self.device)

    # Initialize log values
    self.log_episode_return = torch.zeros(self.num_procs, device=self.device)
    self.log_episode_extrinsic_return = torch.zeros(self.num_procs, device=self.device)
    self.log_episode_exploration_bonus = torch.zeros(self.num_procs, device=self.device)
    self.log_episode_success_rate = torch.zeros(self.num_procs, device=self.device)
    self.log_episode_curriculum_mean_perf = torch.zeros(self.num_procs, device=self.device)
    self.log_episode_reshaped_return = torch.zeros(self.num_procs, device=self.device)
    self.log_episode_num_frames = torch.zeros(self.num_procs, device=self.device)
    self.log_episode_mission_string_observed = torch.zeros(self.num_procs, device=self.device)
    self.log_episode_NPC_introduced_to = np.zeros(self.num_procs).astype(bool)
    self.log_episode_curriculum_param = torch.zeros(self.num_procs, device=self.device)

    self.intrinsic_reward_loss_coef = intrinsic_reward_loss_coef
    self.intrinsic_reward_coef = intrinsic_reward_coef
    self.intrinsic_reward_learning_rate = intrinsic_reward_learning_rate
    self.intrinsic_reward_momentum = intrinsic_reward_momentum
    self.intrinsic_reward_epsilon = intrinsic_reward_epsilon
    self.intrinsic_reward_alpha = intrinsic_reward_alpha
    self.intrinsic_reward_max_grad_norm = intrinsic_reward_max_grad_norm
    self.intrinsic_reward_forward_loss_coef = intrinsic_reward_forward_loss_coef
    self.intrinsic_reward_inverse_loss_coef = intrinsic_reward_inverse_loss_coef
    self.balance_moa_training = balance_moa_training
    self.moa_memory_dim = moa_memory_dim

    self.log_done_counter = 0
    self.log_return = [0] * self.num_procs
    self.log_extrinsic_return = [0] * self.num_procs
    self.log_exploration_bonus = [0] * self.num_procs
    self.log_success_rate = [0] * self.num_procs
    self.log_curriculum_max_mean_perf = [0] * self.num_procs
    self.log_curriculum_param = [0] * self.num_procs
    self.log_reshaped_return = [0] * self.num_procs
    self.log_num_frames = [0] * self.num_procs
    self.log_mission_string_observed = [0] * self.num_procs
    self.log_NPC_introduced_to = [False] * self.num_procs
    self.images_counter = [Counter() for _ in range(self.num_procs)]

    # Always create the bonus dictionaries: `collect_experiences` resets
    # `visitation_counter` at episode end whenever the episodic flag is
    # set, even if no exploration bonus is configured.
    self.visitation_counter = {}
    self.exploration_bonus_params = {}
    self.exploration_bonus_tanh = {}

    if self.exploration_bonus:
        for i, bonus_type in enumerate(self.exploration_bonus_type):
            if bonus_type == "rnd":
                assert not self.episodic_exploration_bonus
                self.init_rnd_networks_and_optimizer()

            elif bonus_type == "ride":
                self.init_ride_networks_and_optimizer()

            elif bonus_type == "soc_inf":
                # npc actions
                self.fn_name_to_npc_prim_act = self.env.envs[0].npc_prim_actions_dict
                self.num_npc_prim_actions = len(self.fn_name_to_npc_prim_act)
                self.npc_utterance_to_id = {a: i for i, a in enumerate(self.env.envs[0].all_npc_utterance_actions)}
                self.num_npc_utterance_actions = len(self.npc_utterance_to_id)
                if self.utterance_moa_net:
                    self.num_npc_all_actions = self.num_npc_prim_actions * self.num_npc_utterance_actions
                else:
                    self.num_npc_all_actions = self.num_npc_prim_actions

                # construct possible agent_action's list
                self.all_possible_agent_actions, self.act_to_ind_dict = self.construct_all_possible_agent_actions()
                # filled lazily on the first social-influence bonus computation
                self.agent_actions_tiled_all = None

                # MOA input: policy embedding + NPC primitive action
                # + agent action dimensions 0, 2 and 3 of the raw action space
                embedding_size = self.acmodel.semi_memory_size
                input_size = embedding_size \
                    + self.num_npc_prim_actions \
                    + self.acmodel.model_raw_action_space.nvec[0] \
                    + self.acmodel.model_raw_action_space.nvec[2] \
                    + self.acmodel.model_raw_action_space.nvec[3]
                if self.utterance_moa_net:
                    input_size += self.num_npc_utterance_actions  # todo: feed as index or as text?

                self.moa_net = LSTMMoaNet(
                    input_size=input_size,
                    num_npc_prim_actions=self.num_npc_prim_actions,
                    num_npc_utterance_actions=self.num_npc_utterance_actions if self.utterance_moa_net else None,
                    acmodel=self.acmodel,
                    memory_dim=self.moa_memory_dim
                ).to(device=self.device)

                # memory
                assert shape == (self.num_frames_per_proc, self.num_procs)
                self.moa_memory = torch.zeros(shape[1], self.moa_net.memory_size, device=self.device)
                self.moa_memories = torch.zeros(*shape, self.moa_net.memory_size, device=self.device)

            elif bonus_type in ["cell", "grid", "lang"]:
                if self.episodic_exploration_bonus:
                    # one counter per process
                    self.visitation_counter[bonus_type] = [Counter() for _ in range(self.num_procs)]
                else:
                    self.visitation_counter[bonus_type] = Counter()

                if exploration_bonus_params:
                    # two consecutive values per bonus type
                    self.exploration_bonus_params[bonus_type] = exploration_bonus_params[2*i:2*i+2]
                else:
                    self.exploration_bonus_params[bonus_type] = (100, 50.)

                if exploration_bonus_tanh is None:
                    self.exploration_bonus_tanh[bonus_type] = None
                else:
                    self.exploration_bonus_tanh[bonus_type] = exploration_bonus_tanh[i]

            else:
                raise ValueError(f"bonus type: {bonus_type} unknown.")
def load_status_dict(self, status):
    """Restore the model (and auxiliary networks) from a status dictionary.

    `status["model_state"]` is loaded into the actor-critic model. If the
    environment has a curriculum, the curriculum is restored from the same
    dictionary and its parameters are broadcast to the environments. When
    an exploration bonus is configured, the networks/optimizers of each
    "rnd", "ride" and "soc_inf" bonus type are restored from the entries
    named after the corresponding attributes.

    The policy optimizer state is not restored here.
    """
    self.acmodel.load_state_dict(status["model_state"])

    curriculum = getattr(self.env, "curriculum", None)
    if curriculum is not None:
        curriculum.load_status_dict(status)
        self.env.broadcast_curriculum_parameters(curriculum.get_parameters())

    if not self.exploration_bonus:
        return

    # Status keys are identical to the attribute names; the order inside
    # each tuple is the order in which the entries are restored.
    restorable = {
        "rnd": (
            "random_target_network",
            "predictor_network",
            "intrinsic_reward_optimizer",
        ),
        "ride": (
            "forward_dynamics_model",
            "inverse_dynamics_model",
            "state_embedding_model",
            "state_embedding_optimizer",
            "inverse_dynamics_optimizer",
            "forward_dynamics_optimizer",
        ),
        "soc_inf": ("moa_net",),
    }
    for kind in self.exploration_bonus_type:
        for attr_name in restorable.get(kind, ()):
            getattr(self, attr_name).load_state_dict(status[attr_name])
def get_status_dict(self):
    """Collect everything `load_status_dict` needs into one dictionary.

    The result always holds the actor-critic weights under "model_state".
    The curriculum's status entries are merged in when the environment has
    a curriculum, and the state of every network/optimizer belonging to a
    configured "rnd", "ride" or "soc_inf" bonus is stored under the name of
    the corresponding attribute.
    """
    snapshot = {"model_state": self.acmodel.state_dict()}

    curriculum = getattr(self.env, "curriculum", None)
    if curriculum is not None:
        snapshot.update(curriculum.get_status_dict())

    if not self.exploration_bonus:
        return snapshot

    # Status keys are identical to the attribute names; the tuples keep the
    # order in which the entries are written.
    saved_attrs = {
        "rnd": (
            "random_target_network",
            "predictor_network",
            "intrinsic_reward_optimizer",
        ),
        "ride": (
            "forward_dynamics_model",
            "inverse_dynamics_model",
            "state_embedding_model",
            "state_embedding_optimizer",
            "inverse_dynamics_optimizer",
            "forward_dynamics_optimizer",
        ),
        "soc_inf": ("moa_net",),
    }
    for kind in self.exploration_bonus_type:
        for attr_name in saved_attrs.get(kind, ()):
            snapshot[attr_name] = getattr(self, attr_name).state_dict()
    return snapshot
def construct_all_possible_agent_actions(self):
    """Enumerate the agent's distinct actions and build an action -> index map.

    The enumeration lists the non-speaking actions `(a, 0, 0, 0)` first,
    followed by the speaking actions obtained by passing every
    `(a, 0, t, w)` combination through `acmodel.no_speak_to_speak_action`.
    Every non-speaking variant `(a, 0, t, w)` is additionally mapped to the
    index of `(a, 0, 0, 0)`, because the last two entries are ignored when
    the second entry says "do not speak".

    Returns
    -------
    (np.ndarray, dict)
        the array of enumerated actions and the tuple -> row index mapping

    Raises
    ------
    ValueError
        if no model has been set
    """
    if self.acmodel is None:
        raise ValueError("This should be called after the model has been set")

    nvec = self.acmodel.model_raw_action_space.nvec
    dim0, dim2, dim3 = nvec[0], nvec[2], nvec[3]

    # (?, 0, 0, 0): one silent action per value of the first dimension
    silent_rows = np.array(list(np.ndindex((dim0, 1, 1, 1))))

    # every (first dim, third dim, fourth dim) combination, turned into
    # speaking actions by the model
    full_grid = (dim0, 1, dim2, dim3)
    spoken_rows = np.array(list(np.ndindex(full_grid)))
    spoken_rows = self.acmodel.no_speak_to_speak_action(spoken_rows)

    enumerated = np.concatenate([silent_rows, spoken_rows])

    index_of = {}
    for position, row in enumerate(enumerated):
        index_of[tuple(row)] = position

    # alias the remaining silent variants, ex. (3, 0, 4, 12) -> (3, 0, 0, 0)
    for variant in np.ndindex(full_grid):
        assert variant[1] == 0  # non-speaking
        canonical = (variant[0], 0, 0, 0)
        index_of[tuple(variant)] = index_of[canonical]

    return enumerated, index_of
def step_to_n_frames(self, step):
    """Convert a number of updates into the number of frames they consume.

    Each update uses `num_frames_per_proc` frames from each of the
    `num_procs` processes.
    """
    frames_per_process = step * self.num_frames_per_proc
    return frames_per_process * self.num_procs
def calculate_exploration_bonus(self, obs=None, done=None, prev_obs=None, info=None, prev_info=None, agent_actions=None, dist=None,
                                i_step=None, embeddings=None):
    """Compute the per-process exploration bonus for one environment step.

    The bonuses of all types in `self.exploration_bonus_type` are summed.
    A process whose episode just finished (`done`) gets a bonus of 0 from
    every type.

    Parameters
    ----------
    obs : list of dict
        observations returned by the step, one per process; the "image"
        and/or "utterance" entries are read depending on the bonus type
    done : list of bool
        per-process episode-termination flags of the step
    prev_obs : list of dict
        observations the actions were computed from (used by "ride")
    info : list of dict
        infos returned by the step (currently not read)
    prev_info : list of dict
        infos of the previous step (used by "soc_inf"); a list of empty
        dicts marks the very first step
    agent_actions : tensor
        actions taken by the agent, one row per process (used by "soc_inf")
    dist : sequence of distributions
        policy output; the `.logits` of its four heads are read by "soc_inf"
    i_step : int
        index of the step inside the rollout (required by "soc_inf")
    embeddings : tensor
        policy embeddings fed to the MOA network (used by "soc_inf")

    Returns
    -------
    list
        total bonus per process

    Raises
    ------
    DeprecationWarning
        for count-based types when the expert bonus is requested or the
        bonus is not episodic
    ValueError
        for an unknown bonus type
    """

    def state_hashes(observation, exploration_bonus_type):
        """Return the hashable keys counted for `observation`."""
        if exploration_bonus_type == "lang":
            hashes = [observation['utterance']]
            assert len(hashes) == 1
        elif exploration_bonus_type == "cell":
            # for all new cells
            im = observation["image"]
            hashes = np.unique(im.reshape(-1, im.shape[-1]), axis=0)
            hashes = np.apply_along_axis(lambda a: a.data.tobytes(), 1, hashes)
        elif exploration_bonus_type == "grid":
            # for seeing new grid configurations
            im = observation["image"]
            hashes = [im.data.tobytes()]
            assert len(hashes) == 1
        else:
            # report the type this helper was actually called with
            raise ValueError(f"Unknown exploration bonus type {exploration_bonus_type}")
        return hashes

    total_bonus = [0]*len(obs)

    for bonus_type in self.exploration_bonus_type:
        if bonus_type == "rnd":
            # -- [unroll_length x batch_size x height x width x channels] == [1, n_proc, 7, 7, 4]
            batch = torch.tensor(np.array([[o['image'] for o in obs]])).to(self.device)
            with torch.no_grad():
                random_embedding = self.random_target_network(batch).reshape(len(obs), 128)
                predicted_embedding = self.predictor_network(batch).reshape(len(obs), 128)
                # prediction error of the predictor w.r.t. the random target
                intrinsic_rewards = torch.norm(predicted_embedding.detach() - random_embedding.detach(), dim=1, p=2)
                intrinsic_reward_coef = self.intrinsic_reward_coef
                intrinsic_rewards *= intrinsic_reward_coef
            # is this the best way? should we somehow extract the next_state?
            bonus = [0.0 if d else float(r) for d, r in zip(done, intrinsic_rewards)]

        elif bonus_type == "ride":
            with torch.no_grad():
                _obs = torch.tensor(np.array([[o['image'] for o in prev_obs]])).to(self.device)
                _next_obs = torch.tensor(np.array([[o['image'] for o in obs]])).to(self.device)
                # counts - number of times a state was seen during the SAME episode -> can be computed here
                count_rewards = torch.tensor([1/np.sqrt(self.images_counter[p_i][np.array(o.to("cpu")).tobytes()]) for p_i, o in enumerate(_next_obs[0])]).to(self.device)
                assert not any(torch.isinf(count_rewards))
                state_emb = self.state_embedding_model(_obs.to(device=self.device)).reshape(len(obs), 128)
                next_state_emb = self.state_embedding_model(_next_obs.to(device=self.device)).reshape(len(obs), 128)
                # change of the state embedding caused by the step
                control_rewards = torch.norm(next_state_emb - state_emb, dim=1, p=2)
                intrinsic_rewards = self.intrinsic_reward_coef*(count_rewards * control_rewards)
            # is this the best way? should we somehow extract the next_state?
            bonus = [0.0 if d else float(r) for d, r in zip(done, intrinsic_rewards)]

        elif bonus_type == "soc_inf":
            if prev_info == [{}] * len(prev_info):
                # this is the first step, info is not given during reset
                # first step in the episode no influence can be estimated as there is no previous action
                # todo: padd with zeros, and estimate anyway?
                bonus = [0.0 for _ in done]
            else:
                # social influence
                n_procs = len(obs)
                _prev_NPC_prim_actions = torch.tensor(
                    [self.fn_name_to_npc_prim_act[o["NPC_prim_action"]] for o in prev_info]
                ).to(self.device)
                # todo: what is the best way to feed utt action?
                _prev_NPC_utt_actions = torch.tensor(
                    [self.npc_utterance_to_id[o["NPC_utterance"]] for o in prev_info]
                ).to(self.device)

                # calculate counterfactuals
                npc_previous_prim_actions_all = _prev_NPC_prim_actions.repeat(len(self.all_possible_agent_actions))  # [A_ag*n_procs, ...]
                npc_previous_utt_actions_all = _prev_NPC_utt_actions.repeat(len(self.all_possible_agent_actions))  # [A_ag*n_procs, ...]

                # agent actions tiled
                if self.agent_actions_tiled_all is not None:
                    agent_actions_tiled_all = self.agent_actions_tiled_all
                else:
                    # only first time, we can't do it in init because we need len(im_obs)
                    agent_actions_tiled_all = []
                    for pot_agent_action in self.all_possible_agent_actions:
                        pot_agent_action_tiled = torch.from_numpy(np.tile(pot_agent_action, (n_procs, 1)))  # [n_procs,...]
                        agent_actions_tiled_all.append(pot_agent_action_tiled.to(self.device))
                    agent_actions_tiled_all = torch.concat(agent_actions_tiled_all)  # [A_ag*n_procs,....]
                    self.agent_actions_tiled_all = agent_actions_tiled_all

                with torch.no_grad():
                    # todo: move this tiling above?
                    masked_memory = self.moa_memory * self.mask.unsqueeze(1)
                    masked_memory_tiled_all = masked_memory.repeat([len(self.all_possible_agent_actions), 1])
                    embedding_tiled_all = embeddings.repeat([len(self.all_possible_agent_actions), 1])

                    # use current memory for every action
                    counterfactuals_logits, moa_memory = self.moa_net(
                        embeddings=embedding_tiled_all,
                        # observations=observations_all,
                        npc_previous_prim_actions=npc_previous_prim_actions_all,
                        npc_previous_utterance_actions=npc_previous_utt_actions_all if self.utterance_moa_net else None,
                        agent_actions=agent_actions_tiled_all,
                        memory=masked_memory_tiled_all
                    )  # logits : [A_ag * n_procs, A_npc]

                    counterfactuals_logits = counterfactuals_logits.reshape(
                        [len(self.all_possible_agent_actions), n_procs, self.num_npc_all_actions])
                    counterfactuals_logits = counterfactuals_logits.swapaxes(0, 1)  # [n_procs, A_ag, A_npc]
                    assert counterfactuals_logits.shape == (len(obs), len(self.all_possible_agent_actions), self.num_npc_all_actions)

                    # compute npc logits p(A_npc|A_ag, s)
                    # note: ex (5,0,5,2) is mapped to (5,0,0,0), todo: is this ok everywhere?
                    agent_action_indices = [self.act_to_ind_dict[tuple(act.cpu().numpy())] for act in agent_actions]

                    # ~ p(a_npc| a_ag, ...)
                    predicted_logits = torch.stack([ctr[ind] for ctr, ind in zip(counterfactuals_logits, agent_action_indices)])

                    assert i_step is not None
                    self.moa_memories[i_step] = self.moa_memory

                    # only save for the actions actually taken
                    # NOTE(review): `moa_memory` presumably has A_ag*n_procs rows laid out
                    # like the tiled inputs; indexing it with per-process action indices
                    # would then select rows of the first processes rather than
                    # (taken action, process) pairs - TODO confirm against LSTMMoaNet.
                    self.moa_memory = moa_memory[agent_action_indices]

                    assert predicted_logits.shape == (len(obs), self.num_npc_all_actions)
                    predicted_probs = torch.softmax(predicted_logits, dim=1)  # use exp_softmax or something?

                    # compute marginal npc logits p(A_npc|s) = sum( p(A_NPC|A_ag,s), for every A_ag )

                    # compute agent logits for all possible agent actions
                    per_non_speaking_action_log_probs = dist[0].logits + dist[1].logits[:, :1]

                    per_speaking_action_log_probs = []
                    for p in range(n_procs):
                        log_probs_for_proc_p = [d.logits[p].cpu().numpy() for d in dist]

                        # speaking actions
                        speaking_log_probs = log_probs_for_proc_p
                        speaking_log_probs[1] = speaking_log_probs[1][1:]  # only the speak action

                        # sum everybody with everybody
                        out = np.add.outer(speaking_log_probs[0], speaking_log_probs[1]).reshape(-1)
                        out = np.add.outer(out, speaking_log_probs[2]).reshape(-1)
                        out = np.add.outer(out, speaking_log_probs[3]).reshape(-1)
                        per_speaking_action_log_probs_proc_p = out

                        per_speaking_action_log_probs.append(per_speaking_action_log_probs_proc_p)

                    per_speaking_action_log_probs = np.stack(per_speaking_action_log_probs)

                    agent_log_probs = torch.concat([
                        per_non_speaking_action_log_probs,
                        torch.tensor(per_speaking_action_log_probs).to(device=self.device),
                    ], dim=1)

                    # sanity check of the enumeration order on two hard-coded speaking actions
                    for p in range(n_procs):
                        log_probs_for_proc_p = [d.logits[p].cpu().numpy() for d in dist]
                        assert torch.abs(agent_log_probs[p][self.act_to_ind_dict[(0, 1, 3, 1)]] - sum([p[a] for p, a in list(zip(log_probs_for_proc_p, (0, 1, 3, 1)))])) < 1e-5
                        assert torch.abs(agent_log_probs[p][self.act_to_ind_dict[(0, 1, 1, 10)]] - sum([p[a] for p, a in list(zip(log_probs_for_proc_p, (0, 1, 1, 10)))])) < 1e-5

                    agent_probs = agent_log_probs.exp()

                    counterfactuals_probs = counterfactuals_logits.softmax(dim=-1)  # [n_procs, A_ag, A_npc]
                    counterfactuals_perm = counterfactuals_probs.permute(0, 2, 1)  # [n_procs, A_npc, A_agent]

                    # compute marginal distributions
                    marginals = (counterfactuals_perm * agent_probs[:, None, :]).sum(-1)

                    # this already sums to one, so the following normalization is not needed
                    marginal_probs = marginals / marginals.sum(1, keepdims=True)  # sum over npc_actions

                    assert marginal_probs.shape == (n_procs, self.num_npc_all_actions)  # [batch, A_npc]

                    # KL(p(a_npc | taken agent action) || marginal p(a_npc))
                    KL_loss = (predicted_probs * (predicted_probs.log() - marginal_probs.log())).sum(axis=-1)
                    intrinsic_rewards = self.intrinsic_reward_coef * KL_loss

                    # is the NPC observed in the image that is fed as input in this step
                    # (returned by the previous step() call )
                    NPC_observed = torch.tensor([pi["NPC_observed"] for pi in prev_info]).to(self.device)
                    intrinsic_rewards = intrinsic_rewards * NPC_observed

                bonus = [0.0 if d else float(r) for d, r in zip(done, intrinsic_rewards)]

        elif bonus_type in ["cell", "grid", "lang"]:
            C, M = self.exploration_bonus_params[bonus_type]
            C_ = C / self.num_frames_per_proc

            if self.expert_exploration_bonus:
                # expert
                raise DeprecationWarning("Deprecated exploration bonus type")
            if not self.episodic_exploration_bonus:
                # the non-episodic variant is no longer supported
                raise DeprecationWarning("Use episodic exploration bonus.")

            hashes = [state_hashes(o, bonus_type) for o in obs]
            bonus = [
                0 if d else  # no bonus if done
                np.sum([
                    C_ / ((self.visitation_counter[bonus_type][i_p][h] + 1) ** M) for h in hs
                ])
                for i_p, (hs, d) in enumerate(zip(hashes, done))
            ]

            # update the counters
            for i_p, (o, d, hs) in enumerate(zip(obs, done, hashes)):
                if not d:
                    for h in hs:
                        self.visitation_counter[bonus_type][i_p][h] += 1

            if self.exploration_bonus_tanh[bonus_type] is not None:
                bonus = [np.tanh(b)*self.exploration_bonus_tanh[bonus_type] for b in bonus]

        else:
            raise ValueError(f"Unknown exploration bonus type {bonus_type}")

        assert len(total_bonus) == len(bonus)
        total_bonus = [tb+b for tb, b in zip(total_bonus, bonus)]

    return total_bonus
def collect_experiences(self):
    """Collects rollouts and computes advantages.

    Runs several environments concurrently. The next actions are computed
    in a batch mode for all environments at the same time. The rollouts
    and advantages from all environments are concatenated together.

    Returns
    -------
    exps : DictList
        Contains actions, rewards, advantages etc as attributes.
        Each attribute, e.g. `exps.reward` has a shape
        (self.num_frames_per_proc * num_envs, ...). k-th block
        of consecutive `self.num_frames_per_proc` frames contains
        data obtained from the k-th environment. Be careful not to mix
        data from different environments!
    logs : dict
        Useful stats about the training process, including the average
        reward, policy loss, value loss, etc.
    """
    for i_step in range(self.num_frames_per_proc):
        # Do one agent-environment interaction

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)

        with torch.no_grad():
            if self.acmodel.recurrent:
                dist, value, memory, policy_embedding = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1), return_embeddings=True)
            else:
                dist, value, policy_embedding = self.acmodel(preprocessed_obs, return_embeddings=True)

        action = self.acmodel.sample_action(dist)

        obs, reward, done, info = self.env.step(
            self.acmodel.construct_final_action(
                action.cpu().numpy()
            )
        )

        if hasattr(self.env, "curriculum") and self.env.curriculum is not None:
            curriculum_params = self.env.curriculum.update_parameters({
                "obs": obs,
                "reward": reward,
                "done": done,
                "info": info,
            })
            # broadcast new parameters to all parallel environments
            self.env.broadcast_curriculum_parameters(curriculum_params)

            # re-initialise the RND / RIDE networks once, when phase 2 is reached
            if self.reset_rnd_ride_at_phase and curriculum_params['phase'] == 2 and not self.was_reset:
                self.was_reset = True
                assert not self.episodic_exploration_bonus

                for i, bonus_type in enumerate(self.exploration_bonus_type):
                    if bonus_type == "rnd":
                        self.init_rnd_networks_and_optimizer()
                    elif bonus_type == "ride":
                        self.init_ride_networks_and_optimizer()

        # per-process, per-episode counts of the observed images
        for p_i, o in enumerate(obs):
            self.images_counter[p_i][o['image'].tobytes()] += 1

        extrinsic_reward = reward
        exploration_bonus = (0,) * len(reward)

        if self.exploration_bonus:
            bonus = self.calculate_exploration_bonus(
                obs=obs, done=done, prev_obs=self.obs, info=info, prev_info=self.info, agent_actions=action, dist=dist,
                i_step=i_step, embeddings=policy_embedding,
            )
            exploration_bonus = bonus
            reward = [r + b for r, b in zip(reward, bonus)]

        # NOTE(review): nesting of this check relative to the bonus branch above
        # could not be recovered from the flattened source - confirm.
        if self.clipped_rewards:
            # this should not be used with classic count-based rewards as they often,
            # when combined with extr. rew go past 1.0
            reward = list(map(float, torch.clamp(torch.tensor(reward), -1, 1)))

        # Update experiences values

        self.obss[i_step] = self.obs
        self.obs = obs

        self.infos[i_step] = info  # info of this step is the current info
        self.info = info  # save as previous info

        if self.acmodel.recurrent:
            self.memories[i_step] = self.memory
            self.memory = memory

        self.masks[i_step] = self.mask
        self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
        self.actions[i_step] = action
        self.values[i_step] = value
        if self.reshape_reward is not None:
            self.rewards[i_step] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
            ], device=self.device)
        else:
            self.rewards[i_step] = torch.tensor(reward, device=self.device)
        self.log_probs[i_step] = self.acmodel.calculate_log_probs(dist, action)

        # Update log values

        self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
        self.log_episode_extrinsic_return += torch.tensor(extrinsic_reward, device=self.device, dtype=torch.float)
        self.log_episode_exploration_bonus += torch.tensor(exploration_bonus, device=self.device, dtype=torch.float)
        self.log_episode_success_rate = torch.tensor([i["success"] for i in info]).float().to(self.device)
        self.log_episode_curriculum_mean_perf = torch.tensor([i.get("curriculum_info_max_mean_perf", 0) for i in info]).float().to(self.device)
        self.log_episode_reshaped_return += self.rewards[i_step]
        self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)
        self.log_episode_curriculum_param = torch.tensor([i.get("curriculum_info_param", 0.0) for i in info]).float().to(self.device)
        # self.log_episode_curriculum_param = torch.tensor([i.get("curriculum_info_mean_perf", 0.0) for i in info]).float().to(self.device)

        # All processes must report the same curriculum parameter. Compared
        # element-wise because `var()` is NaN for a single process.
        assert bool((self.log_episode_curriculum_param == self.log_episode_curriculum_param[0]).all())

        log_episode_NPC_introduced_to_current = np.array([i.get('NPC_was_introduced_to', False) for i in info])
        # the flag may only switch from False to True within an episode
        assert all((self.log_episode_NPC_introduced_to | log_episode_NPC_introduced_to_current) == log_episode_NPC_introduced_to_current)
        self.log_episode_NPC_introduced_to = self.log_episode_NPC_introduced_to | log_episode_NPC_introduced_to_current

        self.log_episode_mission_string_observed += torch.tensor([
            float(m in o.get("utterance", ''))
            for m, o in zip(self.env.get_mission(), self.obs)
        ], device=self.device, dtype=torch.float)

        for p, done_ in enumerate(done):
            if done_:
                self.log_mission_string_observed.append(
                    torch.clamp(self.log_episode_mission_string_observed[p], 0, 1).item()
                )
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[p].item())
                self.log_extrinsic_return.append(self.log_episode_extrinsic_return[p].item())
                self.log_exploration_bonus.append(self.log_episode_exploration_bonus[p].item())
                self.log_success_rate.append(self.log_episode_success_rate[p].item())
                self.log_curriculum_max_mean_perf.append(self.log_episode_curriculum_mean_perf[p].item())
                self.log_reshaped_return.append(self.log_episode_reshaped_return[p].item())
                self.log_num_frames.append(self.log_episode_num_frames[p].item())
                self.log_curriculum_param.append(self.log_episode_curriculum_param[p].item())

                # `visitation_counter` only exists when an exploration bonus
                # is configured, so guard on both flags.
                if self.exploration_bonus and self.episodic_exploration_bonus:
                    for v in self.visitation_counter.values():
                        v[p] = Counter()

                self.images_counter[p] = Counter()
                self.log_NPC_introduced_to.append(self.log_episode_NPC_introduced_to[p])

        # print("log history:", self.log_success_rate)
        # print("log history len:", len(self.log_success_rate)-16)

        # zero the running statistics of the processes whose episode ended
        self.log_episode_mission_string_observed *= self.mask
        self.log_episode_return *= self.mask
        self.log_episode_extrinsic_return *= self.mask
        self.log_episode_exploration_bonus *= self.mask
        self.log_episode_success_rate *= self.mask
        self.log_episode_curriculum_mean_perf *= self.mask
        self.log_episode_reshaped_return *= self.mask
        self.log_episode_num_frames *= self.mask
        self.log_episode_NPC_introduced_to *= self.mask.cpu().numpy().astype(bool)
        self.log_episode_curriculum_param *= self.mask

    # Add advantage and return to experiences

    preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
    with torch.no_grad():
        if self.acmodel.recurrent:
            _, next_value, _ = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1))
        else:
            _, next_value = self.acmodel(preprocessed_obs)

    # backward pass over the rollout: the last frame bootstraps from the
    # value of the current observation
    for f in reversed(range(self.num_frames_per_proc)):
        next_mask = self.masks[f+1] if f < self.num_frames_per_proc - 1 else self.mask
        next_value = self.values[f+1] if f < self.num_frames_per_proc - 1 else next_value
        next_advantage = self.advantages[f+1] if f < self.num_frames_per_proc - 1 else 0

        self.next_masks[f] = next_mask
        self.next_values[f] = next_value

        delta = self.rewards[f] + self.discount * next_value * next_mask - self.values[f]
        self.advantages[f] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

    # Define experiences:
    #   the whole experience is the concatenation of the experience
    #   of each process.
    # In comments below:
    #   - T is self.num_frames_per_proc,
    #   - P is self.num_procs,
    #   - D is the dimensionality.

    exps = DictList()
    exps.obs = [self.obss[f][p]
                for p in range(self.num_procs)
                for f in range(self.num_frames_per_proc)]
    exps.infos = np.array([self.infos[f][p]
                           for p in range(self.num_procs)
                           for f in range(self.num_frames_per_proc)])
    # obs: (p1 (f1,f2,f3) ; p2 (f1,f2,f3); p3 (f1,f2,f3)

    if self.acmodel.recurrent:
        # T x P x D -> P x T x D -> (P * T) x D
        exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
        # T x P -> P x T -> (P * T) x 1
        exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)

    # NOTE(review): whether the next two assignments belong inside the
    # recurrent branch could not be recovered from the flattened source; they
    # are set unconditionally here, which only ever adds attributes - confirm.
    exps.next_mask = self.next_masks.transpose(0, 1).reshape(-1).unsqueeze(1)

    if self.exploration_bonus and "soc_inf" in self.exploration_bonus_type:
        exps.moa_memory = self.moa_memories.transpose(0, 1).reshape(-1, *self.moa_memories.shape[2:])

    # for all tensors below, T x P -> P x T -> P * T
    exps.action = self.actions.transpose(0, 1).reshape((-1, self.actions.shape[-1]))
    exps.log_prob = self.log_probs.transpose(0, 1).reshape((-1, self.actions.shape[-1]))
    exps.value = self.values.transpose(0, 1).reshape(-1)
    exps.next_value = self.next_values.transpose(0, 1).reshape(-1)
    exps.reward = self.rewards.transpose(0, 1).reshape(-1)
    exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
    exps.returnn = exps.value + exps.advantage

    # Preprocess experiences

    exps.obs = self.preprocess_obss(exps.obs, device=self.device)

    # Log some values

    keep = max(self.log_done_counter, self.num_procs)

    flat_actions = self.actions.reshape(-1, self.actions.shape[-1])
    action_modalities = {
        "action_modality_{}".format(m): flat_actions[:, m].cpu().numpy() for m in range(self.actions.shape[-1])
    }

    if not self.exploration_bonus:
        assert self.log_return == self.log_extrinsic_return

    logs = {
        "return_per_episode": self.log_return[-keep:],
        "mission_string_observed": self.log_mission_string_observed[-keep:],
        "extrinsic_return_per_episode": self.log_extrinsic_return[-keep:],
        "exploration_bonus_per_episode": self.log_exploration_bonus[-keep:],
        "success_rate_per_episode": self.log_success_rate[-keep:],
        "curriculum_max_mean_perf_per_episode": self.log_curriculum_max_mean_perf[-keep:],
        "curriculum_param_per_episode": self.log_curriculum_param[-keep:],
        "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
        "num_frames_per_episode": self.log_num_frames[-keep:],
        "num_frames": self.num_frames,
        "NPC_introduced_to": self.log_NPC_introduced_to[-keep:],
        **action_modalities
    }

    # Keep only the last `num_procs` entries of every episode log so the
    # lists do not grow without bound; `keep` above never needs more than
    # these plus the episodes finished during the next rollout.
    self.log_done_counter = 0
    self.log_return = self.log_return[-self.num_procs:]
    self.log_extrinsic_return = self.log_extrinsic_return[-self.num_procs:]
    self.log_exploration_bonus = self.log_exploration_bonus[-self.num_procs:]
    self.log_success_rate = self.log_success_rate[-self.num_procs:]
    self.log_curriculum_max_mean_perf = self.log_curriculum_max_mean_perf[-self.num_procs:]
    self.log_curriculum_param = self.log_curriculum_param[-self.num_procs:]
    self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
    self.log_num_frames = self.log_num_frames[-self.num_procs:]
    self.log_mission_string_observed = self.log_mission_string_observed[-self.num_procs:]
    self.log_NPC_introduced_to = self.log_NPC_introduced_to[-self.num_procs:]

    return exps, logs
def compute_advantages_and_returnn(self, exps):
    """
    Recompute GAE advantages and returns from a flattened batch of experiences.

    Intended for algorithms that reuse old data (i.e. not online RL) and need to
    recompute non-episodic intrinsic rewards on previously collected experience.
    This method is not used in PPO training.

    Parameters:
    ----------
    exps : DictList-like
        must expose flat `reward`, `value`, `next_value` and `next_mask` tensors
        holding `num_procs * num_frames_per_proc` entries, laid out process-major
        (P * T)

    Returns:
    -------
    advantage : torch.Tensor
        flat (P * T) advantages
    returnn : torch.Tensor
        `exps.value + advantage`, same layout

    Example usage from update_parameters:

        advs, retnn = self.compute_advantages_and_returnn(exps)

        # if you want to do a sanity check
        assert torch.equal(exps.advantage, advs)
        assert torch.equal(exps.returnn, retnn)

        exps.advantage, exps.returnn = advs, retnn
    """
    n_steps = self.num_frames_per_proc
    n_procs = self.num_procs

    def to_time_major(flat):
        # P * T -> P x T -> T x P
        return flat.reshape(n_procs, n_steps).transpose(0, 1)

    reward_tp = to_time_major(exps.reward)
    value_tp = to_time_major(exps.value)
    next_value_tp = to_time_major(exps.next_value)
    next_mask_tp = to_time_major(exps.next_mask)

    gae = torch.zeros(n_steps, n_procs, device=self.device)

    # Walk backwards in time: each step's advantage depends on the following one.
    final_step = n_steps - 1
    for t in range(final_step, -1, -1):
        # Nothing to accumulate from beyond the last collected frame.
        following_adv = 0 if t == final_step else gae[t + 1]
        # next_mask zeroes the bootstrap term where it is 0.
        td_error = reward_tp[t] + self.discount * next_value_tp[t] * next_mask_tp[t] - value_tp[t]
        gae[t] = td_error + self.discount * self.gae_lambda * following_adv * next_mask_tp[t]

    # T x P -> P x T -> P * T, i.e. back to the layout of `exps`
    flat_advantage = gae.transpose(0, 1).reshape(-1)
    return flat_advantage, exps.value + flat_advantage
@abstractmethod
def update_parameters(self):
    """Abstract hook with no base implementation; concrete subclasses must override it."""
def init_rnd_networks_and_optimizer(self):
    """
    Create the RND target network, predictor network and the predictor's optimizer.

    Both networks are `MinigridStateEmbeddingNet` instances built from the shape of
    the environment's 'image' observation space and moved to `self.device`.
    Only the predictor's parameters are handed to the RMSprop optimizer, which is
    configured from the `intrinsic_reward_*` hyper-parameters stored on `self`.
    """
    image_shape = self.env.observation_space['image'].shape

    # The target is built before the predictor; keep this order so that any
    # random initialisation happens in the same sequence as before.
    target_net = MinigridStateEmbeddingNet(image_shape)
    self.random_target_network = target_net.to(device=self.device)

    predictor_net = MinigridStateEmbeddingNet(image_shape)
    self.predictor_network = predictor_net.to(device=self.device)

    rmsprop_settings = {
        "lr": self.intrinsic_reward_learning_rate,
        "momentum": self.intrinsic_reward_momentum,
        "eps": self.intrinsic_reward_epsilon,
        "alpha": self.intrinsic_reward_alpha,
    }
    self.intrinsic_reward_optimizer = torch.optim.RMSprop(
        self.predictor_network.parameters(), **rmsprop_settings
    )
def init_ride_networks_and_optimizer(self):
    """
    Create the RIDE models and one RMSprop optimizer per model.

    Sets on `self`:
      - `state_embedding_model`  : `MinigridStateEmbeddingNet` built from the shape of
                                   the 'image' observation space
      - `forward_dynamics_model` : `MinigridForwardDynamicsNet(n_actions)`
      - `inverse_dynamics_model` : `MinigridInverseDynamicsNet(n_actions)`
      - `state_embedding_optimizer`, `inverse_dynamics_optimizer`,
        `forward_dynamics_optimizer` : RMSprop optimizers sharing the same
                                       `intrinsic_reward_*` hyper-parameters

    All models are moved to `self.device`.
    """
    image_shape = self.env.observation_space['image'].shape
    embedding_net = MinigridStateEmbeddingNet(image_shape)
    self.state_embedding_model = embedding_net.to(device=self.device)

    # We only use primitive actions for RIDE, hence the first entry of `nvec`.
    # (Linguistic actions would instead need the product over all of `nvec`.)
    action_space = self.acmodel.model_raw_action_space
    n_actions = action_space.nvec[0]

    forward_net = MinigridForwardDynamicsNet(n_actions)
    self.forward_dynamics_model = forward_net.to(device=self.device)
    inverse_net = MinigridInverseDynamicsNet(n_actions)
    self.inverse_dynamics_model = inverse_net.to(device=self.device)

    # The three optimizers use identical RMSprop hyper-parameters.
    rmsprop_settings = {
        "lr": self.intrinsic_reward_learning_rate,
        "momentum": self.intrinsic_reward_momentum,
        "eps": self.intrinsic_reward_epsilon,
        "alpha": self.intrinsic_reward_alpha,
    }
    self.state_embedding_optimizer = torch.optim.RMSprop(
        self.state_embedding_model.parameters(), **rmsprop_settings
    )
    self.inverse_dynamics_optimizer = torch.optim.RMSprop(
        self.inverse_dynamics_model.parameters(), **rmsprop_settings
    )
    self.forward_dynamics_optimizer = torch.optim.RMSprop(
        self.forward_dynamics_model.parameters(), **rmsprop_settings
    )
|