Spaces:

flowers-team
/

SocialAISchool

Sleeping

File size: 47,813 Bytes

be5548b

from abc import ABC, abstractmethod
import numpy as np
import torch

from torch_ac.format import default_preprocess_obss
from torch_ac.utils import DictList, ParallelEnv
from torch_ac.intrinsic_reward_models import *

from collections import Counter


class BaseAlgo(ABC):
    """The base class for RL algorithms."""

    def __init__(self,
                 envs,
                 acmodel,
                 device,
                 num_frames_per_proc,
                 discount,
                 lr,
                 gae_lambda,
                 entropy_coef,
                 value_loss_coef,
                 max_grad_norm,
                 recurrence,
                 preprocess_obss,
                 reshape_reward,
                 exploration_bonus=False,
                 exploration_bonus_params=None,
                 exploration_bonus_tanh=None,
                 expert_exploration_bonus=False,
                 exploration_bonus_type="lang",
                 episodic_exploration_bonus=True,
                 utterance_moa_net=True,  # used for social influence
                 clipped_rewards=False,
                 # default is set to fit RND
                 intrinsic_reward_loss_coef=0.1,  # also used for social influence
                 intrinsic_reward_coef=0.1,  # also used for social influence
                 intrinsic_reward_learning_rate=0.0001,
                 intrinsic_reward_momentum=0,
                 intrinsic_reward_epsilon=0.01,
                 intrinsic_reward_alpha=0.99,
                 intrinsic_reward_max_grad_norm=40,
                 intrinsic_reward_forward_loss_coef=10,
                 intrinsic_reward_inverse_loss_coef=0.1,
                 reset_rnd_ride_at_phase=False,
                 # social_influence
                 balance_moa_training=False,
                 moa_memory_dim=128,
    ):
        """
        Initializes a `BaseAlgo` instance.

        Parameters:
        ----------
        envs : list
            a list of environments that will be run in parallel
        acmodel : torch.Module
            the model
        device : torch.device
            the device the model and all rollout buffers are placed on
        num_frames_per_proc : int
            the number of frames collected by every process for an update
        discount : float
            the discount for future rewards
        lr : float
            the learning rate for optimizers
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        recurrence : int
            the number of steps the gradient is propagated back in time
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle;
            `default_preprocess_obss` is used when a falsy value is given
        reshape_reward : function
            a function that shapes the reward, takes an
            (observation, action, reward, done) tuple as an input
        exploration_bonus : bool
            when true, the state needed by every type listed in
            `exploration_bonus_type` is created here
        exploration_bonus_params : sequence, optional
            flat sequence read two values at a time (`[2*i:2*i+2]` for the
            i-th bonus type) for the "cell", "grid" and "lang" types;
            `(100, 50.)` is used for each of them when this is falsy
        exploration_bonus_tanh : sequence, optional
            read as `[i]` for the i-th bonus type ("cell", "grid", "lang");
            `None` stores `None` for each of those types
        expert_exploration_bonus : bool
            stored on the instance
        exploration_bonus_type : str or sequence of str
            the bonus type(s): "rnd", "ride", "soc_inf", "cell", "grid" or
            "lang". A single string is treated as a one-element list.
        episodic_exploration_bonus : bool
            when true the "cell"/"grid"/"lang" visitation counters are kept
            as one `Counter` per process, otherwise as a single shared
            `Counter`; must be false when "rnd" is used
        utterance_moa_net : bool
            for "soc_inf": whether NPC utterance actions are part of the
            MOA net's inputs and outputs
        clipped_rewards : bool
            stored on the instance
        intrinsic_reward_loss_coef, intrinsic_reward_coef,
        intrinsic_reward_learning_rate, intrinsic_reward_momentum,
        intrinsic_reward_epsilon, intrinsic_reward_alpha,
        intrinsic_reward_max_grad_norm, intrinsic_reward_forward_loss_coef,
        intrinsic_reward_inverse_loss_coef : float
            stored on the instance (presumably consumed by the intrinsic
            reward networks/optimizers set up outside this method -- confirm
            against `init_rnd_networks_and_optimizer` /
            `init_ride_networks_and_optimizer`)
        reset_rnd_ride_at_phase : bool
            stored on the instance
        balance_moa_training : bool
            stored on the instance
        moa_memory_dim : int
            `memory_dim` handed to `LSTMMoaNet` for "soc_inf"

        Raises
        ------
        ValueError
            if `exploration_bonus` is set and an unknown bonus type is listed
        """

        # Store parameters

        self.env = ParallelEnv(envs)
        self.acmodel = acmodel
        self.device = device
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.preprocess_obss = preprocess_obss or default_preprocess_obss
        self.reshape_reward = reshape_reward
        self.exploration_bonus = exploration_bonus
        self.expert_exploration_bonus = expert_exploration_bonus

        # The bonus types are iterated everywhere they are used. A bare string
        # (including the default "lang") would be walked character by
        # character and rejected as an unknown type, so wrap it in a list.
        if isinstance(exploration_bonus_type, str):
            exploration_bonus_type = [exploration_bonus_type]
        self.exploration_bonus_type = exploration_bonus_type

        self.episodic_exploration_bonus = episodic_exploration_bonus
        self.clipped_rewards = clipped_rewards
        self.update_epoch = 0
        self.utterance_moa_net = utterance_moa_net

        self.reset_rnd_ride_at_phase = reset_rnd_ride_at_phase
        self.was_reset = False

        # Control parameters

        assert self.acmodel.recurrent or self.recurrence == 1
        assert self.num_frames_per_proc % self.recurrence == 0

        # Configure acmodel

        self.acmodel.to(self.device)
        self.acmodel.train()

        # Store helpers values

        self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        # Initialize experience values

        shape = (self.num_frames_per_proc, self.num_procs)

        self.obs = self.env.reset()
        self.obss = [None]*(shape[0])

        # NOTE(review): this placeholder has one entry per frame (shape[0]),
        # while calculate_exploration_bonus iterates prev_info per process;
        # it only works because the first-step check compares against
        # [{}] * len(prev_info). Confirm whether shape[1] was intended.
        self.info = [{}]*(shape[0])
        self.infos = [None]*(shape[0])
        if self.acmodel.recurrent:
            self.memory = torch.zeros(shape[1], self.acmodel.memory_size, device=self.device)
            self.memories = torch.zeros(*shape, self.acmodel.memory_size, device=self.device)
        self.mask = torch.ones(shape[1], device=self.device)
        self.masks = torch.zeros(*shape, device=self.device)
        self.next_masks = torch.zeros(*shape, device=self.device)

        self.values = torch.zeros(*shape, device=self.device)
        self.next_values = torch.zeros(*shape, device=self.device)
        self.rewards = torch.zeros(*shape, device=self.device)
        self.extrinsic_rewards = torch.zeros(*shape, device=self.device)
        self.advantages = torch.zeros(*shape, device=self.device)

        # as_shape = self.env.envs[0].action_space.shape
        as_shape = self.acmodel.model_raw_action_space.shape
        self.actions = torch.zeros(*(shape+as_shape), device=self.device, dtype=torch.int)
        self.log_probs = torch.zeros(*(shape+as_shape), device=self.device)

        # Initialize log values

        self.log_episode_return = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_extrinsic_return = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_exploration_bonus = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_success_rate = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_curriculum_mean_perf = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_reshaped_return = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_num_frames = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_mission_string_observed = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_NPC_introduced_to = np.zeros(self.num_procs).astype(bool)
        self.log_episode_curriculum_param = torch.zeros(self.num_procs, device=self.device)

        self.intrinsic_reward_loss_coef = intrinsic_reward_loss_coef
        self.intrinsic_reward_coef = intrinsic_reward_coef
        self.intrinsic_reward_learning_rate = intrinsic_reward_learning_rate
        self.intrinsic_reward_momentum = intrinsic_reward_momentum
        self.intrinsic_reward_epsilon = intrinsic_reward_epsilon
        self.intrinsic_reward_alpha = intrinsic_reward_alpha
        self.intrinsic_reward_max_grad_norm = intrinsic_reward_max_grad_norm
        self.intrinsic_reward_forward_loss_coef = intrinsic_reward_forward_loss_coef
        self.intrinsic_reward_inverse_loss_coef = intrinsic_reward_inverse_loss_coef
        self.balance_moa_training = balance_moa_training
        self.moa_memory_dim = moa_memory_dim

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_extrinsic_return = [0] * self.num_procs
        self.log_exploration_bonus = [0] * self.num_procs
        self.log_success_rate = [0] * self.num_procs
        self.log_curriculum_max_mean_perf = [0] * self.num_procs
        self.log_curriculum_param = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs
        self.log_mission_string_observed = [0] * self.num_procs
        self.log_NPC_introduced_to = [False] * self.num_procs
        # per-process counts of the raw image observations seen (used by "ride")
        self.images_counter = [Counter() for _ in range(self.num_procs)]

        if self.exploration_bonus:
            self.visitation_counter = {}
            self.exploration_bonus_params = {}
            self.exploration_bonus_tanh = {}

            for i, bonus_type in enumerate(self.exploration_bonus_type):
                if bonus_type == "rnd":
                    assert not self.episodic_exploration_bonus
                    self.init_rnd_networks_and_optimizer()

                elif bonus_type == "ride":
                    self.init_ride_networks_and_optimizer()

                elif bonus_type == "soc_inf":

                    # npc actions
                    self.fn_name_to_npc_prim_act = self.env.envs[0].npc_prim_actions_dict

                    self.num_npc_prim_actions = len(self.fn_name_to_npc_prim_act)

                    # "idx" rather than "i" so the comprehension does not
                    # visually shadow the bonus-type index of the outer loop
                    self.npc_utterance_to_id = {
                        a: idx for idx, a in enumerate(self.env.envs[0].all_npc_utterance_actions)
                    }
                    self.num_npc_utterance_actions = len(self.npc_utterance_to_id)

                    if self.utterance_moa_net:
                        self.num_npc_all_actions = self.num_npc_prim_actions * self.num_npc_utterance_actions
                    else:
                        self.num_npc_all_actions = self.num_npc_prim_actions

                    # construct possible agent_action's list
                    self.all_possible_agent_actions, self.act_to_ind_dict = self.construct_all_possible_agent_actions()
                    # filled lazily on the first social-influence step, when
                    # the number of processes in the batch is known
                    self.agent_actions_tiled_all = None

                    # currently unused below; kept so that a missing 'image'
                    # observation space still fails here as before
                    im_shape = self.env.observation_space['image'].shape

                    embedding_size = self.acmodel.semi_memory_size

                    # MOA net input: policy embedding plus the NPC primitive
                    # action and components 0, 2 and 3 of the agent action
                    # (presumably one-hot encoded inside LSTMMoaNet -- confirm)
                    input_size = embedding_size \
                                    + self.num_npc_prim_actions \
                                    + self.acmodel.model_raw_action_space.nvec[0] \
                                    + self.acmodel.model_raw_action_space.nvec[2] \
                                    + self.acmodel.model_raw_action_space.nvec[3]

                    if self.utterance_moa_net:
                        input_size += self.num_npc_utterance_actions  # todo: feed as index or as text?

                    self.moa_net = LSTMMoaNet(
                        input_size=input_size,
                        num_npc_prim_actions=self.num_npc_prim_actions,
                        num_npc_utterance_actions=self.num_npc_utterance_actions if self.utterance_moa_net else None,
                        acmodel=self.acmodel,
                        memory_dim=self.moa_memory_dim
                    ).to(device=self.device)

                    # memory
                    assert shape == (self.num_frames_per_proc, self.num_procs)
                    self.moa_memory = torch.zeros(shape[1], self.moa_net.memory_size, device=self.device)
                    self.moa_memories = torch.zeros(*shape, self.moa_net.memory_size, device=self.device)

                elif bonus_type in ["cell", "grid", "lang"]:
                    if self.episodic_exploration_bonus:
                        self.visitation_counter[bonus_type] = [Counter() for _ in range(self.num_procs)]
                    else:
                        self.visitation_counter[bonus_type] = Counter()

                    if exploration_bonus_params:
                        self.exploration_bonus_params[bonus_type] = exploration_bonus_params[2*i:2*i+2]
                    else:
                        self.exploration_bonus_params[bonus_type] = (100, 50.)

                    if exploration_bonus_tanh is None:
                        self.exploration_bonus_tanh[bonus_type] = None
                    else:
                        self.exploration_bonus_tanh[bonus_type] = exploration_bonus_tanh[i]
                else:
                    raise ValueError(f"bonus type: {bonus_type} unknown.")

    def load_status_dict(self, status):
        """Restore the algorithm's state from `status`.

        Loads the model weights from `status["model_state"]`, hands `status`
        to the curriculum (when the environment has one) and re-broadcasts the
        curriculum parameters, then restores the networks/optimizers belonging
        to each active exploration bonus type. The policy optimizer is not
        restored here.

        Parameters
        ----------
        status : dict
            a dictionary with the keys written by `get_status_dict`
        """
        self.acmodel.load_state_dict(status["model_state"])

        curriculum = getattr(self.env, "curriculum", None)
        if curriculum is not None:
            curriculum.load_status_dict(status)
            self.env.broadcast_curriculum_parameters(self.env.curriculum.get_parameters())

        # self.optimizer.load_state_dict(status["optimizer_state"])

        if not self.exploration_bonus:
            return

        # Each name is both the attribute on `self` and the key in `status`;
        # the order within a tuple is the order in which they are restored.
        restored_by_type = {
            "rnd": (
                "random_target_network",
                "predictor_network",
                "intrinsic_reward_optimizer",
            ),
            "ride": (
                "forward_dynamics_model",
                "inverse_dynamics_model",
                "state_embedding_model",
                "state_embedding_optimizer",
                "inverse_dynamics_optimizer",
                "forward_dynamics_optimizer",
            ),
            "soc_inf": ("moa_net",),
        }

        for bonus_type in self.exploration_bonus_type:
            # bonus types without learned components have nothing to restore
            for name in restored_by_type.get(bonus_type, ()):
                getattr(self, name).load_state_dict(status[name])

    def get_status_dict(self):
        """Collect everything needed to resume training into one dictionary.

        The result always holds the model weights under "model_state". The
        curriculum's own status entries are merged in when the environment has
        a curriculum, followed by the state dicts of the networks/optimizers
        of every active exploration bonus type ("rnd", "ride", "soc_inf").

        Returns
        -------
        dict
            the status dictionary, in the format read by `load_status_dict`
        """
        status = {"model_state": self.acmodel.state_dict()}

        curriculum = getattr(self.env, "curriculum", None)
        if curriculum is not None:
            status.update(self.env.curriculum.get_status_dict())

        if not self.exploration_bonus:
            return status

        # Each name is both the attribute on `self` and the key in the result;
        # the order within a tuple is the order in which entries are written.
        saved_by_type = {
            "rnd": (
                "random_target_network",
                "predictor_network",
                "intrinsic_reward_optimizer",
            ),
            "ride": (
                "forward_dynamics_model",
                "inverse_dynamics_model",
                "state_embedding_model",
                "state_embedding_optimizer",
                "inverse_dynamics_optimizer",
                "forward_dynamics_optimizer",
            ),
            "soc_inf": ("moa_net",),
        }

        for bonus_type in self.exploration_bonus_type:
            # bonus types without learned components contribute nothing
            for name in saved_by_type.get(bonus_type, ()):
                status[name] = getattr(self, name).state_dict()

        return status

    def construct_all_possible_agent_actions(self):
        """Enumerate the agent's actions and build an action -> index lookup.

        An action is a 4-tuple read from `acmodel.model_raw_action_space`:
        component 0 is enumerated over `nvec[0]` values, component 1 says
        whether the agent speaks (0 = no), and components 2 and 3 are
        enumerated over `nvec[2]` and `nvec[3]` values (template and word,
        per the original comments).

        Returns
        -------
        all_possible_agent_actions : np.ndarray
            the non-speaking actions `(a, 0, 0, 0)` followed by every speaking
            action, as produced by `acmodel.no_speak_to_speak_action`
        act_to_ind_dict : dict
            maps an action tuple to its row in `all_possible_agent_actions`;
            every non-speaking tuple `(a, 0, t, w)` maps to the row of
            `(a, 0, 0, 0)`

        Raises
        ------
        ValueError
            if the model has not been set
        """
        if self.acmodel is None:
            raise ValueError("This should be called after the model has been set")

        nvec = self.acmodel.model_raw_action_space.nvec
        n_first, n_third, n_fourth = nvec[0], nvec[2], nvec[3]

        # Non-speaking actions look like (?, 0, 0, 0). The trailing zeros would
        # normally select the first template and the first word, but they are
        # ignored because the second component says "do not speak".
        silent_actions = np.array(list(np.ndindex((n_first, 1, 1, 1))))

        # Enumerate every (?, 0, template, word) combination and let the model
        # turn each of them into the corresponding speaking action.
        full_subspace = (n_first, 1, n_third, n_fourth)
        speaking_actions = self.acmodel.no_speak_to_speak_action(
            np.array(list(np.ndindex(full_subspace)))
        )

        all_possible_agent_actions = np.concatenate([silent_actions, speaking_actions])

        act_to_ind_dict = {}
        for row, act in enumerate(all_possible_agent_actions):
            act_to_ind_dict[tuple(act)] = row

        # Collapse the remaining non-speaking tuples onto (?, 0, 0, 0),
        # e.g. (3, 0, 4, 12) -> (3, 0, 0, 0).
        for action in np.ndindex(full_subspace):
            assert action[1] == 0  # non-speaking
            canonical = (action[0], 0, 0, 0)
            act_to_ind_dict[tuple(action)] = act_to_ind_dict[canonical]

        return all_possible_agent_actions, act_to_ind_dict

    def step_to_n_frames(self, step):
        """Return `step` scaled by the frames per process and the process count."""
        frames_in_one_proc = step * self.num_frames_per_proc
        return frames_in_one_proc * self.num_procs

    def calculate_exploration_bonus(self, obs=None, done=None, prev_obs=None, info=None, prev_info=None, agent_actions=None, dist=None,
                                    i_step=None, embeddings=None):
        """Compute the summed exploration bonus of every process for one step.

        The bonuses of all types in `self.exploration_bonus_type` are computed
        independently and added up. A process whose `done` flag is set always
        gets a bonus of 0 for every type.

        Parameters
        ----------
        obs : list of dict
            the observations returned by the current step, one per process;
            "image" is read for "rnd", "ride", "cell" and "grid", "utterance"
            for "lang"
        done : list of bool
            per-process episode-termination flags of the current step
        prev_obs : list of dict
            the observations the actions were computed from ("ride" only)
        info : list of dict
            not used by this method
        prev_info : list of dict
            the infos returned by the previous step ("soc_inf" only); the keys
            "NPC_prim_action", "NPC_utterance" and "NPC_observed" are read.
            A list of empty dicts marks the first step.
        agent_actions : iterable of torch.Tensor
            the action taken in each process ("soc_inf" only)
        dist : sequence
            the four policy distributions, each exposing `.logits`
            ("soc_inf" only)
        i_step : int
            the rollout index at which the MOA memory is stored
            ("soc_inf" only)
        embeddings : torch.Tensor
            the policy embeddings, one row per process ("soc_inf" only)

        Returns
        -------
        list
            the total bonus of each process

        Side effects
        ------------
        Updates `self.visitation_counter` for the count-based types and, for
        "soc_inf", `self.moa_memory`, `self.moa_memories[i_step]` and the
        cached `self.agent_actions_tiled_all`.

        Raises
        ------
        ValueError
            for an unknown bonus type
        DeprecationWarning
            (raised as an exception) for the count-based types when
            `expert_exploration_bonus` is set or `episodic_exploration_bonus`
            is not
        """

        def state_hashes(observation, exploration_bonus_type):
            """Return the hashable keys an observation contributes to the counters."""
            if exploration_bonus_type == "lang":
                hashes = [observation['utterance']]
                assert len(hashes) == 1
            elif exploration_bonus_type == "cell":
                # for all new cells
                im = observation["image"]
                hashes = np.unique(im.reshape(-1, im.shape[-1]), axis=0)
                hashes = np.apply_along_axis(lambda a: a.data.tobytes(), 1, hashes)

            elif exploration_bonus_type == "grid":
                # for seeing new grid configurations
                im = observation["image"]
                hashes = [im.data.tobytes()]
                assert len(hashes) == 1
            else:
                # report the argument of this helper, not a variable of the
                # enclosing scope
                raise ValueError(f"Unknown exploration bonus type {exploration_bonus_type}")

            return hashes

        total_bonus = [0]*len(obs)
        for bonus_type in self.exploration_bonus_type:
            if bonus_type == "rnd":
                # -- [unroll_length x batch_size x height x width x channels] == [1, n_proc, 7, 7, 4]
                batch = torch.tensor(np.array([[o['image'] for o in obs]])).to(self.device)

                with torch.no_grad():
                    random_embedding = self.random_target_network(batch).reshape(len(obs), 128)
                    predicted_embedding = self.predictor_network(batch).reshape(len(obs), 128)
                    # bonus = prediction error of the predictor w.r.t. the fixed random target
                    intrinsic_rewards = torch.norm(predicted_embedding.detach() - random_embedding.detach(), dim=1, p=2)
                    intrinsic_reward_coef = self.intrinsic_reward_coef
                    intrinsic_rewards *= intrinsic_reward_coef

                    # is this the best way? should we somehow extract the next_state?
                    bonus = [0.0 if d else float(r) for d, r in zip(done, intrinsic_rewards)]

            elif bonus_type == "ride":
                with torch.no_grad():
                    _obs = torch.tensor(np.array([[o['image'] for o in prev_obs]])).to(self.device)
                    _next_obs = torch.tensor(np.array([[o['image'] for o in obs]])).to(self.device)

                    # counts - number of times a state was seen during the SAME episode -> can be computed here
                    # (a count of 0 would give an infinite reward, hence the assert below)
                    count_rewards = torch.tensor([1/np.sqrt(self.images_counter[p_i][np.array(o.to("cpu")).tobytes()]) for p_i, o in enumerate(_next_obs[0])]).to(self.device)
                    assert not any(torch.isinf(count_rewards))

                    state_emb = self.state_embedding_model(_obs.to(device=self.device)).reshape(len(obs), 128)
                    next_state_emb = self.state_embedding_model(_next_obs.to(device=self.device)).reshape(len(obs), 128)

                    # how far the embedding moved between the two observations
                    control_rewards = torch.norm(next_state_emb - state_emb, dim=1, p=2)

                    intrinsic_rewards = self.intrinsic_reward_coef*(count_rewards * control_rewards)

                    # is this the best way? should we somehow extract the next_state?
                    bonus = [0.0 if d else float(r) for d, r in zip(done, intrinsic_rewards)]

            elif bonus_type == "soc_inf":
                if prev_info == [{}] * len(prev_info):
                    # this is the first step, info is not given during reset

                    # first step in the episode no influence can be estimated as there is no previous action
                    # todo: pad with zeros, and estimate anyway?
                    bonus = [0.0 for _ in done]
                else:
                    # social influence
                    n_procs = len(obs)
                    n_agent_actions = len(self.all_possible_agent_actions)

                    _prev_NPC_prim_actions = torch.tensor(
                        [self.fn_name_to_npc_prim_act[o["NPC_prim_action"]] for o in prev_info]
                    ).to(self.device)

                    # todo: what is the best way to feed utt action?
                    _prev_NPC_utt_actions = torch.tensor(
                        [self.npc_utterance_to_id[o["NPC_utterance"]] for o in prev_info]
                    ).to(self.device)

                    # calculate counterfactuals: every per-process input is
                    # tiled once per possible agent action, action-major, so
                    # row a * n_procs + p pairs action a with process p
                    npc_previous_prim_actions_all = _prev_NPC_prim_actions.repeat(n_agent_actions)  # [A_ag*n_procs, ...]
                    npc_previous_utt_actions_all = _prev_NPC_utt_actions.repeat(n_agent_actions)  # [A_ag*n_procs, ...]

                    # agent actions tiled
                    if self.agent_actions_tiled_all is not None:
                        agent_actions_tiled_all = self.agent_actions_tiled_all

                    else:
                        # only first time, we can't do it in init because we need len(im_obs)
                        agent_actions_tiled_all = []
                        for pot_agent_action in self.all_possible_agent_actions:
                            pot_agent_action_tiled = torch.from_numpy(np.tile(pot_agent_action, (n_procs, 1)))  # [n_procs,...]
                            agent_actions_tiled_all.append(pot_agent_action_tiled.to(self.device))

                        agent_actions_tiled_all = torch.concat(agent_actions_tiled_all)  # [A_ag*n_procs,....]

                        self.agent_actions_tiled_all = agent_actions_tiled_all

                    with torch.no_grad():
                        # todo: move this tiling above?
                        masked_memory = self.moa_memory * self.mask.unsqueeze(1)
                        masked_memory_tiled_all = masked_memory.repeat([n_agent_actions, 1])
                        embedding_tiled_all = embeddings.repeat([n_agent_actions, 1])

                        # use current memory for every action

                        counterfactuals_logits, moa_memory = self.moa_net(
                            embeddings=embedding_tiled_all,
                            # observations=observations_all,
                            npc_previous_prim_actions=npc_previous_prim_actions_all,
                            npc_previous_utterance_actions=npc_previous_utt_actions_all if self.utterance_moa_net else None,
                            agent_actions=agent_actions_tiled_all,
                            memory=masked_memory_tiled_all
                        )  # logits : [A_ag * n_procs, A_npc]

                        counterfactuals_logits = counterfactuals_logits.reshape(
                            [n_agent_actions, n_procs, self.num_npc_all_actions])

                        counterfactuals_logits = counterfactuals_logits.swapaxes(0, 1)  # [n_procs, A_ag, A_npc]

                    assert counterfactuals_logits.shape == (len(obs), n_agent_actions, self.num_npc_all_actions)

                    # compute npc logits p(A_npc|A_ag, s)

                    # note: ex (5,0,5,2) is mapped to (5,0,0,0), todo: is this ok everywhere?
                    agent_action_indices = [self.act_to_ind_dict[tuple(act.cpu().numpy())] for act in agent_actions]
                    # ~ p(a_npc| a_ag, ...)

                    predicted_logits = torch.stack([ctr[ind] for ctr, ind in zip(counterfactuals_logits, agent_action_indices)])

                    assert i_step is not None
                    self.moa_memories[i_step] = self.moa_memory

                    # only save for the actions actually taken.
                    # The returned memory is assumed to share the action-major
                    # batch layout of the logits ([A_ag * n_procs, ...]), so
                    # the row of process p under its taken action a_p is
                    # a_p * n_procs + p. Indexing with the bare action indices
                    # would pick rows that belong to other (action, process)
                    # pairs.
                    taken_rows = [a_ind * n_procs + p for p, a_ind in enumerate(agent_action_indices)]
                    self.moa_memory = moa_memory[taken_rows]

                    assert predicted_logits.shape == (len(obs), self.num_npc_all_actions)

                    predicted_probs = torch.softmax(predicted_logits, dim=1)  # use exp_softmax or something?

                    # compute marginal npc logits p(A_npc|s) = sum( p(A_NPC|A_ag,s), for every A_ag )
                    # compute agent logits for all possible agent actions
                    per_non_speaking_action_log_probs = dist[0].logits + dist[1].logits[:, :1]

                    per_speaking_action_log_probs = []
                    for p in range(n_procs):

                        log_probs_for_proc_p = [d.logits[p].cpu().numpy() for d in dist]

                        # speaking actions
                        speaking_log_probs = log_probs_for_proc_p
                        speaking_log_probs[1] = speaking_log_probs[1][1:]  # only the speak action

                        # sum everybody with everybody; the flattening order
                        # matches the np.ndindex order used to enumerate the
                        # speaking actions
                        out = np.add.outer(speaking_log_probs[0], speaking_log_probs[1]).reshape(-1)
                        out = np.add.outer(out, speaking_log_probs[2]).reshape(-1)
                        out = np.add.outer(out, speaking_log_probs[3]).reshape(-1)
                        per_speaking_action_log_probs_proc_p = out

                        per_speaking_action_log_probs.append(per_speaking_action_log_probs_proc_p)

                    per_speaking_action_log_probs = np.stack(per_speaking_action_log_probs)

                    agent_log_probs = torch.concat([
                        per_non_speaking_action_log_probs,
                        torch.tensor(per_speaking_action_log_probs).to(device=self.device),
                    ], dim=1)

                    # Sanity check of the flattened layout on two fixed
                    # speaking actions. They are skipped when the action space
                    # does not contain them (the unguarded lookup raised
                    # KeyError for such spaces).
                    probe_actions = [
                        probe for probe in ((0, 1, 3, 1), (0, 1, 1, 10))
                        if probe in self.act_to_ind_dict
                    ]
                    if probe_actions:
                        for p in range(n_procs):
                            log_probs_for_proc_p = [d.logits[p].cpu().numpy() for d in dist]

                            for probe in probe_actions:
                                expected = sum([lp[a] for lp, a in list(zip(log_probs_for_proc_p, probe))])
                                assert torch.abs(agent_log_probs[p][self.act_to_ind_dict[probe]] - expected) < 1e-5

                    agent_probs = agent_log_probs.exp()

                    counterfactuals_probs = counterfactuals_logits.softmax(dim=-1)  # [n_procs, A_ag, A_npc]
                    counterfactuals_perm = counterfactuals_probs.permute(0, 2, 1)  # [n_procs, A_npc, A_agent]

                    # compute marginal distributions
                    marginals = (counterfactuals_perm * agent_probs[:, None, :]).sum(-1)

                    # this already sums to one, so the following normalization is not needed
                    marginal_probs = marginals / marginals.sum(1, keepdims=True)  # sum over npc_actions
                    assert marginal_probs.shape == (n_procs, self.num_npc_all_actions)  # [batch, A_npc]

                    # KL( p(a_npc | a_ag, s) || p(a_npc | s) )
                    KL_loss = (predicted_probs * (predicted_probs.log() - marginal_probs.log())).sum(axis=-1)

                    intrinsic_rewards = self.intrinsic_reward_coef * KL_loss

                    # is the NPC observed in the image that is fed as input in this step
                    # (returned by the previous step() call )
                    NPC_observed = torch.tensor([pi["NPC_observed"] for pi in prev_info]).to(self.device)

                    intrinsic_rewards = intrinsic_rewards * NPC_observed

                    bonus = [0.0 if d else float(r) for d, r in zip(done, intrinsic_rewards)]

            elif bonus_type in ["cell", "grid", "lang"]:
                C, M = self.exploration_bonus_params[bonus_type]
                C_ = C / self.num_frames_per_proc

                if self.expert_exploration_bonus:
                    # expert
                    raise DeprecationWarning("Deprecated exploration bonus type")

                elif self.episodic_exploration_bonus:

                    hashes = [state_hashes(o, bonus_type) for o in obs]
                    # each key contributes C_ / (visits + 1) ** M
                    bonus = [
                        0 if d else  # no bonus if done
                        np.sum([
                            C_ / ((self.visitation_counter[bonus_type][i_p][h] + 1) ** M) for h in hs
                        ])
                        for i_p, (hs, d) in enumerate(zip(hashes, done))
                    ]

                    # update the counters
                    for i_p, (hs, d) in enumerate(zip(hashes, done)):
                        if not d:
                            for h in hs:
                                self.visitation_counter[bonus_type][i_p][h] += 1

                else:
                    # the non-episodic variant is no longer supported
                    raise DeprecationWarning("Use episodic exploration bonus.")

                if self.exploration_bonus_tanh[bonus_type] is not None:
                    bonus = [np.tanh(b)*self.exploration_bonus_tanh[bonus_type] for b in bonus]
            else:
                raise ValueError(f"Unknown exploration bonus type {bonus_type}")

            assert len(total_bonus) == len(bonus)
            total_bonus = [tb+b for tb, b in zip(total_bonus, bonus)]

        return total_bonus

    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        On top of filling the rollout buffers, every step also:
        feeds the curriculum (when `self.env` exposes one) and broadcasts its
        new parameters, optionally re-initialises the RND / RIDE networks the
        first time the curriculum reports phase 2, adds the exploration bonus
        to the reward (when `self.exploration_bonus` is set), optionally
        clips the reward to [-1, 1], and updates the per-episode logging
        accumulators.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
            `memory`, `mask` and `next_mask` are only present when the model
            is recurrent; `moa_memory` only when the "soc_inf" bonus is used.
        logs : dict
            Per-episode statistics of recently finished episodes (returns,
            extrinsic returns, exploration bonuses, success rates, curriculum
            values, episode lengths, ...), the total `num_frames`, and one
            `action_modality_<m>` array per action dimension.
        """

        for i_step in range(self.num_frames_per_proc):
            # Do one agent-environment interaction
            preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
            with torch.no_grad():

                if self.acmodel.recurrent:
                    # the memory is zeroed (via the mask) for environments whose episode just ended
                    dist, value, memory, policy_embedding = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1), return_embeddings=True)
                else:
                    dist, value, policy_embedding = self.acmodel(preprocessed_obs, return_embeddings=True)

            action = self.acmodel.sample_action(dist)

            obs, reward, done, info = self.env.step(
                self.acmodel.construct_final_action(
                    action.cpu().numpy()
                )
            )

            if hasattr(self.env, "curriculum") and self.env.curriculum is not None:
                curriculum_params = self.env.curriculum.update_parameters({
                    "obs": obs,
                    "reward": reward,
                    "done": done,
                    "info": info,
                })
                # broadcast new parameters to all parallel environments
                self.env.broadcast_curriculum_parameters(curriculum_params)

                # one-shot reset (guarded by `was_reset`) of the intrinsic reward models at phase 2
                if self.reset_rnd_ride_at_phase and curriculum_params['phase'] == 2 and not self.was_reset:
                    self.was_reset = True
                    assert not self.episodic_exploration_bonus

                    for bonus_type in self.exploration_bonus_type:
                        if bonus_type == "rnd":
                            self.init_rnd_networks_and_optimizer()

                        elif bonus_type == "ride":
                            self.init_ride_networks_and_optimizer()

            # per-environment count of how often each raw image was observed
            for p_i, o in enumerate(obs):
                self.images_counter[p_i][o['image'].tobytes()] += 1

            # keep the environment reward separately so it can be logged on its own
            extrinsic_reward = reward
            exploration_bonus = (0,) * len(reward)

            if self.exploration_bonus:
                bonus = self.calculate_exploration_bonus(
                    obs=obs, done=done, prev_obs=self.obs, info=info, prev_info=self.info, agent_actions=action, dist=dist,
                    i_step=i_step, embeddings=policy_embedding,
                )
                exploration_bonus = bonus
                reward = [r + b for r, b in zip(reward, bonus)]

            if self.clipped_rewards:
                # this should not be used with classic count-based rewards as they often,
                # when combined with extr. rew go past 1.0
                reward = list(map(float, torch.clamp(torch.tensor(reward), -1, 1)))

            # Update experiences values
            self.obss[i_step] = self.obs
            self.obs = obs
            self.infos[i_step] = info  # info of this step is the current info
            self.info = info  # save as previous info

            if self.acmodel.recurrent:
                self.memories[i_step] = self.memory
                self.memory = memory
            self.masks[i_step] = self.mask
            # mask is 0 for environments whose episode just finished, 1 otherwise
            self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)

            self.actions[i_step] = action
            self.values[i_step] = value

            if self.reshape_reward is not None:
                self.rewards[i_step] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
                ], device=self.device)
            else:
                self.rewards[i_step] = torch.tensor(reward, device=self.device)

            self.log_probs[i_step] = self.acmodel.calculate_log_probs(dist, action)

            # Update log values

            self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
            self.log_episode_extrinsic_return += torch.tensor(extrinsic_reward, device=self.device, dtype=torch.float)
            self.log_episode_exploration_bonus += torch.tensor(exploration_bonus, device=self.device, dtype=torch.float)
            self.log_episode_success_rate = torch.tensor([i["success"] for i in info]).float().to(self.device)
            self.log_episode_curriculum_mean_perf = torch.tensor([i.get("curriculum_info_max_mean_perf", 0) for i in info]).float().to(self.device)
            self.log_episode_reshaped_return += self.rewards[i_step]
            self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)
            self.log_episode_curriculum_param = torch.tensor([i.get("curriculum_info_param", 0.0) for i in info]).float().to(self.device)
            # self.log_episode_curriculum_param = torch.tensor([i.get("curriculum_info_mean_perf", 0.0) for i in info]).float().to(self.device)
            # All environments must report the same curriculum parameter.
            # unbiased=False: the default (Bessel-corrected) variance of a single
            # element is NaN, which made this check fail when num_procs == 1.
            assert self.log_episode_curriculum_param.var(unbiased=False) == 0

            log_episode_NPC_introduced_to_current = np.array([i.get('NPC_was_introduced_to', False) for i in info])
            # the flag may only switch from False to True within an episode, never back
            assert all((self.log_episode_NPC_introduced_to | log_episode_NPC_introduced_to_current) == log_episode_NPC_introduced_to_current)

            self.log_episode_NPC_introduced_to = self.log_episode_NPC_introduced_to | log_episode_NPC_introduced_to_current

            # self.obs already holds the new observation at this point
            self.log_episode_mission_string_observed += torch.tensor([
                float(m in o.get("utterance", ''))
                for m, o in zip(self.env.get_mission(), self.obs)
            ], device=self.device, dtype=torch.float)

            for p, done_ in enumerate(done):
                if done_:
                    # flush the accumulators of the finished episode into the history lists
                    self.log_mission_string_observed.append(
                        torch.clamp(self.log_episode_mission_string_observed[p], 0, 1).item()
                    )
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[p].item())
                    self.log_extrinsic_return.append(self.log_episode_extrinsic_return[p].item())
                    self.log_exploration_bonus.append(self.log_episode_exploration_bonus[p].item())
                    self.log_success_rate.append(self.log_episode_success_rate[p].item())
                    self.log_curriculum_max_mean_perf.append(self.log_episode_curriculum_mean_perf[p].item())
                    self.log_reshaped_return.append(self.log_episode_reshaped_return[p].item())
                    self.log_num_frames.append(self.log_episode_num_frames[p].item())
                    self.log_curriculum_param.append(self.log_episode_curriculum_param[p].item())
                    if self.episodic_exploration_bonus:
                        # episodic bonus: forget the visitation counts of the finished episode
                        for v in self.visitation_counter.values():
                            v[p] = Counter()
                    self.images_counter[p] = Counter()
                    self.log_NPC_introduced_to.append(self.log_episode_NPC_introduced_to[p])

            # reset the accumulators of the environments whose episode ended (mask == 0)
            self.log_episode_mission_string_observed *= self.mask
            self.log_episode_return *= self.mask
            self.log_episode_extrinsic_return *= self.mask
            self.log_episode_exploration_bonus *= self.mask
            self.log_episode_success_rate *= self.mask
            self.log_episode_curriculum_mean_perf *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask
            self.log_episode_NPC_introduced_to *= self.mask.cpu().numpy().astype(bool)
            self.log_episode_curriculum_param *= self.mask
        # Add advantage and return to experiences

        # value of the observation following the last collected frame (bootstrap value)
        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                _, next_value, _ = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                _, next_value = self.acmodel(preprocessed_obs)
        # backward recursion over the frames; the last frame uses the bootstrap value
        for f in reversed(range(self.num_frames_per_proc)):
            next_mask = self.masks[f+1] if f < self.num_frames_per_proc - 1 else self.mask
            next_value = self.values[f+1] if f < self.num_frames_per_proc - 1 else next_value
            next_advantage = self.advantages[f+1] if f < self.num_frames_per_proc - 1 else 0

            # stored so that advantages can be recomputed later from the experiences alone
            self.next_masks[f] = next_mask
            self.next_values[f] = next_value

            delta = self.rewards[f] + self.discount * next_value * next_mask - self.values[f]
            self.advantages[f] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

        # Define experiences:
        #   the whole experience is the concatenation of the experience
        #   of each process.
        # In comments below:
        #   - T is self.num_frames_per_proc,
        #   - P is self.num_procs,
        #   - D is the dimensionality.

        exps = DictList()
        exps.obs = [self.obss[f][p]
                    for p in range(self.num_procs)
                    for f in range(self.num_frames_per_proc)]

        exps.infos = np.array([self.infos[f][p]
                    for p in range(self.num_procs)
                    for f in range(self.num_frames_per_proc)])

        # obs: (p1 (f1,f2,f3) ; p2 (f1,f2,f3); p3 (f1,f2,f3)

        if self.acmodel.recurrent:
            # T x P x D -> P x T x D -> (P * T) x D
            exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
            # T x P -> P x T -> (P * T) x 1
            exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
            exps.next_mask = self.next_masks.transpose(0, 1).reshape(-1).unsqueeze(1)

        if self.exploration_bonus and "soc_inf" in self.exploration_bonus_type:
            exps.moa_memory = self.moa_memories.transpose(0, 1).reshape(-1, *self.moa_memories.shape[2:])

        # for all tensors below, T x P -> P x T -> P * T

        exps.action = self.actions.transpose(0, 1).reshape((-1, self.actions.shape[-1]))
        exps.log_prob = self.log_probs.transpose(0, 1).reshape((-1, self.actions.shape[-1]))

        exps.value = self.values.transpose(0, 1).reshape(-1)
        exps.next_value = self.next_values.transpose(0, 1).reshape(-1)
        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
        exps.returnn = exps.value + exps.advantage

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values

        # report at least the last `num_procs` episodes, more if more finished during this rollout
        keep = max(self.log_done_counter, self.num_procs)

        flat_actions = self.actions.reshape(-1, self.actions.shape[-1])
        action_modalities = {
            "action_modality_{}".format(m): flat_actions[:, m].cpu().numpy() for m in range(self.actions.shape[-1])
        }

        if not self.exploration_bonus:
            assert self.log_return == self.log_extrinsic_return

        logs = {
            "return_per_episode": self.log_return[-keep:],
            "mission_string_observed": self.log_mission_string_observed[-keep:],
            "extrinsic_return_per_episode": self.log_extrinsic_return[-keep:],
            "exploration_bonus_per_episode": self.log_exploration_bonus[-keep:],
            "success_rate_per_episode": self.log_success_rate[-keep:],
            "curriculum_max_mean_perf_per_episode": self.log_curriculum_max_mean_perf[-keep:],
            "curriculum_param_per_episode": self.log_curriculum_param[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames,
            "NPC_introduced_to": self.log_NPC_introduced_to[-keep:],
            **action_modalities
        }

        # Only the last `num_procs` entries can ever be reported again
        # (keep >= num_procs), so every history list is trimmed to that size.
        # Previously half of them were never trimmed and grew without bound.
        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_extrinsic_return = self.log_extrinsic_return[-self.num_procs:]
        self.log_exploration_bonus = self.log_exploration_bonus[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]
        self.log_mission_string_observed = self.log_mission_string_observed[-self.num_procs:]
        self.log_success_rate = self.log_success_rate[-self.num_procs:]
        self.log_curriculum_max_mean_perf = self.log_curriculum_max_mean_perf[-self.num_procs:]
        self.log_curriculum_param = self.log_curriculum_param[-self.num_procs:]
        self.log_NPC_introduced_to = self.log_NPC_introduced_to[-self.num_procs:]

        return exps, logs

    def compute_advantages_and_returnn(self, exps):
        """
        Recompute advantages and returns from already collected experiences.

        Intended for algorithms that reuse old data (not online RL) and need to
        recompute non episodic intrinsic rewards on old experience.
        This method is not used in PPO training.

        Reads `exps.reward`, `exps.value`, `exps.next_value` and
        `exps.next_mask`, all laid out environment by environment
        (P blocks of T frames). Note that `collect_experiences` only fills
        `exps.next_mask` for recurrent models.

        Example usage from update_parameters
        advs, retnn = self.compute_advantages_and_returnn(exps)

        # if you want to do a sanity check
        assert torch.equal(exps.advantage, advs)
        assert torch.equal(exps.returnn, retnn)

        exps.advantage, exps.returnn = advs, retnn

        Returns
        -------
        advantage : torch.Tensor
            Flat tensor with T * P entries, in the same layout as the inputs.
        returnn : torch.Tensor
            `exps.value + advantage`.
        """
        n_steps = self.num_frames_per_proc
        n_envs = self.num_procs

        def to_time_major(flat):
            # (P * T) -> P x T -> T x P
            return flat.reshape(n_envs, n_steps).transpose(0, 1)

        rewards = to_time_major(exps.reward)
        values = to_time_major(exps.value)
        next_values = to_time_major(exps.next_value)
        next_masks = to_time_major(exps.next_mask)

        gae = torch.zeros(n_steps, n_envs, device=self.device)

        # Walk backwards in time; `following` is the advantage of frame t + 1
        # (0 past the end of the rollout).
        following = 0
        for t in range(n_steps - 1, -1, -1):
            td_error = rewards[t] + self.discount * next_values[t] * next_masks[t] - values[t]
            gae[t] = td_error + self.discount * self.gae_lambda * following * next_masks[t]
            following = gae[t]

        # T x P -> P x T -> P * T
        advantage = gae.transpose(0, 1).reshape(-1)
        return advantage, exps.value + advantage

    @abstractmethod
    def update_parameters(self):
        """Abstract hook that every concrete algorithm has to override.

        The base body is intentionally empty and returns ``None``.
        """

    def init_rnd_networks_and_optimizer(self):
        """(Re)create the two RND networks and the optimizer of the predictor.

        Sets `self.random_target_network`, `self.predictor_network` and
        `self.intrinsic_reward_optimizer`. Both networks are built from the
        shape of the 'image' observation space and moved to `self.device`.
        Only the predictor's parameters are handed to the optimizer.
        """
        image_shape = self.env.observation_space['image'].shape

        # target first, predictor second (same construction order as always)
        self.random_target_network = MinigridStateEmbeddingNet(image_shape).to(device=self.device)
        self.predictor_network = MinigridStateEmbeddingNet(image_shape).to(device=self.device)

        rmsprop_settings = dict(
            lr=self.intrinsic_reward_learning_rate,
            momentum=self.intrinsic_reward_momentum,
            eps=self.intrinsic_reward_epsilon,
            alpha=self.intrinsic_reward_alpha,
        )
        self.intrinsic_reward_optimizer = torch.optim.RMSprop(
            self.predictor_network.parameters(), **rmsprop_settings
        )

    def init_ride_networks_and_optimizer(self):
        """(Re)create the RIDE networks and one RMSprop optimizer per network.

        Sets `self.state_embedding_model`, `self.forward_dynamics_model`,
        `self.inverse_dynamics_model` and the matching
        `self.state_embedding_optimizer`, `self.forward_dynamics_optimizer`,
        `self.inverse_dynamics_optimizer`. All optimizers share the
        `intrinsic_reward_*` hyperparameters.
        """
        device = self.device

        self.state_embedding_model = MinigridStateEmbeddingNet(
            self.env.observation_space['image'].shape
        ).to(device=device)

        # RIDE only looks at the primitive actions (first action dimension);
        # the linguistic dimensions are left out
        # (the alternative would be self.acmodel.model_raw_action_space.nvec.prod).
        n_actions = self.acmodel.model_raw_action_space.nvec[0]

        self.forward_dynamics_model = MinigridForwardDynamicsNet(n_actions).to(device=device)
        self.inverse_dynamics_model = MinigridInverseDynamicsNet(n_actions).to(device=device)

        def make_rmsprop(network):
            # identical optimizer settings for every RIDE network
            return torch.optim.RMSprop(
                network.parameters(),
                lr=self.intrinsic_reward_learning_rate,
                momentum=self.intrinsic_reward_momentum,
                eps=self.intrinsic_reward_epsilon,
                alpha=self.intrinsic_reward_alpha,
            )

        self.state_embedding_optimizer = make_rmsprop(self.state_embedding_model)
        self.inverse_dynamics_optimizer = make_rmsprop(self.inverse_dynamics_model)
        self.forward_dynamics_optimizer = make_rmsprop(self.forward_dynamics_model)