|
from collections.abc import Sequence |
|
from typing import Any |
|
|
|
import numpy as np |
|
import torch |
|
import torch.nn.functional as F |
|
from torch import nn |
|
|
|
from tianshou.data import Batch, to_torch |
|
from tianshou.utils.net.common import MLP, BaseActor, Net, TActionShape, get_output_dim |
|
|
|
|
|
class Actor(BaseActor): |
|
"""Simple actor network for discrete action spaces. |
|
|
|
:param preprocess_net: a self-defined preprocess_net. Typically, an instance of |
|
:class:`~tianshou.utils.net.common.Net`. |
|
:param action_shape: a sequence of int for the shape of action. |
|
:param hidden_sizes: a sequence of int for constructing the MLP after |
|
        preprocess_net. Default to an empty sequence (in which case the MLP
        contains only a single linear layer).
|
:param softmax_output: whether to apply a softmax layer over the last |
|
layer's output. |
|
:param preprocess_net_output_dim: the output dimension of |
|
`preprocess_net`. Only used when `preprocess_net` does not have the attribute `output_dim`. |
|
|
|
For advanced usage (how to customize the network), please refer to |
|
:ref:`build_the_network`. |
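
    Example (a minimal sketch; the ``Net`` configuration shown is illustrative)::

        import torch
        from tianshou.utils.net.common import Net

        net = Net(state_shape=4, hidden_sizes=[64, 64])
        actor = Actor(net, action_shape=2)
        probs, hidden = actor(torch.randn(8, 4))
        # probs: (8, 2); each row sums to 1 because softmax_output defaults to True.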
|
""" |
|
|
|
def __init__( |
|
self, |
|
preprocess_net: nn.Module | Net, |
|
action_shape: TActionShape, |
|
hidden_sizes: Sequence[int] = (), |
|
softmax_output: bool = True, |
|
preprocess_net_output_dim: int | None = None, |
|
device: str | int | torch.device = "cpu", |
|
) -> None: |
|
super().__init__() |
|
|
|
|
|
self.device = device |
|
self.preprocess = preprocess_net |
|
self.output_dim = int(np.prod(action_shape)) |
|
input_dim = get_output_dim(preprocess_net, preprocess_net_output_dim) |
|
self.last = MLP( |
|
input_dim, |
|
self.output_dim, |
|
hidden_sizes, |
|
device=self.device, |
|
) |
|
self.softmax_output = softmax_output |
|
|
|
def get_preprocess_net(self) -> nn.Module: |
|
return self.preprocess |
|
|
|
def get_output_dim(self) -> int: |
|
return self.output_dim |
|
|
|
def forward( |
|
self, |
|
obs: np.ndarray | torch.Tensor, |
|
state: Any = None, |
|
info: dict[str, Any] | None = None, |
|
) -> tuple[torch.Tensor, torch.Tensor | None]: |
|
r"""Mapping: s_B -> action_values_BA, hidden_state_BH | None. |
|
|
|
Returns a tensor representing the values of each action, i.e, of shape |
|
`(n_actions, )`, and |
|
a hidden state (which may be None). If `self.softmax_output` is True, they are the |
|
probabilities for taking each action. Otherwise, they will be action values. |
|
The hidden state is only |
|
not None if a recurrent net is used as part of the learning algorithm. |
|
""" |
|
x, hidden_BH = self.preprocess(obs, state) |
|
x = self.last(x) |
|
if self.softmax_output: |
|
x = F.softmax(x, dim=-1) |
|
|
|
output_BA = x |
|
return output_BA, hidden_BH |
|
|
|
|
|
class Critic(nn.Module): |
|
"""Simple critic network for discrete action spaces. |
|
|
|
:param preprocess_net: a self-defined preprocess_net. Typically, an instance of |
|
:class:`~tianshou.utils.net.common.Net`. |
|
:param hidden_sizes: a sequence of int for constructing the MLP after |
|
        preprocess_net. Default to an empty sequence (in which case the MLP
        contains only a single linear layer).
|
    :param last_size: the output dimension of the Critic network. Default to 1.
|
:param preprocess_net_output_dim: the output dimension of |
|
`preprocess_net`. Only used when `preprocess_net` does not have the attribute `output_dim`. |
|
|
|
For advanced usage (how to customize the network), please refer to |
|
    :ref:`build_the_network`.
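
    Example (a minimal sketch; the ``Net`` configuration shown is illustrative)::

        import torch
        from tianshou.utils.net.common import Net

        net = Net(state_shape=4, hidden_sizes=[64, 64])
        critic = Critic(net)
        values = critic(torch.randn(8, 4))
        # values: (8, 1) state-value estimates.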
|
""" |
|
|
|
def __init__( |
|
self, |
|
preprocess_net: nn.Module | Net, |
|
hidden_sizes: Sequence[int] = (), |
|
last_size: int = 1, |
|
preprocess_net_output_dim: int | None = None, |
|
device: str | int | torch.device = "cpu", |
|
) -> None: |
|
super().__init__() |
|
self.device = device |
|
self.preprocess = preprocess_net |
|
self.output_dim = last_size |
|
input_dim = get_output_dim(preprocess_net, preprocess_net_output_dim) |
|
self.last = MLP(input_dim, last_size, hidden_sizes, device=self.device) |
|
|
|
|
|
def forward(self, obs: np.ndarray | torch.Tensor, **kwargs: Any) -> torch.Tensor: |
|
"""Mapping: s_B -> V(s)_B.""" |
|
|
|
logits, _ = self.preprocess(obs, state=kwargs.get("state", None)) |
|
return self.last(logits) |
|
|
|
|
|
class CosineEmbeddingNetwork(nn.Module): |
|
"""Cosine embedding network for IQN. Convert a scalar in [0, 1] to a list of n-dim vectors. |
|
|
|
:param num_cosines: the number of cosines used for the embedding. |
|
:param embedding_dim: the dimension of the embedding/output. |
|
|
|
.. note:: |
|
|
|
From https://github.com/ku2482/fqf-iqn-qrdqn.pytorch/blob/master |
|
/fqf_iqn_qrdqn/network.py . |
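
    Example (a minimal sketch with illustrative sizes)::

        import torch

        embed = CosineEmbeddingNetwork(num_cosines=64, embedding_dim=256)
        taus = torch.rand(8, 32)  # 32 fractions per batch element
        out = embed(taus)
        # out: (8, 32, 256) cosine embeddings of the fractions.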
|
""" |
|
|
|
def __init__(self, num_cosines: int, embedding_dim: int) -> None: |
|
super().__init__() |
|
self.net = nn.Sequential(nn.Linear(num_cosines, embedding_dim), nn.ReLU()) |
|
self.num_cosines = num_cosines |
|
self.embedding_dim = embedding_dim |
|
|
|
    def forward(self, taus: torch.Tensor) -> torch.Tensor:
        """Mapping: taus -> embeddings of shape (batch_size, N, embedding_dim)."""
        batch_size = taus.shape[0]
        N = taus.shape[1]
        # i * pi for i = 1, ..., num_cosines, shaped for broadcasting.
        i_pi = np.pi * torch.arange(
            start=1,
            end=self.num_cosines + 1,
            dtype=taus.dtype,
            device=taus.device,
        ).view(1, 1, self.num_cosines)
        # Cosine features cos(i * pi * tau), flattened to (batch_size * N, num_cosines).
        cosines = torch.cos(taus.view(batch_size, N, 1) * i_pi).view(
            batch_size * N,
            self.num_cosines,
        )
        return self.net(cosines).view(batch_size, N, self.embedding_dim)
|
|
|
|
|
class ImplicitQuantileNetwork(Critic): |
|
"""Implicit Quantile Network. |
|
|
|
    :param preprocess_net: a self-defined preprocess_net which outputs a
|
flattened hidden state. |
|
:param action_shape: a sequence of int for the shape of action. |
|
:param hidden_sizes: a sequence of int for constructing the MLP after |
|
        preprocess_net. Default to an empty sequence (in which case the MLP
        contains only a single linear layer).
|
:param num_cosines: the number of cosines to use for cosine embedding. |
|
Default to 64. |
|
:param preprocess_net_output_dim: the output dimension of |
|
preprocess_net. |
|
|
|
.. note:: |
|
|
|
Although this class inherits Critic, it is actually a quantile Q-Network |
|
with output shape (batch_size, action_dim, sample_size). |
|
|
|
        The second item of the first return value is the tau vector.
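
    Example (a minimal sketch; the ``Net`` configuration shown is illustrative)::

        import torch
        from tianshou.utils.net.common import Net

        net = Net(state_shape=4, hidden_sizes=[64, 64])
        iqn = ImplicitQuantileNetwork(net, action_shape=2)
        (q, taus), hidden = iqn(torch.randn(8, 4), sample_size=32)
        # q: (8, 2, 32) quantile values; taus: (8, 32) sampled fractions.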
|
""" |
|
|
|
def __init__( |
|
self, |
|
preprocess_net: nn.Module, |
|
action_shape: TActionShape, |
|
hidden_sizes: Sequence[int] = (), |
|
num_cosines: int = 64, |
|
preprocess_net_output_dim: int | None = None, |
|
device: str | int | torch.device = "cpu", |
|
) -> None: |
|
last_size = int(np.prod(action_shape)) |
|
super().__init__(preprocess_net, hidden_sizes, last_size, preprocess_net_output_dim, device) |
|
self.input_dim = get_output_dim(preprocess_net, preprocess_net_output_dim) |
|
self.embed_model = CosineEmbeddingNetwork(num_cosines, self.input_dim).to( |
|
device, |
|
) |
|
|
|
def forward( |
|
self, |
|
obs: np.ndarray | torch.Tensor, |
|
sample_size: int, |
|
**kwargs: Any, |
|
) -> tuple[Any, torch.Tensor]: |
|
r"""Mapping: s -> Q(s, \*).""" |
|
logits, hidden = self.preprocess(obs, state=kwargs.get("state", None)) |
|
|
|
batch_size = logits.size(0) |
|
taus = torch.rand(batch_size, sample_size, dtype=logits.dtype, device=logits.device) |
|
embedding = (logits.unsqueeze(1) * self.embed_model(taus)).view( |
|
batch_size * sample_size, |
|
-1, |
|
) |
|
out = self.last(embedding).view(batch_size, sample_size, -1).transpose(1, 2) |
|
return (out, taus), hidden |
|
|
|
|
|
class FractionProposalNetwork(nn.Module): |
|
"""Fraction proposal network for FQF. |
|
|
|
    :param num_fractions: the number of fractions to propose.
|
:param embedding_dim: the dimension of the embedding/input. |
|
|
|
.. note:: |
|
|
|
Adapted from https://github.com/ku2482/fqf-iqn-qrdqn.pytorch/blob/master |
|
/fqf_iqn_qrdqn/network.py . |
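
    Example (a minimal sketch with illustrative sizes)::

        import torch

        fpn = FractionProposalNetwork(num_fractions=32, embedding_dim=64)
        taus, tau_hats, entropies = fpn(torch.randn(8, 64))
        # taus: (8, 33) monotone fractions with taus[:, 0] = 0 and taus[:, -1] = 1;
        # tau_hats: (8, 32) midpoints; entropies: (8,).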
|
""" |
|
|
|
def __init__(self, num_fractions: int, embedding_dim: int) -> None: |
|
super().__init__() |
|
self.net = nn.Linear(embedding_dim, num_fractions) |
|
torch.nn.init.xavier_uniform_(self.net.weight, gain=0.01) |
|
torch.nn.init.constant_(self.net.bias, 0) |
|
self.num_fractions = num_fractions |
|
self.embedding_dim = embedding_dim |
|
|
|
def forward( |
|
self, |
|
obs_embeddings: torch.Tensor, |
|
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Mapping: obs_embeddings -> taus, tau_hats, entropies.

        Proposes monotone fractions 0 = tau_0 < tau_1 < ... < tau_N = 1 from the
        observation embeddings, together with their midpoints and the entropy of
        the proposal distribution.
        """
        dist = torch.distributions.Categorical(logits=self.net(obs_embeddings))
        # tau_i (i = 1, ..., N): cumulative sum of the proposal probabilities.
        taus_1_N = torch.cumsum(dist.probs, dim=1)
        # Prepend tau_0 = 0, giving taus of shape (batch_size, N + 1).
        taus = F.pad(taus_1_N, (1, 0))
        # hat{tau}_i (i = 0, ..., N - 1): midpoints of adjacent fractions.
        tau_hats = (taus[:, :-1] + taus[:, 1:]).detach() / 2.0
        # Entropy of the proposal distribution (used as a regularizer in FQF).
        entropies = dist.entropy()
|
return taus, tau_hats, entropies |
|
|
|
|
|
class FullQuantileFunction(ImplicitQuantileNetwork): |
|
"""Full(y parameterized) Quantile Function. |
|
|
|
    :param preprocess_net: a self-defined preprocess_net which outputs a
|
flattened hidden state. |
|
:param action_shape: a sequence of int for the shape of action. |
|
:param hidden_sizes: a sequence of int for constructing the MLP after |
|
        preprocess_net. Default to an empty sequence (in which case the MLP
        contains only a single linear layer).
|
:param num_cosines: the number of cosines to use for cosine embedding. |
|
Default to 64. |
|
:param preprocess_net_output_dim: the output dimension of |
|
preprocess_net. |
|
|
|
.. note:: |
|
|
|
The first return value is a tuple of (quantiles, fractions, quantiles_tau), |
|
where fractions is a Batch(taus, tau_hats, entropies). |
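
    Example (a minimal sketch; the ``Net`` configuration shown is illustrative)::

        import torch
        from tianshou.utils.net.common import Net

        net = Net(state_shape=4, hidden_sizes=[64, 64])
        fqf = FullQuantileFunction(net, action_shape=2)
        fpn = FractionProposalNetwork(num_fractions=32, embedding_dim=64)
        (q, fractions, q_tau), hidden = fqf(torch.randn(8, 4), fpn)
        # q: (8, 2, 32) quantiles at the proposed midpoints; in training mode
        # q_tau additionally holds quantiles at the inner fractions.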
|
""" |
|
|
|
def __init__( |
|
self, |
|
preprocess_net: nn.Module, |
|
action_shape: TActionShape, |
|
hidden_sizes: Sequence[int] = (), |
|
num_cosines: int = 64, |
|
preprocess_net_output_dim: int | None = None, |
|
device: str | int | torch.device = "cpu", |
|
) -> None: |
|
super().__init__( |
|
preprocess_net, |
|
action_shape, |
|
hidden_sizes, |
|
num_cosines, |
|
preprocess_net_output_dim, |
|
device, |
|
) |
|
|
|
def _compute_quantiles(self, obs: torch.Tensor, taus: torch.Tensor) -> torch.Tensor: |
|
batch_size, sample_size = taus.shape |
|
embedding = (obs.unsqueeze(1) * self.embed_model(taus)).view(batch_size * sample_size, -1) |
|
return self.last(embedding).view(batch_size, sample_size, -1).transpose(1, 2) |
|
|
|
def forward( |
|
self, |
|
obs: np.ndarray | torch.Tensor, |
|
propose_model: FractionProposalNetwork, |
|
fractions: Batch | None = None, |
|
**kwargs: Any, |
|
) -> tuple[Any, torch.Tensor]: |
|
r"""Mapping: s -> Q(s, \*).""" |
|
logits, hidden = self.preprocess(obs, state=kwargs.get("state", None)) |
|
|
|
if fractions is None: |
|
taus, tau_hats, entropies = propose_model(logits.detach()) |
|
fractions = Batch(taus=taus, tau_hats=tau_hats, entropies=entropies) |
|
else: |
|
taus, tau_hats = fractions.taus, fractions.tau_hats |
|
quantiles = self._compute_quantiles(logits, tau_hats) |
|
|
|
quantiles_tau = None |
|
if self.training: |
|
with torch.no_grad(): |
|
quantiles_tau = self._compute_quantiles(logits, taus[:, 1:-1]) |
|
return (quantiles, fractions, quantiles_tau), hidden |
|
|
|
|
class FullQuantileFunctionRainbow(ImplicitQuantileNetwork): |
|
"""Full(y parameterized) Quantile Function with Noisy Networks and Dueling option. |
|
|
|
    :param preprocess_net: a self-defined preprocess_net which outputs a
|
flattened hidden state. |
|
:param action_shape: a sequence of int for the shape of action. |
|
:param hidden_sizes: a sequence of int for constructing the MLP after |
|
        preprocess_net. Default to an empty sequence (in which case the MLP
        contains only a single linear layer).
|
:param num_cosines: the number of cosines to use for cosine embedding. |
|
Default to 64. |
|
:param preprocess_net_output_dim: the output dimension of |
|
preprocess_net. |
|
:param noisy_std: standard deviation for NoisyLinear layers. Default to 0.5. |
|
    :param is_noisy: whether to use noisy layers. Default to True.
    :param is_dueling: whether to use a dueling head that combines a state-value
        stream with the advantage stream. Default to True.
|
|
|
.. note:: |
|
|
|
The first return value is a tuple of (quantiles, fractions, quantiles_tau), |
|
where fractions is a Batch(taus, tau_hats, entropies). |
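
    Example (a minimal sketch; the ``Net`` configuration shown is illustrative)::

        import torch
        from tianshou.utils.net.common import Net

        net = Net(state_shape=4, hidden_sizes=[64, 64])
        fqf = FullQuantileFunctionRainbow(
            net, action_shape=2, preprocess_net_output_dim=64
        )
        fpn = FractionProposalNetwork(num_fractions=32, embedding_dim=64)
        (q, fractions, q_tau), hidden = fqf(torch.randn(8, 4), fpn)
        # q: (8, 2, 32) quantiles from the noisy, dueling heads.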
|
""" |
|
|
|
def __init__( |
|
self, |
|
preprocess_net: nn.Module, |
|
action_shape: TActionShape, |
|
hidden_sizes: Sequence[int] = (), |
|
num_cosines: int = 64, |
|
preprocess_net_output_dim: int | None = None, |
|
device: str | int | torch.device = "cpu", |
|
noisy_std: float = 0.5, |
|
is_noisy: bool = True, |
|
        is_dueling: bool = True,
|
) -> None: |
|
super().__init__( |
|
preprocess_net, |
|
action_shape, |
|
hidden_sizes, |
|
num_cosines, |
|
preprocess_net_output_dim, |
|
device, |
|
) |
|
|
|
        self.action_shape = action_shape
        self.noisy_std = noisy_std
        self.is_noisy = is_noisy
        self.is_dueling = is_dueling

        def linear(x: int, y: int) -> nn.Module:
            # Use noisy linear layers (NoisyNet) when requested, else plain Linear.
            if self.is_noisy:
                return NoisyLinear(x, y, self.noisy_std)
            return nn.Linear(x, y)

        # self.input_dim was set by ImplicitQuantileNetwork.__init__ via
        # get_output_dim, so preprocess_net_output_dim is only required when
        # preprocess_net does not expose an output_dim attribute.
        self.advantage_net = nn.Sequential(
            linear(self.input_dim, 512),
            nn.ReLU(inplace=True),
            linear(512, self.output_dim),
        )
        if self.is_dueling:
            self.value_net = nn.Sequential(
                linear(self.input_dim, 512),
                nn.ReLU(inplace=True),
                linear(512, 1),
            )
|
|
|
    def _compute_quantiles(self, obs: torch.Tensor, taus: torch.Tensor) -> torch.Tensor:
        batch_size, sample_size = taus.shape
        # IQN-style Hadamard product of state embedding and fraction embedding.
        embedding = (obs.unsqueeze(1) * self.embed_model(taus)).view(batch_size * sample_size, -1)
        # Advantage stream, reshaped to (batch_size, action_dim, sample_size).
        advantage = self.advantage_net(embedding).view(batch_size, sample_size, -1).transpose(1, 2)
        if self.is_dueling:
            # State-value stream, broadcast over the action dimension.
            value = self.value_net(embedding).view(batch_size, sample_size, 1).transpose(1, 2)
            # Subtract the mean advantage so the dueling decomposition is identifiable.
            quantiles = value + (advantage - advantage.mean(dim=1, keepdim=True))
        else:
            quantiles = advantage
        return quantiles
|
|
def forward( |
|
self, |
|
obs: np.ndarray | torch.Tensor, |
|
propose_model: FractionProposalNetwork, |
|
fractions: Batch | None = None, |
|
**kwargs: Any, |
|
) -> tuple[Any, torch.Tensor]: |
|
r"""Mapping: s -> Q(s, \*).""" |
|
logits, hidden = self.preprocess(obs, state=kwargs.get("state", None)) |
|
|
|
if fractions is None: |
|
taus, tau_hats, entropies = propose_model(logits.detach()) |
|
fractions = Batch(taus=taus, tau_hats=tau_hats, entropies=entropies) |
|
else: |
|
taus, tau_hats = fractions.taus, fractions.tau_hats |
|
quantiles = self._compute_quantiles(logits, tau_hats) |
|
|
|
quantiles_tau = None |
|
if self.training: |
|
with torch.no_grad(): |
|
quantiles_tau = self._compute_quantiles(logits, taus[:, 1:-1]) |
|
return (quantiles, fractions, quantiles_tau), hidden |
|
|
|
|
|
class NoisyLinear(nn.Module): |
|
"""Implementation of Noisy Networks. arXiv:1706.10295. |
|
|
|
:param in_features: the number of input features. |
|
:param out_features: the number of output features. |
|
:param noisy_std: initial standard deviation of noisy linear layers. |
|
|
|
.. note:: |
|
|
|
Adapted from https://github.com/ku2482/fqf-iqn-qrdqn.pytorch/blob/master |
|
/fqf_iqn_qrdqn/network.py . |
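
    Example (a minimal sketch with illustrative sizes)::

        import torch

        layer = NoisyLinear(128, 64, noisy_std=0.5)
        out = layer(torch.randn(8, 128))  # stochastic weights in training mode
        layer.sample()                    # draw fresh factorized noise
        layer.eval()
        out = layer(torch.randn(8, 128))  # deterministic mean weights in eval mode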
|
""" |
|
|
|
def __init__(self, in_features: int, out_features: int, noisy_std: float = 0.5) -> None: |
|
super().__init__() |
|
|
|
|
|
self.mu_W = nn.Parameter(torch.FloatTensor(out_features, in_features)) |
|
self.sigma_W = nn.Parameter(torch.FloatTensor(out_features, in_features)) |
|
self.mu_bias = nn.Parameter(torch.FloatTensor(out_features)) |
|
self.sigma_bias = nn.Parameter(torch.FloatTensor(out_features)) |
|
|
|
|
|
self.register_buffer("eps_p", torch.FloatTensor(in_features)) |
|
self.register_buffer("eps_q", torch.FloatTensor(out_features)) |
|
|
|
self.in_features = in_features |
|
self.out_features = out_features |
|
self.sigma = noisy_std |
|
|
|
self.reset() |
|
self.sample() |
|
|
|
def reset(self) -> None: |
|
bound = 1 / np.sqrt(self.in_features) |
|
self.mu_W.data.uniform_(-bound, bound) |
|
self.mu_bias.data.uniform_(-bound, bound) |
|
self.sigma_W.data.fill_(self.sigma / np.sqrt(self.in_features)) |
|
self.sigma_bias.data.fill_(self.sigma / np.sqrt(self.in_features)) |
|
|
|
    def f(self, x: torch.Tensor) -> torch.Tensor:
        # Factorized Gaussian noise: f(x) = sign(x) * sqrt(|x|) on fresh samples.
        x = torch.randn(x.size(0), device=x.device)
        return x.sign().mul_(x.abs().sqrt_())

    def sample(self) -> None:
        """Draw fresh factorized noise for the input and output dimensions."""
        self.eps_p.copy_(self.f(self.eps_p))
        self.eps_q.copy_(self.f(self.eps_q))
|
|
|
def forward(self, x: torch.Tensor) -> torch.Tensor: |
|
if self.training: |
|
weight = self.mu_W + self.sigma_W * (self.eps_q.ger(self.eps_p)) |
|
bias = self.mu_bias + self.sigma_bias * self.eps_q.clone() |
|
else: |
|
weight = self.mu_W |
|
bias = self.mu_bias |
|
|
|
return F.linear(x, weight, bias) |
|
|
|
|
|
class IntrinsicCuriosityModule(nn.Module): |
|
"""Implementation of Intrinsic Curiosity Module. arXiv:1705.05363. |
|
|
|
    :param feature_net: a self-defined feature_net which outputs a
|
flattened hidden state. |
|
    :param feature_dim: output dimension of the feature net (the size of the
        encoded state).
|
:param action_dim: dimension of the action space. |
|
:param hidden_sizes: hidden layer sizes for forward and inverse models. |
|
:param device: device for the module. |
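
    Example (a minimal sketch; the feature net shown is illustrative)::

        import torch
        from torch import nn

        feature_net = nn.Sequential(nn.Flatten(), nn.Linear(4, 32))
        icm = IntrinsicCuriosityModule(feature_net, feature_dim=32, action_dim=2)
        s1, s2 = torch.randn(8, 4), torch.randn(8, 4)
        act = torch.randint(0, 2, (8,))
        mse_loss, act_hat = icm(s1, act, s2)
        # mse_loss: (8,) forward-model errors (intrinsic reward signal);
        # act_hat: (8, 2) inverse-model action logits.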
|
""" |
|
|
|
def __init__( |
|
self, |
|
feature_net: nn.Module, |
|
feature_dim: int, |
|
action_dim: int, |
|
hidden_sizes: Sequence[int] = (), |
|
device: str | torch.device = "cpu", |
|
) -> None: |
|
super().__init__() |
|
self.feature_net = feature_net |
|
self.forward_model = MLP( |
|
feature_dim + action_dim, |
|
output_dim=feature_dim, |
|
hidden_sizes=hidden_sizes, |
|
device=device, |
|
) |
|
self.inverse_model = MLP( |
|
feature_dim * 2, |
|
output_dim=action_dim, |
|
hidden_sizes=hidden_sizes, |
|
device=device, |
|
) |
|
self.feature_dim = feature_dim |
|
self.action_dim = action_dim |
|
self.device = device |
|
|
|
def forward( |
|
self, |
|
s1: np.ndarray | torch.Tensor, |
|
act: np.ndarray | torch.Tensor, |
|
s2: np.ndarray | torch.Tensor, |
|
**kwargs: Any, |
|
) -> tuple[torch.Tensor, torch.Tensor]: |
|
r"""Mapping: s1, act, s2 -> mse_loss, act_hat.""" |
|
s1 = to_torch(s1, dtype=torch.float32, device=self.device) |
|
s2 = to_torch(s2, dtype=torch.float32, device=self.device) |
|
phi1, phi2 = self.feature_net(s1), self.feature_net(s2) |
|
act = to_torch(act, dtype=torch.long, device=self.device) |
|
phi2_hat = self.forward_model( |
|
torch.cat([phi1, F.one_hot(act, num_classes=self.action_dim)], dim=1), |
|
) |
|
mse_loss = 0.5 * F.mse_loss(phi2_hat, phi2, reduction="none").sum(1) |
|
act_hat = self.inverse_model(torch.cat([phi1, phi2], dim=1)) |
|
return mse_loss, act_hat |
|
|