GameServerX

Running

App Files Files Community

GameServerX / MLPY /Lib /site-packages /mlagents /trainers /torch_entities /attention.py

Kano001

Upload 280 files

e11e4fe verified 3 months ago

raw

history blame

11.9 kB

	from mlagents.torch_utils import torch
	import warnings
	from typing import Tuple, Optional, List
	from mlagents.trainers.torch_entities.layers import (
	LinearEncoder,
	Initialization,
	linear_layer,
	LayerNorm,
	)
	from mlagents.trainers.torch_entities.model_serialization import exporting_to_onnx
	from mlagents.trainers.exception import UnityTrainerException


	def get_zero_entities_mask(entities: List[torch.Tensor]) -> List[torch.Tensor]:
	"""
	Takes a List of Tensors and returns a List of mask Tensor with 1 if the input was
	all zeros (on dimension 2) and 0 otherwise. This is used in the Attention
	layer to mask the padding observations.
	"""
	with torch.no_grad():

	if exporting_to_onnx.is_exporting():
	with warnings.catch_warnings():
	# We ignore a TracerWarning from PyTorch that warns that doing
	# shape[n].item() will cause the trace to be incorrect (the trace might
	# not generalize to other inputs)
	# We ignore this warning because we know the model will always be
	# run with inputs of the same shape
	warnings.simplefilter("ignore")
	# When exporting to ONNX, we want to transpose the entities. This is
	# because ONNX only support input in NCHW (channel first) format.
	# Barracuda also expect to get data in NCHW.
	entities = [
	torch.transpose(obs, 2, 1).reshape(
	-1, obs.shape[1].item(), obs.shape[2].item()
	)
	for obs in entities
	]

	# Generate the masking tensors for each entities tensor (mask only if all zeros)
	key_masks: List[torch.Tensor] = [
	(torch.sum(ent**2, axis=2) < 0.01).float() for ent in entities
	]
	return key_masks


	class MultiHeadAttention(torch.nn.Module):

	NEG_INF = -1e6

	def __init__(self, embedding_size: int, num_heads: int):
	"""
	Multi Head Attention module. We do not use the regular Torch implementation since
	Barracuda does not support some operators it uses.
	Takes as input to the forward method 3 tensors:
	- query: of dimensions (batch_size, number_of_queries, embedding_size)
	- key: of dimensions (batch_size, number_of_keys, embedding_size)
	- value: of dimensions (batch_size, number_of_keys, embedding_size)
	The forward method will return 2 tensors:
	- The output: (batch_size, number_of_queries, embedding_size)
	- The attention matrix: (batch_size, num_heads, number_of_queries, number_of_keys)
	:param embedding_size: The size of the embeddings that will be generated (should be
	dividable by the num_heads)
	:param total_max_elements: The maximum total number of entities that can be passed to
	the module
	:param num_heads: The number of heads of the attention module
	"""
	super().__init__()
	self.n_heads = num_heads
	self.head_size: int = embedding_size // self.n_heads
	self.embedding_size: int = self.head_size * self.n_heads

	def forward(
	self,
	query: torch.Tensor,
	key: torch.Tensor,
	value: torch.Tensor,
	n_q: int,
	n_k: int,
	key_mask: Optional[torch.Tensor] = None,
	) -> Tuple[torch.Tensor, torch.Tensor]:
	b = -1 # the batch size

	query = query.reshape(
	b, n_q, self.n_heads, self.head_size
	) # (b, n_q, h, emb / h)
	key = key.reshape(b, n_k, self.n_heads, self.head_size) # (b, n_k, h, emb / h)
	value = value.reshape(
	b, n_k, self.n_heads, self.head_size
	) # (b, n_k, h, emb / h)

	query = query.permute([0, 2, 1, 3]) # (b, h, n_q, emb / h)
	# The next few lines are equivalent to : key.permute([0, 2, 3, 1])
	# This is a hack, ONNX will compress two permute operations and
	# Barracuda will not like seeing `permute([0,2,3,1])`
	key = key.permute([0, 2, 1, 3]) # (b, h, emb / h, n_k)
	key -= 1
	key += 1
	key = key.permute([0, 1, 3, 2]) # (b, h, emb / h, n_k)

	qk = torch.matmul(query, key) # (b, h, n_q, n_k)

	if key_mask is None:
	qk = qk / (self.embedding_size**0.5)
	else:
	key_mask = key_mask.reshape(b, 1, 1, n_k)
	qk = (1 - key_mask) * qk / (
	self.embedding_size**0.5
	) + key_mask * self.NEG_INF

	att = torch.softmax(qk, dim=3) # (b, h, n_q, n_k)

	value = value.permute([0, 2, 1, 3]) # (b, h, n_k, emb / h)
	value_attention = torch.matmul(att, value) # (b, h, n_q, emb / h)

	value_attention = value_attention.permute([0, 2, 1, 3]) # (b, n_q, h, emb / h)
	value_attention = value_attention.reshape(
	b, n_q, self.embedding_size
	) # (b, n_q, emb)

	return value_attention, att


	class EntityEmbedding(torch.nn.Module):
	"""
	A module used to embed entities before passing them to a self-attention block.
	Used in conjunction with ResidualSelfAttention to encode information about a self
	and additional entities. Can also concatenate self to entities for ego-centric self-
	attention. Inspired by architecture used in https://arxiv.org/pdf/1909.07528.pdf.
	"""

	def __init__(
	self,
	entity_size: int,
	entity_num_max_elements: Optional[int],
	embedding_size: int,
	):
	"""
	Constructs an EntityEmbedding module.
	:param x_self_size: Size of "self" entity.
	:param entity_size: Size of other entities.
	:param entity_num_max_elements: Maximum elements for a given entity, None for unrestricted.
	Needs to be assigned in order for model to be exportable to ONNX and Barracuda.
	:param embedding_size: Embedding size for the entity encoder.
	:param concat_self: Whether to concatenate x_self to entities. Set True for ego-centric
	self-attention.
	"""
	super().__init__()
	self.self_size: int = 0
	self.entity_size: int = entity_size
	self.entity_num_max_elements: int = -1
	if entity_num_max_elements is not None:
	self.entity_num_max_elements = entity_num_max_elements
	self.embedding_size = embedding_size
	# Initialization scheme from http://www.cs.toronto.edu/~mvolkovs/ICML2020_tfixup.pdf
	self.self_ent_encoder = LinearEncoder(
	self.entity_size,
	1,
	self.embedding_size,
	kernel_init=Initialization.Normal,
	kernel_gain=(0.125 / self.embedding_size) ** 0.5,
	)

	def add_self_embedding(self, size: int) -> None:
	self.self_size = size
	self.self_ent_encoder = LinearEncoder(
	self.self_size + self.entity_size,
	1,
	self.embedding_size,
	kernel_init=Initialization.Normal,
	kernel_gain=(0.125 / self.embedding_size) ** 0.5,
	)

	def forward(self, x_self: torch.Tensor, entities: torch.Tensor) -> torch.Tensor:
	num_entities = self.entity_num_max_elements
	if num_entities < 0:
	if exporting_to_onnx.is_exporting():
	raise UnityTrainerException(
	"Trying to export an attention mechanism that doesn't have a set max \
	number of elements."
	)
	num_entities = entities.shape[1]

	if exporting_to_onnx.is_exporting():
	# When exporting to ONNX, we want to transpose the entities. This is
	# because ONNX only support input in NCHW (channel first) format.
	# Barracuda also expect to get data in NCHW.
	entities = torch.transpose(entities, 2, 1).reshape(
	-1, num_entities, self.entity_size
	)

	if self.self_size > 0:
	expanded_self = x_self.reshape(-1, 1, self.self_size)
	expanded_self = torch.cat([expanded_self] * num_entities, dim=1)
	# Concatenate all observations with self
	entities = torch.cat([expanded_self, entities], dim=2)
	# Encode entities
	encoded_entities = self.self_ent_encoder(entities)
	return encoded_entities


	class ResidualSelfAttention(torch.nn.Module):
	"""
	Residual self attentioninspired from https://arxiv.org/pdf/1909.07528.pdf. Can be used
	with an EntityEmbedding module, to apply multi head self attention to encode information
	about a "Self" and a list of relevant "Entities".
	"""

	EPSILON = 1e-7

	def __init__(
	self,
	embedding_size: int,
	entity_num_max_elements: Optional[int] = None,
	num_heads: int = 4,
	):
	"""
	Constructs a ResidualSelfAttention module.
	:param embedding_size: Embedding sizee for attention mechanism and
	Q, K, V encoders.
	:param entity_num_max_elements: A List of ints representing the maximum number
	of elements in an entity sequence. Should be of length num_entities. Pass None to
	not restrict the number of elements; however, this will make the module
	unexportable to ONNX/Barracuda.
	:param num_heads: Number of heads for Multi Head Self-Attention
	"""
	super().__init__()
	self.max_num_ent: Optional[int] = None
	if entity_num_max_elements is not None:
	self.max_num_ent = entity_num_max_elements

	self.attention = MultiHeadAttention(
	num_heads=num_heads, embedding_size=embedding_size
	)

	# Initialization scheme from http://www.cs.toronto.edu/~mvolkovs/ICML2020_tfixup.pdf
	self.fc_q = linear_layer(
	embedding_size,
	embedding_size,
	kernel_init=Initialization.Normal,
	kernel_gain=(0.125 / embedding_size) ** 0.5,
	)
	self.fc_k = linear_layer(
	embedding_size,
	embedding_size,
	kernel_init=Initialization.Normal,
	kernel_gain=(0.125 / embedding_size) ** 0.5,
	)
	self.fc_v = linear_layer(
	embedding_size,
	embedding_size,
	kernel_init=Initialization.Normal,
	kernel_gain=(0.125 / embedding_size) ** 0.5,
	)
	self.fc_out = linear_layer(
	embedding_size,
	embedding_size,
	kernel_init=Initialization.Normal,
	kernel_gain=(0.125 / embedding_size) ** 0.5,
	)
	self.embedding_norm = LayerNorm()
	self.residual_norm = LayerNorm()

	def forward(self, inp: torch.Tensor, key_masks: List[torch.Tensor]) -> torch.Tensor:
	# Gather the maximum number of entities information
	mask = torch.cat(key_masks, dim=1)

	inp = self.embedding_norm(inp)
	# Feed to self attention
	query = self.fc_q(inp) # (b, n_q, emb)
	key = self.fc_k(inp) # (b, n_k, emb)
	value = self.fc_v(inp) # (b, n_k, emb)

	# Only use max num if provided
	if self.max_num_ent is not None:
	num_ent = self.max_num_ent
	else:
	num_ent = inp.shape[1]
	if exporting_to_onnx.is_exporting():
	raise UnityTrainerException(
	"Trying to export an attention mechanism that doesn't have a set max \
	number of elements."
	)

	output, _ = self.attention(query, key, value, num_ent, num_ent, mask)
	# Residual
	output = self.fc_out(output) + inp
	output = self.residual_norm(output)
	# Average Pooling
	numerator = torch.sum(output * (1 - mask).reshape(-1, num_ent, 1), dim=1)
	denominator = torch.sum(1 - mask, dim=1, keepdim=True) + self.EPSILON
	output = numerator / denominator
	return output