debug special token

0a8807e 4 months ago

6.67 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the license found in the
	# LICENSE file in the root directory of this source tree.


	from contextlib import contextmanager
	from functools import wraps, lru_cache
	import hashlib
	import json
	import logging
	from pathlib import Path
	import typing as tp

	import flashy
	import flashy.distrib
	import omegaconf
	import torch
	from torch.nn.utils.rnn import pad_sequence


	logger = logging.getLogger(__name__)


	def model_hash(model: torch.nn.Module) -> str:
	    """Compute a SHA-1 fingerprint of a model's parameters.

	    The raw bytes of every parameter are fed to the hash in the order given by
	    `model.parameters()`. Comparing this value across the logs of past
	    experiments helps track regressions in model init.

	    Args:
	        model (torch.nn.Module): Model whose parameters are hashed.
	    Returns:
	        str: Hexadecimal SHA-1 digest of the parameter bytes.
	    """
	    digest = hashlib.sha1()
	    for param in model.parameters():
	        # Go through a CPU numpy array to get at the raw parameter bytes.
	        raw_bytes = param.data.cpu().numpy().tobytes()
	        digest.update(raw_bytes)
	    return digest.hexdigest()


	def dict_from_config(cfg: omegaconf.DictConfig) -> dict:
	    """Turn an omegaconf configuration into a plain dictionary.

	    The conversion is done with `resolve=True`.

	    Args:
	        cfg (omegaconf.DictConfig): Original configuration to map to dict.
	    Returns:
	        dict: Config as dictionary object.
	    """
	    container = omegaconf.OmegaConf.to_container(cfg, resolve=True)
	    # Narrow the container type: callers are promised a dict.
	    assert isinstance(container, dict)
	    return container


	def random_subset(dataset, max_samples: int, seed: int = 42) -> torch.utils.data.Subset:
	    """Pick a reproducible random subset of at most `max_samples` items.

	    Args:
	        dataset: Dataset to sample from; must support `len()`.
	        max_samples (int): Maximum number of samples to keep.
	        seed (int): Seed of the generator used to draw the permutation.
	    Returns:
	        The dataset itself (not wrapped) when it holds `max_samples` items or
	        fewer, otherwise a `torch.utils.data.Subset` over `max_samples`
	        randomly chosen indices.
	    """
	    total = len(dataset)
	    if total <= max_samples:
	        # Nothing to drop: hand the dataset back unchanged.
	        return dataset

	    rng = torch.Generator().manual_seed(seed)
	    shuffled = torch.randperm(total, generator=rng)
	    kept_indices = shuffled[:max_samples].tolist()
	    return torch.utils.data.Subset(dataset, kept_indices)


	def get_loader(dataset, num_samples: tp.Optional[int], batch_size: int,
	               num_workers: int, seed: int, **kwargs) -> torch.utils.data.DataLoader:
	    """Build a dataloader for a dataset, optionally restricted to a random subset.

	    Args:
	        dataset: Dataset to load.
	        num_samples (Optional[int]): Number of samples to limit subset size.
	            When None, the dataset is used as is.
	        batch_size (int): Batch size.
	        num_workers (int): Number of workers for data loading.
	        seed (int): Random seed used when drawing the subset.
	        **kwargs: Extra keyword arguments forwarded to `flashy.distrib.loader`.
	    Returns:
	        torch.utils.data.DataLoader: The loader built by `flashy.distrib.loader`.
	    """
	    if num_samples is None:
	        source = dataset
	    else:
	        source = random_subset(dataset, num_samples, seed)

	    return flashy.distrib.loader(
	        source,
	        batch_size=batch_size,
	        num_workers=num_workers,
	        **kwargs
	    )


	def get_dataset_from_loader(dataloader):
	    """Return the dataset behind a dataloader, unwrapping one `Subset` level.

	    Args:
	        dataloader: Object exposing a `dataset` attribute.
	    Returns:
	        The wrapped dataset when `dataloader.dataset` is a
	        `torch.utils.data.Subset`, otherwise `dataloader.dataset` itself.
	    """
	    source = dataloader.dataset
	    return source.dataset if isinstance(source, torch.utils.data.Subset) else source


	def multinomial(input: torch.Tensor, num_samples: int, replacement=False, *, generator=None):
	    """torch.multinomial with arbitrary number of dimensions, and number of candidates on the last dimension.

	    All leading dimensions are flattened into one before sampling and
	    restored afterwards.

	    Args:
	        input (torch.Tensor): The input tensor containing probabilities.
	        num_samples (int): Number of samples to draw.
	        replacement (bool): Whether to draw with replacement or not.
	    Keywords args:
	        generator (torch.Generator): A pseudorandom number generator for sampling.
	    Returns:
	        torch.Tensor: Last dimension contains num_samples indices
	            sampled from the multinomial probability distribution
	            located in the last dimension of tensor input.
	    """
	    leading_shape = tuple(input.shape[:-1])
	    # Collapse every leading dimension into a single batch dimension.
	    flat_probs = input.reshape(-1, input.shape[-1])
	    flat_samples = torch.multinomial(
	        flat_probs,
	        num_samples=num_samples,
	        replacement=replacement,
	        generator=generator,
	    )
	    # Restore the leading dimensions; the last one holds the drawn indices.
	    return flat_samples.reshape(*leading_shape, -1)


	def sample_top_k(probs: torch.Tensor, k: int) -> torch.Tensor:
	    """Sample next token from top K values along the last dimension of the input probs tensor.

	    Every candidate whose probability is below the k-th largest one is given
	    probability zero, the remaining probabilities are renormalized to sum to
	    one, and a single token is drawn from the result. The input tensor is
	    left untouched.

	    Args:
	        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
	        k (int): The k in "top-k".
	    Returns:
	        torch.Tensor: Sampled tokens, with a last dimension of size 1.
	    """
	    top_k_values, _ = torch.topk(probs, k, dim=-1)
	    # topk returns values in descending order, so the last one is the k-th largest.
	    threshold = top_k_values[..., [-1]]
	    # Out-of-place filtering so the caller's tensor is not modified.
	    filtered = probs.masked_fill(probs < threshold, 0.0)
	    # multinomial expects a distribution: rescale the survivors to sum to 1.
	    filtered = filtered / filtered.sum(dim=-1, keepdim=True)
	    return multinomial(filtered, num_samples=1)





	def length_to_mask(lengths: torch.Tensor, max_len: tp.Optional[int] = None) -> torch.Tensor:
	    """Utility function to convert a tensor of sequence lengths to a mask (useful when working on padded sequences).
	    For example: [3, 5] => [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]

	    Args:
	        lengths (torch.Tensor): 1-dimensional tensor with lengths.
	        max_len (int): can set the max length manually. Defaults to None, in which
	            case (as for 0) the largest value in `lengths` is used.
	    Returns:
	        torch.Tensor: boolean mask of shape [len(lengths), final length], False
	            where there is pad tokens else True. The final length is never
	            below 1.
	    """
	    assert len(lengths.shape) == 1, "Length shape should be 1 dimensional."
	    if max_len:
	        final_length = max_len
	    elif lengths.numel() == 0:
	        # max() is not defined on an empty tensor; treat an empty batch as length 0.
	        final_length = 0
	    else:
	        final_length = lengths.max().item()
	    final_length = max(final_length, 1)  # if all seqs are of len zero we don't want a zero-size tensor
	    positions = torch.arange(final_length, device=lengths.device)
	    # Broadcast [1, T] against [B, 1]: position t is valid when t < length.
	    return positions[None, :] < lengths[:, None]


	def collate(tensors: tp.List[torch.Tensor], dim: int = 0) -> tp.Tuple[torch.Tensor, torch.Tensor]:
	    """Get a list of tensors and collate them to a single tensor. according to the following logic:
	    - `dim` specifies the time dimension which will be stacked and padded.
	    - The output will contain 1 new dimension (dimension index 0) which will be the size
	    of the original list.

	    Args:
	        tensors (tp.List[torch.Tensor]): List of tensors to collate.
	        dim (int): Dimension which will be stacked and padded. Negative values
	            count from the last dimension of the input tensors.
	    Returns:
	        tp.Tuple[torch.Tensor, torch.Tensor]:
	            torch.Tensor: Stacked and padded tensor. The output will contain 1 new dimension
	                (dimension index 0) which will be the size of the original list.
	            torch.Tensor: Tensor containing length of original tensor sizes (without padding).
	    """
	    if dim < 0 and tensors:
	        # Make `dim` non-negative: the `dim + 1` shift below, which accounts for
	        # the new batch dimension, is only valid for non-negative indices.
	        dim += tensors[0].dim()
	    # Bring the time dimension first, as expected by pad_sequence.
	    tensors = [x.transpose(0, dim) for x in tensors]
	    lens = torch.LongTensor([len(x) for x in tensors])
	    padded_tensors = pad_sequence(tensors)  # [T, B, ...]
	    padded_tensors = padded_tensors.transpose(0, 1)  # [B, T, ...]
	    # Move the time dimension back to its original place, shifted by the batch dimension.
	    padded_tensors = padded_tensors.transpose(1, dim + 1)
	    return padded_tensors, lens