S5 / Essay_classifier /s5 /dataloaders /lra.py

Upload 693 files

2ce7b1a about 2 years ago

27.3 kB

	"""Long Range Arena datasets"""
	import io
	import logging
	import os
	import pickle
	from pathlib import Path
	import torch
	from torch import nn
	import torch.nn.functional as F
	import torchtext
	import torchvision
	from einops.layers.torch import Rearrange, Reduce
	from PIL import Image # Only used for Pathfinder
	from datasets import DatasetDict, Value, load_dataset, load_from_disk

	from .base import default_data_path, SequenceDataset, ImageResolutionSequenceDataset


	class IMDB(SequenceDataset):
	_name_ = "imdb"
	d_output = 2
	l_output = 0

	@property
	def init_defaults(self):
	return {
	"l_max": 4096,
	"level": "char",
	"min_freq": 15,
	"seed": 42,
	"val_split": 0.0,
	"append_bos": False,
	"append_eos": True,
	# 'max_vocab': 135,
	"n_workers": 4, # Only used for tokenizing dataset before caching
	}

	@property
	def n_tokens(self):
	return len(self.vocab)

	def prepare_data(self):
	if self.cache_dir is None: # Just download the dataset
	load_dataset(self._name_, cache_dir=self.data_dir)
	else: # Process the dataset and save it
	self.process_dataset()

	def setup(self, stage=None):
	"""If cache_dir is not None, we'll cache the processed dataset there."""

	# # NOTE - AW - we manually set these elsewhere.
	# self.data_dir = self.data_dir or default_data_path / self._name_
	# self.cache_dir = self.data_dir / "cache"

	assert self.level in [
	"word",
	"char",
	], f"level {self.level} not supported"

	if stage == "test" and hasattr(self, "dataset_test"):
	return
	dataset, self.tokenizer, self.vocab = self.process_dataset()
	print(
	f"IMDB {self.level} level \| min_freq {self.min_freq} \| vocab size {len(self.vocab)}"
	)
	dataset.set_format(type="torch", columns=["input_ids", "label"])

	# Create all splits
	dataset_train, self.dataset_test = dataset["train"], dataset["test"]
	if self.val_split == 0.0:
	# Use test set as val set, as done in the LRA paper
	self.dataset_train, self.dataset_val = dataset_train, None
	else:
	train_val = dataset_train.train_test_split(
	test_size=self.val_split, seed=self.seed
	)
	self.dataset_train, self.dataset_val = (
	train_val["train"],
	train_val["test"],
	)

	def _collate_fn(self, batch):
	xs, ys = zip(*[(data["input_ids"], data["label"]) for data in batch])
	lengths = torch.tensor([len(x) for x in xs])
	xs = nn.utils.rnn.pad_sequence(
	xs, padding_value=self.vocab["<pad>"], batch_first=True
	)
	ys = torch.tensor(ys)
	return xs, ys, {"lengths": lengths}

	# self._collate_fn = collate_batch

	def process_dataset(self):
	cache_dir = (
	None if self.cache_dir is None else self.cache_dir / self._cache_dir_name
	)
	if cache_dir is not None:
	if cache_dir.is_dir():
	return self._load_from_cache(cache_dir)

	print(f"self._name_: {self._name_}")
	print(f"self.data_dir: {self.data_dir}")
	# dataset = load_dataset(self._name_, cache_dir=self.data_dir)
	dataset = load_from_disk('essays')
	print(type(dataset))
	dataset = DatasetDict(train=dataset["train"], test=dataset["test"])
	if self.level == "word":
	tokenizer = torchtext.data.utils.get_tokenizer(
	"spacy", language="en_core_web_sm"
	)
	else: # self.level == 'char'
	tokenizer = list # Just convert a string to a list of chars
	# Account for <bos> and <eos> tokens
	l_max = self.l_max - int(self.append_bos) - int(self.append_eos)
	tokenize = lambda example: {"tokens": tokenizer(example["text"])[:l_max]}
	dataset = dataset.map(
	tokenize,
	remove_columns=["text"],
	keep_in_memory=True,
	load_from_cache_file=False,
	num_proc=max(self.n_workers, 1),
	)
	vocab = torchtext.vocab.build_vocab_from_iterator(
	dataset["train"]["tokens"],
	min_freq=self.min_freq,
	specials=(
	["<pad>", "<unk>"]
	+ (["<bos>"] if self.append_bos else [])
	+ (["<eos>"] if self.append_eos else [])
	),
	)
	vocab.set_default_index(vocab["<unk>"])

	numericalize = lambda example: {
	"input_ids": vocab(
	(["<bos>"] if self.append_bos else [])
	+ example["tokens"]
	+ (["<eos>"] if self.append_eos else [])
	)
	}
	dataset = dataset.map(
	numericalize,
	remove_columns=["tokens"],
	keep_in_memory=True,
	load_from_cache_file=False,
	num_proc=max(self.n_workers, 1),
	)

	if cache_dir is not None:
	self._save_to_cache(dataset, tokenizer, vocab, cache_dir)
	return dataset, tokenizer, vocab

	def _save_to_cache(self, dataset, tokenizer, vocab, cache_dir):
	cache_dir = self.cache_dir / self._cache_dir_name
	logger = logging.getLogger(__name__)
	logger.info(f"Saving to cache at {str(cache_dir)}")
	dataset.save_to_disk(str(cache_dir))
	with open(cache_dir / "tokenizer.pkl", "wb") as f:
	pickle.dump(tokenizer, f)
	with open(cache_dir / "vocab.pkl", "wb") as f:
	pickle.dump(vocab, f)

	def _load_from_cache(self, cache_dir):
	assert cache_dir.is_dir()
	logger = logging.getLogger(__name__)
	logger.info(f"Load from cache at {str(cache_dir)}")
	dataset = DatasetDict.load_from_disk(str(cache_dir))
	with open(cache_dir / "tokenizer.pkl", "rb") as f:
	tokenizer = pickle.load(f)
	with open(cache_dir / "vocab.pkl", "rb") as f:
	vocab = pickle.load(f)
	return dataset, tokenizer, vocab

	@property
	def _cache_dir_name(self):
	return f"l_max-{self.l_max}-level-{self.level}-min_freq-{self.min_freq}-append_bos-{self.append_bos}-append_eos-{self.append_eos}"

	class TabularDataset(torch.utils.data.Dataset):
	def __init__(
	self,
	path,
	format,
	col_idx=None,
	skip_header=False,
	csv_reader_params=None,
	):
	"""
	col_idx: the indices of the columns.
	"""
	if csv_reader_params is None:
	csv_reader_params = {}
	format = format.lower()
	assert format in ["tsv", "csv"]
	with io.open(os.path.expanduser(path), encoding="utf8") as f:
	if format == "csv":
	reader = torchtext.utils.unicode_csv_reader(f, **csv_reader_params)
	elif format == "tsv":
	reader = torchtext.utils.unicode_csv_reader(
	f, delimiter="\t", **csv_reader_params
	)
	else:
	reader = f
	if skip_header:
	next(reader)
	self._data = [
	line if col_idx is None else [line[c] for c in col_idx]
	for line in reader
	]

	def __len__(self):
	return len(self._data)

	def __getitem__(self, idx):
	return self._data[idx]


	# LRA tokenizer renames ']' to 'X' and delete parentheses as their tokenizer removes
	# non-alphanumeric characters.
	# https://github.com/google-research/long-range-arena/blob/264227cbf9591e39dd596d2dc935297a2070bdfe/lra_benchmarks/listops/input_pipeline.py#L46
	def listops_tokenizer(s):
	return s.translate({ord("]"): ord("X"), ord("("): None, ord(")"): None}).split()


	class ListOps(SequenceDataset):
	_name_ = "listops"
	d_output = 10
	l_output = 0

	@property
	def init_defaults(self):
	return {
	"l_max": 2048,
	"append_bos": False,
	"append_eos": True,
	# 'max_vocab': 20, # Actual size 18
	"n_workers": 4, # Only used for tokenizing dataset
	}

	@property
	def n_tokens(self):
	return len(self.vocab)

	@property
	def _cache_dir_name(self):
	return f"l_max-{self.l_max}-append_bos-{self.append_bos}-append_eos-{self.append_eos}"

	def init(self):
	if self.data_dir is None:
	self.data_dir = default_data_path / self._name_
	self.cache_dir = self.data_dir / self._cache_dir_name

	def prepare_data(self):
	if self.cache_dir is None:
	for split in ["train", "val", "test"]:
	split_path = self.data_dir / f"basic_{split}.tsv"
	if not split_path.is_file():
	raise FileNotFoundError(
	f"""
	File {str(split_path)} not found.
	To get the dataset, download lra_release.gz from
	https://github.com/google-research/long-range-arena,
	then unzip it with tar -xvf lra_release.gz.
	Then point data_dir to the listops-1000 directory.
	"""
	)
	else: # Process the dataset and save it
	self.process_dataset()

	def setup(self, stage=None):
	if stage == "test" and hasattr(self, "dataset_test"):
	return
	dataset, self.tokenizer, self.vocab = self.process_dataset()
	self.vocab_size = len(self.vocab)
	dataset.set_format(type="torch", columns=["input_ids", "Target"])
	self.dataset_train, self.dataset_val, self.dataset_test = (
	dataset["train"],
	dataset["val"],
	dataset["test"],
	)

	def collate_batch(batch):
	xs, ys = zip(*[(data["input_ids"], data["Target"]) for data in batch])
	lengths = torch.tensor([len(x) for x in xs])
	xs = nn.utils.rnn.pad_sequence(
	xs, padding_value=self.vocab["<pad>"], batch_first=True
	)
	ys = torch.tensor(ys)
	return xs, ys, {"lengths": lengths}

	self._collate_fn = collate_batch

	def process_dataset(self):
	cache_dir = (
	None if self.cache_dir is None else self.cache_dir / self._cache_dir_name
	)
	if cache_dir is not None:
	if cache_dir.is_dir():
	return self._load_from_cache(cache_dir)

	dataset = load_dataset(
	"csv",
	data_files={
	"train": str(self.data_dir / "basic_train.tsv"),
	"val": str(self.data_dir / "basic_val.tsv"),
	"test": str(self.data_dir / "basic_test.tsv"),
	},
	delimiter="\t",
	keep_in_memory=True,
	)

	tokenizer = listops_tokenizer

	# Account for <bos> and <eos> tokens
	l_max = self.l_max - int(self.append_bos) - int(self.append_eos)
	tokenize = lambda example: {"tokens": tokenizer(example["Source"])[:l_max]}
	dataset = dataset.map(
	tokenize,
	remove_columns=["Source"],
	keep_in_memory=True,
	load_from_cache_file=False,
	num_proc=max(self.n_workers, 1),
	)
	vocab = torchtext.vocab.build_vocab_from_iterator(
	dataset["train"]["tokens"],
	specials=(
	["<pad>", "<unk>"]
	+ (["<bos>"] if self.append_bos else [])
	+ (["<eos>"] if self.append_eos else [])
	),
	)
	vocab.set_default_index(vocab["<unk>"])

	numericalize = lambda example: {
	"input_ids": vocab(
	(["<bos>"] if self.append_bos else [])
	+ example["tokens"]
	+ (["<eos>"] if self.append_eos else [])
	)
	}
	dataset = dataset.map(
	numericalize,
	remove_columns=["tokens"],
	keep_in_memory=True,
	load_from_cache_file=False,
	num_proc=max(self.n_workers, 1),
	)

	if cache_dir is not None:
	self._save_to_cache(dataset, tokenizer, vocab, cache_dir)
	return dataset, tokenizer, vocab

	def _save_to_cache(self, dataset, tokenizer, vocab, cache_dir):
	cache_dir = self.cache_dir / self._cache_dir_name
	logger = logging.getLogger(__name__)
	logger.info(f"Saving to cache at {str(cache_dir)}")
	dataset.save_to_disk(str(cache_dir))
	with open(cache_dir / "tokenizer.pkl", "wb") as f:
	pickle.dump(tokenizer, f)
	with open(cache_dir / "vocab.pkl", "wb") as f:
	pickle.dump(vocab, f)

	def _load_from_cache(self, cache_dir):
	assert cache_dir.is_dir()
	logger = logging.getLogger(__name__)
	logger.info(f"Load from cache at {str(cache_dir)}")
	dataset = DatasetDict.load_from_disk(str(cache_dir))
	with open(cache_dir / "tokenizer.pkl", "rb") as f:
	tokenizer = pickle.load(f)
	with open(cache_dir / "vocab.pkl", "rb") as f:
	vocab = pickle.load(f)
	return dataset, tokenizer, vocab

	class PathFinderDataset(torch.utils.data.Dataset):
	"""Path Finder dataset."""

	# There's an empty file in the dataset
	blacklist = {"pathfinder32/curv_baseline/imgs/0/sample_172.png"}

	def __init__(self, data_dir, transform=None):
	"""
	Args:
	data_dir (string): Directory with all the images.
	transform (callable, optional): Optional transform to be applied
	on a sample.
	"""
	self.data_dir = Path(data_dir).expanduser()
	assert self.data_dir.is_dir(), f"data_dir {str(self.data_dir)} does not exist"
	self.transform = transform
	samples = []
	# for diff_level in ['curv_baseline', 'curv_contour_length_9', 'curv_contour_length_14']:
	for diff_level in ["curv_contour_length_14"]:
	path_list = sorted(
	list((self.data_dir / diff_level / "metadata").glob("*.npy")),
	key=lambda path: int(path.stem),
	)
	assert path_list, "No metadata found"
	for metadata_file in path_list:
	with open(metadata_file, "r") as f:
	for metadata in f.read().splitlines():
	metadata = metadata.split()
	image_path = Path(diff_level) / metadata[0] / metadata[1]
	if (
	str(Path(self.data_dir.stem) / image_path)
	not in self.blacklist
	):
	label = int(metadata[3])
	samples.append((image_path, label))
	self.samples = samples

	def __len__(self):
	return len(self.samples)

	def __getitem__(self, idx):
	path, target = self.samples[idx]
	# https://github.com/pytorch/vision/blob/9b29f3f22783112406d9c1a6db47165a297c3942/torchvision/datasets/folder.py#L247
	with open(self.data_dir / path, "rb") as f:
	sample = Image.open(f).convert("L") # Open in grayscale
	if self.transform is not None:
	sample = self.transform(sample)
	return sample, target


	class PathFinder(ImageResolutionSequenceDataset):
	_name_ = "pathfinder"
	d_input = 1
	d_output = 2
	l_output = 0

	@property
	def n_tokens(self):
	if self.tokenize:
	return 256

	@property
	def init_defaults(self):
	return {
	"resolution": 32,
	"sequential": True,
	"tokenize": False,
	"pool": 1,
	"val_split": 0.1,
	"test_split": 0.1,
	"seed": 42, # Controls the train/val/test split
	}

	def default_transforms(self):
	transform_list = [torchvision.transforms.ToTensor()]
	if self.pool > 1:
	transform_list.append(
	Reduce(
	"1 (h h2) (w w2) -> 1 h w",
	"mean",
	h2=self.pool,
	w2=self.pool,
	)
	)
	if self.tokenize:
	transform_list.append(
	torchvision.transforms.Lambda(lambda x: (x * 255).long())
	)
	else:
	transform_list.append(torchvision.transforms.Normalize(mean=0.5, std=0.5))
	if self.sequential:
	# If tokenize, it makes more sense to get rid of the channel dimension
	transform_list.append(
	Rearrange("1 h w -> (h w)")
	if self.tokenize
	else Rearrange("1 h w -> (h w) 1")
	)
	else:
	transform_list.append(Rearrange("1 h w -> h w 1"))
	return torchvision.transforms.Compose(transform_list)

	def prepare_data(self):
	if not self.data_dir.is_dir():
	raise FileNotFoundError(
	f"""
	Directory {str(self.data_dir)} not found.
	To get the dataset, download lra_release.gz from
	https://github.com/google-research/long-range-arena,
	then unzip it with tar -xvf lra_release.gz.
	Then point data_dir to the pathfinderX directory, where X is either 32, 64, 128, or 256.
	"""
	)

	def setup(self, stage=None):
	if self.data_dir is None:
	self.data_dir = (
	default_data_path / self._name_ / f"pathfinder{self.resolution}"
	)

	if self.cache_dir is not None:
	if Path(self.cache_dir / (self._cache_dir_name + '.pt')).exists():
	with open(self.cache_dir / (self._cache_dir_name + '.pt'), 'rb') as f:
	dset = torch.load(f)
	self.dataset_train = dset['train']
	self.dataset_val = dset['val']
	self.dataset_test = dset['test']
	return None

	if stage == "test" and hasattr(self, "dataset_test"):
	return
	# [2021-08-18] TD: I ran into RuntimeError: Too many open files.
	# https://github.com/pytorch/pytorch/issues/11201
	torch.multiprocessing.set_sharing_strategy("file_system")
	dataset = PathFinderDataset(self.data_dir, transform=self.default_transforms())
	len_dataset = len(dataset)
	val_len = int(self.val_split * len_dataset)
	test_len = int(self.test_split * len_dataset)
	train_len = len_dataset - val_len - test_len
	(
	self.dataset_train,
	self.dataset_val,
	self.dataset_test,
	) = torch.utils.data.random_split(
	dataset,
	[train_len, val_len, test_len],
	generator=torch.Generator().manual_seed(self.seed),
	)

	# AW - Now we need to iterate over each of these datasets and store them in a proper cache.
	def _compile_convert(dset, tag):
	"""

	:param dset:
	:param tag:
	:return:
	"""
	loader = torch.utils.data.DataLoader(dataset=dset, batch_size=len(dset), shuffle=False, drop_last=False)
	inp, out = next(iter(loader))
	dset_compiled = torch.utils.data.TensorDataset(inp, out)
	return dset_compiled

	os.makedirs(self.cache_dir, exist_ok=True)
	self.dataset_train = _compile_convert(self.dataset_train, tag='train')
	self.dataset_val = _compile_convert(self.dataset_val, tag='val')
	self.dataset_test = _compile_convert(self.dataset_test, tag='test')

	# Cache.
	cache_path = self.cache_dir / (self._cache_dir_name + '.pt')
	logger = logging.getLogger(__name__)
	logger.info(f"Saving to cache at {str(cache_path)}")
	with open(cache_path, 'wb') as f:
	torch.save({'train': self.dataset_train,
	'val': self.dataset_val,
	'test': self.dataset_test},
	f)

	@property
	def _cache_dir_name(self):
	return f"pathfinder-resolution-{self.resolution}"


	class AAN(SequenceDataset):
	_name_ = "aan"
	d_output = 2 # Use accuracy instead of binary_accuracy
	l_output = 0

	@property
	def n_tokens(self):
	return len(self.vocab)

	@property
	def init_defaults(self):
	return {
	"l_max": 4000,
	# 'max_vocab': 100, # Full size 98
	"append_bos": False,
	"append_eos": True,
	"n_workers": 4, # For tokenizing only
	}

	@property
	def _cache_dir_name(self):
	return f"l_max-{self.l_max}-append_bos-{self.append_bos}-append_eos-{self.append_eos}"

	def init(self):
	if self.data_dir is None:
	self.data_dir = default_data_path / self._name_
	self.cache_dir = self.data_dir / self._cache_dir_name

	def prepare_data(self):
	if self.cache_dir is None:
	for split in ["train", "eval", "test"]:
	split_path = self.data_dir / f"new_aan_pairs.{split}.tsv"
	if not split_path.is_file():
	raise FileNotFoundError(
	f"""
	File {str(split_path)} not found.
	To get the dataset, download lra_release.gz from
	https://github.com/google-research/long-range-arena,
	then unzip it with tar -xvf lra_release.gz.
	Then point data_dir to the tsv_data directory.
	"""
	)
	else: # Process the dataset and save it
	self.process_dataset()

	def setup(self, stage=None):
	if stage == "test" and hasattr(self, "dataset_test"):
	return

	# [2021-08-18] TD: I ran into RuntimeError: Too many open files.
	# https://github.com/pytorch/pytorch/issues/11201
	torch.multiprocessing.set_sharing_strategy("file_system")

	dataset, self.tokenizer, self.vocab = self.process_dataset()
	# self.vocab_size = len(self.vocab)
	print("AAN vocab size:", len(self.vocab))

	dataset.set_format(type="torch", columns=["input_ids1", "input_ids2", "label"])
	self.dataset_train, self.dataset_val, self.dataset_test = (
	dataset["train"],
	dataset["val"],
	dataset["test"],
	)

	def collate_batch(batch):
	xs1, xs2, ys = zip(
	*[
	(data["input_ids1"], data["input_ids2"], data["label"])
	for data in batch
	]
	)
	lengths1 = torch.tensor([len(x) for x in xs1])
	lengths2 = torch.tensor([len(x) for x in xs2])
	xs1 = nn.utils.rnn.pad_sequence(
	xs1, padding_value=self.vocab["<pad>"], batch_first=True
	)
	xs2 = nn.utils.rnn.pad_sequence(
	xs2, padding_value=self.vocab["<pad>"], batch_first=True
	)
	# Pad both to same length
	# Shape (batch, length)
	L = max(xs1.size(1), xs2.size(1))
	xs1 = F.pad(xs1, (0, L-xs1.size(1)), value=self.vocab["<pad>"])
	xs2 = F.pad(xs2, (0, L-xs2.size(1)), value=self.vocab["<pad>"])
	ys = torch.tensor(ys)
	# return xs1, xs2, ys, lengths1, lengths2

	# Concatenate two batches
	xs = torch.cat([xs1, xs2], dim=0)
	lengths = torch.cat([lengths1, lengths2], dim=0)
	return xs, ys, {"lengths": lengths}

	self._collate_fn = collate_batch

	def process_dataset(self):
	cache_dir = (
	None if self.cache_dir is None else self.cache_dir / self._cache_dir_name
	)
	if cache_dir is not None:
	if cache_dir.is_dir():
	return self._load_from_cache(cache_dir)

	dataset = load_dataset(
	"csv",
	data_files={
	"train": str(self.data_dir / "new_aan_pairs.train.tsv"),
	"val": str(self.data_dir / "new_aan_pairs.eval.tsv"),
	"test": str(self.data_dir / "new_aan_pairs.test.tsv"),
	},
	delimiter="\t",
	column_names=["label", "input1_id", "input2_id", "text1", "text2"],
	keep_in_memory=True,
	) # True)
	dataset = dataset.remove_columns(["input1_id", "input2_id"])
	new_features = dataset["train"].features.copy()
	new_features["label"] = Value("int32")
	dataset = dataset.cast(new_features)

	tokenizer = list # Just convert a string to a list of chars
	# Account for <bos> and <eos> tokens
	l_max = self.l_max - int(self.append_bos) - int(self.append_eos)
	tokenize = lambda example: {
	"tokens1": tokenizer(example["text1"])[:l_max],
	"tokens2": tokenizer(example["text2"])[:l_max],
	}
	dataset = dataset.map(
	tokenize,
	remove_columns=["text1", "text2"],
	keep_in_memory=True,
	load_from_cache_file=False,
	num_proc=max(self.n_workers, 1),
	)
	vocab = torchtext.vocab.build_vocab_from_iterator(
	dataset["train"]["tokens1"] + dataset["train"]["tokens2"],
	specials=(
	["<pad>", "<unk>"]
	+ (["<bos>"] if self.append_bos else [])
	+ (["<eos>"] if self.append_eos else [])
	),
	)
	vocab.set_default_index(vocab["<unk>"])

	encode = lambda text: vocab(
	(["<bos>"] if self.append_bos else [])
	+ text
	+ (["<eos>"] if self.append_eos else [])
	)
	numericalize = lambda example: {
	"input_ids1": encode(example["tokens1"]),
	"input_ids2": encode(example["tokens2"]),
	}
	dataset = dataset.map(
	numericalize,
	remove_columns=["tokens1", "tokens2"],
	keep_in_memory=True,
	load_from_cache_file=False,
	num_proc=max(self.n_workers, 1),
	)

	if cache_dir is not None:
	self._save_to_cache(dataset, tokenizer, vocab, cache_dir)
	return dataset, tokenizer, vocab

	def _save_to_cache(self, dataset, tokenizer, vocab, cache_dir):
	cache_dir = self.cache_dir / self._cache_dir_name
	logger = logging.getLogger(__name__)
	logger.info(f"Saving to cache at {str(cache_dir)}")
	dataset.save_to_disk(str(cache_dir))
	with open(cache_dir / "tokenizer.pkl", "wb") as f:
	pickle.dump(tokenizer, f)
	with open(cache_dir / "vocab.pkl", "wb") as f:
	pickle.dump(vocab, f)

	def _load_from_cache(self, cache_dir):
	assert cache_dir.is_dir()
	logger = logging.getLogger(__name__)
	logger.info(f"Load from cache at {str(cache_dir)}")
	dataset = DatasetDict.load_from_disk(str(cache_dir))
	with open(cache_dir / "tokenizer.pkl", "rb") as f:
	tokenizer = pickle.load(f)
	with open(cache_dir / "vocab.pkl", "rb") as f:
	vocab = pickle.load(f)
	return dataset, tokenizer, vocab