import os
import sys
import time
import random
from itertools import chain
from collections import Counter
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers.data.data_collator import DataCollator
from multiprocessing import Pool
import mmap
from torch.utils.data import Dataset
class IUPACDataset(Dataset):
    """Dataset of IUPAC names read lazily from a "|"-delimited text file.

    The first line of the file is a header naming the columns; every
    following line is one record.  The file is memory-mapped and only the
    byte offset of each line start is kept in memory, so records are decoded
    on demand in ``__getitem__``.
    """

    # value substituted for an empty "Log P" field (average of training data)
    DEFAULT_LOG_P = 3.16

    # NOTE(review): the end of this signature was not visible in the reviewed
    # source.  ``dataset_size`` and ``iupac_name_col`` are both used by the
    # body; the default of ``iupac_name_col`` ("iupac") is reconstructed and
    # must be confirmed against the data file header / existing callers.
    def __init__(self, dataset_dir='./', dataset_filename="iupacs_logp.txt",
                 tokenizer=None, max_length=None, target_col=None,
                 dataset_size=None, iupac_name_col="iupac"):
        """Index the line starts of the data file and locate the columns.

        Args:
            dataset_dir: directory containing the data file.
            dataset_filename: name of the "|"-delimited data file.
            tokenizer: callable returning a mapping with an "input_ids"
                entry; must also provide ``unk_token`` and
                ``_convert_token_to_id``.
            max_length: if not None, ``input_ids`` and ``attention_mask``
                are truncated to this many elements.
            target_col: header name of the target column, or None.
            dataset_size: if not None, expose only the first
                ``dataset_size`` records.
            iupac_name_col: header name of the column holding IUPAC names.

        Raises:
            ValueError: if ``dataset_size`` exceeds the number of records.
            RuntimeError: if a requested column is missing from the header.
        """
        super().__init__()
        self.dataset_dir = dataset_dir
        self.tokenizer = tokenizer
        self.target_col = target_col
        self.max_length = max_length
        self.dataset_size = dataset_size
        self.dataset_filename = dataset_filename

        # where the data is
        self.dataset_fn = os.path.join(self.dataset_dir,
                                       self.dataset_filename)

        # a bit of an odd way to read in a data file, but it lets
        # us keep the data in csv format, and it's pretty fast
        # (30s for 17G on my machine).
        # we need to use mmap for data-parallel training with
        # multiple processes so that the processes don't each keep
        # a local copy of the dataset in host memory
        line_offsets = []
        # each element of data_mm is a byte of the dataset file
        self.data_mm = np.memmap(self.dataset_fn, dtype=np.uint8, mode="r")

        # process chunksize bytes at a time
        chunksize = int(1e9)
        n_offsets = 0
        stopped_early = False
        for i in range(0, len(self.data_mm), chunksize):
            chunk = self.data_mm[i:i + chunksize]
            # the index of each newline is the byte before
            # the beginning of the next line
            newlines = np.nonzero(chunk == 0x0a)[0]
            line_offsets.append(i + newlines + 1)
            n_offsets += newlines.size
            # dataset_size records need dataset_size + 1 offsets (the first
            # offset marks the end of the header), so compare the number of
            # offsets -- not the byte position -- against dataset_size
            if self.dataset_size is not None and n_offsets > self.dataset_size:
                # don't need to keep loading data
                stopped_early = True
                break

        if not stopped_early and self.data_mm[-1] != 0x0a:
            # the last line has no terminating newline; treat end-of-file as
            # its end so the final record is not silently dropped
            line_offsets.append(np.array([len(self.data_mm)]))

        # line_offsets indicates the beginning of each line in
        # self.dataset_fn (and, for the last entry, the end of the data)
        self.line_offsets = np.hstack(line_offsets)

        # the first offset ends the header, so records = offsets - 1
        n_items = self.line_offsets.shape[0] - 1
        if self.dataset_size is not None:
            if self.dataset_size > n_items:
                msg = ("specified dataset_size {}, but the dataset only "
                       "has {} items")
                raise ValueError(msg.format(self.dataset_size, n_items))
            # offsets past the requested records are never read
            self.line_offsets = self.line_offsets[:self.dataset_size + 1]

        # extract headers
        header_line = bytes(self.data_mm[0:self.line_offsets[0]])
        headers = header_line.decode("utf8").strip().split("|")

        # figure out which column IDs are of interest
        try:
            self.name_col_id = headers.index(iupac_name_col)
        except ValueError as e:
            raise RuntimeError("Expecting a column called '{}' "
                               "that contains IUPAC names"
                               .format(iupac_name_col)) from e

        self.target_col_id = None
        if self.target_col is not None:
            try:
                self.target_col_id = headers.index(self.target_col)
            except ValueError as e:
                raise RuntimeError("User supplied target col {} but column "
                                   "is not present in data file"
                                   .format(self.target_col)) from e

    def __getitem__(self, idx):
        """Return the model inputs for record ``idx``.

        The result is a dict with keys "input_ids" and "attention_mask",
        plus "labels" when a target column was configured.  Negative indices
        count from the end of the dataset.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_items = len(self)
        pos = idx + n_items if idx < 0 else idx
        if not 0 <= pos < n_items:
            msg = "provided index {} is out of range for dataset size {}"
            raise IndexError(msg.format(idx, n_items))

        # record ``pos`` lives between consecutive line offsets; offset 0 is
        # the end of the header line
        start = self.line_offsets[pos]
        end = self.line_offsets[pos + 1]
        line = bytes(self.data_mm[start:end])
        line = line.decode("utf8").strip().split("|")
        name = line[self.name_col_id]

        # get the target value, if needed
        target = None
        if self.target_col_id is not None:
            target = line[self.target_col_id]
            if self.target_col == "Log P" and len(target) == 0:
                target = self.DEFAULT_LOG_P
            target = float(target)

        # per the original author's note, the tokenizer already appends its
        # eos token id to the returned ids
        tokenized = self.tokenizer(name)
        input_ids = torch.as_tensor(tokenized["input_ids"])
        # the unk token id is prepended to every sequence
        iupac_unk = torch.tensor(
            [self.tokenizer._convert_token_to_id(self.tokenizer.unk_token)])
        input_ids = torch.cat([iupac_unk, input_ids])
        attention_mask = torch.ones(input_ids.numel(), dtype=torch.long)

        return_dict = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        if target is not None:
            # float64 scalar tensor, same as converting via a numpy scalar
            return_dict["labels"] = torch.tensor(target, dtype=torch.float64)

        if self.max_length is not None:
            return_dict["input_ids"] = \
                return_dict["input_ids"][:self.max_length]
            return_dict["attention_mask"] = \
                return_dict["attention_mask"][:self.max_length]

        return return_dict

    def __len__(self):
        """Return ``dataset_size`` if given, else the number of records."""
        if self.dataset_size is None:
            return len(self.line_offsets) - 1
        return self.dataset_size