import os import torch import numpy as np import json import re from torch.autograd import Variable def _truncate_seq_pair(tokens_a, tokens_b, max_length): """Truncates a sequence pair in place to the maximum length.""" # This is a simple heuristic which will always truncate the longer sequence # one token at a time. This makes more sense than truncating an equal percent # of tokens from each, since if one sequence is very short then each token # that's truncated likely contains more information than a longer sequence. while True: total_length = len(tokens_a) + len(tokens_b) if total_length <= max_length: break if len(tokens_a) > len(tokens_b): tokens_a.pop() else: tokens_b.pop() def tok2int_sent(sentence, tokenizer, max_seq_length): """Loads a data file into a list of `InputBatch`s.""" sent_a, sent_b = sentence tokens_a = tokenizer.tokenize(sent_a) tokens_b = None if sent_b: tokens_b = tokenizer.tokenize(sent_b) _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: tokens_a = tokens_a[:(max_seq_length - 2)] tokens = ["[CLS]"] + tokens_a + ["[SEP]"] segment_ids = [0] * len(tokens) if tokens_b: tokens = tokens + tokens_b + ["[SEP]"] segment_ids += [1] * (len(tokens_b) + 1) input_ids = tokenizer.convert_tokens_to_ids(tokens) input_mask = [1] * len(input_ids) padding = [0] * (max_seq_length - len(input_ids)) input_ids += padding input_mask += padding segment_ids += padding assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length return input_ids, input_mask, segment_ids def tok2int_list(src_list, tokenizer, max_seq_length, max_seq_size=-1): inp_padding = list() msk_padding = list() seg_padding = list() for step, sent in enumerate(src_list): input_ids, input_mask, input_seg = tok2int_sent(sent, tokenizer, max_seq_length) inp_padding.append(input_ids) msk_padding.append(input_mask) seg_padding.append(input_seg) # if max_seq_size != -1: # inp_padding = inp_padding[:max_seq_size] # msk_padding = msk_padding[:max_seq_size] # seg_padding = seg_padding[:max_seq_size] # inp_padding += ([[0] * max_seq_length] * (max_seq_size - len(inp_padding))) # msk_padding += ([[0] * max_seq_length] * (max_seq_size - len(msk_padding))) # seg_padding += ([[0] * max_seq_length] * (max_seq_size - len(seg_padding))) return inp_padding, msk_padding, seg_padding class DataLoader(object): ''' For data iteration ''' def __init__(self, data_path, tokenizer, args, test=False, cuda=True, batch_size=64): self.cuda = cuda self.batch_size = batch_size self.tokenizer = tokenizer self.max_len = args.max_len self.evi_num = args.evi_num self.threshold = args.threshold self.data_path = data_path self.test = test examples = self.read_file(data_path) self.examples = examples self.total_num = len(examples) if self.test: self.total_num = 100000 self.total_step = np.ceil(self.total_num * 1.0 / batch_size) self.shuffle() else: self.total_step = self.total_num / batch_size self.shuffle() self.step = 0 def process_sent(self, sentence): sentence = re.sub(" \-LSB\-.*?\-RSB\-", "", sentence) sentence = re.sub("\-LRB\- \-RRB\- ", "", sentence) sentence = re.sub(" -LRB-", " ( ", sentence) sentence = re.sub("-RRB-", " )", sentence) sentence = re.sub("--", "-", sentence) sentence = re.sub("``", '"', sentence) sentence = re.sub("''", '"', sentence) return sentence def process_wiki_title(self, title): title = re.sub("_", " ", title) title = re.sub(" -LRB-", " ( ", title) title = re.sub("-RRB-", " )", title) title = re.sub("-COLON-", ":", title) return title def read_file(self, data_path): examples = list() with open(data_path) as fin: for step, line in enumerate(fin): sublines = line.strip().split("\t") examples.append( [self.process_sent(sublines[0]), self.process_sent(sublines[2]), self.process_sent(sublines[4])]) return examples def shuffle(self): np.random.shuffle(self.examples) def __iter__(self): return self def __next__(self): return self.next() def __len__(self): return self._n_batch def next(self): ''' Get the next batch ''' if self.step < self.total_step: examples = self.examples[self.step * self.batch_size: (self.step + 1) * self.batch_size] pos_inputs = list() neg_inputs = list() for example in examples: pos_inputs.append([example[0], example[1]]) neg_inputs.append([example[0], example[2]]) inp_pos, msk_pos, seg_pos = tok2int_list(pos_inputs, self.tokenizer, self.max_len) inp_neg, msk_neg, seg_neg = tok2int_list(neg_inputs, self.tokenizer, self.max_len) inp_tensor_pos = Variable( torch.LongTensor(inp_pos)) msk_tensor_pos = Variable( torch.LongTensor(msk_pos)) seg_tensor_pos = Variable( torch.LongTensor(seg_pos)) inp_tensor_neg = Variable( torch.LongTensor(inp_neg)) msk_tensor_neg = Variable( torch.LongTensor(msk_neg)) seg_tensor_neg = Variable( torch.LongTensor(seg_neg)) if self.cuda: inp_tensor_pos = inp_tensor_pos.cuda() msk_tensor_pos = msk_tensor_pos.cuda() seg_tensor_pos = seg_tensor_pos.cuda() inp_tensor_neg = inp_tensor_neg.cuda() msk_tensor_neg = msk_tensor_neg.cuda() seg_tensor_neg = seg_tensor_neg.cuda() self.step += 1 return inp_tensor_pos, msk_tensor_pos, seg_tensor_pos, inp_tensor_neg, msk_tensor_neg, seg_tensor_neg else: self.step = 0 if not self.test: # examples = self.read_file(self.data_path) # self.examples = examples self.shuffle() raise StopIteration() class DataLoaderTest(object): ''' For data iteration ''' def __init__(self, data_path, tokenizer, args, cuda=True, batch_size=64): self.cuda = cuda self.batch_size = batch_size self.tokenizer = tokenizer self.max_len = args.max_len self.evi_num = args.evi_num self.threshold = args.threshold self.data_path = data_path inputs, ids, evi_list = self.read_all(data_path) self.inputs = inputs self.ids = ids self.evi_list = evi_list self.total_num = len(inputs) self.total_step = np.ceil(self.total_num * 1.0 / batch_size) self.step = 0 def process_sent(self, sentence): sentence = re.sub(" \-LSB\-.*?\-RSB\-", "", sentence) sentence = re.sub("\-LRB\- \-RRB\- ", "", sentence) sentence = re.sub(" -LRB-", " ( ", sentence) sentence = re.sub("-RRB-", " )", sentence) sentence = re.sub("--", "-", sentence) sentence = re.sub("``", '"', sentence) sentence = re.sub("''", '"', sentence) return sentence def process_wiki_title(self, title): title = re.sub("_", " ", title) title = re.sub(" -LRB-", " ( ", title) title = re.sub("-RRB-", " )", title) title = re.sub("-COLON-", ":", title) return title def read_all(self, data): if not isinstance(data, list): with open(data) as f: data_ = [json.loads(line) for line in f] else: data_ = data inputs = list() ids = list() evi_list = list() for instance in data_: claim = instance['claim'] id = instance['id'] for evidence in instance['evidence']: ids.append(id) inputs.append([self.process_sent(claim), self.process_sent(evidence[2])]) evi_list.append(evidence) return inputs, ids, evi_list def shuffle(self): np.random.shuffle(self.examples) def __iter__(self): return self def __next__(self): return self.next() def __len__(self): return self._n_batch def next(self): ''' Get the next batch ''' if self.step < self.total_step: inputs = self.inputs[self.step * self.batch_size: (self.step + 1) * self.batch_size] ids = self.ids[self.step * self.batch_size: (self.step + 1) * self.batch_size] evi_list = self.evi_list[self.step * self.batch_size: (self.step + 1) * self.batch_size] inp, msk, seg = tok2int_list(inputs, self.tokenizer, self.max_len, -1) inp_tensor_input = Variable( torch.LongTensor(inp)) msk_tensor_input = Variable( torch.LongTensor(msk)) seg_tensor_input = Variable( torch.LongTensor(seg)) if self.cuda: inp_tensor_input = inp_tensor_input.cuda() msk_tensor_input = msk_tensor_input.cuda() seg_tensor_input = seg_tensor_input.cuda() self.step += 1 return inp_tensor_input, msk_tensor_input, seg_tensor_input, ids, evi_list else: self.step = 0 raise StopIteration()