|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Loads the WikiQuestions dataset. |
|
|
|
An example consists of a question and a table. Additionally, we store the
processed columns, which hold the entries after performing number, date and
other preprocessing as done in the baseline.
Columns, column names and processed columns are split into word and number
columns.
The lookup answer (or matrix) is also split into number and word lookup
matrices.
|
Author: aneelakantan (Arvind Neelakantan) |
|
""" |
|
from __future__ import print_function |
|
|
|
import math |
|
import os |
|
import re |
|
import numpy as np |
|
import unicodedata as ud |
|
import tensorflow as tf |
|
|
|
# Sentinel float used to pre-fill every cell of a table grid before the parsed
# cell values are written in (see WikiQuestionGenerator.load_annotated_tables).
bad_number = -200000.0
|
|
|
def is_nan_or_inf(number):
    """Return True when `number` is NaN or positive/negative infinity."""
    if math.isnan(number):
        return True
    return math.isinf(number)
|
|
|
def strip_accents(s):
    """Remove combining accent marks from `s`.

    The text is decomposed with Unicode NFKD normalization and every character
    whose Unicode category is "Mn" (nonspacing mark) is dropped.

    Args:
      s: the text to clean, either UTF-8 encoded bytes or already-decoded text.

    Returns:
      The stripped text as the interpreter's native `str` type: UTF-8 encoded
      bytes on Python 2 (unchanged from before), text on Python 3.

    Raises:
      UnicodeDecodeError: if `s` is a byte string that is not valid UTF-8.
    """
    # The `unicode` builtin does not exist on Python 3, so decode explicitly
    # and only when the input really is a byte string.
    text = s.decode("utf-8") if isinstance(s, bytes) else s
    stripped = u"".join(
        c for c in ud.normalize("NFKD", text) if ud.category(c) != "Mn")
    if str is bytes:
        # Python 2: callers expect a UTF-8 encoded byte string back.
        return stripped.encode("utf-8")
    return stripped
|
|
|
|
|
def correct_unicode(string):
    """Strip accents and replace a fixed set of typographic characters.

    After `strip_accents`, the substitutions below are applied in order. The
    final step collapses every whitespace run to a single space and strips the
    ends.

    Args:
      string: the text to clean.

    Returns:
      The cleaned text.
    """
    string = strip_accents(string)

    # (pattern, replacement, strip surrounding whitespace afterwards)
    # NOTE(review): the first two patterns spell out the UTF-8 byte sequences
    # of U+00A0 (no-break space) and U+2013 (en dash); they only match as
    # intended when `string` is a byte string -- confirm for Python 3 text.
    substitutions = (
        ("\xc2\xa0", " ", True),
        ("\xe2\x80\x93", "-", True),
        ("‚", ",", False),
        ("…", "...", False),
        ("ˆ", "^", False),
        ("˜", "~", False),
        ("‹", "<", False),
        ("›", ">", False),
        (r'[\u2E00-\uFFFF]', "", False),
        ("\\s+", " ", True),
    )
    for pattern, replacement, strip_after in substitutions:
        string = re.sub(pattern, replacement, string)
        if strip_after:
            string = string.strip()
    return string
|
|
|
|
|
def simple_normalize(string):
    """Apply `correct_unicode` and remove citation-style decorations.

    Args:
      string: the text to normalize.

    Returns:
      The text with the following removed, in order: bracketed numeric
      markers such as "[3]" or "[nb 2]", trailing asterisks, parenthesised
      digit groups or ranges such as "(1999-2003)", and finally the whole
      string if it starts and ends with a double quote.
    """
    string = correct_unicode(string)

    # Patterns are raw strings: "\[" / "\d" / "\(" / "\*" are invalid escape
    # sequences in ordinary string literals.
    string = re.sub(r"\[(nb ?)?\d+\]", "", string)
    string = re.sub(r"\*+$", "", string)

    string = re.sub(r"\(\d* ?-? ?\d*\)", "", string)
    # NOTE(review): a fully quoted string is replaced by "" (not by its
    # unquoted content, the group is captured but unused) -- confirm this
    # is intended.
    string = re.sub(r'^"(.*)"$', "", string)
    return string
|
|
|
|
|
def full_normalize(string):
    """Aggressively normalize a token or cell for matching.

    Runs `simple_normalize`, removes any remaining bracketed text, characters
    in the range U+007F-U+FFFF and a trailing parenthesised group, applies
    `final_normalize`, then removes question marks and turns colons, slashes,
    backslashes and hyphens into spaces.

    Args:
      string: the text to normalize.

    Returns:
      The normalized text, or "UNK" if nothing is left.
    """
    string = simple_normalize(string)

    # Raw strings keep the regex escapes ("\[", "\(", "\?", "\:") from being
    # parsed as (invalid) string escapes.
    string = re.sub(r"\[[^\]]*\]", "", string)
    string = re.sub(r'[\u007F-\uFFFF]', "", string.strip())
    string = re.sub(r"\([^)]*\)$", "", string.strip())
    string = final_normalize(string)

    string = re.sub(r"\?", "", string).strip()
    string = re.sub(r"\:$", " ", string).strip()

    # Separator characters become spaces. (The original repeated the "/"
    # substitution a second time; it could never match, so it is dropped.)
    string = re.sub(r"/", " ", string).strip()
    string = re.sub(r"\\", " ", string).strip()
    string = re.sub(r"\:", " ", string).strip()
    string = re.sub("-", " ", string).strip()

    if not string:
        string = "UNK"
    return string
|
|
|
def final_normalize(string):
    """Lower-case `string` and delete quote-like punctuation.

    Steps, in order: collapse whitespace runs to one space, lower-case,
    replace each literal backslash-"n" character pair with a space, then
    delete double quotes, single quotes, backticks and asterisks. Surrounding
    whitespace is stripped after every step except the lower-casing.

    Args:
      string: the text to normalize.

    Returns:
      The normalized text (possibly empty).
    """
    # Raw strings throughout: "\*" is an invalid escape sequence in an
    # ordinary string literal.
    string = re.sub(r"\s+", " ", string).strip()
    string = string.lower()
    # Matches the two-character sequence backslash + "n", not a newline.
    string = re.sub(r"\\n", " ", string).strip()
    for pattern in (r"\"", r"\'", r"`", r"\*"):
        string = re.sub(pattern, "", string).strip()
    return string
|
|
|
def is_number(x):
    """Return True if `float(x)` succeeds and the result is finite.

    Values that `float` rejects with ValueError or TypeError (non-numeric
    strings, None, lists, ...) yield False, as do NaN and infinities.
    """
    try:
        value = float(x)
    except (ValueError, TypeError):
        return False
    return not is_nan_or_inf(value)
|
|
|
|
|
class WikiExample(object):
    """A single question together with its answer and classification flags."""

    def __init__(self, id, question, answer, table_key):
        """Store the example fields and reset all classification state.

        Args:
          id: identifier stored as `question_id`.
          question: the question, stored as given.
          answer: the answer, stored as given.
          table_key: key of the table this question refers to.
        """
        self.question_id, self.question = id, question
        self.answer, self.table_key = answer, table_key
        self.lookup_matrix = []
        # Every answer-type flag starts out False.
        for flag in ("is_bad_example", "is_word_lookup",
                     "is_ambiguous_word_lookup", "is_number_lookup",
                     "is_number_calc", "is_unknown_answer"):
            setattr(self, flag, False)
|
|
|
|
|
class TableInfo(object):
    """Plain container for one table, split into word and number columns."""

    def __init__(self, word_columns, word_column_names, word_column_indices,
                 number_columns, number_column_names, number_column_indices,
                 processed_word_columns, processed_number_columns,
                 orig_columns):
        """Store every argument on the instance under the same name."""
        # Complete, unsplit table.
        self.orig_columns = orig_columns

        # Word-valued columns.
        self.word_columns = word_columns
        self.word_column_names = word_column_names
        self.word_column_indices = word_column_indices
        self.processed_word_columns = processed_word_columns

        # Number-valued columns.
        self.number_columns = number_columns
        self.number_column_names = number_column_names
        self.number_column_indices = number_column_indices
        self.processed_number_columns = processed_number_columns
|
|
|
|
|
class WikiQuestionLoader(object):
    """Reads the question ids listed in one examples file."""

    def __init__(self, data_name, root_folder):
        """Remember where the examples file lives.

        Args:
          data_name: file name of the examples file inside `<root>/data`.
          root_folder: root directory of the dataset.
        """
        self.root_folder = root_folder
        self.data_folder = os.path.join(self.root_folder, "data")
        self.examples = []
        self.data_name = data_name

    def num_questions(self):
        """Return how many question ids have been loaded so far."""
        return len(self.examples)

    def load_qa(self):
        """Append the id found on every line of the examples file.

        Each line must contain a "(id ...)" group; the text after "id " is
        appended to `self.examples`.

        Raises:
          AttributeError: if a line contains no "(id ...)" group.
        """
        data_source = os.path.join(self.data_folder, self.data_name)
        id_regex = re.compile(r"\(id ([^\)]*)\)")
        # The context manager closes the file even if a line fails to parse.
        with tf.gfile.GFile(data_source, "r") as f:
            for line in f:
                self.examples.append(id_regex.search(line).group(1))

    def load(self):
        """Load the question ids (see `load_qa`)."""
        self.load_qa()
|
|
|
|
|
def is_date(word):
    """Return True if `word` has the form "YYYY-MM-DD" with optional wildcards.

    The word must be exactly ten characters long with "-" at offsets 4 and 7,
    contain at least one letter or digit, and consist solely of digits, "-",
    "X" and "x".
    """
    if re.search("[a-z0-9]", word, re.IGNORECASE) is None:
        return False
    if len(word) != 10 or word[4] != "-" or word[7] != "-":
        return False
    return all(ch in ("X", "x", "-") or re.search("[0-9]", ch) is not None
               for ch in word)
|
|
|
|
|
class WikiQuestionGenerator(object): |
|
|
|
def __init__(self, train_name, dev_name, test_name, root_folder):
    """Create one loader per split and reset the annotation caches.

    Args:
      train_name: file name of the training examples file.
      dev_name: file name of the development examples file.
      test_name: file name of the test examples file.
      root_folder: root directory of the dataset.
    """
    self.root_folder = root_folder
    self.data_folder = os.path.join(self.root_folder, "annotated/data")

    self.train_name = train_name
    self.dev_name = dev_name
    self.test_name = test_name

    self.train_loader = WikiQuestionLoader(train_name, root_folder)
    self.dev_loader = WikiQuestionLoader(dev_name, root_folder)
    self.test_loader = WikiQuestionLoader(test_name, root_folder)

    self.bad_examples = 0
    self.annotated_examples = {}
    self.annotated_tables = {}
    # Tokens that are dropped when a sentence is pre-processed.
    self.annotated_word_reject = {"-lrb-": 1, "-rrb-": 1, "UNK": 1}
|
|
|
def is_money(self, word):
    """Return True if `word` is made only of digits, "." and "E".

    The word must also contain at least one letter or digit, so the empty
    string and strings of dots alone are rejected.
    """
    if re.search("[a-z0-9]", word, re.IGNORECASE) is None:
        return False
    return all(ch in ("E", ".") or re.search("[0-9]", ch) is not None
               for ch in word)
|
|
|
def remove_consecutive(self, ner_tags, ner_values):
    """Overwrite the value of a token whose successor repeats its tag/value.

    For every position tagged NUMBER, MONEY, PERCENT or DATE whose next
    position carries the same tag and the same non-empty value, the value at
    that position is replaced in place: by "A" when the symbol-stripped value
    contains an upper-case letter and is neither a date nor money, otherwise
    by ",". The last token of such a run keeps its value.

    Args:
      ner_tags: list of NER tags, one per token.
      ner_values: list of NER values, one per token; modified in place.

    Returns:
      The tuple `(ner_tags, ner_values)`.
    """
    numeric_tags = ("NUMBER", "MONEY", "PERCENT", "DATE")
    symbols = (">", "<", "=", "%", "~", "$", "£", "€")

    # The last position has no successor, so it can never be rewritten.
    for pos in range(len(ner_tags) - 1):
        tag = ner_tags[pos]
        if tag not in numeric_tags:
            continue
        if tag != ner_tags[pos + 1]:
            continue
        if ner_values[pos] != ner_values[pos + 1] or ner_values[pos] == "":
            continue

        cleaned = ner_values[pos]
        for symbol in symbols:
            cleaned = cleaned.replace(symbol, "")

        looks_like_text = (re.search("[A-Z]", cleaned) and
                           not is_date(cleaned) and
                           not self.is_money(cleaned))
        ner_values[pos] = "A" if looks_like_text else ","
    return ner_tags, ner_values
|
|
|
def pre_process_sentence(self, tokens, ner_tags, ner_values):
    """Turn "|"-separated annotation fields into a list of cleaned words.

    For tokens tagged NUMBER, MONEY, PERCENT or DATE with a non-empty NER
    value, the NER value (with comparison/currency symbols removed) is used
    instead of the surface token, converted to float where possible. Words in
    `self.annotated_word_reject` are dropped; numbers, dates and money strings
    are kept as they are; everything else goes through `full_normalize` and
    is kept only if it still contains a letter or digit.

    Args:
      tokens: "|"-separated tokens.
      ner_tags: "|"-separated NER tags, aligned with `tokens`.
      ner_values: "|"-separated NER values, aligned with `tokens`.

    Returns:
      A list whose items are floats or strings; `["UNK"]` if nothing is kept.
    """
    sentence = []
    tokens = tokens.split("|")
    ner_tags = ner_tags.split("|")
    ner_values = ner_values.split("|")
    ner_tags, ner_values = self.remove_consecutive(ner_tags, ner_values)

    for i in range(len(tokens)):
        word = tokens[i]
        if (ner_values[i] != "" and
                ner_tags[i] in ("NUMBER", "MONEY", "PERCENT", "DATE")):
            word = ner_values[i]
            for symbol in (">", "<", "=", "%", "~", "$", "£", "€"):
                word = word.replace(symbol, "")
            # A value with upper-case letters that is neither a date nor
            # money is not usable; fall back to the surface token.
            if (re.search("[A-Z]", word) and not is_date(word) and
                    not self.is_money(word)):
                word = tokens[i]
            if is_number(ner_values[i]):
                word = float(ner_values[i])
            elif is_number(word):
                word = float(word)
            # The literal token "score" is always kept as a word.
            if tokens[i] == "score":
                word = "score"
        if is_number(word):
            word = float(word)

        # `in` replaces dict.has_key(), which no longer exists on Python 3.
        if word in self.annotated_word_reject:
            continue
        if is_number(word) or is_date(word) or self.is_money(word):
            sentence.append(word)
        else:
            word = full_normalize(word)
            if (word not in self.annotated_word_reject and
                    re.search("[a-z0-9]", word, re.IGNORECASE)):
                sentence.append(word.replace(",", ""))

    if not sentence:
        sentence.append("UNK")
    return sentence
|
|
|
def load_annotated_data(self, in_file):
    """Read an annotated examples file into `self.annotated_examples`.

    Both `self.annotated_examples` and `self.annotated_tables` are reset.
    The first line of the file is skipped (presumably a header row); every
    other line must hold exactly ten tab-separated fields. Each line yields a
    `WikiExample` keyed by question id, and its table key is registered in
    `self.annotated_tables` with an empty placeholder.

    Args:
      in_file: path of the annotated examples file.

    Raises:
      ValueError: if a data line does not split into exactly ten fields.
    """
    self.annotated_examples = {}
    self.annotated_tables = {}
    # `with` guarantees the file is closed even when a line fails to parse.
    with tf.gfile.GFile(in_file, "r") as f:
        for line_number, line in enumerate(f):
            if line_number == 0:
                continue
            (question_id, _utterance, context, _target_value, tokens,
             _lemma_tokens, _pos_tags, ner_tags, ner_values,
             target_canon) = line.strip().split("\t")
            question = self.pre_process_sentence(tokens, ner_tags, ner_values)
            self.annotated_examples[question_id] = WikiExample(
                question_id, question, target_canon.split("|"), context)
            self.annotated_tables[context] = []
    print("Annotated examples loaded ", len(self.annotated_examples))
|
|
|
def is_number_column(self, a):
    """Return True if every cell of column `a` is a single numeric entry.

    A cell qualifies when it has exactly one element and that element passes
    `is_number`. An empty column counts as a number column.
    """
    return all(len(cell) == 1 and is_number(cell[0]) for cell in a)
|
|
|
def convert_table(self, table):
    """Render every cell of `table` as one space-joined string.

    Args:
      table: list of columns/rows, each a list of cells, each cell an
        iterable of items.

    Returns:
      A new nested list of the same outer shape in which each cell is the
      `str()` of its items joined by single spaces.
    """
    return [[" ".join(str(item) for item in cell) for cell in line]
            for line in table]
|
|
|
def load_annotated_tables(self):
    """Load every table registered in `self.annotated_tables`.

    For each table key the matching ".annotated" file (the key with "csv"
    replaced by "annotated", relative to `self.root_folder`) is read. The
    first line is skipped; every other line is padded to thirteen
    tab-separated fields. The grid is sized from the row/column numbers on
    the LAST line of the file and pre-filled with `bad_number`. Rows numbered
    "-1" supply the column names. Each table is finally split into number
    columns (every cell a single number) and word columns, and stored as a
    `TableInfo` under its key.

    Raises:
      ValueError: if a table file has no data lines, or a line has more than
        thirteen fields.
    """
    for table in list(self.annotated_tables.keys()):
        annotated_table = table.replace("csv", "annotated")
        table_path = os.path.join(self.root_folder, annotated_table)

        # Read the file once; `with` closes it even if parsing fails later.
        records = []
        with tf.gfile.GFile(table_path, "r") as f:
            for line_number, line in enumerate(f):
                if line_number == 0:
                    continue
                line = line.strip()
                line = line + "\t" * (13 - len(line.split("\t")))
                records.append(line.split("\t"))
        if not records:
            # Previously this silently reused the previous table's size.
            raise ValueError(
                "annotated table %s has no data rows" % annotated_table)

        # The last line is taken to hold the largest row and column index.
        max_row = int(records[-1][0])
        max_col = int(records[-1][1])
        orig_columns = [[bad_number] * (max_row + 1)
                        for _ in range(max_col + 1)]
        processed_columns = [[bad_number] * (max_row + 1)
                             for _ in range(max_col + 1)]

        column_names = []
        for record in records:
            (row, col, _read_id, _content, tokens, _lemma_tokens, _pos_tags,
             ner_tags, ner_values, _number, _date, _num2,
             _read_list) = record
            entry = self.pre_process_sentence(tokens, ner_tags, ner_values)
            if row == "-1":
                column_names.append(entry)
                continue

            col_idx = int(col)
            row_idx = int(row)
            orig_columns[col_idx][row_idx] = entry
            if len(entry) == 1 and is_number(entry[0]):
                processed_columns[col_idx][row_idx] = float(entry[0])
            else:
                # Use the first number in the cell, if there is one ...
                for single_entry in entry:
                    if is_number(single_entry):
                        processed_columns[col_idx][row_idx] = float(
                            single_entry)
                        break
                # ... but a DATE value whose digits form a number wins
                # (the last such token if there are several).
                nt = ner_tags.split("|")
                nv = ner_values.split("|")
                for i_entry in range(len(tokens.split("|"))):
                    date_digits = nv[i_entry].replace("-", "").replace("X", "")
                    if nt[i_entry] == "DATE" and is_number(date_digits):
                        processed_columns[col_idx][row_idx] = float(
                            date_digits)

            # Lower-case the "X" wildcards of a lone date entry. `entry` is
            # the same list object stored in orig_columns, so this edits the
            # stored cell too.
            if (len(entry) == 1 and not is_number(entry[0]) and
                    is_date(entry[0])):
                entry[0] = entry[0].replace("X", "x")

        word_columns = []
        processed_word_columns = []
        word_column_names = []
        word_column_indices = []
        number_columns = []
        processed_number_columns = []
        number_column_names = []
        number_column_indices = []
        for i in range(max_col + 1):
            if self.is_number_column(orig_columns[i]):
                number_column_indices.append(i)
                number_column_names.append(column_names[i])
                number_columns.append(
                    [w[0] for w in orig_columns[i] if is_number(w[0])])
                processed_number_columns.append(processed_columns[i])
            else:
                word_column_indices.append(i)
                word_column_names.append(column_names[i])
                word_columns.append(orig_columns[i])
                processed_word_columns.append(processed_columns[i])

        self.annotated_tables[table] = TableInfo(
            word_columns, word_column_names, word_column_indices,
            number_columns, number_column_names, number_column_indices,
            processed_word_columns, processed_number_columns, orig_columns)
|
|
|
def answer_classification(self):
    """Classify each annotated example and attach its table data.

    Reads "<root_folder>/arvind-with-norms-2.tsv", whose lines hold
    (example id, answer index, raw answer, processed answer, matched cells);
    the matched-cells field may be absent. Lines whose id is not in
    `self.annotated_examples` are ignored.

    For every example this sets `lookup_matrix` (rows x columns, zero except
    for matched cells, which hold answer index + 1), the flags
    `is_number_lookup`, `is_word_lookup`, `is_number_calc`,
    `is_bad_example`, `is_unknown_answer` and `is_lookup`, the values
    `number_answer`, `calc_answer` and `lookup_number_answer` where
    applicable, and copies of the table's word/number columns, names,
    processed columns and the correspondingly split lookup matrices.

    Raises:
      KeyError: if an annotated example has no line in the TSV file.
      ValueError: if a kept TSV line does not have four or five fields.
    """
    # example id -> list of 5-field records from the TSV file
    ice = {}
    with tf.gfile.GFile(
            self.root_folder + "/arvind-with-norms-2.tsv", mode="r") as f:
        for line in f.readlines():
            fields = line.strip().split("\t")
            # `in` replaces dict.has_key(), which is gone on Python 3.
            if fields[0] not in self.annotated_examples:
                continue
            if len(fields) == 4:
                # No matched cells on this line: add the empty fifth field.
                fields.append("")
            (example_id, _ans_index, _ans_raw, _process_answer,
             _matched_cells) = fields
            ice.setdefault(example_id, []).append(fields)

    for q_id in self.annotated_examples.keys():
        example = self.annotated_examples[q_id]
        table_info = self.annotated_tables[example.table_key]

        # Figure out whether the answer is a table lookup.
        n_cols = len(table_info.orig_columns)
        n_rows = len(table_info.orig_columns[0])
        example.lookup_matrix = np.zeros((n_rows, n_cols))
        exact_matches = {}
        for (_example_id, ans_index, _ans_raw, _process_answer,
             matched_cells) in ice[q_id]:
            for match_cell in matched_cells.split("|"):
                if len(match_cell.split(",")) == 2:
                    (row, col) = match_cell.split(",")
                    row = int(row)
                    col = int(col)
                    if row >= 0:
                        exact_matches[ans_index] = 1
        # Every answer item must be matched by at least one body cell.
        answer_is_in_table = len(exact_matches) == len(example.answer)
        if answer_is_in_table:
            for (_example_id, ans_index, _ans_raw, _process_answer,
                 matched_cells) in ice[q_id]:
                for match_cell in matched_cells.split("|"):
                    if len(match_cell.split(",")) == 2:
                        (row, col) = match_cell.split(",")
                        row = int(row)
                        col = int(col)
                        # NOTE(review): a negative row is not filtered here
                        # and would index from the end of the matrix.
                        example.lookup_matrix[row, col] = (
                            float(ans_index) + 1.0)

        example.lookup_number_answer = 0.0
        if answer_is_in_table:
            if len(example.answer) == 1 and is_number(example.answer[0]):
                example.number_answer = float(example.answer[0])
                example.is_number_lookup = True
            else:
                example.calc_answer = example.number_answer = 0.0
                example.is_word_lookup = True
        else:
            if len(example.answer) == 1 and is_number(example.answer[0]):
                # NOTE(review): stored unconverted here, but as a float in
                # the lookup branch above -- confirm consumers expect that.
                example.number_answer = example.answer[0]
                example.is_number_calc = True
            else:
                example.is_bad_example = True
                example.is_unknown_answer = True
        example.is_lookup = example.is_word_lookup or example.is_number_lookup
        if not example.is_word_lookup and not example.is_bad_example:
            example.calc_answer = example.answer[0]
            example.lookup_number_answer = example.calc_answer

        # Split the lookup matrix into its word part and number part.
        number_column_indices = table_info.number_column_indices
        word_column_indices = table_info.word_column_indices
        example.word_columns = table_info.word_columns
        example.number_columns = table_info.number_columns
        example.word_column_names = table_info.word_column_names
        example.processed_number_columns = table_info.processed_number_columns
        example.processed_word_columns = table_info.processed_word_columns
        example.number_column_names = table_info.number_column_names
        example.number_lookup_matrix = example.lookup_matrix[
            :, number_column_indices]
        example.word_lookup_matrix = example.lookup_matrix[
            :, word_column_indices]
|
|
|
def load(self):
    """Load and classify all three splits.

    The training annotations ("training.annotated") are loaded and
    classified first and serve both the train and dev loaders; the
    annotations are then replaced by "pristine-unseen-tables.annotated" for
    the test loader.

    Returns:
      A tuple `(train_data, dev_data, test_data)` of example lists, each in
      the order of the ids read by the corresponding loader.
    """
    self.load_annotated_data(
        os.path.join(self.data_folder, "training.annotated"))
    self.load_annotated_tables()
    self.answer_classification()
    self.train_loader.load()
    self.dev_loader.load()
    train_data = [self.annotated_examples[question_id]
                  for question_id in self.train_loader.examples]
    dev_data = [self.annotated_examples[question_id]
                for question_id in self.dev_loader.examples]

    # The test split has its own annotation file; loading it replaces the
    # annotation dictionaries used above.
    self.load_annotated_data(
        os.path.join(self.data_folder, "pristine-unseen-tables.annotated"))
    self.load_annotated_tables()
    self.answer_classification()
    self.test_loader.load()
    test_data = [self.annotated_examples[question_id]
                 for question_id in self.test_loader.examples]
    return train_data, dev_data, test_data
|
|