# loren-fact-checking/src/dataloaders.py
# Commit 7f7285f by jiangjiechen: "init loren for spaces" (4.52 kB)
# -*- coding: utf-8 -*-
'''
@Author : Jiangjie Chen
@Time : 2020/7/20 17:34
@Contact : [email protected]
@Description:
'''
import tensorflow as tf
import cjjpy as cjj
import os
import re
import ujson as json
from collections import defaultdict
# Base directory that every data path in this module is joined onto
# (see the f'{pj_prefix}/...' paths in FEVERLoader).
# NOTE(review): presumably the project root, i.e. the parent of the directory
# holding this file -- confirm against cjjpy.AbsParentDir.
pj_prefix = cjj.AbsParentDir(__file__, '..')
class FEVERLoader:
    '''
    In-memory view over the FEVER data files stored below ``pj_prefix``.

    Records are kept in ``self.fever_data``, a ``defaultdict(dict)`` keyed by
    the ``id`` field of each JSON line. Each ``_load_*`` step merges its own
    keys into the record of the matching id, so one record may end up holding
    ``id``, ``claim``, ``label``, ``golden_evidence``, ``all_evidence`` and
    ``<retrieve_type>_evidence``.

    The loader is iterable (yields ids), sized, and indexable by id.
    '''

    # Ordered (pattern, replacement) pairs applied to evidence sentences.
    # Order matters: the dashed PTB-style tokens (-LSB-, -LRB-, ...) are handled
    # first, then the same tokens with the dashes already stripped.
    # Raw strings are used so that "\-" is a regex escape instead of an invalid
    # string escape (SyntaxWarning on recent Python versions).
    _TEXT_SUBS = (
        (r" \-LSB\-.*?\-RSB\-", ""),   # drop square-bracketed spans entirely
        (r"\-LRB\- \-RRB\- ", ""),     # drop empty parentheses
        (r" -LRB-", " ( "),
        (r"-RRB-", " )"),
        (r" LSB.*?RSB", ""),
        (r"LRB RRB ", ""),
        (r"LRB", " ( "),
        (r"RRB", " )"),
        (r"--", "-"),
        (r"``", '"'),
        (r"''", '"'),
    )

    # Ordered (pattern, replacement) pairs applied to page titles.
    _TITLE_SUBS = (
        (r"_", " "),
        (r" -LRB-", " ( "),
        (r"-RRB-", " )"),
        (r"-COLON-", ":"),
    )

    # The bracket replacements above leave doubled spaces behind
    # (e.g. "a -LRB- b" -> "a (  b"); this squeezes them back to one.
    _MULTI_SPACE = r" {2,}"

    def __init__(self, role):
        '''
        :param role: one of 'train', 'dev', 'test', 'eval'; 'val' is accepted
            as an alias of 'dev'.
        :raises AssertionError: if ``role`` is not one of the values above.
        '''
        role = 'dev' if role == 'val' else role
        assert role in ['train', 'dev', 'test', 'eval']
        self.role = role
        self.fever_data = defaultdict(dict)
        # Label strings exposed as attributes for callers.
        self.SUPPORTS = 'SUPPORTS'
        self.REFUTES = 'REFUTES'
        self.NEI = 'NOT ENOUGH INFO'

    def __iter__(self):
        '''Yield the ids of all loaded records.'''
        for k in self.fever_data:
            yield k

    def __len__(self):
        '''Return the number of loaded records.'''
        return len(self.fever_data)

    def __getitem__(self, item):
        '''
        Return the record stored under id ``item``.

        Because ``fever_data`` is a ``defaultdict(dict)``, looking up an
        unknown id inserts and returns an empty dict instead of raising.
        '''
        return self.fever_data[item]

    def load_fever(self, retrieve_type='bert', clean_load=True):
        '''
        Load golden, "all" and retrieved evidence, in that order.

        :param retrieve_type: name of the retrieved-evidence file family;
            only 'bert' is accepted.
        :param clean_load: if True the retrieved evidence is passed through
            ``_clean_evidence``; if False it is stored as read from disk.
        '''
        self._load_fever_golden()
        self._load_fever_all()
        self._load_fever_retrieved(retrieve_type, clean_load)

    def _load_json(self, fname):
        '''
        Read a JSON-lines file and return the parsed objects as a list.

        Blank lines (e.g. a trailing empty line) are skipped rather than
        handed to the JSON parser, which would reject them.
        '''
        with tf.io.gfile.GFile(fname) as f:
            return [json.loads(line) for line in f.readlines() if line.strip()]

    def _new_role(self):
        '''Return the role used in golden/all file names ('eval' maps to 'dev').'''
        return 'dev' if self.role == 'eval' else self.role

    def _load_fever_golden(self):
        '''
        Load claims and, except for the test role, labels and golden evidence.

        For role 'test' only ``id`` and ``claim`` are read, from
        ``shared_task_test.jsonl``; otherwise ``golden_<role>.json`` is read
        and its evidence is cleaned.
        '''
        if self.role == 'test':
            path = f'{pj_prefix}/data/fever/shared_task_test.jsonl'
            for js in self._load_json(path):
                self.fever_data[js['id']].update({
                    'id': js['id'],
                    'claim': js['claim'],
                })
        else:
            role = self._new_role()
            path = f'{pj_prefix}/data/fever/baked_data/golden_{role}.json'
            for js in self._load_json(path):
                self.fever_data[js['id']].update({
                    'id': js['id'],
                    'claim': js['claim'],
                    'label': js['label'],
                    'golden_evidence': self._clean_evidence(js['evidence']),
                })
        print('* FEVER golden loaded.')

    def _load_fever_all(self):
        '''Merge the cleaned evidence of ``all_<role>.json`` as ``all_evidence``.'''
        role = self._new_role()
        path = f'{pj_prefix}/data/fever/baked_data/all_{role}.json'
        for js in self._load_json(path):
            self.fever_data[js['id']].update({
                'all_evidence': self._clean_evidence(js['evidence']),
            })
        print('* FEVER all loaded.')

    def _load_fever_retrieved(self, retrieve_type, clean_load):
        '''
        Merge ``<retrieve_type>_<role>.json`` as ``<retrieve_type>_evidence``.

        :param retrieve_type: only 'bert' is accepted.
        :param clean_load: clean the evidence if True, store it raw otherwise.
        :raises AssertionError: if ``retrieve_type`` is not supported.
        '''
        assert retrieve_type in ['bert']
        # NOTE(review): this uses self.role directly, so role 'eval' reads
        # '<retrieve_type>_eval.json' while golden/all map 'eval' to 'dev'.
        # Presumably intentional (a separate eval file) -- confirm.
        path = f'{pj_prefix}/data/fever/baked_data/{retrieve_type}_{self.role}.json'
        for js in self._load_json(path):
            evidence = js['evidence']
            if clean_load:
                evidence = self._clean_evidence(evidence)
            self.fever_data[js['id']].update({
                f'{retrieve_type}_evidence': evidence,
            })
        print(f'* FEVER {retrieve_type} loaded.')

    def clean_text(self, sentence):
        '''
        Normalise bracket tokens, dashes and quotes in an evidence sentence.

        Applies ``_TEXT_SUBS`` in order and then collapses runs of spaces.

        :param sentence: raw sentence string.
        :return: the cleaned sentence.
        '''
        for pattern, replacement in self._TEXT_SUBS:
            sentence = re.sub(pattern, replacement, sentence)
        return re.sub(self._MULTI_SPACE, ' ', sentence)

    def clean_title(self, title):
        '''
        Turn a page title with underscores and bracket/colon tokens into text.

        Applies ``_TITLE_SUBS`` in order and then collapses runs of spaces.

        :param title: raw title string.
        :return: the cleaned title.
        '''
        for pattern, replacement in self._TITLE_SUBS:
            title = re.sub(pattern, replacement, title)
        return re.sub(self._MULTI_SPACE, ' ', title)

    def _clean_evidence(self, evidence):
        '''
        Clean every evidence entry of a record.

        Entries of length 3 or 4 get element 0 cleaned as a title and element
        2 cleaned as sentence text; the remaining elements are passed through
        unchanged. Empty entries are kept as they are.

        :param evidence: iterable of evidence entries (sequences).
        :return: a new list of cleaned entries.
        :raises ValueError: if an entry has a length other than 0, 3 or 4.
        '''
        cev = []
        for ev in evidence:
            if len(ev) in (3, 4):
                # Rebuild the entry as a list; anything after the text
                # (a 4th element, if present) is carried over untouched.
                cev.append([self.clean_title(ev[0]), ev[1], self.clean_text(ev[2]), *ev[3:]])
            elif len(ev) == 0:
                cev.append(ev)
            else:
                raise ValueError(ev)
        return cev
if __name__ == '__main__':
    # Manual smoke test: load the test split with raw (uncleaned) retrieved
    # evidence and page through the records one at a time.
    loader = FEVERLoader('test')
    loader.load_fever(retrieve_type='bert', clean_load=False)
    for claim_id in loader:
        record = loader[claim_id]
        print(record)
        # Wait for Enter before showing the next record.
        input()