MEIRa / data_utils / utils.py
import json
from os import path
from typing import Dict, Optional

import jsonlines

def get_data_file(data_dir: str, split: str, max_segment_len: int) -> Optional[str]:
    """Return the path to the jsonlines file for a split, or None if no file exists.

    Prefers the segment-length-specific file and falls back to the plain split file.
    """
    jsonl_file = path.join(
        data_dir, "{}.{}.met.jsonlines".format(split, max_segment_len)
    )
    print("File access: ", jsonl_file)
    if path.exists(jsonl_file):
        return jsonl_file

    # Fall back to the file without the segment-length suffix.
    jsonl_file = path.join(data_dir, "{}.met.jsonlines".format(split))
    if path.exists(jsonl_file):
        return jsonl_file

    # Neither variant exists.
    return None

def load_dataset(
    data_dir: str,
    singleton_file: str = None,
    max_segment_len: int = 2048,
    num_train_docs: int = None,
    num_dev_docs: int = None,
    num_test_docs: int = None,
    dataset_name: str = None,
) -> Dict:
    """Load the train/dev/test splits, optionally merging in singleton mentions."""
    all_splits = []
    for split in ["train", "dev", "test"]:
        jsonl_file = get_data_file(data_dir, split, max_segment_len)
        if jsonl_file is None:
            raise ValueError(f"No relevant files at {data_dir}")

        split_data = []
        with open(jsonl_file) as f:
            for line in f:
                load_dict = json.loads(line.strip())
                load_dict["dataset_name"] = dataset_name
                split_data.append(load_dict)
        all_splits.append(split_data)

    train_data, dev_data, test_data = all_splits

    if singleton_file is not None and path.exists(singleton_file):
        num_singletons = 0
        with open(singleton_file) as f:
            singleton_data = json.loads(f.read())

        # Append the first span of each singleton cluster to the last gold cluster
        # of the corresponding training document.
        for instance in train_data:
            doc_key = instance["doc_key"]
            if doc_key in singleton_data and len(instance["clusters"]) != 0:
                num_singletons += len(singleton_data[doc_key])
                instance["clusters"][-1].extend(
                    [cluster[0] for cluster in singleton_data[doc_key]]
                )
        print("Added %d singletons" % num_singletons)

    return {
        "train": train_data[:num_train_docs],
        "dev": dev_data[:num_dev_docs],
        "test": test_data[:num_test_docs],
    }

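# The singleton-merging step in load_dataset assumes `singleton_file` is a JSON object
# mapping each doc_key to a list of singleton clusters, each cluster being a list of
# mention spans; only the first span of each cluster is used. Illustrative example
# (field values are hypothetical, not taken from the repository):
#   {"document_0": [[[5, 7]], [[12, 12]]]}
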
def load_eval_dataset(
    data_dir: str,
    external_md_file: str,
    max_segment_len: int,
    dataset_name: str = None,
) -> Dict:
    """Load the dev/test splits and attach externally predicted mentions if provided."""
    data_dict = {}
    for split in ["dev", "test"]:
        jsonl_file = get_data_file(data_dir, split, max_segment_len)
        if jsonl_file is not None:
            split_data = []
            with open(jsonl_file) as f:
                for line in f:
                    load_dict = json.loads(line.strip())
                    load_dict["dataset_name"] = dataset_name
                    split_data.append(load_dict)
            data_dict[split] = split_data

    if external_md_file is not None and path.exists(external_md_file):
        # Index externally detected mentions by document key.
        predicted_mentions = {}
        with jsonlines.open(external_md_file, mode="r") as reader:
            for line in reader:
                predicted_mentions[line["doc_key"]] = line

        for split in ["dev", "test"]:
            # Guard against splits whose jsonlines file was missing above.
            for instance in data_dict.get(split, []):
                doc_key = instance["doc_key"]
                if doc_key in predicted_mentions:
                    instance["ext_predicted_mentions"] = sorted(
                        predicted_mentions[doc_key]["pred_mentions"]
                    )

    return data_dict
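
# Minimal usage sketch (not part of the original module). The paths, segment length,
# and dataset name below are placeholders chosen for illustration; substitute the
# values used in your own setup.
if __name__ == "__main__":
    # Expects files such as <data_dir>/train.2048.met.jsonlines
    # (or the fallback <data_dir>/train.met.jsonlines).
    data = load_dataset(
        data_dir="data/sample_corpus",  # hypothetical directory
        max_segment_len=2048,
        dataset_name="sample_corpus",  # hypothetical dataset name
    )
    print({split: len(docs) for split, docs in data.items()})

    eval_data = load_eval_dataset(
        data_dir="data/sample_corpus",  # hypothetical directory
        external_md_file=None,  # no externally predicted mentions attached
        max_segment_len=2048,
        dataset_name="sample_corpus",
    )
    print({split: len(docs) for split, docs in eval_data.items()})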