# NOTE: removed paste artifacts that preceded the file ("Spaces:" / "Runtime error" lines).
#!/usr/bin/env python3
# coding=utf-8
import json
from itertools import chain
from transformers import AutoTokenizer
from utility.subtokenize import subtokenize
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
def load_dataset(path):
    """Load a JSON-lines dataset and precompute whitespace token anchors.

    Each line of *path* is one JSON object with at least an ``"id"`` and an
    ``"input"`` string. For every sentence this:
      * guarantees ``"nodes"`` / ``"edges"`` keys exist (empty list default),
      * keeps the raw string under ``"sentence"``,
      * replaces ``"input"`` with its space-split tokens,
      * records per-token character intervals in ``"token anchors"``
        (``{"from", "to"}``), assuming tokens are separated by a single space.

    Args:
        path: filesystem path to a UTF-8 JSON-lines file.

    Returns:
        dict mapping sentence id -> mutated sentence dict.
    """
    data = {}
    with open(path, encoding="utf8") as f:
        # Iterate the file handle lazily instead of f.readlines(),
        # which would materialize the whole file in memory at once.
        for line in f:
            if not line.strip():
                continue  # tolerate blank/trailing lines in the JSONL file
            sentence = json.loads(line)
            data[sentence["id"]] = sentence
            sentence.setdefault("nodes", [])
            sentence.setdefault("edges", [])

    # No copy needed: we mutate values in place, never add/remove keys here.
    for sample in data.values():
        sample["sentence"] = sample["input"]
        sample["input"] = sample["sentence"].split(' ')
        sample["token anchors"], offset = [], 0
        for token in sample["input"]:
            sample["token anchors"].append({"from": offset, "to": offset + len(token)})
            offset += len(token) + 1  # +1 skips the single separating space
    return data
def node_generator(data):
    """Yield ``(node, sentence)`` for every node of every sentence in *data*."""
    for sentence in data.values():
        yield from ((node, sentence) for node in sentence["nodes"])
def anchor_ids_from_intervals(data):
    """Convert character-interval anchors to token-index anchors, in place.

    For every node: its character intervals (sorted by ``(from, to)``) are
    preserved under ``"anchor intervals"``, while ``"anchors"`` becomes the
    sorted list of indices of tokens whose character span overlaps any
    interval. Finally each sentence's ``"token anchors"`` entries are
    flattened from ``{"from", "to"}`` dicts to ``[from, to]`` pairs.
    """
    for sentence in data.values():
        token_spans = sentence["token anchors"]
        for node in sentence["nodes"]:
            intervals = sorted(node.get("anchors", []),
                               key=lambda a: (a["from"], a["to"]))
            referenced = set()
            for interval in intervals:
                for index, span in enumerate(token_spans):
                    if span["to"] <= interval["from"]:
                        continue  # token ends before the interval starts
                    if span["from"] >= interval["to"]:
                        break  # token starts after the interval ends; token spans are ordered
                    referenced.add(index)  # overlap -> token is anchored
            node["anchor intervals"] = intervals
            node["anchors"] = sorted(referenced)

    # Flatten the character spans now that every node has been resolved.
    for sentence in data.values():
        sentence["token anchors"] = [[span["from"], span["to"]]
                                     for span in sentence["token anchors"]]
def create_bert_tokens(data, encoder: str):
    """Subtokenize every sentence with the tokenizer of the given encoder.

    Loads the fast Hugging Face tokenizer named by *encoder* once, then stores
    each sentence's subtoken ids under ``"bert input"`` and the
    subtoken-to-token scatter map under ``"to scatter"``.
    """
    tokenizer = AutoTokenizer.from_pretrained(encoder, use_fast=True)
    for item in data.values():
        subtokens, scatter = subtokenize(item["input"], tokenizer)
        item["bert input"] = subtokens
        item["to scatter"] = scatter
def create_edges(sentence, label_f=None):
    """Build sparse edge-presence and edge-label structures for a sentence.

    Stores under ``sentence["edge presence"]`` and ``sentence["edge labels"]``
    triples of the form ``[N, N, entries]`` where ``N`` is the node count and
    ``entries`` lists ``(source, target, value)`` tuples — value ``1`` for
    presence, the (optionally *label_f*-mapped) label otherwise; edges with no
    ``"label"`` key get the label ``"none"``.

    Returns:
        The number of edges recorded.
    """
    node_count = len(sentence["nodes"])
    sentence["edge presence"] = presence = [node_count, node_count, []]
    sentence["edge labels"] = labels = [node_count, node_count, []]
    for edge in sentence["edges"]:
        src, tgt = edge["source"], edge["target"]
        label = edge.get("label", "none")
        if label_f is not None:
            label = label_f(label)
        presence[-1].append((src, tgt, 1))
        labels[-1].append((src, tgt, label))
    return len(presence[-1])