deepcode-ai
/

Compact_Facts

Model card Files Files and versions Community

File size: 6,131 Bytes

4fb0bd1

import json
import os
from collections import defaultdict
import random

from transformers import AutoTokenizer


def add_marker_tokens(tokenizer, ner_labels):
    new_tokens = ['<START>', '<END>']
    for label in ner_labels:
        new_tokens.append('<START=%s>'%label)
        new_tokens.append('<END=%s>'%label)
    tokenizer.add_tokens(new_tokens)
    print('# vocab after adding markers: %d'%len(tokenizer))


def tokenize_sentences(ext, tokenizer, special_tokens, rel_file):
    rel_indices = {}
    arg_indices = {}
    label_ids = []

    def get_special_token(w):
        if w not in special_tokens:
            special_tokens[w] = ('<' + w + '>').lower()
        return special_tokens[w]

    cls = tokenizer.cls_token
    sep = tokenizer.sep_token
    wordpiece_tokens = [cls]
    wordpiece_tokens_index = []
    cur_index = len(wordpiece_tokens)

    Argument_START_NER = get_special_token("START=Argument")
    Argument_END_NER = get_special_token("END=Argument")
    Relation_START_NER = get_special_token("START=Relation")
    Relation_END_NER = get_special_token("END=Relation")

    ent2offset = {}
    for ent in ext['entityMentions']:
        ent2offset[ent['emId']] = ent['span_ids']

    argument_start_ids = []
    argument_end_ids = []
    relation_start_ids = []
    # add negative relations as well (label = 0)
    relation_end_ids = []
    entity_set = set()
    relation2entity = defaultdict(set)
    for rel in ext['relationMentions']:
        relation_span = ent2offset[rel['arg1']['emId']]
        relation_start_ids.append(relation_span[0])
        relation_end_ids.append(relation_span[-1])
        argument_span = ent2offset[rel['arg2']['emId']]
        argument_start_ids.append(argument_span[0])
        argument_end_ids.append(argument_span[-1])
        label_ids.append(rel_file["id"][rel['label']])
        # add negative sampling
        relation2entity[relation_start_ids[-1]].add(argument_start_ids[-1])
        entity_set.add(argument_start_ids[-1])

    for i, token in enumerate(ext['sentence'].split(' ')):
        if i in relation_start_ids:
            rel_indices[i] = len(wordpiece_tokens)
            wordpiece_tokens.append(Relation_START_NER)
            wordpiece_tokens_index.append([cur_index, cur_index + 1])
            cur_index += 1
        if i in argument_start_ids:
            arg_indices[i] = len(wordpiece_tokens)
            wordpiece_tokens.append(Argument_START_NER)
            wordpiece_tokens_index.append([cur_index, cur_index + 1])
            cur_index += 1

        tokenized_token = list(tokenizer.tokenize(token))
        wordpiece_tokens.extend(tokenized_token)
        wordpiece_tokens_index.append([cur_index, cur_index + len(tokenized_token)])
        cur_index += len(tokenized_token)

        if i in relation_end_ids:
            wordpiece_tokens.append(Relation_END_NER)
            wordpiece_tokens_index.append([cur_index, cur_index + 1])
            cur_index += 1
        if i in argument_end_ids:
            wordpiece_tokens.append(Argument_END_NER)
            wordpiece_tokens_index.append([cur_index, cur_index + 1])
            cur_index += 1

    wordpiece_tokens.append(sep)
    wordpiece_segment_ids = [1] * (len(wordpiece_tokens))
    assert len(argument_start_ids) == len(relation_start_ids)
    assert len(argument_start_ids) == len(label_ids)

    # add negative relations with label 0
    for rel, args in relation2entity.items():
        negative_args = list(entity_set.difference(args))
        for i in range(len(negative_args) // 3):
            arg_index = random.randint(0, len(negative_args) - 1)
            relation_start_ids.append(rel)
            argument_start_ids.append(negative_args[arg_index])
            label_ids.append(0)

    return {
        'sentId': ext['sentId'],
        'sentText': ext['sentence'],
        'entityMentions': ext['entityMentions'],
        'relationMentions': ext['relationMentions'],
        'extractionMentions': ext['extractionMentions'],
        'labelIds': label_ids,
        'relationIds': [rel_indices[r] for r in relation_start_ids],
        'argumentIds': [arg_indices[a] for a in argument_start_ids],
        'wordpieceSentText': " ".join(wordpiece_tokens),
        'wordpieceTokensIndex': wordpiece_tokens_index,
        'wordpieceSegmentIds': wordpiece_segment_ids
    }


def write_dataset_to_file(dataset, dataset_path):
    print("dataset: {}, size: {}".format(dataset_path, len(dataset)))
    with open(dataset_path, 'w', encoding='utf-8') as fout:
        for idx, ext in enumerate(dataset):
            fout.write(json.dumps(ext))
            if idx != len(dataset) - 1:
                fout.write('\n')


def process(source_file, rel_file, target_file, pretrained_model):
    extractions_list = []
    auto_tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
    print("Load {} tokenizer successfully.".format(pretrained_model))

    rel_id_file = json.load(open(rel_file, 'r', encoding='utf-8'))
    add_marker_tokens(auto_tokenizer, rel_id_file["entity_text"])

    if os.path.exists('special_tokens.json'):
        with open('special_tokens.json', 'r') as f:
            special_tokens = json.load(f)
    else:
        raise FileNotFoundError

    with open(source_file, 'r', encoding='utf-8') as fin, open(target_file, 'w', encoding='utf-8') as fout:
        for line in fin:
            ext = json.loads(line.strip())
            ext_dict = tokenize_sentences(ext, auto_tokenizer, special_tokens, rel_id_file)
            extractions_list.append(ext_dict)
            fout.write(json.dumps(ext_dict))
            fout.write('\n')

    # shuffle and split to train/test/dev
    random.seed(100)
    random.shuffle(extractions_list)
    train_set = extractions_list[:len(extractions_list) - 700]
    dev_set = extractions_list[-700:-200]
    test_set = extractions_list[-200:]
    write_dataset_to_file(train_set, "train.json")
    write_dataset_to_file(dev_set, "devs.json")
    write_dataset_to_file(test_set, "test.json")


if __name__ == '__main__':
    process("../benchmark.json", "rel_file.json", "relation_model_data.json", "bert-base-uncased")