# -*- coding: utf-8 -*-

"""
@Last modified date : 2020/12/23
"""

import re
import nltk
import torch.cuda
from nltk.stem import WordNetLemmatizer
from allennlp.predictors.predictor import Predictor

nltk.download('wordnet')
nltk.download('stopwords')


def deal_bracket(text, restore, leading_ent=None):
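    """Optionally prefix the text with its leading entity and restore -LRB-/-RRB- tokens to parentheses."""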
    if leading_ent:
        leading_ent = ' '.join(leading_ent.split('_'))
        text = f'Things about {leading_ent}: ' + text
    if restore:
        text = text.replace('-LRB-', '(').replace('-RRB-', ')')
        text = text.replace('LRB', '(').replace('RRB', ')')
    return text


def refine_entity(entity):
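    """Drop a trailing -LRB- ... -RRB- disambiguation suffix and normalize underscores and whitespace."""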
    entity = re.sub(r'-LRB- .+ -RRB-$', '', entity)
    entity = re.sub(r'LRB .+ RRB$', '', entity)
    entity = re.sub(r'_', ' ', entity)
    entity = re.sub(r'\s+', ' ', entity)
    return entity.strip()


def find_sub_seq(seq_a, seq_b, shift=0, uncased=False, lemmatizer=None):
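    """Return the (start, end) token span of the first occurrence of seq_b in seq_a at or after `shift`;
    matching is optionally case-insensitive and/or lemma-based, and (-1, -1) is returned if no match exists.
    """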
    if uncased:
        seq_a = [token.lower() for token in seq_a]
        seq_b = [token.lower() for token in seq_b]
    if lemmatizer is not None:
        seq_a = [lemmatizer.lemmatize(token) for token in seq_a]
        seq_b = [lemmatizer.lemmatize(token) for token in seq_b]
    for i in range(shift, len(seq_a)):
        if seq_a[i:i+len(seq_b)] == seq_b:
            return i, i + len(seq_b)
    return -1, -1


def is_sub_seq(seq_start, seq_end, all_seqs):
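    """Return the (start, end, is_candidate) span in all_seqs that contains [seq_start, seq_end), or None."""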
    for start, end, is_candidate in all_seqs:
        if start <= seq_start < seq_end <= end:
            return start, end, is_candidate
    return None


# extract named entities tagged with the B-I-L-U-O scheme
def extract_named_entity(tags):
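    """Collect (start, end, False) token spans for entities tagged with the B-I-L-U-O scheme."""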
    all_NEs = []
    ne_type, ne_start = '', -1
    for i, t in enumerate(tags):
        if t == 'O':
            ne_type, ne_start = '', -1
            continue
        t1, t2 = t.split('-')
        if t1 == 'B':
            ne_type, ne_start = t2, i
        elif t1 == 'I' and t2 != ne_type:
            ne_type, ne_start = '', -1
        elif t1 == 'L' and t2 != ne_type:
            ne_type, ne_start = '', -1
        elif t1 == 'L' and t2 == ne_type:
            all_NEs.append((ne_start, i + 1, False))
            ne_type, ne_start = '', -1
        elif t1 == 'U':
            all_NEs.append((i, i + 1, False))
            ne_type, ne_start = '', -1

    return all_NEs


def refine_results(tokens, spans, stopwords):
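    """Trim leading stopwords from non-candidate spans, attach a preceding article, convert token spans
    to character offsets, and drop spans nested inside longer ones; returns (text, start, end) triples.
    """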
    all_spans = []
    for span_start, span_end, is_candidate in spans:
        # remove stopwords
        if not is_candidate:
            while span_start < span_end and tokens[span_start].lower() in stopwords:
                span_start += 1
            if span_start >= span_end:
                continue

        # add prefix
        if span_start > 0 and tokens[span_start - 1] in ['a', 'an', 'A', 'An', 'the', 'The']:
            span_start -= 1

        # convert token-level index into char-level index
        span = ' '.join(tokens[span_start:span_end])
        span_start = len(' '.join(tokens[:span_start])) + 1 * min(1, span_start)  # +1 for the separating space unless the span starts at index 0
        span_end = span_start + len(span)

        all_spans.append((span, span_start, span_end))
    all_spans = sorted(all_spans, key=lambda x: (x[1], x[1] - x[2]))

    # remove overlap
    refined_spans = []
    for span, span_start, span_end in all_spans:
        flag = True
        for _, start, end in refined_spans:
            if start <= span_start < span_end <= end:
                flag = False
                break
        if flag:
            refined_spans.append((span, span_start, span_end))

    return refined_spans


class SentenceParser:
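    """Wraps AllenNLP NER and constituency-parsing models to extract noun phrases, verbs, and modifiers."""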
    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu',
                 ner_path="https://storage.googleapis.com/allennlp-public-models/ner-model-2020.02.10.tar.gz",
                 cp_path="https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz"):
        self.device = self.parse_device(device)
        self.ner = Predictor.from_path(ner_path, cuda_device=self.device)
        print('* ner loaded')
        self.cp = Predictor.from_path(cp_path, cuda_device=self.device)
        print('* constituency parser loaded')
        self.lemmatizer = WordNetLemmatizer()

        # some heuristic rules can be added here
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        self.stopwords.update({'-', '\'s', 'try', 'tries', 'tried', 'trying',
                               'become', 'becomes', 'became', 'becoming',
                               'make', 'makes', 'made', 'making', 'call', 'called', 'calling',
                               'put', 'ever', 'something', 'someone', 'sometime'})
        self.special_tokens = ['only', 'most', 'before', 'after', 'behind']
        for token in self.special_tokens:
            self.stopwords.discard(token)
        self.stopwords.discard('won')
        self.stopwords.discard('own')

    def parse_device(self, device):
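        """Map a torch-style device string ('cpu', 'cuda', 'cuda:0', ...) to the cuda_device index AllenNLP expects (-1 for CPU)."""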
        if 'cpu' in device:
            return -1
        else:
            dev = re.findall(r'\d+', device)
            return 0 if len(dev) == 0 else int(dev[0])

    def identify_NPs(self, text, candidate_NPs=None):
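        """Parse `text` and return its normalized form together with noun-phrase, verb, and modifier spans,
        each given as (span_text, char_start, char_end); spans in `candidate_NPs` are matched first and preserved.
        """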
        text = re.sub(r'\s+', ' ', text).strip()
        if len(text) == 0:
            return {'text': '', 'NPs': [], 'verbs': [], 'adjs': []}

        cp_outputs = self.cp.predict(text)
        ner_outputs = self.ner.predict(text)
        tokens = cp_outputs['tokens']
        pos_tags = cp_outputs['pos_tags']
        ner_tags = ner_outputs['tags']
        tree = cp_outputs['hierplane_tree']['root']

        # locate user-provided candidate noun phrases and record their token indices
        all_NPs = []
        candidate_NPs = [refine_entity(np).split() for np in candidate_NPs] if candidate_NPs else []
        for np in sorted(candidate_NPs, key=len, reverse=True):
            np_start, np_end = find_sub_seq(tokens, np, 0, uncased=True, lemmatizer=self.lemmatizer)
            while np_start != -1 and np_end != -1:
                if not is_sub_seq(np_start, np_end, all_NPs):
                    all_NPs.append((np_start, np_end, True))
                np_start, np_end = find_sub_seq(tokens, np, np_end, uncased=True, lemmatizer=self.lemmatizer)

        # extract noun phrases from tree
        def _get_bottom_NPs(children):
            if 'children' not in children:
                return None
            if {'NP', 'OP', 'XP', 'QP'} & set(children['attributes']):
                is_bottom = True
                for child in children['children']:
                    if 'children' in child:
                        is_bottom = False
                if is_bottom:
                    bottom_NPs.append(children['word'].split())
                else:
                    for child in children['children']:
                        _get_bottom_NPs(child)
            else:
                for child in children['children']:
                    _get_bottom_NPs(child)
        bottom_NPs = []
        _get_bottom_NPs(tree)

        # find token indices of noun phrases extracted from the tree
        np_index = 0
        for np in bottom_NPs:
            np_start, np_end = find_sub_seq(tokens, np, np_index)
            if np_start == -1:  # skip phrases that cannot be aligned to the token sequence
                continue
            if not is_sub_seq(np_start, np_end, all_NPs):
                all_NPs.append((np_start, np_end, False))
            np_index = np_end

        # extract named entities with token index
        all_NEs = extract_named_entity(ner_tags)

        # extract verbs with token index
        all_verbs = []
        for i, pos in enumerate(pos_tags):
            if pos[0] == 'V':
                if not is_sub_seq(i, i + 1, all_NPs) and not is_sub_seq(i, i + 1, all_NEs):
                    all_verbs.append((i, i + 1, False))

        # extract modifiers with token index
        all_modifiers = []
        for i, (token, pos) in enumerate(zip(tokens, pos_tags)):
            if pos in ['JJ', 'RB']:  # adj. and adv.
                if not is_sub_seq(i, i + 1, all_NPs) and not is_sub_seq(i, i + 1, all_NEs):
                    all_modifiers.append((i, i + 1, False))
            elif token in self.special_tokens:
                if not is_sub_seq(i, i + 1, all_NPs) and not is_sub_seq(i, i + 1, all_NEs):
                    all_modifiers.append((i, i + 1, False))

        # split noun phrases around embedded named entities
        all_spans = []
        for np_start, np_end, np_is_candidate in all_NPs:
            if np_is_candidate:  # candidate noun phrases will be preserved
                all_spans.append((np_start, np_end, np_is_candidate))
            else:
                match = is_sub_seq(np_start, np_end, all_NEs)
                if match:  # if a noun phrase is a sub span of a named entity, the named entity will be preserved
                    all_spans.append(match)
                else:  # else if a named entity is a sub span of a noun phrase, the noun phrase will be split
                    index = np_start
                    for ne_start, ne_end, ne_is_candidate in all_NEs:
                        if np_start <= ne_start < ne_end <= np_end:
                            all_modifiers.append((index, ne_start, False))
                            all_spans.append((ne_start, ne_end, ne_is_candidate))
                            index = ne_end
                    all_spans.append((index, np_end, False))

        # add named entities that do not overlap any span collected so far
        for ne_start, ne_end, is_candidate in all_NEs:
            if not is_sub_seq(ne_start, ne_end, all_spans):
                all_spans.append((ne_start, ne_end, is_candidate))

        all_spans = refine_results(tokens, all_spans, self.stopwords)
        all_verbs = refine_results(tokens, all_verbs, self.stopwords)
        all_modifiers = refine_results(tokens, all_modifiers, self.stopwords)

        return {'text': tree['word'], 'NPs': all_spans, 'verbs': all_verbs, 'adjs': all_modifiers}


if __name__ == '__main__':
    import json

    print('Initializing sentence parser.')
    client = SentenceParser(device='cpu')

    print('Parsing sentence.')
    sentence = "The Africa Cup of Nations is held in odd - numbered years due to conflict with the World Cup . "
    entities = ['Africa Cup of Nations', 'Africa_Cup_of_Nations', 'Africa Cup', 'Africa_Cup']
    results = client.identify_NPs(sentence, entities)
    print(json.dumps(results, ensure_ascii=False, indent=4))

    # import random
    # from tqdm import tqdm
    # from utils import read_json_lines, save_json
    #
    # print('Parsing file.')
    # results = []
    # data = list(read_json_lines('data/train.jsonl'))
    # random.shuffle(data)
    # for entry in tqdm(data[:100]):
    #     results.append(client.identify_NPs(entry['claim']))
    # save_json(results, 'data/results.json')