from ts.torch_handler.base_handler import BaseHandler

from nltk.chunk import conlltags2tree
from nltk import pos_tag
from nltk.tree import Tree

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import json
import string

import logging
import os
import sys

logger = logging.getLogger(__name__)


current_directory = os.path.dirname(os.path.realpath(__file__))
print(current_directory)

# Make modules shipped alongside this handler importable.
sys.path.insert(0, current_directory)

import pickle


def pickle_load(path, verbose=False):
    """Load a pickled object from `path`; return None when no path is given."""
    if path is None:
        return None
    if verbose:
        print('Loading {}'.format(path))
    with open(path, "rb") as f:
        obj = pickle.load(f)
    return obj


DEFAULT_MODEL = 'facebook/mgenre-wiki'


def tokenize(text):
    tokens = text.split()
    return tokens


logger.info('Loading lang_title2wikidataID')
lang_title2wikidataID_path = "lang_title2wikidataID-normalized_with_redirect.pkl"
lang_title2wikidataID = pickle_load(lang_title2wikidataID_path, verbose=True)
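

# Note on the expected generator output (an assumption about the facebook/mgenre-wiki
# output format): mGENRE generates strings of the form
# "<Wikipedia title> >> <language code>", e.g. "Albert Einstein >> de".
# text_to_id reverses that pair into the (language, title) key of the pickled mapping
# and, when several QIDs are associated with the same page, keeps the highest-numbered QID.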
def text_to_id(x):
    return max(lang_title2wikidataID[tuple(
        reversed([y.strip() for y in x.split(" >> ")]))], key=lambda y: int(y[1:]))


def get_wikidata_qid(wikipedia_titles, scores):
    """Return the Wikidata QID, title and score of the first candidate title that can be resolved."""
    qid = 'NIL'
    wikipedia_title = wikipedia_titles[0]
    score = scores[0]
    for idx, title in enumerate(wikipedia_titles):
        try:
            qid = text_to_id(title)
            wikipedia_title = wikipedia_titles[idx]
            score = scores[idx]
            return qid, wikipedia_title, score
        except Exception:
            qid = 'NIL'
    return qid, wikipedia_title, score
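

# The NER models emit IOBES-style tags; mapping 'S-' -> 'B-' and 'E-' -> 'I-' below turns
# them into plain IOB2 so that nltk's conlltags2tree can rebuild the entity chunks.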
def get_entities(tokens, preds_list_coarse, preds_list_fine, coarse_confidences, fine_confidences):
    tags_coarse = [tag.replace('S-', 'B-').replace('E-', 'I-') for tag in preds_list_coarse]
    tags_fine = [tag.replace('S-', 'B-').replace('E-', 'I-') for tag in preds_list_fine]
    pos_tags = [pos for token, pos in pos_tag(tokens)]

    conll_coarse_tags = [(token, pos, tg)
                         for token, pos, tg in zip(tokens, pos_tags, tags_coarse)]
    conll_fine_tags = [(token, pos, tg)
                       for token, pos, tg in zip(tokens, pos_tags, tags_fine)]

    ne_tree_coarse = conlltags2tree(conll_coarse_tags)
    ne_tree_fine = conlltags2tree(conll_fine_tags)

    coarse_entities = get_entities_from_tree(ne_tree_coarse, coarse_confidences)
    fine_entities = get_entities_from_tree(ne_tree_fine, fine_confidences)
    return coarse_entities, fine_entities


def logarithmic_scaling(confidence_score):
    return np.log(confidence_score + 1e-10)


def classify_confidence(confidence_score):
    # Confidence scores are expressed as integer percentages.
    return int(confidence_score * 100.0)
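

# Each entity produced below is a tuple:
#   (surface_string, label, (token_start, token_end), (char_start, char_end), confidence)
# where confidence is the average token confidence of the span as an integer percentage,
# e.g. ("Reuters", "org", (4, 5), (22, 29), 97)  -- illustrative values only.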
def get_entities_from_tree(ne_tree, token_confidences):
    entities = []
    idx = 0
    char_position = 0

    for subtree in ne_tree:
        if isinstance(subtree, Tree):
            original_label = subtree.label()
            original_string = " ".join(
                [token for token, pos in subtree.leaves()])

            # Character offsets assume the tokens are joined by single spaces.
            entity_start_position = char_position
            entity_end_position = entity_start_position + len(original_string)

            confidences = token_confidences[idx:idx + len(subtree)]
            avg_confidence = sum(confidences) / len(confidences)
            print(original_string, '- confidence -', confidences, '- avg -',
                  avg_confidence, classify_confidence(avg_confidence), '- label -', original_label)

            entities.append(
                (original_string,
                 original_label,
                 (idx, idx + len(subtree)),
                 (entity_start_position, entity_end_position),
                 classify_confidence(avg_confidence)))

            idx += len(subtree)
            char_position += len(original_string) + 1
        else:
            token, pos = subtree
            char_position += len(token) + 1
            idx += 1
    return entities
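

# realign maps sub-word predictions back onto whitespace-split words: each input word takes
# the prediction of its first sub-token, and falls back to the 'O' tag with confidence 1.0
# when no aligned sub-token (or prediction) is available.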
def realign(
        text_sentence,
        tokens_coarse_result,
        tokens_fine_result,
        coarse_confidences,
        fine_confidences,
        tokenizer,
        language,
        nerc_coarse_label_map,
        nerc_fine_label_map):

    preds_list_coarse, preds_list_fine, words_list, coarse_confidences_list, fine_confidences_list = [], [], [], [], []
    word_ids = tokenizer(text_sentence, is_split_into_words=True).word_ids()

    for idx, word in enumerate(text_sentence):
        try:
            beginning_index = word_ids.index(idx)
            preds_list_coarse.append(nerc_coarse_label_map[tokens_coarse_result[beginning_index]])
            preds_list_fine.append(nerc_fine_label_map[tokens_fine_result[beginning_index]])

            coarse_confidences_list.append(coarse_confidences[beginning_index])
            fine_confidences_list.append(fine_confidences[beginning_index])
        except Exception:
            preds_list_coarse.append('O')
            preds_list_fine.append('O')

            coarse_confidences_list.append(1.0)
            fine_confidences_list.append(1.0)

        words_list.append(word)

    return words_list, preds_list_coarse, preds_list_fine, coarse_confidences_list, fine_confidences_list


class NewsAgencyHandler(BaseHandler):
    def __init__(self):
        super().__init__()
        self.model = None
        self.tokenizer = None
        self.device = None

    def initialize(self, ctx):
        properties = ctx.system_properties
        self.map_location = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(self.map_location + ":" + str(
            properties.get("gpu_id")) if torch.cuda.is_available() else self.map_location)
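
        # model_name is read from the `handler` section of the model-config YAML packaged with
        # the model archive (e.g. handler: {model_name: facebook/mgenre-wiki}); the exact value
        # is an assumption about how the archive is built.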
        model_name = ctx.model_yaml_config["handler"]["model_name"]
        logger.info("Model %s loading tokenizer", model_name)

        logger.info(f'getcwd: {os.getcwd()}')
        logger.info(f'__file__: {__file__}')
        logger.info(f'Model: {model_name}')
        logger.info(f'Device: {self.device}')

        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Move the model to the same device the inputs are sent to in inference().
        self.model.to(self.device)
        self.model.eval()
        logger.info("Transformer model from path %s loaded successfully", model_name)

    def preprocess(self, requests):
        logger.info(f'Preprocessing requests {len(requests)}')

        # Only the first request of the batch is handled.
        data = requests[0]
        text_sentences = []
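
        # Each element of data['body'] is expected to be a JSON string such as
        # {"text": "...", "language": "de"} (illustrative payload; the field names are taken
        # from the lookups below). The language of the last item is used for the whole batch.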
        for item in data['body']:
            item = json.loads(item)
            text = item['text']
            text_sentences.append(text)
            language = item['language']

        return text_sentences, language

    def inference(self, inputs):
        text_sentences, language = inputs

        tokens_coarse_results, tokens_fine_results = [], []
        tokens_coarse_confidences, tokens_fine_confidences = [], []

        qids = []
        with torch.no_grad():
            for sentence in text_sentences:
                sentences = [sentence]

                # Generate the top-5 candidate Wikipedia pages for the sentence.
                outputs = self.model.generate(
                    **self.tokenizer(sentences, return_tensors="pt").to(self.device),
                    num_beams=5,
                    num_return_sequences=5,
                    return_dict_in_generate=True,
                    output_scores=True)

                token_ids, scores = outputs['sequences'], outputs['sequences_scores']
                wikipedia_titles = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
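
                # `sequences_scores` are roughly the length-normalized log-likelihoods of the
                # returned beams (under the default length penalty); exponentiating and
                # normalizing them gives a relative confidence over the 5 candidates,
                # expressed below as a percentage.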
                log_likelihoods = scores
                probabilities = torch.exp(log_likelihoods)
                normalized_probabilities = probabilities / torch.sum(probabilities)
                percentages = normalized_probabilities * 100

                qid, wikipedia_title, score = get_wikidata_qid(wikipedia_titles, percentages)
                percentage_score = int(score)

                qids.append({'qid': qid, 'wikipedia_title': wikipedia_title, 'score': percentage_score})

        return qids, text_sentences, language

    def postprocess(self, outputs):
        qids, text_sentences, language = outputs

        logger.info(f'Result NEL: {qids}')

        # One response element per handled request (only requests[0] is processed in preprocess).
        return [[qids]]