from ts.torch_handler.base_handler import BaseHandler
from nltk.chunk import conlltags2tree
from nltk import pos_tag
from nltk.tree import Tree
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json
import logging
import os
import pickle
import sys

logger = logging.getLogger(__name__)

# Add the directory of this script to sys.path so that sibling modules
# shipped in the model archive can be imported.
current_directory = os.path.dirname(os.path.realpath(__file__))
print(current_directory)
sys.path.insert(0, current_directory)


def pickle_load(path, verbose=False):
    if path is None:
        return None
    if verbose:
        print('Loading {}'.format(path))
    with open(path, "rb") as f:
        obj = pickle.load(f)
    return obj


DEFAULT_MODEL = 'facebook/mgenre-wiki'


def tokenize(text):
    # Add a space before and after specified punctuation marks
    # text = re.sub(r'([,.!?])', r' \1 ', text)
    # Split the text into whitespace-separated tokens
    return text.split()


logger.info('Loading title2wikidataID')
lang_title2wikidataID_path = "lang_title2wikidataID-normalized_with_redirect.pkl"
lang_title2wikidataID = pickle_load(lang_title2wikidataID_path, verbose=True)


def text_to_id(x):
    # mGENRE predictions have the form "<title> >> <language>"; the mapping is
    # keyed by (language, title). Among all matching Wikidata IDs, keep the one
    # with the largest numeric part.
    return max(
        lang_title2wikidataID[tuple(reversed([y.strip() for y in x.split(" >> ")]))],
        key=lambda y: int(y[1:]))


def get_wikidata_qid(wikipedia_titles, scores):
    """Return the Wikidata QID of the first beam candidate that resolves,
    together with its title and score; 'NIL' if none resolves."""
    qid = 'NIL'
    wikipedia_title = wikipedia_titles[0]
    score = scores[0]
    for idx, title in enumerate(wikipedia_titles):
        try:
            qid = text_to_id(title)
            wikipedia_title = wikipedia_titles[idx]
            score = scores[idx]
            return qid, wikipedia_title, score
        except Exception:
            qid = 'NIL'
    return qid, wikipedia_title, score


def get_entities(tokens, preds_list_coarse, preds_list_fine,
                 coarse_confidences, fine_confidences):
    # Convert IOBES tags to IOB2 so conlltags2tree can parse them.
    tags_coarse = [tag.replace('S-', 'B-').replace('E-', 'I-') for tag in preds_list_coarse]
    tags_fine = [tag.replace('S-', 'B-').replace('E-', 'I-') for tag in preds_list_fine]

    pos_tags = [pos for token, pos in pos_tag(tokens)]

    conll_coarse_tags = [(token, pos, tg) for token, pos, tg in zip(tokens, pos_tags, tags_coarse)]
    conll_fine_tags = [(token, pos, tg) for token, pos, tg in zip(tokens, pos_tags, tags_fine)]

    ne_tree_coarse = conlltags2tree(conll_coarse_tags)
    ne_tree_fine = conlltags2tree(conll_fine_tags)

    coarse_entities = get_entities_from_tree(ne_tree_coarse, coarse_confidences)
    fine_entities = get_entities_from_tree(ne_tree_fine, fine_confidences)
    return coarse_entities, fine_entities


def logarithmic_scaling(confidence_score):
    # Add a small value to avoid log(0).
    return np.log(confidence_score + 1e-10)


def classify_confidence(confidence_score):
    # Return the confidence as an integer percentage; plain Python ints are
    # JSON serializable, while numpy float32 values are not.
    return int(confidence_score * 100.0)
    # if confidence_score > 0.95:
    #     return 'high'
    # elif confidence_score > 0.75:
    #     return 'medium'
    # else:
    #     return 'low'
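
# Illustrative sketch (never called by the handler): how a beam of mGENRE
# candidates flows through get_wikidata_qid. The titles, scores and the QID in
# the comment are invented for the example; real values come from generate()
# and from the lang_title2wikidataID pickle.
def _example_qid_lookup():
    candidates = ["Paris >> fr", "Paris (Texas) >> en"]  # hypothetical beam output
    scores = [62.0, 38.0]                                # hypothetical percentages
    # get_wikidata_qid tries candidates in beam order and returns the first one
    # found in the mapping, e.g. ('Q90', 'Paris >> fr', 62.0).
    return get_wikidata_qid(candidates, scores)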
def get_entities_from_tree(ne_tree, token_confidences):
    entities = []
    idx = 0
    char_position = 0  # current character position within the sentence
    for subtree in ne_tree:
        if isinstance(subtree, Tree):
            original_label = subtree.label()
            original_string = " ".join([token for token, pos in subtree.leaves()])
            entity_start_position = char_position
            entity_end_position = entity_start_position + len(original_string)

            # Average the per-token confidences over the entity span.
            confidences = token_confidences[idx:idx + len(subtree)]
            avg_confidence = sum(confidences) / len(confidences)

            print(original_string, '- confidence -', confidences,
                  '- avg -', avg_confidence, classify_confidence(avg_confidence),
                  '- label -', original_label)

            entities.append((original_string,
                             original_label,
                             (idx, idx + len(subtree)),
                             (entity_start_position, entity_end_position),
                             classify_confidence(avg_confidence)))

            idx += len(subtree)
            # Advance by the length of the entity string plus one for the space.
            char_position += len(original_string) + 1
        else:
            # Not part of a named entity ('O' tag): still advance the positions.
            token, pos = subtree
            char_position += len(token) + 1  # plus one for the space
            idx += 1
    return entities


def realign(text_sentence, tokens_coarse_result, tokens_fine_result,
            coarse_confidences, fine_confidences, tokenizer, language,
            nerc_coarse_label_map, nerc_fine_label_map):
    # Map subword-level predictions back to word level: for each word, take the
    # prediction of its first subword (see the sketch after this function).
    preds_list_coarse, preds_list_fine, words_list = [], [], []
    coarse_confidences_list, fine_confidences_list = [], []
    word_ids = tokenizer(text_sentence, is_split_into_words=True).word_ids()
    for idx, word in enumerate(text_sentence):
        try:
            beginning_index = word_ids.index(idx)
            preds_list_coarse.append(nerc_coarse_label_map[tokens_coarse_result[beginning_index]])
            preds_list_fine.append(nerc_fine_label_map[tokens_fine_result[beginning_index]])
            coarse_confidences_list.append(coarse_confidences[beginning_index])
            fine_confidences_list.append(fine_confidences[beginning_index])
        except Exception:
            # The sentence was longer than max_length: pad with 'O'.
            preds_list_coarse.append('O')
            preds_list_fine.append('O')
            coarse_confidences_list.append(1.0)
            fine_confidences_list.append(1.0)
        words_list.append(word)
    return (words_list, preds_list_coarse, preds_list_fine,
            coarse_confidences_list, fine_confidences_list)
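
# Sketch of the alignment idea behind realign() (illustrative only: the words,
# subwords and indices below are made up and depend on the actual tokenizer).
# With is_split_into_words=True, word_ids() maps every subword to the index of
# the word it came from; list.index(i) then finds the *first* subword of word
# i, whose prediction stands in for the whole word:
#
#   words    = ["Washington", "visited"]
#   subwords = [CLS], "Wash", "##ington", "visited", [SEP]
#   word_ids = [None,  0,      0,          1,         None]
#   word_ids.index(0) == 1   # first subword of "Washington"
#   word_ids.index(1) == 3   # first subword of "visited"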
class NewsAgencyHandler(BaseHandler):
    def __init__(self):
        super().__init__()
        self.model = None
        self.tokenizer = None
        self.device = None

    def initialize(self, ctx):
        properties = ctx.system_properties
        self.map_location = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(
            self.map_location + ":" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() else self.map_location)

        # model_dir is the inside of the model archive; extra-files live there.
        model_name = ctx.model_yaml_config["handler"]["model_name"]
        logger.info("Model %s loading tokenizer", model_name)

        logger.info(f'getcwd: {os.getcwd()}')
        logger.info(f'__file__: {__file__}')
        logger.info(f'Model: {model_name}')
        logger.info(f'Device: {self.device}')

        # Alternative loading path, e.g. for a TorchScript checkpoint shipped
        # inside the archive:
        # model_dir = properties.get("model_dir")
        # serialized_file = self.manifest["model"]["serializedFile"]
        # self.tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
        # self.model = torch.jit.load(serialized_file, map_location=self.device)

        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.model.to(self.device)
        self.model.eval()
        logger.info("Transformer model from path %s loaded successfully", model_name)

    def preprocess(self, requests):
        logger.info(f'Preprocessing requests {len(requests)}')
        data = requests[0]
        text_sentences = []
        language = None
        # Each request item carries the text with the mention marked up, e.g.:
        # THE next MEETLNG of the TRITSTEE, will be held at the [START] LONDON
        # HOTEL [END] in POOLE, on ldomaT, the 12th day or MARCH next. at 12
        # oClock at Noon
        for item in data['body']:
            item = json.loads(item)
            text_sentences.append(item['text'])
            language = item['language']
        return text_sentences, language

    def inference(self, inputs):
        text_sentences, language = inputs
        qids = []
        with torch.no_grad():
            for sentence in text_sentences:
                outputs = self.model.generate(
                    **self.tokenizer([sentence], return_tensors="pt").to(self.device),
                    num_beams=5,
                    num_return_sequences=5,
                    return_dict_in_generate=True,
                    output_scores=True)
                token_ids, scores = outputs['sequences'], outputs['sequences_scores']
                wikipedia_titles = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)

                # scores are beam log-likelihoods; exponentiating and normalizing
                # yields relative weights over the beams (not true probabilities),
                # expressed here as percentages.
                probabilities = torch.exp(scores)
                normalized_probabilities = probabilities / torch.sum(probabilities)
                percentages = normalized_probabilities * 100

                qid, wikipedia_title, score = get_wikidata_qid(wikipedia_titles, percentages)
                qids.append({'qid': qid,
                             'wikipedia_title': wikipedia_title,
                             'score': int(score)})
        return qids, text_sentences, language

    def postprocess(self, outputs):
        # Convert the model predictions into the response payload.
        qids, text_sentences, language = outputs
        logger.info(f'Result NEL: {qids}')
        return [[qids]]
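
# Minimal local smoke test (a sketch, not part of the TorchServe lifecycle).
# Assumptions: the 'facebook/mgenre-wiki' checkpoint is downloadable from the
# Hugging Face hub, and the lang_title2wikidataID pickle sits next to this file
# (the module-level load above already requires it). It mirrors what
# inference() does for a single sentence; the sentence itself is invented.
if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL)
    model = AutoModelForSeq2SeqLM.from_pretrained(DEFAULT_MODEL).eval()
    sentence = "THE next MEETLNG will be held at the [START] LONDON HOTEL [END] in POOLE."
    with torch.no_grad():
        outputs = model.generate(
            **tokenizer([sentence], return_tensors="pt"),
            num_beams=5,
            num_return_sequences=5,
            return_dict_in_generate=True,
            output_scores=True)
    titles = tokenizer.batch_decode(outputs['sequences'], skip_special_tokens=True)
    probabilities = torch.exp(outputs['sequences_scores'])
    percentages = probabilities / torch.sum(probabilities) * 100
    print(get_wikidata_qid(titles, percentages))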