Spaces:

ajitrajasekharan
/

self-supervised-ner-biomedical

Runtime error

App Files Files Community

ajit committed on Feb 5, 2022

Commit

5775680

1 Parent(s): 3df8af2

Initial creation

Browse files

Files changed (18) hide show

BatchInference.py +707 -0
aggregate_server_json.py +541 -0
app.py +271 -0
batched_main_NER.py +905 -0
bbc/bbc_labels.txt +0 -0
bbc/desc_bbc_config.json +6 -0
bbc/ner_bbc_config.json +8 -0
bbc/vocab.txt +0 -0
bio/a100_labels.txt +0 -0
bio/desc_a100_config.json +6 -0
bio/ner_a100_config.json +8 -0
bio/vocab.txt +0 -0
common.py +153 -0
common_descs.txt +149 -0
config_utils.py +19 -0
entity_types_consolidated.txt +18 -0
requirements.txt +3 -0
untagged_terms.txt +0 -0

BatchInference.py ADDED Viewed

	@@ -0,0 +1,707 @@

+import torch
+import subprocess
+#from pytorch_transformers import *
+from transformers import *
+import pdb
+import operator
+from collections import OrderedDict
+import numpy as np
+import argparse
+import sys
+import traceback
+import string
+import common as utils
+import config_utils as cf
+import requests
+import json
+import streamlit as st
+# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
+import logging
+logging.basicConfig(level=logging.INFO)
+# Module-level defaults. Within the visible part of this file only DESC_FILE,
+# SPECIFIC_TAG and MAX_TOKENIZED_SENT_LENGTH are referenced; the remaining values are
+# presumably consumed by driver/CLI code elsewhere -- TODO confirm.
+DEFAULT_TOP_K = 20
+DEFAULT_CONFIG = "./server_config.json"
+DEFAULT_MODEL_PATH='./'
+DEFAULT_LABELS_PATH='./labels.txt'
+DEFAULT_TO_LOWER=False
+DESC_FILE="./common_descs.txt"  # fallback descriptor file when the config has no DESC_FILE entry
+SPECIFIC_TAG=":__entity__"  # a sentence containing this marker has user-tagged terms
+MAX_TOKENIZED_SENT_LENGTH = 500 #additional buffer for CLS SEP and entity term
+try:
+    from subprocess import DEVNULL  # Python 3.
+except ImportError:
+    DEVNULL = open(os.devnull, 'wb')
+@st.cache()
+def load_bert_model(model_name,to_lower):
+  try:
+    bert_tokenizer = BertTokenizer.from_pretrained(model_name,do_lower_case=to_lower)
+    bert_model = BertForMaskedLM.from_pretrained(model_name)
+    return bert_tokenizer,bert_model
+  except Exception as e:
+    pass
+def read_descs(file_name):
+    ret_dict = {}
+    with open(file_name) as fp:
+        line = fp.readline().rstrip("\n")
+        if (len(line) >= 1):
+            ret_dict[line] = 1
+        while line:
+            line = fp.readline().rstrip("\n")
+            if (len(line) >= 1):
+                ret_dict[line] = 1
+    return ret_dict
+def read_vocab(file_name):
+    l_vocab_dict = {}
+    o_vocab_dict = {}
+    with open(file_name) as fp:
+        for line in fp:
+            line = line.rstrip('\n')
+            if (len(line) > 0):
+                l_vocab_dict[line.lower()] = line   #If there are multiple cased versions they will be collapsed into one. which is okay since we have the original saved. This is only used
+                                                    #when a word is not found in its pristine form in the original list.
+                o_vocab_dict[line] = line
+    print("Read vocab file:",len(o_vocab_dict))
+    return o_vocab_dict,l_vocab_dict
+def consolidate_labels(existing_node,new_labels,new_counts):
+    """Consolidates all the labels and counts for terms ignoring casing
+    For instance, egfr may not have an entity label associated with it
+    but eGFR and EGFR may have. So if input is egfr, then this function ensures
+    the combined entities set fo eGFR and EGFR is made so as to return that union
+    for egfr
+    """
+    new_dict = {}
+    existing_labels_arr = existing_node["label"].split('/')
+    existing_counts_arr = existing_node["counts"].split('/')
+    new_labels_arr = new_labels.split('/')
+    new_counts_arr = new_counts.split('/')
+    assert(len(existing_labels_arr) == len(existing_counts_arr))
+    assert(len(new_labels_arr) == len(new_counts_arr))
+    for i in range(len(existing_labels_arr)):
+        new_dict[existing_labels_arr[i]] = int(existing_counts_arr[i])
+    for i in range(len(new_labels_arr)):
+        if (new_labels_arr[i] in new_dict):
+            new_dict[new_labels_arr[i]] += int(new_counts_arr[i])
+        else:
+            new_dict[new_labels_arr[i]] = int(new_counts_arr[i])
+    sorted_d = OrderedDict(sorted(new_dict.items(), key=lambda kv: kv[1], reverse=True))
+    ret_labels_str = ""
+    ret_counts_str = ""
+    count = 0
+    for key in sorted_d:
+        if (count == 0):
+            ret_labels_str = key
+            ret_counts_str = str(sorted_d[key])
+        else:
+            ret_labels_str += '/' +  key
+            ret_counts_str += '/' +  str(sorted_d[key])
+        count += 1
+    return {"label":ret_labels_str,"counts":ret_counts_str}
+def read_labels(labels_file):
+    terms_dict = OrderedDict()
+    lc_terms_dict = OrderedDict()
+    with open(labels_file,encoding="utf-8") as fin:
+        count = 1
+        for term in fin:
+            term = term.strip("\n")
+            term = term.split()
+            if (len(term) == 3):
+                terms_dict[term[2]] = {"label":term[0],"counts":term[1]}
+                lc_term = term[2].lower()
+                if (lc_term in lc_terms_dict):
+                     lc_terms_dict[lc_term] = consolidate_labels(lc_terms_dict[lc_term],term[0],term[1])
+                else:
+                     lc_terms_dict[lc_term] = {"label":term[0],"counts":term[1]}
+                count += 1
+            else:
+                print("Invalid line:",term)
+                assert(0)
+    print("count of labels in " + labels_file + ":", len(terms_dict))
+    return terms_dict,lc_terms_dict
+class BatchInference:
+    def __init__(self, config_file,path,to_lower,patched,topk,abbrev,tokmod,vocab_path,labels_file,delimsep):
+        print("Model path:",path,"lower casing set to:",to_lower," is patched ", patched)
+        self.path = path
+        base_path = cf.read_config(config_file)["BASE_PATH"] if  ("BASE_PATH" in cf.read_config(config_file)) else "./"
+        desc_file_path = cf.read_config(config_file)["DESC_FILE"] if  ("DESC_FILE" in cf.read_config(config_file)) else DESC_FILE
+        self.labels_dict,self.lc_labels_dict = read_labels(labels_file)
+        #self.tokenizer = BertTokenizer.from_pretrained(path,do_lower_case=to_lower) ### Set this to to True for uncased models
+        #self.model = BertForMaskedLM.from_pretrained(path)
+        self.tokenizer, self.model = load_bert_model(path,to_lower)
+        self.model.eval()
+        #st.info("model loaded")
+        self.descs = read_descs(desc_file_path)
+        #st.info("descs loaded")
+        self.top_k = topk
+        self.patched = patched
+        self.abbrev = abbrev
+        self.tokmod  = tokmod
+        self.delimsep  = delimsep
+        self.truncated_fp = open(base_path + "truncated_sentences.txt","a")
+        self.always_log_fp = open(base_path + "CI_LOGS.txt","a")
+        if (cf.read_config(config_file)["USE_CLS"] == "1"): #Models like Bert base cased return same prediction for CLS regardless of input. So ignore CLS
+            print("************** USE CLS: Turned ON for this model. ******* ")
+            self.use_cls = True
+        else:
+            print("************** USE CLS: Turned OFF for this model. ******* ")
+            self.use_cls = False
+        if (cf.read_config(config_file)["LOG_DESCS"] == "1"):
+            self.log_descs = True
+            self.ci_fp = open(base_path + "log_ci_predictions.txt","w")
+            self.cs_fp = open(base_path + "log_cs_predictions.txt","w")
+        else:
+            self.log_descs = False
+        self.pos_server_url  = cf.read_config(config_file)["POS_SERVER_URL"]
+        #st.info("Attemting to load vocab file")
+        if (tokmod):
+            self.o_vocab_dict,self.l_vocab_dict = read_vocab(vocab_path + "/vocab.txt")
+        else:
+            self.o_vocab_dict = {}
+            self.l_vocab_dict = {}
+       # st.info("Constructor complete")
+        #pdb.set_trace()
+    def dispatch_request(self,url):
+        max_retries = 10
+        attempts = 0
+        while True:
+            try:
+                r = requests.get(url,timeout=1000)
+                if (r.status_code == 200):
+                    return r
+            except:
+                print("Request:", url, " failed. Retrying...")
+            attempts += 1
+            if (attempts >= max_retries):
+                print("Request:", url, " failed")
+                break
+    def modify_text_to_match_vocab(self,text):
+        ret_arr  = []
+        text = text.split()
+        for word in text:
+            if (word in self.o_vocab_dict):
+                ret_arr.append(word)
+            else:
+                if (word.lower() in self.l_vocab_dict):
+                    ret_arr.append(self.l_vocab_dict[word.lower()])
+                else:
+                    ret_arr.append(word)
+        return ' '.join(ret_arr)
+    #This is bad hack for prototyping - parsing from text output as opposed to json
+    def extract_POS(self,text):
+        arr = text.split('\n')
+        if (len(arr) > 0):
+            start_pos = 0
+            for i,line in enumerate(arr):
+                if (len(line) > 0):
+                    start_pos += 1
+                    continue
+                else:
+                    break
+            #print(arr[start_pos:])
+            terms_arr = []
+            for i,line in enumerate(arr[start_pos:]):
+                terms = line.split('\t')
+                if (len(terms) == 5):
+                    #print(terms)
+                    terms_arr.append(terms)
+            return terms_arr
+    def masked_word_first_letter_capitalize(self,entity):
+        arr = entity.split()
+        ret_arr = []
+        for term in arr:
+            if (len(term) > 1 and term[0].islower() and term[1].islower()):
+                ret_arr.append(term[0].upper() + term[1:])
+            else:
+                ret_arr.append(term)
+        return ' '.join(ret_arr)
+    def gen_single_phrase_sentences(self,terms_arr,span_arr):
+        sentence_template = "%s is a entity"
+        #print(span_arr)
+        sentences = []
+        singleton_spans_arr  = []
+        run_index = 0
+        entity  = ""
+        singleton_span = []
+        while (run_index < len(span_arr)):
+            if (span_arr[run_index] == 1):
+                while (run_index < len(span_arr)):
+                    if (span_arr[run_index] == 1):
+                        #print(terms_arr[run_index][WORD_POS],end=' ')
+                        if (len(entity) == 0):
+                            entity = terms_arr[run_index][utils.WORD_POS]
+                        else:
+                            entity = entity + " " + terms_arr[run_index][utils.WORD_POS]
+                        singleton_span.append(1)
+                        run_index += 1
+                    else:
+                        break
+                #print()
+                for i in sentence_template.split():
+                    if (i != "%s"):
+                        singleton_span.append(0)
+                entity = self.masked_word_first_letter_capitalize(entity)
+                if (self.tokmod):
+                    entity = self.modify_text_to_match_vocab(entity)
+                sentence = sentence_template % entity
+                sentences.append(sentence)
+                singleton_spans_arr.append(singleton_span)
+                #print(sentence)
+                #rint(singleton_span)
+                entity = ""
+                singleton_span = []
+            else:
+                run_index += 1
+        return sentences,singleton_spans_arr
+    def gen_padded_sentence(self,text,max_tokenized_sentence_length,tokenized_text_arr,orig_tokenized_length_arr,indexed_tokens_arr,attention_mask_arr,to_replace):
+        if (to_replace):
+            text_arr = text.split()
+            new_text_arr = []
+            for i in range(len(text_arr)):
+                if (text_arr[i] == "entity" ):
+                    new_text_arr.append( "[MASK]")
+                else:
+                    new_text_arr.append(text_arr[i])
+            text = ' '.join(new_text_arr)
+        text = '[CLS] ' + text + ' [SEP]'
+        tokenized_text = self.tokenizer.tokenize(text)
+        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
+        tok_length = len(indexed_tokens)
+        max_tokenized_sentence_length = max_tokenized_sentence_length if tok_length <= max_tokenized_sentence_length else tok_length
+        indexed_tokens_arr.append(indexed_tokens)
+        attention_mask_arr.append([1]*tok_length)
+        tokenized_text_arr.append(tokenized_text)
+        orig_tokenized_length_arr.append(tokenized_text)
+        return max_tokenized_sentence_length
+    def find_entity(self,word):
+        entities = self.labels_dict
+        lc_entities = self.lc_labels_dict
+        in_vocab = False
+        #words = self.filter_glue_words(words) #do not filter glue words anymore. Let them pass through
+        l_word = word.lower()
+        if l_word.isdigit():
+            ret_label = "MEASURE"
+            ret_counts = str(1)
+        elif (word in entities):
+            ret_label = entities[word]["label"]
+            ret_counts = entities[word]["counts"]
+            in_vocab = True
+        elif (l_word in entities):
+            ret_label = entities[l_word]["label"]
+            ret_counts = entities[l_word]["counts"]
+            in_vocab = True
+        elif (l_word in lc_entities):
+            ret_label = lc_entities[l_word]["label"]
+            ret_counts = lc_entities[l_word]["counts"]
+            in_vocab = True
+        else:
+            ret_label = "OTHER"
+            ret_counts = "1"
+        if (ret_label == "OTHER"):
+            ret_label = "UNTAGGED_ENTITY"
+            ret_counts = "1"
+        #print(word,ret_label,ret_counts)
+        return ret_label,ret_counts,in_vocab
+    #This is just a trivial hack for consistency of CI prediction of numbers
+    def override_ci_number_predictions(self,masked_sent):
+        words = masked_sent.split()
+        words_count = len(words)
+        if (len(words) == 4 and words[words_count-1] == "entity" and words[words_count -2] == "a" and words[words_count -3] == "is"  and words[0].isnumeric()): #only integers skipped
+            return True,"two","1","NUMBER"
+        else:
+            return False,"","",""
+    def override_ci_for_vocab_terms(self,masked_sent):
+        words = masked_sent.split()
+        words_count = len(words)
+        if (len(words) == 4 and words[words_count-1] == "entity" and words[words_count -2] == "a" and words[words_count -3] == "is"):
+            entity,entity_count,in_vocab = self.find_entity(words[0])
+            if (in_vocab):
+                return True,words[0],entity_count,entity
+        return False,"","",""
+    def normalize_sent(self,sent):
+        normalized_tokens = "!\"%();?[]`{}"
+        end_tokens = "!,.:;?"
+        sent = sent.rstrip()
+        if (len(sent) > 1):
+            if (self.delimsep):
+                for i in range(len(normalized_tokens)):
+                    sent = sent.replace(normalized_tokens[i],' ' + normalized_tokens[i] + ' ')
+                sent = sent.rstrip()
+            if (not sent.endswith(":__entity__")):
+                last_char = sent[-1]
+                if (last_char not in end_tokens): #End all sentences with a period if not already present in sentence.
+                    sent = sent + ' . '
+        print("Normalized sent",sent)
+        return sent
+    def truncate_sent_if_too_long(self,text):
+       truncated_count = 0
+       orig_sent = text
+       while (True):
+           tok_text = '[CLS] ' + text + ' [SEP]'
+           tokenized_text = self.tokenizer.tokenize(tok_text)
+           if (len(tokenized_text) < MAX_TOKENIZED_SENT_LENGTH):
+                break
+           text = ' '.join(text.split()[:-1])
+           truncated_count += 1
+       if (truncated_count > 0):
+            print("Input sentence was truncated by: ", truncated_count, " tokens")
+            self.truncated_fp.write("Input sentence was truncated by: " +  str(truncated_count) + " tokens\n")
+            self.truncated_fp.write(orig_sent + "\n")
+            self.truncated_fp.write(text + "\n\n")
+       return text
+    def get_descriptors(self,sent,pos_arr):
+        '''
+            Batched creation of descriptors given a sentence.
+                1) Find noun phrases to tag in a sentence if user did not explicitly tag.
+                2) Create 'N' CS and  CI sentences if there are N phrases to tag.  Total 2*N sentences
+                3) Create a batch padding all sentences to the maximum sentence length.
+                4) Perform inference on batch
+                5) Return json of descriptors for the ooriginal sentence as well as all CI sentences
+        '''
+        #Truncate sent if the tokenized sent is longer than max sent length
+        #st.info("in get descriptors")
+        sent = self.truncate_sent_if_too_long(sent)
+        #This is a modification of input text to words in vocab that match it in case insensitive manner.
+        #This is *STILL* required when we are using subwords too for prediction. The prediction quality is still better.
+        #An example is Mesothelioma is caused by exposure to asbestos. The quality of prediction is better when Mesothelioma is not split by lowercasing with A100 model
+        if (self.tokmod):
+            sent = self.modify_text_to_match_vocab(sent)
+        #The input sentence is normalized. Specifically all input is terminated with a punctuation if not already present. Also some of the punctuation marks are separated from text if glued to a word(disabled by default for test set sync)
+        sent = self.normalize_sent(sent)
+        #Step 1. Find entities to tag if user did not explicitly tag terms
+        #All noun phrases are tagged for prediction
+        if (SPECIFIC_TAG in sent):
+            terms_arr = utils.set_POS_based_on_entities(sent)
+        else:
+            if (pos_arr is  None):
+                assert(0)
+                url = self.pos_server_url  + sent.replace('"','\'')
+                r = self.dispatch_request(url)
+                terms_arr = self.extract_POS(r.text)
+            else:
+               # st.info("Reusing Pos arr")
+                terms_arr = pos_arr
+        print(terms_arr)
+        #Note span arr only contains phrases in the input that need to be tagged - not the span of all phrases in sentences
+        #Step 2. Create N CS sentences
+        #This returns masked sentences for all positions
+        main_sent_arr,masked_sent_arr,span_arr = utils.detect_masked_positions(terms_arr)
+        ignore_cs = True if (len(masked_sent_arr) == 1 and len(masked_sent_arr[0]) == 2 and  masked_sent_arr[0][0] == "__entity__" and masked_sent_arr[0][1] == ".") else False #This is a boundary condition to avoid using cs if the input is just trying to get entity type for a phrase. There is no sentence context in that case.
+        #Step 2. Create N CI sentences
+        singleton_sentences,not_used_singleton_spans_arr = self.gen_single_phrase_sentences(terms_arr,span_arr)
+        #We now have 2*N sentences
+        max_tokenized_sentence_length = 0
+        tokenized_text_arr = []
+        indexed_tokens_arr = []
+        attention_mask_arr = []
+        all_sentences_arr = []
+        orig_tokenized_length_arr = []
+        assert(len(masked_sent_arr) == len(singleton_sentences))
+        for ci_s,cs_s in zip(singleton_sentences,masked_sent_arr):
+            all_sentences_arr.append(ci_s)
+            max_tokenized_sentence_length = self.gen_padded_sentence(ci_s,max_tokenized_sentence_length,tokenized_text_arr,orig_tokenized_length_arr,indexed_tokens_arr,attention_mask_arr,True)
+            cs_s = ' '.join(cs_s).replace("__entity__","entity")
+            all_sentences_arr.append(cs_s)
+            max_tokenized_sentence_length = self.gen_padded_sentence(cs_s,max_tokenized_sentence_length,tokenized_text_arr,orig_tokenized_length_arr,indexed_tokens_arr,attention_mask_arr,True)
+        #pad all sentences with length less than max sentence length. This includes the full sentence too since we used indexed_tokens_arr
+        for i in range(len(indexed_tokens_arr)):
+            padding = [self.tokenizer.pad_token_id]*(max_tokenized_sentence_length - len(indexed_tokens_arr[i]))
+            att_padding = [0]*(max_tokenized_sentence_length - len(indexed_tokens_arr[i]))
+            if (len(padding) > 0):
+                indexed_tokens_arr[i].extend(padding)
+                attention_mask_arr[i].extend(att_padding)
+        assert(len(main_sent_arr) == len(span_arr))
+        assert(len(all_sentences_arr) == len(indexed_tokens_arr))
+        assert(len(all_sentences_arr) == len(attention_mask_arr))
+        assert(len(all_sentences_arr) == len(tokenized_text_arr))
+        assert(len(all_sentences_arr) == len(orig_tokenized_length_arr))
+        # Convert inputs to PyTorch tensors
+        tokens_tensor = torch.tensor(indexed_tokens_arr)
+        attention_tensors = torch.tensor(attention_mask_arr)
+        print("Input:",sent)
+        ret_obj = OrderedDict()
+        with torch.no_grad():
+            predictions = self.model(tokens_tensor, attention_mask=attention_tensors)
+            for sent_index in  range(len(predictions[0])):
+                #print("*** Current sentence ***",all_sentences_arr[sent_index])
+                if (self.log_descs):
+                    fp = self.cs_fp if sent_index %2 != 0  else self.ci_fp
+                    fp.write("\nCurrent sentence: " + all_sentences_arr[sent_index] + "\n")
+                prediction = "ci_prediction" if (sent_index %2 == 0 ) else "cs_prediction"
+                out_index = int(sent_index/2) + 1
+                if (out_index not in ret_obj):
+                    ret_obj[out_index] = {}
+                assert(prediction not in ret_obj[out_index])
+                ret_obj[out_index][prediction] = {}
+                ret_obj[out_index][prediction]["sentence"] = all_sentences_arr[sent_index]
+                curr_sent_arr = []
+                ret_obj[out_index][prediction]["descs"] = curr_sent_arr
+                for word in range(len(tokenized_text_arr[sent_index])):
+                    if (word == len(tokenized_text_arr[sent_index]) - 1): # SEP is  skipped for CI and CS
+                        continue
+                    if (sent_index %2 == 0 and (word != 0 and word != len(orig_tokenized_length_arr[sent_index]) - 2)): #For all CI sentences pick only the neighbors of CLS and the last word of the sentence (X is a entity)
+                    #if (sent_index %2 == 0 and (word != 0 and word != len(orig_tokenized_length_arr[sent_index]) - 2) and word != len(orig_tokenized_length_arr[sent_index]) - 3): #For all CI sentences - just pick CLS, "a" and "entity"
+                    #if (sent_index %2 == 0 and (word != 0 and (word == len(orig_tokenized_length_arr[sent_index]) - 4))): #For all CI sentences pick ALL terms excluding "is" in "X is a entity"
+                        continue
+                    if (sent_index %2 == 0 and (word == 0 and not self.use_cls)): #This is for models like bert base cased where we cant use CLS - it is the same for all words.
+                        continue
+                    if (sent_index %2 != 0 and tokenized_text_arr[sent_index][word] != "[MASK]"): # for all CS sentences skip all terms except the mask position
+                        continue
+                    results_dict = {}
+                    masked_index = word
+                    #pick all model predictions for current position word
+                    if (self.patched):
+                        for j in range(len(predictions[0][0][sent_index][masked_index])):
+                            tok = tokenizer.convert_ids_to_tokens([j])[0]
+                            results_dict[tok] = float(predictions[0][0][sent_index][masked_index][j].tolist())
+                    else:
+                        for j in range(len(predictions[0][sent_index][masked_index])):
+                            tok = self.tokenizer.convert_ids_to_tokens([j])[0]
+                            results_dict[tok] = float(predictions[0][sent_index][masked_index][j].tolist())
+                    k = 0
+                    #sort it - big to small
+                    sorted_d = OrderedDict(sorted(results_dict.items(), key=lambda kv: kv[1], reverse=True))
+                    #print("********* Top predictions for token: ",tokenized_text_arr[sent_index][word])
+                    if (self.log_descs):
+                        fp.write("********* Top predictions for token: " + tokenized_text_arr[sent_index][word] + "\n")
+                    if (sent_index %2 == 0): #For CI sentences, just pick half for CLS and entity position to match with CS counts
+                        if (self.use_cls): #If we are not using [CLS] for models like BBC, then take all top k from the entity prediction
+                            top_k = self.top_k/2
+                        else:
+                            top_k = self.top_k
+                    else:
+                        top_k = self.top_k
+                    #Looping through each descriptor prediction for a position and picking it up subject to some conditions
+                    for index in sorted_d:
+                        #if (index in string.punctuation or index.startswith('##') or len(index) == 1 or index.startswith('.') or index.startswith('[')):
+                        if index.lower() in self.descs: #these have almost no entity info - glue words like "the","a"
+                            continue
+                        #if (index in string.punctuation  or len(index) == 1 or index.startswith('.') or index.startswith('[') or index.startswith("#")):
+                        if (index in string.punctuation  or len(index) == 1 or index.startswith('.') or index.startswith('[')):
+                            continue
+                        if (index.startswith("#")): #subwords suggest model is trying to predict a multi word term that generally tends to be noisy. So penalize. Count and skip
+                            k += 1
+                            continue
+                        #print(index,round(float(sorted_d[index]),4))
+                        if (sent_index % 2 != 0):
+                            #CS predictions
+                            entity,entity_count,dummy = self.find_entity(index)
+                            if (self.log_descs):
+                                self.cs_fp.write(index + " " + entity +  " " +  entity_count + " " + str(round(float(sorted_d[index]),4)) + "\n")
+                            if (not ignore_cs):
+                                curr_sent_arr.append({"desc":index,"e":entity,"e_count":entity_count,"v":str(round(float(sorted_d[index]),4))})
+                            if (all_sentences_arr[sent_index].strip().rstrip(".").strip().endswith("entity")):
+                                self.always_log_fp.write(' '.join(all_sentences_arr[sent_index].split()[:-1]) + " " + index + " :__entity__\n")
+                        else:
+                            #CI predictions of the form X is a entity
+                            entity,entity_count,dummy = self.find_entity(index) #index is one of  the predicted descs for the [CLS]/[MASK] psition
+                            number_override,override_index,override_entity_count,override_entity = self.override_ci_number_predictions(all_sentences_arr[sent_index]) #Note this override just uses the sentence to override all descs
+                            if (number_override): #note the prediction for this position still takes the prediction float values model returns
+                               index = override_index
+                               entity_count = override_entity_count
+                               entity = override_entity
+                            else:
+                                if (not self.use_cls or word != 0):
+                                    override,override_index,override_entity_count,override_entity = self.override_ci_for_vocab_terms(all_sentences_arr[sent_index]) #this also uses the sentence to override, ignoring descs, except reusing the prediction score
+                                    if (override): #note the prediction for this position still takes the prediction float values model returns
+                                        index = override_index
+                                        entity_count = override_entity_count
+                                        entity = override_entity
+                                        k = top_k #just add this override once. We dont have to add this override for each descripor and inundate downstream NER with the same signature
+                            if (self.log_descs):
+                                self.ci_fp.write(index + " " + entity + " " +  entity_count + " " + str(round(float(sorted_d[index]),4)) +  "\n")
+                            curr_sent_arr.append({"desc":index,"e":entity,"e_count":entity_count,"v":str(round(float(sorted_d[index]),4))})
+                            #if (index != "two" and not index.startswith("#")  and not all_sentences_arr[sent_index].strip().startswith("is ")):
+                            if (index != "two" and not all_sentences_arr[sent_index].strip().startswith("is ")):
+                                self.always_log_fp.write(' '.join(all_sentences_arr[sent_index].split()[:-1]) + " " + index + " :__entity__\n")
+                        k += 1
+                        if (k >= top_k):
+                            break
+                    #print()
+        #print(ret_obj)
+        #print(ret_obj)
+        #st.info("Enf. of prediciton")
+        #pdb.set_trace()
+        #final_obj = {"terms_arr":main_sent_arr,"span_arr":span_arr,"descs_and_entities":ret_obj,"all_sentences":all_sentences_arr}
+        final_obj = {"input":sent,"terms_arr":main_sent_arr,"span_arr":span_arr,"descs_and_entities":ret_obj}
+        if (self.log_descs):
+            self.ci_fp.flush()
+            self.cs_fp.flush()
+        self.always_log_fp.flush()
+        self.truncated_fp.flush()
+        return final_obj
+test_arr = [
+       "ajit? is an engineer .",
+       "Sam:__entity__ Malone:__entity__ .",
+       "1. Jesper:__entity__ Ronnback:__entity__ ( Sweden:__entity__ ) 25.76 points",
+       "He felt New York has a chance:__entity__ to win this year's competition .",
+       "The new omicron variant could increase the likelihood that people will need a fourth coronavirus  vaccine dose earlier than expected, executives at Prin dummy:__entity__  said Wednesday .",
+       "The new omicron variant could increase the likelihood that people will need a fourth coronavirus  vaccine dose earlier than expected, executives at pharmaceutical:__entity__ giant:__entity__ Pfizer:__entity__  said Wednesday .",
+       "The conditions:__entity__ in the camp were very poor",
+        "Imatinib:__entity__ is used to treat nsclc",
+        "imatinib:__entity__ is used to treat nsclc",
+        "imatinib:__entity__ mesylate:__entity__ is used to treat nsclc",
+       "Staten is a :__entity__",
+       "John is a :__entity__",
+       "I met my best friend at eighteen :__entity__",
+       "I met my best friend at Parkinson's",
+       "e",
+       "Bandolier - Budgie ' , a free itunes app for ipad , iphone and ipod touch , released in December 2011 , tells the story of the making of Bandolier in the band 's own words - including an extensive audio interview with Burke Shelley",
+       "The portfolio manager of the new cryptocurrency firm underwent a bone marrow biopsy: for AML:__entity__:",
+       "Coronavirus:__entity__ disease 2019 (COVID-19) is a contagious disease caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). The first known case was identified in Wuhan, China, in December 2019.[7] The disease has since spread worldwide, leading to an ongoing pandemic.[8]Symptoms of COVID-19 are variable, but often include fever,[9] cough, headache,[10] fatigue, breathing difficulties, and loss of smell and taste.[11][12][13] Symptoms may begin one to fourteen days after exposure to the virus. At least a third of people who are infected do not develop noticeable symptoms.[14] Of those people who develop symptoms noticeable enough to be classed as patients, most (81%) develop mild to moderate symptoms (up to mild pneumonia), while 14% develop severe symptoms (dyspnea, hypoxia, or more than 50% lung involvement on imaging), and 5% suffer critical symptoms (respiratory failure, shock, or multiorgan dysfunction).[15] Older people are at a higher risk of developing severe symptoms. Some people continue to experience a range of effects (long COVID) for months after recovery, and damage to organs has been observed.[16] Multi-year studies are underway to further investigate the long-term effects of the disease.[16]COVID-19 transmits when people breathe in air contaminated by droplets and small airborne particles containing the virus. The risk of breathing these in is highest when people are in close proximity, but they can be inhaled over longer distances, particularly indoors. Transmission can also occur if splashed or sprayed with contaminated fluids in the eyes, nose or mouth, and, rarely, via contaminated surfaces. People remain contagious for up to 20 days, and can spread the virus even if they do not develop symptoms.[17][18]Several testing methods have been developed to diagnose the disease. 
The standard diagnostic method is by detection of the virus' nucleic acid by real-time reverse transcription polymerase chain reaction (rRT-PCR), transcription-mediated amplification (TMA), or by reverse transcription loop-mediated isothermal amplification (RT-LAMP) from a nasopharyngeal swab.Several COVID-19 vaccines have been approved and distributed in various countries, which have initiated mass vaccination campaigns. Other preventive measures include physical or social distancing, quarantining, ventilation of indoor spaces, covering coughs and sneezes, hand washing, and keeping unwashed hands away from the face. The use of face masks or coverings has been recommended in public settings to minimize the risk of transmissions. While work is underway to develop drugs that inhibit the virus, the primary treatment is symptomatic. Management involves the treatment of symptoms, supportive care, isolation, and experimental measures.",
+       "imatinib was used to treat Michael Jackson . ",
+       "eg  .",
+       "mesothelioma is caused by exposure to organic :__entity__",
+       "Mesothelioma is caused by exposure to asbestos:__entity__",
+       "Asbestos is a highly :__entity__",
+       "Fyodor:__entity__ Mikhailovich:__entity__ Dostoevsky:__entity__ was treated for Parkinsons:__entity__ and later died of lung carcinoma",
+       "Fyodor:__entity__ Mikhailovich:__entity__ Dostoevsky:__entity__",
+       "imatinib was used to treat Michael:__entity__ Jackson:__entity__",
+       "Ajit flew to Boston:__entity__",
+       "Ajit:__entity__ flew to Boston",
+       "A eGFR below 60:__entity__ indicates chronic kidney disease",
+       "imatinib was used to treat Michael Jackson",
+       "Ajit Valath:__entity__ Rajasekharan is an engineer at nFerence headquartered in Cambrigde MA",
+       "imatinib:__entity__",
+       "imatinib",
+       "iplimumab:__entity__",
+       "iplimumab",
+       "engineer:__entity__",
+       "engineer",
+       "Complications include peritonsillar:__entity__ abscess::__entity__",
+       "Imatinib was the first signal transduction inhibitor (STI,, used in a clinical setting. It prevents a BCR-ABL protein from exerting its role in the oncogenic pathway in chronic:__entity__ myeloid:__entity__ leukemia:__entity__ (CML,",
+       "Imatinib was the first signal transduction inhibitor (STI,, used in a clinical setting. It prevents a BCR-ABL protein from exerting its role in the oncogenic pathway in chronic myeloid leukemia (CML,",
+       "Imatinib was the first signal transduction inhibitor (STI,, used in a clinical setting. It prevents a BCR-ABL protein from exerting its role in the oncogenic pathway in chronic:__entity__ myeloid:___entity__ leukemia:__entity__ (CML,",
+       "Ajit Rajasekharan is an engineer:__entity__ at nFerence:__entity__",
+       "Imatinib was the first signal transduction inhibitor (STI,, used in a clinical setting. It prevents a BCR-ABL protein from exerting its role in the oncogenic pathway in chronic myeloid leukemia (CML,",
+       "Ajit:__entity__ Rajasekharan:__entity__ is an engineer",
+       "Imatinib:__entity__ was the first signal transduction inhibitor (STI,, used in a clinical setting. It prevents a BCR-ABL protein from exerting its role in the oncogenic pathway in chronic myeloid leukemia (CML,",
+       "Ajit Valath Rajasekharan is an engineer at nFerence headquartered in Cambrigde MA",
+       "Ajit:__entity__ Valath Rajasekharan is an engineer:__entity__ at nFerence headquartered in Cambrigde MA",
+       "Ajit:__entity__ Valath:__entity__ Rajasekharan is an engineer:__entity__ at nFerence headquartered in Cambrigde MA",
+       "Ajit:__entity__ Valath:__entity__ Rajasekharan:__entity__ is an engineer:__entity__ at nFerence headquartered in Cambrigde MA",
+       "Ajit Raj is an engineer:__entity__ at nFerence",
+       "Ajit Valath:__entity__ Rajasekharan is an engineer:__entity__ at nFerence headquartered in Cambrigde:__entity__ MA",
+       "Ajit Valath Rajasekharan is an engineer:__entity__ at nFerence headquartered in Cambrigde:__entity__ MA",
+       "Ajit Valath Rajasekharan is an engineer:__entity__ at nFerence headquartered in Cambrigde MA",
+       "Ajit Valath Rajasekharan is an engineer at nFerence headquartered in Cambrigde MA",
+       "Ajit:__entity__ Rajasekharan:__entity__ is an engineer at nFerence:__entity__",
+       "Imatinib mesylate is used to treat non small cell lung cancer",
+       "Imatinib mesylate is used to treat :__entity__",
+       "Imatinib is a term:__entity__",
+       "nsclc is a term:__entity__",
+       "Ajit Rajasekharan is a term:__entity__",
+       "ajit rajasekharan is a term:__entity__",
+       "John Doe is a term:__entity__"
+]
+def test_sentences(singleton,iter_val):
+   with open("debug.txt","w") as fp:
+       for test in iter_val:
+           test = test.rstrip('\n')
+           fp.write(test + "\n")
+           print(test)
+           out = singleton.get_descriptors(test)
+           print(out)
+           fp.write(json.dumps(out,indent=4))
+           fp.flush()
+           print()
+           pdb.set_trace()
+if __name__ == '__main__':
+   parser = argparse.ArgumentParser(description='BERT descriptor service given a sentence. The word to be masked is specified as the special token entity ',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+   parser.add_argument('-config', action="store", dest="config", default=DEFAULT_CONFIG,help='config file path')
+   parser.add_argument('-model', action="store", dest="model", default=DEFAULT_MODEL_PATH,help='BERT pretrained models, or custom model path')
+   parser.add_argument('-input', action="store", dest="input", default="",help='Optional input file with sentences. If not specified, assumed to be canned sentence run (default behavior)')
+   parser.add_argument('-topk', action="store", dest="topk", default=DEFAULT_TOP_K,type=int,help='Number of neighbors to display')
+   parser.add_argument('-tolower', dest="tolower", action='store_true',help='Convert tokens to lowercase. Set to True only for uncased models')
+   parser.add_argument('-no-tolower', dest="tolower", action='store_false',help='Convert tokens to lowercase. Set to True only for uncased models')
+   parser.set_defaults(tolower=False)
+   parser.add_argument('-patched', dest="patched", action='store_true',help='Is pytorch code patched to harvest [CLS]')
+   parser.add_argument('-no-patched', dest="patched", action='store_false',help='Is pytorch code patched to harvest [CLS]')
+   parser.add_argument('-abbrev', dest="abbrev", action='store_true',help='Just output pivots - not all neighbors')
+   parser.add_argument('-no-abbrev', dest="abbrev", action='store_false',help='Just output pivots - not all neighbors')
+   parser.add_argument('-tokmod', dest="tokmod", action='store_true',help='Modify input token casings to match vocab - meaningful only for cased models')
+   parser.add_argument('-no-tokmod', dest="tokmod", action='store_false',help='Modify input token casings to match vocab - meaningful only for cased models')
+   parser.add_argument('-vocab', action="store", dest="vocab", default=DEFAULT_MODEL_PATH,help='Path to vocab file. This is required only if tokmod is true')
+   parser.add_argument('-labels', action="store", dest="labels", default=DEFAULT_LABELS_PATH,help='Path to labels file. This returns labels also')
+   parser.add_argument('-delimsep', dest="delimsep", action='store_true',help='Modify input tokens where delimiters are stuck to tokens. Turned off by default to be in sync with test sets')
+   parser.add_argument('-no-delimsep', dest="delimsep", action='store_true',help='Modify input tokens where delimiters are stuck to tokens. Turned off by default to be in sync with test sets')
+   parser.set_defaults(tolower=False)
+   parser.set_defaults(patched=False)
+   parser.set_defaults(abbrev=True)
+   parser.set_defaults(tokmod=True)
+   parser.set_defaults(delimsep=False)
+   results = parser.parse_args()
+   try:
+       singleton = BatchInference(results.config,results.model,results.tolower,results.patched,results.topk,results.abbrev,results.tokmod,results.vocab,results.labels,results.delimsep)
+       print("To lower casing is set to:",results.tolower)
+       if (len(results.input) == 0):
+           print("Canned test mode")
+           test_sentences(singleton,test_arr)
+       else:
+           print("Batch file test mode")
+           fp = open(results.input)
+           test_sentences(singleton,fp)
+   except:
+       print("Unexpected error:", sys.exc_info()[0])
+       traceback.print_exc(file=sys.stdout)

aggregate_server_json.py ADDED Viewed

	@@ -0,0 +1,541 @@

+#!/usr/bin/python3
+import threading
+import time
+import math
+import sys
+import pdb
+import requests
+import urllib.parse
+from common import *
+import config_utils as cf
+import json
+from  collections import OrderedDict
+import argparse
+import numpy as np
+#Tag suffix; presumably appended to a term in an input sentence to mark it for tagging - confirm against callers
+MASK = ":__entity__"
+#NOTE(review): not referenced in the visible part of this file; looks like a marker prefix for result output - confirm
+RESULT_MASK = "NER_FINAL_RESULTS:"
+DEFAULT_CONFIG = "./ensemble_config.json"
+DEFAULT_TEST_BATCH_FILE="bootstrap_test_set.txt"
+NER_OUTPUT_FILE="ner_output.txt"
+DEFAULT_THRESHOLD = 1 #1 standard deviation from mean - for cross over prediction. Stored as AggregateNER.threshold
+#Per server settings (url, desc, precedence, common); filled in by AggregateNER.__init__ from the config file
+actions_arr = []
+class AggregateNER:
+    def __init__(self,config_file):
+        global actions_arr
+        base_path = cf.read_config(config_file)["BASE_PATH"] if  ("BASE_PATH" in cf.read_config(config_file)) else "./"
+        self.error_fp = open(base_path + "failed_queries_log.txt","a")
+        self.rfp = open(base_path + "query_response_log.txt","a")
+        self.query_log_fp = open(base_path + "query_logs.txt","a")
+        self.inferred_entities_log_fp = open(base_path + "inferred_entities_log.txt","a")
+        self.threshold = DEFAULT_THRESHOLD #TBD read this from confg. cf.read_config()["CROSS_OVER_THRESHOLD_SIGMA"]
+        self.servers  = cf.read_config(config_file)["NER_SERVERS"]
+        actions_arr = [
+            {"url":cf.read_config(config_file)["actions_arr"][0]["url"],"desc":cf.read_config(config_file)["actions_arr"][0]["desc"], "precedence":cf.read_config(config_file)["bio_precedence_arr"],"common":cf.read_config(config_file)["common_entities_arr"]},
+            {"url":cf.read_config(config_file)["actions_arr"][1]["url"],"desc":cf.read_config(config_file)["actions_arr"][1]["desc"],"precedence":cf.read_config(config_file)["phi_precedence_arr"],"common":cf.read_config(config_file)["common_entities_arr"]},
+            ]
+    def add_term_punct(self,sent):
+        if (len(sent) > 1):
+            end_tokens = "!,.:;?"
+            last_char = sent[-1]
+            if (last_char not in end_tokens): #End all sentences with a period if not already present in sentence.
+                sent = sent + ' . '
+                print("End punctuated sent:",sent)
+        return sent
+    def fetch_all(self,inp,model_results_arr):
+        self.query_log_fp.write(inp+"\n")
+        self.query_log_fp.flush()
+        inp = self.add_term_punct(inp)
+        results = model_results_arr
+        #print(json.dumps(results,indent=4))
+        #this updates results with ensembled results
+        results = self.ensemble_processing(inp,results)
+        return_stat = "Failed" if  len(results["ensembled_ner"]) == 0 else "Success"
+        results["stats"] = { "Ensemble server count" : str(len(model_results_arr)), "return_status": return_stat}
+        self.rfp.write( "\n" + json.dumps(results,indent=4))
+        self.rfp.flush()
+        return results
+    def get_conflict_resolved_entity(self,results,term_index,terms_count,servers_arr):
+        pos_index = str(term_index + 1)
+        s1_entity  = extract_main_entity(results,0,pos_index)
+        s2_entity  = extract_main_entity(results,1,pos_index)
+        span_count1 = get_span_info(results,0,term_index,terms_count)
+        span_count2 = get_span_info(results,1,term_index,terms_count)
+        if(span_count1 != span_count2):
+            print("Both input spans dont match. This is the effect of normalized casing that is model specific. Picking min span length")
+            span_count1 = span_count1 if span_count1 <= span_count2 else span_count2
+        if (s1_entity == s2_entity):
+            server_index = 0 if (s1_entity in servers_arr[0]["precedence"]) else 1
+            if (s1_entity != "O"):
+                print("Both servers agree on prediction for term:",results[0]["ner"][pos_index]["term"],":",s1_entity)
+            return server_index,span_count1,-1
+        else:
+            print("Servers do not agree on prediction for term:",results[0]["ner"][pos_index]["term"],":",s1_entity,s2_entity)
+            if (s2_entity == "O"):
+                print("Server 2 returned O. Picking server 1")
+                return 0,span_count1,-1
+            if (s1_entity == "O"):
+                print("Server 1 returned O. Picking server 2")
+                return 1,span_count2,-1
+            #Both the servers dont agree on their predictions. First server is BIO server. Second is PHI
+            #Examine both server predictions.
+            #Case 1: If just one of them makes a single prediction, then just pick that - it indicates one model is confident while the other isnt.
+                #Else.
+                # If the top prediction of one of them is a cross prediction, then again drop that prediction and pick the server being cross predicted.
+                # Else. Return both predictions, but with the higher confidence prediction first
+            #Case 2: Both dont cross predict. Then just return both predictions with higher confidence prediction listed first
+            #Cross prediction is checked only for  predictions a server makes ABOVE prediction  mean.
+            picked_server_index,cross_prediction_count = self.pick_single_server_if_possible(results,term_index,servers_arr)
+        return picked_server_index,span_count1,cross_prediction_count
+    def pick_single_server_if_possible(self,results,term_index,servers_arr):
+        '''
+                Resolve a disagreement between the servers for the term at term_index.
+
+                Return param : tuple (index of picked server, cross prediction marker).
+                The marker is -1 when only the picked server's prediction should be used
+                and 2 when both servers' predictions are to be returned (the index then
+                names the server with the higher top confidence).
+
+                NOTE: the updates to single_prediction_count are commented out below, so it
+                stays 0 and only the final else branch ("multiple predictions") can run.
+        '''
+        pos_index = str(term_index + 1)
+        predictions_dict = {}
+        orig_cs_predictions_dict = {}
+        single_prediction_count = 0
+        single_prediction_server_index = -1
+        #Collect, per server that has an entry for this position, its above threshold predictions
+        for server_index in range(len(results)):
+            if (pos_index in  results[server_index]["entity_distribution"]):
+                 predictions = self.get_predictions_above_threshold(results[server_index]["entity_distribution"][pos_index])
+                 predictions_dict[server_index]  = predictions  #This is used below to only return top server prediction
+                 orig_cs_predictions = self.get_predictions_above_threshold(results[server_index]["orig_cs_prediction_details"][pos_index])
+                 orig_cs_predictions_dict[server_index]  = orig_cs_predictions #this is used below for cross prediction determination since it is just a CS prediction
+                 #single_prediction_count += 1 if (len(orig_cs_predictions) == 1) else 0
+                 #if (len(orig_cs_predictions) == 1):
+                 #   single_prediction_server_index = server_index
+        #Unreachable while the counter update above stays commented out
+        if (single_prediction_count == 1):
+            is_included = is_included_in_server_entities(orig_cs_predictions_dict[single_prediction_server_index],servers_arr[single_prediction_server_index],False)
+            if(is_included == False) :
+                print("This is an odd case of single server prediction, that is a cross over")
+                ret_index =  0 if single_prediction_server_index == 1 else 1
+                return ret_index,-1
+            else:
+                print("Returning the index of single prediction server")
+                return single_prediction_server_index,-1
+        #Unreachable while the counter update above stays commented out
+        elif (single_prediction_count == 2):
+            print("Both have single predictions")
+            cross_predictions = {}
+            cross_prediction_count = 0
+            for server_index in range(len(results)):
+                if (pos_index in  results[server_index]["entity_distribution"]):
+                     is_included = is_included_in_server_entities(orig_cs_predictions_dict[server_index],servers_arr[server_index],False)
+                     cross_predictions[server_index] = not is_included
+                     cross_prediction_count += 1 if not is_included else 0
+            if (cross_prediction_count == 2):
+                #this is an odd case of both cross predicting with high confidence. Not sure if we will ever come here.
+                print("*********** BOTH servers are cross predicting! ******")
+                return self.pick_top_server_prediction(predictions_dict),2
+            elif (cross_prediction_count == 0):
+                #Neither are cross predicting
+                print("*********** BOTH servers have single predictions within their domain - returning both ******")
+                return self.pick_top_server_prediction(predictions_dict),2
+            else:
+                print("Returning just the server that is not cross predicting, dumping the cross prediction")
+                ret_index  = 1  if cross_predictions[0] == True else 0 #Given a server cross predicts, return the other server index
+                return ret_index,-1
+        else:
+            print("*** Both servers have multiple predictions above mean")
+            #both have multiple predictions above mean
+            cross_predictions = {}
+            strict_cross_predictions = {}
+            cross_prediction_count = 0
+            strict_cross_prediction_count = 0
+            #A server "cross predicts" when is_included_in_server_entities (defined outside this file section) reports
+            #its CS predictions as not included in that server's own entities
+            for server_index in range(len(results)):
+                if (pos_index in  results[server_index]["entity_distribution"]):
+                     is_included = is_included_in_server_entities(orig_cs_predictions_dict[server_index],servers_arr[server_index],False)
+                     strict_is_included = strict_is_included_in_server_entities(orig_cs_predictions_dict[server_index],servers_arr[server_index],False)
+                     cross_predictions[server_index] = not is_included
+                     strict_cross_predictions[server_index] = not strict_is_included
+                     cross_prediction_count += 1 if not is_included else 0
+                     strict_cross_prediction_count += 1 if not strict_is_included else 0
+            #The strict_* values are only consumed by the commented out alternative further down
+            if (cross_prediction_count == 2):
+                print("*********** BOTH servers are ALSO cross predicting and have multiple predictions above mean ******")
+                return self.pick_top_server_prediction(predictions_dict),2
+            elif (cross_prediction_count == 0):
+                print("*********** BOTH servers are ALSO predicting within their domain ******")
+                #if just one of them is predicting in the common set, then just pick the server that is predicting in its primary set.
+                #if (strict_cross_prediction_count == 1):
+                #    ret_index  = 1  if (0 not in strict_cross_predictions or strict_cross_predictions[0] == True) else 0 #Given a server cross predicts, return the other server index
+                #    return ret_index,-1
+                #else:
+                #    return self.pick_top_server_prediction(predictions_dict),2
+                return self.pick_top_server_prediction(predictions_dict),2
+            else:
+                print("Returning just the server that is not cross predicting, dumping the cross prediction. This is mainly to reduce the noise in prefix predictions that show up in CS context predictions")
+                #Server 1 is also chosen when server 0 had no entry for this position (0 not in cross_predictions)
+                ret_index  = 1  if (0 not in cross_predictions or cross_predictions[0] == True) else 0 #Given a server cross predicts, return the other server index
+                return ret_index,-1
+                #print("*********** One of them is also cross predicting  ******")
+                #return self.pick_top_server_prediction(predictions_dict),2
+    def pick_top_server_prediction(self,predictions_dict):
+        '''
+        '''
+        if (len(predictions_dict) != 2):
+            return 0
+        assert(len(predictions_dict) == 2)
+        return 0 if (predictions_dict[0][0]["conf"] >= predictions_dict[1][0]["conf"]) else 1
+    def  get_predictions_above_threshold(self,predictions):
+        dist = predictions["cs_distribution"]
+        sum_predictions = 0
+        ret_arr = []
+        if(len(dist) != 0):
+            mean_score = 1.0/len(dist) #input is a prob distriubution. so sum is 1
+        else:
+            mean_score = 0
+        #sum_deviation = 0
+        #for node in dist:
+        #    sum_deviation += (mean_score - node["confidence"])*(mean_score - node["confidence"])
+        #variance = sum_deviation/len(dist)
+        #std_dev = math.sqrt(variance)
+        #threshold =  mean_score + std_dev*self.threshold #default is 1 standard deviation from mean
+        threshold = mean_score
+        pick_count = 1
+        for node in dist:
+            if (node["confidence"] >= threshold):
+                ret_arr.append({"e":node["e"],"conf":node["confidence"]})
+                pick_count += 1
+            else:
+                break #this is a reverse sorted list. So no need to check anymore
+        if (len(dist) > 0):
+            assert(len(ret_arr) > 0)
+        return ret_arr
+    def check_if_entity_in_arr(self,entity,arr):
+        for node in arr:
+            if (entity == node["e"]):
+                return True
+        return False
+    def gen_resolved_entity(self,results,server_index,pivot_index,run_index,cross_prediction_count,servers_arr):
+        """Build the NER entry to emit for position run_index.
+
+        results                : list of per server response dictionaries
+        server_index           : index of the server picked for this span
+        pivot_index            : position key used for the orig_cs/orig_ci lookups and the consolidated
+                                 entity (the caller passes the key of the span's first term)
+        run_index              : position key of the term whose "ner" entry is emitted
+        cross_prediction_count : 1 or -1 means emit the picked server only; any other value means
+                                 combine both servers' tags
+        servers_arr            : per server settings; only the "precedence" lists are read here
+
+        Returns either the result of flip_category on the picked server's entry, or a shallow copy
+        of that entry whose "e" holds two tags joined by "/".
+        NOTE(review): flip_category and prefix_strip are defined outside this section; prefix_strip is
+        used as returning a (stripped entity, prefix) pair - confirm.
+        """
+        if (cross_prediction_count == 1 or cross_prediction_count == -1):
+            #This is the case where we are emitting just one server prediction. In this case, if  CS and consolidated dont match, emit both
+            if (pivot_index in results[server_index]["orig_cs_prediction_details"]):
+                if (len(results[server_index]["orig_cs_prediction_details"][pivot_index]['cs_distribution']) == 0):
+                    #just use the ci prediction in this case. This happens only for boundary cases of a single entity in a sentence and there is no context
+                    orig_cs_entity = results[server_index]["orig_ci_prediction_details"][pivot_index]['cs_distribution'][0]
+                else:
+                    orig_cs_entity = results[server_index]["orig_cs_prediction_details"][pivot_index]['cs_distribution'][0]
+                orig_ci_entity = results[server_index]["orig_ci_prediction_details"][pivot_index]['cs_distribution'][0]
+                #Compare entity names with any trailing "[...]" part cut off
+                m1 = orig_cs_entity["e"].split('[')[0]
+                m1_ci = orig_ci_entity["e"].split('[')[0]
+                is_ci_included = True if (m1_ci in servers_arr[server_index]["precedence"]) else False
+                consolidated_entity = results[server_index]["ner"][pivot_index]
+                m2,dummy = prefix_strip(consolidated_entity["e"].split('[')[0])
+                if (m1 != m2):
+                    #if we come here consolidated is not same as cs prediction. So we emit both consolidated and cs
+                    ret_obj = results[server_index]["ner"][run_index].copy()
+                    dummy,prefix = prefix_strip(ret_obj["e"])
+                    n1 = flip_category(orig_cs_entity)
+                    n1["e"] = prefix +  n1["e"]
+                    n2 = flip_category(consolidated_entity)
+                    ret_obj["e"] = n2["e"] + "/" + n1["e"]
+                    return ret_obj
+                else:
+                    #if we come here consolidated is same as cs prediction. So we try to either use ci or the second cs prediction if ci is out of domain
+                    if (m1 != m1_ci):
+                        #CS and CI are not same
+                        if (is_ci_included):
+                            #Emit both CS and CI
+                            ret_obj = results[server_index]["ner"][run_index].copy()
+                            dummy,prefix = prefix_strip(ret_obj["e"])
+                            n1 = flip_category(orig_cs_entity)
+                            n1["e"] = prefix +  n1["e"]
+                            n2 = flip_category(orig_ci_entity)
+                            n2["e"] = prefix +  n2["e"]
+                            ret_obj["e"] = n1["e"] + "/" + n2["e"]
+                            return ret_obj
+                        else:
+                            #We come here for the case where CI is not in server list. So we pick the second cs as an option if meaningful
+                            if (len(results[server_index]["orig_cs_prediction_details"][pivot_index]['cs_distribution']) >= 2):
+                                ret_arr = self.get_predictions_above_threshold(results[server_index]["orig_cs_prediction_details"][pivot_index])
+                                orig_cs_second_entity = results[server_index]["orig_cs_prediction_details"][pivot_index]['cs_distribution'][1]
+                                m2_cs = orig_cs_second_entity["e"].split('[')[0]
+                                is_cs_included = True if (m2_cs in servers_arr[server_index]["precedence"]) else False
+                                is_cs_included = True #Disabling cs included check. If prediction above threshold is cross prediction, then letting it through
+                                #Assumes the top two CS predictions never share the same entity name
+                                assert (m2_cs != m1)
+                                #Second CS prediction is emitted only when it is itself above the threshold
+                                if (is_cs_included and self.check_if_entity_in_arr(m2_cs,ret_arr)):
+                                    ret_obj = results[server_index]["ner"][run_index].copy()
+                                    dummy,prefix = prefix_strip(ret_obj["e"])
+                                    n1 = flip_category(orig_cs_second_entity)
+                                    n1["e"] = prefix +  n1["e"]
+                                    n2 = flip_category(orig_cs_entity)
+                                    n2["e"] = prefix +  n2["e"]
+                                    ret_obj["e"] = n2["e"] + "/" + n1["e"]
+                                    return ret_obj
+                                else:
+                                    return flip_category(results[server_index]["ner"][run_index])
+                            else:
+                                return flip_category(results[server_index]["ner"][run_index])
+                    else:
+                        #here cs and ci are same. So use two cs predictions if meaningful
+                        #(same steps as the "CI is not in server list" branch above)
+                        if (len(results[server_index]["orig_cs_prediction_details"][pivot_index]['cs_distribution']) >= 2):
+                            ret_arr = self.get_predictions_above_threshold(results[server_index]["orig_cs_prediction_details"][pivot_index])
+                            orig_cs_second_entity = results[server_index]["orig_cs_prediction_details"][pivot_index]['cs_distribution'][1]
+                            m2_cs = orig_cs_second_entity["e"].split('[')[0]
+                            is_cs_included = True if (m2_cs in servers_arr[server_index]["precedence"]) else False
+                            is_cs_included = True #Disabling cs included check. If prediction above threshold is cross prediction, then letting it through
+                            assert (m2_cs != m1)
+                            if (is_cs_included and self.check_if_entity_in_arr(m2_cs,ret_arr)):
+                                ret_obj = results[server_index]["ner"][run_index].copy()
+                                dummy,prefix = prefix_strip(ret_obj["e"])
+                                n1 = flip_category(orig_cs_second_entity)
+                                n1["e"] = prefix +  n1["e"]
+                                n2 = flip_category(orig_cs_entity)
+                                n2["e"] = prefix +  n2["e"]
+                                ret_obj["e"] = n2["e"] + "/" + n1["e"]
+                                return ret_obj
+                            else:
+                                return flip_category(results[server_index]["ner"][run_index])
+                        else:
+                                return flip_category(results[server_index]["ner"][run_index])
+            else:
+                #No CS details recorded for the pivot position: emit the picked server's entry as is
+                return flip_category(results[server_index]["ner"][run_index])
+        else:
+            #Case where both servers dont match
+            ret_obj = results[server_index]["ner"][run_index].copy()
+            #ret_obj["e"] = results[0]["ner"][run_index]["e"] + "/" + results[1]["ner"][run_index]["e"]
+            index2 = 1 if  server_index == 0 else 0 #index of the other server; server_index (the picked server) is listed first in the combined tag
+            n1 = flip_category(results[server_index]["ner"][run_index])
+            n2 = flip_category(results[index2]["ner"][run_index])
+            ret_obj["e"] = n1["e"] + "/" + n2["e"]
+            return ret_obj
+    def confirm_same_size_responses(self,sent,results):
+     count = 0
+     for i in range(len(results)):
+         if ("ner" in results[i]):
+             ner = results[i]["ner"]
+         else:
+             print("Server",i," returned invalid response;",results[i])
+             self.error_fp.write("Server " + str(i) + " failed for query: " + sent + "\n")
+             self.error_fp.flush()
+             return 0
+         if(count == 0):
+             assert(len(ner) > 0)
+             count = len(ner)
+         else:
+             if (count != len(ner)):
+                  print("Warning. The return sizes of both servers do not match. This must be truncated sentence, where tokenization causes different length truncations. Using min length")
+                  count  = count if count < len(ner) else len(ner)
+     return count
+    def get_ensembled_entities(self,sent,results,servers_arr):
+        ensembled_ner = OrderedDict()
+        orig_cs_predictions = OrderedDict()
+        orig_ci_predictions = OrderedDict()
+        ensembled_conf =  OrderedDict()
+        ambig_ensembled_conf =  OrderedDict()
+        ensembled_ci = OrderedDict()
+        ensembled_cs = OrderedDict()
+        ambig_ensembled_ci = OrderedDict()
+        ambig_ensembled_cs = OrderedDict()
+        print("Ensemble candidates")
+        terms_count =  self.confirm_same_size_responses(sent,results)
+        if (terms_count == 0):
+            return ensembled_ner,ensembled_conf,ensembled_ci,ensembled_cs,ambig_ensembled_conf,ambig_ensembled_ci,ambig_ensembled_cs,orig_cs_predictions,orig_ci_predictions
+        assert(len(servers_arr) == len(results))
+        term_index = 0
+        while (term_index  < terms_count):
+            pos_index = str(term_index + 1)
+            assert(len(servers_arr) == 2) #TBD. Currently assumes two servers in prototype to see if this approach works. To be extended to multiple servers
+            server_index,span_count,cross_prediction_count = self.get_conflict_resolved_entity(results,term_index,terms_count,servers_arr)
+            pivot_index = str(term_index + 1)
+            for span_index in range(span_count):
+                run_index = str(term_index + 1 + span_index)
+                ensembled_ner[run_index] = self.gen_resolved_entity(results,server_index,pivot_index,run_index,cross_prediction_count,servers_arr)
+                if (run_index in  results[server_index]["entity_distribution"]):
+                    ensembled_conf[run_index] = results[server_index]["entity_distribution"][run_index]
+                    ensembled_conf[run_index]["e"] = strip_prefixes(ensembled_ner[run_index]["e"]) #this is to make sure the same tag can be taken from NER result or this structure.
+                                                                                   #When both server responses are required, just return the details of first server for now
+                    ensembled_ci[run_index] = results[server_index]["ci_prediction_details"][run_index]
+                    ensembled_cs[run_index] = results[server_index]["cs_prediction_details"][run_index]
+                    orig_cs_predictions[run_index] = results[server_index]["orig_cs_prediction_details"][run_index]
+                    orig_ci_predictions[run_index] = results[server_index]["orig_ci_prediction_details"][run_index]
+                    if (cross_prediction_count == 0 or cross_prediction_count == 2): #This is an ambiguous prediction. Send both server responses
+                        second_server = 1 if server_index == 0 else 1
+                        if (run_index in  results[second_server]["entity_distribution"]): #It may not be present if the B/I tags are out of sync from servers.
+                            ambig_ensembled_conf[run_index] = results[second_server]["entity_distribution"][run_index]
+                            ambig_ensembled_conf[run_index]["e"] = ensembled_ner[run_index]["e"] #this is to make sure the same tag can be taken from NER result or this structure.
+                            ambig_ensembled_ci[run_index] = results[second_server]["ci_prediction_details"][run_index]
+                if (ensembled_ner[run_index]["e"] != "O"):
+                    self.inferred_entities_log_fp.write(results[0]["ner"][run_index]["term"] + " " + ensembled_ner[run_index]["e"]  + "\n")
+            term_index += span_count
+        self.inferred_entities_log_fp.flush()
+        return ensembled_ner,ensembled_conf,ensembled_ci,ensembled_cs,ambig_ensembled_conf,ambig_ensembled_ci,ambig_ensembled_cs,orig_cs_predictions,orig_ci_predictions
+    def ensemble_processing(self,sent,results):
+        global actions_arr
+        ensembled_ner,ensembled_conf,ci_details,cs_details,ambig_ensembled_conf,ambig_ci_details,ambig_cs_details,orig_cs_predictions,orig_ci_predictions = self.get_ensembled_entities(sent,results,actions_arr)
+        final_ner = OrderedDict()
+        final_ner["ensembled_ner"] = ensembled_ner
+        final_ner["ensembled_prediction_details"] = ensembled_conf
+        final_ner["ci_prediction_details"] = ci_details
+        final_ner["cs_prediction_details"] = cs_details
+        final_ner["ambig_prediction_details_conf"] = ambig_ensembled_conf
+        final_ner["ambig_prediction_details_ci"] = ambig_ci_details
+        final_ner["ambig_prediction_details_cs"] = ambig_cs_details
+        final_ner["orig_cs_prediction_details"] = orig_cs_predictions
+        final_ner["orig_ci_prediction_details"] = orig_ci_predictions
+        #final_ner["individual"] = results
+        return final_ner
class myThread (threading.Thread):
    """Worker thread that fetches one server's JSON response for an input.

    After run() finishes, `results` holds the decoded response (an empty
    OrderedDict when the request failed or the body was not valid JSON),
    with the server description stored under the "server" key.
    """

    def __init__(self, url,param,desc):
        threading.Thread.__init__(self)
        self.url = url
        self.param = param
        self.desc = desc
        self.results = {}

    def run(self):
        """Issue the GET request and store the parsed response in `results`."""
        print ("Starting " + self.url + self.param)
        #TBD. This is a nasty hack for client side handling of #. To be fixed. For some reason, even replacing
        #with parse.quote or just with %23 does not help. The fragment after # is not sent to server.
        #Works just fine in wget with %23
        escaped_url = self.url + self.param.replace("#","-")
        print("ESCAPED:",escaped_url)
        try:
            out = requests.get(escaped_url)
            self.results = json.loads(out.text,object_pairs_hook=OrderedDict)
        except requests.exceptions.RequestException as err:
            # A network failure used to escape run() and leave `results` without
            # the "server" key; treat it like an empty response instead.
            print("Request to server failed for input:",self.param,err)
            self.results = OrderedDict()
        except ValueError:
            # json.JSONDecodeError is a ValueError: empty or non-JSON body.
            print("Empty response from server for input:",self.param)
            self.results = OrderedDict()
        self.results["server"] = self.desc
        print ("Exiting " + self.url + self.param)
# Create new threads
def create_workers(inp_dict,inp):
    """Build (without starting) one myThread per entry of `inp_dict`.

    Entries are looked up by position 0..len(inp_dict)-1; each one supplies
    the "url" and "desc" for its thread, and every thread gets the same `inp`.
    """
    return [
        myThread(inp_dict[idx]["url"],inp,inp_dict[idx]["desc"])
        for idx in range(len(inp_dict))
    ]
def start_workers(threads_arr):
    """Call start() on each worker in `threads_arr`, in order."""
    for worker in threads_arr:
        worker.start()
def wait_for_completion(threads_arr):
    """Block until every worker in `threads_arr` has been joined, in order."""
    for worker in threads_arr:
        worker.join()
def get_results(threads_arr):
    """Return the `results` attribute of each worker, in the workers' order."""
    return [worker.results for worker in threads_arr]
def prefix_strip(term):
    """Split a leading "B_" or "I_" marker off `term`.

    Returns (term_without_marker, marker); the marker is "" when `term`
    starts with neither.
    """
    if term.startswith(("B_","I_")):
        return term[2:],term[:2]
    return term,""
def strip_prefixes(term):
    """Remove the "B_"/"I_" marker from a tag or from both halves of "tag1/tag2".

    Only one or two '/'-separated parts are accepted; anything else trips
    the assertion.
    """
    parts = term.split('/')
    if (len(parts) != 2):
        assert(len(parts)  == 1)
        bare,_ = prefix_strip(parts[0])
        return bare
    left,_ = prefix_strip(parts[0])
    right,_ = prefix_strip(parts[1])
    return left + '/' + right
#This hack is simply done so that the downstream API used for UI display shows the entity instead of the class. Details has all additional info
def flip_category(obj):
    """Return a shallow copy of `obj` with the two parts of its "e" tag swapped.

    "X[Y]" becomes "Y[X]"; a leading "B_"/"I_" marker stays in front, so
    "B_X[Y]" becomes "B_Y[X]". Tags without "[" are copied unchanged.
    """
    flipped = obj.copy()
    pieces = obj["e"].split("[")
    if (len(pieces) < 2):
        return flipped
    head = pieces[0]
    inner = pieces[1].rstrip("]")
    if head.startswith(("B_","I_")):
        flipped["e"] = head[:2] + inner + "[" + head[2:] + "]"
    else:
        flipped["e"] = inner + "[" + head + "]"
    return flipped
def extract_main_entity(results,server_index,pos_index):
    """Return the tag at `pos_index` for one server, without "[...]" suffix or B_/I_ marker."""
    tagged = results[server_index]["ner"][pos_index]["e"]
    main_entity,_ = prefix_strip(tagged.split('[')[0])
    return main_entity
def get_span_info(results,server_index,term_index,terms_count):
    """Count how many consecutive terms the entity starting at `term_index` covers.

    `term_index` is 0-based; the server's "ner" mapping is keyed by the
    1-based position as a string. A term tagged "B_..." extends over every
    following term until one tagged exactly "O" (or the end of the sentence).
    Any other starting tag, including an out-of-sync "I_...", counts as 1.
    """
    server_ner = results[server_index]["ner"]
    entity = server_ner[str(term_index + 1)]["e"]
    if (entity.startswith("I_")):
        print("Skipping an I tag for server:",server_index,". This has to be done because of mismatched span because of model specific casing normalization that changes POS tagging. This happens only for sentencees user does not explicirly tag with ':__entity__'")
        return 1
    span_count = 1
    if (not entity.startswith("B_")):
        return span_count
    for next_index in range(term_index + 1,terms_count):
        if (server_ner[str(next_index + 1)]["e"] == "O"):
            break
        span_count += 1
    return span_count
def is_included_in_server_entities(predictions,s_arr,check_first_only):
    """Tell whether the predicted classes belong to a server's entity sets.

    The class of a prediction is its "e" value up to the first "[". It is
    accepted when found in s_arr["precedence"] or s_arr["common"] (presence
    in common is not treated as a cross over). With `check_first_only` only
    the top prediction is examined; an empty list yields True.
    """
    for prediction in predictions:
        label = prediction['e'].split('[')[0]
        known = (label in s_arr["precedence"]) or (label in s_arr["common"])
        if (not known):
            return False
        if (check_first_only):
            break #Just check the top prediction for inclusion in the new semantics
    return True
def strict_is_included_in_server_entities(predictions,s_arr,check_first_only):
    """Stricter inclusion test: only s_arr["precedence"] counts.

    The class of a prediction is its "e" value up to the first "[". With
    `check_first_only` only the top prediction is examined; an empty list
    yields True.
    """
    for prediction in predictions:
        label = prediction['e'].split('[')[0]
        if (label not in s_arr["precedence"]):
            return False
        if (check_first_only):
            break #Just check the top prediction for inclusion in the new semantics
    return True
if __name__ == '__main__':
    # Command-line entry point: declare the options and keep the config path.
    parser = argparse.ArgumentParser(
        description='main NER for a single model ',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        '-input', action="store", dest="input",
        default=DEFAULT_TEST_BATCH_FILE,
        help='Input file for batch run option',
    )
    parser.add_argument(
        '-config', action="store", dest="config",
        default=DEFAULT_CONFIG,
        help='config file path',
    )
    parser.add_argument(
        '-output', action="store", dest="output",
        default=NER_OUTPUT_FILE,
        help='Output file for batch run option',
    )
    parser.add_argument(
        '-option', action="store", dest="option",
        default="canned",
        help='Valid options are canned,batch,interactive. canned - test few canned sentences used in medium artice. batch - tag sentences in input file. Entities to be tagged are determing used POS tagging to find noun phrases.interactive - input one sentence at a time',
    )
    results = parser.parse_args()
    config_file = results.config

app.py ADDED Viewed

	@@ -0,0 +1,271 @@

+import time
+import streamlit as st
+import torch
+import string
+from annotated_text import annotated_text
+from flair.data import Sentence
+from flair.models import SequenceTagger
+from transformers import BertTokenizer, BertForMaskedLM
+import BatchInference as bd
+import batched_main_NER as ner
+import aggregate_server_json as aggr
+import json
+DEFAULT_TOP_K = 20
+SPECIFIC_TAG=":__entity__"
@st.cache(suppress_st_warning=True, allow_output_mutation=True)
def POS_get_model(model_name):
    """Load and return the flair SequenceTagger named `model_name` (wrapped in st.cache)."""
    return SequenceTagger.load(model_name)
def getPos(s: Sentence):
    """Collect token texts and their first label value per annotation layer.

    Returns two parallel lists (texts, labels) with one entry for each
    (token, annotation layer) pair, in token order.
    """
    pairs = [
        (token.text, token.get_labels(layer)[0].value)
        for token in s.tokens
        for layer in token.annotation_layers.keys()
    ]
    texts = [text for text, _ in pairs]
    labels = [value for _, value in pairs]
    return texts, labels
def getDictFromPOS(texts, labels):
    """Turn parallel text/label lists into 5-field rows.

    Each row is ["dummy", text, label, "dummy", "dummy"]; pairing stops at
    the shorter of the two inputs.
    """
    rows = []
    for word, tag in zip(texts, labels):
        rows.append(["dummy", word, tag, "dummy", "dummy"])
    return rows
def decode(tokenizer, pred_idx, top_clean):
    """Decode predicted token ids into a newline-separated list of words.

    Each id in `pred_idx` is decoded with `tokenizer.decode`, whitespace is
    removed, punctuation and the '[PAD]' token are skipped, and '##'
    word-piece markers are dropped. At most `top_clean` words are returned,
    joined with newlines.

    The original filter was a substring test against
    string.punctuation + '[PAD]', which also discarded genuine tokens such
    as "A", "P", "D" or "PAD". '[PAD]' is now matched exactly. The
    punctuation test is still a substring test, so empty tokens and runs of
    adjacent punctuation characters are skipped as before.
    """
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token == '[PAD]' or token in string.punctuation:
            continue
        tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])
def encode(tokenizer, text_sentence, add_special_tokens=True):
    """Encode a sentence containing '<mask>' and locate the mask position.

    '<mask>' is replaced by `tokenizer.mask_token`. Returns
    (input_ids, mask_idx): a tensor holding one row of token ids, and the
    index in that row of the first id equal to `tokenizer.mask_token_id`.

    Raises IndexError when the encoded sentence contains no mask token.

    The encoding used to happen only inside the "mask is last" branch, so
    any sentence with the mask elsewhere failed with UnboundLocalError.
    """
    text_sentence = text_sentence.replace('<mask>', tokenizer.mask_token)
    words = text_sentence.split()
    # if <mask> is the last token, append a "." so that models dont predict punctuation.
    if words and tokenizer.mask_token == words[-1]:
        text_sentence += ' .'
    input_ids = torch.tensor([tokenizer.encode(text_sentence, add_special_tokens=add_special_tokens)])
    mask_idx = torch.where(input_ids == tokenizer.mask_token_id)[1].tolist()[0]
    return input_ids, mask_idx
def get_all_predictions(text_sentence, top_clean=5, top_k=None, tokenizer=None, model=None):
    """Predict fill-ins for the masked position of `text_sentence`.

    The sentence is encoded with `encode`, run through the model without
    gradients, and the `top_k` highest-scoring ids at the mask position are
    decoded with `decode`, which keeps at most `top_clean` words.

    `top_k` was previously an undefined name; it is now a parameter that
    defaults to DEFAULT_TOP_K. `tokenizer` and `model` may be passed in.
    When omitted, the module-level `bert_tokenizer` / `bert_model` are used,
    as before.
    NOTE(review): neither global is defined in the visible part of this
    file, so callers relying on the fallback still get NameError - confirm.

    Returns {'bert': <newline-separated predictions>}.
    """
    if top_k is None:
        top_k = DEFAULT_TOP_K
    if tokenizer is None:
        tokenizer = bert_tokenizer
    if model is None:
        model = bert_model
    # ========================= BERT =================================
    input_ids, mask_idx = encode(tokenizer, text_sentence)
    with torch.no_grad():
        predict = model(input_ids)[0]
    bert = decode(tokenizer, predict[0, mask_idx, :].topk(top_k).indices.tolist(), top_clean)
    return {'bert': bert}
def get_bert_prediction(input_text,top_k):
    """Append a '<mask>' to `input_text` and return the model's predictions.

    Returns whatever get_all_predictions returns, or None when anything
    fails. This stays best-effort, but the failure is now printed instead
    of being silently discarded.
    """
    try:
        return get_all_predictions(input_text + ' <mask>', top_clean=int(top_k))
    except Exception as error:
        print("BERT prediction failed:", error)
        return None
def load_pos_model():
    """Return the "flair/pos-english" tagger via POS_get_model."""
    return POS_get_model("flair/pos-english")
def init_session_states():
    """Seed st.session_state with defaults for any key not already present."""
    defaults = (
        ('top_k', 20),
        ('pos_model', None),
        ('bio_model', None),
        ('phi_model', None),
        ('ner_bio', None),
        ('ner_phi', None),
        ('aggr', None),
    )
    for key, initial in defaults:
        if key not in st.session_state:
            st.session_state[key] = initial
def get_pos_arr(input_text,display_area):
    """POS-tag `input_text` and return the tags as 5-field rows.

    The POS model is loaded on first use (with a status line written to
    `display_area`) and kept in st.session_state['pos_model'].
    """
    state = st.session_state
    if (state['pos_model'] is None):
        display_area.text("Loading model 3 of 3.Loading POS model...")
        state['pos_model'] = load_pos_model()
    sentence = Sentence(input_text)
    state['pos_model'].predict(sentence)
    texts, labels = getPos(sentence)
    return getDictFromPOS(texts, labels)
def perform_inference(text,display_area):
    """Run both models on `text` and return the aggregated NER result.

    The two BatchInference models, the two UnsupNER taggers and the
    AggregateNER object are created on first use and cached in
    st.session_state; a status line is written to `display_area` before
    each step. POS tags are computed only when `text` does not contain
    SPECIFIC_TAG.
    """
    state = st.session_state

    def ensure_loaded(key,status,factory):
        # Build state[key] on first use, announcing it beforehand.
        if (state[key] is None):
            display_area.text(status)
            state[key] = factory()

    ensure_loaded(
        'bio_model',"Loading model 1 of 3. Bio model...",
        lambda: bd.BatchInference("bio/desc_a100_config.json",'ajitrajasekharan/biomedical',False,False,DEFAULT_TOP_K,True,True,"bio/","bio/a100_labels.txt",False))
    ensure_loaded(
        'phi_model',"Loading model 2 of 3. PHI model...",
        lambda: bd.BatchInference("bbc/desc_bbc_config.json",'bert-base-cased',False,False,DEFAULT_TOP_K,True,True,"bbc/","bbc/bbc_labels.txt",False))
    #Load POS model if needed and gets POS tags
    pos_arr = None if (SPECIFIC_TAG in text) else get_pos_arr(text,display_area)
    ensure_loaded('ner_bio',"Initializing BIO module...",
                  lambda: ner.UnsupNER("bio/ner_a100_config.json"))
    ensure_loaded('ner_phi',"Initializing PHI module...",
                  lambda: ner.UnsupNER("bbc/ner_bbc_config.json"))
    ensure_loaded('aggr',"Initializing Aggregation modeule...",
                  lambda: aggr.AggregateNER("./ensemble_config.json"))
    display_area.text("Getting results from BIO model...")
    bio_descs = state['bio_model'].get_descriptors(text,pos_arr)
    display_area.text("Getting results from PHI model...")
    phi_descs = state['phi_model'].get_descriptors(text,pos_arr)
    display_area.text("Aggregating BIO & PHI results...")
    bio_ner = state['ner_bio'].tag_sentence_service(text,bio_descs)
    phi_ner = state['ner_phi'].tag_sentence_service(text,phi_descs)
    return state['aggr'].fetch_all(text,[json.loads(bio_ner),json.loads(phi_ner)])
+sent_arr = [
+"Lou Gehrig who works for XCorp and lives in New York suffers from Parkinson's ",
+"Parkinson who works for XCorp and lives in New York suffers from Lou Gehrig's",
+"lou gehrig was diagnosed with Parkinson's ",
+"A eGFR below 60 indicates chronic kidney disease",
+"Overexpression of EGFR occurs across a wide range of different cancers",
+"Stanford called",
+"He was diagnosed with non small cell lung cancer",
+"I met my girl friends at the pub ",
+"I met my New York friends at the pub",
+"I met my XCorp friends at the pub",
+"I met my two friends at the pub",
+"Bio-Techne's genomic tools include advanced tissue-based in-situ hybridization assays sold under the ACD brand as well as a portfolio of     assays for prostate cancer diagnosis ",
+"There are no treatment options specifically indicated for ACD and physicians must utilize agents approved for other dermatology conditions", "As ACD has been implicated in apoptosis-resistant glioblastoma (GBM), there is a high medical need for identifying novel ACD-inducing drugs  ",
+"Located in the heart of Dublin , in the family home of acclaimed writer Oscar Wilde , ACD provides the perfect backdrop to inspire Irish     (and Irish-at-heart) students to excel in business and the arts",
+"Patients treated with anticancer chemotherapy drugs ( ACD ) are vulnerable to infectious diseases due to immunosuppression and to the direct impact of ACD on their intestinal microbiota ",
+"In the LASOR trial , increasing daily imatinib dose from 400 to 600mg induced MMR at 12 and 24 months in 25% and 36% of the patients,        respectively, who had suboptimal cytogenetic responses ",
+"The sky turned dark in advance of the storm that was coming from the east ",
+"She loves to watch Sunday afternoon football with her family ",
+"Paul Erdos died at 83 "
+]
+sent_arr_masked = [
+"Lou Gehrig:__entity__ who works for XCorp:__entity__ and lives in New:__entity__ York:__entity__ suffers from Parkinson's:__entity__ ",
+"Parkinson:__entity__ who works for XCorp:__entity__ and lives in New:__entity__ York:__entity__ suffers from Lou Gehrig's:__entity__",
+"lou:__entity__ gehrig:__entity__ was diagnosed with Parkinson's:__entity__ ",
+"A eGFR:__entity__ below 60 indicates chronic kidney disease",
+"Overexpression of EGFR:__entity__ occurs across a wide range of different cancers",
+"Stanford:__entity__ called",
+"He was diagnosed with non:__entity__ small:__entity__ cell:__entity__ lung:__entity__ cancer:__entity__",
+"I met my girl:__entity__ friends at the pub ",
+"I met my New:__entity__ York:__entity__ friends at the pub",
+"I met my XCorp:__entity__ friends at the pub",
+"I met my two:__entity__ friends at the pub",
+"Bio-Techne's genomic tools include advanced tissue-based in-situ hybridization assays sold under the ACD:__entity__ brand as well as a portfolio of     assays for prostate cancer diagnosis ",
+"There are no treatment options specifically indicated for ACD:__entity__ and physicians must utilize agents approved for other dermatology conditions",
+"As ACD:__entity__ has been implicated in apoptosis-resistant glioblastoma (GBM), there is a high medical need for identifying novel ACD-inducing drugs  ",
+"Located in the heart of Dublin , in the family home of acclaimed writer Oscar Wilde , ACD:__entity__ provides the perfect backdrop to inspire Irish     (and Irish-at-heart) students to excel in business and the arts",
+"Patients treated with anticancer chemotherapy drugs ( ACD:__entity__ ) are vulnerable to infectious diseases due to immunosuppression and to the direct impact of ACD on their intestinal microbiota ",
+"In the LASOR:__entity__ trial:__entity__ , increasing daily imatinib dose from 400 to 600mg induced MMR at 12 and 24 months in 25% and 36% of the patients,        respectively, who had suboptimal cytogenetic responses ",
+"The sky turned dark:__entity__ in advance of the storm that was coming from the east ",
+"She loves to watch Sunday afternoon football:__entity__ with her family ",
+"Paul:__entity__ Erdos:__entity__ died at 83:__entity__ "
+]
def init_selectbox():
    """Show the sentence pull-down (widget key 'my_choice') and return the chosen entry of sent_arr."""
    prompt = 'Choose any of the sentences in pull-down below'
    return st.selectbox(prompt, sent_arr, key='my_choice')
def on_text_change():
    """Callback for the 'my_text' widget: run inference on its current value.

    perform_inference requires a display area for its status messages; the
    original call omitted it and would have raised TypeError. A fresh
    st.empty() placeholder is passed instead. The result is discarded, as
    before.
    """
    text = st.session_state.my_text
    print("in callback: " + text)
    perform_inference(text,st.empty())
def main():
    """Render the Streamlit page and run NER on the submitted sentence.

    On submit, the typed text is used. When the text area is empty, the
    masked counterpart of the selected canned sentence is used instead
    (sent_arr and sent_arr_masked are index-aligned). The timing and the
    JSON result are written into the display placeholder. Any exception is
    printed and shown on the page with st.exception.

    The "Models used" markdown used to end with a stray '#' inside the
    string literal, which was rendered on the page; it has been removed.
    """
    try:
        init_session_states()
        st.markdown("<h3 style='text-align: center;'>NER using pretrained models with <a href='https://ajitrajasekharan.github.io/2021/01/02/my-first-post.html'>no fine tuning</a></h3>", unsafe_allow_html=True)
        #st.markdown("""
        #<h3 style="font-size:16px; color: #ff0000; text-align: center"><b>App under construction... (not in working condition yet)</b></h3>
        #""", unsafe_allow_html=True)
        st.markdown("""
    <p style="text-align:center;"><img src="https://ajitrajasekharan.github.io/images/1.png" width="700"></p>
    <br/>
    <br/>
  """, unsafe_allow_html=True)
        st.write("This app uses 3 models.  Two Pretrained Bert models (**no fine tuning**) and a POS tagger")
        with st.form('my_form'):
            selected_sentence = init_selectbox()
            text_input = st.text_area(label='Type any sentence below',value="")
            submit_button = st.form_submit_button('Submit')
            input_status_area = st.empty()
            display_area = st.empty()
            if submit_button:
                start = time.time()
                if (len(text_input) == 0):
                    # Nothing typed: fall back to the tagged form of the chosen canned sentence.
                    text_input = sent_arr_masked[sent_arr.index(selected_sentence)]
                input_status_area.text("Input sentence:  " + text_input)
                results = perform_inference(text_input,display_area)
                display_area.empty()
                with display_area.container():
                    st.text(f"prediction took {time.time() - start:.2f}s")
                    st.json(results)
        #input_text = st.text_area(
        #  label="Type any sentence",
        #   on_change=on_text_change,key='my_text'
        # )
        st.markdown("""
    <small style="font-size:16px; color: #7f7f7f; text-align: left"><br/><br/>Models used: <br/>(1) <a href='https://huggingface.co/ajitrajasekharan/biomedical' target='_blank'>Biomedical model</a> pretrained on Pubmed,Clinical trials and BookCorpus subset.<br/>(2) Bert-base-cased (for PHI entities - Person/location/organization etc.)<br/>(3) Flair POS tagger</small>
  """, unsafe_allow_html=True)
        st.markdown("""
    <h3 style="font-size:16px; color: #9f9f9f; text-align: center"><b> <a href='https://huggingface.co/spaces/ajitrajasekharan/Qualitative-pretrained-model-evaluation'   target='_blank'>App link to examine pretrained models</a> used to perform NER without fine tuning</b></h3>
  """, unsafe_allow_html=True)
        st.markdown("""
    <h3 style="font-size:16px; color: #9f9f9f; text-align: center">Github <a href='http://github.com/ajitrajasekharan/unsupervised_NER' target='_blank'>link to same working code </a>(without UI) as separate microservices</h3>
  """, unsafe_allow_html=True)
    except Exception as e:
        print("Some error occurred in main")
        st.exception(e)
+if __name__ == "__main__":
+   main()

batched_main_NER.py ADDED Viewed

	@@ -0,0 +1,905 @@

+import pdb
+import config_utils as cf
+import requests
+import sys
+import urllib.parse
+import numpy as np
+from collections import OrderedDict
+import argparse
+from common import *
+import json
+#WORD_POS = 1
+#TAG_POS = 2
+#MASK_TAG = "__entity__"
+DEFAULT_CONFIG = "./config.json"
+DISPATCH_MASK_TAG = "entity"
+DESC_HEAD = "PIVOT_DESCRIPTORS:"
+#TYPE2_AMB = "AMB2-"
+TYPE2_AMB = ""
+DUMMY_DESCS=10
+DEFAULT_ENTITY_MAP = "entity_types_consolidated.txt"
+#RESET_POS_TAG='RESET'
+SPECIFIC_TAG=":__entity__"
def softmax(x):
    """Compute softmax values for each sets of scores in x.

    Normalisation is along axis 0, as before. The per-column maximum is
    subtracted before exponentiating. This leaves the result mathematically
    unchanged but keeps np.exp from overflowing (large scores used to
    produce NaN), and the exponentials are now computed once.
    """
    x = np.asarray(x)
    if x.size == 0:
        # np.max rejects empty input; an empty result is what exp/sum gave before.
        return np.exp(x)
    exps = np.exp(x - np.max(x, axis=0))
    return exps / np.sum(exps, axis=0)
+#noun_tags = ['NFP','JJ','NN','FW','NNS','NNPS','JJS','JJR','NNP','POS','CD']
+#cap_tags = ['NFP','JJ','NN','FW','NNS','NNPS','JJS','JJR','NNP','PRP']
def read_common_descs(file_name):
    """Read `file_name` and return a dict mapping each stripped line to 1."""
    with open(file_name) as fp:
        common_descs = {line.strip(): 1 for line in fp}
    print("Common descs for filtering read:",len(common_descs))
    return common_descs
def read_entity_map(file_name):
    """Read the entity consolidation file into a {name: canonical name} dict.

    A line with one field maps that entity to itself. A line with two
    fields maps the first field to itself (unless already mapped) and every
    '/'-separated name in the second field to the first. Any other field
    count, or a name that is already mapped where a new one is expected,
    trips an assertion.
    """
    emap = {}
    with open(file_name) as fp:
        for raw_line in fp:
            fields = raw_line.rstrip('\n').split()
            if (len(fields) != 1):
                assert(len(fields) == 2)
                canonical,aliases = fields
                emap.setdefault(canonical,canonical)
                for alias in aliases.split('/'):
                    assert(alias not in emap)
                    emap[alias] = canonical
                continue
            canonical = fields[0]
            assert(canonical not in emap)
            emap[canonical] = canonical
    print("Entity map:",len(emap))
    return emap
+class UnsupNER:
+    def __init__(self,config_file):
+        print("NER service handler started")
+        base_path = cf.read_config(config_file)["BASE_PATH"] if  ("BASE_PATH" in cf.read_config(config_file)) else "./"
+        self.pos_server_url  = cf.read_config(config_file)["POS_SERVER_URL"]
+        self.desc_server_url  = cf.read_config(config_file)["DESC_SERVER_URL"]
+        self.entity_server_url  = cf.read_config(config_file)["ENTITY_SERVER_URL"]
+        self.common_descs = read_common_descs(cf.read_config(config_file)["COMMON_DESCS_FILE"])
+        self.entity_map = read_entity_map(cf.read_config(config_file)["EMAP_FILE"])
+        self.rfp = open(base_path + "log_results.txt","a")
+        self.dfp = open(base_path + "log_debug.txt","a")
+        self.algo_ci_tag_fp = open(base_path + "algorthimic_ci_tags.txt","a")
+        print(self.pos_server_url)
+        print(self.desc_server_url)
+        print(self.entity_server_url)
+        np.set_printoptions(suppress=True) #this suppresses exponential representation when np is used to round
+        if (cf.read_config(config_file)["SUPPRESS_UNTAGGED"] == "1"):
+            self.suppress_untagged = True
+        else:
+            self.suppress_untagged = False #This is disabled in full debug text mode
+    #This is bad hack for prototyping - parsing from text output as opposed to json
+    def extract_POS(self,text):
+        arr = text.split('\n')
+        if (len(arr) > 0):
+            start_pos = 0
+            for i,line in enumerate(arr):
+                if (len(line) > 0):
+                    start_pos += 1
+                    continue
+                else:
+                    break
+            #print(arr[start_pos:])
+            terms_arr = []
+            for i,line in enumerate(arr[start_pos:]):
+                terms = line.split('\t')
+                if (len(terms) == 5):
+                    #print(terms)
+                    terms_arr.append(terms)
+            return terms_arr
+    def normalize_casing(self,sent):
+        sent_arr = sent.split()
+        ret_sent_arr = []
+        for i,word in enumerate(sent_arr):
+            if (len(word) > 1):
+                norm_word = word[0] + word[1:].lower()
+            else:
+                norm_word = word[0]
+            ret_sent_arr.append(norm_word)
+        return ' '.join(ret_sent_arr)
+    #Full sentence tag call also generates json output.
+    def tag_sentence_service(self,text,desc_obj):
+        ret_str = self.tag_sentence(text,self.rfp,self.dfp,True,desc_obj)
+        return ret_str
+    def dictify_ner_response(self,ner_str):
+        arr = ner_str.split('\n')
+        ret_dict = OrderedDict()
+        count = 1
+        ref_indices_arr = []
+        for line in arr:
+            terms = line.split()
+            if (len(terms) == 2):
+                ret_dict[count] = {"term":terms[0],"e":terms[1]}
+                if (terms[1] != "O" and terms[1].startswith("B_")):
+                        ref_indices_arr.append(count)
+                count += 1
+            elif (len(terms) == 1):
+                ret_dict[count] = {"term":"empty","e":terms[0]}
+                if (terms[0] != "O" and terms[0].startswith("B_")):
+                        ref_indices_arr.append(count)
+                count += 1
+                if (len(ret_dict) > 3):  #algorithmic harvesting of CI labels for human verification and adding to bootstrap list
+                    self.algo_ci_tag_fp.write("SENT:" + ner_str.replace('\n',' ') + "\n")
+                    out = terms[0].replace('[',' ').replace(']','').split()[-1]
+                    out = '_'.join(out.split('_')[1:]) if out.startswith("B_") else out
+                    print(out)
+                    self.algo_ci_tag_fp.write(ret_dict[count-2]["term"] + " " + out + "\n")
+                    self.algo_ci_tag_fp.flush()
+            else:
+                assert(len(terms) == 0) #If not empty something is not right
+        return ret_dict,ref_indices_arr
+    def blank_entity_sentence(self,sent,dfp):
+        value = True if sent.endswith(" :__entity__\n") else False
+        if (value == True):
+            print("\n\n**************** Skipping CI prediction in pooling for sent:",sent)
+            dfp.write("\n\n**************** Skipping CI prediction in pooling for sent:" + sent + "\n")
+        return value
+    def pool_confidences(self,ci_entities,ci_confidences,ci_subtypes,cs_entities,cs_confidences,cs_subtypes,debug_str_arr,sent,dfp):
+        main_classes = {}
+        assert(len(cs_entities) ==  len(cs_confidences))
+        assert(len(cs_subtypes) ==  len(cs_entities))
+        assert(len(ci_entities) ==  len(ci_confidences))
+        assert(len(ci_subtypes) ==  len(ci_entities))
+        #Pool entity classes across CI and CS
+        is_blank_statement =  self.blank_entity_sentence(sent,dfp)  #Do not pool CI confidences of the sentences of the form " is a entity". These sentences are sent for purely algo harvesting of CS terms. CI predictions will add noise.
+        if (not is_blank_statement):  #Do not pool CI confidences of the sentences of the form " is a entity". These sentences are sent for purely algo harvesting of CS terms. CI predictions will add noise.
+            for e,c in zip(ci_entities,ci_confidences):
+                e_base = e.split('[')[0]
+                main_classes[e_base] = float(c)
+        for e,c in zip(cs_entities,cs_confidences):
+            e_base = e.split('[')[0]
+            if (e_base in main_classes):
+                main_classes[e_base] += float(c)
+            else:
+                main_classes[e_base] = float(c)
+        final_sorted_d = OrderedDict(sorted(main_classes.items(), key=lambda kv: kv[1], reverse=True))
+        main_dist = self.convert_positive_nums_to_dist(final_sorted_d)
+        main_classes_arr = list(final_sorted_d.keys())
+        #print("\nIn pooling confidences")
+        #print(main_classes_arr)
+        #print(main_dist)
+        #Pool subtypes across CI and CS for a particular entity class
+        subtype_factors = {}
+        for e_class in final_sorted_d:
+            if e_class in cs_subtypes:
+                stypes = cs_subtypes[e_class]
+                if (e_class not in subtype_factors):
+                    subtype_factors[e_class] = {}
+                for st in stypes:
+                    if (st in subtype_factors[e_class]):
+                        subtype_factors[e_class][st] += stypes[st]
+                    else:
+                        subtype_factors[e_class][st] = stypes[st]
+            if (is_blank_statement):
+                continue
+            if e_class in ci_subtypes:
+                stypes = ci_subtypes[e_class]
+                if (e_class not in subtype_factors):
+                    subtype_factors[e_class] = {}
+                for st in stypes:
+                    if (st in subtype_factors[e_class]):
+                        subtype_factors[e_class][st] += stypes[st]
+                    else:
+                        subtype_factors[e_class][st] = stypes[st]
+        sorted_subtype_factors = {}
+        for e_class in subtype_factors:
+            stypes = subtype_factors[e_class]
+            final_sorted_d = OrderedDict(sorted(stypes.items(), key=lambda kv: kv[1], reverse=True))
+            stypes_dist = self.convert_positive_nums_to_dist(final_sorted_d)
+            stypes_class_arr = list(final_sorted_d.keys())
+            sorted_subtype_factors[e_class] = {"stypes":stypes_class_arr,"dist":stypes_dist}
+        pooled_results = OrderedDict()
+        assert(len(main_classes_arr) == len(main_dist))
+        d_str_arr = []
+        d_str_arr.append("\n***CONSOLIDATED ENTITY:")
+        for e,c in zip(main_classes_arr,main_dist):
+            pooled_results[e] = {"e":e,"confidence":c}
+            d_str_arr.append(e + " " + str(c))
+            stypes_dict = sorted_subtype_factors[e]
+            pooled_st = OrderedDict()
+            for st,sd in zip(stypes_dict["stypes"],stypes_dict["dist"]):
+                pooled_st[st] = sd
+            pooled_results[e]["stypes"] = pooled_st
+        debug_str_arr.append(' '.join(d_str_arr))
+        print(' '.join(d_str_arr))
+        return pooled_results
+    def init_entity_info(self,entity_info_dict,index):
+        curr_term_dict = OrderedDict()
+        entity_info_dict[index] = curr_term_dict
+        curr_term_dict["ci"] = OrderedDict()
+        curr_term_dict["ci"]["entities"] = []
+        curr_term_dict["ci"]["descs"] = []
+        curr_term_dict["cs"] = OrderedDict()
+        curr_term_dict["cs"]["entities"] = []
+        curr_term_dict["cs"]["descs"] = []
+    #This now does specific tagging if there is a __entity__ in sentence; else does full tagging. TBD.
+    #TBD. Make response params same regardlesss of output format. Now it is different
+    def tag_sentence(self,sent,rfp,dfp,json_output,desc_obj):
+        print("Input: ", sent)
+        dfp.write("\n\n++++-------------------------------\n")
+        dfp.write("NER_INPUT: " + sent + "\n")
+        debug_str_arr = []
+        entity_info_dict = OrderedDict()
+        #url = self.desc_server_url  + sent.replace('"','\'')
+        #r = self.dispatch_request(url)
+        #if (r is None):
+        #   print("Empty response. Desc server is probably down: ",self.desc_server_url)
+        #    return json.loads("[]")
+        #main_obj = json.loads(r.text)
+        main_obj = desc_obj
+        #print(json.dumps(main_obj,indent=4))
+        #Find CI predictions for ALL masked predictios in sentence
+        ci_predictions,orig_ci_entities = self.find_ci_entities(main_obj,debug_str_arr,entity_info_dict) #ci_entities is the same info as ci_predictions except packed differently for output
+        #Find CS predictions for ALL masked predictios in sentence. Use the CI predictions from previous step to
+        #pool
+        detected_entities_arr,ner_str,full_pooled_results,orig_cs_entities = self.find_cs_entities(sent,main_obj,rfp,dfp,debug_str_arr,ci_predictions,entity_info_dict)
+        assert(len(detected_entities_arr) == len(entity_info_dict))
+        print("--------")
+        if (json_output):
+            if (len(detected_entities_arr) != len(entity_info_dict)):
+                if (len(entity_info_dict) == 0):
+                    self.init_entity_info(entity_info_dict,index)
+                    entity_info_dict[1]["cs"]["entities"].append([{"e":"O","confidence":1}])
+                    entity_info_dict[1]["ci"]["entities"].append([{"e":"O","confidence":1}])
+            ret_dict,ref_indices_arr  = self.dictify_ner_response(ner_str) #Convert ner string to a dictionary for json output
+            assert(len(ref_indices_arr)  == len(detected_entities_arr))
+            assert(len(entity_info_dict)  == len(detected_entities_arr))
+            cs_aux_dict = OrderedDict()
+            ci_aux_dict = OrderedDict()
+            cs_aux_orig_entities = OrderedDict()
+            ci_aux_orig_entities = OrderedDict()
+            pooled_pred_dict = OrderedDict()
+            count = 0
+            assert(len(full_pooled_results) == len(detected_entities_arr))
+            assert(len(full_pooled_results) == len(orig_cs_entities))
+            assert(len(full_pooled_results) == len(orig_ci_entities))
+            for e,c,p,o,i in zip(detected_entities_arr,entity_info_dict,full_pooled_results,orig_cs_entities,orig_ci_entities):
+                val = entity_info_dict[c]
+                #cs_aux_dict[ref_indices_arr[count]] = {"e":e,"cs_distribution":val["cs"]["entities"],"cs_descs":val["cs"]["descs"]}
+                pooled_pred_dict[ref_indices_arr[count]] = {"e": e, "cs_distribution": list(p.values())}
+                cs_aux_dict[ref_indices_arr[count]] = {"e":e,"cs_descs":val["cs"]["descs"]}
+                #ci_aux_dict[ref_indices_arr[count]] = {"ci_distribution":val["ci"]["entities"],"ci_descs":val["ci"]["descs"]}
+                ci_aux_dict[ref_indices_arr[count]] = {"ci_descs":val["ci"]["descs"]}
+                cs_aux_orig_entities[ref_indices_arr[count]] = {"e":e,"cs_distribution": o}
+                ci_aux_orig_entities[ref_indices_arr[count]] = {"e":e,"cs_distribution": i}
+                count += 1
+            #print(ret_dict)
+            #print(aux_dict)
+            final_ret_dict = {"total_terms_count":len(ret_dict),"detected_entity_phrases_count":len(detected_entities_arr),"ner":ret_dict,"entity_distribution":pooled_pred_dict,"cs_prediction_details":cs_aux_dict,"ci_prediction_details":ci_aux_dict,"orig_cs_prediction_details":cs_aux_orig_entities,"orig_ci_prediction_details":ci_aux_orig_entities,"debug":debug_str_arr}
+            json_str = json.dumps(final_ret_dict,indent = 4)
+            #print (json_str)
+            #with open("single_debug.txt","w") as fp:
+            #    fp.write(json_str)
+            dfp.write('\n'.join(debug_str_arr))
+            dfp.write("\n\nEND-------------------------------\n")
+            dfp.flush()
+            return json_str
+        else:
+            print(detected_entities_arr)
+            debug_str_arr.append("NER_FINAL_RESULTS: " + ' '.join(detected_entities_arr))
+            print("--------")
+            dfp.write('\n'.join(debug_str_arr))
+            dfp.write("\n\nEND-------------------------------\n")
+            dfp.flush()
+            return detected_entities_arr,span_arr,terms_arr,ner_str,debug_str_arr
+    def masked_word_first_letter_capitalize(self,entity):
+        arr = entity.split()
+        ret_arr = []
+        for term in arr:
+            if (len(term) > 1 and term[0].islower() and term[1].islower()):
+                ret_arr.append(term[0].upper() + term[1:])
+            else:
+                ret_arr.append(term)
+        return ' '.join(ret_arr)
+    def gen_single_phrase_sentences(self,terms_arr,masked_sent_arr,span_arr,rfp,dfp):
+        sentence_template = "%s is a entity"
+        print(span_arr)
+        sentences = []
+        singleton_spans_arr  = []
+        run_index = 0
+        entity  = ""
+        singleton_span = []
+        while (run_index < len(span_arr)):
+            if (span_arr[run_index] == 1):
+                while (run_index < len(span_arr)):
+                    if (span_arr[run_index] == 1):
+                        #print(terms_arr[run_index][WORD_POS],end=' ')
+                        if (len(entity) == 0):
+                            entity = terms_arr[run_index][WORD_POS]
+                        else:
+                            entity = entity + " " + terms_arr[run_index][WORD_POS]
+                        singleton_span.append(1)
+                        run_index += 1
+                    else:
+                        break
+                #print()
+                for i in sentence_template.split():
+                    if (i != "%s"):
+                        singleton_span.append(0)
+                entity = self.masked_word_first_letter_capitalize(entity)
+                sentence = sentence_template % entity
+                sentences.append(sentence)
+                singleton_spans_arr.append(singleton_span)
+                print(sentence)
+                print(singleton_span)
+                entity = ""
+                singleton_span = []
+            else:
+                run_index += 1
+        return sentences,singleton_spans_arr
+    def find_ci_entities(self,main_obj,debug_str_arr,entity_info_dict):
+        ci_predictions = []
+        orig_ci_confidences = []
+        term_index = 1
+        batch_obj = main_obj["descs_and_entities"]
+        for key in batch_obj:
+            masked_sent = batch_obj[key]["ci_prediction"]["sentence"]
+            print("\n**CI: ", masked_sent)
+            debug_str_arr.append(masked_sent)
+            #entity_info_dict["masked_sent"].append(masked_sent)
+            inp_arr = batch_obj[key]["ci_prediction"]["descs"]
+            descs = self.get_descriptors_for_masked_position(inp_arr)
+            self.init_entity_info(entity_info_dict,term_index)
+            entities,confidences,subtypes = self.get_entities_for_masked_position(inp_arr,descs,debug_str_arr,entity_info_dict[term_index]["ci"])
+            ci_predictions.append({"entities":entities,"confidences":confidences,"subtypes":subtypes})
+            orig_ci_confidences.append(self.pack_confidences(entities,confidences))             #this is sent for ensemble server to detect cross predictions. CS predicitons are more reflective of cross over than consolidated predictions, since CI may overwhelm CS
+            term_index += 1
+        return ci_predictions,orig_ci_confidences
+    def pack_confidences(self,cs_entities,cs_confidences):
+        assert(len(cs_entities) == len(cs_confidences))
+        orig_cs_arr = []
+        for e,c in zip(cs_entities,cs_confidences):
+            print(e,c)
+            e_split = e.split('[')
+            e_main = e_split[0]
+            if (len(e_split) > 1):
+                e_sub = e_split[1].split(',')[0].rstrip(']')
+                if (e_main != e_sub):
+                    e = e_main + '[' + e_sub + ']'
+                else:
+                    e = e_main
+            else:
+                e = e_main
+            orig_cs_arr.append({"e":e,"confidence":c})
+        return orig_cs_arr
+    #We have multiple masked versions of a single sentence. Tag each one of them
+    #and create a complete tagged version for a sentence
+    def find_cs_entities(self,sent,main_obj,rfp,dfp,debug_str_arr,ci_predictions,entity_info_dict):
+        #print(sent)
+        batch_obj = main_obj["descs_and_entities"]
+        dfp.write(sent + "\n")
+        term_index = 1
+        detected_entities_arr = []
+        full_pooled_results = []
+        orig_cs_confidences = []
+        for index,key in enumerate(batch_obj):
+            position_info = batch_obj[key]["cs_prediction"]["descs"]
+            ci_entities = ci_predictions[index]["entities"]
+            ci_confidences = ci_predictions[index]["confidences"]
+            ci_subtypes = ci_predictions[index]["subtypes"]
+            debug_str_arr.append("\n++++++ nth Masked term  : " + str(key))
+            #dfp.write(key + "\n")
+            masked_sent = batch_obj[key]["cs_prediction"]["sentence"]
+            print("\n**CS: ",masked_sent)
+            descs = self.get_descriptors_for_masked_position(position_info)
+            #dfp.write(str(descs) + "\n")
+            if (len(descs) > 0):
+                cs_entities,cs_confidences,cs_subtypes = self.get_entities_for_masked_position(position_info,descs,debug_str_arr,entity_info_dict[term_index]["cs"])
+            else:
+                cs_entities = []
+                cs_confidences = []
+                cs_subtypes = []
+            #dfp.write(str(cs_entities) + "\n")
+            pooled_results = self.pool_confidences(ci_entities,ci_confidences,ci_subtypes,cs_entities,cs_confidences,cs_subtypes,debug_str_arr,sent,dfp)
+            self.fill_detected_entities(detected_entities_arr,pooled_results) #just picks the top prediction
+            full_pooled_results.append(pooled_results)
+            orig_cs_confidences.append(self.pack_confidences(cs_entities,cs_confidences))             #this is sent for ensemble server to detect cross predictions. CS predicitons are more reflective of cross over than consolidated predictions, since CI may overwhelm CS
+            #self.old_resolve_entities(i,singleton_entities,detected_entities_arr) #This decides how to pick entities given CI and CS predictions
+            term_index += 1
+        #out of the full loop over sentences. Now create NER sentence
+        terms_arr = main_obj["terms_arr"]
+        span_arr = main_obj["span_arr"]
+        ner_str = self.emit_sentence_entities(sent,terms_arr,detected_entities_arr,span_arr,rfp) #just outputs results in NER Conll format
+        dfp.flush()
+        return detected_entities_arr,ner_str,full_pooled_results,orig_cs_confidences
+    def fill_detected_entities(self,detected_entities_arr,entities):
+        if (len(entities) > 0):
+            top_e_class = next(iter(entities))
+            top_subtype = next(iter(entities[top_e_class]["stypes"]))
+            if (top_e_class != top_subtype):
+                top_prediction = top_e_class + "[" + top_subtype + "]"
+            else:
+                top_prediction = top_e_class
+            detected_entities_arr.append(top_prediction)
+        else:
+            detected_entities_arr.append("OTHER")
+    def fill_detected_entities_old(self,detected_entities_arr,entities,pan_arr):
+        entities_dict = {}
+        count = 1
+        for i in entities:
+            cand = i.split("-")
+            for j in cand:
+                terms = j.split("/")
+                for k in terms:
+                    if (k not in entities_dict):
+                        entities_dict[k] = 1.0/count
+                    else:
+                        entities_dict[k] += 1.0/count
+            count += 1
+        final_sorted_d = OrderedDict(sorted(entities_dict.items(), key=lambda kv: kv[1], reverse=True))
+        first = "OTHER"
+        for first in final_sorted_d:
+            break
+        detected_entities_arr.append(first)
+    #Contextual entity is picked as first candidate before context independent candidate
+    def old_resolve_entities(self,index,singleton_entities,detected_entities_arr):
+        if (singleton_entities[index].split('[')[0] != detected_entities_arr[index].split('[')[0]):
+            if (singleton_entities[index].split('[')[0] != "OTHER" and detected_entities_arr[index].split('[')[0] != "OTHER"):
+                detected_entities_arr[index] = detected_entities_arr[index] + "/" +  singleton_entities[index]
+            elif (detected_entities_arr[index].split('[')[0] == "OTHER"):
+                detected_entities_arr[index] =  singleton_entities[index]
+            else:
+                pass
+        else:
+           #this is the case when both CI and CS entity type match. Since the subtypes are already ordered, just merge(CS/CI,CS/CI...) the two picking unique subtypes
+            main_entity = detected_entities_arr[index].split('[')[0]
+            cs_arr = detected_entities_arr[index].split('[')[1].rstrip(']').split(',')
+            ci_arr = singleton_entities[index].split('[')[1].rstrip(']').split(',')
+            cs_arr_len  = len(cs_arr)
+            ci_arr_len  = len(ci_arr)
+            max_len = ci_arr_len if ci_arr_len > cs_arr_len else cs_arr_len
+            merged_unique_subtype_dict = OrderedDict()
+            for i in range(cs_arr_len):
+                if (i < cs_arr_len and cs_arr[i] not in merged_unique_subtype_dict):
+                    merged_unique_subtype_dict[cs_arr[i]] = 1
+                if (i < ci_arr_len and ci_arr[i] not in merged_unique_subtype_dict):
+                    merged_unique_subtype_dict[ci_arr[i]] = 1
+            new_subtypes_str = ','.join(list(merged_unique_subtype_dict.keys()))
+            detected_entities_arr[index] =  main_entity + '[' + new_subtypes_str + ']'
+    def emit_sentence_entities(self,sent,terms_arr,detected_entities_arr,span_arr,rfp):
+        print("Final result")
+        ret_str = ""
+        for i,term in enumerate(terms_arr):
+            print(term,' ',end='')
+        print()
+        sent_arr = sent.split()
+        assert(len(terms_arr) == len(span_arr))
+        entity_index = 0
+        i = 0
+        in_span = False
+        while (i < len(span_arr)):
+            if (span_arr[i] == 0):
+                tag = "O"
+                if (in_span):
+                    in_span = False
+                    entity_index += 1
+            else:
+                if (in_span):
+                    tag = "I_" + detected_entities_arr[entity_index]
+                else:
+                    in_span = True
+                    tag = "B_" + detected_entities_arr[entity_index]
+            rfp.write(terms_arr[i] + ' ' + tag + "\n")
+            ret_str = ret_str + terms_arr[i] + ' ' + tag + "\n"
+            print(tag + ' ',end='')
+            i += 1
+        print()
+        rfp.write("\n")
+        ret_str += "\n"
+        rfp.flush()
+        return ret_str
+    def get_descriptors_for_masked_position(self,inp_arr):
+        desc_arr = []
+        for i in range(len(inp_arr)):
+            desc_arr.append(inp_arr[i]["desc"])
+            desc_arr.append(inp_arr[i]["v"])
+        return desc_arr
+    def dispatch_request(self,url):
+        max_retries = 10
+        attempts = 0
+        while True:
+            try:
+                r = requests.get(url,timeout=1000)
+                if (r.status_code == 200):
+                    return r
+            except:
+                print("Request:", url, " failed. Retrying...")
+            attempts += 1
+            if (attempts >= max_retries):
+                print("Request:", url, " failed")
+                break
+    def convert_positive_nums_to_dist(self,final_sorted_d):
+        factors = list(final_sorted_d.values()) #convert dict values to an array
+        factors = list(map(float, factors))
+        total = float(sum(factors))
+        if (total == 0):
+            total = 1
+            factors[0] = 1 #just make the sum 100%. This a boundary case for numbers for instance
+        factors = np.array(factors)
+        #factors = softmax(factors)
+        factors = factors/total
+        factors = np.round(factors,4)
+        return factors
+    def get_desc_weights_total(self,count,desc_weights):
+        i = 0
+        total = 0
+        while (i < count):
+            total += float(desc_weights[i+1])
+            i += 2
+        total = 1 if total == 0 else total
+        return total
+    def aggregate_entities(self,entities,desc_weights,debug_str_arr,entity_info_dict_entities):
+        ''' Aggregate the entity predictions of all descriptors of one masked position.
+            entities:     flat array [e1,counts1,e2,counts2,...]; e is a '/'-joined list of entity
+                          names (e.g. PROTEIN/GENE/PERSON) and counts the matching '/'-joined counts (e.g. 10/4/7)
+            desc_weights: flat array [desc1,score1,desc2,score2,...]; only the scores (odd positions) are read
+            debug_str_arr: list that receives debug strings
+            entity_info_dict_entities: list that receives the final [{"e":..,"confidence":..},...] array
+            Each unique consolidated entity accumulates (softmax-ed count * descriptor score) across descriptors.
+            The accumulated factors are normalized into a distribution.
+            Returns (ret_entities,confidences,subtypes); ret_entities are sorted by score and carry their subtypes.
+        '''
+        count = len(entities)
+        assert(count %2 == 0)
+        aggregate_entities = {}
+        i = 0
+        subtypes = {} #filled by map_entities; shared across all descriptors of this position
+        while (i < count):
+            #entities[i] contains entity names and entities[i+1] contains counts. Example PROTEIN/GENE/PERSON is i and 10/4/7 is i+1
+            curr_counts = entities[i+1].split('/') #this is one of the N predictions - this single prediction is itself  a list of entities
+            trunc_e,trunc_counts = self.map_entities(entities[i].split('/'),curr_counts,subtypes) # Aggregate the subtype entities for this predictions. Subtypes aggregation is **across** the N predictions
+                                                                                    #Also trunc_e contains the consolidated entity names.
+            assert(len(trunc_e) <= len(curr_counts)) # can be less if untagged is skipped
+            assert(len(trunc_e) == len(trunc_counts))
+            #NOTE(review): softmax is defined outside this block - confirm what it returns for an empty list
+            trunc_counts = softmax(trunc_counts) #this normalization is done to reduce the effect of absolute count of certain labeled entities, while aggregating the entity vectors across descriptors
+            #curr_counts_sum is only referenced by the commented-out alternatives below
+            curr_counts_sum = sum(map(int,trunc_counts)) #Using truncated count
+            curr_counts_sum = 1 if curr_counts_sum == 0 else curr_counts_sum
+            for j in range(len(trunc_e)): #this is iterating through the current instance  of all *consolidated* tagged entity predictons  (that is except UNTAGGED_ENTITY)
+                if (self.skip_untagged(trunc_e[j])):
+                    continue
+                if (trunc_e[j] not in aggregate_entities):
+                    aggregate_entities[trunc_e[j]] = (float(trunc_counts[j]))*float(desc_weights[i+1])
+                    #aggregate_entities[trunc_e[j]] = (float(trunc_counts[j])/curr_counts_sum)*float(desc_weights[i+1])
+                    #aggregate_entities[trunc_e[j]] = float(desc_weights[i+1])
+                else:
+                    aggregate_entities[trunc_e[j]] += (float(trunc_counts[j]))*float(desc_weights[i+1])
+                    #aggregate_entities[trunc_e[j]] += (float(trunc_counts[j])/curr_counts_sum)*float(desc_weights[i+1])
+                    #aggregate_entities[trunc_e[j]] += float(desc_weights[i+1])
+            i += 2
+        #highest accumulated score first
+        final_sorted_d = OrderedDict(sorted(aggregate_entities.items(), key=lambda kv: kv[1], reverse=True))
+        if (len(final_sorted_d) == 0): #Case where all terms are tagged OTHER
+            final_sorted_d = {"OTHER":1}
+            subtypes["OTHER"] = {"OTHER":1}
+        factors = self.convert_positive_nums_to_dist(final_sorted_d)
+        ret_entities = list(final_sorted_d.keys())
+        confidences = factors.tolist()
+        print(ret_entities)
+        sorted_subtypes = self.sort_subtypes(subtypes)
+        #entity names become "ENTITY[sub1,sub2,...]"
+        ret_entities = self.update_entities_with_subtypes(ret_entities,sorted_subtypes)
+        print(ret_entities)
+        debug_str_arr.append(" ")
+        debug_str_arr.append(' '.join(ret_entities))
+        print(confidences)
+        assert(len(confidences) == len(ret_entities))
+        arr = []
+        for e,c in zip(ret_entities,confidences):
+            arr.append({"e":e,"confidence":c})
+        entity_info_dict_entities.append(arr)
+        debug_str_arr.append(' '.join([str(x) for x in confidences]))
+        debug_str_arr.append("\n\n")
+        #the unsorted subtype counts (not sorted_subtypes) are returned
+        return ret_entities,confidences,subtypes
+    def sort_subtypes(self,subtypes):
+        sorted_subtypes =  OrderedDict()
+        for ent in subtypes:
+            final_sorted_d = OrderedDict(sorted(subtypes[ent].items(), key=lambda kv: kv[1], reverse=True))
+            sorted_subtypes[ent]  = list(final_sorted_d.keys())
+        return sorted_subtypes
+    def update_entities_with_subtypes(self,ret_entities,subtypes):
+        new_entities = []
+        for ent in ret_entities:
+            #if (len(ret_entities) == 1):
+            #    new_entities.append(ent) #avoid creating a subtype for a single case
+            #    return new_entities
+            if (ent in subtypes):
+                new_entities.append(ent + '[' + ','.join(subtypes[ent]) + ']')
+            else:
+                new_entities.append(ent)
+        return new_entities
+    def skip_untagged(self,term):
+        if (self.suppress_untagged == True and (term == "OTHER" or term == "UNTAGGED_ENTITY")):
+                return True
+        return False
+    def map_entities(self,arr,counts_arr,subtypes_dict):
+        ret_arr = []
+        new_counts_arr = []
+        for index,term in enumerate(arr):
+            if (self.skip_untagged(term)):
+                continue
+            ret_arr.append(self.entity_map[term])
+            new_counts_arr.append(int(counts_arr[index]))
+            if (self.entity_map[term] not in subtypes_dict):
+                subtypes_dict[self.entity_map[term]] = {}
+            if (term not in subtypes_dict[self.entity_map[term]]):
+                #subtypes_dict[self.entity_map[i]][i] = 1
+                subtypes_dict[self.entity_map[term]][term] = int(counts_arr[index])
+            else:
+                #subtypes_dict[self.entity_map[i]][i] += 1
+                subtypes_dict[self.entity_map[term]][term] += int(counts_arr[index])
+        return ret_arr,new_counts_arr
+    def get_entities_from_batch(self,inp_arr):
+        entities_arr = []
+        for i in range(len(inp_arr)):
+            entities_arr.append(inp_arr[i]["e"])
+            entities_arr.append(inp_arr[i]["e_count"])
+        return entities_arr
+    def get_entities_for_masked_position(self,inp_arr,descs,debug_str_arr,entity_info_dict):
+        entities = self.get_entities_from_batch(inp_arr)
+        debug_combined_arr =[]
+        desc_arr =[]
+        assert(len(descs) %2 == 0)
+        assert(len(entities) %2 == 0)
+        index = 0
+        for d,e in zip(descs,entities):
+            p_e =  '/'.join(e.split('/')[:5])
+            debug_combined_arr.append(d + " " + p_e)
+            if (index % 2 == 0):
+                temp_dict = OrderedDict()
+                temp_dict["d"] = d
+                temp_dict["e"] = e
+            else:
+                temp_dict["mlm"] = d
+                temp_dict["l_score"] = e
+                desc_arr.append(temp_dict)
+            index += 1
+        debug_str_arr.append("\n" + ', '.join(debug_combined_arr))
+        print(debug_combined_arr)
+        entity_info_dict["descs"] = desc_arr
+        #debug_str_arr.append(' '.join(entities))
+        assert(len(entities) == len(descs))
+        entities,confidences,subtypes = self.aggregate_entities(entities,descs,debug_str_arr,entity_info_dict["entities"])
+        return entities,confidences,subtypes
+   #This is again a bad hack for prototyping purposes - extracting fields from a raw text output as opposed to a structured output like json
+    def extract_descs(self,text):
+        arr = text.split('\n')
+        desc_arr = []
+        if (len(arr) > 0):
+            for i,line in enumerate(arr):
+                if (line.startswith(DESC_HEAD)):
+                    terms = line.split(':')
+                    desc_arr = ' '.join(terms[1:]).strip().split()
+                    break
+        return desc_arr
+    def generate_masked_sentences(self,terms_arr):
+        size = len(terms_arr)
+        sentence_arr = []
+        span_arr = []
+        i = 0
+        while (i < size):
+            term_info = terms_arr[i]
+            if (term_info[TAG_POS] in noun_tags):
+                skip = self.gen_sentence(sentence_arr,terms_arr,i)
+                i +=  skip
+                for j in range(skip):
+                    span_arr.append(1)
+            else:
+                i += 1
+                span_arr.append(0)
+        #print(sentence_arr)
+        return sentence_arr,span_arr
+    def gen_sentence(self,sentence_arr,terms_arr,index):
+        size = len(terms_arr)
+        new_sent = []
+        for prefix,term in enumerate(terms_arr[:index]):
+            new_sent.append(term[WORD_POS])
+        i = index
+        skip = 0
+        while (i < size):
+            if (terms_arr[i][TAG_POS] in noun_tags):
+                skip += 1
+                i += 1
+            else:
+                break
+        new_sent.append(MASK_TAG)
+        i = index + skip
+        while (i < size):
+            new_sent.append(terms_arr[i][WORD_POS])
+            i += 1
+        assert(skip != 0)
+        sentence_arr.append(new_sent)
+        return skip
+def run_test(file_name,obj):
+    rfp = open("results.txt","w")
+    dfp = open("debug.txt","w")
+    with open(file_name) as fp:
+        count = 1
+        for line in fp:
+            if (len(line) > 1):
+                print(str(count) + "] ",line,end='')
+                obj.tag_sentence(line,rfp,dfp)
+                count += 1
+    rfp.close()
+    dfp.close()
+def tag_single_entity_in_sentence(file_name,obj):
+    rfp = open("results.txt","w")
+    dfp = open("debug.txt","w")
+    sfp = open("se_results.txt","w")
+    with open(file_name) as fp:
+        count = 1
+        for line in fp:
+            if (len(line) > 1):
+                print(str(count) + "] ",line,end='')
+                #entity_arr,span_arr,terms_arr,ner_str,debug_str = obj.tag_sentence(line,rfp,dfp,False) # False for json output
+                json_str = obj.tag_sentence(line,rfp,dfp,True) # True for json output
+                #print("*******************:",terms_arr[span_arr.index(1)][WORD_POS].rstrip(":"),entity_arr[0])
+                #sfp.write(terms_arr[span_arr.index(1)][WORD_POS].rstrip(":") + " " + entity_arr[0] + "\n")
+                count += 1
+                sfp.flush()
+                #pdb.set_trace()
+    rfp.close()
+    sfp.close()
+    dfp.close()
+#Canned sentences iterated by test_canned_sentences.
+#Some entries mark the words to tag with a ":__entity__" suffix; the others carry no marker.
+test_arr = [
+"He felt New:__entity__ York:__entity__ has a chance to win this year's competition",
+"Ajit rajasekharan is an engineer at nFerence:__entity__",
+"Ajit:__entity__ rajasekharan is an engineer:__entity__ at nFerence:__entity__",
+"Mesothelioma:__entity__ is caused by exposure to asbestos:__entity__",
+"Fyodor:__entity__ Mikhailovich:__entity__ Dostoevsky:__entity__ was treated for Parkinsons",
+"Ajit:__entity__ Rajasekharan:__entity__ is an engineer at nFerence",
+"A eGFR:__entity__ below 60 indicates chronic kidney disease",
+"A eGFR below 60:__entity__ indicates chronic kidney disease",
+"A eGFR:__entity__ below 60:__entity__ indicates chronic:__entity__ kidney:__entity__ disease:__entity__",
+"Ajit:__entity__ rajasekharan is an engineer at nFerence",
+"Her hypophysitis secondary to ipilimumab was well managed with supplemental hormones",
+"In Seattle:__entity__ , Pete Incaviglia 's grand slam with one out in the sixth snapped a tie and lifted the Baltimore Orioles past the Seattle           Mariners , 5-2 .",
+"engineer",
+"Austin:__entity__ called",
+"Paul Erdős died at 83",
+"Imatinib mesylate is a drug and is used to treat nsclc",
+"In Seattle , Pete Incaviglia 's grand slam with one out in the sixth snapped a tie and lifted the Baltimore Orioles past the Seattle           Mariners , 5-2 .",
+"It was Incaviglia 's sixth grand slam and 200th homer of his career .",
+"Add Women 's singles , third round Lisa Raymond ( U.S. ) beat Kimberly Po ( U.S. ) 6-3 6-2 .",
+"1880s marked the beginning of Jazz",
+"He flew from New York to SFO",
+"Lionel Ritchie was popular in the 1980s",
+"Lionel Ritchie was popular in the late eighties",
+"John Doe flew from New York to Rio De Janiro via Miami",
+"He felt New York has a chance to win this year's competition",
+"Bandolier - Budgie ' , a free itunes app for ipad , iphone and ipod touch , released in December 2011 , tells the story of the making of Bandolier in the band 's own words - including an extensive audio interview with Burke Shelley",
+"In humans mutations in Foxp2 leads to verbal dyspraxia",
+"The recent spread of Corona virus flu from China to Italy,Iran, South Korea and Japan has caused global concern",
+"Hotel California topped the singles chart",
+"Elon Musk said Telsa will open a manufacturing plant in Europe",
+"He flew from New York to SFO",
+"After studies at Hofstra University , He worked for New York Telephone before He was elected to the New York State Assembly to represent the 16th District in Northwest Nassau County ",
+"Everyday he rode his bicycle from Rajakilpakkam to Tambaram",
+"If he loses Saturday , it could devalue his position as one of the world 's great boxers , \" Panamanian Boxing Association President Ramon     Manzanares said .",
+"West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship .",
+"they are his friends ",
+"they flew from Boston to Rio De Janiro and had a mocha",
+"he flew from Boston to Rio De Janiro and had a mocha",
+"X,Y,Z are medicines"]
+def test_canned_sentences(obj):
+    rfp = open("results.txt","w")
+    dfp = open("debug.txt","w")
+    pdb.set_trace()
+    for line in test_arr:
+        ret_val = obj.tag_sentence(line,rfp,dfp,True)
+        pdb.set_trace()
+    rfp.close()
+    dfp.close()
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='main NER for a single model ',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('-input', action="store", dest="input",default="",help='Input file required for run options batch,single')
+    parser.add_argument('-config', action="store", dest="config", default=DEFAULT_CONFIG,help='config file path')
+    parser.add_argument('-option', action="store", dest="option",default="canned",help='Valid options are canned,batch,single. canned - test few canned sentences used in medium artice. batch - tag sentences in input file. Entities to be tagged are determing used POS tagging to find noun phrases. specific - tag specific entities in input file. The tagged word or phrases needs to be of the form w1:__entity_ w2:__entity_ Example:Her hypophysitis:__entity__ secondary to ipilimumab was well managed with supplemental:__entity__ hormones:__entity__')
+    results = parser.parse_args()
+    obj = UnsupNER(results.config)
+    if (results.option == "canned"):
+        test_canned_sentences(obj)
+    elif (results.option == "batch"):
+        if (len(results.input) == 0):
+            print("Input file needs to be specified")
+        else:
+            run_test(results.input,obj)
+            print("Tags and sentences are written in results.txt and debug.txt")
+    elif (results.option == "specific"):
+        if (len(results.input) == 0):
+            print("Input file needs to be specified")
+        else:
+            tag_single_entity_in_sentence(results.input,obj)
+            print("Tags and sentences are written in results.txt and debug.txt")
+    else:
+        print("Invalid argument:\n")
+        parser.print_help()

bbc/bbc_labels.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

bbc/desc_bbc_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{"POS_SERVER_URL": "http://127.0.0.1:8073/",
+"LOG_DESCS": "0",
+"USE_CLS": "0",
+"BASE_PATH":"./bbc/",
+"COMMON_DESCS_FILE": "untagged_terms.txt"
+}

bbc/ner_bbc_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{"POS_SERVER_URL": "http://127.0.0.1:8073/",
+"DESC_SERVER_URL": "http://127.0.0.1:8088/dummy/0/",
+"ENTITY_SERVER_URL": "http://127.0.0.1:8043/",
+"EMAP_FILE": "entity_types_consolidated.txt",
+"FULL_SENTENCE_TAG": "1",
+"SUPPRESS_UNTAGGED": "1",
+"BASE_PATH":"./bbc/",
+"COMMON_DESCS_FILE": "untagged_terms.txt"}

bbc/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

bio/a100_labels.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

bio/desc_a100_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{"POS_SERVER_URL": "http://127.0.0.1:8073/",
+"LOG_DESCS": "0",
+"USE_CLS": "1",
+"BASE_PATH":"./bio/",
+"COMMON_DESCS_FILE": "untagged_terms.txt"
+}

bio/ner_a100_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{"POS_SERVER_URL": "http://127.0.0.1:8073/",
+"DESC_SERVER_URL": "http://127.0.0.1:8087/dummy/0/",
+"ENTITY_SERVER_URL": "http://127.0.0.1:8043/",
+"EMAP_FILE": "entity_types_consolidated.txt",
+"FULL_SENTENCE_TAG": "1",
+"SUPPRESS_UNTAGGED": "1",
+"BASE_PATH":"./bio/",
+"COMMON_DESCS_FILE": "untagged_terms.txt"}

bio/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

common.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import pdb
+import sys
# A term record is a list; slot 1 holds the word and slot 2 holds its POS tag.
WORD_POS = 1
TAG_POS = 2

# Placeholder that stands in for a noun span inside a masked sentence.
MASK_TAG = "__entity__"
# Suffix on an input word that marks it as an entity to be tagged.
INPUT_MASK_TAG = ":" + MASK_TAG
# Tag assigned to input words that do not carry INPUT_MASK_TAG.
RESET_POS_TAG = "RESET"

# POS tags that make a term part of a maskable span.
noun_tags = [
    "NFP", "JJ", "NN", "FW", "NNS", "NNPS", "JJS", "JJR", "NNP", "POS", "CD",
]
# POS tags whose words get their first letter upper-cased by capitalize().
cap_tags = [
    "NFP", "JJ", "NN", "FW", "NNS", "NNPS", "JJS", "JJR", "NNP", "PRP",
]
def detect_masked_positions(terms_arr):
    """Return the plain word list together with the masked sentences.

    The result is ``(words, masked_sentences, span_flags)``: ``words`` holds
    the word of every term in order, the other two values are passed through
    unchanged from ``generate_masked_sentences``.
    """
    # Masking runs before the words are collected, as in the original order;
    # it can retag the first term but never touches the words themselves.
    masked_sentences, span_flags = generate_masked_sentences(terms_arr)
    words = [entry[WORD_POS] for entry in terms_arr]
    return words, masked_sentences, span_flags
def generate_masked_sentences(terms_arr):
    """Create one masked sentence per run of noun-tagged terms.

    Returns ``(masked_sentences, span_flags)``. ``masked_sentences`` has one
    word list per run, with the run replaced by the mask placeholder.
    ``span_flags`` has one entry per term: 1 when the term belongs to a run,
    0 otherwise.
    """
    # Guarantees at least one noun-tagged term when the input is non-empty.
    hack_for_no_nouns_case(terms_arr)

    masked_sentences = []
    span_flags = []
    total = len(terms_arr)
    cursor = 0
    while cursor < total:
        if terms_arr[cursor][TAG_POS] not in noun_tags:
            span_flags.append(0)
            cursor += 1
            continue
        # gen_sentence consumes the whole run and reports its length.
        run_length = gen_sentence(masked_sentences, terms_arr, cursor)
        span_flags.extend([1] * run_length)
        cursor += run_length
    return masked_sentences, span_flags
def hack_for_no_nouns_case(terms_arr):
    """Force the first term to be a noun when no term carries a noun tag.

    Workaround for odd inputs (for example a single word such as "eg") where
    the user marked no entity and the sentence has no nouns. The first
    term's tag is overwritten in place so later processing has a span to
    work on. An empty ``terms_arr`` is left untouched.
    """
    if not terms_arr:
        return
    if any(entry[TAG_POS] in noun_tags for entry in terms_arr):
        return
    terms_arr[0][TAG_POS] = noun_tags[0]
def gen_sentence(sentence_arr,terms_arr,index):
    """Append a masked copy of the sentence to ``sentence_arr``.

    The run of consecutive noun-tagged terms that starts at ``index`` is
    replaced by a single ``MASK_TAG``; all other words are copied as they
    are. Returns the number of terms in that run. An assertion fires if the
    term at ``index`` is not noun-tagged.
    """
    total = len(terms_arr)

    # Find the first position past the noun run.
    run_end = index
    while run_end < total and terms_arr[run_end][TAG_POS] in noun_tags:
        run_end += 1
    skip = run_end - index

    masked = [entry[WORD_POS] for entry in terms_arr[:index]]
    masked.append(MASK_TAG)
    masked.extend(terms_arr[pos][WORD_POS] for pos in range(run_end, total))

    assert(skip != 0)
    sentence_arr.append(masked)
    return skip
def capitalize(terms_arr):
    """Upper-case the first letter of each word whose tag is in ``cap_tags``.

    ``terms_arr`` is a list of term records and is modified in place;
    nothing is returned. Words with other tags, and the remainder of each
    capitalized word, are left unchanged.
    """
    for term_tag in terms_arr:
        if term_tag[TAG_POS] not in cap_tags:
            continue
        word = term_tag[WORD_POS]
        # Slice instead of indexing word[0]: an empty word (e.g. produced by
        # a bare ":__entity__" input token) stays empty rather than raising
        # IndexError.
        term_tag[WORD_POS] = word[:1].upper() + word[1:]
def set_POS_based_on_entities(sent):
    """Turn a whitespace-separated sentence into a list of term records.

    Each record is a five-slot list pre-filled with '-'. A word that ends
    with ``INPUT_MASK_TAG`` gets the first noun tag and has the marker
    removed from its text; every other word gets ``RESET_POS_TAG`` and is
    stored as is.
    """
    def _record(token):
        record = ['-'] * 5
        if token.endswith(INPUT_MASK_TAG):
            record[TAG_POS] = noun_tags[0]
            record[WORD_POS] = token.replace(INPUT_MASK_TAG,"")
        else:
            record[TAG_POS] = RESET_POS_TAG
            record[WORD_POS] = token
        return record

    return [_record(token) for token in sent.split()]
def filter_common_noun_spans(span_arr,masked_sent_arr,terms_arr,common_descs):
    """Drop the spans that consist only of common words.

    ``span_arr`` has one 0/1 flag per term, and each run of 1s corresponds,
    in order, to one entry of ``masked_sent_arr``. A run is dropped when the
    lower-cased word of every term in it is found in ``common_descs``.

    Returns ``(kept_sentences, kept_spans)``: the masked sentences of the
    surviving runs, and a copy of ``span_arr`` with the dropped runs set to
    0. Dropped runs are reported on stdout.
    """
    kept_spans = span_arr.copy()
    kept_sentences = []
    total = len(span_arr)
    sent_no = 0  # index of the masked sentence matching the current run
    pos = 0
    while pos < total:
        if span_arr[pos] != 1:
            pos += 1
            continue

        # Walk to the end of this run of 1s.
        run_start = pos
        while pos < total and span_arr[pos] == 1:
            pos += 1
        run = range(run_start, pos)

        # Evaluate every term (no short-circuit), as the original loop did.
        common_flags = [terms_arr[k][WORD_POS].lower() in common_descs for k in run]
        if all(common_flags):
            print("Filtering common span: ",end='')
            for k in run:
                print(terms_arr[k][WORD_POS],' ',end='')
                kept_spans[k] = 0
            print()
        else:
            kept_sentences.append(masked_sent_arr[sent_no])
        # The sentence index advances whether the run was kept or dropped.
        sent_no += 1
    return kept_sentences,kept_spans
def normalize_casing(sent):
    """Lower-case everything but the first character of each word.

    The sentence is split on whitespace and re-joined with single spaces, so
    runs of whitespace collapse. The first character of every word keeps
    whatever case it had.
    """
    # word[1:] is empty for one-character words, so no special case is needed.
    return ' '.join(word[0] + word[1:].lower() for word in sent.split())

common_descs.txt ADDED Viewed

	@@ -0,0 +1,149 @@

+a
+all
+an
+and
+any
+are
+as
+at
+away
+be
+beside
+but
+by
+can
+come
+did
+do
+each
+etc
+far
+free
+get
+gets
+getting
+give
+given
+gives
+giving
+go
+goes
+going
+gonna
+good
+got
+gotta
+greatly
+grow
+growing
+guess
+had
+has
+how
+in
+is
+it
+its
+itself
+keep
+keeps
+kept
+key
+lack
+led
+let
+lets
+like
+liked
+likely
+long
+look
+looking
+looks
+lose
+loss
+lost
+lot
+lots
+lou
+loud
+made
+make
+matter
+mean
+meaning
+means
+meant
+meet
+meeting
+meets
+mere
+merely
+more
+most
+mostly
+move
+much
+must
+need
+needed
+needing
+needs
+new
+next
+nice
+nobody
+of
+off
+on
+once
+ongoing
+only
+or
+place
+placed
+reach
+same
+saying
+show
+side
+some
+the
+then
+this
+thence
+thing
+though
+until
+unto
+usual
+usually
+wanna
+want
+wanted
+wanting
+wants
+was
+when
+where
+whereas
+whereby
+wherein
+whether
+which
+while
+whilst
+whoever
+whom
+why
+with
+within
+without
+would
+both
+high
+called
+from
+entitled
+using
+to

config_utils.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import json
def write_config(configs,file_name='server_config.json'):
    """Echo ``configs`` as JSON on stdout and save the same JSON to ``file_name``.

    The target file is created or overwritten. Nothing is returned.
    """
    serialized = json.dumps(configs)
    print(serialized)
    with open(file_name, 'w') as outfile:
        outfile.write(serialized)
def read_config(file_name='server_config.json'):
    """Load a JSON config file and return its parsed content.

    Returns whatever ``json.load`` yields for the file. If the file cannot
    be opened or read, or its content is not valid JSON, a message is
    printed and an empty dict is returned instead of raising.
    """
    try:
        with open(file_name) as json_file:
            return json.load(json_file)
    # OSError covers missing or unreadable files; ValueError covers
    # json.JSONDecodeError and decoding errors. A bare ``except`` would also
    # swallow KeyboardInterrupt and SystemExit.
    except (OSError, ValueError):
        print("Unable to open config file:",file_name)
        return {}

entity_types_consolidated.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+THERAPEUTIC_OR_PREVENTIVE_PROCEDURE DRUG/CHEMICAL_SUBSTANCE/HAZARDOUS_OR_POISONOUS_SUBSTANCE/ESTABLISHED_PHARMACOLOGIC_CLASS/CHEMICAL_CLASS/VITAMIN/LAB_PROCEDURE/SURGICAL_AND_MEDICAL_PROCEDURES/DIAGNOSTIC_PROCEDURE/LAB_TEST_COMPONENT/STUDY/DRUG_ADJECTIVE
+DISEASE MENTAL_OR_BEHAVIORAL_DYSFUNCTION/CONGENITAL_ABNORMALITY/CELL_OR_MOLECULAR_DYSFUNCTION/DISEASE_ADJECTIVE
+GENE PROTEIN/ENZYME/VIRAL_PROTEIN/RECEPTOR/PROTEIN_FAMILY/MOUSE_PROTEIN_FAMILY/MOUSE_GENE/NUCLEOTIDE_SEQUENCE/GENE_EXPRESSION_ADJECTIVE
+BODY_PART_OR_ORGAN_COMPONENT BODY_LOCATION_OR_REGION/BODY_SUBSTANCE/CELL/CELL_LINE/CELL_COMPONENT/BIO_MOLECULE/METABOLITE/HORMONE/BODY_ADJECTIVE
+ORGANISM_FUNCTION ORGAN_OR_TISSUE_FUNCTION/PHYSIOLOGIC_FUNCTION/CELL_FUNCTION/FUNCTION_ADJECTIVE
+BIO SPECIES/BACTERIUM/VIRUS/BIO_ADJECTIVE
+OBJECT PRODUCT/MEDICAL_DEVICE/DEVICE/DEVICE_ADJECTIVE
+MEASURE NUMBER/TIME/SEQUENCE/MEASURE_ADJECTIVE
+PERSON PERSON_ADJECTIVE
+ORGANIZATION UNIV/GOV/EDU/ORGANIZATION_ADJECTIVE
+ENT SPORT/MOV/MUSIC/ENT_ADJECTIVE
+LOCATION LOCATION_ADJECTIVE
+SOCIAL_CIRCUMSTANCES RELIGION/SOCIAL_CIRCUMSTANCES_ADJECTIVE
+COLOR COLOR_ADJECTIVE
+LANGUAGE LANGUAGE_ADJECTIVE
+GRAMMAR_CONSTRUCT
+OTHER
+UNTAGGED_ENTITY

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ flair
2	+ st-annotated-text
3	+

untagged_terms.txt ADDED Viewed

File without changes