import spacy
import streamlit as st
import pandas as pd
from PyPDF2 import PdfReader
from io import StringIO
import json
import warnings
import os
import ast


@st.cache(show_spinner=False, allow_output_mutation=True, suppress_st_warning=True)
#@st.cache_resource
def load_models(model_names: list, args: dict, model_names_dir: list) -> dict:
    """
    Check whether each model name refers to a fine-tuned model located in
    args['model_dir'] or to a default model native to spaCy, then load it
    with the appropriate method.

    Parameters:
        model_names: list of model names to load for inference
        args: dict of configuration parameters
        model_names_dir: list of model names found in args['model_dir'],
            i.e. the fine-tuned models

    Returns:
        model_dict: dictionary whose keys are the model names and whose
            values are the loaded models.
    """
    assert (model_names is not None) and (len(model_names) != 0), "No models available"
    model_dict = {}
    for model_name in model_names:
        print(model_name)
        if model_name in model_names_dir:
            # load a fine-tuned model from the model directory
            try:
                model_path = os.path.join(args['model_dir'], model_name)
                model = spacy.load(model_path)
            except OSError:
                warnings.warn(f"Path to {model_name} not found")
                continue
        else:
            # load a default model shipped with spaCy
            try:
                model = spacy.load(model_name)
            except OSError:
                warnings.warn(f"Model: {model_name} not found")
                continue
        model_dict.update({model_name: model})
        print('Model loaded')
    return model_dict


def process_text(doc: spacy.tokens.Doc, selected_entities: list, colors: dict) -> list:
    """
    Process the tokens of a Doc produced by a spaCy model so that tokens are
    grouped together by their corresponding entities. This lets the
    st-annotations component render the tokens for visualization.

    Example:
        Text: "Hi John, i am sick with cough and flu"
        Entities: person, disease
        Output: [(Hi)(John, 'person', blue)(i am sick)
                 (cough, 'disease', red)(and)(flu, 'disease', red)]

    Parameters:
        doc: spaCy Doc
        selected_entities: list of entity labels to display
        colors: mapping from entity label to display color

    Returns:
        tokens: list of plain strings and (span, entity, background,
            text color) tuples
    """
    tokens = []
    span = ''
    p_ent = None
    last = len(doc)
    for no, token in enumerate(doc):
        add_span = False
        for ent in selected_entities:
            if token.ent_type_ == ent:
                # the token belongs to a selected entity: extend the span
                span += token.text + " "
                p_ent = ent
                add_span = True
                # flush the span if the document ends on an entity token
                if no + 1 == last:
                    tokens.append((span, ent, colors[ent], '#464646'))
        if not add_span and len(span) > 1:
            # the entity span just ended: flush it before the plain token
            tokens.append((span, p_ent, colors[p_ent], '#464646'))
            span = ''
            p_ent = None
        if not add_span:
            tokens.append(" " + token.text + " ")
    return tokens
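# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, commented out so it never runs on import):
# how load_models() and process_text() are meant to fit together. The model
# name, the args values, and the annotated_text() call from the
# st-annotated-text package are assumptions for this example, not code used
# elsewhere in this module.
#
#   from annotated_text import annotated_text
#
#   models = load_models(
#       model_names=["en_core_web_sm"],   # default spaCy model
#       args={"model_dir": "models"},     # hypothetical config
#       model_names_dir=[],               # no fine-tuned models in this demo
#   )
#   doc = models["en_core_web_sm"]("Hi John, i am sick with cough and flu")
#   tokens = process_text(doc, ["PERSON"], {"PERSON": "#8ef"})
#   annotated_text(*tokens)               # renders inside the Streamlit app
# ---------------------------------------------------------------------------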
def process_text_compare(infer_input: dict, selected_entities: list, colors: dict) -> list:
    """
    Used when the user wants to compare text annotations between predictions
    and labels. Processes the tokens of an evaluation example so that tokens
    are grouped together by their corresponding entities, letting the
    st-annotations component render the tokens for visualization.

    Example:
        Text: "Hi John, i am sick with cough and flu"
        Entities: person, disease
        Output: [(Hi)(John, 'person', blue)(i am sick)
                 (cough, 'disease', red)(and)(flu, 'disease', red)]

    Parameters:
        infer_input: dict with 'text' and 'entities', where 'entities' is a
            list of (start, end, entity) character spans with inclusive ends
        selected_entities: list of entity labels to display
        colors: mapping from entity label to display color

    Returns:
        tokens: list of plain strings and (span, entity, background,
            text color) tuples
    """
    tokens = []
    start_ = 0
    end_ = len(infer_input['text'])
    for start, end, entities in infer_input['entities']:
        if entities in selected_entities:
            # the span of characters covered by the detected entity
            span = infer_input['text'][start:end + 1]
            # any unlabeled text between the previous span and this entity
            if start_ != start:
                b4_span = infer_input['text'][start_:start]
                tokens.append(" " + b4_span + " ")
            tokens.append((span, entities, colors[entities], '#464646'))
            start_ = end + 1
    if start_ < end_:
        # trailing unlabeled text after the last entity
        span = infer_input['text'][start_:]
        tokens.append(" " + span + " ")
    return tokens


def process_files(uploaded_file, text_input):
    """
    The app accepts uploads of multiple file types, at present json, csv,
    pdf and txt. Detect which kind of file has been uploaded and process it
    accordingly. If a file has been uploaded, its content replaces the
    existing text_input.

    Parameters:
        uploaded_file: Streamlit UploadedFile, a subclass of BytesIO and
            therefore "file-like"
        text_input: str / dict / list

    Returns:
        text_input: list / dict / str
    """
    if uploaded_file is not None:
        if uploaded_file.name.endswith('.csv'):
            # literal_eval turns the string form of a list back into a list
            text_input = pd.read_csv(uploaded_file,
                                     converters={'entities': ast.literal_eval})
            text_input = text_input.to_dict('records')
        elif uploaded_file.name.endswith('.json'):
            text_input = json.load(uploaded_file)
        else:
            try:
                # treat the upload as a plain utf-8 text file
                stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
                text_input = stringio.read()
            except UnicodeDecodeError:
                # not valid utf-8: treat it as a pdf and read all its pages
                reader = PdfReader(uploaded_file)
                text_input = [page.extract_text() for page in reader.pages]
                text_input = ''.join(text_input)
    return text_input
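# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, commented out so it never runs on import):
# the evaluation-data shape that process_text_compare() expects, and how an
# upload flows through process_files(). The sample record and its character
# offsets are made up for this example; they assume (start, end, label)
# spans with inclusive ends, matching the slicing above.
#
#   record = {
#       "text": "Hi John, i am sick with cough and flu",
#       "entities": [(3, 6, "person"), (24, 28, "disease"),
#                    (34, 36, "disease")],
#   }
#   tokens = process_text_compare(record, ["person", "disease"],
#                                 {"person": "#8ef", "disease": "#faa"})
#
#   uploaded_file = st.file_uploader("Upload a csv / json / pdf / txt file")
#   text_input = process_files(uploaded_file, text_input="")
# ---------------------------------------------------------------------------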