Spaces:

ANLPRL
/

NER_On_Oral_Medicine

Sleeping

App Files Files Community

ANLPRL committed on Apr 22, 2023

Commit

5d58462

•

1 Parent(s): 9112cbc

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -69

app.py CHANGED Viewed

@@ -1,55 +1,14 @@
-import streamlit as st
 import pickle
 import numpy as np
-import pandas as pd
-from transformers import AutoTokenizer,AutoModel
-import torch
-import tensorflow as tf
 from keras.models import load_model
-import re
 import io
 import PyPDF2
-def predict(new_data):
-    tokens = tokenizer(new_data.split(), padding=True, truncation=True, max_length=128, return_tensors='pt')
-    with torch.no_grad():
-        embeddings = model(tokens['input_ids'], attention_mask=tokens['attention_mask'])[0][:, 0, :].numpy()
-    y_pred = rf.predict(embeddings)
-    prev_label=" "
-    text=new_data.split()
-    data=[]
-    labels=[]
-    for i,(word,label) in enumerate(zip(text,y_pred)):
-        if label!="Other":
-            label=label.split('-')[1]
-        if prev_label==label:
-                data[-1]=data[-1]+" "+word
-        else:
-            data.append(word)
-            labels.append(label)
-        prev_label=label
-    return(data,labels)
-def highlight(sentence):
-    highlighted_text = ""
-    entity_colors = {"Symptom":"#87cefa","Medical Condition":"#ffb6c1"}
-    words, labels = predict(sentence)
-    for words, label in zip(words, labels):
-        prev_label=""
-        if label!="Other" and words!="a":
-            if label in ["Medical Condition","Symptom"]:
-                    word_color = entity_colors.get(label, "yellow")
-                    label_color = entity_colors.get(label + '-label', "<b>black</b>")
-                    highlighted_text += f'<mark style="background-color: {word_color}; color: {label_color}; padding: 0 0.25rem; border-radius: 0.25rem; border: 2px solid {word_color}; border-bottom-width: 1px">{words}<sup style="background-color: white; color: black; border: 1px solid black; border-radius: 2px; padding: 0 0.15rem; font-size: 70%; margin-left: 0.15rem; font-weight: bold;">{label}</sup></mark> '
-            else:
-                highlighted_text += f'{words} '
-        else:
-            highlighted_text += f'{words} '
-    st.markdown(highlighted_text, unsafe_allow_html=True)
 def read_uploaded_file(uploaded_file):
@@ -88,16 +47,60 @@ def preprocess(text):
     return text
-#Load the trained model
-with open("biobert_rf.pkl", 'rb') as f:
-    rf = pickle.load(f)
-# Load the BioBERT model and tokenizer
-model_name = "dmis-lab/biobert-base-cased-v1.1"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModel.from_pretrained(model_name)
-st.title('Oral Medicine Meets NLP')
-st.subheader('Named Entity Recoginition System For Oral Medicine ')
-sentence = st.text_area('Enter a sentence:')
 st.write("OR")
 uploaded_file = st.file_uploader("Upload a file")
@@ -105,7 +108,7 @@ uploaded_file = st.file_uploader("Upload a file")
 if uploaded_file is not None:
     # Do something with the file
     st.write("File uploaded!")
 st.write("OR")
 selected_options = st.selectbox(
 'Choose a text from dropdown: ',
@@ -114,6 +117,7 @@ selected_options = st.selectbox(
     'Hemophilia is a genetic illness that mainly affects the blood ability to clot properly. Individuals with significant hemophilia are at an elevated possibility of experiencing unforeseen bleeding episodes, which can occur in various parts of the body, including the mouth. Oral bleeding can be a sign of hemophilia and can present as  gum bleeding or mouth sores.',
     "Von Willebrand disease VWD  is a genetic condition that impairs the blood's ability to  clot properly. One of the symptoms of VWD is spontaneous gingival bleeding , which can occur without any apparent cause or trauma"))  # set default to None
 # Define the colors for each label
 if st.button('Analyze'):
@@ -123,17 +127,8 @@ if st.button('Analyze'):
         text=read_uploaded_file(uploaded_file)
         text=preprocess(text)
         highlight(text)
-    elif selected_options:
         highlight(selected_options)
     else:
-        st.write("Please enter a text or select an example to analyze")

+from transformers import AutoTokenizer, TFAutoModel
+import tensorflow as tf
+#from keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.sequence import pad_sequences
 import pickle
 import numpy as np
 from keras.models import load_model
+import streamlit as st
 import io
 import PyPDF2
+import re
 def read_uploaded_file(uploaded_file):
     return text
+def predict(new_data):
+    """Label each whitespace-separated word of ``new_data`` and merge adjacent words that share a label.
+
+    Uses the module-level ``tokenizer``, ``biobert_model``, ``model`` and
+    ``le`` objects, which are created outside this function.
+
+    Args:
+        new_data: Text to tag. It is split on whitespace and every word is
+            encoded on its own.
+
+    Returns:
+        A ``(data, labels)`` tuple of two lists built in lockstep: ``data``
+        holds the merged word spans and ``labels`` the label of each span
+        (either "Other" or the second '-'-separated field of the predicted
+        label).
+
+    Raises:
+        IndexError: If a predicted label other than "Other" contains no '-'.
+    """
+    # Encode each word separately (special tokens included), then pad or
+    # truncate every id sequence at the end to a fixed length of 22.
+    # NOTE(review): behavior for empty/whitespace-only input (no words) is not
+    # handled explicitly here -- confirm what the model calls do in that case.
+    X_tokens = [tokenizer.encode(text, add_special_tokens=True) for text in new_data.split()]
+    X_padded = pad_sequences(X_tokens, maxlen=22, dtype='long', truncating='post', padding='post')
+    X_tensor = tf.convert_to_tensor(X_padded)
+    # First output of the BioBERT model is fed to the classifier
+    # (presumably the per-token hidden states -- TODO confirm).
+    X_embeddings = biobert_model(X_tensor)[0]
+    pred=model.predict(X_embeddings)
+    # Take the arg-max along axis 1 and map the indices back to label strings.
+    predicted_labels = list(le.inverse_transform(np.argmax(pred, axis=1)))
+    text=new_data.split()
+    # Sentinel that matches no real label, so the first word always opens a span.
+    prev_label=" "
+    data=[]
+    labels=[]
+    for i,(word,label) in enumerate(zip(text,predicted_labels)):
+        if label!="Other":
+            # Keep only the second '-'-separated field of the label
+            # (presumably strips a B-/I- style prefix -- verify against the encoder classes).
+            label=label.split('-')[1]
+        if prev_label==label:
+                # Same label as the previous word: extend the current span.
+                data[-1]=data[-1]+" "+word
+        else:
+            data.append(word)
+            labels.append(label)
+        prev_label=label
+    return(data,labels)
+def highlight(sentence):
+    """Render ``sentence`` in Streamlit with "Symptom" and "Medical Condition" spans highlighted.
+
+    Calls ``predict(sentence)`` to get spans and labels, wraps spans labelled
+    "Symptom" or "Medical Condition" in a coloured ``<mark>`` element with the
+    label shown in a ``<sup>`` tag, and emits all other spans as plain text.
+    The result is written with ``st.markdown(..., unsafe_allow_html=True)``;
+    nothing is returned.
+
+    Args:
+        sentence: Text to analyse and display.
+    """
+    highlighted_text = ""
+    entity_colors = {"Symptom":"#87cefa","Medical Condition":"#ffb6c1"}
+    words, labels = predict(sentence)
+    # NOTE(review): the loop variable below reuses the name ``words``, shadowing
+    # the list returned by predict(); inside the loop ``words`` is a single span.
+    for words, label in zip(words, labels):
+        # A span that is exactly "a" is never highlighted, whatever its label.
+        if label!="Other" and words!="a":
+            if label in ["Medical Condition","Symptom"]:
+                    # Both labels accepted above are keys of entity_colors, so
+                    # the "yellow" fallback is not reached on this path.
+                    word_color = entity_colors.get(label, "yellow")
+                    # NOTE(review): entity_colors has no '<label>-label' keys, so this
+                    # always yields the default "<b>black</b>", which is then placed
+                    # in the CSS ``color:`` property -- confirm this is intended.
+                    label_color = entity_colors.get(label + '-label', "<b>black</b>")
+                    # NOTE(review): ``words`` and ``label`` are interpolated without
+                    # HTML escaping and rendered with unsafe_allow_html=True below.
+                    highlighted_text += f'<mark style="background-color: {word_color}; color: {label_color}; padding: 0 0.25rem; border-radius: 0.25rem; border: 2px solid {word_color}; border-bottom-width: 1px">{words}<sup style="background-color: white; color: black; border: 1px solid black; border-radius: 2px; padding: 0 0.15rem; font-size: 70%; margin-left: 0.15rem; font-weight: bold;">{label}</sup></mark> '
+            else:
+                # Any other non-"Other" label is shown as plain text.
+                highlighted_text += f'{words} '
+        else:
+            highlighted_text += f'{words} '
+    st.markdown(highlighted_text, unsafe_allow_html=True)
+# Create a LabelEncoder object
+with open("label_encoder.pkl", 'rb') as f:
+  le = pickle.load(f)
+model= tf.keras.models.load_model("biobert_rnn_weightless.h5")
+tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
+biobert_model = TFAutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1", from_pt=True)
+st.title('Named Entity Recognition')
+sentence = st.text_input('Enter a sentence:')
 st.write("OR")
 uploaded_file = st.file_uploader("Upload a file")
 if uploaded_file is not None:
     # Do something with the file
     st.write("File uploaded!")
 st.write("OR")
 selected_options = st.selectbox(
 'Choose a text from dropdown: ',
     'Hemophilia is a genetic illness that mainly affects the blood ability to clot properly. Individuals with significant hemophilia are at an elevated possibility of experiencing unforeseen bleeding episodes, which can occur in various parts of the body, including the mouth. Oral bleeding can be a sign of hemophilia and can present as  gum bleeding or mouth sores.',
     "Von Willebrand disease VWD  is a genetic condition that impairs the blood's ability to  clot properly. One of the symptoms of VWD is spontaneous gingival bleeding , which can occur without any apparent cause or trauma"))  # set default to None
 # Define the colors for each label
 if st.button('Analyze'):
         text=read_uploaded_file(uploaded_file)
         text=preprocess(text)
         highlight(text)
+    elif selected_options:
         highlight(selected_options)
     else:
+        st.write('Please enter a sentence or select an option from the dropdown.')