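"""Streamlit demo app: named-entity recognition for oral-medicine text.

Each word of the input is embedded with BioBERT (its [CLS] vector) and classified
by a pre-trained random forest loaded from biobert_rf.pkl; recognised Symptom and
Medical Condition spans are highlighted in the rendered output.
"""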
import streamlit as st
import pickle
from transformers import AutoTokenizer, AutoModel
import torch
import re
import io
import PyPDF2


def predict(new_data):
    """Classify each word of `new_data` and merge consecutive words that share a label."""
    # Embed every word separately with BioBERT and use its [CLS] vector as the feature.
    tokens = tokenizer(new_data.split(), padding=True, truncation=True, max_length=128, return_tensors='pt')
    with torch.no_grad():
        embeddings = model(tokens['input_ids'], attention_mask=tokens['attention_mask'])[0][:, 0, :].numpy()
    y_pred = rf.predict(embeddings)
    prev_label = " "
    text = new_data.split()
    data = []
    labels = []
    for word, label in zip(text, y_pred):
        if label != "Other":
            # Strip the BIO prefix, e.g. "B-Symptom" -> "Symptom".
            label = label.split('-')[1]
        if prev_label == label:
            # Same label as the previous word: extend the current span.
            data[-1] = data[-1] + " " + word
        else:
            data.append(word)
            labels.append(label)
        prev_label = label
    return data, labels
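# Illustrative (hypothetical) call: predict("patient reports gum bleeding") might
# return (["patient reports", "gum bleeding"], ["Other", "Symptom"]), depending on
# how the pickled classifier labels each word.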


def highlight(sentence):
    """Render the sentence in Streamlit with recognised entities highlighted."""
    highlighted_text = ""
    entity_colors = {"Symptom": "#87cefa", "Medical Condition": "#ffb6c1"}
    words, labels = predict(sentence)
    for word, label in zip(words, labels):
        if label in ("Medical Condition", "Symptom") and word != "a":
            word_color = entity_colors.get(label, "yellow")
            highlighted_text += (
                f'<mark style="background-color: {word_color}; color: black; '
                f'padding: 0 0.25rem; border-radius: 0.25rem; '
                f'border: 2px solid {word_color}; border-bottom-width: 1px">{word}'
                f'<sup style="background-color: white; color: black; border: 1px solid black; '
                f'border-radius: 2px; padding: 0 0.15rem; font-size: 70%; '
                f'margin-left: 0.15rem; font-weight: bold;">{label}</sup></mark> '
            )
        else:
            highlighted_text += f'{word} '
    st.markdown(highlighted_text, unsafe_allow_html=True)
    


def read_uploaded_file(uploaded_file):
    content = None
    if uploaded_file is not None:
        content_type = uploaded_file.type
        if content_type == 'application/pdf':
            content = read_pdf_file(uploaded_file)
        elif content_type == 'text/plain':
            content = read_text_file(uploaded_file)
    return content

def read_pdf_file(uploaded_file):
    with io.BytesIO(uploaded_file.read()) as f:
        pdf_reader = PyPDF2.PdfReader(f)
        text = ''
        for page in pdf_reader.pages:
            # extract_text() can return None for pages with no extractable text
            text += page.extract_text() or ''
    return text

def read_text_file(uploaded_file):
    with io.StringIO(uploaded_file.read().decode()) as f:
        text = f.read()
    return text

   
def preprocess(text):
    # Remove URLs, @user names, and any character that is not alphanumeric or
    # whitespace, replacing each match with a space. The @\w+ alternative must
    # come before the character class so whole user names are stripped.
    pattern = re.compile(r'https?://\S+|@\w+|[^0-9A-Za-z \t]')
    return pattern.sub(' ', text)
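# Rough illustration (hypothetical input): preprocess("Mild pain, see https://example.com")
# yields roughly "Mild pain  see  " -- URLs, punctuation, and @handles become spaces.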


# Load the trained random forest classifier (word-level, on BioBERT embeddings)
with open("biobert_rf.pkl", 'rb') as f:
    rf = pickle.load(f)
# Load the BioBERT model and tokenizer
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
st.title('Oral Medicine Meets NLP')
st.subheader('Named Entity Recognition System for Oral Medicine')
sentence = st.text_area('Enter a sentence:')

st.write("OR")
uploaded_file = st.file_uploader("Upload a file")

if uploaded_file is not None:
    # Do something with the file
    st.write("File uploaded!")
    
st.write("OR") 
selected_options = st.selectbox(
    'Choose a text from dropdown: ',
    (" ",  # blank placeholder shown by default
     'Anemia and gingival bleeding are connected in that anemia can be a contributing cause to the occurrence of gingival bleeding. Anemia is a condition characterized by a shortage in the number or quality of red blood cells, which can lead to a reduced ability of the blood to carry oxygen throughout the body.',
     "Hemophilia is a genetic illness that mainly affects the blood's ability to clot properly. Individuals with significant hemophilia are at an elevated risk of experiencing unforeseen bleeding episodes, which can occur in various parts of the body, including the mouth. Oral bleeding can be a sign of hemophilia and can present as gum bleeding or mouth sores.",
     "Von Willebrand disease (VWD) is a genetic condition that impairs the blood's ability to clot properly. One of the symptoms of VWD is spontaneous gingival bleeding, which can occur without any apparent cause or trauma."))

if st.button('Analyze'):
    if sentence:
        highlight(sentence)
    elif uploaded_file:
        text = read_uploaded_file(uploaded_file)
        text = preprocess(text)
        highlight(text)
    elif selected_options.strip():
        # The blank placeholder option should not count as a selection.
        highlight(selected_options)
    else:
        st.write("Please enter a text or select an example to analyze")