Muhammad Haris
committed on
Commit
·
04dafd4
1
Parent(s):
664874d
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
4 |
+
import numpy as np
|
5 |
+
import re
|
6 |
+
import os
|
7 |
+
import gdown
|
8 |
+
from sentence_transformers import SentenceTransformer
|
9 |
+
|
10 |
+
# Google Drive source of the dataset and the local path it is saved to
# (a CSV file placed in the current user's home directory).
file_id = '1P3Nz6f3KG0m0kO_2pEfnVIhgP8Bvkl4v'
url = f'https://drive.google.com/uc?id={file_id}'
excel_file_path = os.path.join(os.path.expanduser("~"), 'medical_data.csv')

gdown.download(url, excel_file_path, quiet=False)

# Load the CSV as UTF-8 first; only if that fails to decode, read it again
# as latin1.
try:
    medical_df = pd.read_csv(excel_file_path, encoding='utf-8')
except UnicodeDecodeError:
    medical_df = pd.read_csv(excel_file_path, encoding='latin1')
|
22 |
+
|
23 |
+
# Matches one or more digits followed by a dot at the very start of a string.
# Compiled once here rather than on every call.
_LEADING_NUMBER_RE = re.compile(r'^\d+\.')


def remove_digits_with_dot(input_string):
    """Remove a leading ``<digits>.`` marker from *input_string*.

    Only a marker at the very beginning of the string is removed; digits and
    dots elsewhere are left alone, and any whitespace that followed the
    marker is kept.

    Args:
        input_string: The text to clean. Values that are not ``str`` (for
            example ``None`` or a float NaN from an empty spreadsheet cell)
            are returned unchanged instead of raising ``TypeError``.

    Returns:
        The string without its leading number marker, or the original value
        when it is not a string.
    """
    # Missing cells arrive as non-string values; pass them through untouched
    # so a single empty cell does not abort a column-wide apply().
    if not isinstance(input_string, str):
        return input_string

    return _LEADING_NUMBER_RE.sub('', input_string)
|
31 |
+
|
32 |
+
# Clean every question by stripping its leading "<digits>." marker.
medical_df["Questions"] = medical_df["Questions"].map(remove_digits_with_dot)

# Keep only the rows that actually have an answer.
medical_df = medical_df.dropna(subset=["Answers"])
|
35 |
+
|
36 |
+
from InstructorEmbedding import INSTRUCTOR

# NOTE(review): Streamlit re-executes the script on user interaction, so the
# model load and the encoding below presumably repeat for every query.
# Consider caching them; confirm which caching API the deployed Streamlit
# version offers.
model = INSTRUCTOR("hkunlp/instructor-large")

# Each answer is wrapped in its own one-element list, so every encode() call
# below receives exactly one text.
corpus = medical_df["Answers"].apply(lambda x:[x]).tolist()

# Encode the answers one at a time and collect the results in a single array.
answer_embeddings = np.array([model.encode(answer) for answer in corpus])

# Flatten to one row per answer. The row count comes from the corpus and the
# width is inferred by NumPy, so nothing here depends on a fixed dataset size
# or embedding dimension.
answer_embeddings = answer_embeddings.reshape(len(corpus), -1)
|
46 |
+
|
47 |
+
def get_answer(query):
    """Look up the stored entry whose answer best matches *query*.

    The query is paired with a retrieval instruction, encoded with the
    module-level ``model``, and scored by cosine similarity against
    ``answer_embeddings``. The row of ``medical_df`` at the position of the
    highest score is returned.

    Args:
        query: The user's question text.

    Returns:
        A ``(question, answer, references)`` tuple taken from the
        "Questions", "Answers" and "References" columns of the best row.
    """
    instruction = 'Represent the Wikipedia question for retrieving supporting documents: '
    query_embedding = model.encode([[instruction, query]])

    scores = cosine_similarity(query_embedding, answer_embeddings)

    # argmax without an axis indexes the flattened score array; this is used
    # directly as a positional row index.
    best_position = np.argmax(scores)

    # Fetch the row once and read all three fields from it.
    best_row = medical_df.iloc[best_position]
    return (best_row["Questions"], best_row["Answers"], best_row["References"])
|
62 |
+
|
63 |
+
# Streamlit page: a title, one text box, and the best-matching entry rendered
# as three headed sections.
st.title("Medical QA System")

user_input = st.text_input("Ask a medical question:")
if user_input:
    result = get_answer(user_input)
    # get_answer returns (question, answer, references), in that order.
    for heading, value in zip(("Question:", "Answer:", "References:"), result):
        st.subheader(heading)
        st.write(value)
|