Muhammad Haris committed on
Commit
04dafd4
·
1 Parent(s): 664874d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -0
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ import numpy as np
5
+ import re
6
+ import os
7
+ import gdown
8
+ from sentence_transformers import SentenceTransformer
9
+
10
# Download the dataset from Google Drive into the current user's home directory.
file_id = '1P3Nz6f3KG0m0kO_2pEfnVIhgP8Bvkl4v'
url = f'https://drive.google.com/uc?id={file_id}'
# NOTE: the name says "excel" for historical reasons; the file is a CSV.
excel_file_path = os.path.join(os.path.expanduser("~"), 'medical_data.csv')

gdown.download(url, excel_file_path, quiet=False)

# Fail early with an actionable message when there is nothing to read,
# instead of letting pandas report a bare missing-file error further down.
if not os.path.isfile(excel_file_path):
    raise FileNotFoundError(
        f"Could not download the medical dataset from {url} to {excel_file_path}"
    )

# Read the CSV file into a DataFrame. UTF-8 is tried first; 'latin1' is the
# fallback because it can decode any byte sequence.
try:
    medical_df = pd.read_csv(excel_file_path, encoding='utf-8')
except UnicodeDecodeError:
    medical_df = pd.read_csv(excel_file_path, encoding='latin1')
22
+
23
# Matches a run of digits followed by a dot at the very start of the string,
# e.g. the "12." in "12. What is asthma?". Compiled once at import time.
_LEADING_NUMBER_RE = re.compile(r'^\d+\.')


def remove_digits_with_dot(input_string):
    """Strip a leading "<digits>." enumeration prefix from a string.

    Only a prefix at the very beginning of the string is removed; whitespace
    that follows the dot is kept as-is.

    Args:
        input_string: The text to clean. Non-string values (for example the
            float NaN pandas uses for empty cells) are returned unchanged
            instead of raising ``TypeError``.

    Returns:
        The string without its leading numbering, or the original value when
        it is not a string.
    """
    if not isinstance(input_string, str):
        return input_string

    return _LEADING_NUMBER_RE.sub('', input_string)
31
+
32
# Drop the leading "N." numbering from every question.
medical_df["Questions"] = medical_df["Questions"].map(remove_digits_with_dot)

# Keep only the rows that actually have an answer.
medical_df = medical_df.dropna(subset=["Answers"])
35
+
36
from InstructorEmbedding import INSTRUCTOR

model = INSTRUCTOR("hkunlp/instructor-large")

# Each answer is wrapped in its own one-element list and encoded separately.
corpus = medical_df["Answers"].apply(lambda x: [x]).tolist()
if not corpus:
    raise ValueError("The medical dataset contains no answers to embed.")

answer_embeddings = np.array([model.encode(answer) for answer in corpus])

# Collapse the stacked per-answer results into a 2-D matrix with one row per
# answer. The row count comes from the data and the width is inferred, so this
# keeps working when the dataset grows or shrinks (previously hard-coded to
# 148 rows of 768 values).
answer_embeddings = answer_embeddings.reshape(len(corpus), -1)
46
+
47
def get_answer(query):
    """Return the dataset entry whose answer is most similar to *query*.

    The query is encoded together with a retrieval instruction, compared to
    the module-level ``answer_embeddings`` by cosine similarity, and the
    highest-scoring row of ``medical_df`` is selected.

    Args:
        query: The user's question text.

    Returns:
        A ``(question, answer, references)`` tuple taken from the "Questions",
        "Answers" and "References" columns of the best-matching row.
    """
    instruction = 'Represent the Wikipedia question for retrieving supporting documents: '
    query_embedding = model.encode([[instruction, query]])

    scores = cosine_similarity(query_embedding, answer_embeddings)
    best_position = np.argmax(scores)

    # Fetch the winning row once and read all three fields from it.
    best_row = medical_df.iloc[best_position]
    return (best_row["Questions"], best_row["Answers"], best_row["References"])
62
+
63
# Streamlit app: one text box; when it is non-empty, show the best match.
st.title("Medical QA System")

user_input = st.text_input("Ask a medical question:")
if user_input:
    result = get_answer(user_input)
    # Render each field of the (question, answer, references) tuple under its
    # own heading, in that order.
    for heading, value in zip(("Question:", "Answer:", "References:"), result):
        st.subheader(heading)
        st.write(value)