drkareemkamal committed
Commit d4ddbea · verified · 1 Parent(s): 059dad7

Create app.py

Files changed (1): app.py (+141, -0)
app.py ADDED
@@ -0,0 +1,141 @@
import os
import streamlit as st
import torch
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import CTransformers, HuggingFaceHub
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

# ==== Configuration ====
pdfs_directory = 'pdfs'
vectorstores_directory = 'vectorstores_medical'
os.makedirs(pdfs_directory, exist_ok=True)
os.makedirs(vectorstores_directory, exist_ok=True)

PREDEFINED_BOOKS = [f for f in os.listdir(pdfs_directory) if f.endswith(".pdf")]

TEMPLATE = """
You are a medical assistant with deep clinical knowledge.
Use the following retrieved context to answer the question.
If unsure, say "I don't know." Keep answers accurate, concise, and clear.

Question: {question}
Context: {context}
Answer:
"""

# ==== Embedding Model (Medical) ====
embedding_model = HuggingFaceEmbeddings(
    model_name='pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb',
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": False}
)
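# The embedding model above is a sentence-transformers checkpoint based on
# BioBERT, fine-tuned on biomedical NLI/STS data; it tends to give better
# similarity search on clinical text than general-purpose embeddings.
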
# ==== LLM ====
# Option 1: local quantized medical model via ctransformers (offline, no API token).
# llm = CTransformers(
#     model='TheBloke/MedAlpaca-7B-GGUF',
#     model_file='medalpaca-7b.Q4_K_M.gguf',
#     model_type='llama',
#     config={'max_new_tokens': 512, 'temperature': 0.4}
# )

# Option 2: hosted model on the Hugging Face Hub.
# Read the token from the environment rather than hard-coding a credential.
hf_token = os.environ.get("HF_TOKEN")

llm = HuggingFaceHub(
    repo_id="epfl-llm/meditron-7b",  # or BioGPT, GatorTron, ClinicalT5, etc.
    model_kwargs={"temperature": 0.4, "max_new_tokens": 512},
    huggingfacehub_api_token=hf_token
)
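# Usage note (assumes the token is exported before launching the app, e.g.
# via the host's secrets):
#   export HF_TOKEN=hf_...
#   streamlit run app.py
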
# ==== Helpers ====
def split_text(documents):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True
    )
    return splitter.split_documents(documents)

def get_vectorstore_path(book_filename):
    base_name = os.path.splitext(book_filename)[0]
    return os.path.join(vectorstores_directory, base_name)

def load_or_create_vectorstore(book_filename, documents=None):
    vs_path = get_vectorstore_path(book_filename)

    if os.path.exists(os.path.join(vs_path, "index.faiss")):
        return FAISS.load_local(vs_path, embedding_model, allow_dangerous_deserialization=True)

    if documents is None:
        raise ValueError("Documents required to create vector store.")

    with st.spinner(f"⏳ Creating vector store for '{book_filename}'..."):
        os.makedirs(vs_path, exist_ok=True)
        chunks = split_text(documents)
        vector_store = FAISS.from_documents(chunks, embedding_model)
        vector_store.save_local(vs_path)
        st.success(f"✅ Vector store created for '{book_filename}'.")
        return vector_store
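# save_local() persists index.faiss and index.pkl under vs_path, so later
# runs for the same book take the cached load_local() branch above.
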
def retrieve_docs(vector_store, query):
    return vector_store.similarity_search(query)
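# similarity_search() returns the top k=4 most similar chunks by default; a
# hedged variant if more (or less) context is wanted:
#     vector_store.similarity_search(query, k=8)
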
def answer_question(question, documents):
    context = "\n\n".join(doc.page_content for doc in documents)
    prompt = ChatPromptTemplate.from_template(TEMPLATE)
    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.run({"question": question, "context": context})
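# Note: on recent LangChain releases LLMChain.run() is deprecated; the
# equivalent call is chain.invoke({...})["text"].
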
def upload_pdf(file):
    save_path = os.path.join(pdfs_directory, file.name)
    with open(save_path, "wb") as f:
        f.write(file.getbuffer())
    return file.name

def load_pdf(file_path):
    loader = PDFPlumberLoader(file_path)
    return loader.load()

# ==== Streamlit App ====
st.set_page_config(page_title="🩺 Medical PDF Chat", layout="centered")
st.title("📚 Medical Assistant - PDF Q&A")

with st.sidebar:
    st.header("Select or Upload a Medical Book")
    selected_book = st.selectbox("Choose a PDF", PREDEFINED_BOOKS + ["Upload new book"])

    if selected_book == "Upload new book":
        uploaded_file = st.file_uploader("Upload Medical PDF", type="pdf")
        if uploaded_file:
            filename = upload_pdf(uploaded_file)
            st.success(f"📥 Uploaded: {filename}")
            selected_book = filename

# ==== Main Logic ====
if selected_book and selected_book != "Upload new book":
    st.info(f"📖 You selected: {selected_book}")
    file_path = os.path.join(pdfs_directory, selected_book)
    vectorstore_path = get_vectorstore_path(selected_book)

    try:
        if os.path.exists(os.path.join(vectorstore_path, "index.faiss")):
            st.success("✅ Vector store already exists. Using cached version.")
            vector_store = load_or_create_vectorstore(selected_book)
        else:
            documents = load_pdf(file_path)
            vector_store = load_or_create_vectorstore(selected_book, documents)

        # Chat Input
        question = st.chat_input("Ask your medical question...")
        if question:
            st.chat_message("user").write(question)
            related_docs = retrieve_docs(vector_store, question)
            answer = answer_question(question, related_docs)
            st.chat_message("assistant").write(answer)

    except Exception as e:
        st.error(f"❌ Error loading or processing the PDF: {e}")