Spaces:
Runtime error
Runtime error
Arjun Moorthy
committed on
Commit
·
cefbc35
1
Parent(s):
92042a1
Add RAG capabilities and fix requirements.txt location
Browse files- .gitignore +22 -0
- Oncolife/app.py +233 -38
- requirements.txt +14 -5
.gitignore
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Ignore PDF files in guideline-docs (keep them local)
|
2 |
+
Oncolife/guideline-docs/*.pdf
|
3 |
+
|
4 |
+
# Ignore other binary files
|
5 |
+
*.pdf
|
6 |
+
*.docx
|
7 |
+
*.doc
|
8 |
+
|
9 |
+
# Keep specific files we want in git
|
10 |
+
!Oncolife/guideline-docs/*.json
|
11 |
+
!Oncolife/guideline-docs/*.txt
|
12 |
+
!Oncolife/guideline-docs/oncolife_alerts_configuration*.docx
|
13 |
+
!Oncolife/guideline-docs/WrittenChatbotDocument*.docx
|
14 |
+
|
15 |
+
# System files
|
16 |
+
.DS_Store
|
17 |
+
__pycache__/
|
18 |
+
*.pyc
|
19 |
+
|
20 |
+
# Environment files
|
21 |
+
oncolife_env311/
|
22 |
+
.env
|
Oncolife/app.py
CHANGED
@@ -4,15 +4,26 @@ OncoLife Symptom & Triage Assistant
|
|
4 |
A medical chatbot that performs both symptom assessment and clinical triage for chemotherapy patients.
|
5 |
Updated: Using BioMistral-7B base model for medical conversations.
|
6 |
REBUILD: Simplified to use only base model, no adapters.
|
|
|
7 |
"""
|
8 |
|
9 |
import gradio as gr
|
10 |
import os
|
11 |
import json
|
|
|
12 |
from transformers import AutoTokenizer, MistralForCausalLM
|
13 |
import torch
|
14 |
from spaces import GPU
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
# Force GPU detection for HF Spaces
|
17 |
@GPU
|
18 |
def force_gpu_detection():
|
@@ -37,14 +48,210 @@ class OncoLifeAssistant:
|
|
37 |
|
38 |
self._load_model(BASE, gpu_available)
|
39 |
|
40 |
-
#
|
41 |
-
self.
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
def _load_model(self, model_id, gpu_available):
|
50 |
"""Load the BioMistral base model"""
|
@@ -90,7 +297,7 @@ class OncoLifeAssistant:
|
|
90 |
self.tokenizer = None
|
91 |
|
92 |
def generate_oncolife_response(self, user_input, conversation_history):
|
93 |
-
"""Generate response using OncoLife
|
94 |
try:
|
95 |
if self.model is None or self.tokenizer is None:
|
96 |
return """β **Model Loading Error**
|
@@ -104,24 +311,24 @@ Please check the Space logs for details."""
|
|
104 |
|
105 |
print(f"π Generating OncoLife response for: {user_input}")
|
106 |
|
107 |
-
#
|
108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
-
|
111 |
-
1. Ask for symptoms if none provided
|
112 |
-
2. For each symptom, ask severity rating (mild/moderate/severe)
|
113 |
-
3. Check for red flags and immediate escalation needs
|
114 |
-
4. Grade severity using CTCAE or UKONS criteria
|
115 |
-
5. Ask targeted questions based on utility scoring
|
116 |
-
6. Provide structured summary with triage recommendations
|
117 |
|
118 |
-
|
119 |
-
- Never provide medical advice or treatment recommendations
|
120 |
-
- Always redirect to oncology team for medical decisions
|
121 |
-
- Escalate immediately for dangerous symptoms
|
122 |
-
- Add legal disclaimer at session end
|
123 |
|
124 |
-
Current
|
125 |
|
126 |
# Format conversation history
|
127 |
history_text = ""
|
@@ -190,10 +397,6 @@ Current conversation state: {conversation_state}"""
|
|
190 |
else:
|
191 |
answer = response.strip()
|
192 |
|
193 |
-
# Add legal disclaimer if this appears to be end of session
|
194 |
-
if any(keyword in user_input.lower() for keyword in ['done', 'finished', 'complete', 'summary']):
|
195 |
-
answer += "\n\n" + self._get_legal_disclaimer()
|
196 |
-
|
197 |
print("β
OncoLife response generated successfully")
|
198 |
return answer
|
199 |
|
@@ -210,14 +413,6 @@ This could be due to:
|
|
210 |
|
211 |
Please try a simpler question or check the logs for more details."""
|
212 |
|
213 |
-
def _get_legal_disclaimer(self):
|
214 |
-
"""Return the legal disclaimer as specified in the instructions"""
|
215 |
-
return """**Legal Disclaimer:**
|
216 |
-
|
217 |
-
Patient verbalizes agreement with plan of care and understanding of the information we have gone over today and has no further comments, questions or concerns at this time. Will follow up with Doctor or ONN if symptoms worsen, do not improve, or any other symptoms develop. Agrees to seek emergency care if pt believes is needed, including for increased dizziness, depression, or any thoughts of SI.
|
218 |
-
|
219 |
-
**Important:** I cannot provide medical advice or treatment recommendations. Please call your oncology team to confirm what's appropriate for your specific situation."""
|
220 |
-
|
221 |
def chat(self, message, history):
|
222 |
"""Main chat interface for OncoLife Assistant"""
|
223 |
if not message.strip():
|
@@ -231,7 +426,7 @@ Patient verbalizes agreement with plan of care and understanding of the informat
|
|
231 |
"assistant": assistant_msg
|
232 |
})
|
233 |
|
234 |
-
# Generate response using OncoLife
|
235 |
response = self.generate_oncolife_response(message, conversation_history)
|
236 |
|
237 |
return response
|
@@ -241,7 +436,7 @@ assistant = OncoLifeAssistant()
|
|
241 |
interface = gr.ChatInterface(
|
242 |
fn=assistant.chat,
|
243 |
title="π₯ OncoLife Symptom & Triage Assistant",
|
244 |
-
description="I'm here to help assess your symptoms and determine if you need to contact your care team.
|
245 |
examples=[
|
246 |
["I'm feeling nauseous and tired"],
|
247 |
["I have a fever of 101"],
|
|
|
4 |
A medical chatbot that performs both symptom assessment and clinical triage for chemotherapy patients.
|
5 |
Updated: Using BioMistral-7B base model for medical conversations.
|
6 |
REBUILD: Simplified to use only base model, no adapters.
|
7 |
+
RAG: Added document retrieval capabilities for PDFs and other reference materials.
|
8 |
"""
|
9 |
|
10 |
import gradio as gr
|
11 |
import os
|
12 |
import json
|
13 |
+
from pathlib import Path
|
14 |
from transformers import AutoTokenizer, MistralForCausalLM
|
15 |
import torch
|
16 |
from spaces import GPU
|
17 |
|
18 |
+
# RAG imports
|
19 |
+
import chromadb
|
20 |
+
from sentence_transformers import SentenceTransformer
|
21 |
+
import PyPDF2
|
22 |
+
import pdfplumber
|
23 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
24 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
25 |
+
import fitz # PyMuPDF for better PDF handling
|
26 |
+
|
27 |
# Force GPU detection for HF Spaces
|
28 |
@GPU
|
29 |
def force_gpu_detection():
|
|
|
48 |
|
49 |
self._load_model(BASE, gpu_available)
|
50 |
|
51 |
+
# Load the OncoLife instructions
|
52 |
+
self._load_instructions()
|
53 |
+
|
54 |
+
# Initialize RAG system
|
55 |
+
self._initialize_rag()
|
56 |
+
|
57 |
+
def _load_instructions(self):
    """Load the OncoLife instructions from the text file.

    Reads ``oncolifebot_instructions.txt`` from the directory that contains
    this module and stores its contents on ``self.instructions``.

    If the file is missing or cannot be read or decoded, a message is printed
    and ``self.instructions`` is set to an empty string; no exception
    propagates to the caller.
    """
    instructions_file = Path(__file__).parent / "oncolifebot_instructions.txt"
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding. Attempting the read and handling the
        # failure also avoids an exists()-then-open() race.
        self.instructions = instructions_file.read_text(encoding="utf-8")
    except FileNotFoundError:
        print("⚠️ oncolifebot_instructions.txt not found")
        self.instructions = ""
    except (OSError, UnicodeError) as e:
        print(f"❌ Error loading instructions: {e}")
        self.instructions = ""
    else:
        print("✅ Loaded oncolifebot_instructions.txt")
|
71 |
+
|
72 |
+
def _initialize_rag(self):
    """Initialize the RAG system with document embeddings.

    On success sets ``self.embedding_model`` (a ``SentenceTransformer``),
    ``self.chroma_client`` and ``self.collection``, then calls
    ``self._load_documents()`` to populate the collection.

    On any failure the error is printed and all three attributes are left as
    ``None``; no exception propagates to the caller.
    """
    # Start from a known "RAG disabled" state so every attribute exists even
    # if initialization fails part-way through.
    self.embedding_model = None
    self.chroma_client = None
    self.collection = None

    try:
        print("π Initializing RAG system...")

        # Initialize embedding model
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("✅ Loaded embedding model")

        # Initialize ChromaDB. get_or_create_collection is used instead of
        # create_collection so that initializing a second time against a
        # client that already holds this collection does not raise and
        # silently disable RAG.
        chroma_client = chromadb.Client()
        collection = chroma_client.get_or_create_collection(
            name="oncolife_documents",
            metadata={"description": "OncoLife reference documents"},
        )
        print("✅ Initialized ChromaDB collection")

        # Publish the handles only once all of them were created, then
        # load and process documents (which needs self.collection).
        self.embedding_model = embedding_model
        self.chroma_client = chroma_client
        self.collection = collection
        self._load_documents()

    except Exception as e:
        # Best-effort by design: the assistant keeps working without RAG.
        print(f"❌ Error initializing RAG: {e}")
        self.embedding_model = None
        self.chroma_client = None
        self.collection = None
|
96 |
+
|
97 |
+
def _load_documents(self):
    """Load and process all reference documents.

    Scans the ``guideline-docs`` directory next to this module for ``*.pdf``,
    ``*.json`` and ``*.txt`` files. Each file's text is split into chunks
    (1000 characters, 200 overlap) and handed to ``self._add_chunks_to_db``
    together with the file name.

    A failure on one file is printed and does not stop the remaining files.
    Any other failure is printed as well; no exception propagates.
    """
    try:
        docs_path = Path(__file__).parent / "guideline-docs"
        print(f"π Loading documents from: {docs_path}")

        if not docs_path.is_dir():
            print("⚠️ guideline-docs directory not found")
            return

        # Text splitter for chunking documents
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", ". ", " ", ""],
        )

        def _read_json(path):
            # Re-serialize the JSON as indented text. ensure_ascii=False keeps
            # non-ASCII characters readable instead of \uXXXX escapes.
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return json.dumps(data, indent=2, ensure_ascii=False)

        def _read_txt(path):
            return path.read_text(encoding='utf-8')

        # (log label, glob pattern, function returning the file's text)
        loaders = (
            ("PDF", "*.pdf", self._extract_pdf_text),
            ("JSON", "*.json", _read_json),
            ("TXT", "*.txt", _read_txt),
        )

        documents_loaded = 0

        for label, pattern, read_text in loaders:
            # sorted() makes the ingestion order deterministic across runs.
            for doc_file in sorted(docs_path.glob(pattern)):
                try:
                    print(f"π Processing {label}: {doc_file.name}")
                    text = read_text(doc_file)
                    if not text or not text.strip():
                        # Nothing to index; do not count it as loaded.
                        print(f"⚠️ No text found in {doc_file.name}; skipping")
                        continue
                    chunks = text_splitter.split_text(text)
                    self._add_chunks_to_db(chunks, doc_file.name)
                    documents_loaded += 1
                    print(f"✅ Added {len(chunks)} chunks from {doc_file.name}")
                except Exception as e:
                    # Best-effort per file: report and move on to the next one.
                    print(f"❌ Error processing {doc_file.name}: {e}")

        print(f"✅ RAG system initialized with {documents_loaded} documents")

    except Exception as e:
        print(f"❌ Error loading documents: {e}")
|
161 |
+
|
162 |
+
def _extract_pdf_text(self, pdf_path):
    """Extract text from a PDF, trying several libraries in turn.

    The extractors are tried in this order: PyMuPDF (``fitz``), pdfplumber,
    PyPDF2. The first one that yields non-whitespace text wins. An extractor
    that raises, or that yields only empty/whitespace text, is skipped and the
    next one is tried.

    Args:
        pdf_path: Path to the PDF file (``pathlib.Path`` or ``str``).

    Returns:
        The extracted text, or ``None`` if no extractor produced any text.
    """
    # NOTE(review): `fitz` (PyMuPDF) is used here, but no PyMuPDF entry is
    # visible in the requirements.txt lines shown with this change; confirm it
    # is installed, otherwise the module-level import will fail at startup.
    pdf_path = Path(pdf_path)

    def _with_pymupdf():
        doc = fitz.open(pdf_path)
        try:
            return "".join(page.get_text() for page in doc)
        finally:
            # Close the document even when a page fails to extract.
            doc.close()

    def _with_pdfplumber():
        with pdfplumber.open(pdf_path) as pdf:
            # Extract each page once; skip pages with no text.
            page_texts = (page.extract_text() for page in pdf.pages)
            return "".join(f"{page_text}\n" for page_text in page_texts if page_text)

    def _with_pypdf2():
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` guards against an extractor returning None for a page.
            return "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)

    extractors = (
        ("PyMuPDF", _with_pymupdf),
        ("pdfplumber", _with_pdfplumber),
        ("PyPDF2", _with_pypdf2),
    )

    for label, extract in extractors:
        try:
            text = extract()
        except Exception as e:
            # PDF libraries raise many unrelated exception types; extraction
            # is deliberately best-effort, so report and try the next one.
            print(f"{label} failed for {pdf_path.name}: {e}")
            continue
        if text and text.strip():
            return text

    return None
|
204 |
+
|
205 |
+
def _add_chunks_to_db(self, chunks, source_name):
    """Add document chunks to the vector database.

    Embeds ``chunks`` with ``self.embedding_model`` and adds them to
    ``self.collection``. Each chunk gets the metadata
    ``{"source": source_name, "chunk_id": i}`` and the id
    ``"{source_name}_chunk_{i}"``.

    Args:
        chunks: List of text chunks to store.
        source_name: Name of the document the chunks came from.

    Returns:
        The number of chunks added. ``0`` when there was nothing to add, when
        the RAG system is not initialized, or when adding failed (the error is
        printed, not raised).
    """
    # Compare against None explicitly: whether RAG is available must not
    # depend on the truthiness of the collection/model objects. The embedding
    # model is checked too, since it is dereferenced below.
    if not chunks or self.collection is None or self.embedding_model is None:
        return 0

    try:
        # Generate embeddings
        embeddings = self.embedding_model.encode(chunks)
        chunk_count = len(chunks)

        # Add to ChromaDB
        self.collection.add(
            embeddings=embeddings.tolist(),
            documents=chunks,
            metadatas=[{"source": source_name, "chunk_id": i} for i in range(chunk_count)],
            ids=[f"{source_name}_chunk_{i}" for i in range(chunk_count)],
        )
        return chunk_count

    except Exception as e:
        # Best-effort: one failing document must not abort ingestion.
        print(f"❌ Error adding chunks to database: {e}")
        return 0
|
224 |
+
|
225 |
+
def _retrieve_relevant_documents(self, query, top_k=5):
    """Retrieve relevant document chunks for a query.

    Args:
        query: The text to search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of dicts with the keys ``'content'`` (chunk text), ``'source'``
        (the chunk's ``source`` metadata, or ``'unknown'`` if absent) and
        ``'similarity'``. Note that ``'similarity'`` carries the value from
        the query result's ``'distances'`` field unchanged, or ``None`` when
        no distance is available. Returns an empty list when RAG is not
        initialized, the query is blank, ``top_k`` is not positive, or the
        lookup fails (the error is printed, not raised).
    """
    if self.collection is None or self.embedding_model is None:
        return []
    if not query or not query.strip() or top_k < 1:
        return []

    try:
        # Generate query embedding
        query_embedding = self.embedding_model.encode([query])

        # Search for similar documents
        results = self.collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=top_k,
        )

        # A single query is sent, so only the first inner list of each field
        # is read. A field can be present but None; `or [[]]` keeps that from
        # raising and discarding every hit.
        documents = (results.get('documents') or [[]])[0]
        metadatas = (results.get('metadatas') or [[]])[0]
        distances = (results.get('distances') or [[]])[0]

        # Format results
        relevant_docs = []
        for i, doc in enumerate(documents):
            metadata = metadatas[i] if i < len(metadatas) and metadatas[i] else {}
            relevant_docs.append({
                'content': doc,
                'source': metadata.get('source', 'unknown'),
                'similarity': distances[i] if i < len(distances) else None,
            })

        return relevant_docs

    except Exception as e:
        # Best-effort: a retrieval failure degrades to "no context".
        print(f"❌ Error retrieving documents: {e}")
        return []
|
255 |
|
256 |
def _load_model(self, model_id, gpu_available):
|
257 |
"""Load the BioMistral base model"""
|
|
|
297 |
self.tokenizer = None
|
298 |
|
299 |
def generate_oncolife_response(self, user_input, conversation_history):
|
300 |
+
"""Generate response using OncoLife instructions and RAG"""
|
301 |
try:
|
302 |
if self.model is None or self.tokenizer is None:
|
303 |
return """β **Model Loading Error**
|
|
|
311 |
|
312 |
print(f"π Generating OncoLife response for: {user_input}")
|
313 |
|
314 |
+
# Retrieve relevant documents using RAG
|
315 |
+
relevant_docs = self._retrieve_relevant_documents(user_input, top_k=3)
|
316 |
+
|
317 |
+
# Format retrieved documents
|
318 |
+
context_text = ""
|
319 |
+
if relevant_docs:
|
320 |
+
context_text = "\n\n**Relevant Reference Information:**\n"
|
321 |
+
for i, doc in enumerate(relevant_docs):
|
322 |
+
context_text += f"\n--- Source: {doc['source']} ---\n{doc['content'][:500]}...\n"
|
323 |
+
|
324 |
+
# Create prompt using the loaded instructions and retrieved context
|
325 |
+
system_prompt = f"""You are the OncoLife Symptom & Triage Assistant. Follow these instructions exactly:
|
326 |
|
327 |
+
{self.instructions}
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
|
329 |
+
{context_text}
|
|
|
|
|
|
|
|
|
330 |
|
331 |
+
Current user input: {user_input}"""
|
332 |
|
333 |
# Format conversation history
|
334 |
history_text = ""
|
|
|
397 |
else:
|
398 |
answer = response.strip()
|
399 |
|
|
|
|
|
|
|
|
|
400 |
print("β
OncoLife response generated successfully")
|
401 |
return answer
|
402 |
|
|
|
413 |
|
414 |
Please try a simpler question or check the logs for more details."""
|
415 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
416 |
def chat(self, message, history):
|
417 |
"""Main chat interface for OncoLife Assistant"""
|
418 |
if not message.strip():
|
|
|
426 |
"assistant": assistant_msg
|
427 |
})
|
428 |
|
429 |
+
# Generate response using OncoLife instructions and RAG
|
430 |
response = self.generate_oncolife_response(message, conversation_history)
|
431 |
|
432 |
return response
|
|
|
436 |
interface = gr.ChatInterface(
|
437 |
fn=assistant.chat,
|
438 |
title="π₯ OncoLife Symptom & Triage Assistant",
|
439 |
+
description="I'm here to help assess your symptoms and determine if you need to contact your care team. I can access your medical guidelines and reference documents to provide accurate information.",
|
440 |
examples=[
|
441 |
["I'm feeling nauseous and tired"],
|
442 |
["I have a fever of 101"],
|
requirements.txt
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
# Medical Chatbot HF Space Requirements
|
2 |
|
3 |
# Web framework
|
4 |
-
gradio
|
5 |
|
6 |
# Machine learning libraries - specific versions for compatibility
|
7 |
-
torch
|
8 |
-
transformers
|
9 |
-
accelerate
|
10 |
|
11 |
# HF Spaces GPU support
|
12 |
spaces>=0.1.0
|
@@ -16,5 +16,14 @@ numpy>=1.21.0,<2.0.0
|
|
16 |
requests>=2.28.0
|
17 |
|
18 |
# Additional dependencies for better device handling
|
19 |
-
safetensors
|
20 |
tokenizers>=0.15.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# Medical Chatbot HF Space Requirements
|
2 |
|
3 |
# Web framework
|
4 |
+
gradio==4.44.0
|
5 |
|
6 |
# Machine learning libraries - specific versions for compatibility
|
7 |
+
torch==2.1.2
|
8 |
+
transformers==4.36.2
|
9 |
+
accelerate==0.25.0
|
10 |
|
11 |
# HF Spaces GPU support
|
12 |
spaces>=0.1.0
|
|
|
16 |
requests>=2.28.0
|
17 |
|
18 |
# Additional dependencies for better device handling
|
19 |
+
safetensors==0.4.1
|
20 |
tokenizers>=0.15.0
|
21 |
+
|
22 |
+
# RAG implementation
|
23 |
+
bitsandbytes==0.41.3
|
24 |
+
sentence-transformers==2.2.2
|
25 |
+
chromadb==0.4.22
|
26 |
+
pypdf2==3.0.1
|
27 |
+
pdfplumber==0.10.3
|
28 |
+
langchain==0.1.0
|
29 |
+
langchain-community==0.0.10
|