Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,9 +1,16 @@
|
|
1 |
-
#
|
2 |
"""
|
3 |
Streamlit application for Medical Image Analysis using Google Gemini Vision
|
4 |
-
and Retrieval-Augmented Generation (RAG) with Chroma DB
|
5 |
-
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
"""
|
8 |
|
9 |
# --- Imports ---
|
@@ -11,7 +18,6 @@ import streamlit as st
|
|
11 |
import google.generativeai as genai
|
12 |
import chromadb
|
13 |
from chromadb.utils import embedding_functions
|
14 |
-
from chromadb.api.types import EmbeddingFunction # For type hinting
|
15 |
from PIL import Image
|
16 |
import io
|
17 |
import time
|
@@ -22,9 +28,24 @@ from typing import Optional, Dict, List, Any, Tuple
|
|
22 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
23 |
logger = logging.getLogger(__name__)
|
24 |
|
25 |
-
# --- Configuration
|
26 |
-
#
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
VISION_MODEL_NAME = "gemini-pro-vision"
|
29 |
GENERATION_CONFIG = {
|
30 |
"temperature": 0.2,
|
@@ -38,269 +59,258 @@ SAFETY_SETTINGS = [
|
|
38 |
{"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
|
39 |
{"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
|
40 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
# Chroma DB Configuration
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
#
|
48 |
-
#
|
49 |
-
#
|
50 |
-
#
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
#
|
55 |
-
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" # Or specify a different HF model name
|
56 |
-
CHROMA_DISTANCE_FUNCTION = "cosine" # Use cosine similarity
|
57 |
-
|
58 |
-
# UI Configuration
|
59 |
-
MAX_RAG_RESULTS = 3 # Number of results to fetch from Chroma
|
60 |
-
|
61 |
-
# --- Initialization Functions with Caching ---
|
62 |
|
63 |
@st.cache_resource
|
64 |
-
def
|
65 |
-
"""Configures the Google AI SDK using secrets."""
|
66 |
-
try:
|
67 |
-
google_api_key = st.secrets[GOOGLE_API_KEY_SECRET]
|
68 |
-
genai.configure(api_key=google_api_key)
|
69 |
-
logger.info("Google AI SDK configured successfully.")
|
70 |
-
return True
|
71 |
-
except KeyError:
|
72 |
-
st.error(f"β **Error:** '{GOOGLE_API_KEY_SECRET}' not found in Hugging Face Secrets.")
|
73 |
-
logger.error(f"Secret '{GOOGLE_API_KEY_SECRET}' not found.")
|
74 |
-
return False
|
75 |
-
except Exception as e:
|
76 |
-
st.error(f"β **Error:** Failed to configure Google AI SDK: {e}")
|
77 |
-
logger.error(f"Error configuring Google AI SDK: {e}", exc_info=True)
|
78 |
-
return False
|
79 |
-
|
80 |
-
@st.cache_resource
|
81 |
-
def get_gemini_model() -> Optional[genai.GenerativeModel]:
|
82 |
"""Initializes and returns the Gemini Generative Model."""
|
83 |
-
if not configure_google_ai():
|
84 |
-
return None
|
85 |
try:
|
|
|
86 |
model = genai.GenerativeModel(
|
87 |
model_name=VISION_MODEL_NAME,
|
88 |
generation_config=GENERATION_CONFIG,
|
89 |
safety_settings=SAFETY_SETTINGS
|
90 |
)
|
91 |
-
logger.info(f"Gemini Model
|
92 |
return model
|
93 |
except Exception as e:
|
94 |
-
|
95 |
-
|
|
|
96 |
return None
|
97 |
|
98 |
@st.cache_resource
|
99 |
-
def
|
100 |
-
"""Initializes and returns the
|
101 |
try:
|
102 |
-
#
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
|
|
|
|
|
|
107 |
except Exception as e:
|
108 |
-
|
109 |
-
|
|
|
|
|
|
|
110 |
return None
|
111 |
|
112 |
@st.cache_resource
|
113 |
-
def
|
114 |
-
"""Initializes
|
115 |
-
|
116 |
-
|
117 |
return None
|
118 |
-
|
119 |
try:
|
120 |
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
|
121 |
-
logger.info(f"ChromaDB client initialized with path: {CHROMA_PATH}")
|
122 |
-
|
123 |
collection = chroma_client.get_or_create_collection(
|
124 |
name=COLLECTION_NAME,
|
125 |
-
embedding_function=
|
126 |
-
metadata={"hnsw:space":
|
127 |
)
|
128 |
-
logger.info(f"
|
129 |
return collection
|
130 |
except Exception as e:
|
131 |
-
|
132 |
-
st.
|
133 |
-
logger.error(
|
|
|
134 |
return None
|
135 |
|
136 |
-
# ---
|
137 |
|
138 |
-
|
|
|
139 |
"""
|
140 |
-
Analyzes image bytes with Gemini
|
141 |
-
|
142 |
-
Args:
|
143 |
-
gemini_model: The initialized Gemini model instance.
|
144 |
-
image_bytes: The image data as bytes.
|
145 |
-
|
146 |
-
Returns:
|
147 |
-
A tuple containing:
|
148 |
-
- The analysis text (str) or None if error/blocked.
|
149 |
-
- A boolean indicating success (True) or failure/block (False).
|
150 |
"""
|
|
|
|
|
|
|
151 |
try:
|
152 |
img = Image.open(io.BytesIO(image_bytes))
|
153 |
-
|
154 |
-
Describe key visual features relevant for medical context (structures, cells, staining, anomalies).
|
155 |
-
Identify potential findings:
|
156 |
-
- Possible conditions or disease indicators
|
157 |
-
- Pathological features (morphology, patterns)
|
158 |
-
- Visible cell types or tissue structures
|
159 |
-
- Relevant biomarkers (if suggested by visuals)
|
160 |
-
- Anatomical context (if clear)
|
161 |
-
|
162 |
-
Focus on visual evidence. Be concise. Avoid definitive diagnosis. State uncertainties clearly.
|
163 |
-
"""
|
164 |
-
response = gemini_model.generate_content([prompt, img], stream=False) # Use stream=False for simpler handling here
|
165 |
-
response.resolve() # Ensure response is fully processed if stream=True was used
|
166 |
|
167 |
if not response.parts:
|
168 |
-
reason = "Unknown reason"
|
169 |
if response.prompt_feedback and response.prompt_feedback.block_reason:
|
170 |
-
reason = response.prompt_feedback.block_reason
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
|
|
|
|
|
|
175 |
logger.info("Gemini analysis successful.")
|
176 |
-
return response.text,
|
177 |
|
178 |
except genai.types.BlockedPromptException as e:
|
179 |
-
|
180 |
-
|
181 |
-
return
|
182 |
except Exception as e:
|
183 |
-
|
184 |
-
|
185 |
-
return
|
186 |
-
|
187 |
-
|
188 |
-
def query_chroma(
|
189 |
-
"""Queries
|
|
|
|
|
190 |
if not query_text:
|
191 |
-
logger.warning("
|
192 |
-
st.warning("β οΈ Cannot query knowledge base without analysis text.")
|
193 |
return None
|
194 |
try:
|
195 |
-
|
196 |
-
|
|
|
|
|
|
|
|
|
197 |
n_results=n_results,
|
198 |
include=['documents', 'metadatas', 'distances']
|
199 |
)
|
200 |
-
logger.info(f"
|
201 |
return results
|
202 |
except Exception as e:
|
203 |
-
|
204 |
-
st.error(
|
|
|
205 |
return None
|
206 |
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
docs = [
|
213 |
-
"Figure 1A shows adenocarcinoma of the lung, papillary subtype. Note the glandular structures and nuclear atypia. TTF-1 staining was positive.",
|
214 |
-
"Pathology slide 34B demonstrates high-grade glioma (glioblastoma) with significant necrosis and microvascular proliferation. Ki-67 index was high.",
|
215 |
-
"Diagram: EGFR signaling pathway mutations in NSCLC targeted by TKIs.", # Shorter version
|
216 |
-
"Micrograph: Chronic gastritis with H. pylori organisms (special stain needed). Mild intestinal metaplasia noted.", # Shorter
|
217 |
-
"Slide CJD-02: Spongiform changes in cerebral cortex characteristic of prion disease. Gliosis present." # Shorter
|
218 |
-
]
|
219 |
-
metadatas = [
|
220 |
-
{"source": "Example Paper 1", "topic": "Lung Cancer Pathology", "entities": "adenocarcinoma, lung cancer, glandular structures, nuclear atypia, papillary subtype, TTF-1", "IMAGE_ID": "fig_1a_adeno_lung.png"},
|
221 |
-
{"source": "Path Report 789", "topic": "Brain Tumor Pathology", "entities": "high-grade glioma, glioblastoma, necrosis, microvascular proliferation, Ki-67", "IMAGE_ID": "slide_34b_gbm.tiff"},
|
222 |
-
{"source": "Textbook Chapter 5", "topic": "Molecular Oncology", "entities": "EGFR, TKIs, NSCLC, signaling pathway", "IMAGE_ID": "diagram_egfr_pathway.svg"},
|
223 |
-
{"source": "Path Report 101", "topic": "Gastrointestinal Pathology", "entities": "chronic gastritis, Helicobacter pylori, intestinal metaplasia", "IMAGE_ID": "micrograph_h_pylori_gastritis.jpg"},
|
224 |
-
{"source": "Case Study CJD", "topic": "Neuropathology", "entities": "prion disease, Spongiform changes, Gliosis, cerebral cortex", "IMAGE_ID": "slide_cjd_sample_02.jpg"}
|
225 |
-
]
|
226 |
-
# Generate potentially more stable IDs for demo purposes if needed, but time-based is fine too
|
227 |
-
# Example: ids = [f"dummy_doc_{i+1}" for i in range(len(docs))]
|
228 |
-
ids = [f"doc_{int(time.time())}_{i}" for i in range(len(docs))]
|
229 |
|
|
|
230 |
try:
|
231 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
existing_docs = collection.get(where={"$or": [{"document": doc} for doc in docs]}, include=[])
|
233 |
if not existing_docs or not existing_docs.get('ids'):
|
|
|
|
|
|
|
234 |
collection.add(
|
235 |
documents=docs,
|
236 |
metadatas=metadatas,
|
237 |
ids=ids
|
238 |
)
|
239 |
-
|
240 |
-
|
241 |
else:
|
242 |
-
|
243 |
-
|
244 |
|
245 |
except Exception as e:
|
246 |
-
|
247 |
-
|
|
|
248 |
|
249 |
-
# ---
|
250 |
-
st.
|
|
|
|
|
|
|
251 |
|
252 |
-
|
253 |
-
st.
|
254 |
-
|
255 |
-
""")
|
256 |
|
257 |
-
# ---
|
258 |
st.warning("""
|
259 |
-
**β οΈ Disclaimer:** This tool is for
|
260 |
-
It is **NOT** a medical device and **
|
261 |
-
|
262 |
-
|
263 |
""")
|
264 |
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
# Check if critical components failed initialization
|
270 |
-
if not gemini_model or not chroma_collection:
|
271 |
-
st.error("β Critical components failed to initialize. Cannot proceed. Check logs and secrets.")
|
272 |
-
st.stop() # Stop execution if core components aren't ready
|
273 |
-
|
274 |
|
275 |
-
#
|
276 |
with st.sidebar:
|
277 |
st.header("βοΈ Controls")
|
278 |
uploaded_file = st.file_uploader(
|
279 |
-
"
|
280 |
type=["jpg", "jpeg", "png", "tiff", "webp"],
|
281 |
-
help="Upload
|
282 |
)
|
283 |
|
284 |
st.divider()
|
285 |
|
286 |
-
st.
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
|
|
292 |
|
293 |
st.info(f"""
|
294 |
-
**
|
295 |
-
-
|
296 |
-
-
|
297 |
-
-
|
298 |
-
-
|
299 |
""")
|
300 |
-
st.caption("
|
|
|
301 |
|
302 |
-
|
303 |
-
# --- Main Processing Area ---
|
304 |
col1, col2 = st.columns(2)
|
305 |
|
306 |
with col1:
|
@@ -309,91 +319,67 @@ with col1:
|
|
309 |
image_bytes = uploaded_file.getvalue()
|
310 |
st.image(image_bytes, caption=f"Uploaded: {uploaded_file.name}", use_column_width=True)
|
311 |
else:
|
312 |
-
st.info("Upload an image using the sidebar to begin
|
313 |
|
314 |
with col2:
|
315 |
-
st.subheader("
|
316 |
-
if uploaded_file is not None and gemini_model and
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
analysis_text
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
meta
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
st.markdown(f"> {doc}")
|
367 |
-
st.markdown("**Metadata:**")
|
368 |
-
# Nicer metadata display
|
369 |
-
meta_display = {k: v for k, v in meta.items() if v} # Filter empty values
|
370 |
-
st.json(meta_display, expanded=False)
|
371 |
-
|
372 |
-
# Provide link/info if related image exists
|
373 |
-
if meta.get("IMAGE_ID"):
|
374 |
-
st.info(f"βΉοΈ Associated Visual: `{meta['IMAGE_ID']}`")
|
375 |
-
|
376 |
-
elif chroma_results is not None: # Query ran, no results
|
377 |
-
st.warning("β οΈ No relevant information found in the knowledge base for this analysis.")
|
378 |
-
status_query.update(label="β οΈ No results found", state="warning")
|
379 |
-
else: # Query failed (error handled in query_chroma)
|
380 |
-
status_query.update(label="π₯ Query Error", state="error")
|
381 |
-
|
382 |
-
except Exception as e:
|
383 |
-
logger.error(f"Unhandled error during query status block: {e}", exc_info=True)
|
384 |
-
st.error(f"β An unexpected error occurred during the knowledge base search: {e}")
|
385 |
-
status_query.update(label="π₯ Query Process Error", state="error")
|
386 |
-
|
387 |
-
elif not analysis_successful:
|
388 |
-
st.info("Cannot proceed to knowledge base search as image analysis failed or was blocked.")
|
389 |
|
390 |
elif not uploaded_file:
|
391 |
-
st.info("Analysis results
|
392 |
else:
|
393 |
-
|
394 |
-
st.info("Waiting for components to initialize...")
|
395 |
-
|
396 |
|
397 |
-
# --- Footer ---
|
398 |
st.markdown("---")
|
399 |
-
st.
|
|
|
|
|
|
1 |
+
# --- Docstring ---
|
2 |
"""
|
3 |
Streamlit application for Medical Image Analysis using Google Gemini Vision
|
4 |
+
and Retrieval-Augmented Generation (RAG) with Chroma DB, enhanced for
|
5 |
+
Hugging Face Spaces deployment and improved practices.
|
6 |
+
|
7 |
+
Features:
|
8 |
+
- Image analysis via Google Gemini Pro Vision.
|
9 |
+
- RAG using Chroma DB with Hugging Face embeddings.
|
10 |
+
- Caching for performance.
|
11 |
+
- Basic logging.
|
12 |
+
- Improved UX and error handling.
|
13 |
+
- Explicit Disclaimer.
|
14 |
"""
|
15 |
|
16 |
# --- Imports ---
|
|
|
18 |
import google.generativeai as genai
|
19 |
import chromadb
|
20 |
from chromadb.utils import embedding_functions
|
|
|
21 |
from PIL import Image
|
22 |
import io
|
23 |
import time
|
|
|
28 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
29 |
logger = logging.getLogger(__name__)
|
30 |
|
31 |
+
# --- Application Configuration ---
# Secrets are read from st.secrets (populated from the Hugging Face Space settings).
try:
    # Mandatory secret: plain indexing raises KeyError when it is absent.
    GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
    # Optional secret: .get() yields None when no token has been configured.
    HF_TOKEN = st.secrets.get("HF_TOKEN")
except Exception as e:
    # A missing key gets a dedicated hint; anything else is reported generically.
    if isinstance(e, KeyError):
        err_msg = f"β Missing Secret: {e}. Please add it to your Hugging Face Space secrets."
    else:
        err_msg = f"β Error loading secrets: {e}"
    st.error(err_msg)
    logger.error(err_msg)
    # Nothing below can work without the API key, so halt this script run.
    st.stop()
|
47 |
+
|
48 |
+
# Gemini Configuration
|
49 |
VISION_MODEL_NAME = "gemini-pro-vision"
|
50 |
GENERATION_CONFIG = {
|
51 |
"temperature": 0.2,
|
|
|
59 |
{"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
|
60 |
{"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
|
61 |
]
|
62 |
+
GEMINI_ANALYSIS_PROMPT = """Analyze this medical image (e.g., pathology slide, diagram, scan).
|
63 |
+
Describe the key visual features relevant to a medical context.
|
64 |
+
Identify potential:
|
65 |
+
- Diseases or conditions indicated
|
66 |
+
- Pathological findings (e.g., cellular morphology, tissue structure, staining patterns)
|
67 |
+
- Visible cell types
|
68 |
+
- Relevant biomarkers (if inferable from staining or morphology)
|
69 |
+
- Anatomical context (if discernible)
|
70 |
+
|
71 |
+
Be concise and focus primarily on visually evident information. Avoid definitive diagnoses.
|
72 |
+
Structure the output clearly, perhaps using bullet points for findings.
|
73 |
+
"""
|
74 |
|
75 |
# Chroma DB Configuration
# Directory handed to chromadb.PersistentClient; it must be writable.
CHROMA_PATH = "chroma_data_hf"  # Use a distinct path if needed
# Name of the collection that stores the knowledge-base snippets.
COLLECTION_NAME = "medical_docs_hf"
# IMPORTANT: Choose an appropriate HF embedding model. 'all-mpnet-base-v2' is general purpose.
# For better medical results, consider models like:
# - 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext' (might need more RAM/compute)
# - 'dmis-lab/sapbert-from-pubmedbert-sentencetransformer'
# - Other models tagged 'medical' or 'biomedical' on Hugging Face Hub.
# Ensure the chosen model is compatible with chromadb's HuggingFaceEmbeddingFunction.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"  # <-- REPLACE if possible
# Stored as the collection's "hnsw:space" metadata; the UI converts the
# returned distance to a similarity as 1 - distance, which assumes "cosine".
CHROMA_DISTANCE_METRIC = "cosine"
|
86 |
+
|
87 |
+
# --- Caching Resource Initialization ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
89 |
@st.cache_resource
def initialize_gemini_model() -> Optional[genai.GenerativeModel]:
    """Configure the Google AI SDK and build the Gemini vision model.

    Returns:
        The configured ``genai.GenerativeModel``, or ``None`` when configuration
        or construction fails (the failure is shown in the UI and logged).
    """
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        model_kwargs = {
            "model_name": VISION_MODEL_NAME,
            "generation_config": GENERATION_CONFIG,
            "safety_settings": SAFETY_SETTINGS,
        }
        vision_model = genai.GenerativeModel(**model_kwargs)
        logger.info(f"Successfully initialized Gemini Model: {VISION_MODEL_NAME}")
    except Exception as exc:
        failure = f"β Error initializing Gemini Model ({VISION_MODEL_NAME}): {exc}"
        st.error(failure)
        logger.error(failure, exc_info=True)
        return None
    return vision_model
|
106 |
|
107 |
@st.cache_resource
def initialize_embedding_function() -> Optional[embedding_functions.HuggingFaceEmbeddingFunction]:
    """Build the Hugging Face embedding function used by the Chroma collection.

    Returns:
        The embedding function, or ``None`` when construction fails (the error
        is shown in the UI and logged with a traceback).
    """
    try:
        if not HF_TOKEN:
            # NOTE(review): the token is forwarded as the API key of chromadb's
            # HuggingFaceEmbeddingFunction; embedding requests are presumably
            # rejected without one -- confirm against the installed chromadb.
            logger.warning(
                "HF_TOKEN is not set; embedding requests for '%s' may be rejected.",
                EMBEDDING_MODEL_NAME,
            )
        embed_func = embedding_functions.HuggingFaceEmbeddingFunction(
            api_key=HF_TOKEN,  # Pass token here if needed by model
            model_name=EMBEDDING_MODEL_NAME,
        )
        logger.info(f"Successfully initialized HuggingFace Embedding Function: {EMBEDDING_MODEL_NAME}")
        return embed_func
    except Exception as e:
        err_msg = f"β Error initializing HuggingFace Embedding Function ({EMBEDDING_MODEL_NAME}): {e}"
        st.error(err_msg)
        logger.error(err_msg, exc_info=True)
        st.info("βΉοΈ Make sure the embedding model name is correct and you have network access. "
                "If using a private model, ensure HF_TOKEN is set in secrets.")
        return None
|
126 |
|
127 |
@st.cache_resource
def initialize_chroma_collection(_embedding_func: embedding_functions.EmbeddingFunction) -> Optional[chromadb.Collection]:
    """Open (or create) the persistent Chroma collection.

    Args:
        _embedding_func: Embedding function attached to the collection. The
            leading underscore keeps Streamlit from hashing it for the cache key.

    Returns:
        The collection, or ``None`` when no embedding function was supplied or
        the client/collection could not be created.
    """
    if not _embedding_func:
        st.error("β Cannot initialize Chroma DB without a valid embedding function.")
        return None

    try:
        client = chromadb.PersistentClient(path=CHROMA_PATH)
        index_settings = {"hnsw:space": CHROMA_DISTANCE_METRIC}
        kb_collection = client.get_or_create_collection(
            name=COLLECTION_NAME,
            embedding_function=_embedding_func,
            metadata=index_settings,
        )
        logger.info(f"Chroma DB collection '{COLLECTION_NAME}' loaded/created at '{CHROMA_PATH}' using {CHROMA_DISTANCE_METRIC}.")
        return kb_collection
    except Exception as exc:
        failure = f"β Error initializing Chroma DB at '{CHROMA_PATH}': {exc}"
        st.error(failure)
        logger.error(failure, exc_info=True)
        st.info(f"βΉοΈ Ensure the path '{CHROMA_PATH}' is writable.")
        return None
|
148 |
|
149 |
+
# --- Core Logic Functions (with Caching for Data Operations) ---
|
150 |
|
151 |
+
class _GeminiAnalysisError(Exception):
    """Internal signal that an analysis failed or was blocked.

    It is raised inside the cached helper so that ``st.cache_data`` stores
    nothing for the failing call. The exception message is the text that is
    handed back to the caller.
    """


@st.cache_data(show_spinner=False)  # Show spinner manually in UI
def _run_gemini_analysis(_gemini_model: genai.GenerativeModel, image_bytes: bytes) -> str:
    """Send the image to Gemini and return the analysis text.

    Only successful results are cached (keyed on ``image_bytes``; the model
    argument is underscore-prefixed and therefore not hashed).

    Raises:
        _GeminiAnalysisError: when the response is blocked, empty, or the call fails.
    """
    try:
        img = Image.open(io.BytesIO(image_bytes))
        response = _gemini_model.generate_content([GEMINI_ANALYSIS_PROMPT, img])

        if not response.parts:
            if response.prompt_feedback and response.prompt_feedback.block_reason:
                reason = response.prompt_feedback.block_reason
                msg = f"Analysis blocked by safety settings: {reason}"
                logger.warning(msg)
            else:
                msg = "Error: Gemini analysis returned no content (empty or invalid response)."
                logger.error(msg)
            raise _GeminiAnalysisError(msg)

        logger.info("Gemini analysis successful.")
        return response.text

    except _GeminiAnalysisError:
        # Already logged above; let it pass through the broader handlers untouched.
        raise
    except genai.types.BlockedPromptException as e:
        msg = f"Analysis blocked (prompt issue): {e}"
        logger.warning(msg)
        raise _GeminiAnalysisError(msg) from e
    except Exception as e:
        msg = f"Error during Gemini analysis: {e}"
        logger.error(msg, exc_info=True)
        raise _GeminiAnalysisError(msg) from e


def analyze_image_with_gemini(_gemini_model: genai.GenerativeModel, image_bytes: bytes) -> Tuple[str, bool]:
    """
    Analyzes image bytes with Gemini, returns (analysis_text, is_error).

    Successful analyses are cached per image. Failures (blocked prompts,
    transient API errors, a missing model) are deliberately NOT cached, so the
    next rerun with the same image retries instead of replaying the old error.

    Args:
        _gemini_model: The initialized Gemini model, or a falsy value if
            initialization failed.
        image_bytes: Raw bytes of the uploaded image.

    Returns:
        ``(text, False)`` on success, or ``(message, True)`` when the analysis
        failed or was blocked.
    """
    if not _gemini_model:
        return "Error: Gemini model not initialized.", True

    try:
        return _run_gemini_analysis(_gemini_model, image_bytes), False
    except _GeminiAnalysisError as e:
        return str(e), True
|
185 |
+
|
186 |
+
@st.cache_data(show_spinner=False)
def _cached_chroma_query(_collection: chromadb.Collection, query_text: str, n_results: int, kb_size: int) -> Dict[str, List[Any]]:
    """Run the Chroma query; cached on (query_text, n_results, kb_size).

    ``kb_size`` is not used by the query itself. It is part of the cache key so
    that results cached before documents were added are not replayed afterwards.
    Exceptions propagate to the caller, so failed queries are never cached.
    """
    results = _collection.query(
        query_texts=[query_text],
        n_results=n_results,
        include=['documents', 'metadatas', 'distances']
    )
    logger.info(f"Chroma query successful for text snippet: '{query_text[:50]}...'")
    return results


def query_chroma(_collection: chromadb.Collection, query_text: str, n_results: int = 5) -> Optional[Dict[str, List[Any]]]:
    """Queries Chroma DB, returns results dict or None on error.

    Args:
        _collection: The Chroma collection to search (falsy -> ``None`` is returned).
        query_text: Text to search for; an empty value is logged and yields ``None``.
        n_results: Maximum number of entries to request.

    Returns:
        The dict returned by ``Collection.query`` (documents, metadatas,
        distances), or ``None`` when the query could not be run.
    """
    if not _collection:
        return None
    if not query_text:
        logger.warning("Attempted to query Chroma with empty text.")
        return None
    try:
        # Placeholder for potential query refinement:
        # refined_query = refine_query_for_chroma(query_text) # Implement this if needed
        refined_query = query_text  # Using direct analysis text for now

        # The current document count invalidates cached results once the
        # knowledge base changes size (e.g. after dummy data is added).
        return _cached_chroma_query(_collection, refined_query, n_results, _collection.count())
    except Exception as e:
        err_msg = f"Error querying Chroma DB: {e}"
        st.error(err_msg)  # Show error in UI as well
        logger.error(err_msg, exc_info=True)
        return None
|
211 |
|
212 |
+
def add_dummy_data_to_chroma(collection: chromadb.Collection, embedding_func: embedding_functions.EmbeddingFunction):
    """Adds example medical text snippets to Chroma, skipping ones already stored.

    Existing entries are detected through their ``IMAGE_ID`` metadata value.
    (The ``where`` filter of ``Collection.get`` matches metadata fields, so the
    previous filter on a ``"document"`` key never matched anything and every
    call inserted a fresh set of duplicates.)

    Args:
        collection: Target Chroma collection.
        embedding_func: Embedding function of the collection. It is only checked
            for presence here; embeddings are produced by the collection on add.
    """
    if not collection or not embedding_func:
        st.error("β Cannot add dummy data: Chroma Collection or Embedding Function not available.")
        return

    status = st.status("Adding dummy data to Chroma DB...", expanded=False)
    try:
        # --- Dummy Data Definition ---
        docs = [
            "Figure 1A shows adenocarcinoma of the lung, papillary subtype. Note the glandular structures and nuclear atypia. TTF-1 staining was positive.",
            "Pathology slide 34B demonstrates high-grade glioma (glioblastoma) with significant necrosis and microvascular proliferation. Ki-67 index was high.",
            "This diagram illustrates the EGFR signaling pathway and common mutation sites targeted by tyrosine kinase inhibitors in non-small cell lung cancer.",
            "Micrograph showing chronic gastritis with Helicobacter pylori organisms (visible with special stain, not shown here). Mild intestinal metaplasia is present.",
            "Slide CJD-Sample-02: Spongiform changes characteristic of prion disease are evident in the cerebral cortex. Gliosis is also noted."
        ]
        metadatas = [
            {"source": "Example Paper 1", "topic": "Lung Cancer Pathology", "entities": "adenocarcinoma, lung cancer, glandular structures, nuclear atypia, papillary subtype, TTF-1", "IMAGE_ID": "fig_1a_adeno_lung.png"},
            {"source": "Path Report 789", "topic": "Brain Tumor Pathology", "entities": "high-grade glioma, glioblastoma, necrosis, microvascular proliferation, Ki-67", "IMAGE_ID": "slide_34b_gbm.tiff"},
            {"source": "Textbook Chapter 5", "topic": "Molecular Oncology Pathways", "entities": "EGFR, tyrosine kinase inhibitors, non-small cell lung cancer", "IMAGE_ID": "diagram_egfr_pathway.svg"},
            {"source": "Path Report 101", "topic": "Gastrointestinal Pathology", "entities": "chronic gastritis, Helicobacter pylori, intestinal metaplasia", "IMAGE_ID": "micrograph_h_pylori_gastritis.jpg"},
            {"source": "Case Study CJD", "topic": "Neuropathology", "entities": "prion disease, Spongiform changes, Gliosis, cerebral cortex", "IMAGE_ID": "slide_cjd_sample_02.jpg"}
        ]
        # Stable IDs (not time-based) so repeated runs refer to the same records.
        ids = [f"doc_hf_dummy_{i}" for i in range(len(docs))]

        # Look up which snippets are already stored, via their IMAGE_ID metadata.
        # This also recognises entries written earlier under time-based IDs.
        status.update(label="Checking for existing dummy documents...")
        existing = collection.get(
            where={"$or": [{"IMAGE_ID": meta["IMAGE_ID"]} for meta in metadatas]},
            include=["metadatas"],
        )
        stored_image_ids = {
            meta.get("IMAGE_ID")
            for meta in ((existing or {}).get("metadatas") or [])
            if meta
        }
        missing = [
            i for i, meta in enumerate(metadatas)
            if meta["IMAGE_ID"] not in stored_image_ids
        ]

        if missing:
            status.update(label=f"Generating embeddings for {len(missing)} documents (may take time)...")
            # Embeddings are generated implicitly by ChromaDB during .add()
            # when an embedding_function is configured for the collection.
            collection.add(
                documents=[docs[i] for i in missing],
                metadatas=[metadatas[i] for i in missing],
                ids=[ids[i] for i in missing],
            )
            status.update(label=f"✅ Added {len(missing)} dummy documents.", state="complete")
            logger.info(f"Added {len(missing)} dummy documents to collection '{COLLECTION_NAME}'.")
        else:
            status.update(label="β οΈ Dummy data already exists. No new data added.", state="complete")
            logger.warning("Dummy data seems to already exist in the collection based on text match.")

    except Exception as e:
        err_msg = f"Error adding dummy data to Chroma: {e}"
        status.update(label=f"β Error: {err_msg}", state="error")
        logger.error(err_msg, exc_info=True)
|
260 |
|
261 |
+
# --- Streamlit UI ---
# Page configuration goes first: the initializers below emit st.error/st.info
# when they fail, and Streamlit documents set_page_config as the first
# Streamlit command of a page.
st.set_page_config(layout="wide", page_title="Medical Image Analysis & RAG (HF)")

# --- Initialize Resources ---
# Each initializer is wrapped in @st.cache_resource, so reruns reuse the cached objects.
gemini_model = initialize_gemini_model()
embedding_func = initialize_embedding_function()
collection = initialize_chroma_collection(embedding_func)  # Pass embedding func to chroma init

st.title("βοΈ Medical Image Analysis & RAG (Hugging Face Enhanced)")

# --- DISCLAIMER ---
st.warning("""
**β οΈ Disclaimer:** This tool is for demonstration and informational purposes ONLY.
It is **NOT** a medical device and should **NOT** be used for actual medical diagnosis, treatment, or decision-making.
AI analysis can be imperfect. Always consult with qualified healthcare professionals for any medical concerns.
Do **NOT** upload identifiable patient data (PHI).
""")

st.markdown("""
Upload a medical image. Gemini Vision will analyze it, and related information
will be retrieved from a Chroma DB knowledge base using Hugging Face embeddings.
""")
|
|
|
|
|
|
|
|
|
|
|
283 |
|
284 |
+
# Sidebar: upload control, knowledge-base seeding button and setup summary.
with st.sidebar:
    st.header("βοΈ Controls")
    uploaded_file = st.file_uploader(
        "Choose an image...",
        type=["jpg", "jpeg", "png", "tiff", "webp"],
        help="Upload a medical image file (e.g., pathology, diagram)."
    )

    st.divider()

    if st.button("β Add/Verify Dummy KB Data", help="Adds example text data to Chroma DB if it doesn't exist."):
        if collection and embedding_func:
            add_dummy_data_to_chroma(collection, embedding_func)
        else:
            st.error("β Cannot add dummy data: Chroma Collection or Embedding Function failed to initialize.")

    st.divider()

    # The markdown body is kept at column 0 so its rendering does not depend on
    # how leading whitespace inside the literal is treated.
    st.info(f"""
**Setup Info:**
- Gemini Model: `{VISION_MODEL_NAME}`
- Embedding Model: `{EMBEDDING_MODEL_NAME}`
- Chroma Collection: `{COLLECTION_NAME}` (at `{CHROMA_PATH}`)
- Distance Metric: `{CHROMA_DISTANCE_METRIC}`
""")
    # Security: report only whether the credentials are present. No part of
    # the API key is rendered, because this page is visible to every visitor.
    st.caption(f"Using Google API Key: {'Provided' if GOOGLE_API_KEY else 'Not Set'}")
    st.caption(f"Using HF Token: {'Provided' if HF_TOKEN else 'Not Provided'}")
|
312 |
|
313 |
+
# Main Display Area
|
|
|
314 |
col1, col2 = st.columns(2)
|
315 |
|
316 |
with col1:
|
|
|
319 |
image_bytes = uploaded_file.getvalue()
|
320 |
st.image(image_bytes, caption=f"Uploaded: {uploaded_file.name}", use_column_width=True)
|
321 |
else:
|
322 |
+
st.info("Upload an image using the sidebar to begin.")
|
323 |
|
324 |
# Right-hand column: Gemini analysis of the upload, then retrieval from Chroma.
with col2:
    st.subheader("π¬ Analysis & Retrieval")
    if uploaded_file is not None and gemini_model and collection:
        # 1. Analyze Image
        analysis_text = ""
        analysis_error = False
        with st.status("π§ Analyzing image with Gemini Vision...", expanded=True) as status_gemini:
            # Successful analyses are served from Streamlit's cache on reruns.
            analysis_text, analysis_error = analyze_image_with_gemini(gemini_model, image_bytes)
            if analysis_error:
                # Short hint for the status label: the text between the first
                # and second colon of the message, if it contains a colon.
                failure_hint = analysis_text.split(':')[1].strip() if ':' in analysis_text else 'See details'
                status_gemini.update(label=f"β οΈ Analysis Failed/Blocked: {failure_hint}", state="error")
                st.error(f"**Analysis Output:** {analysis_text}")  # Show error/block message
            else:
                status_gemini.update(label="✅ Analysis Complete", state="complete")
                st.markdown("**Gemini Vision Analysis:**")
                st.markdown(analysis_text)

        # 2. Query Chroma if Analysis Succeeded
        if not analysis_error and analysis_text:
            st.markdown("---")
            st.subheader("π Related Information (RAG)")
            with st.status("π Searching knowledge base (Chroma DB)...", expanded=True) as status_chroma:
                chroma_results = query_chroma(collection, analysis_text, n_results=3)

                if chroma_results and chroma_results.get('documents') and chroma_results['documents'][0]:
                    # Results are nested one list per query text; only one query was sent.
                    hits = list(zip(
                        chroma_results['documents'][0],
                        chroma_results['metadatas'][0],
                        chroma_results['distances'][0],
                    ))
                    status_chroma.update(label=f"✅ Found {len(hits)} related entries.", state="complete")

                    for rank, (doc, meta, dist) in enumerate(hits, start=1):
                        similarity = 1.0 - dist  # For cosine distance

                        expander_title = f"Result {rank} (Similarity: {similarity:.4f}) | Source: {meta.get('source', 'N/A')}"
                        with st.expander(expander_title):
                            st.markdown("**Retrieved Text:**")
                            st.markdown(f"> {doc}")
                            st.markdown("**Metadata:**")
                            # Display metadata keys/values more nicely
                            for key, value in meta.items():
                                st.markdown(f"- **{key.replace('_', ' ').title()}:** `{value}`")

                            # Highlight linked image ID
                            if meta.get("IMAGE_ID"):
                                st.info(f"βΉοΈ Associated visual asset ID: `{meta['IMAGE_ID']}`")

                elif chroma_results is not None:  # Query ran, no results
                    # st.status only accepts "running", "complete" or "error";
                    # "warning" is rejected, so the search is marked as
                    # completed and the label carries the warning.
                    status_chroma.update(label="β οΈ No relevant information found.", state="complete")
                else:  # Error occurred during query (already logged and shown via st.error)
                    status_chroma.update(label="β Failed to retrieve results.", state="error")

    elif not uploaded_file:
        st.info("Analysis results will appear here once an image is uploaded.")
    else:
        st.error("β Analysis cannot proceed. Check if Gemini model or Chroma DB failed to initialize (see sidebar/logs).")

# Footer
st.markdown("---")
st.markdown("<div style='text-align: center; font-size: small;'>Powered by Google Gemini, Chroma DB, Hugging Face, and Streamlit</div>", unsafe_allow_html=True)
|
384 |
+
|
385 |
+
|