mgbam committed (verified) · Commit ed31030 · Parent: 228cbf8

Update app.py

Files changed (1):
  1. app.py +254 -268
app.py CHANGED
@@ -1,9 +1,16 @@
1
- # -*- coding: utf-8 -*-
2
  """
3
  Streamlit application for Medical Image Analysis using Google Gemini Vision
4
- and Retrieval-Augmented Generation (RAG) with Chroma DB.
5
-
6
- Optimized for deployment on Hugging Face Spaces.
7
  """
8
 
9
  # --- Imports ---
@@ -11,7 +18,6 @@ import streamlit as st
11
  import google.generativeai as genai
12
  import chromadb
13
  from chromadb.utils import embedding_functions
14
- from chromadb.api.types import EmbeddingFunction # For type hinting
15
  from PIL import Image
16
  import io
17
  import time
@@ -22,9 +28,24 @@ from typing import Optional, Dict, List, Any, Tuple
22
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
23
  logger = logging.getLogger(__name__)
24
 
25
- # --- Configuration Constants ---
26
- # Model and API Configuration
27
- GOOGLE_API_KEY_SECRET = "GOOGLE_API_KEY" # Name of the HF Secret
28
  VISION_MODEL_NAME = "gemini-pro-vision"
29
  GENERATION_CONFIG = {
30
  "temperature": 0.2,
@@ -38,269 +59,258 @@ SAFETY_SETTINGS = [
38
  {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
39
  {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
40
  ]
41
 
42
  # Chroma DB Configuration
43
- # Using persistent storage within the HF Space (relative path)
44
- # NOTE: Ensure your HF Space has persistent storage enabled if you need data to survive restarts.
45
- CHROMA_PATH = "chroma_data_hf"
46
- COLLECTION_NAME = "medical_docs_v2"
47
- # Embedding Function - Using Default (all-MiniLM-L6-v2).
48
- # For better medical relevance, consider models fine-tuned on biomedical text.
49
- # Examples (might require installing `sentence-transformers` explicitly):
50
- # - 'sentence-transformers/all-MiniLM-L6-v2' (Default, General Purpose)
51
- # - 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext' (Needs adapter usually)
52
- # - 'emilyalsentzer/Bio_ClinicalBERT' (Needs adapter usually)
53
- # Check Sentence Transformers documentation for loading Hugging Face models directly.
54
- # Make sure the model chosen is consistent between indexing and querying.
55
- EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" # Or specify a different HF model name
56
- CHROMA_DISTANCE_FUNCTION = "cosine" # Use cosine similarity
57
-
58
- # UI Configuration
59
- MAX_RAG_RESULTS = 3 # Number of results to fetch from Chroma
60
-
61
- # --- Initialization Functions with Caching ---
62
 
63
  @st.cache_resource
64
- def configure_google_ai() -> bool:
65
- """Configures the Google AI SDK using secrets."""
66
- try:
67
- google_api_key = st.secrets[GOOGLE_API_KEY_SECRET]
68
- genai.configure(api_key=google_api_key)
69
- logger.info("Google AI SDK configured successfully.")
70
- return True
71
- except KeyError:
72
- st.error(f"❌ **Error:** '{GOOGLE_API_KEY_SECRET}' not found in Hugging Face Secrets.")
73
- logger.error(f"Secret '{GOOGLE_API_KEY_SECRET}' not found.")
74
- return False
75
- except Exception as e:
76
- st.error(f"❌ **Error:** Failed to configure Google AI SDK: {e}")
77
- logger.error(f"Error configuring Google AI SDK: {e}", exc_info=True)
78
- return False
79
-
80
- @st.cache_resource
81
- def get_gemini_model() -> Optional[genai.GenerativeModel]:
82
  """Initializes and returns the Gemini Generative Model."""
83
- if not configure_google_ai():
84
- return None
85
  try:
 
86
  model = genai.GenerativeModel(
87
  model_name=VISION_MODEL_NAME,
88
  generation_config=GENERATION_CONFIG,
89
  safety_settings=SAFETY_SETTINGS
90
  )
91
- logger.info(f"Gemini Model '{VISION_MODEL_NAME}' initialized.")
92
  return model
93
  except Exception as e:
94
- st.error(f"❌ **Error:** Failed to initialize Gemini Model ({VISION_MODEL_NAME}): {e}")
95
- logger.error(f"Error initializing Gemini Model: {e}", exc_info=True)
 
96
  return None
97
 
98
  @st.cache_resource
99
- def get_embedding_function() -> Optional[EmbeddingFunction]:
100
- """Initializes and returns the embedding function."""
101
  try:
102
- # Using DefaultEmbeddingFunction which leverages sentence-transformers
103
- # Ensure sentence-transformers library is installed
104
- ef = embedding_functions.DefaultEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME)
105
- logger.info(f"Initialized embedding function with model: {EMBEDDING_MODEL_NAME}")
106
- return ef
107
  except Exception as e:
108
- st.error(f"❌ **Error:** Failed to initialize embedding function ({EMBEDDING_MODEL_NAME}): {e}")
109
- logger.error(f"Error initializing embedding function: {e}", exc_info=True)
110
  return None
111
 
112
  @st.cache_resource
113
- def get_chroma_collection() -> Optional[chromadb.Collection]:
114
- """Initializes ChromaDB client and returns the specified collection."""
115
- embedding_func = get_embedding_function()
116
- if not embedding_func:
117
  return None
118
-
119
  try:
120
  chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
121
- logger.info(f"ChromaDB client initialized with path: {CHROMA_PATH}")
122
-
123
  collection = chroma_client.get_or_create_collection(
124
  name=COLLECTION_NAME,
125
- embedding_function=embedding_func,
126
- metadata={"hnsw:space": CHROMA_DISTANCE_FUNCTION}
127
  )
128
- logger.info(f"ChromaDB collection '{COLLECTION_NAME}' loaded/created.")
129
  return collection
130
  except Exception as e:
131
- st.error(f"❌ **Error:** Failed to initialize Chroma DB collection '{COLLECTION_NAME}': {e}")
132
- st.info(f"ℹ️ Attempted path: '{CHROMA_PATH}'. Ensure write permissions and space.")
133
- logger.error(f"Error initializing Chroma DB: {e}", exc_info=True)
 
134
  return None
135
 
136
- # --- Helper Functions ---
137
 
138
- def analyze_image_with_gemini(gemini_model: genai.GenerativeModel, image_bytes: bytes) -> Tuple[Optional[str], bool]:
 
139
  """
140
- Analyzes image bytes with Gemini Vision.
141
-
142
- Args:
143
- gemini_model: The initialized Gemini model instance.
144
- image_bytes: The image data as bytes.
145
-
146
- Returns:
147
- A tuple containing:
148
- - The analysis text (str) or None if error/blocked.
149
- - A boolean indicating success (True) or failure/block (False).
150
  """
151
  try:
152
  img = Image.open(io.BytesIO(image_bytes))
153
- prompt = """Analyze this medical image (e.g., pathology slide, diagram, scan).
154
- Describe key visual features relevant for medical context (structures, cells, staining, anomalies).
155
- Identify potential findings:
156
- - Possible conditions or disease indicators
157
- - Pathological features (morphology, patterns)
158
- - Visible cell types or tissue structures
159
- - Relevant biomarkers (if suggested by visuals)
160
- - Anatomical context (if clear)
161
-
162
- Focus on visual evidence. Be concise. Avoid definitive diagnosis. State uncertainties clearly.
163
- """
164
- response = gemini_model.generate_content([prompt, img], stream=False) # Use stream=False for simpler handling here
165
- response.resolve() # Ensure response is fully processed if stream=True was used
166
 
167
  if not response.parts:
168
- reason = "Unknown reason"
169
  if response.prompt_feedback and response.prompt_feedback.block_reason:
170
- reason = response.prompt_feedback.block_reason.name # Get the reason enum name
171
- logger.warning(f"Gemini analysis blocked or empty. Reason: {reason}")
172
- st.warning(f"⚠️ Analysis blocked by safety filters or returned empty. Reason: {reason}")
173
- return None, False
174
-
175
  logger.info("Gemini analysis successful.")
176
- return response.text, True
177
 
178
  except genai.types.BlockedPromptException as e:
179
- logger.error(f"Gemini analysis blocked due to prompt: {e}")
180
- st.error(f"❌ **Analysis Blocked:** The prompt content triggered safety filters: {e}")
181
- return None, False
182
  except Exception as e:
183
- logger.error(f"Error during Gemini analysis: {e}", exc_info=True)
184
- st.error(f"❌ **Error:** An unexpected error occurred during Gemini analysis: {e}")
185
- return None, False
186
-
187
-
188
- def query_chroma(collection: chromadb.Collection, query_text: str, n_results: int = 3) -> Optional[Dict[str, List[Any]]]:
189
- """Queries the Chroma collection."""
 
 
190
  if not query_text:
191
- logger.warning("Chroma query attempted with empty text.")
192
- st.warning("⚠️ Cannot query knowledge base without analysis text.")
193
  return None
194
  try:
195
- results = collection.query(
196
- query_texts=[query_text],
197
  n_results=n_results,
198
  include=['documents', 'metadatas', 'distances']
199
  )
200
- logger.info(f"ChromaDB query executed successfully for text: '{query_text[:50]}...'")
201
  return results
202
  except Exception as e:
203
- logger.error(f"Error querying Chroma DB: {e}", exc_info=True)
204
- st.error(f"❌ **Error:** Failed to query the knowledge base: {e}")
 
205
  return None
206
 
207
- # Function to add dummy data (Consider moving to a separate setup script for cleaner app code)
208
- def add_dummy_data_to_chroma(collection: chromadb.Collection):
209
- """Adds predefined example medical text snippets to the Chroma collection."""
210
- st.info("Attempting to add dummy data to Chroma DB...")
211
- # --- (Same dummy data as before - Keep for demonstration) ---
212
- docs = [
213
- "Figure 1A shows adenocarcinoma of the lung, papillary subtype. Note the glandular structures and nuclear atypia. TTF-1 staining was positive.",
214
- "Pathology slide 34B demonstrates high-grade glioma (glioblastoma) with significant necrosis and microvascular proliferation. Ki-67 index was high.",
215
- "Diagram: EGFR signaling pathway mutations in NSCLC targeted by TKIs.", # Shorter version
216
- "Micrograph: Chronic gastritis with H. pylori organisms (special stain needed). Mild intestinal metaplasia noted.", # Shorter
217
- "Slide CJD-02: Spongiform changes in cerebral cortex characteristic of prion disease. Gliosis present." # Shorter
218
- ]
219
- metadatas = [
220
- {"source": "Example Paper 1", "topic": "Lung Cancer Pathology", "entities": "adenocarcinoma, lung cancer, glandular structures, nuclear atypia, papillary subtype, TTF-1", "IMAGE_ID": "fig_1a_adeno_lung.png"},
221
- {"source": "Path Report 789", "topic": "Brain Tumor Pathology", "entities": "high-grade glioma, glioblastoma, necrosis, microvascular proliferation, Ki-67", "IMAGE_ID": "slide_34b_gbm.tiff"},
222
- {"source": "Textbook Chapter 5", "topic": "Molecular Oncology", "entities": "EGFR, TKIs, NSCLC, signaling pathway", "IMAGE_ID": "diagram_egfr_pathway.svg"},
223
- {"source": "Path Report 101", "topic": "Gastrointestinal Pathology", "entities": "chronic gastritis, Helicobacter pylori, intestinal metaplasia", "IMAGE_ID": "micrograph_h_pylori_gastritis.jpg"},
224
- {"source": "Case Study CJD", "topic": "Neuropathology", "entities": "prion disease, Spongiform changes, Gliosis, cerebral cortex", "IMAGE_ID": "slide_cjd_sample_02.jpg"}
225
- ]
226
- # Generate potentially more stable IDs for demo purposes if needed, but time-based is fine too
227
- # Example: ids = [f"dummy_doc_{i+1}" for i in range(len(docs))]
228
- ids = [f"doc_{int(time.time())}_{i}" for i in range(len(docs))]
229
 
 
230
  try:
231
- # Simple check if *any* of these specific texts exist (for demo)
232
  existing_docs = collection.get(where={"$or": [{"document": doc} for doc in docs]}, include=[])
233
  if not existing_docs or not existing_docs.get('ids'):
234
  collection.add(
235
  documents=docs,
236
  metadatas=metadatas,
237
  ids=ids
238
  )
239
- logger.info(f"Added {len(docs)} dummy documents to Chroma collection '{COLLECTION_NAME}'.")
240
- st.success(f"✅ Added {len(docs)} dummy documents to Chroma collection '{COLLECTION_NAME}'.")
241
  else:
242
- logger.warning("Dummy data check indicates data might already exist. Skipping addition.")
243
- st.warning("⚠️ Dummy data seems to already exist in the collection. No new data added.")
244
 
245
  except Exception as e:
246
- logger.error(f"Error adding dummy data to Chroma: {e}", exc_info=True)
247
- st.error(f"❌ **Error:** Could not add dummy data to Chroma: {e}")
 
248
 
249
- # --- Streamlit UI ---
250
- st.set_page_config(layout="wide", page_title="Medical Image RAG - HF", page_icon="⚕️")
251
 
252
- st.title("⚕️ Medical Image Analysis & RAG")
253
- st.markdown("""
254
- *Powered by Google Gemini, ChromaDB, and Streamlit on Hugging Face Spaces*
255
- """)
256
 
257
- # --- CRITICAL DISCLAIMER ---
258
  st.warning("""
259
- **⚠️ Disclaimer:** This tool is for informational and illustrative purposes ONLY.
260
- It is **NOT** a medical device and **CANNOT** provide a diagnosis. AI analysis may be
261
- imperfect or incomplete. **ALWAYS** consult qualified medical professionals for any
262
- health concerns or decisions. Do **NOT** rely solely on this tool for medical judgment.
263
  """)
264
 
265
- # --- Initialize Services ---
266
- gemini_model = get_gemini_model()
267
- chroma_collection = get_chroma_collection()
268
-
269
- # Check if critical components failed initialization
270
- if not gemini_model or not chroma_collection:
271
- st.error("❌ Critical components failed to initialize. Cannot proceed. Check logs and secrets.")
272
- st.stop() # Stop execution if core components aren't ready
273
-
274
 
275
- # --- Sidebar Controls ---
276
  with st.sidebar:
277
  st.header("⚙️ Controls")
278
  uploaded_file = st.file_uploader(
279
- "1. Upload Medical Image",
280
  type=["jpg", "jpeg", "png", "tiff", "webp"],
281
- help="Upload formats like pathology slides, diagrams, scans."
282
  )
283
 
284
  st.divider()
285
 
286
- st.header("📚 Knowledge Base")
287
- if st.button("➕ Add Dummy KB Data", help="Add example text data to the Chroma vector database for demonstration."):
288
- if chroma_collection:
289
- add_dummy_data_to_chroma(chroma_collection)
290
- else:
291
- st.error("❌ Chroma DB not available to add data.")
 
292
 
293
  st.info(f"""
294
- **KB Info:**
295
- - **Collection:** `{COLLECTION_NAME}`
296
- - **Storage:** `{CHROMA_PATH}` (in Space storage)
297
- - **Embeddings:** `{EMBEDDING_MODEL_NAME}`
298
- - **Similarity:** `{CHROMA_DISTANCE_FUNCTION}`
299
  """)
300
- st.caption("Note: Data persists if persistent storage is enabled for this Space, otherwise it's temporary.")
 
301
 
302
-
303
- # --- Main Processing Area ---
304
  col1, col2 = st.columns(2)
305
 
306
  with col1:
@@ -309,91 +319,67 @@ with col1:
309
  image_bytes = uploaded_file.getvalue()
310
  st.image(image_bytes, caption=f"Uploaded: {uploaded_file.name}", use_column_width=True)
311
  else:
312
- st.info("Upload an image using the sidebar to begin analysis.")
313
 
314
  with col2:
315
- st.subheader("🤖 AI Analysis & Retrieval")
316
- if uploaded_file is not None and gemini_model and chroma_collection:
317
- analysis_text = None
318
- analysis_successful = False
319
-
320
- # Step 1: Analyze Image with Gemini
321
- with st.status("🧠 Analyzing image with Gemini Vision...", expanded=False) as status_analysis:
322
- try:
323
- st.write("Sending image to Gemini...")
324
- analysis_text, analysis_successful = analyze_image_with_gemini(gemini_model, image_bytes)
325
- if analysis_successful:
326
- st.write("Analysis complete.")
327
- status_analysis.update(label="✅ Analysis Complete", state="complete")
328
- else:
329
- # Error/block message already shown by helper function
330
- status_analysis.update(label="⚠️ Analysis Failed or Blocked", state="error")
331
-
332
- except Exception as e: # Catch potential unexpected errors here too
333
- logger.error(f"Unhandled error during analysis status block: {e}", exc_info=True)
334
- st.error(f"❌ An unexpected error occurred during the analysis process: {e}")
335
- status_analysis.update(label="💥 Analysis Error", state="error")
336
- analysis_successful = False # Ensure flag is False
337
-
338
- # Display Analysis Result if successful
339
- if analysis_successful and analysis_text:
340
- st.markdown("**🔬 Gemini Vision Analysis:**")
341
- st.markdown(analysis_text)
342
- st.divider() # Separator
343
-
344
- # Step 2: Query Chroma DB with Analysis Text
345
- st.markdown("**📚 Related Information (RAG via Chroma DB):**")
346
- with st.status("🔍 Searching knowledge base...", expanded=True) as status_query:
347
- try:
348
- st.write(f"Querying with analysis summary (top {MAX_RAG_RESULTS} results)...")
349
- chroma_results = query_chroma(chroma_collection, analysis_text, n_results=MAX_RAG_RESULTS)
350
-
351
- if chroma_results and chroma_results.get('documents') and chroma_results['documents'][0]:
352
- num_results = len(chroma_results['documents'][0])
353
- st.write(f"Found {num_results} related entries.")
354
- status_query.update(label=f"✅ Found {num_results} results", state="complete")
355
-
356
- # Display RAG Results
357
- for i in range(num_results):
358
- doc = chroma_results['documents'][0][i]
359
- meta = chroma_results['metadatas'][0][i]
360
- dist = chroma_results['distances'][0][i]
361
- similarity = 1.0 - dist # For cosine distance
362
-
363
- expander_title = f"Result {i+1} (Similarity: {similarity:.3f}) - Source: {meta.get('source', 'N/A')}"
364
- with st.expander(expander_title):
365
- st.markdown("**Retrieved Text:**")
366
- st.markdown(f"> {doc}")
367
- st.markdown("**Metadata:**")
368
- # Nicer metadata display
369
- meta_display = {k: v for k, v in meta.items() if v} # Filter empty values
370
- st.json(meta_display, expanded=False)
371
-
372
- # Provide link/info if related image exists
373
- if meta.get("IMAGE_ID"):
374
- st.info(f"ℹ️ Associated Visual: `{meta['IMAGE_ID']}`")
375
-
376
- elif chroma_results is not None: # Query ran, no results
377
- st.warning("⚠️ No relevant information found in the knowledge base for this analysis.")
378
- status_query.update(label="⚠️ No results found", state="warning")
379
- else: # Query failed (error handled in query_chroma)
380
- status_query.update(label="💥 Query Error", state="error")
381
-
382
- except Exception as e:
383
- logger.error(f"Unhandled error during query status block: {e}", exc_info=True)
384
- st.error(f"❌ An unexpected error occurred during the knowledge base search: {e}")
385
- status_query.update(label="💥 Query Process Error", state="error")
386
-
387
- elif not analysis_successful:
388
- st.info("Cannot proceed to knowledge base search as image analysis failed or was blocked.")
389
 
390
  elif not uploaded_file:
391
- st.info("Analysis results and related information will appear here once an image is uploaded and processed.")
392
  else:
393
- # This case means initialization failed earlier, message already shown.
394
- st.info("Waiting for components to initialize...")
395
-
396
 
397
- # --- Footer ---
398
  st.markdown("---")
399
- st.caption("Ensure responsible use. Verify all findings with qualified professionals.")
 
 
 
1
+ # --- Docstring ---
2
  """
3
  Streamlit application for Medical Image Analysis using Google Gemini Vision
4
+ and Retrieval-Augmented Generation (RAG) with Chroma DB, enhanced for
5
+ Hugging Face Spaces deployment and improved practices.
6
+
7
+ Features:
8
+ - Image analysis via Google Gemini Pro Vision.
9
+ - RAG using Chroma DB with Hugging Face embeddings.
10
+ - Caching for performance.
11
+ - Basic logging.
12
+ - Improved UX and error handling.
13
+ - Explicit Disclaimer.
14
  """
15
 
16
  # --- Imports ---
 
18
  import google.generativeai as genai
19
  import chromadb
20
  from chromadb.utils import embedding_functions
 
21
  from PIL import Image
22
  import io
23
  import time
 
28
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
29
  logger = logging.getLogger(__name__)
30
 
31
+ # --- Application Configuration ---
32
+ # Secrets Management (Prioritize Hugging Face Secrets)
33
+ try:
34
+ GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
35
+ # HF_TOKEN is optional for many public models, but required for gated/private ones
36
+ HF_TOKEN = st.secrets.get("HF_TOKEN") # Use .get() for optional token
37
+ except KeyError as e:
38
+ err_msg = f"❌ Missing Secret: {e}. Please add it to your Hugging Face Space secrets."
39
+ st.error(err_msg)
40
+ logger.error(err_msg)
41
+ st.stop()
42
+ except Exception as e:
43
+ err_msg = f"❌ Error loading secrets: {e}"
44
+ st.error(err_msg)
45
+ logger.error(err_msg)
46
+ st.stop()
47
+
48
+ # Gemini Configuration
49
  VISION_MODEL_NAME = "gemini-pro-vision"
50
  GENERATION_CONFIG = {
51
  "temperature": 0.2,
 
59
  {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
60
  {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
61
  ]
62
+ GEMINI_ANALYSIS_PROMPT = """Analyze this medical image (e.g., pathology slide, diagram, scan).
63
+ Describe the key visual features relevant to a medical context.
64
+ Identify potential:
65
+ - Diseases or conditions indicated
66
+ - Pathological findings (e.g., cellular morphology, tissue structure, staining patterns)
67
+ - Visible cell types
68
+ - Relevant biomarkers (if inferable from staining or morphology)
69
+ - Anatomical context (if discernible)
70
+
71
+ Be concise and focus primarily on visually evident information. Avoid definitive diagnoses.
72
+ Structure the output clearly, perhaps using bullet points for findings.
73
+ """
74
 
75
  # Chroma DB Configuration
76
+ CHROMA_PATH = "chroma_data_hf" # Use a distinct path if needed
77
+ COLLECTION_NAME = "medical_docs_hf"
78
+ # IMPORTANT: Choose an appropriate HF embedding model. 'all-mpnet-base-v2' is general purpose.
79
+ # For better medical results, consider models like:
80
+ # - 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext' (might need more RAM/compute)
81
+ # - 'dmis-lab/sapbert-from-pubmedbert-sentencetransformer'
82
+ # - Other models tagged 'medical' or 'biomedical' on Hugging Face Hub.
83
+ # Ensure the chosen model is compatible with chromadb's HuggingFaceEmbeddingFunction.
84
+ EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" # <-- REPLACE if possible
85
+ CHROMA_DISTANCE_METRIC = "cosine"
86
+
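For deployments that would rather not call the Hugging Face Inference API for embeddings, the model named above can also be run locally. The sketch below is illustrative only (it assumes the `sentence-transformers` package is installed) and is not part of the committed code:

    # Illustrative alternative: compute embeddings in-process instead of via the HF Inference API.
    from chromadb.utils import embedding_functions

    local_embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="sentence-transformers/all-mpnet-base-v2"  # same model as EMBEDDING_MODEL_NAME above
    )
    # The resulting object can be passed to get_or_create_collection(embedding_function=...)
    # in place of the HuggingFaceEmbeddingFunction configured below; no HF_TOKEN is needed
    # for public models, at the cost of downloading and running the model locally.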
87
+ # --- Caching Resource Initialization ---
88
 
89
  @st.cache_resource
90
+ def initialize_gemini_model() -> Optional[genai.GenerativeModel]:
91
  """Initializes and returns the Gemini Generative Model."""
 
 
92
  try:
93
+ genai.configure(api_key=GOOGLE_API_KEY)
94
  model = genai.GenerativeModel(
95
  model_name=VISION_MODEL_NAME,
96
  generation_config=GENERATION_CONFIG,
97
  safety_settings=SAFETY_SETTINGS
98
  )
99
+ logger.info(f"Successfully initialized Gemini Model: {VISION_MODEL_NAME}")
100
  return model
101
  except Exception as e:
102
+ err_msg = f"❌ Error initializing Gemini Model ({VISION_MODEL_NAME}): {e}"
103
+ st.error(err_msg)
104
+ logger.error(err_msg, exc_info=True)
105
  return None
106
 
107
  @st.cache_resource
108
+ def initialize_embedding_function() -> Optional[embedding_functions.HuggingFaceEmbeddingFunction]:
109
+ """Initializes and returns the Hugging Face Embedding Function."""
110
  try:
111
+ # Pass HF_TOKEN if it exists (required for private/gated models)
112
+ # HF_TOKEN comes from st.secrets.get() and may be None; it is passed directly below.
113
+ embed_func = embedding_functions.HuggingFaceEmbeddingFunction(
114
+ api_key=HF_TOKEN, # Pass token here if needed by model
115
+ model_name=EMBEDDING_MODEL_NAME
116
+ )
117
+ logger.info(f"Successfully initialized HuggingFace Embedding Function: {EMBEDDING_MODEL_NAME}")
118
+ return embed_func
119
  except Exception as e:
120
+ err_msg = f"❌ Error initializing HuggingFace Embedding Function ({EMBEDDING_MODEL_NAME}): {e}"
121
+ st.error(err_msg)
122
+ logger.error(err_msg, exc_info=True)
123
+ st.info("ℹ️ Make sure the embedding model name is correct and you have network access. "
124
+ "If using a private model, ensure HF_TOKEN is set in secrets.")
125
  return None
126
 
127
  @st.cache_resource
128
+ def initialize_chroma_collection(_embedding_func: embedding_functions.EmbeddingFunction) -> Optional[chromadb.Collection]:
129
+ """Initializes the Chroma DB client and returns the collection."""
130
+ if not _embedding_func:
131
+ st.error("❌ Cannot initialize Chroma DB without a valid embedding function.")
132
  return None
 
133
  try:
134
  chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
 
 
135
  collection = chroma_client.get_or_create_collection(
136
  name=COLLECTION_NAME,
137
+ embedding_function=_embedding_func, # Pass the initialized function
138
+ metadata={"hnsw:space": CHROMA_DISTANCE_METRIC}
139
  )
140
+ logger.info(f"Chroma DB collection '{COLLECTION_NAME}' loaded/created at '{CHROMA_PATH}' using {CHROMA_DISTANCE_METRIC}.")
141
  return collection
142
  except Exception as e:
143
+ err_msg = f"❌ Error initializing Chroma DB at '{CHROMA_PATH}': {e}"
144
+ st.error(err_msg)
145
+ logger.error(err_msg, exc_info=True)
146
+ st.info(f"ℹ️ Ensure the path '{CHROMA_PATH}' is writable.")
147
  return None
148
 
149
+ # --- Core Logic Functions (with Caching for Data Operations) ---
150
 
151
+ @st.cache_data(show_spinner=False) # Show spinner manually in UI
152
+ def analyze_image_with_gemini(_gemini_model: genai.GenerativeModel, image_bytes: bytes) -> Tuple[str, bool]:
153
  """
154
+ Analyzes image bytes with Gemini, returns (analysis_text, is_error).
155
156
  """
157
+ if not _gemini_model:
158
+ return "Error: Gemini model not initialized.", True
159
+
160
  try:
161
  img = Image.open(io.BytesIO(image_bytes))
162
+ response = _gemini_model.generate_content([GEMINI_ANALYSIS_PROMPT, img])
163
 
164
  if not response.parts:
 
165
  if response.prompt_feedback and response.prompt_feedback.block_reason:
166
+ reason = response.prompt_feedback.block_reason
167
+ msg = f"Analysis blocked by safety settings: {reason}"
168
+ logger.warning(msg)
169
+ return msg, True # Indicate block/error state
170
+ else:
171
+ msg = "Error: Gemini analysis returned no content (empty or invalid response)."
172
+ logger.error(msg)
173
+ return msg, True
174
  logger.info("Gemini analysis successful.")
175
+ return response.text, False # Indicate success
176
 
177
  except genai.types.BlockedPromptException as e:
178
+ msg = f"Analysis blocked (prompt issue): {e}"
179
+ logger.warning(msg)
180
+ return msg, True
181
  except Exception as e:
182
+ msg = f"Error during Gemini analysis: {e}"
183
+ logger.error(msg, exc_info=True)
184
+ return msg, True
185
+
186
+ @st.cache_data(show_spinner=False)
187
+ def query_chroma(_collection: chromadb.Collection, query_text: str, n_results: int = 5) -> Optional[Dict[str, List[Any]]]:
188
+ """Queries Chroma DB, returns results dict or None on error."""
189
+ if not _collection:
190
+ return None
191
  if not query_text:
192
+ logger.warning("Attempted to query Chroma with empty text.")
 
193
  return None
194
  try:
195
+ # Placeholder for potential query refinement:
196
+ # refined_query = refine_query_for_chroma(query_text) # Implement this if needed
197
+ refined_query = query_text # Using direct analysis text for now
198
+
199
+ results = _collection.query(
200
+ query_texts=[refined_query],
201
  n_results=n_results,
202
  include=['documents', 'metadatas', 'distances']
203
  )
204
+ logger.info(f"Chroma query successful for text snippet: '{query_text[:50]}...'")
205
  return results
206
  except Exception as e:
207
+ err_msg = f"Error querying Chroma DB: {e}"
208
+ st.error(err_msg) # Show error in UI as well
209
+ logger.error(err_msg, exc_info=True)
210
  return None
211
 
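The `refine_query_for_chroma` call commented out inside `query_chroma` above is only a placeholder. One minimal interpretation, sketched here under the assumption that a simple sentence-truncation heuristic is enough (the helper does not exist in this commit), would be:

    # Hypothetical helper for the placeholder above: keep the query focused by
    # embedding only the first few sentences of the Gemini analysis text.
    def refine_query_for_chroma(query_text: str, max_sentences: int = 3) -> str:
        sentences = [s.strip() for s in query_text.split(".") if s.strip()]
        return ". ".join(sentences[:max_sentences])

A more involved refinement step could instead extract named entities or key findings before querying.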
212
+ def add_dummy_data_to_chroma(collection: chromadb.Collection, embedding_func: embedding_functions.EmbeddingFunction):
213
+ """Adds example medical text snippets to Chroma using the provided embedding function."""
214
+ if not collection or not embedding_func:
215
+ st.error("❌ Cannot add dummy data: Chroma Collection or Embedding Function not available.")
216
+ return
217
 
218
+ status = st.status("Adding dummy data to Chroma DB...", expanded=False)
219
  try:
220
+ # --- Dummy Data Definition ---
221
+ # (Same data as before, but ensure metadata is useful)
222
+ docs = [
223
+ "Figure 1A shows adenocarcinoma of the lung, papillary subtype. Note the glandular structures and nuclear atypia. TTF-1 staining was positive.",
224
+ "Pathology slide 34B demonstrates high-grade glioma (glioblastoma) with significant necrosis and microvascular proliferation. Ki-67 index was high.",
225
+ "This diagram illustrates the EGFR signaling pathway and common mutation sites targeted by tyrosine kinase inhibitors in non-small cell lung cancer.",
226
+ "Micrograph showing chronic gastritis with Helicobacter pylori organisms (visible with special stain, not shown here). Mild intestinal metaplasia is present.",
227
+ "Slide CJD-Sample-02: Spongiform changes characteristic of prion disease are evident in the cerebral cortex. Gliosis is also noted."
228
+ ]
229
+ metadatas = [
230
+ {"source": "Example Paper 1", "topic": "Lung Cancer Pathology", "entities": "adenocarcinoma, lung cancer, glandular structures, nuclear atypia, papillary subtype, TTF-1", "IMAGE_ID": "fig_1a_adeno_lung.png"},
231
+ {"source": "Path Report 789", "topic": "Brain Tumor Pathology", "entities": "high-grade glioma, glioblastoma, necrosis, microvascular proliferation, Ki-67", "IMAGE_ID": "slide_34b_gbm.tiff"},
232
+ {"source": "Textbook Chapter 5", "topic": "Molecular Oncology Pathways", "entities": "EGFR, tyrosine kinase inhibitors, non-small cell lung cancer", "IMAGE_ID": "diagram_egfr_pathway.svg"},
233
+ {"source": "Path Report 101", "topic": "Gastrointestinal Pathology", "entities": "chronic gastritis, Helicobacter pylori, intestinal metaplasia", "IMAGE_ID": "micrograph_h_pylori_gastritis.jpg"},
234
+ {"source": "Case Study CJD", "topic": "Neuropathology", "entities": "prion disease, Spongiform changes, Gliosis, cerebral cortex", "IMAGE_ID": "slide_cjd_sample_02.jpg"}
235
+ ]
236
+ ids = [f"doc_hf_{int(time.time())}_{i}" for i in range(len(docs))]
237
+
238
+ # Check for existing documents (simple check based on text)
239
+ status.update(label="Checking for existing dummy documents...")
240
  existing_docs = collection.get(where={"$or": [{"document": doc} for doc in docs]}, include=[])
241
  if not existing_docs or not existing_docs.get('ids'):
242
+ status.update(label=f"Generating embeddings for {len(docs)} documents (may take time)...")
243
+ # Embeddings are generated implicitly by ChromaDB during .add()
244
+ # when an embedding_function is configured for the collection.
245
  collection.add(
246
  documents=docs,
247
  metadatas=metadatas,
248
  ids=ids
249
  )
250
+ status.update(label=f"✅ Added {len(docs)} dummy documents.", state="complete")
251
+ logger.info(f"Added {len(docs)} dummy documents to collection '{COLLECTION_NAME}'.")
252
  else:
253
+ status.update(label="⚠️ Dummy data already exists. No new data added.", state="complete")
254
+ logger.warning("Dummy data seems to already exist in the collection based on text match.")
255
 
256
  except Exception as e:
257
+ err_msg = f"Error adding dummy data to Chroma: {e}"
258
+ status.update(label=f"❌ Error: {err_msg}", state="error")
259
+ logger.error(err_msg, exc_info=True)
260
 
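Note that the duplicate check in `add_dummy_data_to_chroma` uses a metadata `where` filter on a `document` key, but Chroma's `where` clause matches metadata fields rather than stored document text, so the check can pass every time and re-insert the demo rows under new time-based IDs. A simpler pattern, sketched here with hypothetical fixed IDs (`dummy_doc_<i>`) that this commit does not use, is to make the IDs deterministic and look them up directly:

    # Sketch: deterministic IDs make the existence check trivial and the demo data idempotent.
    ids = [f"dummy_doc_{i}" for i in range(len(docs))]   # hypothetical fixed IDs
    existing = collection.get(ids=ids)                   # returns only the IDs that already exist
    if not existing["ids"]:
        collection.add(documents=docs, metadatas=metadatas, ids=ids)
    # With time-based IDs, every rerun inserts a fresh copy of the same text.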
261
+ # --- Initialize Resources ---
262
+ # These calls use @st.cache_resource, so they run only once per session/resource change.
263
+ gemini_model = initialize_gemini_model()
264
+ embedding_func = initialize_embedding_function()
265
+ collection = initialize_chroma_collection(embedding_func) # Pass embedding func to chroma init
266
 
267
+ # --- Streamlit UI ---
268
+ st.set_page_config(layout="wide", page_title="Medical Image Analysis & RAG (HF)")
269
+ st.title("⚕️ Medical Image Analysis & RAG (Hugging Face Enhanced)")
 
270
 
271
+ # --- DISCLAIMER ---
272
  st.warning("""
273
+ **⚠️ Disclaimer:** This tool is for demonstration and informational purposes ONLY.
274
+ It is **NOT** a medical device and should **NOT** be used for actual medical diagnosis, treatment, or decision-making.
275
+ AI analysis can be imperfect. Always consult with qualified healthcare professionals for any medical concerns.
276
+ Do **NOT** upload identifiable patient data (PHI).
277
  """)
278
 
279
+ st.markdown("""
280
+ Upload a medical image. Gemini Vision will analyze it, and related information
281
+ will be retrieved from a Chroma DB knowledge base using Hugging Face embeddings.
282
+ """)
283
 
284
+ # Sidebar
285
  with st.sidebar:
286
  st.header("⚙️ Controls")
287
  uploaded_file = st.file_uploader(
288
+ "Choose an image...",
289
  type=["jpg", "jpeg", "png", "tiff", "webp"],
290
+ help="Upload a medical image file (e.g., pathology, diagram)."
291
  )
292
 
293
  st.divider()
294
 
295
+ if st.button("➕ Add/Verify Dummy KB Data", help="Adds example text data to Chroma DB if it doesn't exist."):
296
+ if collection and embedding_func:
297
+ add_dummy_data_to_chroma(collection, embedding_func)
298
+ else:
299
+ st.error("❌ Cannot add dummy data: Chroma Collection or Embedding Function failed to initialize.")
300
+
301
+ st.divider()
302
 
303
  st.info(f"""
304
+ **Setup Info:**
305
+ - Gemini Model: `{VISION_MODEL_NAME}`
306
+ - Embedding Model: `{EMBEDDING_MODEL_NAME}`
307
+ - Chroma Collection: `{COLLECTION_NAME}` (at `{CHROMA_PATH}`)
308
+ - Distance Metric: `{CHROMA_DISTANCE_METRIC}`
309
  """)
310
+ st.caption(f"Using Google API Key: {'*' * (len(GOOGLE_API_KEY)-4)}{GOOGLE_API_KEY[-4:]}" if GOOGLE_API_KEY else "Not Set")
311
+ st.caption(f"Using HF Token: {'Provided' if HF_TOKEN else 'Not Provided'}")
312
 
313
+ # Main Display Area
 
314
  col1, col2 = st.columns(2)
315
 
316
  with col1:
 
319
  image_bytes = uploaded_file.getvalue()
320
  st.image(image_bytes, caption=f"Uploaded: {uploaded_file.name}", use_column_width=True)
321
  else:
322
+ st.info("Upload an image using the sidebar to begin.")
323
 
324
  with col2:
325
+ st.subheader("🔬 Analysis & Retrieval")
326
+ if uploaded_file is not None and gemini_model and collection:
327
+ # 1. Analyze Image
328
+ analysis_text = ""
329
+ analysis_error = False
330
+ with st.status("🧠 Analyzing image with Gemini Vision...", expanded=True) as status_gemini:
331
+ # The actual analysis function is cached via @st.cache_data
332
+ analysis_text, analysis_error = analyze_image_with_gemini(gemini_model, image_bytes)
333
+ if analysis_error:
334
+ status_gemini.update(label=f"⚠️ Analysis Failed/Blocked: {analysis_text.split(':', 1)[1].strip() if ':' in analysis_text else 'See details'}", state="error")
335
+ st.error(f"**Analysis Output:** {analysis_text}") # Show error/block message
336
+ else:
337
+ status_gemini.update(label="✅ Analysis Complete", state="complete")
338
+ st.markdown("**Gemini Vision Analysis:**")
339
+ st.markdown(analysis_text)
340
+
341
+ # 2. Query Chroma if Analysis Succeeded
342
+ if not analysis_error and analysis_text:
343
+ st.markdown("---")
344
+ st.subheader("📚 Related Information (RAG)")
345
+ with st.status("🔍 Searching knowledge base (Chroma DB)...", expanded=True) as status_chroma:
346
+ # The actual query function is cached via @st.cache_data
347
+ chroma_results = query_chroma(collection, analysis_text, n_results=3)
348
+
349
+ if chroma_results and chroma_results.get('documents') and chroma_results['documents'][0]:
350
+ num_results = len(chroma_results['documents'][0])
351
+ status_chroma.update(label=f"✅ Found {num_results} related entries.", state="complete")
352
+
353
+ for i in range(num_results):
354
+ doc = chroma_results['documents'][0][i]
355
+ meta = chroma_results['metadatas'][0][i]
356
+ dist = chroma_results['distances'][0][i]
357
+ similarity = 1.0 - dist # For cosine distance
358
+
359
+ expander_title = f"Result {i+1} (Similarity: {similarity:.4f}) | Source: {meta.get('source', 'N/A')}"
360
+ with st.expander(expander_title):
361
+ st.markdown("**Retrieved Text:**")
362
+ st.markdown(f"> {doc}")
363
+ st.markdown("**Metadata:**")
364
+ # Display metadata keys/values more nicely
365
+ for key, value in meta.items():
366
+ st.markdown(f"- **{key.replace('_', ' ').title()}:** `{value}`")
367
+
368
+ # Highlight linked image ID
369
+ if meta.get("IMAGE_ID"):
370
+ st.info(f"ℹ️ Associated visual asset ID: `{meta['IMAGE_ID']}`")
371
+
372
+ elif chroma_results is not None: # Query ran, no results
373
+ status_chroma.update(label="⚠️ No relevant information found.", state="warning")
374
+ else: # Error occurred during query (already logged and shown via st.error)
375
+ status_chroma.update(label="❌ Failed to retrieve results.", state="error")
376
 
377
  elif not uploaded_file:
378
+ st.info("Analysis results will appear here once an image is uploaded.")
379
  else:
380
+ st.error("❌ Analysis cannot proceed. Check if Gemini model or Chroma DB failed to initialize (see sidebar/logs).")
 
 
381
 
 
382
  st.markdown("---")
383
+ st.markdown("<div style='text-align: center; font-size: small;'>Powered by Google Gemini, Chroma DB, Hugging Face, and Streamlit</div>", unsafe_allow_html=True)
384
+
385
+