mgbam committed on
Commit
228cbf8
·
verified ·
1 Parent(s): 8c51c2c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +287 -210
app.py CHANGED
@@ -1,14 +1,9 @@
1
- # --- Docstring ---
2
  """
3
  Streamlit application for Medical Image Analysis using Google Gemini Vision
4
  and Retrieval-Augmented Generation (RAG) with Chroma DB.
5
 
6
- Allows users to upload a medical image (pathology slide, diagram, etc.).
7
- 1. The image is analyzed by Google's Gemini Pro Vision model to generate a
8
- textual description of key features.
9
- 2. This description is then used as a query to a Chroma vector database
10
- (populated with example medical text snippets) to retrieve relevant
11
- information from a simulated knowledge base.
12
  """
13
 
14
  # --- Imports ---
@@ -16,38 +11,27 @@ import streamlit as st
16
  import google.generativeai as genai
17
  import chromadb
18
  from chromadb.utils import embedding_functions
 
19
  from PIL import Image
20
  import io
21
- import time # Used for generating unique IDs for Chroma DB demo data
22
- from typing import Optional, Dict, List, Any # For type hinting
23
-
24
- # --- Configuration ---
25
- try:
26
- # Attempt to load the Google API key from Streamlit secrets
27
- GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
28
- genai.configure(api_key=GOOGLE_API_KEY)
29
- except KeyError:
30
- st.error("❌ GOOGLE_API_KEY not found in Streamlit secrets! Please add it.")
31
- st.stop()
32
- except Exception as e:
33
- st.error(f"❌ Error configuring Google AI SDK: {e}")
34
- st.stop()
35
-
36
- # --- Gemini Model Setup ---
37
- # Define the specific Gemini model to use (ensure it's a vision-capable model)
38
- VISION_MODEL_NAME = "gemini-pro-vision"
39
 
40
- # Configure generation parameters for the model
41
- # Lower temperature for more deterministic, factual descriptions
 
 
42
  GENERATION_CONFIG = {
43
  "temperature": 0.2,
44
  "top_p": 0.95,
45
  "top_k": 40,
46
  "max_output_tokens": 1024,
47
  }
48
-
49
- # Configure safety settings (adjust thresholds as needed for medical content)
50
- # Blocking potentially sensitive content might be necessary depending on the images
51
  SAFETY_SETTINGS = [
52
  {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
53
  {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
@@ -55,268 +39,361 @@ SAFETY_SETTINGS = [
55
  {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
56
  ]
57
 
58
- # Initialize the Gemini Generative Model
59
- try:
60
- gemini_model = genai.GenerativeModel(
61
- model_name=VISION_MODEL_NAME,
62
- generation_config=GENERATION_CONFIG,
63
- safety_settings=SAFETY_SETTINGS
64
- )
65
- st.success(f"βœ… Initialized Gemini Model: {VISION_MODEL_NAME}")
66
- except Exception as e:
67
- st.error(f"❌ Error initializing Gemini Model ({VISION_MODEL_NAME}): {e}")
68
- st.stop()
69
-
70
- # --- Chroma DB Setup ---
71
- # Using persistent storage within the Streamlit deployment environment (e.g., HF Space)
72
- # NOTE: Data will be lost if the persistent storage is wiped or the environment resets.
73
- # For production, consider a managed Chroma instance or alternative database.
74
- CHROMA_PATH = "chroma_data"
75
- COLLECTION_NAME = "medical_docs"
76
-
77
- # Define the embedding function.
78
- # Using a default Sentence Transformer model (runs locally on CPU).
79
- # IMPORTANT: The embedding model used for querying MUST match the one used
80
- # when initially adding data to the collection.
81
- # For improved performance/relevance on medical text, consider fine-tuned
82
- # medical domain-specific embedding models if available.
83
- embedding_func = embedding_functions.DefaultEmbeddingFunction()
84
-
85
- try:
86
- # Initialize Chroma DB client with persistence
87
- chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
88
-
89
- # Get or create the collection, specifying the embedding function and distance metric
90
- # Using cosine distance is common for text similarity tasks.
91
- collection = chroma_client.get_or_create_collection(
92
- name=COLLECTION_NAME,
93
- embedding_function=embedding_func,
94
- metadata={"hnsw:space": "cosine"} # Specify cosine distance metric
95
- )
96
- st.success(f"βœ… Chroma DB collection '{COLLECTION_NAME}' loaded/created at '{CHROMA_PATH}'.")
97
- except Exception as e:
98
- st.error(f"❌ Error initializing Chroma DB at '{CHROMA_PATH}': {e}")
99
- st.info("ℹ️ If this is the first run, the 'chroma_data' directory will be created.")
100
- st.stop()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
  # --- Helper Functions ---
104
 
105
- def analyze_image_with_gemini(image_bytes: bytes) -> str:
106
  """
107
- Sends image bytes to the Gemini Vision model for analysis and returns
108
- the generated text description.
109
 
110
  Args:
 
111
  image_bytes: The image data as bytes.
112
 
113
  Returns:
114
- A string containing the analysis text, or an error/blocked message.
 
 
115
  """
116
  try:
117
  img = Image.open(io.BytesIO(image_bytes))
118
- # Define the prompt for the vision model
119
  prompt = """Analyze this medical image (e.g., pathology slide, diagram, scan).
120
- Describe the key visual features relevant to a medical context.
121
- Identify potential:
122
- - Diseases or conditions indicated
123
- - Pathological findings (e.g., cellular morphology, tissue structure, staining patterns)
124
- - Visible cell types
125
- - Relevant biomarkers (if inferable from staining or morphology)
126
- - Anatomical context (if discernible)
127
-
128
- Be concise and focus primarily on visually evident information. Avoid definitive diagnoses.
129
  """
130
- # Generate content using the model
131
- response = gemini_model.generate_content([prompt, img])
132
 
133
- # Check for blocked content or empty response
134
  if not response.parts:
 
135
  if response.prompt_feedback and response.prompt_feedback.block_reason:
136
- block_reason = response.prompt_feedback.block_reason
137
- st.warning(f"⚠️ Analysis blocked by safety settings: {block_reason}")
138
- return f"Analysis blocked due to safety settings: {block_reason}"
139
- else:
140
- st.error("❌ Gemini analysis returned no content. Response might be empty or invalid.")
141
- return "Error: Gemini analysis failed or returned no content."
142
 
143
- # Return the generated text
144
- return response.text
145
 
146
  except genai.types.BlockedPromptException as e:
147
- st.error(f"❌ Gemini request blocked due to prompt content: {e}")
148
- return f"Analysis blocked (prompt issue): {e}"
 
149
  except Exception as e:
150
- st.error(f"❌ An error occurred during Gemini analysis: {e}")
151
- return f"Error analyzing image: {e}"
152
-
153
 
154
- def query_chroma(query_text: str, n_results: int = 5) -> Optional[Dict[str, List[Any]]]:
155
- """
156
- Queries the Chroma DB collection with the given text.
157
-
158
- Args:
159
- query_text: The text to use for the similarity search.
160
- n_results: The maximum number of results to return.
161
 
162
- Returns:
163
- A dictionary containing the query results ('documents', 'metadatas',
164
- 'distances'), or None if an error occurs.
165
- """
 
 
166
  try:
167
  results = collection.query(
168
  query_texts=[query_text],
169
  n_results=n_results,
170
- include=['documents', 'metadatas', 'distances'] # Specify fields to include
171
  )
 
172
  return results
173
  except Exception as e:
174
- st.error(f"❌ Error querying Chroma DB: {e}")
 
175
  return None
176
 
177
- def add_dummy_data_to_chroma():
178
- """
179
- Adds predefined example medical text snippets and metadata to the Chroma collection.
180
- Checks if documents with the same text already exist before adding.
181
- """
182
  st.info("Attempting to add dummy data to Chroma DB...")
183
-
184
- # --- IMPORTANT ---
185
- # In a real application, this data ingestion process would involve:
186
- # 1. Parsing actual medical documents (research papers, clinical notes, textbooks).
187
- # 2. Extracting relevant text chunks (e.g., using tools like Unstructured).
188
- # 3. Extracting or associating meaningful METADATA (source, patient ID (anonymized),
189
- # image IDs linked to text, extracted entities like diseases/genes).
190
- # 4. Generating embeddings using the SAME embedding function used for querying.
191
  docs = [
192
  "Figure 1A shows adenocarcinoma of the lung, papillary subtype. Note the glandular structures and nuclear atypia. TTF-1 staining was positive.",
193
  "Pathology slide 34B demonstrates high-grade glioma (glioblastoma) with significant necrosis and microvascular proliferation. Ki-67 index was high.",
194
- "This diagram illustrates the EGFR signaling pathway and common mutation sites targeted by tyrosine kinase inhibitors in non-small cell lung cancer.",
195
- "Micrograph showing chronic gastritis with Helicobacter pylori organisms (visible with special stain, not shown here). Mild intestinal metaplasia is present.",
196
- "Slide CJD-Sample-02: Spongiform changes characteristic of prion disease are evident in the cerebral cortex. Gliosis is also noted."
197
  ]
198
  metadatas = [
199
- {"source": "Example Paper 1", "topic": "Lung Cancer Pathology", "entities": {"DISEASES": ["adenocarcinoma", "lung cancer"], "PATHOLOGY_FINDINGS": ["glandular structures", "nuclear atypia", "papillary subtype"], "BIOMARKERS": ["TTF-1"]}, "IMAGE_ID": "fig_1a_adeno_lung.png"},
200
- {"source": "Path Report 789", "topic": "Brain Tumor Pathology", "entities": {"DISEASES": ["high-grade glioma", "glioblastoma"], "PATHOLOGY_FINDINGS": ["necrosis", "microvascular proliferation"], "BIOMARKERS": ["Ki-67"]}, "IMAGE_ID": "slide_34b_gbm.tiff"},
201
- {"source": "Textbook Chapter 5", "topic": "Molecular Oncology Pathways", "entities": {"GENES": ["EGFR"], "DRUGS": ["tyrosine kinase inhibitors"], "DISEASES": ["non-small cell lung cancer"]}, "IMAGE_ID": "diagram_egfr_pathway.svg"},
202
- {"source": "Path Report 101", "topic": "Gastrointestinal Pathology", "entities": {"DISEASES": ["chronic gastritis", "Helicobacter pylori infection"], "PATHOLOGY_FINDINGS": ["intestinal metaplasia"]}, "IMAGE_ID": "micrograph_h_pylori_gastritis.jpg"},
203
- {"source": "Case Study CJD", "topic": "Neuropathology", "entities": {"DISEASES": ["prion disease"], "PATHOLOGY_FINDINGS": ["Spongiform changes", "Gliosis"], "ANATOMICAL_LOCATIONS": ["cerebral cortex"]}, "IMAGE_ID": "slide_cjd_sample_02.jpg"}
204
  ]
205
- # Generate unique IDs using timestamp + index to minimize collision chance in demo
 
206
  ids = [f"doc_{int(time.time())}_{i}" for i in range(len(docs))]
207
 
208
  try:
209
- # Check if documents with these exact texts already exist to avoid duplicates
210
- existing_docs = collection.get(where={"$or": [{"document": doc} for doc in docs]}, include=[]) # Don't need full data, just check existence
211
  if not existing_docs or not existing_docs.get('ids'):
212
  collection.add(
213
  documents=docs,
214
  metadatas=metadatas,
215
  ids=ids
216
  )
 
217
  st.success(f"βœ… Added {len(docs)} dummy documents to Chroma collection '{COLLECTION_NAME}'.")
218
  else:
219
- st.warning("⚠️ Dummy data (based on document text) seems to already exist in the collection. No new data added.")
 
220
 
221
  except Exception as e:
222
- st.error(f"❌ Error adding dummy data to Chroma: {e}")
223
-
224
 
225
  # --- Streamlit UI ---
226
- st.set_page_config(layout="wide", page_title="Medical Image Analysis & RAG")
 
227
  st.title("βš•οΈ Medical Image Analysis & RAG")
228
  st.markdown("""
229
- Upload a medical image (e.g., pathology slide, diagram).
230
- Google Gemini Vision will analyze it, and Chroma DB will retrieve related text snippets
231
- from a simulated knowledge base based on the analysis.
 
 
 
 
 
 
232
  """)
233
 
234
- # Sidebar for Controls
 
 
 
 
 
 
 
 
 
 
235
  with st.sidebar:
236
  st.header("βš™οΈ Controls")
237
  uploaded_file = st.file_uploader(
238
- "Choose an image...",
239
  type=["jpg", "jpeg", "png", "tiff", "webp"],
240
- help="Upload a medical image file."
241
  )
242
 
243
- st.divider() # Visual separator
244
-
245
- if st.button("βž• Load Dummy KB Data", help="Add example text data to the Chroma vector database."):
246
- add_dummy_data_to_chroma()
247
-
248
  st.divider()
249
 
 
 
 
 
 
 
 
250
  st.info(f"""
251
- ℹ️ **Note:**
252
- - Chroma data is stored in the '{CHROMA_PATH}' folder within the app's environment.
253
- - This data persists across runs but **will be lost** if the hosting environment (e.g., Streamlit Cloud, Hugging Face Space) is reset or its storage is cleared.
254
- - Ensure the Google API Key is set in Streamlit Secrets.
 
255
  """)
 
256
 
257
 
258
- # Main Display Area
259
- col1, col2 = st.columns(2) # Create two columns for layout
260
 
261
  with col1:
262
  st.subheader("πŸ–ΌοΈ Uploaded Image")
263
  if uploaded_file is not None:
264
- # Read image bytes from the uploaded file
265
  image_bytes = uploaded_file.getvalue()
266
- # Display the uploaded image
267
  st.image(image_bytes, caption=f"Uploaded: {uploaded_file.name}", use_column_width=True)
268
  else:
269
- st.info("Upload an image using the sidebar to begin.")
270
 
271
  with col2:
272
- st.subheader("πŸ”¬ Gemini Vision Analysis")
273
- if uploaded_file is not None:
274
- # Analyze image with Gemini when an image is uploaded
275
- with st.spinner("🧠 Analyzing image with Gemini Vision... This may take a moment."):
276
- analysis_text = analyze_image_with_gemini(image_bytes)
277
-
278
- # Display analysis or error message
279
- if analysis_text.startswith("Error:") or analysis_text.startswith("Analysis blocked"):
280
- # Errors/blocks are already logged via st.error/st.warning in the helper function
281
- st.markdown(f"**Analysis Status:** {analysis_text}") # Show status message
282
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  st.markdown(analysis_text)
284
-
285
- st.markdown("---") # Separator before RAG results
286
- st.subheader("πŸ“š Related Information (RAG via Chroma DB)")
287
-
288
- # Query Chroma DB using the Gemini analysis text
289
- with st.spinner("πŸ” Searching knowledge base..."):
290
- chroma_results = query_chroma(analysis_text, n_results=3) # Fetch top 3 results
291
-
292
- if chroma_results and chroma_results.get('documents') and chroma_results['documents'][0]:
293
- num_results = len(chroma_results['documents'][0])
294
- st.success(f"βœ… Found {num_results} related entries in the knowledge base:")
295
-
296
- for i in range(num_results):
297
- doc = chroma_results['documents'][0][i]
298
- meta = chroma_results['metadatas'][0][i]
299
- dist = chroma_results['distances'][0][i]
300
-
301
- expander_title = f"Result {i+1} (Similarity Score: {1-dist:.4f}) - Source: {meta.get('source', 'N/A')}"
302
- with st.expander(expander_title):
303
- st.markdown("**Retrieved Text:**")
304
- st.markdown(f"> {doc}") # Use blockquote for text
305
- st.markdown("**Metadata:**")
306
- st.json(meta) # Display metadata nicely formatted
307
-
308
- # Highlight if the retrieved text references another image/asset
309
- if meta.get("IMAGE_ID"):
310
- st.info(f"ℹ️ This text chunk is associated with visual asset: `{meta['IMAGE_ID']}`")
311
- # In a more complex app, you could add logic here to fetch/display this related image if available.
312
-
313
- elif chroma_results is not None: # Query ran successfully but found nothing
314
- st.warning("⚠️ No relevant information found in the knowledge base matching the image analysis.")
315
- # Else case (chroma_results is None) implies an error occurred, handled by st.error in query_chroma
316
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  else:
318
- st.info("Analysis will appear here once an image is uploaded.")
 
319
 
320
 
 
321
  st.markdown("---")
322
- st.markdown("<div style='text-align: center;'>Powered by Google Gemini, Chroma DB, and Streamlit</div>", unsafe_allow_html=True)
 
1
+ # -*- coding: utf-8 -*-
2
  """
3
  Streamlit application for Medical Image Analysis using Google Gemini Vision
4
  and Retrieval-Augmented Generation (RAG) with Chroma DB.
5
 
6
+ Optimized for deployment on Hugging Face Spaces.
 
 
 
 
 
7
  """
8
 
9
  # --- Imports ---
 
11
  import google.generativeai as genai
12
  import chromadb
13
  from chromadb.utils import embedding_functions
14
+ from chromadb.api.types import EmbeddingFunction # For type hinting
15
  from PIL import Image
16
  import io
17
+ import time
18
+ import logging
19
+ from typing import Optional, Dict, List, Any, Tuple
20
+
21
+ # --- Basic Logging Setup ---
22
# Configure the root logger once at import time: INFO level, with each record
# rendered as "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by the initialization and helper functions below.
logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
24
 
25
+ # --- Configuration Constants ---
26
+ # Model and API Configuration
27
+ GOOGLE_API_KEY_SECRET = "GOOGLE_API_KEY" # Name of the HF Secret
28
+ VISION_MODEL_NAME = "gemini-pro-vision"
29
  GENERATION_CONFIG = {
30
  "temperature": 0.2,
31
  "top_p": 0.95,
32
  "top_k": 40,
33
  "max_output_tokens": 1024,
34
  }
 
 
 
35
  SAFETY_SETTINGS = [
36
  {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
37
  {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
 
39
  {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
40
  ]
41
 
42
# --- Chroma DB configuration ---
# CHROMA_PATH is a relative directory handed to chromadb.PersistentClient.
# NOTE: on a Hugging Face Space the data only survives restarts when persistent
# storage is enabled for the Space.
CHROMA_PATH = "chroma_data_hf"
# Name of the collection that is loaded or created inside that directory.
COLLECTION_NAME = "medical_docs_v2"
# Embedding model used for both indexing and querying; the two MUST use the
# same model, otherwise stored vectors and query vectors are not comparable.
# 'all-MiniLM-L6-v2' is a general-purpose sentence-embedding model. For better
# medical relevance, consider a model fine-tuned on biomedical text, e.g.
# (may require installing `sentence-transformers` explicitly):
#   - 'sentence-transformers/all-MiniLM-L6-v2' (default, general purpose)
#   - 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext' (usually needs an adapter)
#   - 'emilyalsentzer/Bio_ClinicalBERT' (usually needs an adapter)
# See the Sentence Transformers documentation for loading Hugging Face models directly.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # Or specify a different HF model name
# Stored as the collection's "hnsw:space" metadata when the collection is created.
CHROMA_DISTANCE_FUNCTION = "cosine"  # Use cosine similarity

# --- UI configuration ---
# Number of knowledge-base hits requested from Chroma for each analysis.
MAX_RAG_RESULTS = 3  # Number of results to fetch from Chroma
60
+
61
+ # --- Initialization Functions with Caching ---
62
+
63
@st.cache_resource
def configure_google_ai() -> bool:
    """Load the Google API key from the app secrets and pass it to the SDK.

    Returns:
        True when ``genai.configure`` completed without raising. False when the
        secret named by ``GOOGLE_API_KEY_SECRET`` is missing or configuration
        failed; in both failure cases the problem is shown with ``st.error``
        and written to the module logger.
    """
    is_configured = False
    try:
        api_key = st.secrets[GOOGLE_API_KEY_SECRET]
        genai.configure(api_key=api_key)
        logger.info("Google AI SDK configured successfully.")
        is_configured = True
    except KeyError:
        # The secret itself is absent.
        st.error(f"❌ **Error:** '{GOOGLE_API_KEY_SECRET}' not found in Hugging Face Secrets.")
        logger.error(f"Secret '{GOOGLE_API_KEY_SECRET}' not found.")
    except Exception as e:
        st.error(f"❌ **Error:** Failed to configure Google AI SDK: {e}")
        logger.error(f"Error configuring Google AI SDK: {e}", exc_info=True)
    return is_configured
79
+
80
@st.cache_resource
def get_gemini_model() -> Optional[genai.GenerativeModel]:
    """Create the Gemini model instance used for image analysis.

    Returns:
        A ``genai.GenerativeModel`` built from ``VISION_MODEL_NAME``,
        ``GENERATION_CONFIG`` and ``SAFETY_SETTINGS``, or None when the SDK
        could not be configured or the constructor raised.
    """
    if configure_google_ai():
        model_options = {
            "model_name": VISION_MODEL_NAME,
            "generation_config": GENERATION_CONFIG,
            "safety_settings": SAFETY_SETTINGS,
        }
        try:
            vision_model = genai.GenerativeModel(**model_options)
            logger.info(f"Gemini Model '{VISION_MODEL_NAME}' initialized.")
            return vision_model
        except Exception as e:
            st.error(f"❌ **Error:** Failed to initialize Gemini Model ({VISION_MODEL_NAME}): {e}")
            logger.error(f"Error initializing Gemini Model: {e}", exc_info=True)
    # Either the SDK configuration failed (already reported there) or the
    # constructor raised (reported above).
    return None
97
+
98
@st.cache_resource
def get_embedding_function() -> Optional[EmbeddingFunction]:
    """Create the embedding function selected by ``EMBEDDING_MODEL_NAME``.

    Chroma's ``DefaultEmbeddingFunction`` takes no ``model_name`` argument, so
    passing one raises ``TypeError`` and this function would always return
    None. The default is therefore constructed without arguments when the
    configured name is Chroma's built-in model, and any other name is routed to
    ``SentenceTransformerEmbeddingFunction``, which does accept ``model_name``.

    Returns:
        The embedding function, or None if construction failed (the error is
        shown with ``st.error`` and logged).
    """
    # Names that refer to the model Chroma's default embedding function serves.
    builtin_default_names = (
        "all-MiniLM-L6-v2",
        "sentence-transformers/all-MiniLM-L6-v2",
    )
    try:
        if EMBEDDING_MODEL_NAME in builtin_default_names:
            ef = embedding_functions.DefaultEmbeddingFunction()
        else:
            # Requires the `sentence-transformers` package to be installed.
            ef = embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name=EMBEDDING_MODEL_NAME
            )
        logger.info(f"Initialized embedding function with model: {EMBEDDING_MODEL_NAME}")
        return ef
    except Exception as e:
        st.error(f"❌ **Error:** Failed to initialize embedding function ({EMBEDDING_MODEL_NAME}): {e}")
        logger.error(f"Error initializing embedding function: {e}", exc_info=True)
        return None
111
+
112
@st.cache_resource
def get_chroma_collection() -> Optional[chromadb.Collection]:
    """Open the persistent Chroma client and load or create the collection.

    Returns:
        The collection named ``COLLECTION_NAME`` stored under ``CHROMA_PATH``,
        or None when the embedding function is unavailable or Chroma raised.
    """
    embedder = get_embedding_function()
    if not embedder:
        # The failure was already reported by get_embedding_function().
        return None

    collection_metadata = {"hnsw:space": CHROMA_DISTANCE_FUNCTION}
    try:
        client = chromadb.PersistentClient(path=CHROMA_PATH)
        logger.info(f"ChromaDB client initialized with path: {CHROMA_PATH}")

        kb_collection = client.get_or_create_collection(
            name=COLLECTION_NAME,
            embedding_function=embedder,
            metadata=collection_metadata,
        )
        logger.info(f"ChromaDB collection '{COLLECTION_NAME}' loaded/created.")
        return kb_collection
    except Exception as e:
        st.error(f"❌ **Error:** Failed to initialize Chroma DB collection '{COLLECTION_NAME}': {e}")
        st.info(f"ℹ️ Attempted path: '{CHROMA_PATH}'. Ensure write permissions and space.")
        logger.error(f"Error initializing Chroma DB: {e}", exc_info=True)
        return None
135
 
136
  # --- Helper Functions ---
137
 
138
def analyze_image_with_gemini(gemini_model: genai.GenerativeModel, image_bytes: bytes) -> Tuple[Optional[str], bool]:
    """
    Ask the Gemini vision model to describe an image.

    Args:
        gemini_model: The initialized Gemini model instance.
        image_bytes: The image data as bytes.

    Returns:
        A ``(text, ok)`` tuple: the analysis text and True on success, or
        ``(None, False)`` when the response had no parts, the prompt was
        blocked, or any other exception occurred. Failures are logged and
        shown in the Streamlit UI.
    """
    failed: Tuple[Optional[str], bool] = (None, False)
    try:
        img = Image.open(io.BytesIO(image_bytes))
        prompt = """Analyze this medical image (e.g., pathology slide, diagram, scan).
        Describe key visual features relevant for medical context (structures, cells, staining, anomalies).
        Identify potential findings:
        - Possible conditions or disease indicators
        - Pathological features (morphology, patterns)
        - Visible cell types or tissue structures
        - Relevant biomarkers (if suggested by visuals)
        - Anatomical context (if clear)

        Focus on visual evidence. Be concise. Avoid definitive diagnosis. State uncertainties clearly.
        """
        response = gemini_model.generate_content([prompt, img], stream=False)  # Non-streaming keeps handling simple
        response.resolve()  # Kept from the original; only matters if streaming were enabled

        if response.parts:
            logger.info("Gemini analysis successful.")
            return response.text, True

        # No parts: report the block reason when the API supplied one.
        reason = "Unknown reason"
        if response.prompt_feedback and response.prompt_feedback.block_reason:
            reason = response.prompt_feedback.block_reason.name  # Enum member name
        logger.warning(f"Gemini analysis blocked or empty. Reason: {reason}")
        st.warning(f"⚠️ Analysis blocked by safety filters or returned empty. Reason: {reason}")
        return failed

    except genai.types.BlockedPromptException as e:
        logger.error(f"Gemini analysis blocked due to prompt: {e}")
        st.error(f"❌ **Analysis Blocked:** The prompt content triggered safety filters: {e}")
        return failed
    except Exception as e:
        logger.error(f"Error during Gemini analysis: {e}", exc_info=True)
        st.error(f"❌ **Error:** An unexpected error occurred during Gemini analysis: {e}")
        return failed
186
 
 
 
 
 
 
 
 
187
 
188
def query_chroma(collection: chromadb.Collection, query_text: str, n_results: int = 3) -> Optional[Dict[str, List[Any]]]:
    """Run a similarity query against the Chroma collection.

    Args:
        collection: The Chroma collection to search.
        query_text: Text to search with; an empty value is rejected.
        n_results: Maximum number of hits to request.

    Returns:
        The raw result of ``collection.query`` (documents, metadatas and
        distances included), or None when ``query_text`` is empty or the
        query raised.
    """
    if not query_text:
        logger.warning("Chroma query attempted with empty text.")
        st.warning("⚠️ Cannot query knowledge base without analysis text.")
        return None

    requested_fields = ['documents', 'metadatas', 'distances']
    try:
        hits = collection.query(
            query_texts=[query_text],
            n_results=n_results,
            include=requested_fields,
        )
        # Log only the first 50 characters of the query.
        preview = query_text[:50]
        logger.info(f"ChromaDB query executed successfully for text: '{preview}...'")
        return hits
    except Exception as e:
        logger.error(f"Error querying Chroma DB: {e}", exc_info=True)
        st.error(f"❌ **Error:** Failed to query the knowledge base: {e}")
        return None
206
 
207
+ # Function to add dummy data (Consider moving to a separate setup script for cleaner app code)
208
def add_dummy_data_to_chroma(collection: chromadb.Collection):
    """Add the predefined example snippets to the Chroma collection, once.

    The previous duplicate check filtered ``where`` on a metadata key named
    "document", which is never stored, so it could not match; combined with
    time-based IDs, every call inserted another copy of the same snippets.
    This version gives each snippet a fixed ID, looks those IDs up, and inserts
    only the ones that are missing, so repeated calls are harmless.

    Args:
        collection: The Chroma collection that receives the demo documents.

    Returns:
        None. Progress and errors are reported through Streamlit and the logger.
    """
    st.info("Attempting to add dummy data to Chroma DB...")
    docs = [
        "Figure 1A shows adenocarcinoma of the lung, papillary subtype. Note the glandular structures and nuclear atypia. TTF-1 staining was positive.",
        "Pathology slide 34B demonstrates high-grade glioma (glioblastoma) with significant necrosis and microvascular proliferation. Ki-67 index was high.",
        "Diagram: EGFR signaling pathway mutations in NSCLC targeted by TKIs.",
        "Micrograph: Chronic gastritis with H. pylori organisms (special stain needed). Mild intestinal metaplasia noted.",
        "Slide CJD-02: Spongiform changes in cerebral cortex characteristic of prion disease. Gliosis present.",
    ]
    metadatas = [
        {"source": "Example Paper 1", "topic": "Lung Cancer Pathology", "entities": "adenocarcinoma, lung cancer, glandular structures, nuclear atypia, papillary subtype, TTF-1", "IMAGE_ID": "fig_1a_adeno_lung.png"},
        {"source": "Path Report 789", "topic": "Brain Tumor Pathology", "entities": "high-grade glioma, glioblastoma, necrosis, microvascular proliferation, Ki-67", "IMAGE_ID": "slide_34b_gbm.tiff"},
        {"source": "Textbook Chapter 5", "topic": "Molecular Oncology", "entities": "EGFR, TKIs, NSCLC, signaling pathway", "IMAGE_ID": "diagram_egfr_pathway.svg"},
        {"source": "Path Report 101", "topic": "Gastrointestinal Pathology", "entities": "chronic gastritis, Helicobacter pylori, intestinal metaplasia", "IMAGE_ID": "micrograph_h_pylori_gastritis.jpg"},
        {"source": "Case Study CJD", "topic": "Neuropathology", "entities": "prion disease, Spongiform changes, Gliosis, cerebral cortex", "IMAGE_ID": "slide_cjd_sample_02.jpg"},
    ]
    # Fixed IDs make the existence check below reliable.
    # NOTE: copies inserted earlier under time-based IDs are not detected here.
    ids = [f"dummy_doc_{position}" for position in range(1, len(docs) + 1)]

    try:
        # Only the IDs are needed, so no documents or embeddings are fetched.
        existing = collection.get(ids=ids, include=[])
        already_present = set((existing or {}).get('ids') or [])
        missing = [
            (doc_id, doc, meta)
            for doc_id, doc, meta in zip(ids, docs, metadatas)
            if doc_id not in already_present
        ]

        if not missing:
            logger.warning("Dummy data check indicates data might already exist. Skipping addition.")
            st.warning("⚠️ Dummy data seems to already exist in the collection. No new data added.")
            return

        collection.add(
            documents=[doc for _, doc, _ in missing],
            metadatas=[meta for _, _, meta in missing],
            ids=[doc_id for doc_id, _, _ in missing],
        )
        logger.info(f"Added {len(missing)} dummy documents to Chroma collection '{COLLECTION_NAME}'.")
        st.success(f"βœ… Added {len(missing)} dummy documents to Chroma collection '{COLLECTION_NAME}'.")

    except Exception as e:
        logger.error(f"Error adding dummy data to Chroma: {e}", exc_info=True)
        st.error(f"❌ **Error:** Could not add dummy data to Chroma: {e}")
248
 
249
  # --- Streamlit UI ---
250
# Page configuration is issued before any other Streamlit output.
page_settings = {
    "layout": "wide",
    "page_title": "Medical Image RAG - HF",
    "page_icon": "βš•οΈ",
}
st.set_page_config(**page_settings)

st.title("βš•οΈ Medical Image Analysis & RAG")
tagline_markdown = """
*Powered by Google Gemini, ChromaDB, and Streamlit on Hugging Face Spaces*
"""
st.markdown(tagline_markdown)

# --- CRITICAL DISCLAIMER ---
# Shown on every run, above all analysis output.
disclaimer_markdown = """
**⚠️ Disclaimer:** This tool is for informational and illustrative purposes ONLY.
It is **NOT** a medical device and **CANNOT** provide a diagnosis. AI analysis may be
imperfect or incomplete. **ALWAYS** consult qualified medical professionals for any
health concerns or decisions. Do **NOT** rely solely on this tool for medical judgment.
"""
st.warning(disclaimer_markdown)
264
 
265
+ # --- Initialize Services ---
266
# Both factories are cached resources and return None on failure.
gemini_model = get_gemini_model()
chroma_collection = get_chroma_collection()

# Nothing below can work without both services, so halt this script run here.
if not (gemini_model and chroma_collection):
    st.error("❌ Critical components failed to initialize. Cannot proceed. Check logs and secrets.")
    st.stop()  # Ends this script run
273
+
274
+
275
+ # --- Sidebar Controls ---
276
with st.sidebar:
    st.header("βš™οΈ Controls")
    accepted_image_types = ["jpg", "jpeg", "png", "tiff", "webp"]
    uploaded_file = st.file_uploader(
        "1. Upload Medical Image",
        type=accepted_image_types,
        help="Upload formats like pathology slides, diagrams, scans.",
    )

    st.divider()

    st.header("πŸ“š Knowledge Base")
    add_clicked = st.button(
        "βž• Add Dummy KB Data",
        help="Add example text data to the Chroma vector database for demonstration.",
    )
    if add_clicked:
        if chroma_collection:
            add_dummy_data_to_chroma(chroma_collection)
        else:
            st.error("❌ Chroma DB not available to add data.")

    # Summary of the knowledge-base settings currently in effect.
    kb_summary = f"""
    **KB Info:**
    - **Collection:** `{COLLECTION_NAME}`
    - **Storage:** `{CHROMA_PATH}` (in Space storage)
    - **Embeddings:** `{EMBEDDING_MODEL_NAME}`
    - **Similarity:** `{CHROMA_DISTANCE_FUNCTION}`
    """
    st.info(kb_summary)
    st.caption("Note: Data persists if persistent storage is enabled for this Space, otherwise it's temporary.")
301
 
302
 
303
+ # --- Main Processing Area ---
304
# Two-column layout: image preview on the left, analysis on the right.
col1, col2 = st.columns(2)

with col1:
    st.subheader("πŸ–ΌοΈ Uploaded Image")
    if uploaded_file is None:
        st.info("Upload an image using the sidebar to begin analysis.")
    else:
        # Raw bytes are kept in `image_bytes` because the analysis column reuses them.
        image_bytes = uploaded_file.getvalue()
        preview_caption = f"Uploaded: {uploaded_file.name}"
        st.image(image_bytes, caption=preview_caption, use_column_width=True)
313
 
314
with col2:
    st.subheader("πŸ€– AI Analysis & Retrieval")
    if uploaded_file is not None and gemini_model and chroma_collection:
        analysis_text = None
        analysis_successful = False

        # Step 1: Analyze the image with Gemini.
        with st.status("🧠 Analyzing image with Gemini Vision...", expanded=False) as status_analysis:
            try:
                st.write("Sending image to Gemini...")
                analysis_text, analysis_successful = analyze_image_with_gemini(gemini_model, image_bytes)
                if analysis_successful:
                    st.write("Analysis complete.")
                    status_analysis.update(label="βœ… Analysis Complete", state="complete")
                else:
                    # The helper has already shown the error/block message.
                    status_analysis.update(label="⚠️ Analysis Failed or Blocked", state="error")

            except Exception as e:  # Last-resort guard around the whole step
                logger.error(f"Unhandled error during analysis status block: {e}", exc_info=True)
                st.error(f"❌ An unexpected error occurred during the analysis process: {e}")
                status_analysis.update(label="πŸ’₯ Analysis Error", state="error")
                analysis_successful = False  # Ensure flag is False

        if analysis_successful and analysis_text:
            st.markdown("**πŸ”¬ Gemini Vision Analysis:**")
            st.markdown(analysis_text)
            st.divider()

            # Step 2: Query Chroma with the analysis text.
            st.markdown("**πŸ“š Related Information (RAG via Chroma DB):**")
            rag_hits = []  # (document, metadata, distance) triples to render
            with st.status("πŸ” Searching knowledge base...", expanded=True) as status_query:
                try:
                    st.write(f"Querying with analysis summary (top {MAX_RAG_RESULTS} results)...")
                    chroma_results = query_chroma(chroma_collection, analysis_text, n_results=MAX_RAG_RESULTS)

                    if chroma_results and chroma_results.get('documents') and chroma_results['documents'][0]:
                        # Chroma returns one list per query text; index 0 is our single query.
                        rag_hits = list(zip(
                            chroma_results['documents'][0],
                            chroma_results['metadatas'][0],
                            chroma_results['distances'][0],
                        ))
                        num_results = len(rag_hits)
                        st.write(f"Found {num_results} related entries.")
                        status_query.update(label=f"βœ… Found {num_results} results", state="complete")

                    elif chroma_results is not None:  # Query ran, no results
                        st.warning("⚠️ No relevant information found in the knowledge base for this analysis.")
                        # st.status accepts only "running", "complete" or "error";
                        # the former "warning" raised and surfaced as a query error.
                        status_query.update(label="⚠️ No results found", state="complete")
                    else:  # Query failed (error already shown by query_chroma)
                        status_query.update(label="πŸ’₯ Query Error", state="error")

                except Exception as e:
                    logger.error(f"Unhandled error during query status block: {e}", exc_info=True)
                    st.error(f"❌ An unexpected error occurred during the knowledge base search: {e}")
                    status_query.update(label="πŸ’₯ Query Process Error", state="error")

            # Results are rendered after the status container rather than inside it.
            # NOTE(review): Streamlit has rejected expanders nested inside other
            # expandable containers in some versions; confirm against the deployed version.
            for rank, (doc, meta, dist) in enumerate(rag_hits, start=1):
                meta = meta or {}  # Chroma yields None for entries stored without metadata
                similarity = 1.0 - dist  # For cosine distance

                expander_title = f"Result {rank} (Similarity: {similarity:.3f}) - Source: {meta.get('source', 'N/A')}"
                with st.expander(expander_title):
                    st.markdown("**Retrieved Text:**")
                    st.markdown(f"> {doc}")
                    st.markdown("**Metadata:**")
                    meta_display = {k: v for k, v in meta.items() if v}  # Drop empty values
                    st.json(meta_display, expanded=False)

                    # Point to the related visual asset when one is recorded.
                    if meta.get("IMAGE_ID"):
                        st.info(f"ℹ️ Associated Visual: `{meta['IMAGE_ID']}`")

        elif not analysis_successful:
            st.info("Cannot proceed to knowledge base search as image analysis failed or was blocked.")

    elif not uploaded_file:
        st.info("Analysis results and related information will appear here once an image is uploaded and processed.")
    else:
        # Initialization failed earlier; its message has already been shown.
        st.info("Waiting for components to initialize...")
395
 
396
 
397
+ # --- Footer ---
398
# Closing rule plus a reminder that output must be professionally verified.
footer_note = "Ensure responsible use. Verify all findings with qualified professionals."
st.markdown("---")
st.caption(footer_note)