mgbam committed (verified) · Commit ed31030 · Parent: 228cbf8

Update app.py

Files changed (1):
  1. app.py +254 -268
app.py CHANGED
@@ -1,9 +1,16 @@
1
- # -*- coding: utf-8 -*-
2
  """
3
  Streamlit application for Medical Image Analysis using Google Gemini Vision
4
- and Retrieval-Augmented Generation (RAG) with Chroma DB.
5
-
6
- Optimized for deployment on Hugging Face Spaces.
7
  """
8
 
9
  # --- Imports ---
@@ -11,7 +18,6 @@ import streamlit as st
11
  import google.generativeai as genai
12
  import chromadb
13
  from chromadb.utils import embedding_functions
14
- from chromadb.api.types import EmbeddingFunction # For type hinting
15
  from PIL import Image
16
  import io
17
  import time
@@ -22,9 +28,24 @@ from typing import Optional, Dict, List, Any, Tuple
22
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
23
  logger = logging.getLogger(__name__)
24
 
25
- # --- Configuration Constants ---
26
- # Model and API Configuration
27
- GOOGLE_API_KEY_SECRET = "GOOGLE_API_KEY" # Name of the HF Secret
28
  VISION_MODEL_NAME = "gemini-pro-vision"
29
  GENERATION_CONFIG = {
30
  "temperature": 0.2,
@@ -38,269 +59,258 @@ SAFETY_SETTINGS = [
38
  {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
39
  {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
40
  ]
41
 
42
  # Chroma DB Configuration
43
- # Using persistent storage within the HF Space (relative path)
44
- # NOTE: Ensure your HF Space has persistent storage enabled if you need data to survive restarts.
45
- CHROMA_PATH = "chroma_data_hf"
46
- COLLECTION_NAME = "medical_docs_v2"
47
- # Embedding Function - Using Default (all-MiniLM-L6-v2).
48
- # For better medical relevance, consider models fine-tuned on biomedical text.
49
- # Examples (might require installing `sentence-transformers` explicitly):
50
- # - 'sentence-transformers/all-MiniLM-L6-v2' (Default, General Purpose)
51
- # - 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext' (Needs adapter usually)
52
- # - 'emilyalsentzer/Bio_ClinicalBERT' (Needs adapter usually)
53
- # Check Sentence Transformers documentation for loading Hugging Face models directly.
54
- # Make sure the model chosen is consistent between indexing and querying.
55
- EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" # Or specify a different HF model name
56
- CHROMA_DISTANCE_FUNCTION = "cosine" # Use cosine similarity
57
-
58
- # UI Configuration
59
- MAX_RAG_RESULTS = 3 # Number of results to fetch from Chroma
60
-
61
- # --- Initialization Functions with Caching ---
62
 
63
  @st.cache_resource
64
- def configure_google_ai() -> bool:
65
- """Configures the Google AI SDK using secrets."""
66
- try:
67
- google_api_key = st.secrets[GOOGLE_API_KEY_SECRET]
68
- genai.configure(api_key=google_api_key)
69
- logger.info("Google AI SDK configured successfully.")
70
- return True
71
- except KeyError:
72
- st.error(f"❌ **Error:** '{GOOGLE_API_KEY_SECRET}' not found in Hugging Face Secrets.")
73
- logger.error(f"Secret '{GOOGLE_API_KEY_SECRET}' not found.")
74
- return False
75
- except Exception as e:
76
- st.error(f"❌ **Error:** Failed to configure Google AI SDK: {e}")
77
- logger.error(f"Error configuring Google AI SDK: {e}", exc_info=True)
78
- return False
79
-
80
- @st.cache_resource
81
- def get_gemini_model() -> Optional[genai.GenerativeModel]:
82
  """Initializes and returns the Gemini Generative Model."""
83
- if not configure_google_ai():
84
- return None
85
  try:
 
86
  model = genai.GenerativeModel(
87
  model_name=VISION_MODEL_NAME,
88
  generation_config=GENERATION_CONFIG,
89
  safety_settings=SAFETY_SETTINGS
90
  )
91
- logger.info(f"Gemini Model '{VISION_MODEL_NAME}' initialized.")
92
  return model
93
  except Exception as e:
94
- st.error(f"❌ **Error:** Failed to initialize Gemini Model ({VISION_MODEL_NAME}): {e}")
95
- logger.error(f"Error initializing Gemini Model: {e}", exc_info=True)
 
96
  return None
97
 
98
  @st.cache_resource
99
- def get_embedding_function() -> Optional[EmbeddingFunction]:
100
- """Initializes and returns the embedding function."""
101
  try:
102
- # Using DefaultEmbeddingFunction which leverages sentence-transformers
103
- # Ensure sentence-transformers library is installed
104
- ef = embedding_functions.DefaultEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME)
105
- logger.info(f"Initialized embedding function with model: {EMBEDDING_MODEL_NAME}")
106
- return ef
107
  except Exception as e:
108
- st.error(f"❌ **Error:** Failed to initialize embedding function ({EMBEDDING_MODEL_NAME}): {e}")
109
- logger.error(f"Error initializing embedding function: {e}", exc_info=True)
110
  return None
111
 
112
  @st.cache_resource
113
- def get_chroma_collection() -> Optional[chromadb.Collection]:
114
- """Initializes ChromaDB client and returns the specified collection."""
115
- embedding_func = get_embedding_function()
116
- if not embedding_func:
117
  return None
118
-
119
  try:
120
  chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
121
- logger.info(f"ChromaDB client initialized with path: {CHROMA_PATH}")
122
-
123
  collection = chroma_client.get_or_create_collection(
124
  name=COLLECTION_NAME,
125
- embedding_function=embedding_func,
126
- metadata={"hnsw:space": CHROMA_DISTANCE_FUNCTION}
127
  )
128
- logger.info(f"ChromaDB collection '{COLLECTION_NAME}' loaded/created.")
129
  return collection
130
  except Exception as e:
131
- st.error(f"❌ **Error:** Failed to initialize Chroma DB collection '{COLLECTION_NAME}': {e}")
132
- st.info(f"ℹ️ Attempted path: '{CHROMA_PATH}'. Ensure write permissions and space.")
133
- logger.error(f"Error initializing Chroma DB: {e}", exc_info=True)
 
134
  return None
135
 
136
- # --- Helper Functions ---
137
 
138
- def analyze_image_with_gemini(gemini_model: genai.GenerativeModel, image_bytes: bytes) -> Tuple[Optional[str], bool]:
 
139
  """
140
- Analyzes image bytes with Gemini Vision.
141
-
142
- Args:
143
- gemini_model: The initialized Gemini model instance.
144
- image_bytes: The image data as bytes.
145
-
146
- Returns:
147
- A tuple containing:
148
- - The analysis text (str) or None if error/blocked.
149
- - A boolean indicating success (True) or failure/block (False).
150
  """
151
  try:
152
  img = Image.open(io.BytesIO(image_bytes))
153
- prompt = """Analyze this medical image (e.g., pathology slide, diagram, scan).
154
- Describe key visual features relevant for medical context (structures, cells, staining, anomalies).
155
- Identify potential findings:
156
- - Possible conditions or disease indicators
157
- - Pathological features (morphology, patterns)
158
- - Visible cell types or tissue structures
159
- - Relevant biomarkers (if suggested by visuals)
160
- - Anatomical context (if clear)
161
-
162
- Focus on visual evidence. Be concise. Avoid definitive diagnosis. State uncertainties clearly.
163
- """
164
- response = gemini_model.generate_content([prompt, img], stream=False) # Use stream=False for simpler handling here
165
- response.resolve() # Ensure response is fully processed if stream=True was used
166
 
167
  if not response.parts:
168
- reason = "Unknown reason"
169
  if response.prompt_feedback and response.prompt_feedback.block_reason:
170
- reason = response.prompt_feedback.block_reason.name # Get the reason enum name
171
- logger.warning(f"Gemini analysis blocked or empty. Reason: {reason}")
172
- st.warning(f"⚠️ Analysis blocked by safety filters or returned empty. Reason: {reason}")
173
- return None, False
174
-
175
  logger.info("Gemini analysis successful.")
176
- return response.text, True
177
 
178
  except genai.types.BlockedPromptException as e:
179
- logger.error(f"Gemini analysis blocked due to prompt: {e}")
180
- st.error(f"❌ **Analysis Blocked:** The prompt content triggered safety filters: {e}")
181
- return None, False
182
  except Exception as e:
183
- logger.error(f"Error during Gemini analysis: {e}", exc_info=True)
184
- st.error(f"❌ **Error:** An unexpected error occurred during Gemini analysis: {e}")
185
- return None, False
186
-
187
-
188
- def query_chroma(collection: chromadb.Collection, query_text: str, n_results: int = 3) -> Optional[Dict[str, List[Any]]]:
189
- """Queries the Chroma collection."""
 
 
190
  if not query_text:
191
- logger.warning("Chroma query attempted with empty text.")
192
- st.warning("⚠️ Cannot query knowledge base without analysis text.")
193
  return None
194
  try:
195
- results = collection.query(
196
- query_texts=[query_text],
197
  n_results=n_results,
198
  include=['documents', 'metadatas', 'distances']
199
  )
200
- logger.info(f"ChromaDB query executed successfully for text: '{query_text[:50]}...'")
201
  return results
202
  except Exception as e:
203
- logger.error(f"Error querying Chroma DB: {e}", exc_info=True)
204
- st.error(f"❌ **Error:** Failed to query the knowledge base: {e}")
 
205
  return None
206
 
207
- # Function to add dummy data (Consider moving to a separate setup script for cleaner app code)
208
- def add_dummy_data_to_chroma(collection: chromadb.Collection):
209
- """Adds predefined example medical text snippets to the Chroma collection."""
210
- st.info("Attempting to add dummy data to Chroma DB...")
211
- # --- (Same dummy data as before - Keep for demonstration) ---
212
- docs = [
213
- "Figure 1A shows adenocarcinoma of the lung, papillary subtype. Note the glandular structures and nuclear atypia. TTF-1 staining was positive.",
214
- "Pathology slide 34B demonstrates high-grade glioma (glioblastoma) with significant necrosis and microvascular proliferation. Ki-67 index was high.",
215
- "Diagram: EGFR signaling pathway mutations in NSCLC targeted by TKIs.", # Shorter version
216
- "Micrograph: Chronic gastritis with H. pylori organisms (special stain needed). Mild intestinal metaplasia noted.", # Shorter
217
- "Slide CJD-02: Spongiform changes in cerebral cortex characteristic of prion disease. Gliosis present." # Shorter
218
- ]
219
- metadatas = [
220
- {"source": "Example Paper 1", "topic": "Lung Cancer Pathology", "entities": "adenocarcinoma, lung cancer, glandular structures, nuclear atypia, papillary subtype, TTF-1", "IMAGE_ID": "fig_1a_adeno_lung.png"},
221
- {"source": "Path Report 789", "topic": "Brain Tumor Pathology", "entities": "high-grade glioma, glioblastoma, necrosis, microvascular proliferation, Ki-67", "IMAGE_ID": "slide_34b_gbm.tiff"},
222
- {"source": "Textbook Chapter 5", "topic": "Molecular Oncology", "entities": "EGFR, TKIs, NSCLC, signaling pathway", "IMAGE_ID": "diagram_egfr_pathway.svg"},
223
- {"source": "Path Report 101", "topic": "Gastrointestinal Pathology", "entities": "chronic gastritis, Helicobacter pylori, intestinal metaplasia", "IMAGE_ID": "micrograph_h_pylori_gastritis.jpg"},
224
- {"source": "Case Study CJD", "topic": "Neuropathology", "entities": "prion disease, Spongiform changes, Gliosis, cerebral cortex", "IMAGE_ID": "slide_cjd_sample_02.jpg"}
225
- ]
226
- # Generate potentially more stable IDs for demo purposes if needed, but time-based is fine too
227
- # Example: ids = [f"dummy_doc_{i+1}" for i in range(len(docs))]
228
- ids = [f"doc_{int(time.time())}_{i}" for i in range(len(docs))]
229
 
 
230
  try:
231
- # Simple check if *any* of these specific texts exist (for demo)
232
  existing_docs = collection.get(where={"$or": [{"document": doc} for doc in docs]}, include=[])
233
  if not existing_docs or not existing_docs.get('ids'):
234
  collection.add(
235
  documents=docs,
236
  metadatas=metadatas,
237
  ids=ids
238
  )
239
- logger.info(f"Added {len(docs)} dummy documents to Chroma collection '{COLLECTION_NAME}'.")
240
- st.success(f"✅ Added {len(docs)} dummy documents to Chroma collection '{COLLECTION_NAME}'.")
241
  else:
242
- logger.warning("Dummy data check indicates data might already exist. Skipping addition.")
243
- st.warning("⚠️ Dummy data seems to already exist in the collection. No new data added.")
244
 
245
  except Exception as e:
246
- logger.error(f"Error adding dummy data to Chroma: {e}", exc_info=True)
247
- st.error(f"❌ **Error:** Could not add dummy data to Chroma: {e}")
 
248
 
249
- # --- Streamlit UI ---
250
- st.set_page_config(layout="wide", page_title="Medical Image RAG - HF", page_icon="⚕️")
251
 
252
- st.title("⚕️ Medical Image Analysis & RAG")
253
- st.markdown("""
254
- *Powered by Google Gemini, ChromaDB, and Streamlit on Hugging Face Spaces*
255
- """)
256
 
257
- # --- CRITICAL DISCLAIMER ---
258
  st.warning("""
259
- **⚠️ Disclaimer:** This tool is for informational and illustrative purposes ONLY.
260
- It is **NOT** a medical device and **CANNOT** provide a diagnosis. AI analysis may be
261
- imperfect or incomplete. **ALWAYS** consult qualified medical professionals for any
262
- health concerns or decisions. Do **NOT** rely solely on this tool for medical judgment.
263
  """)
264
 
265
- # --- Initialize Services ---
266
- gemini_model = get_gemini_model()
267
- chroma_collection = get_chroma_collection()
268
-
269
- # Check if critical components failed initialization
270
- if not gemini_model or not chroma_collection:
271
- st.error("❌ Critical components failed to initialize. Cannot proceed. Check logs and secrets.")
272
- st.stop() # Stop execution if core components aren't ready
273
-
274
 
275
- # --- Sidebar Controls ---
276
  with st.sidebar:
277
  st.header("⚙️ Controls")
278
  uploaded_file = st.file_uploader(
279
- "1. Upload Medical Image",
280
  type=["jpg", "jpeg", "png", "tiff", "webp"],
281
- help="Upload formats like pathology slides, diagrams, scans."
282
  )
283
 
284
  st.divider()
285
 
286
- st.header("📚 Knowledge Base")
287
- if st.button("➕ Add Dummy KB Data", help="Add example text data to the Chroma vector database for demonstration."):
288
- if chroma_collection:
289
- add_dummy_data_to_chroma(chroma_collection)
290
- else:
291
- st.error("❌ Chroma DB not available to add data.")
 
292
 
293
  st.info(f"""
294
- **KB Info:**
295
- - **Collection:** `{COLLECTION_NAME}`
296
- - **Storage:** `{CHROMA_PATH}` (in Space storage)
297
- - **Embeddings:** `{EMBEDDING_MODEL_NAME}`
298
- - **Similarity:** `{CHROMA_DISTANCE_FUNCTION}`
299
  """)
300
- st.caption("Note: Data persists if persistent storage is enabled for this Space, otherwise it's temporary.")
 
301
 
302
-
303
- # --- Main Processing Area ---
304
  col1, col2 = st.columns(2)
305
 
306
  with col1:
@@ -309,91 +319,67 @@ with col1:
309
  image_bytes = uploaded_file.getvalue()
310
  st.image(image_bytes, caption=f"Uploaded: {uploaded_file.name}", use_column_width=True)
311
  else:
312
- st.info("Upload an image using the sidebar to begin analysis.")
313
 
314
  with col2:
315
- st.subheader("🤖 AI Analysis & Retrieval")
316
- if uploaded_file is not None and gemini_model and chroma_collection:
317
- analysis_text = None
318
- analysis_successful = False
319
-
320
- # Step 1: Analyze Image with Gemini
321
- with st.status("🧠 Analyzing image with Gemini Vision...", expanded=False) as status_analysis:
322
- try:
323
- st.write("Sending image to Gemini...")
324
- analysis_text, analysis_successful = analyze_image_with_gemini(gemini_model, image_bytes)
325
- if analysis_successful:
326
- st.write("Analysis complete.")
327
- status_analysis.update(label="✅ Analysis Complete", state="complete")
328
- else:
329
- # Error/block message already shown by helper function
330
- status_analysis.update(label="⚠️ Analysis Failed or Blocked", state="error")
331
-
332
- except Exception as e: # Catch potential unexpected errors here too
333
- logger.error(f"Unhandled error during analysis status block: {e}", exc_info=True)
334
- st.error(f"❌ An unexpected error occurred during the analysis process: {e}")
335
- status_analysis.update(label="💥 Analysis Error", state="error")
336
- analysis_successful = False # Ensure flag is False
337
-
338
- # Display Analysis Result if successful
339
- if analysis_successful and analysis_text:
340
- st.markdown("**🔬 Gemini Vision Analysis:**")
341
- st.markdown(analysis_text)
342
- st.divider() # Separator
343
-
344
- # Step 2: Query Chroma DB with Analysis Text
345
- st.markdown("**📚 Related Information (RAG via Chroma DB):**")
346
- with st.status("🔍 Searching knowledge base...", expanded=True) as status_query:
347
- try:
348
- st.write(f"Querying with analysis summary (top {MAX_RAG_RESULTS} results)...")
349
- chroma_results = query_chroma(chroma_collection, analysis_text, n_results=MAX_RAG_RESULTS)
350
-
351
- if chroma_results and chroma_results.get('documents') and chroma_results['documents'][0]:
352
- num_results = len(chroma_results['documents'][0])
353
- st.write(f"Found {num_results} related entries.")
354
- status_query.update(label=f"✅ Found {num_results} results", state="complete")
355
-
356
- # Display RAG Results
357
- for i in range(num_results):
358
- doc = chroma_results['documents'][0][i]
359
- meta = chroma_results['metadatas'][0][i]
360
- dist = chroma_results['distances'][0][i]
361
- similarity = 1.0 - dist # For cosine distance
362
-
363
- expander_title = f"Result {i+1} (Similarity: {similarity:.3f}) - Source: {meta.get('source', 'N/A')}"
364
- with st.expander(expander_title):
365
- st.markdown("**Retrieved Text:**")
366
- st.markdown(f"> {doc}")
367
- st.markdown("**Metadata:**")
368
- # Nicer metadata display
369
- meta_display = {k: v for k, v in meta.items() if v} # Filter empty values
370
- st.json(meta_display, expanded=False)
371
-
372
- # Provide link/info if related image exists
373
- if meta.get("IMAGE_ID"):
374
- st.info(f"ℹ️ Associated Visual: `{meta['IMAGE_ID']}`")
375
-
376
- elif chroma_results is not None: # Query ran, no results
377
- st.warning("⚠️ No relevant information found in the knowledge base for this analysis.")
378
- status_query.update(label="⚠️ No results found", state="warning")
379
- else: # Query failed (error handled in query_chroma)
380
- status_query.update(label="💥 Query Error", state="error")
381
-
382
- except Exception as e:
383
- logger.error(f"Unhandled error during query status block: {e}", exc_info=True)
384
- st.error(f"❌ An unexpected error occurred during the knowledge base search: {e}")
385
- status_query.update(label="💥 Query Process Error", state="error")
386
-
387
- elif not analysis_successful:
388
- st.info("Cannot proceed to knowledge base search as image analysis failed or was blocked.")
389
 
390
  elif not uploaded_file:
391
- st.info("Analysis results and related information will appear here once an image is uploaded and processed.")
392
  else:
393
- # This case means initialization failed earlier, message already shown.
394
- st.info("Waiting for components to initialize...")
395
-
396
 
397
- # --- Footer ---
398
  st.markdown("---")
399
- st.caption("Ensure responsible use. Verify all findings with qualified professionals.")
 
 
 
1
+ # --- Docstring ---
2
  """
3
  Streamlit application for Medical Image Analysis using Google Gemini Vision
4
+ and Retrieval-Augmented Generation (RAG) with Chroma DB, enhanced for
5
+ Hugging Face Spaces deployment and improved practices.
6
+
7
+ Features:
8
+ - Image analysis via Google Gemini Pro Vision.
9
+ - RAG using Chroma DB with Hugging Face embeddings.
10
+ - Caching for performance.
11
+ - Basic logging.
12
+ - Improved UX and error handling.
13
+ - Explicit Disclaimer.
14
  """
15
 
16
  # --- Imports ---
 
18
  import google.generativeai as genai
19
  import chromadb
20
  from chromadb.utils import embedding_functions
 
21
  from PIL import Image
22
  import io
23
  import time
 
28
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
29
  logger = logging.getLogger(__name__)
30
 
31
+ # --- Application Configuration ---
32
+ # Secrets Management (Prioritize Hugging Face Secrets)
33
+ try:
34
+ GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
35
+ # HF_TOKEN is optional for many public models, but required for gated/private ones
36
+ HF_TOKEN = st.secrets.get("HF_TOKEN") # Use .get() for optional token
37
+ except KeyError as e:
38
+ err_msg = f"❌ Missing Secret: {e}. Please add it to your Hugging Face Space secrets."
39
+ st.error(err_msg)
40
+ logger.error(err_msg)
41
+ st.stop()
42
+ except Exception as e:
43
+ err_msg = f"❌ Error loading secrets: {e}"
44
+ st.error(err_msg)
45
+ logger.error(err_msg)
46
+ st.stop()
47
+
48
+ # Gemini Configuration
49
  VISION_MODEL_NAME = "gemini-pro-vision"
50
  GENERATION_CONFIG = {
51
  "temperature": 0.2,
 
59
  {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
60
  {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
61
  ]
62
+ GEMINI_ANALYSIS_PROMPT = """Analyze this medical image (e.g., pathology slide, diagram, scan).
63
+ Describe the key visual features relevant to a medical context.
64
+ Identify potential:
65
+ - Diseases or conditions indicated
66
+ - Pathological findings (e.g., cellular morphology, tissue structure, staining patterns)
67
+ - Visible cell types
68
+ - Relevant biomarkers (if inferable from staining or morphology)
69
+ - Anatomical context (if discernible)
70
+
71
+ Be concise and focus primarily on visually evident information. Avoid definitive diagnoses.
72
+ Structure the output clearly, perhaps using bullet points for findings.
73
+ """
74
 
75
  # Chroma DB Configuration
76
+ CHROMA_PATH = "chroma_data_hf" # Use a distinct path if needed
77
+ COLLECTION_NAME = "medical_docs_hf"
78
+ # IMPORTANT: Choose an appropriate HF embedding model. 'all-mpnet-base-v2' is general purpose.
79
+ # For better medical results, consider models like:
80
+ # - 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext' (might need more RAM/compute)
81
+ # - 'dmis-lab/sapbert-from-pubmedbert-sentencetransformer'
82
+ # - Other models tagged 'medical' or 'biomedical' on Hugging Face Hub.
83
+ # Ensure the chosen model is compatible with chromadb's HuggingFaceEmbeddingFunction.
84
+ EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" # <-- REPLACE if possible
85
+ CHROMA_DISTANCE_METRIC = "cosine"
86
+
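For deployments that would rather not call the Hugging Face Inference API for embeddings, the model named above can also be run locally. The sketch below is illustrative only (it assumes the `sentence-transformers` package is installed) and is not part of the committed code:

    # Illustrative alternative: compute embeddings in-process instead of via the HF Inference API.
    from chromadb.utils import embedding_functions

    local_embed_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="sentence-transformers/all-mpnet-base-v2"  # same model as EMBEDDING_MODEL_NAME above
    )
    # The resulting object can be passed to get_or_create_collection(embedding_function=...)
    # in place of the HuggingFaceEmbeddingFunction configured below; no HF_TOKEN is needed
    # for public models, at the cost of downloading and running the model locally.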
87
+ # --- Caching Resource Initialization ---
88
 
89
  @st.cache_resource
90
+ def initialize_gemini_model() -> Optional[genai.GenerativeModel]:
91
  """Initializes and returns the Gemini Generative Model."""
 
 
92
  try:
93
+ genai.configure(api_key=GOOGLE_API_KEY)
94
  model = genai.GenerativeModel(
95
  model_name=VISION_MODEL_NAME,
96
  generation_config=GENERATION_CONFIG,
97
  safety_settings=SAFETY_SETTINGS
98
  )
99
+ logger.info(f"Successfully initialized Gemini Model: {VISION_MODEL_NAME}")
100
  return model
101
  except Exception as e:
102
+ err_msg = f"❌ Error initializing Gemini Model ({VISION_MODEL_NAME}): {e}"
103
+ st.error(err_msg)
104
+ logger.error(err_msg, exc_info=True)
105
  return None
106
 
107
  @st.cache_resource
108
+ def initialize_embedding_function() -> Optional[embedding_functions.HuggingFaceEmbeddingFunction]:
109
+ """Initializes and returns the Hugging Face Embedding Function."""
110
  try:
111
+ # Pass HF_TOKEN if it exists (required for private/gated models)
112
+ # HF_TOKEN comes from st.secrets.get() and may be None; it is passed directly below.
113
+ embed_func = embedding_functions.HuggingFaceEmbeddingFunction(
114
+ api_key=HF_TOKEN, # Pass token here if needed by model
115
+ model_name=EMBEDDING_MODEL_NAME
116
+ )
117
+ logger.info(f"Successfully initialized HuggingFace Embedding Function: {EMBEDDING_MODEL_NAME}")
118
+ return embed_func
119
  except Exception as e:
120
+ err_msg = f"❌ Error initializing HuggingFace Embedding Function ({EMBEDDING_MODEL_NAME}): {e}"
121
+ st.error(err_msg)
122
+ logger.error(err_msg, exc_info=True)
123
+ st.info("ℹ️ Make sure the embedding model name is correct and you have network access. "
124
+ "If using a private model, ensure HF_TOKEN is set in secrets.")
125
  return None
126
 
127
  @st.cache_resource
128
+ def initialize_chroma_collection(_embedding_func: embedding_functions.EmbeddingFunction) -> Optional[chromadb.Collection]:
129
+ """Initializes the Chroma DB client and returns the collection."""
130
+ if not _embedding_func:
131
+ st.error("❌ Cannot initialize Chroma DB without a valid embedding function.")
132
  return None
 
133
  try:
134
  chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
 
 
135
  collection = chroma_client.get_or_create_collection(
136
  name=COLLECTION_NAME,
137
+ embedding_function=_embedding_func, # Pass the initialized function
138
+ metadata={"hnsw:space": CHROMA_DISTANCE_METRIC}
139
  )
140
+ logger.info(f"Chroma DB collection '{COLLECTION_NAME}' loaded/created at '{CHROMA_PATH}' using {CHROMA_DISTANCE_METRIC}.")
141
  return collection
142
  except Exception as e:
143
+ err_msg = f"❌ Error initializing Chroma DB at '{CHROMA_PATH}': {e}"
144
+ st.error(err_msg)
145
+ logger.error(err_msg, exc_info=True)
146
+ st.info(f"ℹ️ Ensure the path '{CHROMA_PATH}' is writable.")
147
  return None
148
 
149
+ # --- Core Logic Functions (with Caching for Data Operations) ---
150
 
151
+ @st.cache_data(show_spinner=False) # Show spinner manually in UI
152
+ def analyze_image_with_gemini(_gemini_model: genai.GenerativeModel, image_bytes: bytes) -> Tuple[str, bool]:
153
  """
154
+ Analyzes image bytes with Gemini, returns (analysis_text, is_error).
155
156
  """
157
+ if not _gemini_model:
158
+ return "Error: Gemini model not initialized.", True
159
+
160
  try:
161
  img = Image.open(io.BytesIO(image_bytes))
162
+ response = _gemini_model.generate_content([GEMINI_ANALYSIS_PROMPT, img])
163
 
164
  if not response.parts:
 
165
  if response.prompt_feedback and response.prompt_feedback.block_reason:
166
+ reason = response.prompt_feedback.block_reason
167
+ msg = f"Analysis blocked by safety settings: {reason}"
168
+ logger.warning(msg)
169
+ return msg, True # Indicate block/error state
170
+ else:
171
+ msg = "Error: Gemini analysis returned no content (empty or invalid response)."
172
+ logger.error(msg)
173
+ return msg, True
174
  logger.info("Gemini analysis successful.")
175
+ return response.text, False # Indicate success
176
 
177
  except genai.types.BlockedPromptException as e:
178
+ msg = f"Analysis blocked (prompt issue): {e}"
179
+ logger.warning(msg)
180
+ return msg, True
181
  except Exception as e:
182
+ msg = f"Error during Gemini analysis: {e}"
183
+ logger.error(msg, exc_info=True)
184
+ return msg, True
185
+
186
+ @st.cache_data(show_spinner=False)
187
+ def query_chroma(_collection: chromadb.Collection, query_text: str, n_results: int = 5) -> Optional[Dict[str, List[Any]]]:
188
+ """Queries Chroma DB, returns results dict or None on error."""
189
+ if not _collection:
190
+ return None
191
  if not query_text:
192
+ logger.warning("Attempted to query Chroma with empty text.")
 
193
  return None
194
  try:
195
+ # Placeholder for potential query refinement:
196
+ # refined_query = refine_query_for_chroma(query_text) # Implement this if needed
197
+ refined_query = query_text # Using direct analysis text for now
198
+
199
+ results = _collection.query(
200
+ query_texts=[refined_query],
201
  n_results=n_results,
202
  include=['documents', 'metadatas', 'distances']
203
  )
204
+ logger.info(f"Chroma query successful for text snippet: '{query_text[:50]}...'")
205
  return results
206
  except Exception as e:
207
+ err_msg = f"Error querying Chroma DB: {e}"
208
+ st.error(err_msg) # Show error in UI as well
209
+ logger.error(err_msg, exc_info=True)
210
  return None
211
 
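The `refine_query_for_chroma` call commented out inside `query_chroma` above is only a placeholder. One minimal interpretation, sketched here under the assumption that a simple sentence-truncation heuristic is enough (the helper does not exist in this commit), would be:

    # Hypothetical helper for the placeholder above: keep the query focused by
    # embedding only the first few sentences of the Gemini analysis text.
    def refine_query_for_chroma(query_text: str, max_sentences: int = 3) -> str:
        sentences = [s.strip() for s in query_text.split(".") if s.strip()]
        return ". ".join(sentences[:max_sentences])

A more involved refinement step could instead extract named entities or key findings before querying.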
212
+ def add_dummy_data_to_chroma(collection: chromadb.Collection, embedding_func: embedding_functions.EmbeddingFunction):
213
+ """Adds example medical text snippets to Chroma using the provided embedding function."""
214
+ if not collection or not embedding_func:
215
+ st.error("❌ Cannot add dummy data: Chroma Collection or Embedding Function not available.")
216
+ return
217
 
218
+ status = st.status("Adding dummy data to Chroma DB...", expanded=False)
219
  try:
220
+ # --- Dummy Data Definition ---
221
+ # (Same data as before, but ensure metadata is useful)
222
+ docs = [
223
+ "Figure 1A shows adenocarcinoma of the lung, papillary subtype. Note the glandular structures and nuclear atypia. TTF-1 staining was positive.",
224
+ "Pathology slide 34B demonstrates high-grade glioma (glioblastoma) with significant necrosis and microvascular proliferation. Ki-67 index was high.",
225
+ "This diagram illustrates the EGFR signaling pathway and common mutation sites targeted by tyrosine kinase inhibitors in non-small cell lung cancer.",
226
+ "Micrograph showing chronic gastritis with Helicobacter pylori organisms (visible with special stain, not shown here). Mild intestinal metaplasia is present.",
227
+ "Slide CJD-Sample-02: Spongiform changes characteristic of prion disease are evident in the cerebral cortex. Gliosis is also noted."
228
+ ]
229
+ metadatas = [
230
+ {"source": "Example Paper 1", "topic": "Lung Cancer Pathology", "entities": "adenocarcinoma, lung cancer, glandular structures, nuclear atypia, papillary subtype, TTF-1", "IMAGE_ID": "fig_1a_adeno_lung.png"},
231
+ {"source": "Path Report 789", "topic": "Brain Tumor Pathology", "entities": "high-grade glioma, glioblastoma, necrosis, microvascular proliferation, Ki-67", "IMAGE_ID": "slide_34b_gbm.tiff"},
232
+ {"source": "Textbook Chapter 5", "topic": "Molecular Oncology Pathways", "entities": "EGFR, tyrosine kinase inhibitors, non-small cell lung cancer", "IMAGE_ID": "diagram_egfr_pathway.svg"},
233
+ {"source": "Path Report 101", "topic": "Gastrointestinal Pathology", "entities": "chronic gastritis, Helicobacter pylori, intestinal metaplasia", "IMAGE_ID": "micrograph_h_pylori_gastritis.jpg"},
234
+ {"source": "Case Study CJD", "topic": "Neuropathology", "entities": "prion disease, Spongiform changes, Gliosis, cerebral cortex", "IMAGE_ID": "slide_cjd_sample_02.jpg"}
235
+ ]
236
+ ids = [f"doc_hf_{int(time.time())}_{i}" for i in range(len(docs))]
237
+
238
+ # Check for existing documents (simple check based on text)
239
+ status.update(label="Checking for existing dummy documents...")
240
  existing_docs = collection.get(where={"$or": [{"document": doc} for doc in docs]}, include=[])
241
  if not existing_docs or not existing_docs.get('ids'):
242
+ status.update(label=f"Generating embeddings for {len(docs)} documents (may take time)...")
243
+ # Embeddings are generated implicitly by ChromaDB during .add()
244
+ # when an embedding_function is configured for the collection.
245
  collection.add(
246
  documents=docs,
247
  metadatas=metadatas,
248
  ids=ids
249
  )
250
+ status.update(label=f"✅ Added {len(docs)} dummy documents.", state="complete")
251
+ logger.info(f"Added {len(docs)} dummy documents to collection '{COLLECTION_NAME}'.")
252
  else:
253
+ status.update(label="⚠️ Dummy data already exists. No new data added.", state="complete")
254
+ logger.warning("Dummy data seems to already exist in the collection based on text match.")
255
 
256
  except Exception as e:
257
+ err_msg = f"Error adding dummy data to Chroma: {e}"
258
+ status.update(label=f"❌ Error: {err_msg}", state="error")
259
+ logger.error(err_msg, exc_info=True)
260
 
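Note that the duplicate check in `add_dummy_data_to_chroma` uses a metadata `where` filter on a `document` key, but Chroma's `where` clause matches metadata fields rather than stored document text, so the check can pass every time and re-insert the demo rows under new time-based IDs. A simpler pattern, sketched here with hypothetical fixed IDs (`dummy_doc_<i>`) that this commit does not use, is to make the IDs deterministic and look them up directly:

    # Sketch: deterministic IDs make the existence check trivial and the demo data idempotent.
    ids = [f"dummy_doc_{i}" for i in range(len(docs))]   # hypothetical fixed IDs
    existing = collection.get(ids=ids)                   # returns only the IDs that already exist
    if not existing["ids"]:
        collection.add(documents=docs, metadatas=metadatas, ids=ids)
    # With time-based IDs, every rerun inserts a fresh copy of the same text.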
261
+ # --- Initialize Resources ---
262
+ # These calls use @st.cache_resource, so they run only once per session/resource change.
263
+ gemini_model = initialize_gemini_model()
264
+ embedding_func = initialize_embedding_function()
265
+ collection = initialize_chroma_collection(embedding_func) # Pass embedding func to chroma init
266
 
267
+ # --- Streamlit UI ---
268
+ st.set_page_config(layout="wide", page_title="Medical Image Analysis & RAG (HF)")
269
+ st.title("⚕️ Medical Image Analysis & RAG (Hugging Face Enhanced)")
 
270
 
271
+ # --- DISCLAIMER ---
272
  st.warning("""
273
+ **⚠️ Disclaimer:** This tool is for demonstration and informational purposes ONLY.
274
+ It is **NOT** a medical device and should **NOT** be used for actual medical diagnosis, treatment, or decision-making.
275
+ AI analysis can be imperfect. Always consult with qualified healthcare professionals for any medical concerns.
276
+ Do **NOT** upload identifiable patient data (PHI).
277
  """)
278
 
279
+ st.markdown("""
280
+ Upload a medical image. Gemini Vision will analyze it, and related information
281
+ will be retrieved from a Chroma DB knowledge base using Hugging Face embeddings.
282
+ """)
283
 
284
+ # Sidebar
285
  with st.sidebar:
286
  st.header("⚙️ Controls")
287
  uploaded_file = st.file_uploader(
288
+ "Choose an image...",
289
  type=["jpg", "jpeg", "png", "tiff", "webp"],
290
+ help="Upload a medical image file (e.g., pathology, diagram)."
291
  )
292
 
293
  st.divider()
294
 
295
+ if st.button("➕ Add/Verify Dummy KB Data", help="Adds example text data to Chroma DB if it doesn't exist."):
296
+ if collection and embedding_func:
297
+ add_dummy_data_to_chroma(collection, embedding_func)
298
+ else:
299
+ st.error("❌ Cannot add dummy data: Chroma Collection or Embedding Function failed to initialize.")
300
+
301
+ st.divider()
302
 
303
  st.info(f"""
304
+ **Setup Info:**
305
+ - Gemini Model: `{VISION_MODEL_NAME}`
306
+ - Embedding Model: `{EMBEDDING_MODEL_NAME}`
307
+ - Chroma Collection: `{COLLECTION_NAME}` (at `{CHROMA_PATH}`)
308
+ - Distance Metric: `{CHROMA_DISTANCE_METRIC}`
309
  """)
310
+ st.caption(f"Using Google API Key: {'*' * (len(GOOGLE_API_KEY)-4)}{GOOGLE_API_KEY[-4:]}" if GOOGLE_API_KEY else "Not Set")
311
+ st.caption(f"Using HF Token: {'Provided' if HF_TOKEN else 'Not Provided'}")
312
 
313
+ # Main Display Area
 
314
  col1, col2 = st.columns(2)
315
 
316
  with col1:
 
319
  image_bytes = uploaded_file.getvalue()
320
  st.image(image_bytes, caption=f"Uploaded: {uploaded_file.name}", use_column_width=True)
321
  else:
322
+ st.info("Upload an image using the sidebar to begin.")
323
 
324
  with col2:
325
+ st.subheader("🔬 Analysis & Retrieval")
326
+ if uploaded_file is not None and gemini_model and collection:
327
+ # 1. Analyze Image
328
+ analysis_text = ""
329
+ analysis_error = False
330
+ with st.status("🧠 Analyzing image with Gemini Vision...", expanded=True) as status_gemini:
331
+ # The actual analysis function is cached via @st.cache_data
332
+ analysis_text, analysis_error = analyze_image_with_gemini(gemini_model, image_bytes)
333
+ if analysis_error:
334
+ status_gemini.update(label=f"⚠️ Analysis Failed/Blocked: {analysis_text.split(':', 1)[1].strip() if ':' in analysis_text else 'See details'}", state="error")
335
+ st.error(f"**Analysis Output:** {analysis_text}") # Show error/block message
336
+ else:
337
+ status_gemini.update(label="✅ Analysis Complete", state="complete")
338
+ st.markdown("**Gemini Vision Analysis:**")
339
+ st.markdown(analysis_text)
340
+
341
+ # 2. Query Chroma if Analysis Succeeded
342
+ if not analysis_error and analysis_text:
343
+ st.markdown("---")
344
+ st.subheader("📚 Related Information (RAG)")
345
+ with st.status("🔍 Searching knowledge base (Chroma DB)...", expanded=True) as status_chroma:
346
+ # The actual query function is cached via @st.cache_data
347
+ chroma_results = query_chroma(collection, analysis_text, n_results=3)
348
+
349
+ if chroma_results and chroma_results.get('documents') and chroma_results['documents'][0]:
350
+ num_results = len(chroma_results['documents'][0])
351
+ status_chroma.update(label=f"✅ Found {num_results} related entries.", state="complete")
352
+
353
+ for i in range(num_results):
354
+ doc = chroma_results['documents'][0][i]
355
+ meta = chroma_results['metadatas'][0][i]
356
+ dist = chroma_results['distances'][0][i]
357
+ similarity = 1.0 - dist # For cosine distance
358
+
359
+ expander_title = f"Result {i+1} (Similarity: {similarity:.4f}) | Source: {meta.get('source', 'N/A')}"
360
+ with st.expander(expander_title):
361
+ st.markdown("**Retrieved Text:**")
362
+ st.markdown(f"> {doc}")
363
+ st.markdown("**Metadata:**")
364
+ # Display metadata keys/values more nicely
365
+ for key, value in meta.items():
366
+ st.markdown(f"- **{key.replace('_', ' ').title()}:** `{value}`")
367
+
368
+ # Highlight linked image ID
369
+ if meta.get("IMAGE_ID"):
370
+ st.info(f"ℹ️ Associated visual asset ID: `{meta['IMAGE_ID']}`")
371
+
372
+ elif chroma_results is not None: # Query ran, no results
373
+ status_chroma.update(label="⚠️ No relevant information found.", state="warning")
374
+ else: # Error occurred during query (already logged and shown via st.error)
375
+ status_chroma.update(label="❌ Failed to retrieve results.", state="error")
376
 
377
  elif not uploaded_file:
378
+ st.info("Analysis results will appear here once an image is uploaded.")
379
  else:
380
+ st.error("❌ Analysis cannot proceed. Check if Gemini model or Chroma DB failed to initialize (see sidebar/logs).")
 
 
381
 
 
382
  st.markdown("---")
383
+ st.markdown("<div style='text-align: center; font-size: small;'>Powered by Google Gemini, Chroma DB, Hugging Face, and Streamlit</div>", unsafe_allow_html=True)
384
+
385
+