mgbam committed on
Commit
21689c4
·
verified ·
1 Parent(s): 5709238

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +234 -0
app.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import google.generativeai as genai
3
+ import chromadb
4
+ from chromadb.utils import embedding_functions
5
+ from PIL import Image
6
+ import os
7
+ import io
8
+ import time # To create unique IDs for Chroma
9
+
10
+ # --- Configuration ---
11
# Resolve the Google API key and configure the Gemini client.
# Streamlit secrets are tried first. Hugging Face Spaces also expose secrets
# as environment variables, so that is used as a fallback when no Streamlit
# secrets file exists or the key is absent from it.
try:
    try:
        GOOGLE_API_KEY = st.secrets["GOOGLE_API_KEY"]
    except (KeyError, FileNotFoundError):
        # Missing key or missing secrets file: fall back to the environment.
        GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
    if not GOOGLE_API_KEY:
        # Route "not configured anywhere" to the dedicated message below.
        raise KeyError("GOOGLE_API_KEY")
    genai.configure(api_key=GOOGLE_API_KEY)
except KeyError:
    st.error("GOOGLE_API_KEY not found in Hugging Face secrets!")
    st.stop()
except Exception as e:
    st.error(f"Error configuring Google AI: {e}")
    st.stop()
21
+
22
+ # --- Gemini Model Setup ---
23
+ # Check available models if needed, select the vision model
24
+ # for m in genai.list_models():
25
+ # if 'generateContent' in m.supported_generation_methods:
26
+ # print(m.name) # Find the vision model name (e.g., 'gemini-pro-vision')
27
+
28
# Name of the vision-capable Gemini model used for every analysis request.
# NOTE(review): confirm this model name is still served by the API.
VISION_MODEL_NAME = "gemini-pro-vision"

# Sampling parameters; the low temperature favours factual descriptions.
GENERATION_CONFIG = dict(
    temperature=0.2,
    top_p=0.95,
    top_k=40,
    max_output_tokens=1024,
)

# One entry per harm category, all sharing the same blocking threshold.
# Adjust as needed for medical content.
SAFETY_SETTINGS = [
    {"category": harm_category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# Build the model handle once at import time; abort the script run on failure.
try:
    gemini_model = genai.GenerativeModel(
        model_name=VISION_MODEL_NAME,
        generation_config=GENERATION_CONFIG,
        safety_settings=SAFETY_SETTINGS,
    )
except Exception as exc:
    st.error(f"Error initializing Gemini Model ({VISION_MODEL_NAME}): {exc}")
    st.stop()
51
+
52
+ # --- Chroma DB Setup ---
53
+ # Using persistent storage within the HF Space (data lost if space is wiped)
54
+ # For production, consider a hosted Chroma or other DB solution.
55
# On-disk location of the Chroma store and the collection this app reads/writes.
CHROMA_PATH = "chroma_data"
COLLECTION_NAME = "medical_docs"

# Chroma's default embedding function. Queries and ingested documents must be
# embedded with the same function, so it is created once and shared.
embedding_func = embedding_functions.DefaultEmbeddingFunction()

try:
    chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
    # Open the collection, creating it on first use; distances are cosine.
    collection = chroma_client.get_or_create_collection(
        name=COLLECTION_NAME,
        embedding_function=embedding_func,
        metadata={"hnsw:space": "cosine"},
    )
except Exception as exc:
    # Report the failure, add a hint, and end this script run.
    st.error(f"Error initializing Chroma DB at '{CHROMA_PATH}': {exc}")
    st.info("If this is the first run, the directory will be created.")
    st.stop()
76
+
77
+
78
+ # --- Helper Functions ---
79
def analyze_image_with_gemini(image_bytes):
    """Send image bytes to Gemini Vision and return the text description.

    Args:
        image_bytes: Raw bytes of an image file that PIL can open.

    Returns:
        The model's text on success. On any failure a message is returned
        instead of raising, and that message always starts with either
        "Error:" or "Analysis blocked:" so callers can detect failure with a
        simple prefix check. This function does not render anything itself;
        displaying the message is left to the caller.
    """
    prompt = (
        "Analyze this medical image (could be a pathology slide, diagram, or other medical visual).\n"
        "Describe the key visual features relevant to a medical professional.\n"
        "Identify potential:\n"
        "- Diseases or conditions suggested\n"
        "- Pathological findings (e.g., cellular morphology, tissue structure, staining patterns)\n"
        "- Cell types visible\n"
        "- Relevant biomarkers (if inferrable from staining or morphology)\n"
        "- Anatomical context (if clear)\n"
        "\n"
        "Be concise and focus on visually evident information.\n"
    )
    try:
        img = Image.open(io.BytesIO(image_bytes))
        response = gemini_model.generate_content([prompt, img])

        # An empty response is either a blocked prompt or an unexplained failure.
        if not response.parts:
            feedback = response.prompt_feedback
            if feedback and feedback.block_reason:
                return f"Analysis blocked: {feedback.block_reason}"
            return "Error: Gemini analysis failed or returned no content."

        return response.text
    except genai.types.BlockedPromptException as e:
        # Keep the "Analysis blocked:" prefix so the caller's check matches.
        return f"Analysis blocked: rejected by safety settings: {e}"
    except Exception as e:
        # Keep the "Error:" prefix so the caller's check matches.
        return f"Error: image analysis failed: {e}"
111
+
112
+
113
def query_chroma(query_text, n_results=5):
    """Look up the entries in the Chroma collection closest to ``query_text``.

    Args:
        query_text: Text to search the collection with.
        n_results: Maximum number of matches to request.

    Returns:
        The result of ``collection.query`` (documents, metadatas and
        distances are requested), or ``None`` if the query raised; in that
        case the error is shown via ``st.error``.
    """
    wanted_fields = ['documents', 'metadatas', 'distances']
    try:
        return collection.query(
            query_texts=[query_text],
            n_results=n_results,
            include=wanted_fields,
        )
    except Exception as exc:
        st.error(f"Error querying Chroma DB: {exc}")
    return None
125
+
126
def add_dummy_data_to_chroma():
    """Add a fixed set of example medical text snippets to the Chroma collection.

    The snippets stand in for text that would normally be extracted from real
    medical documents together with metadata such as an IMAGE_ID.

    Behaviour:
        - Nested metadata values (the "entities" dicts) are stored as JSON
          strings, because Chroma only accepts scalar metadata values.
        - IDs are derived from each document's text, so pressing the button
          again finds the existing entries and adds only missing ones.
        - Progress and failures are reported through Streamlit messages;
          nothing is returned and no exception propagates.
    """
    import hashlib
    import json

    st.info("Adding dummy data to Chroma DB...")
    docs = [
        "Figure 1A shows adenocarcinoma of the lung, papillary subtype. Note the glandular structures and nuclear atypia. TTF-1 staining was positive.",
        "Pathology slide 34B demonstrates high-grade glioma (glioblastoma) with significant necrosis and microvascular proliferation. Ki-67 index was high.",
        "This diagram illustrates the EGFR signaling pathway and common mutation sites targeted by tyrosine kinase inhibitors in non-small cell lung cancer.",
        "Micrograph showing chronic gastritis with Helicobacter pylori organisms (visible with special stain, not shown here). Mild intestinal metaplasia is present.",
        "Slide CJD-Sample-02: Spongiform changes characteristic of prion disease are evident in the cerebral cortex. Gliosis is also noted.",
    ]
    raw_metadatas = [
        {"source": "Example Paper 1", "entities": {"DISEASES": ["adenocarcinoma", "lung cancer"], "PATHOLOGY_FINDINGS": ["glandular structures", "nuclear atypia", "papillary subtype"], "BIOMARKERS": ["TTF-1"]}, "IMAGE_ID": "fig_1a_adeno_lung.png"},
        {"source": "Path Report 789", "entities": {"DISEASES": ["high-grade glioma", "glioblastoma"], "PATHOLOGY_FINDINGS": ["necrosis", "microvascular proliferation"], "BIOMARKERS": ["Ki-67"]}, "IMAGE_ID": "slide_34b_gbm.tiff"},
        {"source": "Textbook Chapter 5", "entities": {"GENES": ["EGFR"], "DRUGS": ["tyrosine kinase inhibitors"], "DISEASES": ["non-small cell lung cancer"]}, "IMAGE_ID": "diagram_egfr_pathway.svg"},
        {"source": "Path Report 101", "entities": {"DISEASES": ["chronic gastritis", "Helicobacter pylori infection"], "PATHOLOGY_FINDINGS": ["intestinal metaplasia"]}, "IMAGE_ID": "micrograph_h_pylori_gastritis.jpg"},
        {"source": "Case Study CJD", "entities": {"DISEASES": ["prion disease"], "PATHOLOGY_FINDINGS": ["Spongiform changes", "Gliosis"], "ANATOMICAL_LOCATIONS": ["cerebral cortex"]}, "IMAGE_ID": "slide_cjd_sample_02.jpg"},
    ]

    # Chroma rejects dict/list metadata values, so keep scalars as they are
    # and serialise anything else to a JSON string.
    metadatas = [
        {
            key: value if isinstance(value, (str, int, float, bool)) else json.dumps(value)
            for key, value in meta.items()
        }
        for meta in raw_metadatas
    ]

    # Content-derived IDs: the same text always maps to the same ID, which is
    # what makes the duplicate check below work across reruns.
    ids = [
        "doc_" + hashlib.sha1(doc.encode("utf-8")).hexdigest()[:16]
        for doc in docs
    ]

    try:
        # `where` filters metadata, not document text, so existence is
        # checked by ID instead.
        existing = collection.get(ids=ids)
        known_ids = set(existing["ids"]) if existing and existing.get("ids") else set()
        missing = [i for i, doc_id in enumerate(ids) if doc_id not in known_ids]

        if missing:
            collection.add(
                documents=[docs[i] for i in missing],
                metadatas=[metadatas[i] for i in missing],
                ids=[ids[i] for i in missing],
            )
            st.success(f"Added {len(missing)} dummy documents to Chroma collection '{COLLECTION_NAME}'.")
        else:
            st.warning("Dummy data seems to already exist in the collection.")
    except Exception as e:
        st.error(f"Error adding dummy data to Chroma: {e}")
165
+
166
+
167
+ # --- Streamlit UI ---
168
# Page chrome: wide layout, title, and a short description of the workflow.
st.set_page_config(layout="wide")
st.title("⚕️ Medical Image Analysis & RAG")
st.markdown(
    "Upload a medical image (pathology slide, diagram, etc.). "
    "Gemini Vision will analyze it, and Chroma DB will retrieve related information from a knowledge base."
)

# Sidebar: image upload, a button that seeds the knowledge base, and a note
# about how long the stored data lives.
with st.sidebar:
    st.header("Controls")
    uploaded_file = st.file_uploader(
        "Choose an image...",
        type=["jpg", "jpeg", "png", "tiff", "webp"],
    )
    load_requested = st.button("Load Dummy KB Data")
    if load_requested:
        add_dummy_data_to_chroma()
    st.info(
        "Note: Chroma data persists in the Space's storage "
        "but is lost if the Space is reset/deleted."
    )
179
+
180
+
181
+ # Main area for display
182
# Main area: show the uploaded image, its Gemini analysis, and the knowledge
# base entries closest to that analysis.
if uploaded_file is not None:
    image_bytes = uploaded_file.getvalue()

    st.image(image_bytes, caption=f"Uploaded Image: {uploaded_file.name}", use_column_width=False, width=400)

    st.markdown("---")
    st.subheader("🔬 Gemini Vision Analysis")

    with st.spinner("Analyzing image with Gemini Vision..."):
        analysis_text = analyze_image_with_gemini(image_bytes)

    # The analysis helper reports failure through its return string. These
    # prefixes cover every failure message it can produce, including the
    # ones built in its exception handlers.
    analysis_failed = analysis_text.startswith(
        ("Error:", "Error analyzing image:", "Analysis blocked")
    )
    if analysis_failed:
        st.error(analysis_text)
    else:
        st.markdown(analysis_text)

    st.markdown("---")
    st.subheader("📚 Related Information from Knowledge Base (Chroma DB)")

    if analysis_failed:
        # Searching with an error message as the query would only return noise.
        st.warning("Skipping knowledge base lookup because the image analysis did not succeed.")
    else:
        with st.spinner("Querying Chroma DB..."):
            chroma_results = query_chroma(analysis_text)

        if chroma_results and chroma_results.get('documents') and chroma_results['documents'][0]:
            # Results are nested one list per query text; only one was sent.
            documents = chroma_results['documents'][0]
            metadatas = chroma_results['metadatas'][0]
            distances = chroma_results['distances'][0]

            st.success(f"Found {len(documents)} related entries:")
            for i, (doc, meta, dist) in enumerate(zip(documents, metadatas, distances)):
                meta = meta or {}  # tolerate entries stored without metadata

                with st.expander(f"Result {i+1} (Distance: {dist:.4f}) - Source: {meta.get('source', 'N/A')}"):
                    st.markdown("**Text:**")
                    st.markdown(doc)
                    st.markdown("**Metadata:**")
                    st.json(meta)

                    # Point out when the entry refers to another image.
                    if meta.get("IMAGE_ID"):
                        st.info(f"ℹ️ This text describes another visual asset: `{meta['IMAGE_ID']}`")
        elif chroma_results is not None:
            # The query ran but matched nothing.
            st.warning("No relevant information found in the knowledge base for this analysis.")
        else:
            # query_chroma returned None, i.e. the query itself failed.
            st.error("Failed to retrieve results from Chroma DB.")

else:
    st.info("Upload an image using the sidebar to start the analysis.")

st.markdown("---")
st.markdown("Powered by Google Gemini, Chroma DB, and Streamlit.")