Update app.py

app.py CHANGED
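This commit compacts app.py: imports are regrouped, HuggingFaceEmbeddings moves to the langchain_huggingface package, question answering switches from the deprecated retriever.get_relevant_documents(...) to retriever.invoke(...), prompts are inlined as string literals, the per-property CSS rules are collapsed to one line per class, and the Gradio event wiring is condensed to single-line .click(...) calls.

Old version (lines removed in this commit are marked "-"; lines whose removed content could not be recovered are shown as a bare "-"):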
@@ -1,341 +1,198 @@
import os
- import gradio as gr
import tempfile
from pathlib import Path
import base64
- import fitz  # PyMuPDF
from PIL import Image
import io

-
from langchain_community.vectorstores import FAISS
- from langchain_community.embeddings import HuggingFaceEmbeddings
- # Text splitter to break large documents into manageable chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
- # HF Inference client for multimodal model
- from huggingface_hub import InferenceClient

# ── Globals ───────────────────────────────────────────────────────────────────
- index = None
- retriever = None
- current_pdf_name = None
- extracted_content = None
- extracted_images = []

# ── Single Multimodal Model ──────────────────────────────────────────────────
- # Using a single multimodal model that can handle both text and images
multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
-
- # ── Multimodal Embeddings ────────────────────────────────────────────────────
- # Using CLIP-based embeddings that can handle both text and images
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

- # Create temp dirs
temp_dir = tempfile.mkdtemp()
figures_dir = os.path.join(temp_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)

def encode_image_to_base64(image_path):
-     """Convert image to base64 for API calls"""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def extract_images_from_pdf_pymupdf(pdf_path):
-     """
-     Extract images from PDF using PyMuPDF (works on HF Spaces)
-     Args:
-         pdf_path: Path to the PDF file
-     Returns:
-         List of image paths and their descriptions
-     """
    extracted_images = []
    image_descriptions = []
-
    try:
-         # Open PDF with PyMuPDF
        pdf_document = fitz.open(pdf_path)
-
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
-
-             image_list = page.get_images()
-             for img_index, img in enumerate(image_list):
-                 # Get image data
                xref = img[0]
                pix = fitz.Pixmap(pdf_document, xref)
-
-                 # Convert to PIL Image
-                 if pix.n - pix.alpha < 4:  # GRAY or RGB
                    img_data = pix.tobytes("png")
                    img_pil = Image.open(io.BytesIO(img_data))
-
-                     # Save image
                    image_filename = f"page_{page_num}_img_{img_index}.png"
                    image_path = os.path.join(figures_dir, image_filename)
                    img_pil.save(image_path)
-
-                     # Analyze image with multimodal model
-                     description = analyze_image_with_multimodal_model(image_path)
-
                    extracted_images.append(image_path)
-                     image_descriptions.append(description)
-
-                 pix = None  # Free memory
-
        pdf_document.close()
        return extracted_images, image_descriptions
-
    except Exception as e:
        print(f"Error extracting images: {e}")
        return [], []

def analyze_image_with_multimodal_model(image_path):
-     """
-     Analyze an extracted image using the multimodal model.
-     Args:
-         image_path: Path to the extracted image file
-     Returns:
-         Text description of the image content
-     """
    try:
-         image_base64 = encode_image_to_base64(image_path)
-
-         prompt = f"""Analyze this image and provide a detailed description.
- Include any text, data, charts, diagrams, tables, or important visual elements you can see.
-
- Image: [Image data provided]
-
- Description:"""
-
-         # Use multimodal model for image analysis
-         # Note: Simplified for HF Spaces compatibility
-         response = multimodal_client.text_generation(
-             prompt=prompt,
-             max_new_tokens=200,
-             temperature=0.3
        )
-
-         return f"[IMAGE CONTENT]: {response.strip()}"
-
    except Exception as e:
-         return f"[IMAGE CONTENT]: Could not analyze image - {e}"

def process_pdf_multimodal(pdf_file):
-     """
-     Process PDF using PyMuPDF (HF Spaces compatible).
-     """
    global current_pdf_name, index, retriever, extracted_content, extracted_images
-
    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    current_pdf_name = os.path.basename(pdf_file.name)
-
    try:
-         # Clear previous figures
-         extracted_images.clear()
-         for file in os.listdir(figures_dir):
-             os.remove(os.path.join(figures_dir, file))
-
-         # Extract text using PyMuPDF
        pdf_document = fitz.open(pdf_file.name)
        text_elements = []
-
-         for page_num in range(len(pdf_document)):
-             page = pdf_document.load_page(page_num)
-             text = page.get_text()
-             if text.strip():
-                 text_elements.append(f"[PAGE {page_num + 1}]\n{text.strip()}")
-
        pdf_document.close()
-
-         # Extract and analyze images
-         image_paths, image_descriptions = extract_images_from_pdf_pymupdf(pdf_file.name)
-         extracted_images.extend(image_paths)
-
-         # Combine all content
-         all_content = text_elements + image_descriptions
        extracted_content = "\n\n".join(all_content)
-
-
-
-
-
-         text_splitter = RecursiveCharacterTextSplitter(
-             chunk_size=1000,
-             chunk_overlap=200,
-             add_start_index=True
        )
-         chunks = text_splitter.split_text(extracted_content)
-
-         # Create FAISS index with multimodal embeddings
        index = FAISS.from_texts(chunks, embeddings)
        retriever = index.as_retriever(search_kwargs={"k": 3})
-
-
-
-
-
-
        return current_pdf_name, status, gr.update(interactive=True)
-
    except Exception as e:
-         error_msg = f"❌ Error processing PDF: {e}"
-         return current_pdf_name, error_msg, gr.update(interactive=False)

def ask_multimodal_question(pdf_name, question):
-     """
-     Answer questions about the document using the multimodal model.
-     """
-     global retriever, extracted_images
-
-     if index is None or retriever is None:
        return "❌ Please upload and process a PDF first."
-
    if not question.strip():
        return "❌ Please enter a question."
-
-     try:
-         # Retrieve relevant chunks
-         docs = retriever.get_relevant_documents(question)
-         context = "\n\n".join(doc.page_content for doc in docs)
-
-         # Create prompt for text generation
-         prompt = f"""You are an AI assistant analyzing a document that contains both text and visual elements.
-
- RETRIEVED CONTEXT:
- {context}
-
- QUESTION: {question}

- Please provide a comprehensive answer based on the retrieved context above.
- If you reference visual elements, mention them explicitly.
-
- ANSWER:"""
-
-         response = multimodal_client.text_generation(
-             prompt=prompt,
-             max_new_tokens=300,
-             temperature=0.5
        )
-
-         return response.strip()
-
    except Exception as e:
-         return f"❌ Error generating answer: {e}"

def generate_multimodal_summary():
-     """
-     Generate summary using the multimodal model.
-     """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."
-
    try:
-
-         content_preview = extracted_content[:4000]
-
        messages = [
-             {
-                 "role": "user",
-                 "content": [
-                     {
-                         "type": "text",
-                         "text": f"""Please provide a comprehensive summary of this document content. The content includes both textual information and descriptions of visual elements (images, charts, tables, diagrams).
-
- DOCUMENT CONTENT:
- {content_preview}
-
- Create a well-structured summary that captures:
- 1. Main topics and key points from the text
- 2. Important information from visual elements (charts, images, tables)
- 3. Overall document purpose and conclusions
-
- SUMMARY:"""
-                     }
-                 ]
-             }
        ]
-
-         response = multimodal_client.chat_completion(
-             messages=messages,
-             max_tokens=250,
-             temperature=0.3
        )
-
-         return response["choices"][0]["message"]["content"].strip()
-
    except Exception as e:
-         return f"❌ Error generating summary: {e}"

def extract_multimodal_keywords():
-     """
-     Extract keywords using the multimodal model.
-     """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."
-
    try:
-         content_preview = extracted_content[:3000]
-
        messages = [
-             {
-                 "role": "user",
-                 "content": [
-                     {
-                         "type": "text",
-                         "text": f"""Analyze the following document content and extract 12-15 key terms, concepts, and important phrases. The content includes both text and descriptions of visual elements.
-
- DOCUMENT CONTENT:
- {content_preview}
-
- Extract key terms that represent:
- - Main topics and concepts
- - Important technical terms
- - Key findings or data points
- - Visual elements mentioned (chart types, image subjects)
-
- Format as a comma-separated list.
-
- KEY TERMS:"""
-                     }
-                 ]
-             }
        ]
-
-         response = multimodal_client.chat_completion(
-             messages=messages,
-             max_tokens=120,
-             temperature=0.3
        )
-
-         return response["choices"][0]["message"]["content"].strip()
-
    except Exception as e:
-         return f"❌ Error extracting keywords: {e}"

def clear_multimodal_interface():
-     """
-     Reset all global state and clear UI.
-     """
    global index, retriever, current_pdf_name, extracted_content, extracted_images
-
-     try:
-         for file in os.listdir(figures_dir):
-             os.remove(os.path.join(figures_dir, file))
-     except:
-         pass
-
-     # Reset globals
    index = retriever = None
    current_pdf_name = extracted_content = None
    extracted_images.clear()
-
    return None, "", gr.update(interactive=False)

# ── Gradio UI ────────────────────────────────────────────────────────────────
@@ -345,37 +202,12 @@ with gr.Blocks(theme=theme, css="""
.container { border-radius: 10px; padding: 15px; }
.pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
.footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
- .main-title {
-     text-align: center;
-     font-size: 64px;
-     font-weight: bold;
-     margin-bottom: 20px;
- }
- .multimodal-badge {
-     background: linear-gradient(45deg, #6366f1, #8b5cf6);
-     color: white;
-     padding: 5px 15px;
-     border-radius: 20px;
-     font-size: 14px;
-     display: inline-block;
-     margin: 10px auto;
- }
- .model-info {
-     background: #f8fafc;
-     border: 1px solid #e2e8f0;
-     border-radius: 8px;
-     padding: 10px;
-     margin: 10px 0;
-     font-size: 12px;
-     color: #64748b;
- }
""") as demo:
-
- # Application title with multimodal badge
gr.Markdown("<div class='main-title'>Unified MultiModal RAG</div>")
- gr.Markdown("<div style='text-align:center;'><span class='multimodal-badge'>🧠 Single Model • Text + Vision</span></div>")
-
- # Model information
gr.Markdown("""
<div class='model-info'>
<strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision + CLIP Embeddings + PyMuPDF (HF Spaces Compatible)
@@ -389,19 +221,12 @@ with gr.Blocks(theme=theme, css="""
pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
upload_button = gr.Button("🚀 Process with Multimodal AI", variant="primary")
status_box = gr.Textbox(label="Processing Status", interactive=False)
-
with gr.Column():
    gr.Markdown("## ❓ Ask Questions")
-     gr.
-     question_input = gr.Textbox(
-         lines=3,
-         placeholder="Ask about text content, images, charts, tables, or any visual elements...",
-         interactive=False
-     )
    ask_button = gr.Button("🔍 Ask Multimodal AI", variant="primary")
    answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)

- # Analysis tools
with gr.Row():
    with gr.Column():
        summary_button = gr.Button("📋 Generate Summary", variant="secondary")
@@ -410,34 +235,18 @@ with gr.Blocks(theme=theme, css="""
keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)

- # Clear button
clear_button = gr.Button("🗑️ Clear All", variant="secondary")
-
gr.Markdown("""
<div class='footer'>
- <strong>Unified Multimodal Pipeline:</strong> One model handles text and images<br>
- Supports: Text • Images • Charts • Tables • Diagrams • Mixed Content Queries
</div>
""")

-
- upload_button.click(
-     process_pdf_multimodal,
-     [pdf_file],
-     [pdf_display, status_box, question_input]
- )
- ask_button.click(
-     ask_multimodal_question,
-     [pdf_display, question_input],
-     answer_output
- )
summary_button.click(generate_multimodal_summary, [], summary_output)
keywords_button.click(extract_multimodal_keywords, [], keywords_output)
- clear_button.click(
-     clear_multimodal_interface,
-     [],
-     [pdf_file, pdf_display, question_input]
- )

if __name__ == "__main__":
-     demo.launch(debug=True)
New version (lines added in this commit are marked "+"):

+ # app.py
import os
import tempfile
from pathlib import Path
import base64
+ import fitz  # PyMuPDF
from PIL import Image
import io

+ import gradio as gr
+ from huggingface_hub import InferenceClient
+
+ # Import vectorstore and embeddings from updated packages
from langchain_community.vectorstores import FAISS
+ from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# ── Globals ───────────────────────────────────────────────────────────────────
+ index = None
+ retriever = None
+ current_pdf_name = None
+ extracted_content = None
+ extracted_images = []

# ── Single Multimodal Model ──────────────────────────────────────────────────
multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

+ # Create temp dirs
temp_dir = tempfile.mkdtemp()
figures_dir = os.path.join(temp_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)
def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def extract_images_from_pdf_pymupdf(pdf_path):
    extracted_images = []
    image_descriptions = []
    try:
        pdf_document = fitz.open(pdf_path)
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
+             for img_index, img in enumerate(page.get_images()):
                xref = img[0]
                pix = fitz.Pixmap(pdf_document, xref)
+                 if pix.n - pix.alpha < 4:
                    img_data = pix.tobytes("png")
                    img_pil = Image.open(io.BytesIO(img_data))
                    image_filename = f"page_{page_num}_img_{img_index}.png"
                    image_path = os.path.join(figures_dir, image_filename)
                    img_pil.save(image_path)
+                     desc = analyze_image_with_multimodal_model(image_path)
                    extracted_images.append(image_path)
+                     image_descriptions.append(desc)
+                 pix = None
        pdf_document.close()
        return extracted_images, image_descriptions
    except Exception as e:
        print(f"Error extracting images: {e}")
        return [], []

def analyze_image_with_multimodal_model(image_path):
    try:
+         b64 = encode_image_to_base64(image_path)
+         prompt = (
+             "Analyze this image and provide a detailed description. Include any text, data, "
+             "charts, diagrams, tables, or important visual elements you can see.\n"
+             "Image: [Image data provided]\nDescription:"
        )
+         resp = multimodal_client.text_generation(
+             prompt=prompt, max_new_tokens=200, temperature=0.3
+         )
+         return "[IMAGE CONTENT]: " + resp.strip()
    except Exception as e:
+         return f"[IMAGE CONTENT]: Could not analyze image - {e}"

def process_pdf_multimodal(pdf_file):
    global current_pdf_name, index, retriever, extracted_content, extracted_images
    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    current_pdf_name = os.path.basename(pdf_file.name)
+     extracted_images.clear()
+     for f in os.listdir(figures_dir):
+         os.remove(os.path.join(figures_dir, f))
+
    try:
+         # Text extraction
        pdf_document = fitz.open(pdf_file.name)
        text_elements = []
+         for i in range(len(pdf_document)):
+             p = pdf_document.load_page(i)
+             t = p.get_text().strip()
+             if t:
+                 text_elements.append(f"[PAGE {i+1}]\n{t}")
        pdf_document.close()
+
+         # Image extraction & analysis
+         imgs, img_descs = extract_images_from_pdf_pymupdf(pdf_file.name)
+         extracted_images.extend(imgs)
+
+         # Combine content and split
+         all_content = text_elements + img_descs
        extracted_content = "\n\n".join(all_content)
+         if not extracted_content:
+             return current_pdf_name, "❌ No content extracted.", gr.update(interactive=False)
+
+         splitter = RecursiveCharacterTextSplitter(
+             chunk_size=1000, chunk_overlap=200, add_start_index=True
        )
+         chunks = splitter.split_text(extracted_content)
+
        index = FAISS.from_texts(chunks, embeddings)
        retriever = index.as_retriever(search_kwargs={"k": 3})
+
+         status = (
+             f"✅ Processed '{current_pdf_name}' → "
+             f"{len(chunks)} chunks "
+             f"({len(text_elements)} pages, {len(img_descs)} images analyzed)"
+         )
        return current_pdf_name, status, gr.update(interactive=True)
+
    except Exception as e:
+         return current_pdf_name, f"❌ Error processing PDF: {e}", gr.update(interactive=False)

def ask_multimodal_question(pdf_name, question):
+     global retriever
+     if not retriever:
        return "❌ Please upload and process a PDF first."
    if not question.strip():
        return "❌ Please enter a question."

+     try:
+         docs = retriever.invoke(question)
+         context = "\n\n".join(d.page_content for d in docs)
+         prompt = (
+             "You are an AI assistant analyzing a document that contains both text and visual elements.\n\n"
+             f"RETRIEVED CONTEXT:\n{context}\n\n"
+             f"QUESTION: {question}\n"
+             "Please provide a comprehensive answer based on the retrieved context above. "
+             "If you reference visual elements, mention them explicitly.\nANSWER:"
+         )
+         resp = multimodal_client.text_generation(
+             prompt=prompt, max_new_tokens=300, temperature=0.5
        )
+         return resp.strip()
    except Exception as e:
+         return f"❌ Error generating answer: {e}"

def generate_multimodal_summary():
    if not extracted_content:
        return "❌ Please upload and process a PDF first."
    try:
+         preview = extracted_content[:4000]
        messages = [
+             {"role": "user", "content": [{"type": "text", "text":
+                 "Please provide a comprehensive summary of this document content. The content includes both textual "
+                 f"information and descriptions of visual elements.\n\nDOCUMENT CONTENT:\n{preview}\n\nSUMMARY:"
+             }]}
        ]
+         resp = multimodal_client.chat_completion(
+             messages=messages, max_tokens=250, temperature=0.3
        )
+         return resp["choices"][0]["message"]["content"].strip()
    except Exception as e:
+         return f"❌ Error generating summary: {e}"

def extract_multimodal_keywords():
    if not extracted_content:
        return "❌ Please upload and process a PDF first."
    try:
+         preview = extracted_content[:3000]
        messages = [
+             {"role": "user", "content": [{"type": "text", "text":
+                 "Analyze the following document content and extract 12-15 key terms, concepts, and important phrases. "
+                 f"DOCUMENT CONTENT:\n{preview}\n\nKEY TERMS:"
+             }]}
        ]
+         resp = multimodal_client.chat_completion(
+             messages=messages, max_tokens=120, temperature=0.3
        )
+         return resp["choices"][0]["message"]["content"].strip()
    except Exception as e:
+         return f"❌ Error extracting keywords: {e}"

def clear_multimodal_interface():
    global index, retriever, current_pdf_name, extracted_content, extracted_images
+     for f in os.listdir(figures_dir):
+         try: os.remove(os.path.join(figures_dir, f))
+         except: pass
    index = retriever = None
    current_pdf_name = extracted_content = None
    extracted_images.clear()
    return None, "", gr.update(interactive=False)

# ── Gradio UI ────────────────────────────────────────────────────────────────
@@ -345,37 +202,12 @@ with gr.Blocks(theme=theme, css="""
.container { border-radius: 10px; padding: 15px; }
.pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
.footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
+ .main-title { text-align: center; font-size: 64px; font-weight: bold; margin-bottom: 20px; }
+ .multimodal-badge { background: linear-gradient(45deg, #6366f1, #8b5cf6); color: white; padding: 5px 15px; border-radius: 20px; font-size: 14px; display: inline-block; margin: 10px auto; }
+ .model-info { background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 10px; margin: 10px 0; font-size: 12px; color: #64748b; }
""") as demo:
gr.Markdown("<div class='main-title'>Unified MultiModal RAG</div>")
+ gr.Markdown("<div style='text-align:center;'><span class='multimodal-badge'>🧠 Single Model • Text + Vision</span></div>")
gr.Markdown("""
<div class='model-info'>
<strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision + CLIP Embeddings + PyMuPDF (HF Spaces Compatible)
@@ -389,19 +221,12 @@ with gr.Blocks(theme=theme, css="""
pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
upload_button = gr.Button("🚀 Process with Multimodal AI", variant="primary")
status_box = gr.Textbox(label="Processing Status", interactive=False)
with gr.Column():
    gr.Markdown("## ❓ Ask Questions")
+     question_input = gr.Textbox(lines=3, placeholder="Ask about text or visual content...", interactive=False)
    ask_button = gr.Button("🔍 Ask Multimodal AI", variant="primary")
    answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)

with gr.Row():
    with gr.Column():
        summary_button = gr.Button("📋 Generate Summary", variant="secondary")
@@ -410,34 +235,18 @@ with gr.Blocks(theme=theme, css="""
keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)

clear_button = gr.Button("🗑️ Clear All", variant="secondary")
gr.Markdown("""
<div class='footer'>
+ <strong>Unified Multimodal Pipeline:</strong> One model handles text, images, charts, tables, diagrams, and mixed content queries
</div>
""")

+ upload_button.click(process_pdf_multimodal, [pdf_file], [pdf_display, status_box, question_input])
+ ask_button.click(ask_multimodal_question, [pdf_display, question_input], answer_output)
summary_button.click(generate_multimodal_summary, [], summary_output)
keywords_button.click(extract_multimodal_keywords, [], keywords_output)
+ clear_button.click(clear_multimodal_interface, [], [pdf_file, pdf_display, question_input])

if __name__ == "__main__":
+     demo.launch(debug=True)
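Two minimal sketches of the APIs this commit moves to. Both are illustrative, not part of the Space: the sample chunk texts, the query, and the shortened prompt below are placeholders.

# Sketch 1: the retrieval path after this commit, assuming the
# langchain-huggingface, langchain-community, faiss-cpu, and
# sentence-transformers packages are installed.
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")
index = FAISS.from_texts(
    ["[PAGE 1]\nRevenue grew 12% year over year.",          # placeholder page text
     "[IMAGE CONTENT]: a bar chart comparing quarterly revenue"],  # placeholder image description
    embeddings,
)
retriever = index.as_retriever(search_kwargs={"k": 3})

# retriever.invoke(...) is the Runnable-style call that replaces the
# deprecated retriever.get_relevant_documents(...) used before this commit.
docs = retriever.invoke("What does the chart show?")
print([d.page_content for d in docs])

# Sketch 2: the chat call shape used by the summary and keyword helpers.
# InferenceClient.chat_completion takes OpenAI-style messages; its response
# object also supports dict-style indexing, which is what the app's
# resp["choices"][0]["message"]["content"] access relies on.
# (Serverless Inference API; an HF token may be required.)
from huggingface_hub import InferenceClient

client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
resp = client.chat_completion(
    messages=[{"role": "user", "content": [{"type": "text", "text": "Summarize: ..."}]}],
    max_tokens=250,
    temperature=0.3,
)
print(resp["choices"][0]["message"]["content"].strip())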