Muzammil6376 committed on
Commit
5884a33
·
verified ·
1 Parent(s): 1a049e9

Update app.py

Files changed (1)
  1. app.py +630 -148
app.py CHANGED
@@ -1,177 +1,659 @@
- # app.py
  import os
  import tempfile
  import base64
- from pathlib import Path
  import io

- import gradio as gr
- from huggingface_hub import InferenceClient
- from langchain_community.vectorstores import FAISS
- from langchain_huggingface import HuggingFaceEmbeddings
- from langchain.text_splitter import RecursiveCharacterTextSplitter

- # ── Globals ───────────────────────────────────────────────────────────────
- index = None
- retriever = None
- extracted_content = None

- # ── Inference & Embeddings ─────────────────────────────────────────────────
- multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

- # Temporary dirs for image extraction
- TMP_DIR = tempfile.mkdtemp()
- FIGURES_DIR = os.path.join(TMP_DIR, "figures")
- os.makedirs(FIGURES_DIR, exist_ok=True)

- # ── Helpers ─────────────────────────────────────────────────────────────────
- def encode_image_to_base64(image_path):
-     with open(image_path, "rb") as f:
-         return base64.b64encode(f.read()).decode()

- def extract_images_from_pdf(pdf_path):
-     from fitz import open as fitz_open
-     from PIL import Image
-     import fitz

-     extracted = []
-     descriptions = []
  try:
-         doc = fitz_open(pdf_path)
-         for p in range(len(doc)):
-             page = doc.load_page(p)
-             for img in page.get_images():
-                 xref = img[0]
-                 pix = fitz.Pixmap(doc, xref)
-                 if pix.n - pix.alpha < 4:
-                     png = pix.tobytes("png")
-                     img_pil = Image.open(io.BytesIO(png))
-                     fname = f"page_{p}_img_{xref}.png"
-                     path = os.path.join(FIGURES_DIR, fname)
-                     img_pil.save(path)
-                     desc = analyze_image(path)
-                     extracted.append(path)
-                     descriptions.append(desc)
-                 pix = None
-         doc.close()
  except Exception as e:
-         print(f"Image extraction error: {e}")
-     return extracted, descriptions

- def analyze_image(image_path):
  try:
-         b64 = encode_image_to_base64(image_path)
-         prompt = (
-             "Analyze this image and provide a detailed description. "
-             "Include any text, charts, tables, or important visual elements.\n"
-             "Image: [data]\nDescription:"
-         )
-         raw = multimodal_client.text_generation(
-             prompt=prompt, max_new_tokens=200, temperature=0.3
-         )
-         # Handle dict or list wrapping
-         if isinstance(raw, dict):
-             out = raw.get("generated_text", str(raw))
-         elif isinstance(raw, list) and raw and isinstance(raw[0], dict):
-             out = raw[0].get("generated_text", str(raw))
  else:
-             out = str(raw)
-         return f"[IMAGE]: {out.strip()}"
  except Exception as e:
-         return f"[IMAGE ERROR]: {e}"

- def process_pdf(pdf_file):
-     global index, retriever, extracted_content
-     if not pdf_file:
-         return None, "❌ Upload a PDF.", gr.update(interactive=False)

-     # clear old images
-     for f in os.listdir(FIGURES_DIR):
-         os.remove(os.path.join(FIGURES_DIR, f))

-     path = pdf_file.name if isinstance(pdf_file, Path) else pdf_file
  try:
-         import fitz
-         doc = fitz.open(path)
-         pages = []
-         for i in range(len(doc)):
-             txt = doc.load_page(i).get_text().strip()
-             if txt:
-                 pages.append(f"[Page {i+1}]\n" + txt)
-         doc.close()
-
-         imgs, descs = extract_images_from_pdf(path)
-         all_content = pages + descs
-         extracted_content = "\n\n".join(all_content)
-         if not extracted_content:
-             return pdf_file.name, "❌ No content extracted.", gr.update(interactive=False)
-
-         splitter = RecursiveCharacterTextSplitter(
-             chunk_size=1000, chunk_overlap=200, add_start_index=True
-         )
-         chunks = splitter.split_text(extracted_content)
-         index = FAISS.from_texts(chunks, embeddings)
-         retriever = index.as_retriever(search_kwargs={"k": 3})

-         msg = f"✅ Processed {pdf_file.name} — {len(chunks)} chunks."
-         return pdf_file.name, msg, gr.update(interactive=True)

-     except Exception as e:
-         return pdf_file.name if pdf_file else None, f"❌ PDF error: {e}", gr.update(interactive=False)


- def ask_question(doc_name, question):
-     global retriever
-     if not retriever:
-         return "❌ Process a PDF first."
-     if not question.strip():
-         return "❌ Enter a question."

-     # retrieve
-     try:
-         docs = retriever.invoke(question)
-     except Exception:
-         docs = retriever.get_relevant_documents(question)
-
-     context = "\n\n".join(d.page_content for d in docs)
-     prompt = (
-         "You are an AI assistant with both text and visual context.\n"
-         f"CONTEXT:\n{context}\nQUESTION: {question}\nAnswer:"
-     )
-     try:
-         raw = multimodal_client.text_generation(
-             prompt=prompt, max_new_tokens=300, temperature=0.5
-         )
-         if isinstance(raw, dict): out = raw.get("generated_text", str(raw))
-         elif isinstance(raw, list) and raw and isinstance(raw[0], dict): out = raw[0].get("generated_text", str(raw))
-         else: out = str(raw)
-         return out.strip()
-     except Exception as e:
-         return f"❌ Generation error: {e}"
-
- # ── Gradio UI ───────────────────────────────────────────────────────────────
- theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
- with gr.Blocks(theme=theme) as demo:
-     gr.Markdown("## 🧠 Unified MultiModal RAG")
-     with gr.Row():
-         with gr.Column():
-             pdf_in = gr.File(label="Upload PDF", file_types=[".pdf"], type="file")
-             proc_btn = gr.Button("🔄 Process PDF", variant="primary")
-             pdf_disp = gr.Textbox(label="Active Doc", interactive=False)
-             status = gr.Textbox(label="Status", interactive=False)
-         with gr.Column():
-             q_in = gr.Textbox(label="Ask your question…", lines=3, interactive=False)
-             ask_btn = gr.Button("🔍 Ask", variant="primary", interactive=False)
-             ans_out = gr.Textbox(label="Answer", lines=6, interactive=False)
-
-     proc_btn.click(process_pdf, [pdf_in], [pdf_disp, status, q_in])
-     # enable ask button only after processing
-     proc_btn.click(lambda *_: gr.update(interactive=True), [], [], [ask_btn])
-     ask_btn.click(ask_question, [pdf_disp, q_in], ans_out)
-
- if __name__ == "__main__":
-     demo.launch(debug=True)

+
  import os
+ import gradio as gr
  import tempfile
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.vectorstores import InMemoryVectorStore
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from unstructured.partition.pdf import partition_pdf
+ from unstructured.partition.utils.constants import PartitionStrategy
+ from huggingface_hub import InferenceClient
  import base64
+ from PIL import Image
  import io
+ import requests
+ from getpass import getpass
+ import PyPDF2
+ import fitz  # PyMuPDF
+ import pytesseract

+ # Step 2: Set up Hugging Face Token
+ print("🔑 Setting up Hugging Face Token...")
+ print("Please enter your Hugging Face token (get it from: https://huggingface.co/settings/tokens)")
+ HF_TOKEN = getpass("Hugging Face Token: ")
+
+ # Set environment variable
+ os.environ["HUGGINGFACE_HUB_TOKEN"] = HF_TOKEN

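+ # Note: huggingface_hub reads the HF_TOKEN (and legacy HUGGING_FACE_HUB_TOKEN)
+ # environment variable; the HUGGINGFACE_HUB_TOKEN name set above may be ignored
+ # by some versions. The token is also passed explicitly to each InferenceClient
+ # below, so authentication does not depend on this variable.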
+ # Step 3: Initialize Hugging Face components
+ print("🚀 Initializing models...")

+ # Initialize embeddings model (runs locally for better performance)
+ embeddings = HuggingFaceEmbeddings(
+     model_name="sentence-transformers/all-MiniLM-L6-v2",
+     model_kwargs={'device': 'cpu'}
+ )

+ # Initialize vector store
+ vector_store = InMemoryVectorStore(embeddings)

+ # Initialize Hugging Face Inference clients with proper multimodal support
+ def initialize_multimodal_clients():
+     """Initialize clients with proper multimodal capabilities"""
+
+     # Vision-Language Models (can understand images AND text together)
+     multimodal_models = [
+         "microsoft/git-large-coco",               # Best for image+text understanding
+         "Salesforce/blip2-opt-2.7b",              # Strong multimodal model
+         "microsoft/git-base-coco",                # Lighter alternative
+         "Salesforce/blip-image-captioning-large"  # Good image understanding
+     ]
+
+     # Text-only models for when no images are involved
+     text_models = [
+         "google/flan-t5-base",               # Excellent for Q&A
+         "microsoft/DialoGPT-medium",         # Conversational
+         "facebook/blenderbot-400M-distill",  # Another option
+     ]
+
+     vision_client = None
+     text_client = None
+
+     # Try to initialize multimodal/vision client
+     for model_name in multimodal_models:
+         try:
+             vision_client = InferenceClient(model=model_name, token=HF_TOKEN)
+             print(f"✅ Multimodal client initialized: {model_name}")
+             break
+         except Exception as e:
+             print(f"⚠️ Failed to initialize {model_name}: {e}")
+             continue
+
+     # Try to initialize text client
+     for model_name in text_models:
+         try:
+             text_client = InferenceClient(model=model_name, token=HF_TOKEN)
+             print(f"✅ Text client initialized: {model_name}")
+             break
+         except Exception as e:
+             print(f"⚠️ Failed to initialize {model_name}: {e}")
+             continue
+
+     return vision_client, text_client

+ vision_client, text_client = initialize_multimodal_clients()

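+ # Note: constructing an InferenceClient does not call the API, so the
+ # try/except above mainly guards against configuration errors; a bad model id
+ # would only surface on the first inference request. A smoke test could be,
+ # e.g., vision_client.image_to_text(open("sample.png", "rb").read()), where
+ # "sample.png" is a hypothetical local file used purely for illustration.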
+ template = """
+ You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

+ Question: {question}
+ Context: {context}
+ Answer:
+ """
+
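+ # Note: `template` (like the ChatPromptTemplate import) is currently unused;
+ # answer_question_hf builds its prompts inline. It is presumably kept as the
+ # reference wording for the Q&A prompt.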
+ def extract_text_with_multiple_methods(pdf_path):
+     """Try multiple methods to extract text from PDF"""
+     extracted_text = ""
+     methods_tried = []
+
+     # Method 1: PyPDF2
+     try:
+         print("🔍 Trying PyPDF2...")
+         with open(pdf_path, 'rb') as file:
+             pdf_reader = PyPDF2.PdfReader(file)
+             text_parts = []
+             for page_num, page in enumerate(pdf_reader.pages):
+                 page_text = page.extract_text()
+                 if page_text.strip():
+                     text_parts.append(f"Page {page_num + 1}:\n{page_text}")
+
+             if text_parts:
+                 extracted_text = "\n\n".join(text_parts)
+                 methods_tried.append("PyPDF2")
+                 print(f"✅ PyPDF2 extracted {len(extracted_text)} characters")
+     except Exception as e:
+         print(f"⚠️ PyPDF2 failed: {e}")
+
+     # Method 2: PyMuPDF (fitz) - often better for complex PDFs
+     if not extracted_text.strip():
+         try:
+             print("🔍 Trying PyMuPDF...")
+             doc = fitz.open(pdf_path)
+             text_parts = []
+             for page_num in range(len(doc)):
+                 page = doc.load_page(page_num)
+                 page_text = page.get_text()
+                 if page_text.strip():
+                     text_parts.append(f"Page {page_num + 1}:\n{page_text}")
+
+             if text_parts:
+                 extracted_text = "\n\n".join(text_parts)
+                 methods_tried.append("PyMuPDF")
+                 print(f"✅ PyMuPDF extracted {len(extracted_text)} characters")
+             doc.close()
+         except Exception as e:
+             print(f"⚠️ PyMuPDF failed: {e}")
+
+     # Method 3: OCR with PyMuPDF for image-based PDFs
+     if not extracted_text.strip():
+         try:
+             print("🔍 Trying OCR with PyMuPDF...")
+             doc = fitz.open(pdf_path)
+             text_parts = []
+             for page_num in range(min(len(doc), 5)):  # Limit to first 5 pages for OCR
+                 page = doc.load_page(page_num)
+                 # Convert page to image
+                 pix = page.get_pixmap()
+                 img_data = pix.tobytes("png")
+                 img = Image.open(io.BytesIO(img_data))
+
+                 # Apply OCR
+                 ocr_text = pytesseract.image_to_string(img)
+                 if ocr_text.strip():
+                     text_parts.append(f"Page {page_num + 1} (OCR):\n{ocr_text}")
+
+             if text_parts:
+                 extracted_text = "\n\n".join(text_parts)
+                 methods_tried.append("OCR")
+                 print(f"✅ OCR extracted {len(extracted_text)} characters")
+             doc.close()
+         except Exception as e:
+             print(f"⚠️ OCR failed: {e}")
+
+     return extracted_text, methods_tried
+
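+ # Usage sketch (illustrative only): for a born-digital PDF the first parser
+ # usually succeeds, e.g.
+ #     text, methods = extract_text_with_multiple_methods("report.pdf")
+ #     # -> text starts with "Page 1:", methods == ["PyPDF2"]
+ # "report.pdf" is a hypothetical path. OCR runs only when both parsers return
+ # no text, and is capped at the first 5 pages to bound latency.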
+ def upload_and_process_pdf(pdf_file):
+     """Process uploaded PDF file with enhanced error handling"""
+     if pdf_file is None:
+         return "Please upload a PDF file first."
+
  try:
+         # Create temporary directories
+         with tempfile.TemporaryDirectory() as temp_dir:
+             figures_dir = os.path.join(temp_dir, "figures")
+             os.makedirs(figures_dir, exist_ok=True)
+
+             # Save uploaded file temporarily
+             temp_pdf_path = os.path.join(temp_dir, "uploaded.pdf")
+             with open(temp_pdf_path, "wb") as f:
+                 f.write(pdf_file)
+
+             # Check file size and validity
+             file_size = os.path.getsize(temp_pdf_path)
+             print(f"📄 Processing PDF: {file_size} bytes")
+
+             if file_size == 0:
+                 return "❌ The uploaded file is empty. Please check your PDF file."
+
+             if file_size > 50 * 1024 * 1024:  # 50MB limit
+                 return "❌ File too large (>50MB). Please upload a smaller PDF."
+
+             # Try multiple extraction methods
+             text, methods = extract_text_with_multiple_methods(temp_pdf_path)
+
+             # Process with unstructured as backup/additional method
+             unstructured_text = ""
+             try:
+                 print("🔍 Trying unstructured...")
+                 elements = partition_pdf(
+                     temp_pdf_path,
+                     strategy=PartitionStrategy.FAST,
+                     extract_image_block_types=["Image", "Table"],
+                     extract_image_block_output_dir=figures_dir,
+                     infer_table_structure=True
+                 )
+
+                 # Extract text elements
+                 text_elements = []
+                 for element in elements:
+                     if hasattr(element, 'text') and element.text and element.category not in ["Image", "Table"]:
+                         text_elements.append(element.text)
+
+                 if text_elements:
+                     unstructured_text = "\n\n".join(text_elements)
+                     print(f"✅ Unstructured extracted {len(unstructured_text)} characters")
+
+                     # Combine with existing text if available
+                     if text.strip():
+                         text = f"{text}\n\n--- Additional Content ---\n\n{unstructured_text}"
+                     else:
+                         text = unstructured_text
+                     methods.append("unstructured")
+
+             except Exception as unstructured_error:
+                 print(f"⚠️ Unstructured processing failed: {unstructured_error}")
+
+             # Process images
+             image_text = ""
+             image_count = 0
+             if os.path.exists(figures_dir):
+                 for file in os.listdir(figures_dir):
+                     if file.lower().endswith(('.png', '.jpg', '.jpeg')):
+                         try:
+                             extracted_image_text = extract_text_from_image(os.path.join(figures_dir, file))
+                             image_text += f"\n\n{extracted_image_text}"
+                             image_count += 1
+                         except Exception as e:
+                             print(f"⚠️ Error processing image {file}: {e}")
+
+             # Also try to extract images directly from PDF using PyMuPDF
+             try:
+                 doc = fitz.open(temp_pdf_path)
+                 for page_num in range(min(len(doc), 10)):  # Process first 10 pages
+                     page = doc.load_page(page_num)
+                     image_list = page.get_images(full=True)
+
+                     for img_index, img in enumerate(image_list[:3]):  # Max 3 images per page
+                         try:
+                             xref = img[0]
+                             pix = fitz.Pixmap(doc, xref)
+                             if pix.n - pix.alpha < 4:  # GRAY or RGB
+                                 img_data = pix.tobytes("png")
+                                 img_path = os.path.join(figures_dir, f"page_{page_num}_img_{img_index}.png")
+                                 with open(img_path, "wb") as img_file:
+                                     img_file.write(img_data)
+
+                                 extracted_image_text = extract_text_from_image(img_path)
+                                 image_text += f"\n\n{extracted_image_text}"
+                                 image_count += 1
+                             pix = None
+                         except Exception as img_error:
+                             print(f"⚠️ Error extracting image: {img_error}")
+                             continue
+                 doc.close()
+             except Exception as e:
+                 print(f"⚠️ Error extracting images from PDF: {e}")
+
+             # Combine all text
+             full_text = text
+             if image_text.strip():
+                 full_text += f"\n\n--- Image Content ---\n{image_text}"
+
+             if not full_text.strip():
+                 return ("⚠️ No text could be extracted from the PDF using any method. "
+                         "This might be a scanned PDF without OCR text, or the file might be corrupted. "
+                         f"Methods tried: {', '.join(['PyPDF2', 'PyMuPDF', 'OCR', 'unstructured']) if not methods else ', '.join(methods)}")
+
+             # Split and index the text
+             chunked_texts = split_text(full_text)
+
+             if not chunked_texts:
+                 return "⚠️ Text was extracted but could not be split into chunks."
+
+             # Clear existing vector store and add new documents
+             global vector_store
+             vector_store = InMemoryVectorStore(embeddings)
+             index_docs(chunked_texts)
+
+             success_msg = (f"✅ PDF processed successfully!\n"
+                            f"📊 Statistics:\n"
+                            f"- Text chunks: {len(chunked_texts)}\n"
+                            f"- Images processed: {image_count}\n"
+                            f"- Methods used: {', '.join(methods)}\n"
+                            f"- Total characters: {len(full_text)}")
+
+             return success_msg
+
  except Exception as e:
+         return f"❌ Error processing PDF: {str(e)}\n\nTroubleshooting tips:\n- Ensure the PDF is not password protected\n- Try a different PDF file\n- Check if the file is corrupted"

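+ # Note: figures_dir lives inside the TemporaryDirectory above, so extracted
+ # image files are deleted when the `with` block exits; only the text and image
+ # descriptions already merged into full_text survive in the vector store.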
+ def load_pdf(file_path, figures_directory):
+     """Legacy function - now handled by upload_and_process_pdf"""
+     return extract_text_with_multiple_methods(file_path)[0]

+ def extract_text_from_image(image_path):
+     """Extract text description from image using Hugging Face Vision model"""
  try:
+         # First try OCR for any text in the image
+         ocr_text = ""
+         try:
+             img = Image.open(image_path)
+             ocr_text = pytesseract.image_to_string(img)
+             if ocr_text.strip():
+                 ocr_text = f"Text in image: {ocr_text.strip()}"
+         except Exception as ocr_error:
+             print(f"⚠️ OCR failed for image: {ocr_error}")
+
+         # Then use vision model for description
+         vision_description = ""
+         if vision_client:
+             try:
+                 with open(image_path, "rb") as img_file:
+                     image_data = img_file.read()
+
+                 response = vision_client.image_to_text(image_data)
+
+                 if isinstance(response, list) and len(response) > 0:
+                     vision_description = response[0].get('generated_text', '')
+                 elif isinstance(response, dict):
+                     vision_description = response.get('generated_text', '')
+                 else:
+                     vision_description = str(response)
+
+             except Exception as vision_error:
+                 print(f"⚠️ Vision model failed: {vision_error}")
+
+         # Combine OCR and vision results
+         combined_result = []
+         if ocr_text:
+             combined_result.append(ocr_text)
+         if vision_description:
+             combined_result.append(f"Image description: {vision_description}")
+
+         if combined_result:
+             return "\n".join(combined_result)
  else:
+             return "Image content: Visual element present but could not be processed"
+
  except Exception as e:
+         print(f"⚠️ Error extracting text from image: {e}")
+         return "Image content: Visual element present but could not be processed"

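+ # Note: depending on the installed huggingface_hub version, image_to_text may
+ # return a plain string or an ImageToTextOutput object rather than a dict or
+ # list; for the object form, response.generated_text would be the cleaner
+ # accessor than the str(response) fallback above.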
+ def split_text(text):
+     """Split text into chunks"""
+     if not text or not text.strip():
+         return []
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=200,
+         add_start_index=True
+     )
+     return text_splitter.split_text(text)

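+ # Worked example (parameters as above): with chunk_size=1000 and
+ # chunk_overlap=200 each window advances roughly 800 characters, so a
+ # 2,600-character document yields about three overlapping chunks, split
+ # preferentially at paragraph and sentence boundaries.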
+ def index_docs(texts):
+     """Index documents in vector store"""
+     if texts:
+         vector_store.add_texts(texts)
+         print(f"📚 Indexed {len(texts)} text chunks")

+ def retrieve_docs(query, k=4):
+     """Retrieve relevant documents"""
+     try:
+         return vector_store.similarity_search(query, k=k)
+     except Exception as e:
+         print(f"⚠️ Error retrieving documents: {e}")
+         return []

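+ # Usage sketch (illustrative): indexing then querying the in-memory store,
+ # which ranks chunks by cosine similarity of their embeddings:
+ #     index_docs(["Revenue grew 12% in 2023.", "Headcount stayed flat."])
+ #     docs = retrieve_docs("How did revenue change?", k=1)
+ #     docs[0].page_content  # -> "Revenue grew 12% in 2023."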
+ def answer_question_hf(question):
+     """Answer question using Hugging Face multimodal models"""
  try:
+         # Retrieve relevant documents
+         related_documents = retrieve_docs(question)
+
+         if not related_documents:
+             return "❓ No relevant documents found. Please upload and process a PDF first."
+
+         # Prepare context
+         context = "\n\n".join([doc.page_content for doc in related_documents])
+
+         # Limit context length for better performance
+         if len(context) > 1500:
+             context = context[:1500] + "..."
+
+         # Check if we have image content in the context
+         has_image_content = "Image content:" in context or "Image description:" in context
+
+         if has_image_content and vision_client:
+             # Use multimodal approach for questions involving images
+             try:
+                 # For multimodal models, we can send both text and image context
+                 multimodal_prompt = f"""
+ Based on the document content below (including text and image descriptions), answer this question: {question}
+
+ Document content:
+ {context}
+
+ Please provide a clear, concise answer in 2-3 sentences.
+ """
+
+                 response = vision_client.text_generation(
+                     multimodal_prompt,
+                     max_new_tokens=150,
+                     temperature=0.7,
+                     do_sample=True,
+                     return_full_text=False,
+                     stop=["Question:", "Document content:", "\n\n\n"]
+                 )
+
+                 if isinstance(response, dict):
+                     answer = response.get('generated_text', '')
+                 elif isinstance(response, str):
+                     answer = response
+                 else:
+                     answer = str(response)
+
+                 if answer.strip():
+                     return f"🖼️ {answer.strip()}"
+
+             except Exception as multimodal_error:
+                 print(f"⚠️ Multimodal model failed: {multimodal_error}")
+
+         # Fall back to text-only approach
+         if text_client:
+             try:
+                 text_prompt = f"""
+ Question: {question}
+
+ Based on the following information from the document, provide a clear and concise answer:
+
+ {context}
+
+ Answer:"""
+
+                 response = text_client.text_generation(
+                     text_prompt,
+                     max_new_tokens=150,
+                     temperature=0.7,
+                     do_sample=True,
+                     return_full_text=False,
+                     stop=["Question:", "Answer:", "\n\n\n"]
+                 )
+
+                 if isinstance(response, dict):
+                     answer = response.get('generated_text', '')
+                 elif isinstance(response, str):
+                     answer = response
+                 else:
+                     answer = str(response)
+
+                 # Clean up the answer
+                 answer = answer.strip()
+                 if answer:
+                     return f"📄 {answer}"
+
+             except Exception as text_error:
+                 print(f"⚠️ Text model failed: {text_error}")
+
+         # Last resort: Return extracted context
+         if context:
+             return f"📋 Based on the document, here's the relevant information:\n\n{context[:500]}{'...' if len(context) > 500 else ''}"
+         else:
+             return "❌ Unable to find relevant information in the document."
+
+     except Exception as e:
+         return f"❌ Error generating answer: {str(e)}"

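+ # Note: the `stop` keyword used in text_generation above requires a recent
+ # huggingface_hub release; older versions call the same parameter
+ # stop_sequences. Both branches send text-only prompts, so image content
+ # reaches the model only as the OCR text and captions embedded in `context`.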
+ def create_colab_interface():
+     """Create Gradio interface optimized for Colab"""
+
+     with gr.Blocks(
+         title="Enhanced Multimodal RAG with Hugging Face",
+         theme=gr.themes.Soft(),
+         css="""
+         .gradio-container {
+             max-width: 1200px !important;
+         }
+         """
+     ) as iface:
+
+         gr.HTML("""
+         <div style="text-align: center; padding: 20px;">
+             <h1>📚 Enhanced Multimodal RAG with Hugging Face</h1>
+             <p>Upload a PDF document and ask questions about its content, including images and tables!</p>
+             <p><em>Now with improved PDF processing and multiple extraction methods</em></p>
+         </div>
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 # PDF Upload Section
+                 gr.Markdown("### 📤 Upload Document")
+                 pdf_input = gr.File(
+                     label="Upload PDF Document",
+                     file_types=[".pdf"],
+                     type="binary",
+                     height=100
+                 )
+
+                 upload_btn = gr.Button("🔄 Process PDF", variant="primary", size="lg")
+                 upload_status = gr.Textbox(
+                     label="Processing Status",
+                     interactive=False,
+                     lines=6,
+                     placeholder="Upload a PDF and click 'Process PDF' to begin..."
+                 )
+
+             with gr.Column(scale=2):
+                 # Chat Interface
+                 gr.Markdown("### 💬 Chat Interface")
+                 chatbot = gr.Chatbot(
+                     label="Chat with your document",
+                     height=400,
+                     show_label=False
+                 )
+
+                 with gr.Row():
+                     question_input = gr.Textbox(
+                         label="Ask a question",
+                         placeholder="What is this document about?",
+                         lines=1,
+                         scale=4
+                     )
+                     ask_btn = gr.Button("Ask", variant="secondary", scale=1)
+
+                 # Example questions
+                 gr.Markdown("### 💡 Example Questions")
+                 example_questions = [
+                     "What is the main topic of this document?",
+                     "Can you summarize the key points?",
+                     "What information is shown in the images or tables?",
+                     "What are the conclusions or recommendations?"
+                 ]
+
+                 with gr.Row():
+                     for i, eq in enumerate(example_questions):
+                         example_btn = gr.Button(eq, size="sm")
+                         example_btn.click(
+                             lambda x=eq: x,
+                             outputs=[question_input]
+                         )
+
+         # Event handlers
+         def process_pdf_and_update(pdf_file):
+             if pdf_file is None:
+                 return "Please select a PDF file first."
+             return upload_and_process_pdf(pdf_file)
+
+         def ask_and_update_chat(question, chat_history):
+             if not question.strip():
+                 return chat_history, ""
+
+             # Get answer
+             answer = answer_question_hf(question)
+
+             # Update chat history
+             if chat_history is None:
+                 chat_history = []
+
+             chat_history.append([question, answer])
+
+             return chat_history, ""
+
+         def clear_chat():
+             return []
+
+         # Connect events
+         upload_btn.click(
+             fn=process_pdf_and_update,
+             inputs=[pdf_input],
+             outputs=[upload_status]
+         )
+
+         ask_btn.click(
+             fn=ask_and_update_chat,
+             inputs=[question_input, chatbot],
+             outputs=[chatbot, question_input]
+         )
+
+         question_input.submit(
+             fn=ask_and_update_chat,
+             inputs=[question_input, chatbot],
+             outputs=[chatbot, question_input]
+         )
+
+         # Clear chat button
+         clear_btn = gr.Button("🗑️ Clear Chat", variant="stop", size="sm")
+         clear_btn.click(
+             fn=clear_chat,
+             outputs=[chatbot]
+         )
+
+         # Enhanced Instructions
+         gr.Markdown("""
+         ---
+         ### 📋 Instructions:
+         1. **Get HF Token**: Visit [Hugging Face Settings](https://huggingface.co/settings/tokens) to get your token
+         2. **Upload PDF**: Click "Choose File" and select your PDF document
+         3. **Process Document**: Click "Process PDF" and wait for confirmation
+         4. **Ask Questions**: Type questions or use example prompts
+
+         ### ✨ Enhanced Features:
+         - 📄 **Multiple Text Extraction Methods**: PyPDF2, PyMuPDF, OCR, and Unstructured
+         - 🖼️ **Advanced Image Processing**: Direct PDF image extraction + vision models
+         - 🔍 **Robust PDF Handling**: Works with scanned PDFs, complex layouts, and image-heavy documents
+         - 💬 **Interactive Chat**: Conversation history with multimodal understanding
+         - ⚡ **Error Recovery**: Graceful fallbacks when one extraction method fails
+         - 📊 **Processing Statistics**: Detailed feedback on what was extracted
+
+         ### 🔧 Models Used:
+         - **🎭 Multimodal**: Microsoft GIT-Large (understands images + text together)
+         - **📝 Text Generation**: Google FLAN-T5-Base (optimized for Q&A)
+         - **👁️ Vision**: Salesforce BLIP (image captioning and understanding)
+         - **🔍 Embeddings**: Sentence Transformers all-MiniLM-L6-v2
+         - **📖 OCR**: Tesseract for text recognition in images
+
+         ### 🎯 Multimodal Capabilities:
+         - **Text + Images**: Can answer questions about both text content and visual elements
+         - **Image Understanding**: Describes charts, diagrams, photos in your PDFs
+         - **OCR Integration**: Extracts text from images within PDFs
+         - **Context Awareness**: Combines text and visual information for comprehensive answers
+         - **Fallback Strategy**: Uses multiple methods to ensure successful text extraction
+
+         ### 🛠️ Troubleshooting:
+         - **No text extracted**: Try different PDF files, ensure not password-protected
+         - **Large files**: Keep PDFs under 50MB for optimal performance
+         - **Scanned PDFs**: OCR will automatically process image-based text
+         - **Complex layouts**: Multiple extraction methods handle various PDF formats
+         """)
+
+     return iface

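+ # Note: the chat history uses Gradio's classic pair format
+ # ([[user, assistant], ...]); newer Gradio releases prefer
+ # gr.Chatbot(type="messages") and may warn that the pair format is deprecated.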
+ # Step 4: Launch the application
+ print("✅ Setup complete! Launching Enhanced Gradio interface...")

+ # Create and launch interface
+ iface = create_colab_interface()

+ # Launch with public link for Colab
+ iface.launch(
+     debug=True,
+     share=True,  # Creates public link
+     server_name="0.0.0.0",
+     server_port=7860,
+     show_error=True
+ )

+ print("🎉 Enhanced Application launched successfully!")
+ print("📱 Use the public link above to access your app from anywhere!")
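+ # Note: share=True targets Colab or local runs; on a hosted platform such as
+ # Hugging Face Spaces the flag is typically ignored and the app is served
+ # directly on server_port 7860.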