AiDeveloper1 committed
Commit 3e0fb99 · verified · 1 Parent(s): 962a299

Update app.py

Files changed (1)
  1. app.py +756 -756
app.py CHANGED
@@ -1,756 +1,756 @@
(The removed and added versions are identical except for the final line; the effective change is:)
-     demo.launch(share=True, debug=True)
+     demo.launch()

Updated app.py in full:
import gradio as gr
import PyPDF2
import chromadb
from openai import OpenAI
import numpy as np
from typing import List, Dict, Tuple
import json
import io
import os
from datetime import datetime
import pandas as pd


class RAGPipeline:
    def __init__(self):
        # Initialize local ChromaDB client using the new configuration
        try:
            self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
        except Exception as e:
            print(f"ChromaDB initialization error: {e}")
            self.chroma_client = None

        # OpenAI client (will be set through the UI)
        self.openai_client = None
        self.openai_api_key = None

        # Collection for storing document chunks
        self.collection = None

        # Store document metadata and full text
        self.document_metadata = {}
        self.full_extracted_text = ""  # Store full text here

    def set_openai_key(self, openai_key: str):
        """Set OpenAI API key and create client"""
        self.openai_api_key = openai_key

        if openai_key:
            self.openai_client = OpenAI(api_key=openai_key)

    def get_openai_embedding(self, text: str) -> List[float]:
        """Generate embeddings using OpenAI's text-embedding-ada-002 model"""
        if not self.openai_client:
            raise ValueError("OpenAI client not initialized")

        try:
            response = self.openai_client.embeddings.create(
                model="text-embedding-ada-002",
                input=text
            )
            return response.data[0].embedding
        except Exception as e:
            raise RuntimeError(f"OpenAI embedding generation failed: {str(e)}")
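    # Illustrative call (a sketch, assuming the key has been set):
    # text-embedding-ada-002 returns 1536-dimensional vectors, so
    #   vec = rag_pipeline.get_openai_embedding("Revenue grew 12% year over year.")
    #   len(vec)  # -> 1536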
    def extract_text_from_pdf(self, pdf_file) -> Tuple[str, Dict]:
        """Extract text from uploaded PDF file"""
        try:
            # Handle different file types from Gradio
            if hasattr(pdf_file, 'name'):
                # If it's a file path, read the file
                with open(pdf_file.name, 'rb') as file:
                    pdf_content = file.read()
            elif isinstance(pdf_file, bytes):
                # If it's already bytes
                pdf_content = pdf_file
            else:
                # If it's a file-like object, read it
                pdf_content = pdf_file.read() if hasattr(pdf_file, 'read') else pdf_file

            # Read PDF file
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))

            text = ""
            page_count = len(pdf_reader.pages)

            # Extract text from all pages
            for page_num, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text()
                if page_text.strip():  # Only add non-empty pages
                    text += f"\n--- Page {page_num + 1} ---\n"
                    text += page_text + "\n"

            # Clean up the text
            text = text.strip()

            # Store the full text in the pipeline object
            self.full_extracted_text = text
            print(f"DEBUG: Stored full text length: {len(self.full_extracted_text)}")

            # Create extraction metadata
            metadata = {
                "total_pages": page_count,
                "total_characters": len(text),
                "extraction_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "file_size_bytes": len(pdf_content),
                "pages_with_text": sum(1 for page in pdf_reader.pages if page.extract_text().strip()),
                "average_chars_per_page": len(text) // page_count if page_count > 0 else 0
            }

            return text, metadata

        except Exception as e:
            return f"Error extracting PDF: {str(e)}", {}
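    # The extracted string interleaves page markers with page text, e.g.
    # (contents illustrative):
    #   --- Page 1 ---
    #   Annual Report 2023 ...
    #   --- Page 2 ---
    #   Consolidated statements ...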
    def chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> Tuple[List[str], Dict]:
        """Split text into overlapping chunks"""
        if not text or len(text.strip()) == 0:
            return [], {"error": "No text provided for chunking"}

        # Clean the text first
        text = text.strip()

        chunks = []
        start = 0

        print(f"DEBUG: Starting chunking with text length: {len(text)}")
        print(f"DEBUG: Chunk size: {chunk_size}, Overlap: {overlap}")

        while start < len(text):
            end = start + chunk_size

            # If we're not at the end, try to break at a sentence or word boundary
            if end < len(text):
                # Look for sentence boundary
                last_period = text.rfind('.', start, end)
                last_newline = text.rfind('\n', start, end)
                last_space = text.rfind(' ', start, end)

                # Choose the best breaking point
                break_point = max(last_period, last_newline, last_space)
                if break_point > start:
                    end = break_point + 1

            chunk = text[start:end].strip()
            if chunk and len(chunk) > 50:  # Only add meaningful chunks
                chunks.append(chunk)
                print(f"DEBUG: Added chunk {len(chunks)}: length={len(chunk)}")

            # Move start position
            if end >= len(text):
                break
            next_start = end - overlap

            # Prevent an infinite loop: if the boundary search shortened the
            # chunk so much that stepping back by `overlap` would not advance,
            # continue from `end` instead
            if next_start <= start:
                next_start = end
            start = next_start

        print(f"DEBUG: Final chunks count: {len(chunks)}")

        # Create chunking metadata
        chunk_lengths = [len(chunk) for chunk in chunks]
        metadata = {
            "total_chunks": len(chunks),
            "chunk_size": chunk_size,
            "overlap": overlap,
            "avg_chunk_length": np.mean(chunk_lengths) if chunks else 0,
            "min_chunk_length": min(chunk_lengths) if chunks else 0,
            "max_chunk_length": max(chunk_lengths) if chunks else 0,
            "total_text_length": len(text),
            "chunking_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

        return chunks, metadata
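    # Worked example of the sliding window: with chunk_size=1000 and
    # overlap=200, consecutive windows nominally start at 0, 800, 1600, ...,
    # so each chunk repeats roughly the last 200 characters of its
    # predecessor (boundary snapping shifts the exact positions slightly).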
    def store_in_chromadb(self, chunks: List[str], document_name: str) -> Dict:
        """Store chunks in ChromaDB with OpenAI embeddings"""
        if not self.openai_client:
            return {"error": "OpenAI client not initialized for embedding generation"}

        if not self.chroma_client:
            return {"error": "ChromaDB client not initialized"}

        try:
            # Create or get collection
            collection_name = f"financial_docs_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

            try:
                self.chroma_client.delete_collection(collection_name)
            except Exception:
                pass

            self.collection = self.chroma_client.create_collection(
                name=collection_name,
                metadata={"hnsw:space": "cosine"}
            )

            # Generate embeddings for chunks using OpenAI
            embeddings = []
            embedding_metadata = {
                "model_used": "text-embedding-ada-002",
                "total_chunks_processed": len(chunks),
                "embedding_start_time": datetime.now().isoformat()
            }

            for i, chunk in enumerate(chunks):
                try:
                    embedding = self.get_openai_embedding(chunk)
                    embeddings.append(embedding)
                except Exception as e:
                    return {"error": f"Failed to generate embedding for chunk {i}: {str(e)}"}

            embedding_metadata["embedding_end_time"] = datetime.now().isoformat()
            embedding_metadata["embedding_dimension"] = len(embeddings[0]) if embeddings else 0

            # Create unique IDs for each chunk
            ids = [f"chunk_{i}" for i in range(len(chunks))]

            # Create metadata for each chunk
            metadatas = [
                {
                    "chunk_id": i,
                    "document_name": document_name,
                    "chunk_length": len(chunk),
                    "created_at": datetime.now().isoformat(),
                    "embedding_model": "text-embedding-ada-002"
                }
                for i, chunk in enumerate(chunks)
            ]

            # Store in ChromaDB
            self.collection.add(
                embeddings=embeddings,
                documents=chunks,
                metadatas=metadatas,
                ids=ids
            )

            # Create storage metadata
            storage_metadata = {
                "collection_name": collection_name,
                "total_vectors_stored": len(chunks),
                "embedding_dimension": len(embeddings[0]) if embeddings else 0,
                "embedding_model": "text-embedding-ada-002",
                "storage_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "database_status": "Successfully stored",
                "database_type": "ChromaDB Local",
                "database_path": "./chroma_db",
                "embedding_metadata": embedding_metadata
            }

            return storage_metadata

        except Exception as e:
            return {"error": f"Storage failed: {str(e)}"}
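    # Note: with "hnsw:space": "cosine", ChromaDB reports distances as
    # 1 - cosine_similarity, which is why semantic_search() below converts
    # them back with `1 - distance`.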
    def semantic_search(self, query: str, top_k: int = 5) -> Tuple[List[Dict], Dict]:
        """Perform semantic search using OpenAI embeddings and return top-k results"""
        if not self.collection:
            return [], {"error": "No collection available. Please upload and process a document first."}

        if not self.openai_client:
            return [], {"error": "OpenAI client not initialized for query embedding generation"}

        try:
            # Generate query embedding using OpenAI
            query_embedding = self.get_openai_embedding(query)

            # Search in ChromaDB
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=top_k,
                include=['documents', 'metadatas', 'distances']
            )

            # Format results
            search_results = []
            for i in range(len(results['documents'][0])):
                result = {
                    "chunk_id": results['metadatas'][0][i]['chunk_id'],
                    "similarity_score": 1 - results['distances'][0][i],  # Convert distance to similarity
                    "content": results['documents'][0][i][:500] + "..." if len(results['documents'][0][i]) > 500 else results['documents'][0][i],
                    "full_content": results['documents'][0][i],
                    "metadata": results['metadatas'][0][i]
                }
                search_results.append(result)

            # Create search metadata
            search_metadata = {
                "query": query,
                "results_found": len(search_results),
                "search_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "top_similarity_score": max([r["similarity_score"] for r in search_results]) if search_results else 0,
                "query_embedding_model": "text-embedding-ada-002",
                "vector_database": "ChromaDB Local"
            }

            return search_results, search_metadata

        except Exception as e:
            return [], {"error": f"Search failed: {str(e)}"}
    def generate_llm_response(self, query: str, search_results: List[Dict]) -> Tuple[str, Dict]:
        """Generate final response using OpenAI LLM"""
        if not self.openai_client:
            return "OpenAI client not initialized for LLM response generation.", {}

        try:
            # Prepare context from search results
            context = "\n\n".join([
                f"Chunk {result['chunk_id']} (Similarity: {result['similarity_score']:.3f}):\n{result['full_content']}"
                for result in search_results
            ])

            # Create prompt
            prompt = f"""Based on the following financial document excerpts, please provide a comprehensive and accurate answer to the user's question.

Context from financial document:
{context}

User Question: {query}

Instructions:
1. Provide a detailed, well-structured answer based solely on the provided context
2. If the context doesn't contain enough information to fully answer the question, clearly state this
3. Include specific numbers, dates, and financial figures when available
4. Structure your response clearly with proper formatting
5. Cite which chunk(s) your information comes from when possible

Answer:"""

            # Generate response using OpenAI
            response = self.openai_client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a financial analyst AI assistant. Provide accurate, well-structured responses based on the given financial document context."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1000,
                temperature=0.1
            )

            llm_response = response.choices[0].message.content

            # Create response metadata
            response_metadata = {
                "model_used": "gpt-3.5-turbo",
                "response_length": len(llm_response),
                "tokens_used": response.usage.total_tokens,
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens,
                "generation_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "context_chunks_used": len(search_results),
                "temperature": 0.1,
                "max_tokens": 1000
            }

            return llm_response, response_metadata

        except Exception as e:
            return f"LLM Generation failed: {str(e)}", {"error": str(e)}


# Initialize RAG pipeline
rag_pipeline = RAGPipeline()
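# Programmatic usage sketch (illustrative only; the Gradio UI below drives
# the same calls, and "report.pdf" is a hypothetical file):
#   rag_pipeline.set_openai_key(os.environ["OPENAI_API_KEY"])
#   text, meta = rag_pipeline.extract_text_from_pdf(open("report.pdf", "rb").read())
#   chunks, _ = rag_pipeline.chunk_text(text, chunk_size=1000, overlap=200)
#   rag_pipeline.store_in_chromadb(chunks, "report")
#   hits, _ = rag_pipeline.semantic_search("What was Q4 revenue?", top_k=5)
#   answer, _ = rag_pipeline.generate_llm_response("What was Q4 revenue?", hits)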
def configure_openai_api(openai_key):
    """Configure OpenAI API key"""
    try:
        # Set OpenAI API key
        rag_pipeline.set_openai_key(openai_key)

        # Test OpenAI connection
        if openai_key:
            try:
                # Test with a simple API call
                test_response = rag_pipeline.openai_client.models.list()
                openai_status = "✅ OpenAI API key validated successfully"
            except Exception as e:
                openai_status = f"❌ OpenAI API key validation failed: {str(e)}"
        else:
            openai_status = "❌ OpenAI API key required"

        # ChromaDB status (local setup)
        if rag_pipeline.chroma_client:
            chroma_status = "✅ ChromaDB Local database ready (./chroma_db)"
        else:
            chroma_status = "❌ ChromaDB Local database initialization failed"

        return f"{openai_status}\n{chroma_status}"

    except Exception as e:
        return f"❌ Configuration failed: {str(e)}"


# Remove the global variable since we're storing in the class
# extracted_text_store = ""


def process_pdf_upload(pdf_file):
    """Process uploaded PDF and extract text"""
    if pdf_file is None:
        return "No file uploaded", "{}"

    # Extract text using the updated method
    text, metadata = rag_pipeline.extract_text_from_pdf(pdf_file)

    if text.startswith("Error"):
        return text, json.dumps(metadata, indent=2)

    # Show more text in the preview (first 3000 characters instead of 2000)
    if len(text) > 3000:
        preview_text = text[:3000] + (
            f"...\n\n[SHOWING FIRST 3000 CHARACTERS OF {len(text)} TOTAL CHARACTERS]"
            f"\n[FULL TEXT STORED FOR PROCESSING - Total Length: {len(rag_pipeline.full_extracted_text)} chars]"
        )
    else:
        preview_text = text

    return preview_text, json.dumps(metadata, indent=2)
def process_chunking(text, chunk_size, overlap):
    """Process text chunking"""
    # Always use the full text stored in the pipeline object
    if not rag_pipeline.full_extracted_text:
        return "No text available for chunking. Please upload a PDF first.", "{}"

    full_text = rag_pipeline.full_extracted_text
    print(f"DEBUG: Using full text for chunking, length: {len(full_text)}")

    if len(full_text.strip()) == 0:
        return "No valid text available for chunking.", "{}"

    chunks, metadata = rag_pipeline.chunk_text(full_text, int(chunk_size), int(overlap))

    if not chunks:
        return "No chunks created. Please check your text and parameters.", json.dumps(metadata, indent=2)

    # Display the first few chunks as a preview
    preview = "=== CHUNKING RESULTS ===\n"
    preview += f"Total chunks created: {len(chunks)}\n"
    preview += f"Full text length processed: {len(full_text)} characters\n\n"
    preview += "--- CHUNK PREVIEW ---\n\n"

    for i, chunk in enumerate(chunks[:3]):
        preview += f"Chunk {i+1} (Length: {len(chunk)} chars):\n"
        preview += f"{chunk[:200]}...\n\n"
        preview += "-" * 50 + "\n\n"

    if len(chunks) > 3:
        preview += f"... and {len(chunks)-3} more chunks\n"
        preview += f"Shortest chunk: {min(len(c) for c in chunks)} chars\n"
        preview += f"Longest chunk: {max(len(c) for c in chunks)} chars\n"

    return preview, json.dumps(metadata, indent=2)


def process_vector_storage(text, chunk_size, overlap, doc_name):
    """Process vector storage in local ChromaDB"""
    if not rag_pipeline.openai_client:
        return "Please configure OpenAI API key first in the Configuration tab", "{}"

    if not rag_pipeline.chroma_client:
        return "ChromaDB local database not available. Please restart the application.", "{}"

    # Always use the stored full text
    if not rag_pipeline.full_extracted_text:
        return "No valid text to store. Please upload a PDF first.", "{}"

    full_text = rag_pipeline.full_extracted_text
    print(f"DEBUG: Using full text for storage, length: {len(full_text)}")

    # Re-chunk the text using the full text
    chunks, _ = rag_pipeline.chunk_text(full_text, int(chunk_size), int(overlap))

    if not chunks:
        return "No chunks to store", "{}"

    # Store in ChromaDB
    storage_metadata = rag_pipeline.store_in_chromadb(chunks, doc_name or "financial_document")

    if "error" in storage_metadata:
        return f"Storage failed: {storage_metadata['error']}", json.dumps(storage_metadata, indent=2)

    return f"Successfully stored {len(chunks)} chunks in ChromaDB Local using OpenAI embeddings\nFull text length: {len(full_text)} characters", json.dumps(storage_metadata, indent=2)
def process_semantic_search(query, top_k):
    """Process semantic search"""
    if not query.strip():
        return "Please enter a search query", "{}", ""

    search_results, search_metadata = rag_pipeline.semantic_search(query, int(top_k))

    if not search_results:
        return "No results found", json.dumps(search_metadata, indent=2), ""

    # Format results for display
    results_display = "=== TOP MATCHING CHUNKS ===\n\n"
    for i, result in enumerate(search_results, 1):
        results_display += f"RESULT {i}:\n"
        results_display += f"Chunk ID: {result['chunk_id']}\n"
        results_display += f"Similarity Score: {result['similarity_score']:.4f}\n"
        results_display += f"Content Preview: {result['content']}\n"
        results_display += "-" * 50 + "\n\n"

    # Create DataFrame for structured display
    df_data = []
    for result in search_results:
        df_data.append({
            "Chunk ID": result['chunk_id'],
            "Similarity Score": f"{result['similarity_score']:.4f}",
            "Content Length": len(result['full_content']),
            "Preview": result['content'][:100] + "..."
        })

    df = pd.DataFrame(df_data)

    return results_display, json.dumps(search_metadata, indent=2), df


def generate_final_response(query, top_k):
    """Generate final LLM response"""
    if not rag_pipeline.openai_client:
        return "Please configure OpenAI API key first in the Configuration tab", "{}"

    if not query.strip():
        return "Please enter a query first", "{}"

    # Get search results
    search_results, _ = rag_pipeline.semantic_search(query, int(top_k))

    if not search_results:
        return "No search results available for LLM generation", "{}"

    # Generate LLM response
    response, metadata = rag_pipeline.generate_llm_response(query, search_results)

    return response, json.dumps(metadata, indent=2)
def create_gradio_interface():
    """Create the Gradio interface"""

    with gr.Blocks(title="RAG Pipeline Demo - Financial Document Analysis", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🏦 RAG Pipeline Demo - Financial Document Analysis

        This demo shows a complete Retrieval-Augmented Generation (RAG) pipeline with full transparency.
        Each step is clearly displayed so you can understand exactly what's happening in the backend.

        **🔧 Start by configuring your API keys in the Configuration tab below.**
        """)

        # Configuration Tab - Simplified
        with gr.Tab("⚙️ Configuration"):
            gr.Markdown("### API Configuration")
            gr.Markdown("Configure your OpenAI API key. ChromaDB will run locally and store data in the `./chroma_db` folder.")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("#### OpenAI API Key")
                    gr.Markdown("Required for both embedding generation and LLM response generation")
                    openai_key_input = gr.Textbox(
                        label="OpenAI API Key",
                        type="password",
                        placeholder="sk-...",
                        info="Get your API key from: https://platform.openai.com/api-keys"
                    )

                with gr.Column():
                    gr.Markdown("#### ChromaDB Status")
                    gr.Markdown("✅ **Local ChromaDB**: Data will be stored locally in `./chroma_db`")
                    gr.Markdown("📁 **Storage Location**: Current directory/chroma_db")
                    gr.Markdown("🔄 **Persistence**: Data persists between sessions")

            config_btn = gr.Button("Save OpenAI Configuration", variant="primary", size="lg")
            config_status = gr.Textbox(label="Configuration Status", lines=3)

        # Step 1: Document Upload
        with gr.Tab("1️⃣ Document Upload"):
            gr.Markdown("### Step 1: Upload Your Financial PDF Document")

            with gr.Row():
                with gr.Column():
                    pdf_input = gr.File(label="Upload PDF Document", file_types=[".pdf"])
                    upload_btn = gr.Button("Extract Text from PDF", variant="primary")

                with gr.Column():
                    extraction_output = gr.Textbox(label="Extracted Text Preview", lines=15, max_lines=20)
                    extraction_metadata = gr.JSON(label="Extraction Metadata")

        # Step 2: Text Chunking
        with gr.Tab("2️⃣ Text Chunking"):
            gr.Markdown("### Step 2: Split Text into Manageable Chunks")

            with gr.Row():
                with gr.Column():
                    chunk_size = gr.Slider(minimum=200, maximum=2000, value=1000, label="Chunk Size (characters)")
                    overlap = gr.Slider(minimum=0, maximum=500, value=200, label="Overlap (characters)")
                    chunk_btn = gr.Button("Create Chunks", variant="primary")

                with gr.Column():
                    chunks_output = gr.Textbox(label="Chunks Preview", lines=15, max_lines=20)
                    chunking_metadata = gr.JSON(label="Chunking Metadata")

        # Step 3: Vector Storage
        with gr.Tab("3️⃣ Vector Storage"):
            gr.Markdown("### Step 3: Store Chunks in ChromaDB Vector Database")

            with gr.Row():
                with gr.Column():
                    doc_name = gr.Textbox(label="Document Name", value="financial_report", placeholder="Enter document name")
                    storage_btn = gr.Button("Store in ChromaDB", variant="primary")

                with gr.Column():
                    storage_output = gr.Textbox(label="Storage Status", lines=5)
                    storage_metadata = gr.JSON(label="Storage Metadata")

        # Step 4: Semantic Search
        with gr.Tab("4️⃣ Semantic Search"):
            gr.Markdown("### Step 4: Search for Relevant Information")

            with gr.Row():
                with gr.Column():
                    search_query = gr.Textbox(label="Enter your question", placeholder="e.g., What was the revenue growth in Q4?")
                    top_k = gr.Slider(minimum=1, maximum=10, value=5, label="Number of results to retrieve")
                    search_btn = gr.Button("Search Vector Database", variant="primary")

                with gr.Column():
                    search_results_text = gr.Textbox(label="Search Results", lines=15, max_lines=20)
                    search_metadata = gr.JSON(label="Search Metadata")

            # Results table
            results_table = gr.DataFrame(label="Top Matching Chunks - Structured View")

        # Step 5: LLM Response Generation
        with gr.Tab("5️⃣ LLM Response"):
            gr.Markdown("### Step 5: Generate Final Answer using OpenAI")
            gr.Markdown("*Note: OpenAI API key must be configured in the Configuration tab*")

            with gr.Row():
                with gr.Column():
                    generate_btn = gr.Button("Generate Final Response", variant="primary")
                    gr.Markdown("**Current Query:** Will use the query from Step 4")

                with gr.Column():
                    final_response = gr.Textbox(label="AI Generated Response", lines=15, max_lines=20)
                    response_metadata = gr.JSON(label="Response Metadata")

        # Complete Pipeline Tab
        with gr.Tab("🚀 Complete Pipeline"):
            gr.Markdown("### Run the Complete RAG Pipeline")
            gr.Markdown("*Note: Make sure to configure API keys in the Configuration tab first*")

            with gr.Row():
                with gr.Column():
                    complete_pdf = gr.File(label="Upload PDF", file_types=[".pdf"])
                    complete_query = gr.Textbox(label="Your Question", placeholder="Ask about the financial document")

                with gr.Column():
                    complete_chunk_size = gr.Slider(minimum=200, maximum=2000, value=1000, label="Chunk Size")
                    complete_overlap = gr.Slider(minimum=0, maximum=500, value=200, label="Overlap")
                    complete_top_k = gr.Slider(minimum=1, maximum=10, value=5, label="Top K Results")

            complete_btn = gr.Button("Run Complete Pipeline", variant="primary", size="lg")

            with gr.Row():
                pipeline_status = gr.Textbox(label="Pipeline Status", lines=10)
                pipeline_response = gr.Textbox(label="Final Answer", lines=10)

        # Event handlers
        config_btn.click(
            configure_openai_api,
            inputs=[openai_key_input],
            outputs=[config_status]
        )

        upload_btn.click(
            process_pdf_upload,
            inputs=[pdf_input],
            outputs=[extraction_output, extraction_metadata]
        )

        chunk_btn.click(
            process_chunking,
            inputs=[extraction_output, chunk_size, overlap],
            outputs=[chunks_output, chunking_metadata]
        )

        storage_btn.click(
            process_vector_storage,
            inputs=[extraction_output, chunk_size, overlap, doc_name],
            outputs=[storage_output, storage_metadata]
        )

        search_btn.click(
            process_semantic_search,
            inputs=[search_query, top_k],
            outputs=[search_results_text, search_metadata, results_table]
        )

        generate_btn.click(
            generate_final_response,
            inputs=[search_query, top_k],
            outputs=[final_response, response_metadata]
        )

        # Complete pipeline function
        def run_complete_pipeline(pdf_file, query, chunk_size, overlap, top_k):
            if not pdf_file or not query:
                return "Please provide a PDF file and a query", ""

            if not rag_pipeline.openai_client:
                return "Please configure OpenAI API key in the Configuration tab first", ""

            if not rag_pipeline.chroma_client:
                return "ChromaDB local database not available. Please restart the application.", ""

            status = "Starting RAG Pipeline...\n\n"
            status += "Using: ChromaDB Local + OpenAI API\n"
            status += "Storage: ./chroma_db directory\n\n"

            try:
                # Step 1: Extract text
                status += "Step 1: Extracting text from PDF...\n"
                text, _ = rag_pipeline.extract_text_from_pdf(pdf_file)
                if text.startswith("Error"):
                    return status + f"Failed: {text}", ""
                status += "✅ Text extraction completed\n\n"

                # Step 2: Chunk text
                status += "Step 2: Chunking text...\n"
                chunks, _ = rag_pipeline.chunk_text(text, int(chunk_size), int(overlap))
                status += f"✅ Created {len(chunks)} chunks\n\n"

                # Step 3: Store in vector DB
                status += "Step 3: Generating OpenAI embeddings and storing in ChromaDB Local...\n"
                storage_result = rag_pipeline.store_in_chromadb(chunks, "complete_pipeline_doc")
                if "error" in storage_result:
                    return status + f"Failed: {storage_result['error']}", ""
                status += "✅ Vectors stored in ChromaDB Local using OpenAI embeddings\n\n"

                # Step 4: Search
                status += "Step 4: Performing semantic search with OpenAI embeddings...\n"
                search_results, _ = rag_pipeline.semantic_search(query, int(top_k))
                if not search_results:
                    return status + "❌ No search results found", ""
                status += f"✅ Found {len(search_results)} relevant chunks\n\n"

                # Step 5: Generate response
                status += "Step 5: Generating LLM response...\n"
                response, _ = rag_pipeline.generate_llm_response(query, search_results)
                if response.startswith("LLM Generation failed"):
                    return status + f"Failed: {response}", ""
                status += "✅ Final response generated successfully!"

                return status, response

            except Exception as e:
                return status + f"❌ Pipeline failed: {str(e)}", ""

        complete_btn.click(
            run_complete_pipeline,
            inputs=[complete_pdf, complete_query, complete_chunk_size, complete_overlap, complete_top_k],
            outputs=[pipeline_status, pipeline_response]
        )

    return demo
# Launch the application
if __name__ == "__main__":
    # Remind the user about the required packages
    print("Starting RAG Pipeline Demo...")
    print("Make sure you have installed the required packages:")
    print("pip install gradio PyPDF2 chromadb openai pandas numpy")
    print("\nConfiguration:")
    print("✅ ChromaDB: Local storage (./chroma_db directory)")
    print("🔑 OpenAI: API key required for embeddings + LLM")
    print("📁 Data persistence: Enabled across sessions")

    # Create and launch the Gradio interface
    demo = create_gradio_interface()
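    # This commit drops the share=True, debug=True arguments: launch() with
    # no arguments serves the app locally only (no public Gradio share link)
    # and without debug output.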
    demo.launch()
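# A matching requirements.txt might look like this (a sketch; pins are
# illustrative, but the v1-style OpenAI client used above needs openai>=1.0):
#   gradio
#   PyPDF2
#   chromadb
#   openai>=1.0
#   pandas
#   numpy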