quantumbit committed
Commit 6e4458f · verified · 1 Parent(s): 1860a28

Delete preprocessing

preprocessing/__init__.py DELETED
@@ -1,23 +0,0 @@
1
- # Preprocessing package
2
-
3
- from .preprocessing import DocumentPreprocessor
4
- from .preprocessing_modules import (
5
- PDFDownloader,
6
- TextExtractor,
7
- TextChunker,
8
- EmbeddingManager,
9
- VectorStorage,
10
- MetadataManager,
11
- ModularDocumentPreprocessor
12
- )
13
-
14
- __all__ = [
15
- 'DocumentPreprocessor',
16
- 'PDFDownloader',
17
- 'TextExtractor',
18
- 'TextChunker',
19
- 'EmbeddingManager',
20
- 'VectorStorage',
21
- 'MetadataManager',
22
- 'ModularDocumentPreprocessor'
23
- ]
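For reference, a minimal sketch of how the deleted package's public API was presumably consumed elsewhere in the repo (imports assume the pre-deletion layout; nothing here is confirmed by this commit):

    # Hypothetical consumer of the removed package (pre-deletion layout assumed).
    from preprocessing import DocumentPreprocessor, TextChunker

    preprocessor = DocumentPreprocessor()  # full download -> extract -> chunk -> embed -> store pipeline
    chunker = TextChunker()                # individual modules were also exported for standalone use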
 
preprocessing/preprocessing.py DELETED
@@ -1,63 +0,0 @@
1
- import os
2
- import asyncio
3
- from typing import List, Dict, Any
4
-
5
- from config.config import *
6
- from .preprocessing_modules.modular_preprocessor import ModularDocumentPreprocessor
7
-
8
- # For backward compatibility, create an alias
9
- class DocumentPreprocessor(ModularDocumentPreprocessor):
10
- """Backward compatibility alias for the modular document preprocessor."""
11
- pass
12
-
13
- # CLI interface for preprocessing
14
- async def main():
15
- """Main function for command-line usage."""
16
- import argparse
17
-
18
- parser = argparse.ArgumentParser(description="Document Preprocessing for RAG")
19
- parser.add_argument("--url", type=str, help="Single PDF URL to process")
20
- parser.add_argument("--urls-file", type=str, help="File containing PDF URLs (one per line)")
21
- parser.add_argument("--force", action="store_true", help="Force reprocessing even if already processed")
22
- parser.add_argument("--list", action="store_true", help="List all processed documents")
23
- parser.add_argument("--stats", action="store_true", help="Show collection statistics")
24
-
25
- args = parser.parse_args()
26
-
27
- preprocessor = DocumentPreprocessor()
28
-
29
- if args.list:
30
- docs = preprocessor.list_processed_documents()
31
- print(f"\n📚 Processed Documents ({len(docs)}):")
32
- for doc_id, info in docs.items():
33
- print(f" • {doc_id}: {info['document_url'][:50]}... ({info.get('chunk_count', 'N/A')} chunks)")
34
-
35
- elif args.stats:
36
- stats = preprocessor.get_collection_stats()
37
- print(f"\n📊 Collection Statistics:")
38
- print(f" • Total documents: {stats['total_documents']}")
39
- print(f" • Total collections: {stats['total_collections']}")
40
- print(f" • Total chunks: {stats['total_chunks']}")
41
-
42
- elif args.url:
43
- await preprocessor.process_document(args.url, args.force)
44
-
45
- elif args.urls_file:
46
- if not os.path.exists(args.urls_file):
47
- print(f"❌ File not found: {args.urls_file}")
48
- return
49
-
50
- with open(args.urls_file, 'r') as f:
51
- urls = [line.strip() for line in f if line.strip()]
52
-
53
- if urls:
54
- await preprocessor.process_multiple_documents(urls, args.force)
55
- else:
56
- print("❌ No URLs found in file")
57
-
58
- else:
59
- print("❌ Please provide --url, --urls-file, --list, or --stats")
60
- parser.print_help()
61
-
62
- if __name__ == "__main__":
63
- asyncio.run(main())
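The removed CLI wrapped an async pipeline; a minimal programmatic equivalent is sketched below (a sketch only: it assumes the pre-deletion module path and a working config.config, and the URL is a placeholder):

    import asyncio

    from preprocessing.preprocessing import DocumentPreprocessor

    async def run() -> None:
        preprocessor = DocumentPreprocessor()
        # Roughly equivalent to: python preprocessing/preprocessing.py --url <PDF_URL>
        result = await preprocessor.process_document("https://example.com/sample.pdf")
        print(result)  # doc_id string, or [content, type] for special handling

    asyncio.run(run())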
 
preprocessing/preprocessing_modules/__init__.py DELETED
@@ -1,29 +0,0 @@
1
- # Preprocessing modules
2
-
3
- from .pdf_downloader import PDFDownloader
4
- from .file_downloader import FileDownloader
5
- from .text_extractor import TextExtractor
6
- from .text_chunker import TextChunker
7
- from .embedding_manager import EmbeddingManager
8
- from .vector_storage import VectorStorage
9
- from .metadata_manager import MetadataManager
10
- from .modular_preprocessor import ModularDocumentPreprocessor
11
- from .docx_extractor import extract_docx
12
- from .pptx_extractor import extract_pptx
13
- from .xlsx_extractor import extract_xlsx
14
- from .image_extractor import extract_image_content
15
-
16
- __all__ = [
17
- 'PDFDownloader',
18
- 'FileDownloader',
19
- 'TextExtractor',
20
- 'TextChunker',
21
- 'EmbeddingManager',
22
- 'VectorStorage',
23
- 'MetadataManager',
24
- 'ModularDocumentPreprocessor',
25
- 'extract_docx',
26
- 'extract_pptx',
27
- 'extract_xlsx',
28
- 'extract_image_content'
29
- ]
 
preprocessing/preprocessing_modules/docx_extractor.py DELETED
@@ -1,94 +0,0 @@
1
- from docx import Document
2
- from docx.document import Document as _Document
3
- from docx.table import Table
4
- from docx.text.paragraph import Paragraph
5
- from typing import Union, List, Dict, Any
6
- from PIL import Image
7
- from io import BytesIO
8
- import pytesseract
9
- import os
10
-
11
- from zipfile import ZipFile
12
- from lxml import etree
13
- from pathlib import Path
14
- import io
15
-
16
- def extract_docx(docx_input) -> str:
17
- """Extract text from DOCX files with table and text handling."""
18
- zipf = ZipFile(docx_input)
19
- xml_content = zipf.read("word/document.xml")
20
- tree = etree.fromstring(xml_content)
21
-
22
- ns = {
23
- "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
24
- "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
25
- "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape"
26
- }
27
-
28
- text_blocks = []
29
-
30
- # Extract all tables with gridSpan handling
31
- tables = tree.xpath("//w:tbl", namespaces=ns)
32
- table_elements = set(tables)
33
- table_index = 0
34
-
35
- for tbl in tables:
36
- rows = tbl.xpath("./w:tr", namespaces=ns)
37
- sub_tables = []
38
- current_table = []
39
-
40
- prev_col_count = None
41
- for row in rows:
42
- row_texts = []
43
- cells = row.xpath("./w:tc", namespaces=ns)
44
- col_count = 0
45
-
46
- for cell in cells:
47
- cell_text = ""
48
- paragraphs = cell.xpath(".//w:p", namespaces=ns)
49
- for para in paragraphs:
50
- text_nodes = para.xpath(".//w:t", namespaces=ns)
51
- para_text = "".join(node.text for node in text_nodes if node.text)
52
- if para_text.strip():
53
- cell_text += para_text + " "
54
-
55
- # Handle gridSpan (merged cells)
56
- gridspan_elem = cell.xpath(".//w:gridSpan", namespaces=ns)
57
- span = int(gridspan_elem[0].get(ns["w"] + "val", "1")) if gridspan_elem else 1
58
-
59
- row_texts.append(cell_text.strip())
60
- col_count += span
61
-
62
- if row_texts and any(text.strip() for text in row_texts):
63
- if prev_col_count is not None and col_count != prev_col_count:
64
- # Column count changed, save current table and start new one
65
- if current_table:
66
- sub_tables.append(current_table)
67
- current_table = []
68
-
69
- current_table.append(row_texts)
70
- prev_col_count = col_count
71
-
72
- if current_table:
73
- sub_tables.append(current_table)
74
-
75
- # Format tables
76
- for sub_table in sub_tables:
77
- table_text = f"\n--- Table {table_index + 1} ---\n"
78
- for row in sub_table:
79
- table_text += " | ".join(row) + "\n"
80
- text_blocks.append(table_text)
81
- table_index += 1
82
-
83
- # Extract non-table paragraphs
84
- paragraphs = tree.xpath("//w:p", namespaces=ns)
85
- for para in paragraphs:
86
- # Check if paragraph is inside a table
87
- is_in_table = any(table in para.xpath("ancestor::*") for table in table_elements)
88
- if not is_in_table:
89
- text_nodes = para.xpath(".//w:t", namespaces=ns)
90
- para_text = "".join(node.text for node in text_nodes if node.text)
91
- if para_text.strip():
92
- text_blocks.append(para_text.strip())
93
-
94
- return "\n\n".join(text_blocks)
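A minimal usage sketch of the extractor deleted above (the .docx path is a placeholder; the function accepts anything ZipFile can open):

    from preprocessing.preprocessing_modules.docx_extractor import extract_docx

    # Tables are rendered as "--- Table N ---" blocks with " | "-joined rows,
    # followed by the non-table paragraphs.
    text = extract_docx("example.docx")  # placeholder path
    print(text[:500])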
 
preprocessing/preprocessing_modules/embedding_manager.py DELETED
@@ -1,118 +0,0 @@
1
- """
2
- Embedding Manager Module
3
-
4
- Handles creation of embeddings for text chunks using sentence transformers.
5
- """
6
-
7
- import asyncio
8
- import numpy as np
9
- from typing import List
10
- from sentence_transformers import SentenceTransformer
11
- from config.config import EMBEDDING_MODEL, BATCH_SIZE
12
-
13
-
14
- class EmbeddingManager:
15
- """Handles embedding creation for text chunks."""
16
-
17
- def __init__(self):
18
- """Initialize the embedding manager."""
19
- self.embedding_model = None
20
- self._init_embedding_model()
21
-
22
- def _init_embedding_model(self):
23
- """Initialize the embedding model."""
24
- print(f"🔄 Loading embedding model: {EMBEDDING_MODEL}")
25
- self.embedding_model = SentenceTransformer(EMBEDDING_MODEL)
26
- print(f"✅ Embedding model loaded successfully")
27
-
28
- async def create_embeddings(self, chunks: List[str]) -> np.ndarray:
29
- """
30
- Create embeddings for text chunks.
31
-
32
- Args:
33
- chunks: List of text chunks to embed
34
-
35
- Returns:
36
- np.ndarray: Array of embeddings with shape (num_chunks, embedding_dim)
37
- """
38
- print(f"🧠 Creating embeddings for {len(chunks)} chunks")
39
-
40
- if not chunks:
41
- raise ValueError("No chunks provided for embedding creation")
42
-
43
- def create_embeddings_sync():
44
- """Synchronous embedding creation to run in thread pool."""
45
- embeddings = self.embedding_model.encode(
46
- chunks,
47
- batch_size=BATCH_SIZE,
48
- show_progress_bar=True,
49
- normalize_embeddings=True
50
- )
51
- return np.array(embeddings).astype("float32")
52
-
53
- # Run in thread pool to avoid blocking the event loop
54
- loop = asyncio.get_event_loop()
55
- embeddings = await loop.run_in_executor(None, create_embeddings_sync)
56
-
57
- print(f"✅ Created embeddings with shape: {embeddings.shape}")
58
- return embeddings
59
-
60
- def get_embedding_dimension(self) -> int:
61
- """
62
- Get the dimension of embeddings produced by the model.
63
-
64
- Returns:
65
- int: Embedding dimension
66
- """
67
- if self.embedding_model is None:
68
- raise RuntimeError("Embedding model not initialized")
69
-
70
- # Get dimension from model
71
- return self.embedding_model.get_sentence_embedding_dimension()
72
-
73
- def validate_embeddings(self, embeddings: np.ndarray, expected_count: int) -> bool:
74
- """
75
- Validate that embeddings have the expected shape and properties.
76
-
77
- Args:
78
- embeddings: The embeddings array to validate
79
- expected_count: Expected number of embeddings
80
-
81
- Returns:
82
- bool: True if embeddings are valid, False otherwise
83
- """
84
- if embeddings is None:
85
- return False
86
-
87
- if embeddings.shape[0] != expected_count:
88
- print(f"❌ Embedding count mismatch: expected {expected_count}, got {embeddings.shape[0]}")
89
- return False
90
-
91
- if embeddings.dtype != np.float32:
92
- print(f"❌ Embedding dtype mismatch: expected float32, got {embeddings.dtype}")
93
- return False
94
-
95
- # Check for NaN or infinite values
96
- if np.any(np.isnan(embeddings)) or np.any(np.isinf(embeddings)):
97
- print("❌ Embeddings contain NaN or infinite values")
98
- return False
99
-
100
- print(f"✅ Embeddings validation passed: {embeddings.shape}")
101
- return True
102
-
103
- def get_model_info(self) -> dict:
104
- """
105
- Get information about the embedding model.
106
-
107
- Returns:
108
- dict: Model information
109
- """
110
- if self.embedding_model is None:
111
- return {"model_name": EMBEDDING_MODEL, "status": "not_loaded"}
112
-
113
- return {
114
- "model_name": EMBEDDING_MODEL,
115
- "embedding_dimension": self.get_embedding_dimension(),
116
- "max_sequence_length": getattr(self.embedding_model, 'max_seq_length', 'unknown'),
117
- "status": "loaded"
118
- }
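A minimal usage sketch of the deleted EmbeddingManager, assuming config.config defines EMBEDDING_MODEL and BATCH_SIZE:

    import asyncio

    from preprocessing.preprocessing_modules.embedding_manager import EmbeddingManager

    async def demo() -> None:
        manager = EmbeddingManager()  # loads the SentenceTransformer at construction time
        chunks = ["first example chunk", "second example chunk"]
        embeddings = await manager.create_embeddings(chunks)  # float32, normalized
        assert manager.validate_embeddings(embeddings, expected_count=len(chunks))
        print(embeddings.shape, manager.get_embedding_dimension())

    asyncio.run(demo())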
 
preprocessing/preprocessing_modules/file_downloader.py DELETED
@@ -1,108 +0,0 @@
1
- import aiohttp
2
- import asyncio
3
- import tempfile
4
- import os
5
- import re
6
- from urllib.parse import urlparse
7
- from typing import List, Tuple
8
-
9
- class FileDownloader:
10
- """Enhanced file downloader that supports multiple file types."""
11
-
12
- async def download_file(self, url: str, timeout: int = 300, max_retries: int = 3) -> Tuple[str, str]:
13
- """Download any file type from a URL to a temporary file with enhanced error handling."""
14
- print(f"📥 Downloading file from: {url[:60]}...")
15
-
16
- for attempt in range(max_retries):
17
- try:
18
- timeout_config = aiohttp.ClientTimeout(
19
- total=timeout,
20
- connect=30,
21
- sock_read=120
22
- )
23
-
24
- async with aiohttp.ClientSession(timeout=timeout_config) as session:
25
- print(f" Attempt {attempt + 1}/{max_retries} (timeout: {timeout}s)")
26
-
27
- async with session.get(url) as response:
28
- if response.status != 200:
29
- raise Exception(f"Failed to download file: HTTP {response.status}")
30
-
31
- # Extract filename from header or URL
32
- cd = response.headers.get('Content-Disposition', '')
33
- filename_match = re.findall('filename="?([^"]+)"?', cd)
34
- if filename_match:
35
- filename = filename_match[0]
36
- else:
37
- from urllib.parse import unquote
38
- path = urlparse(url).path
39
- filename = os.path.basename(unquote(path)) # Decode URL encoding
40
-
41
- if not filename:
42
- filename = "downloaded_file"
43
-
44
- ext = os.path.splitext(filename)[1]
45
- if not ext:
46
- return url, "url"
47
-
48
- print(f" 📁 Detected filename: {filename}, extension: {ext}")
49
-
50
- # Check if file type is supported
51
- supported_extensions = ['.pdf', '.docx', '.pptx', '.png', '.xlsx', '.jpeg', '.jpg', '.txt', '.csv']
52
- if ext not in supported_extensions:
53
- # Return extension without dot for consistency
54
- ext_without_dot = ext[1:] if ext.startswith('.') else ext
55
- print(f" ❌ File type not supported: {ext}")
56
- return 'not supported', ext_without_dot
57
-
58
- # Get content length
59
- content_length = response.headers.get('content-length')
60
- if content_length:
61
- total_size = int(content_length)
62
- print(f" File size: {total_size / (1024 * 1024):.1f} MB")
63
-
64
- # Create temp file with same extension
65
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext, prefix="download_")
66
-
67
- # Write to file
68
- downloaded = 0
69
- async for chunk in response.content.iter_chunked(16384):
70
- temp_file.write(chunk)
71
- downloaded += len(chunk)
72
-
73
- if content_length and downloaded % (1024 * 1024) == 0:
74
- progress = (downloaded / total_size) * 100
75
- print(f" Progress: {progress:.1f}% ({downloaded / (1024*1024):.1f} MB)")
76
-
77
- temp_file.close()
78
- print(f"✅ File downloaded successfully: {temp_file.name}")
79
- # Return extension without the dot for consistency with modular_preprocessor
80
- ext_without_dot = ext[1:] if ext.startswith('.') else ext
81
- return temp_file.name, ext_without_dot
82
-
83
- except asyncio.TimeoutError:
84
- print(f" ⏰ Timeout on attempt {attempt + 1}")
85
- if attempt < max_retries - 1:
86
- wait_time = (attempt + 1) * 30
87
- print(f" ⏳ Waiting {wait_time}s before retry...")
88
- await asyncio.sleep(wait_time)
89
- continue
90
-
91
- except Exception as e:
92
- print(f" ❌ Error on attempt {attempt + 1}: {str(e)}")
93
- if attempt < max_retries - 1:
94
- wait_time = (attempt + 1) * 15
95
- print(f" ⏳ Waiting {wait_time}s before retry...")
96
- await asyncio.sleep(wait_time)
97
- continue
98
-
99
- raise Exception(f"Failed to download file after {max_retries} attempts")
100
-
101
- def cleanup_temp_file(self, temp_path: str) -> None:
102
- """Clean up temporary file."""
103
- try:
104
- if os.path.exists(temp_path):
105
- os.unlink(temp_path)
106
- print(f"🗑️ Cleaned up temporary file: {temp_path}")
107
- except Exception as e:
108
- print(f"⚠️ Warning: Could not cleanup temp file {temp_path}: {e}")
 
preprocessing/preprocessing_modules/image_extractor.py DELETED
@@ -1,120 +0,0 @@
1
- import cv2
2
- import pytesseract
3
- import numpy as np
4
- import pandas as pd
5
- from PIL import Image, ImageFile
6
- from typing import List, Dict, Any
7
-
8
- ImageFile.LOAD_TRUNCATED_IMAGES = True
9
-
10
- def load_local_image(path: str) -> np.ndarray:
11
- """Load image from local path."""
12
- img = Image.open(path).convert("RGB")
13
- return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
14
-
15
- def sort_contours(cnts, method="top-to-bottom"):
16
- """Sort contours based on the specified method."""
17
- reverse = False
18
- i = 1 if method == "top-to-bottom" or method == "bottom-to-top" else 0
19
- if method == "right-to-left" or method == "bottom-to-top":
20
- reverse = True
21
- boundingBoxes = [cv2.boundingRect(c) for c in cnts]
22
- (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
23
- key=lambda b: b[1][i], reverse=reverse))
24
- return cnts, boundingBoxes
25
-
26
- def extract_cells_from_grid(table_img: np.ndarray) -> pd.DataFrame:
27
- """Extract table structure from image using OpenCV."""
28
- gray = cv2.cvtColor(table_img, cv2.COLOR_BGR2GRAY)
29
- _, binary = cv2.threshold(~gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
30
-
31
- # Detect horizontal lines
32
- horizontal = binary.copy()
33
- cols = horizontal.shape[1]
34
- horizontal_size = cols // 15
35
- horizontal_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
36
- horizontal = cv2.erode(horizontal, horizontal_structure)
37
- horizontal = cv2.dilate(horizontal, horizontal_structure)
38
-
39
- # Detect vertical lines
40
- vertical = binary.copy()
41
- rows = vertical.shape[0]
42
- vertical_size = rows // 15
43
- vertical_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size))
44
- vertical = cv2.erode(vertical, vertical_structure)
45
- vertical = cv2.dilate(vertical, vertical_structure)
46
-
47
- # Combine mask
48
- mask = cv2.add(horizontal, vertical)
49
- contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
50
-
51
- cells = []
52
- for contour in contours:
53
- x, y, w, h = cv2.boundingRect(contour)
54
- if w > 30 and h > 20: # Filter small contours
55
- cell_img = table_img[y:y+h, x:x+w]
56
- try:
57
- text = pytesseract.image_to_string(cell_img, config='--psm 7').strip()
58
- cells.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': text})
59
- except:
60
- cells.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': ''})
61
-
62
- # Sort cells by position to create table structure
63
- cells.sort(key=lambda cell: (cell['y'], cell['x']))
64
-
65
- # Group cells into rows
66
- rows = []
67
- current_row = []
68
- current_y = 0
69
-
70
- for cell in cells:
71
- if abs(cell['y'] - current_y) > 20: # New row threshold
72
- if current_row:
73
- rows.append(current_row)
74
- current_row = [cell]
75
- current_y = cell['y']
76
- else:
77
- current_row.append(cell)
78
-
79
- if current_row:
80
- rows.append(current_row)
81
-
82
- # Convert to DataFrame
83
- table_data = []
84
- for row in rows:
85
- row_data = [cell['text'] for cell in sorted(row, key=lambda c: c['x'])]
86
- table_data.append(row_data)
87
-
88
- if table_data:
89
- max_cols = max(len(row) for row in table_data)
90
- for row in table_data:
91
- while len(row) < max_cols:
92
- row.append('')
93
- return pd.DataFrame(table_data)
94
- else:
95
- return pd.DataFrame()
96
-
97
- def extract_image_content(image_path: str) -> str:
98
- """Extract text content from images using OCR."""
99
- try:
100
- # Load image
101
- img = load_local_image(image_path)
102
-
103
- # Basic OCR
104
- text = pytesseract.image_to_string(img)
105
-
106
- # Try to detect if it's a table
107
- if '|' in text or '\t' in text or len(text.split('\n')) > 3:
108
- # Try table extraction
109
- try:
110
- table_df = extract_cells_from_grid(img)
111
- if not table_df.empty:
112
- table_text = "\\n".join([" | ".join(row) for row in table_df.values])
113
- return f"[Table detected]\\n{table_text}\\n\\n[OCR Text]\\n{text}"
114
- except:
115
- pass
116
-
117
- return text.strip() if text.strip() else "[No text detected in image]"
118
-
119
- except Exception as e:
120
- return f"[Error processing image: {str(e)}]"
 
preprocessing/preprocessing_modules/metadata_manager.py DELETED
@@ -1,262 +0,0 @@
1
- """
2
- Metadata Manager Module
3
-
4
- Handles document metadata storage and retrieval operations.
5
- """
6
-
7
- import json
8
- import asyncio
9
- import hashlib
10
- from typing import List, Dict, Any
11
- from pathlib import Path
12
- from config.config import EMBEDDING_MODEL, CHUNK_SIZE, CHUNK_OVERLAP
13
-
14
-
15
- class MetadataManager:
16
- """Handles document metadata operations."""
17
-
18
- def __init__(self, base_db_path: Path):
19
- """
20
- Initialize the metadata manager.
21
-
22
- Args:
23
- base_db_path: Base path for storing metadata files
24
- """
25
- self.base_db_path = base_db_path
26
- self.processed_docs_file = self.base_db_path / "processed_documents.json"
27
- self.processed_docs = self._load_processed_docs()
28
-
29
- def _load_processed_docs(self) -> Dict[str, Dict]:
30
- """Load the registry of processed documents."""
31
- if self.processed_docs_file.exists():
32
- try:
33
- with open(self.processed_docs_file, 'r', encoding='utf-8') as f:
34
- return json.load(f)
35
- except Exception as e:
36
- print(f"⚠️ Warning: Could not load processed docs registry: {e}")
37
- return {}
38
-
39
- def _save_processed_docs(self):
40
- """Save the registry of processed documents."""
41
- try:
42
- with open(self.processed_docs_file, 'w', encoding='utf-8') as f:
43
- json.dump(self.processed_docs, f, indent=2, ensure_ascii=False)
44
- except Exception as e:
45
- print(f"⚠️ Warning: Could not save processed docs registry: {e}")
46
-
47
- def generate_doc_id(self, document_url: str) -> str:
48
- """
49
- Generate a unique document ID from the URL.
50
-
51
- Args:
52
- document_url: URL of the document
53
-
54
- Returns:
55
- str: Unique document ID
56
- """
57
- url_hash = hashlib.md5(document_url.encode()).hexdigest()[:12]
58
- return f"doc_{url_hash}"
59
-
60
- def is_document_processed(self, document_url: str) -> bool:
61
- """
62
- Check if a document has already been processed.
63
-
64
- Args:
65
- document_url: URL of the document
66
-
67
- Returns:
68
- bool: True if document is already processed
69
- """
70
- doc_id = self.generate_doc_id(document_url)
71
- return doc_id in self.processed_docs
72
-
73
- def get_document_info(self, document_url: str) -> Dict[str, Any]:
74
- """
75
- Get information about a processed document.
76
-
77
- Args:
78
- document_url: URL of the document
79
-
80
- Returns:
81
- Dict[str, Any]: Document information or empty dict if not found
82
- """
83
- doc_id = self.generate_doc_id(document_url)
84
- return self.processed_docs.get(doc_id, {})
85
-
86
- def save_document_metadata(self, chunks: List[str], doc_id: str, document_url: str):
87
- """
88
- Save document metadata to JSON file and update registry.
89
-
90
- Args:
91
- chunks: List of text chunks
92
- doc_id: Document identifier
93
- document_url: Original document URL
94
- """
95
- # Calculate statistics
96
- total_chars = sum(len(chunk) for chunk in chunks)
97
- total_words = sum(len(chunk.split()) for chunk in chunks)
98
- avg_chunk_size = total_chars / len(chunks) if chunks else 0
99
-
100
- # Create metadata object
101
- metadata = {
102
- "doc_id": doc_id,
103
- "document_url": document_url,
104
- "chunk_count": len(chunks),
105
- "total_chars": total_chars,
106
- "total_words": total_words,
107
- "avg_chunk_size": avg_chunk_size,
108
- "processed_at": asyncio.get_event_loop().time(),
109
- "embedding_model": EMBEDDING_MODEL,
110
- "chunk_size": CHUNK_SIZE,
111
- "chunk_overlap": CHUNK_OVERLAP,
112
- "processing_config": {
113
- "chunk_size": CHUNK_SIZE,
114
- "chunk_overlap": CHUNK_OVERLAP,
115
- "embedding_model": EMBEDDING_MODEL
116
- }
117
- }
118
-
119
- # Save individual document metadata
120
- metadata_path = self.base_db_path / f"{doc_id}_metadata.json"
121
- try:
122
- with open(metadata_path, "w", encoding="utf-8") as f:
123
- json.dump(metadata, f, indent=2, ensure_ascii=False)
124
- print(f"✅ Saved individual metadata for {doc_id}")
125
- except Exception as e:
126
- print(f"⚠️ Warning: Could not save individual metadata for {doc_id}: {e}")
127
-
128
- # Update processed documents registry
129
- self.processed_docs[doc_id] = {
130
- "document_url": document_url,
131
- "chunk_count": len(chunks),
132
- "processed_at": metadata["processed_at"],
133
- "collection_name": f"{doc_id}_collection",
134
- "total_chars": total_chars,
135
- "total_words": total_words
136
- }
137
- self._save_processed_docs()
138
-
139
- print(f"✅ Updated registry for document {doc_id}")
140
-
141
- def get_document_metadata(self, doc_id: str) -> Dict[str, Any]:
142
- """
143
- Load individual document metadata from file.
144
-
145
- Args:
146
- doc_id: Document identifier
147
-
148
- Returns:
149
- Dict[str, Any]: Document metadata or empty dict if not found
150
- """
151
- metadata_path = self.base_db_path / f"{doc_id}_metadata.json"
152
-
153
- if not metadata_path.exists():
154
- return {}
155
-
156
- try:
157
- with open(metadata_path, 'r', encoding='utf-8') as f:
158
- return json.load(f)
159
- except Exception as e:
160
- print(f"⚠️ Warning: Could not load metadata for {doc_id}: {e}")
161
- return {}
162
-
163
- def list_processed_documents(self) -> Dict[str, Dict]:
164
- """
165
- List all processed documents.
166
-
167
- Returns:
168
- Dict[str, Dict]: Copy of processed documents registry
169
- """
170
- return self.processed_docs.copy()
171
-
172
- def get_collection_stats(self) -> Dict[str, Any]:
173
- """
174
- Get statistics about all collections.
175
-
176
- Returns:
177
- Dict[str, Any]: Collection statistics
178
- """
179
- stats = {
180
- "total_documents": len(self.processed_docs),
181
- "total_collections": 0,
182
- "total_chunks": 0,
183
- "total_characters": 0,
184
- "total_words": 0,
185
- "documents": []
186
- }
187
-
188
- for doc_id, info in self.processed_docs.items():
189
- collection_path = self.base_db_path / f"{info['collection_name']}.db"
190
- if collection_path.exists():
191
- stats["total_collections"] += 1
192
- stats["total_chunks"] += info.get("chunk_count", 0)
193
- stats["total_characters"] += info.get("total_chars", 0)
194
- stats["total_words"] += info.get("total_words", 0)
195
-
196
- stats["documents"].append({
197
- "doc_id": doc_id,
198
- "url": info["document_url"],
199
- "chunk_count": info.get("chunk_count", 0),
200
- "total_chars": info.get("total_chars", 0),
201
- "total_words": info.get("total_words", 0),
202
- "processed_at": info.get("processed_at", "unknown")
203
- })
204
-
205
- # Add averages
206
- if stats["total_documents"] > 0:
207
- stats["avg_chunks_per_doc"] = stats["total_chunks"] / stats["total_documents"]
208
- stats["avg_chars_per_doc"] = stats["total_characters"] / stats["total_documents"]
209
- stats["avg_words_per_doc"] = stats["total_words"] / stats["total_documents"]
210
-
211
- return stats
212
-
213
- def remove_document_metadata(self, doc_id: str) -> bool:
214
- """
215
- Remove document metadata and registry entry.
216
-
217
- Args:
218
- doc_id: Document identifier
219
-
220
- Returns:
221
- bool: True if successfully removed, False otherwise
222
- """
223
- try:
224
- # Remove individual metadata file
225
- metadata_path = self.base_db_path / f"{doc_id}_metadata.json"
226
- if metadata_path.exists():
227
- metadata_path.unlink()
228
- print(f"🗑️ Removed metadata file for {doc_id}")
229
-
230
- # Remove from registry
231
- if doc_id in self.processed_docs:
232
- del self.processed_docs[doc_id]
233
- self._save_processed_docs()
234
- print(f"🗑️ Removed registry entry for {doc_id}")
235
-
236
- return True
237
-
238
- except Exception as e:
239
- print(f"❌ Error removing metadata for {doc_id}: {e}")
240
- return False
241
-
242
- def update_document_status(self, doc_id: str, status_info: Dict[str, Any]):
243
- """
244
- Update status information for a document.
245
-
246
- Args:
247
- doc_id: Document identifier
248
- status_info: Status information to update
249
- """
250
- if doc_id in self.processed_docs:
251
- self.processed_docs[doc_id].update(status_info)
252
- self._save_processed_docs()
253
- print(f"✅ Updated status for document {doc_id}")
254
-
255
- def get_registry_path(self) -> str:
256
- """
257
- Get the path to the processed documents registry.
258
-
259
- Returns:
260
- str: Path to registry file
261
- """
262
- return str(self.processed_docs_file)
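A minimal usage sketch of the deleted MetadataManager (a sketch only: it assumes config.config supplies EMBEDDING_MODEL, CHUNK_SIZE and CHUNK_OVERLAP; directory and URL are placeholders):

    from pathlib import Path

    from preprocessing.preprocessing_modules.metadata_manager import MetadataManager

    manager = MetadataManager(Path("./qdrant_db"))  # placeholder storage directory
    url = "https://example.com/report.pdf"          # placeholder URL
    doc_id = manager.generate_doc_id(url)           # "doc_" + first 12 hex chars of the URL's MD5
    if not manager.is_document_processed(url):
        manager.save_document_metadata(chunks=["chunk one", "chunk two"], doc_id=doc_id, document_url=url)
    print(manager.get_collection_stats()["total_documents"])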
 
preprocessing/preprocessing_modules/modular_preprocessor.py DELETED
@@ -1,290 +0,0 @@
1
- """
2
- Modular Document Preprocessor
3
-
4
- Main orchestrator class that uses all preprocessing modules to process documents.
5
- """
6
-
7
- import os
8
- import asyncio
9
- from typing import List, Dict, Any, Union
10
- from pathlib import Path
11
-
12
- from config.config import OUTPUT_DIR
13
- from .pdf_downloader import PDFDownloader
14
- from .file_downloader import FileDownloader
15
- from .text_extractor import TextExtractor
16
- from .text_chunker import TextChunker
17
- from .embedding_manager import EmbeddingManager
18
- from .vector_storage import VectorStorage
19
- from .metadata_manager import MetadataManager
20
-
21
- # Import new extractors
22
- from .docx_extractor import extract_docx
23
- from .pptx_extractor import extract_pptx
24
- from .xlsx_extractor import extract_xlsx
25
- from .image_extractor import extract_image_content
26
-
27
-
28
- class ModularDocumentPreprocessor:
29
- """
30
- Modular document preprocessor that orchestrates the entire preprocessing pipeline.
31
-
32
- This class combines all preprocessing modules to provide a clean interface
33
- for document processing while maintaining separation of concerns.
34
- """
35
-
36
- def __init__(self):
37
- """Initialize the modular document preprocessor."""
38
- # Set up base database path
39
- self.base_db_path = Path(OUTPUT_DIR).resolve()
40
- self._ensure_base_directory()
41
-
42
- # Initialize all modules
43
- self.pdf_downloader = PDFDownloader() # Keep for backward compatibility
44
- self.file_downloader = FileDownloader() # New enhanced downloader
45
- self.text_extractor = TextExtractor()
46
- self.text_chunker = TextChunker()
47
- self.embedding_manager = EmbeddingManager()
48
- self.vector_storage = VectorStorage(self.base_db_path)
49
- self.metadata_manager = MetadataManager(self.base_db_path)
50
-
51
- print("✅ Modular Document Preprocessor initialized successfully")
52
-
53
- def _ensure_base_directory(self):
54
- """Ensure the base directory exists."""
55
- if not self.base_db_path.exists():
56
- try:
57
- self.base_db_path.mkdir(parents=True, exist_ok=True)
58
- print(f"✅ Created directory: {self.base_db_path}")
59
- except PermissionError:
60
- print(f"⚠️ Directory {self.base_db_path} should exist in production environment")
61
- if not self.base_db_path.exists():
62
- raise RuntimeError(f"Required directory {self.base_db_path} does not exist and cannot be created")
63
-
64
- # Delegate metadata operations to metadata manager
65
- def generate_doc_id(self, document_url: str) -> str:
66
- """Generate a unique document ID from the URL."""
67
- return self.metadata_manager.generate_doc_id(document_url)
68
-
69
- def is_document_processed(self, document_url: str) -> bool:
70
- """Check if a document has already been processed."""
71
- return self.metadata_manager.is_document_processed(document_url)
72
-
73
- def get_document_info(self, document_url: str) -> Dict[str, Any]:
74
- """Get information about a processed document."""
75
- return self.metadata_manager.get_document_info(document_url)
76
-
77
- def list_processed_documents(self) -> Dict[str, Dict]:
78
- """List all processed documents."""
79
- return self.metadata_manager.list_processed_documents()
80
-
81
- def get_collection_stats(self) -> Dict[str, Any]:
82
- """Get statistics about all collections."""
83
- return self.metadata_manager.get_collection_stats()
84
-
85
- async def process_document(self, document_url: str, force_reprocess: bool = False, timeout: int = 300) -> Union[str, List]:
86
- """
87
- Process a single document: download, extract, chunk, embed, and store.
88
-
89
- Args:
90
- document_url: URL of the document (PDF, DOCX, PPTX, XLSX, images, etc.)
91
- force_reprocess: If True, reprocess even if already processed
92
- timeout: Download timeout in seconds (default: 300s/5min)
93
-
94
- Returns:
95
- str: Document ID for normal processing
96
- List: [content, type] for special handling (oneshot, tabular, image)
97
- """
98
- doc_id = self.generate_doc_id(document_url)
99
-
100
- # Check if already processed
101
- if not force_reprocess and self.is_document_processed(document_url):
102
- print(f"✅ Document {doc_id} already processed, skipping...")
103
- return doc_id
104
-
105
- print(f"🚀 Processing document: {doc_id}")
106
- print(f"📄 URL: {document_url}")
107
-
108
- temp_file_path = None
109
- try:
110
- # Step 1: Download file (enhanced to handle multiple types)
111
- temp_file_path, ext = await self.file_downloader.download_file(document_url, timeout=timeout)
112
-
113
- if temp_file_path == 'not supported':
114
- return ['unsupported', ext]
115
-
116
- # Step 2: Extract text based on file type
117
- full_text = ""
118
- match ext:
119
- case 'pdf':
120
- full_text = await self.text_extractor.extract_text_from_pdf(temp_file_path)
121
-
122
- case 'docx':
123
- full_text = extract_docx(temp_file_path)
124
-
125
- case 'pptx':
126
- full_text = extract_pptx(temp_file_path)
127
- return [full_text, 'oneshot']
128
-
129
- case 'url':
130
- new_context = "URL for Context: " + temp_file_path
131
- return [new_context, 'oneshot']
132
-
133
- case 'txt':
134
- with open(temp_file_path, 'r', encoding='utf-8') as f:
135
- full_text = f.read()
136
-
137
- case 'xlsx':
138
- full_text = extract_xlsx(temp_file_path)
139
- # Print a short preview (10-15 chars) to verify extraction
140
- try:
141
- preview = ''.join(full_text.split())[:15]
142
- if preview:
143
- print(f"🔎 XLSX extracted preview: {preview}")
144
- except Exception:
145
- pass
146
- return [full_text, 'tabular']
147
-
148
- case 'csv':
149
- with open(temp_file_path, 'r', encoding='utf-8') as f:
150
- full_text = f.read()
151
- return [full_text, 'tabular']
152
-
153
- case 'png' | 'jpeg' | 'jpg':
154
- # Don't clean up image files - they'll be cleaned up by the caller
155
- return [temp_file_path, 'image', True] # Third element indicates no cleanup needed
156
-
157
- case _:
158
- raise Exception(f"Unsupported file type: {ext}")
159
-
160
- # Validate extracted text
161
- if not self.text_extractor.validate_extracted_text(full_text):
162
- raise Exception("No meaningful text extracted from document")
163
-
164
- # Step 3: Create chunks
165
- chunks = self.text_chunker.chunk_text(full_text)
166
-
167
- # Check if document is too short for chunking
168
- if len(chunks) < 5:
169
- print(f"Only {len(chunks)} chunks formed, going for oneshot.")
170
- return [full_text, 'oneshot']
171
-
172
- if not chunks:
173
- raise Exception("No chunks created from text")
174
-
175
- # Log chunk statistics
176
- chunk_stats = self.text_chunker.get_chunk_stats(chunks)
177
- print(f"📊 Chunk Statistics: {chunk_stats['total_chunks']} chunks, "
178
- f"avg size: {chunk_stats['avg_chunk_size']:.0f} chars")
179
-
180
- # Step 4: Create embeddings
181
- embeddings = await self.embedding_manager.create_embeddings(chunks)
182
-
183
- # Validate embeddings
184
- if not self.embedding_manager.validate_embeddings(embeddings, len(chunks)):
185
- raise Exception("Invalid embeddings generated")
186
-
187
- # Step 5: Store in Qdrant
188
- await self.vector_storage.store_in_qdrant(chunks, embeddings, doc_id)
189
-
190
- # Step 6: Save metadata
191
- self.metadata_manager.save_document_metadata(chunks, doc_id, document_url)
192
-
193
- print(f"✅ Document {doc_id} processed successfully: {len(chunks)} chunks")
194
- return doc_id
195
-
196
- except Exception as e:
197
- print(f"❌ Error processing document {doc_id}: {str(e)}")
198
- raise
199
- finally:
200
- # Clean up temporary file - but NOT for images since they need the file path
201
- if temp_file_path and ext not in ['png', 'jpeg', 'jpg']:
202
- self.file_downloader.cleanup_temp_file(temp_file_path)
203
-
204
- async def process_multiple_documents(self, document_urls: List[str], force_reprocess: bool = False) -> Dict[str, str]:
205
- """
206
- Process multiple documents concurrently.
207
-
208
- Args:
209
- document_urls: List of PDF URLs
210
- force_reprocess: If True, reprocess even if already processed
211
-
212
- Returns:
213
- Dict[str, str]: Mapping of URLs to document IDs
214
- """
215
- print(f"🚀 Processing {len(document_urls)} documents...")
216
-
217
- results = {}
218
-
219
- # Process documents concurrently (with limited concurrency)
220
- semaphore = asyncio.Semaphore(3) # Limit to 3 concurrent downloads
221
-
222
- async def process_single(url):
223
- async with semaphore:
224
- try:
225
- doc_id = await self.process_document(url, force_reprocess)
226
- return url, doc_id
227
- except Exception as e:
228
- print(f"❌ Failed to process {url}: {str(e)}")
229
- return url, None
230
-
231
- tasks = [process_single(url) for url in document_urls]
232
- completed_tasks = await asyncio.gather(*tasks, return_exceptions=True)
233
-
234
- for result in completed_tasks:
235
- if isinstance(result, tuple):
236
- url, doc_id = result
237
- if doc_id:
238
- results[url] = doc_id
239
-
240
- print(f"✅ Successfully processed {len(results)}/{len(document_urls)} documents")
241
- return results
242
-
243
- def get_system_info(self) -> Dict[str, Any]:
244
- """
245
- Get information about the preprocessing system.
246
-
247
- Returns:
248
- Dict[str, Any]: System information
249
- """
250
- return {
251
- "base_db_path": str(self.base_db_path),
252
- "embedding_model": self.embedding_manager.get_model_info(),
253
- "text_chunker_config": {
254
- "chunk_size": self.text_chunker.chunk_size,
255
- "chunk_overlap": self.text_chunker.chunk_overlap
256
- },
257
- "processed_documents_registry": self.metadata_manager.get_registry_path(),
258
- "collection_stats": self.get_collection_stats()
259
- }
260
-
261
- def cleanup_document(self, document_url: str) -> bool:
262
- """
263
- Remove all data for a specific document.
264
-
265
- Args:
266
- document_url: URL of the document to clean up
267
-
268
- Returns:
269
- bool: True if successfully cleaned up
270
- """
271
- doc_id = self.generate_doc_id(document_url)
272
-
273
- try:
274
- # Remove vector storage
275
- vector_removed = self.vector_storage.delete_collection(doc_id)
276
-
277
- # Remove metadata
278
- metadata_removed = self.metadata_manager.remove_document_metadata(doc_id)
279
-
280
- success = vector_removed and metadata_removed
281
- if success:
282
- print(f"✅ Successfully cleaned up document {doc_id}")
283
- else:
284
- print(f"⚠️ Partial cleanup for document {doc_id}")
285
-
286
- return success
287
-
288
- except Exception as e:
289
- print(f"❌ Error cleaning up document {doc_id}: {e}")
290
- return False
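A minimal usage sketch of the deleted orchestrator, showing how the two return shapes of process_document were presumably handled (placeholder URL; assumes config.config and the module dependencies above):

    import asyncio

    from preprocessing.preprocessing_modules.modular_preprocessor import ModularDocumentPreprocessor

    async def demo() -> None:
        pre = ModularDocumentPreprocessor()
        result = await pre.process_document("https://example.com/report.pdf")  # placeholder URL
        if isinstance(result, list):
            content, kind = result[0], result[1]  # 'oneshot', 'tabular', 'image' or 'unsupported'
            print(f"special handling required: {kind}")
        else:
            print(f"chunks embedded and stored under doc_id={result}")

    asyncio.run(demo())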
 
preprocessing/preprocessing_modules/pdf_downloader.py DELETED
@@ -1,112 +0,0 @@
1
- """
2
- PDF Downloader Module
3
-
4
- Handles downloading PDFs from URLs with retry logic and progress tracking.
5
- """
6
-
7
- import os
8
- import asyncio
9
- import tempfile
10
- import aiohttp
11
- from typing import Optional
12
-
13
-
14
- class PDFDownloader:
15
- """Handles PDF downloading with enhanced error handling and retry logic."""
16
-
17
- def __init__(self):
18
- """Initialize the PDF downloader."""
19
- pass
20
-
21
- async def download_pdf(self, url: str, timeout: int = 300, max_retries: int = 3) -> str:
22
- """
23
- Download PDF from URL to a temporary file with enhanced error handling.
24
-
25
- Args:
26
- url: URL of the PDF to download
27
- timeout: Download timeout in seconds (default: 300s/5min)
28
- max_retries: Maximum number of retry attempts
29
-
30
- Returns:
31
- str: Path to the downloaded temporary file
32
-
33
- Raises:
34
- Exception: If download fails after all retries
35
- """
36
- print(f"📥 Downloading PDF from: {url[:50]}...")
37
-
38
- for attempt in range(max_retries):
39
- try:
40
- # Enhanced timeout settings for large files
41
- timeout_config = aiohttp.ClientTimeout(
42
- total=timeout, # Total timeout
43
- connect=30, # Connection timeout
44
- sock_read=120 # Socket read timeout
45
- )
46
-
47
- async with aiohttp.ClientSession(timeout=timeout_config) as session:
48
- print(f" Attempt {attempt + 1}/{max_retries} (timeout: {timeout}s)")
49
-
50
- async with session.get(url) as response:
51
- if response.status != 200:
52
- raise Exception(f"Failed to download PDF: HTTP {response.status}")
53
-
54
- # Get content length for progress tracking
55
- content_length = response.headers.get('content-length')
56
- if content_length:
57
- total_size = int(content_length)
58
- print(f" File size: {total_size / (1024*1024):.1f} MB")
59
-
60
- # Create temporary file
61
- temp_file = tempfile.NamedTemporaryFile(
62
- delete=False,
63
- suffix=".pdf",
64
- prefix="preprocess_"
65
- )
66
-
67
- # Write content to temporary file with progress tracking
68
- downloaded = 0
69
- async for chunk in response.content.iter_chunked(16384): # Larger chunks
70
- temp_file.write(chunk)
71
- downloaded += len(chunk)
72
-
73
- # Show progress for large files
74
- if content_length and downloaded % (1024*1024) == 0: # Every MB
75
- progress = (downloaded / total_size) * 100
76
- print(f" Progress: {progress:.1f}% ({downloaded/(1024*1024):.1f} MB)")
77
-
78
- temp_file.close()
79
- print(f"✅ PDF downloaded successfully: {temp_file.name}")
80
- return temp_file.name
81
-
82
- except asyncio.TimeoutError:
83
- print(f" ⏰ Timeout on attempt {attempt + 1}")
84
- if attempt < max_retries - 1:
85
- wait_time = (attempt + 1) * 30 # Increasing wait time
86
- print(f" ⏳ Waiting {wait_time}s before retry...")
87
- await asyncio.sleep(wait_time)
88
- continue
89
-
90
- except Exception as e:
91
- print(f" ❌ Error on attempt {attempt + 1}: {str(e)}")
92
- if attempt < max_retries - 1:
93
- wait_time = (attempt + 1) * 15
94
- print(f" ⏳ Waiting {wait_time}s before retry...")
95
- await asyncio.sleep(wait_time)
96
- continue
97
-
98
- raise Exception(f"Failed to download PDF after {max_retries} attempts")
99
-
100
- def cleanup_temp_file(self, temp_path: str) -> None:
101
- """
102
- Clean up temporary file.
103
-
104
- Args:
105
- temp_path: Path to the temporary file to delete
106
- """
107
- if temp_path and os.path.exists(temp_path):
108
- try:
109
- os.unlink(temp_path)
110
- print(f"🗑️ Cleaned up temporary file: {temp_path}")
111
- except Exception as e:
112
- print(f"⚠️ Warning: Could not delete temporary file {temp_path}: {e}")
 
preprocessing/preprocessing_modules/pptx_extractor.py DELETED
@@ -1,118 +0,0 @@
1
- from pptx import Presentation
2
- from pptx.enum.shapes import MSO_SHAPE_TYPE
3
- from typing import List, Dict, Any
4
- from PIL import Image
5
- from io import BytesIO
6
- import requests
7
- from concurrent.futures import ThreadPoolExecutor, as_completed
8
- import tempfile
9
- import os
10
- import sys
11
- sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
12
- from config import config
13
-
14
- # OCR Space API configuration
15
- API_KEY = getattr(config, 'OCR_SPACE_API_KEY', None)
16
- API_URL = "https://api.ocr.space/parse/image"
17
-
18
- def ocr_space_file(filename, api_key=API_KEY, overlay=False, language="eng"):
19
- """Extract text from image file using OCR Space API"""
20
- if not api_key:
21
- return filename, "OCR API key not configured"
22
-
23
- payload = {
24
- "isOverlayRequired": overlay,
25
- "apikey": api_key,
26
- "language": language,
27
- "detectOrientation": True,
28
- "scale": True,
29
- "isTable": False,
30
- "OCREngine": 2
31
- }
32
- try:
33
- with open(filename, "rb") as f:
34
- response = requests.post(API_URL, files={filename: f}, data=payload, timeout=30)
35
-
36
- if response.status_code != 200:
37
- return filename, f"API Error: HTTP {response.status_code}"
38
-
39
- parsed = response.json()
40
-
41
- if parsed.get("OCRExitCode") == 1:
42
- parsed_text = parsed.get("ParsedResults", [{}])[0].get("ParsedText", "")
43
- return filename, parsed_text
44
- else:
45
- error_msg = parsed.get("ErrorMessage", ["Unknown error"])[0] if parsed.get("ErrorMessage") else "Unknown OCR error"
46
- return filename, f"OCR Error: {error_msg}"
47
-
48
- except requests.exceptions.Timeout:
49
- return filename, "Error: Request timeout"
50
- except requests.exceptions.RequestException as e:
51
- return filename, f"Error: Network error - {str(e)}"
52
- except Exception as e:
53
- return filename, f"Error: {e}"
54
-
55
- def extract_pptx(pptx_path: str) -> str:
56
- """Extract text and images from PowerPoint presentations."""
57
- try:
58
- prs = Presentation(pptx_path)
59
- except Exception as e:
60
- return f"Error loading PowerPoint file: {str(e)}"
61
-
62
- all_content = []
63
- temp_files = []
64
-
65
- try:
66
- for slide_idx, slide in enumerate(prs.slides):
67
- slide_content = [f"\n=== Slide {slide_idx + 1} ===\n"]
68
- slide_images = []
69
-
70
- for shape in slide.shapes:
71
- # Extract text
72
- if hasattr(shape, "text") and shape.text.strip():
73
- slide_content.append(shape.text.strip())
74
-
75
- # Extract images
76
- elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
77
- try:
78
- image = shape.image
79
- image_bytes = image.blob
80
-
81
- # Save image to temp file
82
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
83
- temp_file.write(image_bytes)
84
- temp_file.close()
85
- temp_files.append(temp_file.name)
86
- slide_images.append(temp_file.name)
87
- except Exception as e:
88
- slide_content.append(f"[Image extraction error: {str(e)}]")
89
-
90
- # Process images with OCR if API key is available
91
- if slide_images and API_KEY:
92
- try:
93
- with ThreadPoolExecutor(max_workers=3) as executor:
94
- future_to_filename = {
95
- executor.submit(ocr_space_file, img_file): img_file
96
- for img_file in slide_images
97
- }
98
-
99
- for future in as_completed(future_to_filename):
100
- filename, ocr_result = future.result()
101
- if ocr_result and not ocr_result.startswith("Error") and not ocr_result.startswith("OCR Error"):
102
- slide_content.append(f"[Image Text]: {ocr_result}")
103
- except Exception as e:
104
- slide_content.append(f"[OCR processing error: {str(e)}]")
105
- elif slide_images:
106
- slide_content.append(f"[{len(slide_images)} images found - OCR not available]")
107
-
108
- all_content.append("\n".join(slide_content))
109
-
110
- finally:
111
- # Clean up temp files
112
- for temp_file in temp_files:
113
- try:
114
- os.unlink(temp_file)
115
- except:
116
- pass
117
-
118
- return "\\n\\n".join(all_content)
 
preprocessing/preprocessing_modules/text_chunker.py DELETED
@@ -1,167 +0,0 @@
1
- """
2
- Text Chunker Module
3
-
4
- Handles chunking text into smaller pieces with overlap for better context preservation.
5
- """
6
-
7
- import re
8
- from typing import List
9
- from config.config import CHUNK_SIZE, CHUNK_OVERLAP
10
-
11
-
12
- class TextChunker:
13
- """Handles text chunking with overlap and smart boundary detection."""
14
-
15
- def __init__(self):
16
- """Initialize the text chunker."""
17
- self.chunk_size = CHUNK_SIZE
18
- self.chunk_overlap = CHUNK_OVERLAP
19
-
20
- def chunk_text(self, text: str) -> List[str]:
21
- """
22
- Chunk text into smaller pieces with overlap.
23
-
24
- Args:
25
- text: The input text to chunk
26
-
27
- Returns:
28
- List[str]: List of text chunks
29
- """
30
- print(f"✂️ Chunking text into {self.chunk_size} character chunks with {self.chunk_overlap} overlap")
31
-
32
- # Clean the text
33
- cleaned_text = self._clean_text(text)
34
-
35
- chunks = []
36
- start = 0
37
-
38
- while start < len(cleaned_text):
39
- end = start + self.chunk_size
40
-
41
- # Try to end at sentence boundary
42
- if end < len(cleaned_text):
43
- end = self._find_sentence_boundary(cleaned_text, start, end)
44
-
45
- chunk = cleaned_text[start:end].strip()
46
-
47
- # Only add chunk if it's meaningful
48
- if chunk and len(chunk) > 50:
49
- chunks.append(chunk)
50
-
51
- # Move start position with overlap
52
- start = end - self.chunk_overlap
53
- if start >= len(cleaned_text):
54
- break
55
-
56
- print(f"✅ Created {len(chunks)} chunks (size={self.chunk_size}, overlap={self.chunk_overlap})")
57
- return chunks
58
-
59
- def _clean_text(self, text: str) -> str:
60
- """
61
- Clean text by normalizing whitespace and removing excessive line breaks.
62
-
63
- Args:
64
- text: Raw text to clean
65
-
66
- Returns:
67
- str: Cleaned text
68
- """
69
- # Replace multiple whitespace with single space
70
- text = re.sub(r'\s+', ' ', text)
71
- return text.strip()
72
-
73
- def _find_sentence_boundary(self, text: str, start: int, preferred_end: int) -> int:
74
- """
75
- Find the best sentence boundary near the preferred end position.
76
-
77
- Args:
78
- text: The full text
79
- start: Start position of the chunk
80
- preferred_end: Preferred end position
81
-
82
- Returns:
83
- int: Adjusted end position at sentence boundary
84
- """
85
- # Look for sentence endings within a reasonable range
86
- search_start = max(start, preferred_end - 100)
87
- search_end = min(len(text), preferred_end + 50)
88
-
89
- sentence_endings = ['.', '!', '?']
90
- best_end = preferred_end
91
-
92
- # Search backwards from preferred end for sentence boundary
93
- for i in range(preferred_end - 1, search_start - 1, -1):
94
- if text[i] in sentence_endings:
95
- # Check if this looks like a real sentence ending
96
- if self._is_valid_sentence_ending(text, i):
97
- best_end = i + 1
98
- break
99
-
100
- return best_end
101
-
102
- def _is_valid_sentence_ending(self, text: str, pos: int) -> bool:
103
- """
104
- Check if a punctuation mark represents a valid sentence ending.
105
-
106
- Args:
107
- text: The full text
108
- pos: Position of the punctuation mark
109
-
110
- Returns:
111
- bool: True if it's a valid sentence ending
112
- """
113
- # Avoid breaking on abbreviations like "Dr.", "Mr.", etc.
114
- if pos > 0 and text[pos] == '.':
115
- # Look at the character before the period
116
- char_before = text[pos - 1]
117
- if char_before.isupper():
118
- # Might be an abbreviation
119
- word_start = pos - 1
120
- while word_start > 0 and text[word_start - 1].isalpha():
121
- word_start -= 1
122
-
123
- word = text[word_start:pos]
124
- # Common abbreviations to avoid breaking on
125
- abbreviations = {'Dr', 'Mr', 'Mrs', 'Ms', 'Prof', 'Inc', 'Ltd', 'Corp', 'Co'}
126
- if word in abbreviations:
127
- return False
128
-
129
- # Check if there's a space or newline after the punctuation
130
- if pos + 1 < len(text):
131
- next_char = text[pos + 1]
132
- return next_char.isspace() or next_char.isupper()
133
-
134
- return True
135
-
136
- def get_chunk_stats(self, chunks: List[str]) -> dict:
137
- """
138
- Get statistics about the created chunks.
139
-
140
- Args:
141
- chunks: List of text chunks
142
-
143
- Returns:
144
- dict: Statistics about the chunks
145
- """
146
- if not chunks:
147
- return {
148
- "total_chunks": 0,
149
- "total_characters": 0,
150
- "total_words": 0,
151
- "avg_chunk_size": 0,
152
- "min_chunk_size": 0,
153
- "max_chunk_size": 0
154
- }
155
-
156
- chunk_sizes = [len(chunk) for chunk in chunks]
157
- total_chars = sum(chunk_sizes)
158
- total_words = sum(len(chunk.split()) for chunk in chunks)
159
-
160
- return {
161
- "total_chunks": len(chunks),
162
- "total_characters": total_chars,
163
- "total_words": total_words,
164
- "avg_chunk_size": total_chars / len(chunks),
165
- "min_chunk_size": min(chunk_sizes),
166
- "max_chunk_size": max(chunk_sizes)
167
- }
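A minimal usage sketch of the deleted TextChunker (chunk size and overlap come from config.config, which is assumed to be importable):

    from preprocessing.preprocessing_modules.text_chunker import TextChunker

    chunker = TextChunker()
    chunks = chunker.chunk_text("Some long extracted document text. " * 200)
    print(chunker.get_chunk_stats(chunks))  # total_chunks, avg_chunk_size, min/max sizes, ...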
 
preprocessing/preprocessing_modules/text_extractor.py DELETED
@@ -1,62 +0,0 @@
1
- """
2
- Text Extractor Module
3
-
4
- Handles extracting text content from PDF files.
5
- """
6
-
7
- import pdfplumber
8
-
9
-
10
- class TextExtractor:
11
- """Handles text extraction from PDF files."""
12
-
13
- def __init__(self):
14
- """Initialize the text extractor."""
15
- pass
16
-
17
- async def extract_text_from_pdf(self, pdf_path: str) -> str:
18
- """
19
- Extract text from PDF file.
20
-
21
- Args:
22
- pdf_path: Path to the PDF file
23
-
24
- Returns:
25
- str: Extracted text content
26
-
27
- Raises:
28
- Exception: If text extraction fails
29
- """
30
- print(f"📖 Extracting text from PDF...")
31
-
32
- full_text = ""
33
- try:
34
- with pdfplumber.open(pdf_path) as pdf:
35
- for page_num, page in enumerate(pdf.pages):
36
- text = page.extract_text()
37
- if text:
38
- full_text += f"\n--- Page {page_num + 1} ---\n"
39
- full_text += text
40
-
41
- print(f"✅ Extracted {len(full_text)} characters from PDF")
42
- return full_text
43
-
44
- except Exception as e:
45
- raise Exception(f"Failed to extract text from PDF: {str(e)}")
46
-
47
- def validate_extracted_text(self, text: str) -> bool:
48
- """
49
- Validate that extracted text is not empty and contains meaningful content.
50
-
51
- Args:
52
- text: The extracted text to validate
53
-
54
- Returns:
55
- bool: True if text is valid, False otherwise
56
- """
57
- if not text or not text.strip():
58
- return False
59
-
60
- # Check if text has at least some alphabetic characters
61
- alphabetic_chars = sum(1 for char in text if char.isalpha())
62
- return alphabetic_chars > 50 # At least 50 alphabetic characters
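A minimal usage sketch of the deleted TextExtractor (the PDF path is a placeholder; pdfplumber must be installed):

    import asyncio

    from preprocessing.preprocessing_modules.text_extractor import TextExtractor

    async def demo() -> None:
        extractor = TextExtractor()
        text = await extractor.extract_text_from_pdf("report.pdf")  # placeholder path
        if extractor.validate_extracted_text(text):
            print(f"extracted {len(text)} characters")

    asyncio.run(demo())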
 
preprocessing/preprocessing_modules/vector_storage.py DELETED
@@ -1,212 +0,0 @@
- """
- Vector Storage Module
-
- Handles storing chunks and embeddings in Qdrant vector database.
- """
-
- import numpy as np
- from typing import List
- from pathlib import Path
- from qdrant_client import QdrantClient
- from qdrant_client.models import Distance, VectorParams, PointStruct
-
-
- class VectorStorage:
-     """Handles vector storage operations with Qdrant."""
-
-     def __init__(self, base_db_path: Path):
-         """
-         Initialize the vector storage.
-
-         Args:
-             base_db_path: Base path for storing Qdrant databases
-         """
-         self.base_db_path = base_db_path
-
-     async def store_in_qdrant(self, chunks: List[str], embeddings: np.ndarray, doc_id: str):
-         """
-         Store chunks and embeddings in Qdrant.
-
-         Args:
-             chunks: List of text chunks
-             embeddings: Corresponding embeddings array
-             doc_id: Document identifier
-         """
-         if len(chunks) != embeddings.shape[0]:
-             raise ValueError(f"Chunk count ({len(chunks)}) doesn't match embedding count ({embeddings.shape[0]})")
-
-         collection_name = f"{doc_id}_collection"
-         db_path = self.base_db_path / f"{collection_name}.db"
-         client = QdrantClient(path=str(db_path))
-
-         print(f"💾 Storing {len(chunks)} vectors in collection: {collection_name}")
-
-         try:
-             # Create or recreate collection
-             await self._setup_collection(client, collection_name, embeddings.shape[1])
-
-             # Prepare and upload points
-             await self._upload_points(client, collection_name, chunks, embeddings, doc_id)
-
-             print(f"✅ Successfully stored all vectors in Qdrant")
-
-         finally:
-             client.close()
-
-     async def _setup_collection(self, client: QdrantClient, collection_name: str, embedding_dim: int):
-         """
-         Set up Qdrant collection, recreating if it exists.
-
-         Args:
-             client: Qdrant client
-             collection_name: Name of the collection
-             embedding_dim: Dimension of embeddings
-         """
-         # Delete existing collection if it exists
-         try:
-             client.delete_collection(collection_name)
-             print(f"🗑️ Deleted existing collection: {collection_name}")
-         except Exception:
-             pass  # Collection might not exist
-
-         # Create new collection
-         client.create_collection(
-             collection_name=collection_name,
-             vectors_config=VectorParams(
-                 size=embedding_dim,
-                 distance=Distance.COSINE
-             )
-         )
-         print(f"✅ Created new collection: {collection_name}")
-
-     async def _upload_points(self, client: QdrantClient, collection_name: str,
-                              chunks: List[str], embeddings: np.ndarray, doc_id: str):
-         """
-         Upload points to Qdrant collection in batches.
-
-         Args:
-             client: Qdrant client
-             collection_name: Name of the collection
-             chunks: Text chunks
-             embeddings: Embedding vectors
-             doc_id: Document identifier
-         """
-         # Prepare points
-         points = []
-         for i in range(len(chunks)):
-             points.append(
-                 PointStruct(
-                     id=i,
-                     vector=embeddings[i].tolist(),
-                     payload={
-                         "text": chunks[i],
-                         "chunk_id": i,
-                         "doc_id": doc_id,
-                         "char_count": len(chunks[i]),
-                         "word_count": len(chunks[i].split())
-                     }
-                 )
-             )
-
-         # Upload in batches to handle large documents
-         batch_size = 100
-         total_batches = (len(points) + batch_size - 1) // batch_size
-
-         for i in range(0, len(points), batch_size):
-             batch = points[i:i + batch_size]
-             batch_num = (i // batch_size) + 1
-
-             print(f" Uploading batch {batch_num}/{total_batches} ({len(batch)} points)")
-             client.upsert(collection_name=collection_name, points=batch)
-
-         print(f"✅ Uploaded {len(points)} points in {total_batches} batches")
-
-     def collection_exists(self, doc_id: str) -> bool:
-         """
-         Check if a collection exists for the given document ID.
-
-         Args:
-             doc_id: Document identifier
-
-         Returns:
-             bool: True if collection exists, False otherwise
-         """
-         collection_name = f"{doc_id}_collection"
-         db_path = self.base_db_path / f"{collection_name}.db"
-         return db_path.exists()
-
-     def get_collection_info(self, doc_id: str) -> dict:
-         """
-         Get information about a collection.
-
-         Args:
-             doc_id: Document identifier
-
-         Returns:
-             dict: Collection information
-         """
-         collection_name = f"{doc_id}_collection"
-         db_path = self.base_db_path / f"{collection_name}.db"
-
-         if not db_path.exists():
-             return {
-                 "collection_name": collection_name,
-                 "exists": False,
-                 "path": str(db_path)
-             }
-
-         try:
-             client = QdrantClient(path=str(db_path))
-             try:
-                 collection_info = client.get_collection(collection_name)
-                 return {
-                     "collection_name": collection_name,
-                     "exists": True,
-                     "path": str(db_path),
-                     "vectors_count": collection_info.vectors_count,
-                     "status": collection_info.status
-                 }
-             finally:
-                 client.close()
-         except Exception as e:
-             return {
-                 "collection_name": collection_name,
-                 "exists": True,
-                 "path": str(db_path),
-                 "error": str(e)
-             }
-
-     def delete_collection(self, doc_id: str) -> bool:
-         """
-         Delete a collection and its database file.
-
-         Args:
-             doc_id: Document identifier
-
-         Returns:
-             bool: True if successfully deleted, False otherwise
-         """
-         collection_name = f"{doc_id}_collection"
-         db_path = self.base_db_path / f"{collection_name}.db"
-
-         try:
-             if db_path.exists():
-                 # Try to delete collection properly first
-                 try:
-                     client = QdrantClient(path=str(db_path))
-                     client.delete_collection(collection_name)
-                     client.close()
-                 except Exception:
-                     pass  # Collection might not exist or be corrupted
-
-                 # Remove database directory
-                 import shutil
-                 shutil.rmtree(db_path, ignore_errors=True)
-                 print(f"🗑️ Deleted collection: {collection_name}")
-                 return True
-
-         except Exception as e:
-             print(f"❌ Error deleting collection {collection_name}: {e}")
-             return False
-
-         return True  # Nothing to delete
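A hedged sketch of reading one of these collections back with the same local-path Qdrant mode used above. The base directory, document id, and the 384-dimensional random query vector are placeholders; the real base path and embedding model are configured elsewhere in the repository.

import numpy as np
from pathlib import Path
from qdrant_client import QdrantClient

doc_id = "sample_doc"                                      # placeholder document id
db_path = Path("vector_dbs") / f"{doc_id}_collection.db"   # assumed base directory layout

client = QdrantClient(path=str(db_path))
try:
    hits = client.search(
        collection_name=f"{doc_id}_collection",
        query_vector=np.random.rand(384).tolist(),  # stand-in for a real query embedding
        limit=3,
    )
    for hit in hits:
        # payload keys match the PointStruct payload written by _upload_points
        print(hit.score, hit.payload["chunk_id"], hit.payload["text"][:60])
finally:
    client.close()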
preprocessing/preprocessing_modules/xlsx_extractor.py DELETED
@@ -1,119 +0,0 @@
- from openpyxl import load_workbook
- from openpyxl.drawing.image import Image as OpenPyXLImage
- from typing import List, Dict, Any
- from PIL import Image
- from io import BytesIO
- import pytesseract
- import os
- import pandas as pd
-
- def extract_xlsx(xlsx_path: str, tesseract_cmd: str = None) -> str:
-     """Extract data from Excel files including text and images."""
-     if tesseract_cmd:
-         pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
-
-     try:
-         wb = load_workbook(xlsx_path, data_only=True)
-     except Exception as e:
-         return f"Error loading Excel file: {str(e)}"
-
-     all_sheets_content: list[str] = []
-     preview_text: str | None = None
-     any_data_found = False
-
-     for sheet in wb.worksheets:
-         sheet_content = [f"\n=== Sheet: {sheet.title} ===\n"]
-
-         # Extract table data
-         has_data = False
-         non_empty_rows = 0
-         for row in sheet.iter_rows(max_row=sheet.max_row, values_only=True):
-             if row is None or all(cell is None for cell in row):
-                 continue  # skip completely empty rows
-             has_data = True
-             non_empty_rows += 1
-             any_data_found = True
-             row_data = [str(cell).strip() if cell is not None else "" for cell in row]
-             joined = " | ".join(row_data)
-             sheet_content.append(joined)
-             if preview_text is None and joined.strip():
-                 preview_text = joined[:15]
-
-         if not has_data:
-             sheet_content.append("[No data in this sheet]")
-             print(f"ℹ️ XLSX: Sheet '{sheet.title}' has no data (openpyxl)")
-         else:
-             print(f"🧾 XLSX: Sheet '{sheet.title}' non-empty rows: {non_empty_rows}")
-
-         # Extract images from the sheet
-         if hasattr(sheet, '_images'):
-             image_count = 0
-             for img in sheet._images:
-                 try:
-                     if hasattr(img, '_data'):  # if it's a real OpenPyXL Image
-                         image_data = img._data()
-                     elif hasattr(img, '_ref'):
-                         continue  # cell ref-only images; ignore
-                     else:
-                         continue
-
-                     pil_img = Image.open(BytesIO(image_data))
-                     try:
-                         ocr_text = pytesseract.image_to_string(pil_img).strip()
-                         if ocr_text:
-                             sheet_content.append(f"[Image {image_count + 1} Text]: {ocr_text}")
-                         else:
-                             sheet_content.append(f"[Image {image_count + 1}]: No text detected")
-                     except Exception as ocr_e:
-                         sheet_content.append(f"[Image {image_count + 1}]: OCR failed - {str(ocr_e)}")
-
-                     image_count += 1
-                 except Exception as e:
-                     sheet_content.append(f"[Image extraction error: {str(e)}]")
-
-             if image_count == 0:
-                 sheet_content.append("[No images found in this sheet]")
-
-         all_sheets_content.append("\n".join(sheet_content))
-
-     # If no data found using openpyxl, try pandas fallback (handles some edge cases better)
-     if not any_data_found:
-         print("ℹ️ XLSX: No data via openpyxl, trying pandas fallback…")
-         try:
-             xls = pd.ExcelFile(xlsx_path, engine="openpyxl")
-             pandas_parts = []
-             extracted_sheets = 0
-             for sheet_name in xls.sheet_names:
-                 df = pd.read_excel(xls, sheet_name=sheet_name, dtype=str)
-                 if not df.empty:
-                     any_data_found = True
-                     header = f"\n=== Sheet: {sheet_name} ===\n"
-                     csv_like = df.fillna("").astype(str).to_csv(index=False)
-                     pandas_parts.append(header + csv_like)
-                     extracted_sheets += 1
-                     if preview_text is None:
-                         flat = "".join(csv_like.splitlines())
-                         if flat:
-                             preview_text = flat[:15]
-                 else:
-                     pandas_parts.append(f"\n=== Sheet: {sheet_name} ===\n[No data in this sheet]")
-             if pandas_parts:
-                 all_sheets_content = pandas_parts
-                 print(f"✅ XLSX: Pandas fallback extracted {extracted_sheets} non-empty sheet(s)")
-         except Exception as pe:
-             # If pandas also fails, keep whatever we had
-             all_sheets_content.append(f"[Pandas fallback failed: {str(pe)}]")
-             print(f"❌ XLSX: Pandas fallback failed: {pe}")
-
-     combined = "\n\n".join(all_sheets_content)
-
-     # Print a small preview for verification
-     if preview_text is None:
-         # fallback: take from combined text
-         flat_combined = "".join(combined.splitlines()).strip()
-         if flat_combined:
-             preview_text = flat_combined[:15]
-     if preview_text:
-         print(f"🔎 XLSX content preview: {preview_text}")
-
-     return combined
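Finally, a minimal, hedged call of the helper above. The workbook path and the Windows-style tesseract path are placeholders, and the import path reflects the file's pre-deletion location; omit tesseract_cmd when the binary is already on PATH.

from preprocessing.preprocessing_modules.xlsx_extractor import extract_xlsx  # pre-deletion module path

text = extract_xlsx(
    "data/quarterly_report.xlsx",                                   # placeholder workbook
    tesseract_cmd=r"C:\Program Files\Tesseract-OCR\tesseract.exe",  # only needed if tesseract is not on PATH
)
print(text[:500])  # sheets are delimited by "=== Sheet: <name> ===" headers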