quantumbit committed
Commit 6e4458f · verified · 1 Parent(s): 1860a28

Delete preprocessing

preprocessing/__init__.py DELETED
@@ -1,23 +0,0 @@
1
- # Preprocessing package
2
-
3
- from .preprocessing import DocumentPreprocessor
4
- from .preprocessing_modules import (
5
- PDFDownloader,
6
- TextExtractor,
7
- TextChunker,
8
- EmbeddingManager,
9
- VectorStorage,
10
- MetadataManager,
11
- ModularDocumentPreprocessor
12
- )
13
-
14
- __all__ = [
15
- 'DocumentPreprocessor',
16
- 'PDFDownloader',
17
- 'TextExtractor',
18
- 'TextChunker',
19
- 'EmbeddingManager',
20
- 'VectorStorage',
21
- 'MetadataManager',
22
- 'ModularDocumentPreprocessor'
23
- ]
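For reference, a minimal sketch of how the deleted package's public API was presumably consumed elsewhere in the repo (imports assume the pre-deletion layout; nothing here is confirmed by this commit):

    # Hypothetical consumer of the removed package (pre-deletion layout assumed).
    from preprocessing import DocumentPreprocessor, TextChunker

    preprocessor = DocumentPreprocessor()  # full download -> extract -> chunk -> embed -> store pipeline
    chunker = TextChunker()                # individual modules were also exported for standalone use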
 
preprocessing/preprocessing.py DELETED
@@ -1,63 +0,0 @@
1
- import os
2
- import asyncio
3
- from typing import List, Dict, Any
4
-
5
- from config.config import *
6
- from .preprocessing_modules.modular_preprocessor import ModularDocumentPreprocessor
7
-
8
- # For backward compatibility, create an alias
9
- class DocumentPreprocessor(ModularDocumentPreprocessor):
10
- """Backward compatibility alias for the modular document preprocessor."""
11
- pass
12
-
13
- # CLI interface for preprocessing
14
- async def main():
15
- """Main function for command-line usage."""
16
- import argparse
17
-
18
- parser = argparse.ArgumentParser(description="Document Preprocessing for RAG")
19
- parser.add_argument("--url", type=str, help="Single PDF URL to process")
20
- parser.add_argument("--urls-file", type=str, help="File containing PDF URLs (one per line)")
21
- parser.add_argument("--force", action="store_true", help="Force reprocessing even if already processed")
22
- parser.add_argument("--list", action="store_true", help="List all processed documents")
23
- parser.add_argument("--stats", action="store_true", help="Show collection statistics")
24
-
25
- args = parser.parse_args()
26
-
27
- preprocessor = DocumentPreprocessor()
28
-
29
- if args.list:
30
- docs = preprocessor.list_processed_documents()
31
- print(f"\n📚 Processed Documents ({len(docs)}):")
32
- for doc_id, info in docs.items():
33
- print(f" • {doc_id}: {info['document_url'][:50]}... ({info.get('chunk_count', 'N/A')} chunks)")
34
-
35
- elif args.stats:
36
- stats = preprocessor.get_collection_stats()
37
- print(f"\n📊 Collection Statistics:")
38
- print(f" • Total documents: {stats['total_documents']}")
39
- print(f" • Total collections: {stats['total_collections']}")
40
- print(f" • Total chunks: {stats['total_chunks']}")
41
-
42
- elif args.url:
43
- await preprocessor.process_document(args.url, args.force)
44
-
45
- elif args.urls_file:
46
- if not os.path.exists(args.urls_file):
47
- print(f"❌ File not found: {args.urls_file}")
48
- return
49
-
50
- with open(args.urls_file, 'r') as f:
51
- urls = [line.strip() for line in f if line.strip()]
52
-
53
- if urls:
54
- await preprocessor.process_multiple_documents(urls, args.force)
55
- else:
56
- print("❌ No URLs found in file")
57
-
58
- else:
59
- print("❌ Please provide --url, --urls-file, --list, or --stats")
60
- parser.print_help()
61
-
62
- if __name__ == "__main__":
63
- asyncio.run(main())
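The removed CLI wrapped an async pipeline; a minimal programmatic equivalent is sketched below (a sketch only: it assumes the pre-deletion module path and a working config.config, and the URL is a placeholder):

    import asyncio

    from preprocessing.preprocessing import DocumentPreprocessor

    async def run() -> None:
        preprocessor = DocumentPreprocessor()
        # Roughly equivalent to: python preprocessing/preprocessing.py --url <PDF_URL>
        result = await preprocessor.process_document("https://example.com/sample.pdf")
        print(result)  # doc_id string, or [content, type] for special handling

    asyncio.run(run())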
 
preprocessing/preprocessing_modules/__init__.py DELETED
@@ -1,29 +0,0 @@
1
- # Preprocessing modules
2
-
3
- from .pdf_downloader import PDFDownloader
4
- from .file_downloader import FileDownloader
5
- from .text_extractor import TextExtractor
6
- from .text_chunker import TextChunker
7
- from .embedding_manager import EmbeddingManager
8
- from .vector_storage import VectorStorage
9
- from .metadata_manager import MetadataManager
10
- from .modular_preprocessor import ModularDocumentPreprocessor
11
- from .docx_extractor import extract_docx
12
- from .pptx_extractor import extract_pptx
13
- from .xlsx_extractor import extract_xlsx
14
- from .image_extractor import extract_image_content
15
-
16
- __all__ = [
17
- 'PDFDownloader',
18
- 'FileDownloader',
19
- 'TextExtractor',
20
- 'TextChunker',
21
- 'EmbeddingManager',
22
- 'VectorStorage',
23
- 'MetadataManager',
24
- 'ModularDocumentPreprocessor',
25
- 'extract_docx',
26
- 'extract_pptx',
27
- 'extract_xlsx',
28
- 'extract_image_content'
29
- ]
 
preprocessing/preprocessing_modules/docx_extractor.py DELETED
@@ -1,94 +0,0 @@
1
- from docx import Document
2
- from docx.document import Document as _Document
3
- from docx.table import Table
4
- from docx.text.paragraph import Paragraph
5
- from typing import Union, List, Dict, Any
6
- from PIL import Image
7
- from io import BytesIO
8
- import pytesseract
9
- import os
10
-
11
- from zipfile import ZipFile
12
- from lxml import etree
13
- from pathlib import Path
14
- import io
15
-
16
- def extract_docx(docx_input) -> str:
17
- """Extract text from DOCX files with table and text handling."""
18
- zipf = ZipFile(docx_input)
19
- xml_content = zipf.read("word/document.xml")
20
- tree = etree.fromstring(xml_content)
21
-
22
- ns = {
23
- "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
24
- "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
25
- "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape"
26
- }
27
-
28
- text_blocks = []
29
-
30
- # Extract all tables with gridSpan handling
31
- tables = tree.xpath("//w:tbl", namespaces=ns)
32
- table_elements = set(tables)
33
- table_index = 0
34
-
35
- for tbl in tables:
36
- rows = tbl.xpath("./w:tr", namespaces=ns)
37
- sub_tables = []
38
- current_table = []
39
-
40
- prev_col_count = None
41
- for row in rows:
42
- row_texts = []
43
- cells = row.xpath("./w:tc", namespaces=ns)
44
- col_count = 0
45
-
46
- for cell in cells:
47
- cell_text = ""
48
- paragraphs = cell.xpath(".//w:p", namespaces=ns)
49
- for para in paragraphs:
50
- text_nodes = para.xpath(".//w:t", namespaces=ns)
51
- para_text = "".join(node.text for node in text_nodes if node.text)
52
- if para_text.strip():
53
- cell_text += para_text + " "
54
-
55
- # Handle gridSpan (merged cells)
56
- gridspan_elem = cell.xpath(".//w:gridSpan", namespaces=ns)
57
- span = int(gridspan_elem[0].get(ns["w"] + "val", "1")) if gridspan_elem else 1
58
-
59
- row_texts.append(cell_text.strip())
60
- col_count += span
61
-
62
- if row_texts and any(text.strip() for text in row_texts):
63
- if prev_col_count is not None and col_count != prev_col_count:
64
- # Column count changed, save current table and start new one
65
- if current_table:
66
- sub_tables.append(current_table)
67
- current_table = []
68
-
69
- current_table.append(row_texts)
70
- prev_col_count = col_count
71
-
72
- if current_table:
73
- sub_tables.append(current_table)
74
-
75
- # Format tables
76
- for sub_table in sub_tables:
77
- table_text = f"\n--- Table {table_index + 1} ---\n"
78
- for row in sub_table:
79
- table_text += " | ".join(row) + "\n"
80
- text_blocks.append(table_text)
81
- table_index += 1
82
-
83
- # Extract non-table paragraphs
84
- paragraphs = tree.xpath("//w:p", namespaces=ns)
85
- for para in paragraphs:
86
- # Check if paragraph is inside a table
87
- is_in_table = any(table in para.xpath("ancestor::*") for table in table_elements)
88
- if not is_in_table:
89
- text_nodes = para.xpath(".//w:t", namespaces=ns)
90
- para_text = "".join(node.text for node in text_nodes if node.text)
91
- if para_text.strip():
92
- text_blocks.append(para_text.strip())
93
-
94
- return "\n\n".join(text_blocks)
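A minimal usage sketch of the extractor deleted above (the .docx path is a placeholder; the function accepts anything ZipFile can open):

    from preprocessing.preprocessing_modules.docx_extractor import extract_docx

    # Tables are rendered as "--- Table N ---" blocks with " | "-joined rows,
    # followed by the non-table paragraphs.
    text = extract_docx("example.docx")  # placeholder path
    print(text[:500])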
 
preprocessing/preprocessing_modules/embedding_manager.py DELETED
@@ -1,118 +0,0 @@
1
- """
2
- Embedding Manager Module
3
-
4
- Handles creation of embeddings for text chunks using sentence transformers.
5
- """
6
-
7
- import asyncio
8
- import numpy as np
9
- from typing import List
10
- from sentence_transformers import SentenceTransformer
11
- from config.config import EMBEDDING_MODEL, BATCH_SIZE
12
-
13
-
14
- class EmbeddingManager:
15
- """Handles embedding creation for text chunks."""
16
-
17
- def __init__(self):
18
- """Initialize the embedding manager."""
19
- self.embedding_model = None
20
- self._init_embedding_model()
21
-
22
- def _init_embedding_model(self):
23
- """Initialize the embedding model."""
24
- print(f"🔄 Loading embedding model: {EMBEDDING_MODEL}")
25
- self.embedding_model = SentenceTransformer(EMBEDDING_MODEL)
26
- print(f"✅ Embedding model loaded successfully")
27
-
28
- async def create_embeddings(self, chunks: List[str]) -> np.ndarray:
29
- """
30
- Create embeddings for text chunks.
31
-
32
- Args:
33
- chunks: List of text chunks to embed
34
-
35
- Returns:
36
- np.ndarray: Array of embeddings with shape (num_chunks, embedding_dim)
37
- """
38
- print(f"🧠 Creating embeddings for {len(chunks)} chunks")
39
-
40
- if not chunks:
41
- raise ValueError("No chunks provided for embedding creation")
42
-
43
- def create_embeddings_sync():
44
- """Synchronous embedding creation to run in thread pool."""
45
- embeddings = self.embedding_model.encode(
46
- chunks,
47
- batch_size=BATCH_SIZE,
48
- show_progress_bar=True,
49
- normalize_embeddings=True
50
- )
51
- return np.array(embeddings).astype("float32")
52
-
53
- # Run in thread pool to avoid blocking the event loop
54
- loop = asyncio.get_event_loop()
55
- embeddings = await loop.run_in_executor(None, create_embeddings_sync)
56
-
57
- print(f"✅ Created embeddings with shape: {embeddings.shape}")
58
- return embeddings
59
-
60
- def get_embedding_dimension(self) -> int:
61
- """
62
- Get the dimension of embeddings produced by the model.
63
-
64
- Returns:
65
- int: Embedding dimension
66
- """
67
- if self.embedding_model is None:
68
- raise RuntimeError("Embedding model not initialized")
69
-
70
- # Get dimension from model
71
- return self.embedding_model.get_sentence_embedding_dimension()
72
-
73
- def validate_embeddings(self, embeddings: np.ndarray, expected_count: int) -> bool:
74
- """
75
- Validate that embeddings have the expected shape and properties.
76
-
77
- Args:
78
- embeddings: The embeddings array to validate
79
- expected_count: Expected number of embeddings
80
-
81
- Returns:
82
- bool: True if embeddings are valid, False otherwise
83
- """
84
- if embeddings is None:
85
- return False
86
-
87
- if embeddings.shape[0] != expected_count:
88
- print(f"❌ Embedding count mismatch: expected {expected_count}, got {embeddings.shape[0]}")
89
- return False
90
-
91
- if embeddings.dtype != np.float32:
92
- print(f"❌ Embedding dtype mismatch: expected float32, got {embeddings.dtype}")
93
- return False
94
-
95
- # Check for NaN or infinite values
96
- if np.any(np.isnan(embeddings)) or np.any(np.isinf(embeddings)):
97
- print("❌ Embeddings contain NaN or infinite values")
98
- return False
99
-
100
- print(f"✅ Embeddings validation passed: {embeddings.shape}")
101
- return True
102
-
103
- def get_model_info(self) -> dict:
104
- """
105
- Get information about the embedding model.
106
-
107
- Returns:
108
- dict: Model information
109
- """
110
- if self.embedding_model is None:
111
- return {"model_name": EMBEDDING_MODEL, "status": "not_loaded"}
112
-
113
- return {
114
- "model_name": EMBEDDING_MODEL,
115
- "embedding_dimension": self.get_embedding_dimension(),
116
- "max_sequence_length": getattr(self.embedding_model, 'max_seq_length', 'unknown'),
117
- "status": "loaded"
118
- }
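A minimal usage sketch of the deleted EmbeddingManager, assuming config.config defines EMBEDDING_MODEL and BATCH_SIZE:

    import asyncio

    from preprocessing.preprocessing_modules.embedding_manager import EmbeddingManager

    async def demo() -> None:
        manager = EmbeddingManager()  # loads the SentenceTransformer at construction time
        chunks = ["first example chunk", "second example chunk"]
        embeddings = await manager.create_embeddings(chunks)  # float32, normalized
        assert manager.validate_embeddings(embeddings, expected_count=len(chunks))
        print(embeddings.shape, manager.get_embedding_dimension())

    asyncio.run(demo())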
 
preprocessing/preprocessing_modules/file_downloader.py DELETED
@@ -1,108 +0,0 @@
1
- import aiohttp
2
- import asyncio
3
- import tempfile
4
- import os
5
- import re
6
- from urllib.parse import urlparse
7
- from typing import List, Tuple
8
-
9
- class FileDownloader:
10
- """Enhanced file downloader that supports multiple file types."""
11
-
12
- async def download_file(self, url: str, timeout: int = 300, max_retries: int = 3) -> Tuple[str, str]:
13
- """Download any file type from a URL to a temporary file with enhanced error handling."""
14
- print(f"📥 Downloading file from: {url[:60]}...")
15
-
16
- for attempt in range(max_retries):
17
- try:
18
- timeout_config = aiohttp.ClientTimeout(
19
- total=timeout,
20
- connect=30,
21
- sock_read=120
22
- )
23
-
24
- async with aiohttp.ClientSession(timeout=timeout_config) as session:
25
- print(f" Attempt {attempt + 1}/{max_retries} (timeout: {timeout}s)")
26
-
27
- async with session.get(url) as response:
28
- if response.status != 200:
29
- raise Exception(f"Failed to download file: HTTP {response.status}")
30
-
31
- # Extract filename from header or URL
32
- cd = response.headers.get('Content-Disposition', '')
33
- filename_match = re.findall('filename="?([^"]+)"?', cd)
34
- if filename_match:
35
- filename = filename_match[0]
36
- else:
37
- from urllib.parse import unquote
38
- path = urlparse(url).path
39
- filename = os.path.basename(unquote(path)) # Decode URL encoding
40
-
41
- if not filename:
42
- filename = "downloaded_file"
43
-
44
- ext = os.path.splitext(filename)[1]
45
- if not ext:
46
- return url, "url"
47
-
48
- print(f" 📁 Detected filename: {filename}, extension: {ext}")
49
-
50
- # Check if file type is supported
51
- supported_extensions = ['.pdf', '.docx', '.pptx', '.png', '.xlsx', '.jpeg', '.jpg', '.txt', '.csv']
52
- if ext not in supported_extensions:
53
- # Return extension without dot for consistency
54
- ext_without_dot = ext[1:] if ext.startswith('.') else ext
55
- print(f" ❌ File type not supported: {ext}")
56
- return 'not supported', ext_without_dot
57
-
58
- # Get content length
59
- content_length = response.headers.get('content-length')
60
- if content_length:
61
- total_size = int(content_length)
62
- print(f" File size: {total_size / (1024 * 1024):.1f} MB")
63
-
64
- # Create temp file with same extension
65
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext, prefix="download_")
66
-
67
- # Write to file
68
- downloaded = 0
69
- async for chunk in response.content.iter_chunked(16384):
70
- temp_file.write(chunk)
71
- downloaded += len(chunk)
72
-
73
- if content_length and downloaded % (1024 * 1024) == 0:
74
- progress = (downloaded / total_size) * 100
75
- print(f" Progress: {progress:.1f}% ({downloaded / (1024*1024):.1f} MB)")
76
-
77
- temp_file.close()
78
- print(f"✅ File downloaded successfully: {temp_file.name}")
79
- # Return extension without the dot for consistency with modular_preprocessor
80
- ext_without_dot = ext[1:] if ext.startswith('.') else ext
81
- return temp_file.name, ext_without_dot
82
-
83
- except asyncio.TimeoutError:
84
- print(f" ⏰ Timeout on attempt {attempt + 1}")
85
- if attempt < max_retries - 1:
86
- wait_time = (attempt + 1) * 30
87
- print(f" ⏳ Waiting {wait_time}s before retry...")
88
- await asyncio.sleep(wait_time)
89
- continue
90
-
91
- except Exception as e:
92
- print(f" ❌ Error on attempt {attempt + 1}: {str(e)}")
93
- if attempt < max_retries - 1:
94
- wait_time = (attempt + 1) * 15
95
- print(f" ⏳ Waiting {wait_time}s before retry...")
96
- await asyncio.sleep(wait_time)
97
- continue
98
-
99
- raise Exception(f"Failed to download file after {max_retries} attempts")
100
-
101
- def cleanup_temp_file(self, temp_path: str) -> None:
102
- """Clean up temporary file."""
103
- try:
104
- if os.path.exists(temp_path):
105
- os.unlink(temp_path)
106
- print(f"🗑️ Cleaned up temporary file: {temp_path}")
107
- except Exception as e:
108
- print(f"⚠️ Warning: Could not cleanup temp file {temp_path}: {e}")
 
preprocessing/preprocessing_modules/image_extractor.py DELETED
@@ -1,120 +0,0 @@
1
- import cv2
2
- import pytesseract
3
- import numpy as np
4
- import pandas as pd
5
- from PIL import Image, ImageFile
6
- from typing import List, Dict, Any
7
-
8
- ImageFile.LOAD_TRUNCATED_IMAGES = True
9
-
10
- def load_local_image(path: str) -> np.ndarray:
11
- """Load image from local path."""
12
- img = Image.open(path).convert("RGB")
13
- return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
14
-
15
- def sort_contours(cnts, method="top-to-bottom"):
16
- """Sort contours based on the specified method."""
17
- reverse = False
18
- i = 1 if method == "top-to-bottom" or method == "bottom-to-top" else 0
19
- if method == "right-to-left" or method == "bottom-to-top":
20
- reverse = True
21
- boundingBoxes = [cv2.boundingRect(c) for c in cnts]
22
- (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
23
- key=lambda b: b[1][i], reverse=reverse))
24
- return cnts, boundingBoxes
25
-
26
- def extract_cells_from_grid(table_img: np.ndarray) -> pd.DataFrame:
27
- """Extract table structure from image using OpenCV."""
28
- gray = cv2.cvtColor(table_img, cv2.COLOR_BGR2GRAY)
29
- _, binary = cv2.threshold(~gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
30
-
31
- # Detect horizontal lines
32
- horizontal = binary.copy()
33
- cols = horizontal.shape[1]
34
- horizontal_size = cols // 15
35
- horizontal_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
36
- horizontal = cv2.erode(horizontal, horizontal_structure)
37
- horizontal = cv2.dilate(horizontal, horizontal_structure)
38
-
39
- # Detect vertical lines
40
- vertical = binary.copy()
41
- rows = vertical.shape[0]
42
- vertical_size = rows // 15
43
- vertical_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size))
44
- vertical = cv2.erode(vertical, vertical_structure)
45
- vertical = cv2.dilate(vertical, vertical_structure)
46
-
47
- # Combine mask
48
- mask = cv2.add(horizontal, vertical)
49
- contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
50
-
51
- cells = []
52
- for contour in contours:
53
- x, y, w, h = cv2.boundingRect(contour)
54
- if w > 30 and h > 20: # Filter small contours
55
- cell_img = table_img[y:y+h, x:x+w]
56
- try:
57
- text = pytesseract.image_to_string(cell_img, config='--psm 7').strip()
58
- cells.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': text})
59
- except:
60
- cells.append({'x': x, 'y': y, 'w': w, 'h': h, 'text': ''})
61
-
62
- # Sort cells by position to create table structure
63
- cells.sort(key=lambda cell: (cell['y'], cell['x']))
64
-
65
- # Group cells into rows
66
- rows = []
67
- current_row = []
68
- current_y = 0
69
-
70
- for cell in cells:
71
- if abs(cell['y'] - current_y) > 20: # New row threshold
72
- if current_row:
73
- rows.append(current_row)
74
- current_row = [cell]
75
- current_y = cell['y']
76
- else:
77
- current_row.append(cell)
78
-
79
- if current_row:
80
- rows.append(current_row)
81
-
82
- # Convert to DataFrame
83
- table_data = []
84
- for row in rows:
85
- row_data = [cell['text'] for cell in sorted(row, key=lambda c: c['x'])]
86
- table_data.append(row_data)
87
-
88
- if table_data:
89
- max_cols = max(len(row) for row in table_data)
90
- for row in table_data:
91
- while len(row) < max_cols:
92
- row.append('')
93
- return pd.DataFrame(table_data)
94
- else:
95
- return pd.DataFrame()
96
-
97
- def extract_image_content(image_path: str) -> str:
98
- """Extract text content from images using OCR."""
99
- try:
100
- # Load image
101
- img = load_local_image(image_path)
102
-
103
- # Basic OCR
104
- text = pytesseract.image_to_string(img)
105
-
106
- # Try to detect if it's a table
107
- if '|' in text or '\t' in text or len(text.split('\n')) > 3:
108
- # Try table extraction
109
- try:
110
- table_df = extract_cells_from_grid(img)
111
- if not table_df.empty:
112
- table_text = "\\n".join([" | ".join(row) for row in table_df.values])
113
- return f"[Table detected]\\n{table_text}\\n\\n[OCR Text]\\n{text}"
114
- except:
115
- pass
116
-
117
- return text.strip() if text.strip() else "[No text detected in image]"
118
-
119
- except Exception as e:
120
- return f"[Error processing image: {str(e)}]"
 
preprocessing/preprocessing_modules/metadata_manager.py DELETED
@@ -1,262 +0,0 @@
1
- """
2
- Metadata Manager Module
3
-
4
- Handles document metadata storage and retrieval operations.
5
- """
6
-
7
- import json
8
- import asyncio
9
- import hashlib
10
- from typing import List, Dict, Any
11
- from pathlib import Path
12
- from config.config import EMBEDDING_MODEL, CHUNK_SIZE, CHUNK_OVERLAP
13
-
14
-
15
- class MetadataManager:
16
- """Handles document metadata operations."""
17
-
18
- def __init__(self, base_db_path: Path):
19
- """
20
- Initialize the metadata manager.
21
-
22
- Args:
23
- base_db_path: Base path for storing metadata files
24
- """
25
- self.base_db_path = base_db_path
26
- self.processed_docs_file = self.base_db_path / "processed_documents.json"
27
- self.processed_docs = self._load_processed_docs()
28
-
29
- def _load_processed_docs(self) -> Dict[str, Dict]:
30
- """Load the registry of processed documents."""
31
- if self.processed_docs_file.exists():
32
- try:
33
- with open(self.processed_docs_file, 'r', encoding='utf-8') as f:
34
- return json.load(f)
35
- except Exception as e:
36
- print(f"⚠️ Warning: Could not load processed docs registry: {e}")
37
- return {}
38
-
39
- def _save_processed_docs(self):
40
- """Save the registry of processed documents."""
41
- try:
42
- with open(self.processed_docs_file, 'w', encoding='utf-8') as f:
43
- json.dump(self.processed_docs, f, indent=2, ensure_ascii=False)
44
- except Exception as e:
45
- print(f"⚠️ Warning: Could not save processed docs registry: {e}")
46
-
47
- def generate_doc_id(self, document_url: str) -> str:
48
- """
49
- Generate a unique document ID from the URL.
50
-
51
- Args:
52
- document_url: URL of the document
53
-
54
- Returns:
55
- str: Unique document ID
56
- """
57
- url_hash = hashlib.md5(document_url.encode()).hexdigest()[:12]
58
- return f"doc_{url_hash}"
59
-
60
- def is_document_processed(self, document_url: str) -> bool:
61
- """
62
- Check if a document has already been processed.
63
-
64
- Args:
65
- document_url: URL of the document
66
-
67
- Returns:
68
- bool: True if document is already processed
69
- """
70
- doc_id = self.generate_doc_id(document_url)
71
- return doc_id in self.processed_docs
72
-
73
- def get_document_info(self, document_url: str) -> Dict[str, Any]:
74
- """
75
- Get information about a processed document.
76
-
77
- Args:
78
- document_url: URL of the document
79
-
80
- Returns:
81
- Dict[str, Any]: Document information or empty dict if not found
82
- """
83
- doc_id = self.generate_doc_id(document_url)
84
- return self.processed_docs.get(doc_id, {})
85
-
86
- def save_document_metadata(self, chunks: List[str], doc_id: str, document_url: str):
87
- """
88
- Save document metadata to JSON file and update registry.
89
-
90
- Args:
91
- chunks: List of text chunks
92
- doc_id: Document identifier
93
- document_url: Original document URL
94
- """
95
- # Calculate statistics
96
- total_chars = sum(len(chunk) for chunk in chunks)
97
- total_words = sum(len(chunk.split()) for chunk in chunks)
98
- avg_chunk_size = total_chars / len(chunks) if chunks else 0
99
-
100
- # Create metadata object
101
- metadata = {
102
- "doc_id": doc_id,
103
- "document_url": document_url,
104
- "chunk_count": len(chunks),
105
- "total_chars": total_chars,
106
- "total_words": total_words,
107
- "avg_chunk_size": avg_chunk_size,
108
- "processed_at": asyncio.get_event_loop().time(),
109
- "embedding_model": EMBEDDING_MODEL,
110
- "chunk_size": CHUNK_SIZE,
111
- "chunk_overlap": CHUNK_OVERLAP,
112
- "processing_config": {
113
- "chunk_size": CHUNK_SIZE,
114
- "chunk_overlap": CHUNK_OVERLAP,
115
- "embedding_model": EMBEDDING_MODEL
116
- }
117
- }
118
-
119
- # Save individual document metadata
120
- metadata_path = self.base_db_path / f"{doc_id}_metadata.json"
121
- try:
122
- with open(metadata_path, "w", encoding="utf-8") as f:
123
- json.dump(metadata, f, indent=2, ensure_ascii=False)
124
- print(f"✅ Saved individual metadata for {doc_id}")
125
- except Exception as e:
126
- print(f"⚠️ Warning: Could not save individual metadata for {doc_id}: {e}")
127
-
128
- # Update processed documents registry
129
- self.processed_docs[doc_id] = {
130
- "document_url": document_url,
131
- "chunk_count": len(chunks),
132
- "processed_at": metadata["processed_at"],
133
- "collection_name": f"{doc_id}_collection",
134
- "total_chars": total_chars,
135
- "total_words": total_words
136
- }
137
- self._save_processed_docs()
138
-
139
- print(f"✅ Updated registry for document {doc_id}")
140
-
141
- def get_document_metadata(self, doc_id: str) -> Dict[str, Any]:
142
- """
143
- Load individual document metadata from file.
144
-
145
- Args:
146
- doc_id: Document identifier
147
-
148
- Returns:
149
- Dict[str, Any]: Document metadata or empty dict if not found
150
- """
151
- metadata_path = self.base_db_path / f"{doc_id}_metadata.json"
152
-
153
- if not metadata_path.exists():
154
- return {}
155
-
156
- try:
157
- with open(metadata_path, 'r', encoding='utf-8') as f:
158
- return json.load(f)
159
- except Exception as e:
160
- print(f"⚠️ Warning: Could not load metadata for {doc_id}: {e}")
161
- return {}
162
-
163
- def list_processed_documents(self) -> Dict[str, Dict]:
164
- """
165
- List all processed documents.
166
-
167
- Returns:
168
- Dict[str, Dict]: Copy of processed documents registry
169
- """
170
- return self.processed_docs.copy()
171
-
172
- def get_collection_stats(self) -> Dict[str, Any]:
173
- """
174
- Get statistics about all collections.
175
-
176
- Returns:
177
- Dict[str, Any]: Collection statistics
178
- """
179
- stats = {
180
- "total_documents": len(self.processed_docs),
181
- "total_collections": 0,
182
- "total_chunks": 0,
183
- "total_characters": 0,
184
- "total_words": 0,
185
- "documents": []
186
- }
187
-
188
- for doc_id, info in self.processed_docs.items():
189
- collection_path = self.base_db_path / f"{info['collection_name']}.db"
190
- if collection_path.exists():
191
- stats["total_collections"] += 1
192
- stats["total_chunks"] += info.get("chunk_count", 0)
193
- stats["total_characters"] += info.get("total_chars", 0)
194
- stats["total_words"] += info.get("total_words", 0)
195
-
196
- stats["documents"].append({
197
- "doc_id": doc_id,
198
- "url": info["document_url"],
199
- "chunk_count": info.get("chunk_count", 0),
200
- "total_chars": info.get("total_chars", 0),
201
- "total_words": info.get("total_words", 0),
202
- "processed_at": info.get("processed_at", "unknown")
203
- })
204
-
205
- # Add averages
206
- if stats["total_documents"] > 0:
207
- stats["avg_chunks_per_doc"] = stats["total_chunks"] / stats["total_documents"]
208
- stats["avg_chars_per_doc"] = stats["total_characters"] / stats["total_documents"]
209
- stats["avg_words_per_doc"] = stats["total_words"] / stats["total_documents"]
210
-
211
- return stats
212
-
213
- def remove_document_metadata(self, doc_id: str) -> bool:
214
- """
215
- Remove document metadata and registry entry.
216
-
217
- Args:
218
- doc_id: Document identifier
219
-
220
- Returns:
221
- bool: True if successfully removed, False otherwise
222
- """
223
- try:
224
- # Remove individual metadata file
225
- metadata_path = self.base_db_path / f"{doc_id}_metadata.json"
226
- if metadata_path.exists():
227
- metadata_path.unlink()
228
- print(f"🗑️ Removed metadata file for {doc_id}")
229
-
230
- # Remove from registry
231
- if doc_id in self.processed_docs:
232
- del self.processed_docs[doc_id]
233
- self._save_processed_docs()
234
- print(f"🗑️ Removed registry entry for {doc_id}")
235
-
236
- return True
237
-
238
- except Exception as e:
239
- print(f"❌ Error removing metadata for {doc_id}: {e}")
240
- return False
241
-
242
- def update_document_status(self, doc_id: str, status_info: Dict[str, Any]):
243
- """
244
- Update status information for a document.
245
-
246
- Args:
247
- doc_id: Document identifier
248
- status_info: Status information to update
249
- """
250
- if doc_id in self.processed_docs:
251
- self.processed_docs[doc_id].update(status_info)
252
- self._save_processed_docs()
253
- print(f"✅ Updated status for document {doc_id}")
254
-
255
- def get_registry_path(self) -> str:
256
- """
257
- Get the path to the processed documents registry.
258
-
259
- Returns:
260
- str: Path to registry file
261
- """
262
- return str(self.processed_docs_file)
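A minimal usage sketch of the deleted MetadataManager (a sketch only: it assumes config.config supplies EMBEDDING_MODEL, CHUNK_SIZE and CHUNK_OVERLAP; directory and URL are placeholders):

    from pathlib import Path

    from preprocessing.preprocessing_modules.metadata_manager import MetadataManager

    manager = MetadataManager(Path("./qdrant_db"))  # placeholder storage directory
    url = "https://example.com/report.pdf"          # placeholder URL
    doc_id = manager.generate_doc_id(url)           # "doc_" + first 12 hex chars of the URL's MD5
    if not manager.is_document_processed(url):
        manager.save_document_metadata(chunks=["chunk one", "chunk two"], doc_id=doc_id, document_url=url)
    print(manager.get_collection_stats()["total_documents"])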
 
preprocessing/preprocessing_modules/modular_preprocessor.py DELETED
@@ -1,290 +0,0 @@
1
- """
2
- Modular Document Preprocessor
3
-
4
- Main orchestrator class that uses all preprocessing modules to process documents.
5
- """
6
-
7
- import os
8
- import asyncio
9
- from typing import List, Dict, Any, Union
10
- from pathlib import Path
11
-
12
- from config.config import OUTPUT_DIR
13
- from .pdf_downloader import PDFDownloader
14
- from .file_downloader import FileDownloader
15
- from .text_extractor import TextExtractor
16
- from .text_chunker import TextChunker
17
- from .embedding_manager import EmbeddingManager
18
- from .vector_storage import VectorStorage
19
- from .metadata_manager import MetadataManager
20
-
21
- # Import new extractors
22
- from .docx_extractor import extract_docx
23
- from .pptx_extractor import extract_pptx
24
- from .xlsx_extractor import extract_xlsx
25
- from .image_extractor import extract_image_content
26
-
27
-
28
- class ModularDocumentPreprocessor:
29
- """
30
- Modular document preprocessor that orchestrates the entire preprocessing pipeline.
31
-
32
- This class combines all preprocessing modules to provide a clean interface
33
- for document processing while maintaining separation of concerns.
34
- """
35
-
36
- def __init__(self):
37
- """Initialize the modular document preprocessor."""
38
- # Set up base database path
39
- self.base_db_path = Path(OUTPUT_DIR).resolve()
40
- self._ensure_base_directory()
41
-
42
- # Initialize all modules
43
- self.pdf_downloader = PDFDownloader() # Keep for backward compatibility
44
- self.file_downloader = FileDownloader() # New enhanced downloader
45
- self.text_extractor = TextExtractor()
46
- self.text_chunker = TextChunker()
47
- self.embedding_manager = EmbeddingManager()
48
- self.vector_storage = VectorStorage(self.base_db_path)
49
- self.metadata_manager = MetadataManager(self.base_db_path)
50
-
51
- print("✅ Modular Document Preprocessor initialized successfully")
52
-
53
- def _ensure_base_directory(self):
54
- """Ensure the base directory exists."""
55
- if not self.base_db_path.exists():
56
- try:
57
- self.base_db_path.mkdir(parents=True, exist_ok=True)
58
- print(f"✅ Created directory: {self.base_db_path}")
59
- except PermissionError:
60
- print(f"⚠️ Directory {self.base_db_path} should exist in production environment")
61
- if not self.base_db_path.exists():
62
- raise RuntimeError(f"Required directory {self.base_db_path} does not exist and cannot be created")
63
-
64
- # Delegate metadata operations to metadata manager
65
- def generate_doc_id(self, document_url: str) -> str:
66
- """Generate a unique document ID from the URL."""
67
- return self.metadata_manager.generate_doc_id(document_url)
68
-
69
- def is_document_processed(self, document_url: str) -> bool:
70
- """Check if a document has already been processed."""
71
- return self.metadata_manager.is_document_processed(document_url)
72
-
73
- def get_document_info(self, document_url: str) -> Dict[str, Any]:
74
- """Get information about a processed document."""
75
- return self.metadata_manager.get_document_info(document_url)
76
-
77
- def list_processed_documents(self) -> Dict[str, Dict]:
78
- """List all processed documents."""
79
- return self.metadata_manager.list_processed_documents()
80
-
81
- def get_collection_stats(self) -> Dict[str, Any]:
82
- """Get statistics about all collections."""
83
- return self.metadata_manager.get_collection_stats()
84
-
85
- async def process_document(self, document_url: str, force_reprocess: bool = False, timeout: int = 300) -> Union[str, List]:
86
- """
87
- Process a single document: download, extract, chunk, embed, and store.
88
-
89
- Args:
90
- document_url: URL of the document (PDF, DOCX, PPTX, XLSX, images, etc.)
91
- force_reprocess: If True, reprocess even if already processed
92
- timeout: Download timeout in seconds (default: 300s/5min)
93
-
94
- Returns:
95
- str: Document ID for normal processing
96
- List: [content, type] for special handling (oneshot, tabular, image)
97
- """
98
- doc_id = self.generate_doc_id(document_url)
99
-
100
- # Check if already processed
101
- if not force_reprocess and self.is_document_processed(document_url):
102
- print(f"✅ Document {doc_id} already processed, skipping...")
103
- return doc_id
104
-
105
- print(f"🚀 Processing document: {doc_id}")
106
- print(f"📄 URL: {document_url}")
107
-
108
- temp_file_path = None
109
- try:
110
- # Step 1: Download file (enhanced to handle multiple types)
111
- temp_file_path, ext = await self.file_downloader.download_file(document_url, timeout=timeout)
112
-
113
- if temp_file_path == 'not supported':
114
- return ['unsupported', ext]
115
-
116
- # Step 2: Extract text based on file type
117
- full_text = ""
118
- match ext:
119
- case 'pdf':
120
- full_text = await self.text_extractor.extract_text_from_pdf(temp_file_path)
121
-
122
- case 'docx':
123
- full_text = extract_docx(temp_file_path)
124
-
125
- case 'pptx':
126
- full_text = extract_pptx(temp_file_path)
127
- return [full_text, 'oneshot']
128
-
129
- case 'url':
130
- new_context = "URL for Context: " + temp_file_path
131
- return [new_context, 'oneshot']
132
-
133
- case 'txt':
134
- with open(temp_file_path, 'r', encoding='utf-8') as f:
135
- full_text = f.read()
136
-
137
- case 'xlsx':
138
- full_text = extract_xlsx(temp_file_path)
139
- # Print a short preview (10-15 chars) to verify extraction
140
- try:
141
- preview = ''.join(full_text.split())[:15]
142
- if preview:
143
- print(f"🔎 XLSX extracted preview: {preview}")
144
- except Exception:
145
- pass
146
- return [full_text, 'tabular']
147
-
148
- case 'csv':
149
- with open(temp_file_path, 'r', encoding='utf-8') as f:
150
- full_text = f.read()
151
- return [full_text, 'tabular']
152
-
153
- case 'png' | 'jpeg' | 'jpg':
154
- # Don't clean up image files - they'll be cleaned up by the caller
155
- return [temp_file_path, 'image', True] # Third element indicates no cleanup needed
156
-
157
- case _:
158
- raise Exception(f"Unsupported file type: {ext}")
159
-
160
- # Validate extracted text
161
- if not self.text_extractor.validate_extracted_text(full_text):
162
- raise Exception("No meaningful text extracted from document")
163
-
164
- # Step 3: Create chunks
165
- chunks = self.text_chunker.chunk_text(full_text)
166
-
167
- # Check if document is too short for chunking
168
- if len(chunks) < 5:
169
- print(f"Only {len(chunks)} chunks formed, going for oneshot.")
170
- return [full_text, 'oneshot']
171
-
172
- if not chunks:
173
- raise Exception("No chunks created from text")
174
-
175
- # Log chunk statistics
176
- chunk_stats = self.text_chunker.get_chunk_stats(chunks)
177
- print(f"📊 Chunk Statistics: {chunk_stats['total_chunks']} chunks, "
178
- f"avg size: {chunk_stats['avg_chunk_size']:.0f} chars")
179
-
180
- # Step 4: Create embeddings
181
- embeddings = await self.embedding_manager.create_embeddings(chunks)
182
-
183
- # Validate embeddings
184
- if not self.embedding_manager.validate_embeddings(embeddings, len(chunks)):
185
- raise Exception("Invalid embeddings generated")
186
-
187
- # Step 5: Store in Qdrant
188
- await self.vector_storage.store_in_qdrant(chunks, embeddings, doc_id)
189
-
190
- # Step 6: Save metadata
191
- self.metadata_manager.save_document_metadata(chunks, doc_id, document_url)
192
-
193
- print(f"✅ Document {doc_id} processed successfully: {len(chunks)} chunks")
194
- return doc_id
195
-
196
- except Exception as e:
197
- print(f"❌ Error processing document {doc_id}: {str(e)}")
198
- raise
199
- finally:
200
- # Clean up temporary file - but NOT for images since they need the file path
201
- if temp_file_path and ext not in ['png', 'jpeg', 'jpg']:
202
- self.file_downloader.cleanup_temp_file(temp_file_path)
203
-
204
- async def process_multiple_documents(self, document_urls: List[str], force_reprocess: bool = False) -> Dict[str, str]:
205
- """
206
- Process multiple documents concurrently.
207
-
208
- Args:
209
- document_urls: List of PDF URLs
210
- force_reprocess: If True, reprocess even if already processed
211
-
212
- Returns:
213
- Dict[str, str]: Mapping of URLs to document IDs
214
- """
215
- print(f"🚀 Processing {len(document_urls)} documents...")
216
-
217
- results = {}
218
-
219
- # Process documents concurrently (with limited concurrency)
220
- semaphore = asyncio.Semaphore(3) # Limit to 3 concurrent downloads
221
-
222
- async def process_single(url):
223
- async with semaphore:
224
- try:
225
- doc_id = await self.process_document(url, force_reprocess)
226
- return url, doc_id
227
- except Exception as e:
228
- print(f"❌ Failed to process {url}: {str(e)}")
229
- return url, None
230
-
231
- tasks = [process_single(url) for url in document_urls]
232
- completed_tasks = await asyncio.gather(*tasks, return_exceptions=True)
233
-
234
- for result in completed_tasks:
235
- if isinstance(result, tuple):
236
- url, doc_id = result
237
- if doc_id:
238
- results[url] = doc_id
239
-
240
- print(f"✅ Successfully processed {len(results)}/{len(document_urls)} documents")
241
- return results
242
-
243
- def get_system_info(self) -> Dict[str, Any]:
244
- """
245
- Get information about the preprocessing system.
246
-
247
- Returns:
248
- Dict[str, Any]: System information
249
- """
250
- return {
251
- "base_db_path": str(self.base_db_path),
252
- "embedding_model": self.embedding_manager.get_model_info(),
253
- "text_chunker_config": {
254
- "chunk_size": self.text_chunker.chunk_size,
255
- "chunk_overlap": self.text_chunker.chunk_overlap
256
- },
257
- "processed_documents_registry": self.metadata_manager.get_registry_path(),
258
- "collection_stats": self.get_collection_stats()
259
- }
260
-
261
- def cleanup_document(self, document_url: str) -> bool:
262
- """
263
- Remove all data for a specific document.
264
-
265
- Args:
266
- document_url: URL of the document to clean up
267
-
268
- Returns:
269
- bool: True if successfully cleaned up
270
- """
271
- doc_id = self.generate_doc_id(document_url)
272
-
273
- try:
274
- # Remove vector storage
275
- vector_removed = self.vector_storage.delete_collection(doc_id)
276
-
277
- # Remove metadata
278
- metadata_removed = self.metadata_manager.remove_document_metadata(doc_id)
279
-
280
- success = vector_removed and metadata_removed
281
- if success:
282
- print(f"✅ Successfully cleaned up document {doc_id}")
283
- else:
284
- print(f"⚠️ Partial cleanup for document {doc_id}")
285
-
286
- return success
287
-
288
- except Exception as e:
289
- print(f"❌ Error cleaning up document {doc_id}: {e}")
290
- return False
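A minimal usage sketch of the deleted orchestrator, showing how the two return shapes of process_document were presumably handled (placeholder URL; assumes config.config and the module dependencies above):

    import asyncio

    from preprocessing.preprocessing_modules.modular_preprocessor import ModularDocumentPreprocessor

    async def demo() -> None:
        pre = ModularDocumentPreprocessor()
        result = await pre.process_document("https://example.com/report.pdf")  # placeholder URL
        if isinstance(result, list):
            content, kind = result[0], result[1]  # 'oneshot', 'tabular', 'image' or 'unsupported'
            print(f"special handling required: {kind}")
        else:
            print(f"chunks embedded and stored under doc_id={result}")

    asyncio.run(demo())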
 
preprocessing/preprocessing_modules/pdf_downloader.py DELETED
@@ -1,112 +0,0 @@
1
- """
2
- PDF Downloader Module
3
-
4
- Handles downloading PDFs from URLs with retry logic and progress tracking.
5
- """
6
-
7
- import os
8
- import asyncio
9
- import tempfile
10
- import aiohttp
11
- from typing import Optional
12
-
13
-
14
- class PDFDownloader:
15
- """Handles PDF downloading with enhanced error handling and retry logic."""
16
-
17
- def __init__(self):
18
- """Initialize the PDF downloader."""
19
- pass
20
-
21
- async def download_pdf(self, url: str, timeout: int = 300, max_retries: int = 3) -> str:
22
- """
23
- Download PDF from URL to a temporary file with enhanced error handling.
24
-
25
- Args:
26
- url: URL of the PDF to download
27
- timeout: Download timeout in seconds (default: 300s/5min)
28
- max_retries: Maximum number of retry attempts
29
-
30
- Returns:
31
- str: Path to the downloaded temporary file
32
-
33
- Raises:
34
- Exception: If download fails after all retries
35
- """
36
- print(f"📥 Downloading PDF from: {url[:50]}...")
37
-
38
- for attempt in range(max_retries):
39
- try:
40
- # Enhanced timeout settings for large files
41
- timeout_config = aiohttp.ClientTimeout(
42
- total=timeout, # Total timeout
43
- connect=30, # Connection timeout
44
- sock_read=120 # Socket read timeout
45
- )
46
-
47
- async with aiohttp.ClientSession(timeout=timeout_config) as session:
48
- print(f" Attempt {attempt + 1}/{max_retries} (timeout: {timeout}s)")
49
-
50
- async with session.get(url) as response:
51
- if response.status != 200:
52
- raise Exception(f"Failed to download PDF: HTTP {response.status}")
53
-
54
- # Get content length for progress tracking
55
- content_length = response.headers.get('content-length')
56
- if content_length:
57
- total_size = int(content_length)
58
- print(f" File size: {total_size / (1024*1024):.1f} MB")
59
-
60
- # Create temporary file
61
- temp_file = tempfile.NamedTemporaryFile(
62
- delete=False,
63
- suffix=".pdf",
64
- prefix="preprocess_"
65
- )
66
-
67
- # Write content to temporary file with progress tracking
68
- downloaded = 0
69
- async for chunk in response.content.iter_chunked(16384): # Larger chunks
70
- temp_file.write(chunk)
71
- downloaded += len(chunk)
72
-
73
- # Show progress for large files
74
- if content_length and downloaded % (1024*1024) == 0: # Every MB
75
- progress = (downloaded / total_size) * 100
76
- print(f" Progress: {progress:.1f}% ({downloaded/(1024*1024):.1f} MB)")
77
-
78
- temp_file.close()
79
- print(f"✅ PDF downloaded successfully: {temp_file.name}")
80
- return temp_file.name
81
-
82
- except asyncio.TimeoutError:
83
- print(f" ⏰ Timeout on attempt {attempt + 1}")
84
- if attempt < max_retries - 1:
85
- wait_time = (attempt + 1) * 30 # Increasing wait time
86
- print(f" ⏳ Waiting {wait_time}s before retry...")
87
- await asyncio.sleep(wait_time)
88
- continue
89
-
90
- except Exception as e:
91
- print(f" ❌ Error on attempt {attempt + 1}: {str(e)}")
92
- if attempt < max_retries - 1:
93
- wait_time = (attempt + 1) * 15
94
- print(f" ⏳ Waiting {wait_time}s before retry...")
95
- await asyncio.sleep(wait_time)
96
- continue
97
-
98
- raise Exception(f"Failed to download PDF after {max_retries} attempts")
99
-
100
- def cleanup_temp_file(self, temp_path: str) -> None:
101
- """
102
- Clean up temporary file.
103
-
104
- Args:
105
- temp_path: Path to the temporary file to delete
106
- """
107
- if temp_path and os.path.exists(temp_path):
108
- try:
109
- os.unlink(temp_path)
110
- print(f"🗑️ Cleaned up temporary file: {temp_path}")
111
- except Exception as e:
112
- print(f"⚠️ Warning: Could not delete temporary file {temp_path}: {e}")
 
preprocessing/preprocessing_modules/pptx_extractor.py DELETED
@@ -1,118 +0,0 @@
1
- from pptx import Presentation
2
- from pptx.enum.shapes import MSO_SHAPE_TYPE
3
- from typing import List, Dict, Any
4
- from PIL import Image
5
- from io import BytesIO
6
- import requests
7
- from concurrent.futures import ThreadPoolExecutor, as_completed
8
- import tempfile
9
- import os
10
- import sys
11
- sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
12
- from config import config
13
-
14
- # OCR Space API configuration
15
- API_KEY = getattr(config, 'OCR_SPACE_API_KEY', None)
16
- API_URL = "https://api.ocr.space/parse/image"
17
-
18
- def ocr_space_file(filename, api_key=API_KEY, overlay=False, language="eng"):
19
- """Extract text from image file using OCR Space API"""
20
- if not api_key:
21
- return filename, "OCR API key not configured"
22
-
23
- payload = {
24
- "isOverlayRequired": overlay,
25
- "apikey": api_key,
26
- "language": language,
27
- "detectOrientation": True,
28
- "scale": True,
29
- "isTable": False,
30
- "OCREngine": 2
31
- }
32
- try:
33
- with open(filename, "rb") as f:
34
- response = requests.post(API_URL, files={filename: f}, data=payload, timeout=30)
35
-
36
- if response.status_code != 200:
37
- return filename, f"API Error: HTTP {response.status_code}"
38
-
39
- parsed = response.json()
40
-
41
- if parsed.get("OCRExitCode") == 1:
42
- parsed_text = parsed.get("ParsedResults", [{}])[0].get("ParsedText", "")
43
- return filename, parsed_text
44
- else:
45
- error_msg = parsed.get("ErrorMessage", ["Unknown error"])[0] if parsed.get("ErrorMessage") else "Unknown OCR error"
46
- return filename, f"OCR Error: {error_msg}"
47
-
48
- except requests.exceptions.Timeout:
49
- return filename, "Error: Request timeout"
50
- except requests.exceptions.RequestException as e:
51
- return filename, f"Error: Network error - {str(e)}"
52
- except Exception as e:
53
- return filename, f"Error: {e}"
54
-
55
- def extract_pptx(pptx_path: str) -> str:
56
- """Extract text and images from PowerPoint presentations."""
57
- try:
58
- prs = Presentation(pptx_path)
59
- except Exception as e:
60
- return f"Error loading PowerPoint file: {str(e)}"
61
-
62
- all_content = []
63
- temp_files = []
64
-
65
- try:
66
- for slide_idx, slide in enumerate(prs.slides):
67
- slide_content = [f"\n=== Slide {slide_idx + 1} ===\n"]
68
- slide_images = []
69
-
70
- for shape in slide.shapes:
71
- # Extract text
72
- if hasattr(shape, "text") and shape.text.strip():
73
- slide_content.append(shape.text.strip())
74
-
75
- # Extract images
76
- elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
77
- try:
78
- image = shape.image
79
- image_bytes = image.blob
80
-
81
- # Save image to temp file
82
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
83
- temp_file.write(image_bytes)
84
- temp_file.close()
85
- temp_files.append(temp_file.name)
86
- slide_images.append(temp_file.name)
87
- except Exception as e:
88
- slide_content.append(f"[Image extraction error: {str(e)}]")
89
-
90
- # Process images with OCR if API key is available
91
- if slide_images and API_KEY:
92
- try:
93
- with ThreadPoolExecutor(max_workers=3) as executor:
94
- future_to_filename = {
95
- executor.submit(ocr_space_file, img_file): img_file
96
- for img_file in slide_images
97
- }
98
-
99
- for future in as_completed(future_to_filename):
100
- filename, ocr_result = future.result()
101
- if ocr_result and not ocr_result.startswith("Error") and not ocr_result.startswith("OCR Error"):
102
- slide_content.append(f"[Image Text]: {ocr_result}")
103
- except Exception as e:
104
- slide_content.append(f"[OCR processing error: {str(e)}]")
105
- elif slide_images:
106
- slide_content.append(f"[{len(slide_images)} images found - OCR not available]")
107
-
108
- all_content.append("\n".join(slide_content))
109
-
110
- finally:
111
- # Clean up temp files
112
- for temp_file in temp_files:
113
- try:
114
- os.unlink(temp_file)
115
- except:
116
- pass
117
-
118
- return "\\n\\n".join(all_content)
 
preprocessing/preprocessing_modules/text_chunker.py DELETED
@@ -1,167 +0,0 @@
1
- """
2
- Text Chunker Module
3
-
4
- Handles chunking text into smaller pieces with overlap for better context preservation.
5
- """
6
-
7
- import re
8
- from typing import List
9
- from config.config import CHUNK_SIZE, CHUNK_OVERLAP
10
-
11
-
12
- class TextChunker:
13
- """Handles text chunking with overlap and smart boundary detection."""
14
-
15
- def __init__(self):
16
- """Initialize the text chunker."""
17
- self.chunk_size = CHUNK_SIZE
18
- self.chunk_overlap = CHUNK_OVERLAP
19
-
20
- def chunk_text(self, text: str) -> List[str]:
21
- """
22
- Chunk text into smaller pieces with overlap.
23
-
24
- Args:
25
- text: The input text to chunk
26
-
27
- Returns:
28
- List[str]: List of text chunks
29
- """
30
- print(f"✂️ Chunking text into {self.chunk_size} character chunks with {self.chunk_overlap} overlap")
31
-
32
- # Clean the text
33
- cleaned_text = self._clean_text(text)
34
-
35
- chunks = []
36
- start = 0
37
-
38
- while start < len(cleaned_text):
39
- end = start + self.chunk_size
40
-
41
- # Try to end at sentence boundary
42
- if end < len(cleaned_text):
43
- end = self._find_sentence_boundary(cleaned_text, start, end)
44
-
45
- chunk = cleaned_text[start:end].strip()
46
-
47
- # Only add chunk if it's meaningful
48
- if chunk and len(chunk) > 50:
49
- chunks.append(chunk)
50
-
51
- # Move start position with overlap
52
- start = end - self.chunk_overlap
53
- if start >= len(cleaned_text):
54
- break
55
-
56
- print(f"✅ Created {len(chunks)} chunks (size={self.chunk_size}, overlap={self.chunk_overlap})")
57
- return chunks
58
-
59
- def _clean_text(self, text: str) -> str:
60
- """
61
- Clean text by normalizing whitespace and removing excessive line breaks.
62
-
63
- Args:
64
- text: Raw text to clean
65
-
66
- Returns:
67
- str: Cleaned text
68
- """
69
- # Replace multiple whitespace with single space
70
- text = re.sub(r'\s+', ' ', text)
71
- return text.strip()
72
-
73
- def _find_sentence_boundary(self, text: str, start: int, preferred_end: int) -> int:
74
- """
75
- Find the best sentence boundary near the preferred end position.
76
-
77
- Args:
78
- text: The full text
79
- start: Start position of the chunk
80
- preferred_end: Preferred end position
81
-
82
- Returns:
83
- int: Adjusted end position at sentence boundary
84
- """
85
- # Look for sentence endings within a reasonable range
86
- search_start = max(start, preferred_end - 100)
87
- search_end = min(len(text), preferred_end + 50)
88
-
89
- sentence_endings = ['.', '!', '?']
90
- best_end = preferred_end
91
-
92
- # Search backwards from preferred end for sentence boundary
93
- for i in range(preferred_end - 1, search_start - 1, -1):
94
- if text[i] in sentence_endings:
95
- # Check if this looks like a real sentence ending
96
- if self._is_valid_sentence_ending(text, i):
97
- best_end = i + 1
98
- break
99
-
100
- return best_end
101
-
102
- def _is_valid_sentence_ending(self, text: str, pos: int) -> bool:
103
- """
104
- Check if a punctuation mark represents a valid sentence ending.
105
-
106
- Args:
107
- text: The full text
108
- pos: Position of the punctuation mark
109
-
110
- Returns:
111
- bool: True if it's a valid sentence ending
112
- """
113
- # Avoid breaking on abbreviations like "Dr.", "Mr.", etc.
114
- if pos > 0 and text[pos] == '.':
115
- # Look at the character before the period
116
- char_before = text[pos - 1]
117
- if char_before.isupper():
118
- # Might be an abbreviation
119
- word_start = pos - 1
120
- while word_start > 0 and text[word_start - 1].isalpha():
121
- word_start -= 1
122
-
123
- word = text[word_start:pos]
124
- # Common abbreviations to avoid breaking on
125
- abbreviations = {'Dr', 'Mr', 'Mrs', 'Ms', 'Prof', 'Inc', 'Ltd', 'Corp', 'Co'}
126
- if word in abbreviations:
127
- return False
128
-
129
- # Check if there's a space or newline after the punctuation
130
- if pos + 1 < len(text):
131
- next_char = text[pos + 1]
132
- return next_char.isspace() or next_char.isupper()
133
-
134
- return True
135
-
136
- def get_chunk_stats(self, chunks: List[str]) -> dict:
137
- """
138
- Get statistics about the created chunks.
139
-
140
- Args:
141
- chunks: List of text chunks
142
-
143
- Returns:
144
- dict: Statistics about the chunks
145
- """
146
- if not chunks:
147
- return {
148
- "total_chunks": 0,
149
- "total_characters": 0,
150
- "total_words": 0,
151
- "avg_chunk_size": 0,
152
- "min_chunk_size": 0,
153
- "max_chunk_size": 0
154
- }
155
-
156
- chunk_sizes = [len(chunk) for chunk in chunks]
157
- total_chars = sum(chunk_sizes)
158
- total_words = sum(len(chunk.split()) for chunk in chunks)
159
-
160
- return {
161
- "total_chunks": len(chunks),
162
- "total_characters": total_chars,
163
- "total_words": total_words,
164
- "avg_chunk_size": total_chars / len(chunks),
165
- "min_chunk_size": min(chunk_sizes),
166
- "max_chunk_size": max(chunk_sizes)
167
- }
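A minimal usage sketch of the deleted TextChunker (chunk size and overlap come from config.config, which is assumed to be importable):

    from preprocessing.preprocessing_modules.text_chunker import TextChunker

    chunker = TextChunker()
    chunks = chunker.chunk_text("Some long extracted document text. " * 200)
    print(chunker.get_chunk_stats(chunks))  # total_chunks, avg_chunk_size, min/max sizes, ...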
 
preprocessing/preprocessing_modules/text_extractor.py DELETED
@@ -1,62 +0,0 @@
1
- """
2
- Text Extractor Module
3
-
4
- Handles extracting text content from PDF files.
5
- """
6
-
7
- import pdfplumber
8
-
9
-
10
- class TextExtractor:
11
- """Handles text extraction from PDF files."""
12
-
13
- def __init__(self):
14
- """Initialize the text extractor."""
15
- pass
16
-
17
- async def extract_text_from_pdf(self, pdf_path: str) -> str:
18
- """
19
- Extract text from PDF file.
20
-
21
- Args:
22
- pdf_path: Path to the PDF file
23
-
24
- Returns:
25
- str: Extracted text content
26
-
27
- Raises:
28
- Exception: If text extraction fails
29
- """
30
- print(f"📖 Extracting text from PDF...")
31
-
32
- full_text = ""
33
- try:
34
- with pdfplumber.open(pdf_path) as pdf:
35
- for page_num, page in enumerate(pdf.pages):
36
- text = page.extract_text()
37
- if text:
38
- full_text += f"\n--- Page {page_num + 1} ---\n"
39
- full_text += text
40
-
41
- print(f"✅ Extracted {len(full_text)} characters from PDF")
42
- return full_text
43
-
44
- except Exception as e:
45
- raise Exception(f"Failed to extract text from PDF: {str(e)}")
46
-
47
- def validate_extracted_text(self, text: str) -> bool:
48
- """
49
- Validate that extracted text is not empty and contains meaningful content.
50
-
51
- Args:
52
- text: The extracted text to validate
53
-
54
- Returns:
55
- bool: True if text is valid, False otherwise
56
- """
57
- if not text or not text.strip():
58
- return False
59
-
60
- # Check if text has at least some alphabetic characters
61
- alphabetic_chars = sum(1 for char in text if char.isalpha())
62
- return alphabetic_chars > 50 # At least 50 alphabetic characters
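A minimal usage sketch of the deleted TextExtractor (the PDF path is a placeholder; pdfplumber must be installed):

    import asyncio

    from preprocessing.preprocessing_modules.text_extractor import TextExtractor

    async def demo() -> None:
        extractor = TextExtractor()
        text = await extractor.extract_text_from_pdf("report.pdf")  # placeholder path
        if extractor.validate_extracted_text(text):
            print(f"extracted {len(text)} characters")

    asyncio.run(demo())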
 
preprocessing/preprocessing_modules/vector_storage.py DELETED
@@ -1,212 +0,0 @@
- """
- Vector Storage Module
-
- Handles storing chunks and embeddings in Qdrant vector database.
- """
-
- import numpy as np
- from typing import List
- from pathlib import Path
- from qdrant_client import QdrantClient
- from qdrant_client.models import Distance, VectorParams, PointStruct
-
-
- class VectorStorage:
-     """Handles vector storage operations with Qdrant."""
-
-     def __init__(self, base_db_path: Path):
-         """
-         Initialize the vector storage.
-
-         Args:
-             base_db_path: Base path for storing Qdrant databases
-         """
-         self.base_db_path = base_db_path
-
-     async def store_in_qdrant(self, chunks: List[str], embeddings: np.ndarray, doc_id: str):
-         """
-         Store chunks and embeddings in Qdrant.
-
-         Args:
-             chunks: List of text chunks
-             embeddings: Corresponding embeddings array
-             doc_id: Document identifier
-         """
-         if len(chunks) != embeddings.shape[0]:
-             raise ValueError(f"Chunk count ({len(chunks)}) doesn't match embedding count ({embeddings.shape[0]})")
-
-         collection_name = f"{doc_id}_collection"
-         db_path = self.base_db_path / f"{collection_name}.db"
-         client = QdrantClient(path=str(db_path))
-
-         print(f"💾 Storing {len(chunks)} vectors in collection: {collection_name}")
-
-         try:
-             # Create or recreate collection
-             await self._setup_collection(client, collection_name, embeddings.shape[1])
-
-             # Prepare and upload points
-             await self._upload_points(client, collection_name, chunks, embeddings, doc_id)
-
-             print(f"✅ Successfully stored all vectors in Qdrant")
-
-         finally:
-             client.close()
-
-     async def _setup_collection(self, client: QdrantClient, collection_name: str, embedding_dim: int):
-         """
-         Set up Qdrant collection, recreating if it exists.
-
-         Args:
-             client: Qdrant client
-             collection_name: Name of the collection
-             embedding_dim: Dimension of embeddings
-         """
-         # Delete existing collection if it exists
-         try:
-             client.delete_collection(collection_name)
-             print(f"🗑️ Deleted existing collection: {collection_name}")
-         except Exception:
-             pass  # Collection might not exist
-
-         # Create new collection
-         client.create_collection(
-             collection_name=collection_name,
-             vectors_config=VectorParams(
-                 size=embedding_dim,
-                 distance=Distance.COSINE
-             )
-         )
-         print(f"✅ Created new collection: {collection_name}")
-
-     async def _upload_points(self, client: QdrantClient, collection_name: str,
-                              chunks: List[str], embeddings: np.ndarray, doc_id: str):
-         """
-         Upload points to Qdrant collection in batches.
-
-         Args:
-             client: Qdrant client
-             collection_name: Name of the collection
-             chunks: Text chunks
-             embeddings: Embedding vectors
-             doc_id: Document identifier
-         """
-         # Prepare points
-         points = []
-         for i in range(len(chunks)):
-             points.append(
-                 PointStruct(
-                     id=i,
-                     vector=embeddings[i].tolist(),
-                     payload={
-                         "text": chunks[i],
-                         "chunk_id": i,
-                         "doc_id": doc_id,
-                         "char_count": len(chunks[i]),
-                         "word_count": len(chunks[i].split())
-                     }
-                 )
-             )
-
-         # Upload in batches to handle large documents
-         batch_size = 100
-         total_batches = (len(points) + batch_size - 1) // batch_size
-
-         for i in range(0, len(points), batch_size):
-             batch = points[i:i + batch_size]
-             batch_num = (i // batch_size) + 1
-
-             print(f" Uploading batch {batch_num}/{total_batches} ({len(batch)} points)")
-             client.upsert(collection_name=collection_name, points=batch)
-
-         print(f"✅ Uploaded {len(points)} points in {total_batches} batches")
-
-     def collection_exists(self, doc_id: str) -> bool:
-         """
-         Check if a collection exists for the given document ID.
-
-         Args:
-             doc_id: Document identifier
-
-         Returns:
-             bool: True if collection exists, False otherwise
-         """
-         collection_name = f"{doc_id}_collection"
-         db_path = self.base_db_path / f"{collection_name}.db"
-         return db_path.exists()
-
-     def get_collection_info(self, doc_id: str) -> dict:
-         """
-         Get information about a collection.
-
-         Args:
-             doc_id: Document identifier
-
-         Returns:
-             dict: Collection information
-         """
-         collection_name = f"{doc_id}_collection"
-         db_path = self.base_db_path / f"{collection_name}.db"
-
-         if not db_path.exists():
-             return {
-                 "collection_name": collection_name,
-                 "exists": False,
-                 "path": str(db_path)
-             }
-
-         try:
-             client = QdrantClient(path=str(db_path))
-             try:
-                 collection_info = client.get_collection(collection_name)
-                 return {
-                     "collection_name": collection_name,
-                     "exists": True,
-                     "path": str(db_path),
-                     "vectors_count": collection_info.vectors_count,
-                     "status": collection_info.status
-                 }
-             finally:
-                 client.close()
-         except Exception as e:
-             return {
-                 "collection_name": collection_name,
-                 "exists": True,
-                 "path": str(db_path),
-                 "error": str(e)
-             }
-
-     def delete_collection(self, doc_id: str) -> bool:
-         """
-         Delete a collection and its database file.
-
-         Args:
-             doc_id: Document identifier
-
-         Returns:
-             bool: True if successfully deleted, False otherwise
-         """
-         collection_name = f"{doc_id}_collection"
-         db_path = self.base_db_path / f"{collection_name}.db"
-
-         try:
-             if db_path.exists():
-                 # Try to delete collection properly first
-                 try:
-                     client = QdrantClient(path=str(db_path))
-                     client.delete_collection(collection_name)
-                     client.close()
-                 except Exception:
-                     pass  # Collection might not exist or be corrupted
-
-                 # Remove database directory
-                 import shutil
-                 shutil.rmtree(db_path, ignore_errors=True)
-                 print(f"🗑️ Deleted collection: {collection_name}")
-                 return True
-
-         except Exception as e:
-             print(f"❌ Error deleting collection {collection_name}: {e}")
-             return False
-
-         return True  # Nothing to delete
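A hedged sketch of reading one of these collections back with the same local-path Qdrant mode used above. The base directory, document id, and the 384-dimensional random query vector are placeholders; the real base path and embedding model are configured elsewhere in the repository.

import numpy as np
from pathlib import Path
from qdrant_client import QdrantClient

doc_id = "sample_doc"                                      # placeholder document id
db_path = Path("vector_dbs") / f"{doc_id}_collection.db"   # assumed base directory layout

client = QdrantClient(path=str(db_path))
try:
    hits = client.search(
        collection_name=f"{doc_id}_collection",
        query_vector=np.random.rand(384).tolist(),  # stand-in for a real query embedding
        limit=3,
    )
    for hit in hits:
        # payload keys match the PointStruct payload written by _upload_points
        print(hit.score, hit.payload["chunk_id"], hit.payload["text"][:60])
finally:
    client.close()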
preprocessing/preprocessing_modules/xlsx_extractor.py DELETED
@@ -1,119 +0,0 @@
- from openpyxl import load_workbook
- from openpyxl.drawing.image import Image as OpenPyXLImage
- from typing import List, Dict, Any
- from PIL import Image
- from io import BytesIO
- import pytesseract
- import os
- import pandas as pd
-
- def extract_xlsx(xlsx_path: str, tesseract_cmd: str = None) -> str:
-     """Extract data from Excel files including text and images."""
-     if tesseract_cmd:
-         pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
-
-     try:
-         wb = load_workbook(xlsx_path, data_only=True)
-     except Exception as e:
-         return f"Error loading Excel file: {str(e)}"
-
-     all_sheets_content: list[str] = []
-     preview_text: str | None = None
-     any_data_found = False
-
-     for sheet in wb.worksheets:
-         sheet_content = [f"\n=== Sheet: {sheet.title} ===\n"]
-
-         # Extract table data
-         has_data = False
-         non_empty_rows = 0
-         for row in sheet.iter_rows(max_row=sheet.max_row, values_only=True):
-             if row is None or all(cell is None for cell in row):
-                 continue  # skip completely empty rows
-             has_data = True
-             non_empty_rows += 1
-             any_data_found = True
-             row_data = [str(cell).strip() if cell is not None else "" for cell in row]
-             joined = " | ".join(row_data)
-             sheet_content.append(joined)
-             if preview_text is None and joined.strip():
-                 preview_text = joined[:15]
-
-         if not has_data:
-             sheet_content.append("[No data in this sheet]")
-             print(f"ℹ️ XLSX: Sheet '{sheet.title}' has no data (openpyxl)")
-         else:
-             print(f"🧾 XLSX: Sheet '{sheet.title}' non-empty rows: {non_empty_rows}")
-
-         # Extract images from the sheet
-         if hasattr(sheet, '_images'):
-             image_count = 0
-             for img in sheet._images:
-                 try:
-                     if hasattr(img, '_data'):  # if it's a real OpenPyXL Image
-                         image_data = img._data()
-                     elif hasattr(img, '_ref'):
-                         continue  # cell ref-only images; ignore
-                     else:
-                         continue
-
-                     pil_img = Image.open(BytesIO(image_data))
-                     try:
-                         ocr_text = pytesseract.image_to_string(pil_img).strip()
-                         if ocr_text:
-                             sheet_content.append(f"[Image {image_count + 1} Text]: {ocr_text}")
-                         else:
-                             sheet_content.append(f"[Image {image_count + 1}]: No text detected")
-                     except Exception as ocr_e:
-                         sheet_content.append(f"[Image {image_count + 1}]: OCR failed - {str(ocr_e)}")
-
-                     image_count += 1
-                 except Exception as e:
-                     sheet_content.append(f"[Image extraction error: {str(e)}]")
-
-             if image_count == 0:
-                 sheet_content.append("[No images found in this sheet]")
-
-         all_sheets_content.append("\n".join(sheet_content))
-
-     # If no data found using openpyxl, try pandas fallback (handles some edge cases better)
-     if not any_data_found:
-         print("ℹ️ XLSX: No data via openpyxl, trying pandas fallback…")
-         try:
-             xls = pd.ExcelFile(xlsx_path, engine="openpyxl")
-             pandas_parts = []
-             extracted_sheets = 0
-             for sheet_name in xls.sheet_names:
-                 df = pd.read_excel(xls, sheet_name=sheet_name, dtype=str)
-                 if not df.empty:
-                     any_data_found = True
-                     header = f"\n=== Sheet: {sheet_name} ===\n"
-                     csv_like = df.fillna("").astype(str).to_csv(index=False)
-                     pandas_parts.append(header + csv_like)
-                     extracted_sheets += 1
-                     if preview_text is None:
-                         flat = "".join(csv_like.splitlines())
-                         if flat:
-                             preview_text = flat[:15]
-                 else:
-                     pandas_parts.append(f"\n=== Sheet: {sheet_name} ===\n[No data in this sheet]")
-             if pandas_parts:
-                 all_sheets_content = pandas_parts
-                 print(f"✅ XLSX: Pandas fallback extracted {extracted_sheets} non-empty sheet(s)")
-         except Exception as pe:
-             # If pandas also fails, keep whatever we had
-             all_sheets_content.append(f"[Pandas fallback failed: {str(pe)}]")
-             print(f"❌ XLSX: Pandas fallback failed: {pe}")
-
-     combined = "\n\n".join(all_sheets_content)
-
-     # Print a small preview for verification
-     if preview_text is None:
-         # fallback: take from combined text
-         flat_combined = "".join(combined.splitlines()).strip()
-         if flat_combined:
-             preview_text = flat_combined[:15]
-     if preview_text:
-         print(f"🔎 XLSX content preview: {preview_text}")
-
-     return combined
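Finally, a minimal, hedged call of the helper above. The workbook path and the Windows-style tesseract path are placeholders, and the import path reflects the file's pre-deletion location; omit tesseract_cmd when the binary is already on PATH.

from preprocessing.preprocessing_modules.xlsx_extractor import extract_xlsx  # pre-deletion module path

text = extract_xlsx(
    "data/quarterly_report.xlsx",                                   # placeholder workbook
    tesseract_cmd=r"C:\Program Files\Tesseract-OCR\tesseract.exe",  # only needed if tesseract is not on PATH
)
print(text[:500])  # sheets are delimited by "=== Sheet: <name> ===" headers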