# services/pdf_service.py
from pathlib import Path
from typing import List, Dict, Any

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import numpy as np
import asyncio
import logging
from datetime import datetime

from config.config import settings

logger = logging.getLogger(__name__)

class PDFService:
    def __init__(self, model_service):
        self.embedder = model_service.embedder
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=settings.CHUNK_SIZE,
            chunk_overlap=settings.CHUNK_OVERLAP,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        self.index = None       # FAISS index, built lazily
        self.chunks = []        # chunk dicts, aligned with index rows
        self.last_update = None
        self.pdf_metadata = {}

    async def process_pdf(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Process a single PDF file into text chunks with metadata."""
        try:
            reader = PdfReader(str(pdf_path))
            chunks = []

            # Extract document-level metadata (reader.metadata may be None)
            info = reader.metadata or {}
            metadata = {
                'title': info.get('/Title', ''),
                'author': info.get('/Author', ''),
                'creation_date': info.get('/CreationDate', ''),
                'pages': len(reader.pages),
                'filename': pdf_path.name
            }
            self.pdf_metadata[pdf_path.name] = metadata

            # Split each page's text into overlapping chunks
            for page_num, page in enumerate(reader.pages):
                text = page.extract_text()
                if not text:
                    continue
                page_chunks = self.text_splitter.split_text(text)
                for i, chunk in enumerate(page_chunks):
                    chunks.append({
                        'text': chunk,
                        'source': pdf_path.name,
                        'page': page_num + 1,
                        'chunk_index': i,
                        'metadata': metadata,
                        'timestamp': datetime.now().isoformat()
                    })
            return chunks
        except Exception as e:
            logger.error(f"Error processing PDF {pdf_path}: {e}")
            return []

    async def index_pdfs(self, pdf_folder: Path = settings.PDF_FOLDER) -> None:
        """Index all PDFs in the specified folder."""
        try:
            pdf_files = list(pdf_folder.glob('*.pdf'))
            if not pdf_files:
                logger.warning(f"No PDF files found in {pdf_folder}")
                return

            # Schedule one task per file and gather the results.
            # (ThreadPoolExecutor is not an async context manager, and
            # process_pdf is already a coroutine, so plain gather suffices.)
            tasks = [
                asyncio.create_task(self.process_pdf(pdf_file))
                for pdf_file in pdf_files
            ]
            chunk_lists = await asyncio.gather(*tasks)

            # Combine all chunks
            self.chunks = []
            for chunk_list in chunk_lists:
                self.chunks.extend(chunk_list)
            if not self.chunks:
                logger.warning("No text could be extracted from the PDFs")
                return

            # Embed every chunk and build a FAISS index over the vectors
            texts = [chunk['text'] for chunk in self.chunks]
            embeddings = self.embedder.encode(
                texts,
                convert_to_tensor=True,
                show_progress_bar=True
            ).cpu().detach().numpy()
            # FAISS expects contiguous float32 input
            embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

            dimension = embeddings.shape[1]
            self.index = faiss.IndexFlatL2(dimension)
            self.index.add(embeddings)
            self.last_update = datetime.now()
            logger.info(f"Indexed {len(self.chunks)} chunks from {len(pdf_files)} PDFs")
        except Exception as e:
            logger.error(f"Error indexing PDFs: {e}")
            raise
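
    # Note: IndexFlatL2 ranks by L2 distance, so the `1 - distance` score
    # used in search() below is only a rough similarity proxy. A common
    # alternative (a sketch, not part of this service) is cosine similarity
    # via normalized embeddings and an inner-product index:
    #
    #   faiss.normalize_L2(embeddings)         # in-place, rows -> unit norm
    #   index = faiss.IndexFlatIP(dimension)   # inner product == cosine here
    #   index.add(embeddings)
    #   # ...and normalize the query vector the same way before searching,
    #   # so scores land in [-1, 1] and min_score has a clear meaning.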

    async def search(
        self,
        query: str,
        top_k: int = 5,
        min_score: float = 0.5
    ) -> List[Dict[str, Any]]:
        """Search indexed PDFs for chunks relevant to the query."""
        if not self.index or not self.chunks:
            await self.index_pdfs()
        if not self.index:
            return []
        try:
            # Embed the query with the same model used for indexing
            query_embedding = self.embedder.encode(
                [query],
                convert_to_tensor=True
            ).cpu().detach().numpy()
            query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)

            # Fetch extra results so filtering still leaves enough candidates
            distances, indices = self.index.search(query_embedding, top_k * 2)

            # Convert L2 distance to a similarity-style score, then filter,
            # so the threshold and the reported score use the same scale
            results = []
            for dist, idx in zip(distances[0], indices[0]):
                if idx < 0:  # FAISS pads missing results with -1
                    continue
                score = float(1 - dist)
                if score < min_score:
                    continue
                chunk = self.chunks[idx].copy()
                chunk['score'] = score
                results.append(chunk)

            # Sort by score and take the top_k
            results.sort(key=lambda x: x['score'], reverse=True)
            return results[:top_k]
        except Exception as e:
            logger.error(f"Error searching PDFs: {e}")
            raise
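
# Example usage (a minimal sketch): `model_service` is assumed to be this
# project's service exposing a sentence-transformers model as `.embedder`;
# its construction is not shown here.
#
#   import asyncio
#
#   async def main():
#       service = PDFService(model_service)
#       await service.index_pdfs()            # reads settings.PDF_FOLDER
#       hits = await service.search("refund policy", top_k=3)
#       for hit in hits:
#           print(f"{hit['source']} p.{hit['page']} score={hit['score']:.2f}")
#
#   asyncio.run(main())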