# services/pdf_service.py
from pathlib import Path
from typing import List, Dict, Any
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss  # used to build the similarity index in search_pdfs
import asyncio
from concurrent.futures import ThreadPoolExecutor
import logging
from config.config import settings

logger = logging.getLogger(__name__)

class PDFService:
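    """Index PDFs from a folder into text chunks and serve FAISS similarity search over them."""
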
    def __init__(self, model_service):
        self.embedder = model_service.embedder
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=settings.CHUNK_SIZE,
            chunk_overlap=settings.CHUNK_OVERLAP
        )
        self.pdf_chunks = []
        self.faiss_index = None

    async def index_pdfs(self, pdf_folder: Path = settings.PDF_FOLDER) -> List[Dict[str, Any]]:
        all_texts = []
        
        def process_pdf(pdf_file: Path) -> List[Dict[str, Any]]:
            # PdfReader and text splitting are blocking, so run this in a worker thread.
            try:
                reader = PdfReader(str(pdf_file))
                metadata = reader.metadata
                # Extract each page once and skip pages with no extractable text.
                page_texts = [page.extract_text() or "" for page in reader.pages]
                full_text = " ".join(text for text in page_texts if text.strip())
                chunks = self.text_splitter.split_text(full_text)
                return [{
                    'text': chunk,
                    'source': pdf_file.name,
                    'metadata': metadata,
                    'chunk_index': i
                } for i, chunk in enumerate(chunks)]
            except Exception as e:
                logger.error(f"Error processing PDF {pdf_file}: {e}")
                return []

        pdf_files = [f for f in pdf_folder.iterdir() if f.suffix.lower() == ".pdf"]

        # ThreadPoolExecutor is not an async context manager; dispatch the blocking
        # parsing onto it explicitly and await the resulting futures.
        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor() as executor:
            tasks = [
                loop.run_in_executor(executor, process_pdf, pdf_file)
                for pdf_file in pdf_files
            ]
            results = await asyncio.gather(*tasks)
            
        for result in results:
            all_texts.extend(result)
            
        self.pdf_chunks = all_texts
        self.faiss_index = None  # chunks changed, so force a rebuild on the next search
        return all_texts

    async def search_pdfs(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        if not self.pdf_chunks:
            await self.index_pdfs()

        query_embedding = self.embedder.encode([query], convert_to_tensor=True).cpu().detach().numpy()
        
        # Build the FAISS index lazily on the first search
        if self.faiss_index is None:
            chunk_embeddings = self.embedder.encode(
                [chunk['text'] for chunk in self.pdf_chunks],
                convert_to_tensor=True
            ).cpu().detach().numpy()
            
            d = chunk_embeddings.shape[1]
            self.faiss_index = faiss.IndexFlatL2(d)
            self.faiss_index.add(chunk_embeddings)

        distances, indices = self.faiss_index.search(query_embedding, top_k)
        
        results = []
        for i, idx in enumerate(indices[0]):
            if idx == -1:  # FAISS pads with -1 when fewer than top_k vectors are indexed
                continue
            chunk = self.pdf_chunks[idx].copy()
            chunk['score'] = float(distances[0][i])
            results.append(chunk)
            
        return results
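

# --- Usage sketch (illustrative only) ----------------------------------------
# A minimal example of wiring the service up, assuming `model_service` is any
# object that exposes a SentenceTransformer instance as `.embedder`; the real
# ModelService class and the values in config/config.py are not shown in this
# file, so treat the names below as placeholders.
if __name__ == "__main__":
    from types import SimpleNamespace
    from sentence_transformers import SentenceTransformer

    async def _demo():
        # Hypothetical stand-in for the project's ModelService.
        model_service = SimpleNamespace(embedder=SentenceTransformer("all-MiniLM-L6-v2"))
        service = PDFService(model_service)
        await service.index_pdfs()  # reads PDFs from settings.PDF_FOLDER
        hits = await service.search_pdfs("payment terms", top_k=3)
        for hit in hits:
            print(f"{hit['source']} [chunk {hit['chunk_index']}] score={hit['score']:.3f}")

    asyncio.run(_demo())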