# services/pdf_service.py
from pathlib import Path
from typing import List, Dict, Any, Optional
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import numpy as np
import asyncio
from concurrent.futures import ThreadPoolExecutor
import logging
from datetime import datetime
from config.config import settings

logger = logging.getLogger(__name__)

class PDFService:
    def __init__(self, model_service):
        self.embedder = model_service.embedder
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=settings.CHUNK_SIZE,
            chunk_overlap=settings.CHUNK_OVERLAP,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        self.index = None
        self.chunks = []
        self.last_update = None
        self.pdf_metadata = {}

    def process_pdf(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Process a single PDF file - now synchronous"""
        try:
            reader = PdfReader(str(pdf_path))
            chunks = []
            
            # Extract metadata (reader.metadata can be None for PDFs without an info dict)
            info = reader.metadata or {}
            metadata = {
                'title': info.get('/Title', ''),
                'author': info.get('/Author', ''),
                'creation_date': info.get('/CreationDate', ''),
                'pages': len(reader.pages),
                'filename': pdf_path.name
            }
            self.pdf_metadata[pdf_path.name] = metadata
            
            # Process each page
            for page_num, page in enumerate(reader.pages):
                text = page.extract_text()
                if not text:
                    continue
                    
                page_chunks = self.text_splitter.split_text(text)
                for i, chunk in enumerate(page_chunks):
                    chunks.append({
                        'text': chunk,
                        'source': pdf_path.name,
                        'page': page_num + 1,
                        'chunk_index': i,
                        'metadata': metadata,
                        'timestamp': datetime.now().isoformat()
                    })
            print("--------------------------- chunks ----------------------------------")
            print("--------------------------- chunks ----------------------------------")
            print(chunks)         
            return chunks
            
        except Exception as e:
            logger.error(f"Error processing PDF {pdf_path}: {e}")
            return []

    async def index_pdfs(self, pdf_folder: Optional[Path] = None) -> None:
        """Index all PDFs in the specified folder (defaults to settings.PDF_FOLDER)"""
        try:
            # Resolve the default at call time rather than at import time
            if pdf_folder is None:
                pdf_folder = settings.PDF_FOLDER
            pdf_files = list(pdf_folder.glob('*.pdf'))
            if not pdf_files:
                logger.warning(f"No PDF files found in {pdf_folder}")
                return
            
            # Process PDFs concurrently: one executor task per file, so the
            # thread pool actually parallelizes the work instead of running
            # every file sequentially inside a single worker
            loop = asyncio.get_running_loop()
            with ThreadPoolExecutor() as executor:
                chunk_lists = await asyncio.gather(*[
                    loop.run_in_executor(executor, self.process_pdf, pdf_file)
                    for pdf_file in pdf_files
                ])
            
            # Combine all chunks
            self.chunks = []
            for chunk_list in chunk_lists:
                self.chunks.extend(chunk_list)
            
            if not self.chunks:
                logger.warning("No text chunks extracted from PDFs")
                return
                
            # Create FAISS index. Embeddings are L2-normalized so that inner
            # product (IndexFlatIP) equals cosine similarity, which gives
            # search() scores in [-1, 1] that the min_score threshold can
            # filter meaningfully.
            texts = [chunk['text'] for chunk in self.chunks]
            embeddings = await loop.run_in_executor(
                None,
                lambda: self.embedder.encode(
                    texts,
                    convert_to_numpy=True,
                    normalize_embeddings=True,
                    show_progress_bar=True
                )
            )
            
            dimension = embeddings.shape[1]
            self.index = faiss.IndexFlatIP(dimension)
            self.index.add(embeddings.astype(np.float32))
            
            self.last_update = datetime.now()
            
            logger.info(f"Indexed {len(self.chunks)} chunks from {len(pdf_files)} PDFs")
            
        except Exception as e:
            logger.error(f"Error indexing PDFs: {e}")
            raise

    async def search(
        self,
        query: str,
        top_k: int = 5,
        min_score: float = 0.5
    ) -> List[Dict[str, Any]]:
        """Search indexed PDFs"""
        print("--------------------------- query ----------------------------------")
        print(query)
        if not self.index or not self.chunks:
            await self.index_pdfs()
        
        try:
            # Embed the query the same way the chunks were embedded
            loop = asyncio.get_running_loop()
            query_embedding = await loop.run_in_executor(
                None,
                lambda: self.embedder.encode(
                    [query],
                    convert_to_numpy=True,
                    normalize_embeddings=True
                )
            )
            
            # Search; over-fetch so filtering still leaves enough candidates.
            # With IndexFlatIP and normalized vectors the returned scores are
            # cosine similarities.
            scores, indices = self.index.search(
                query_embedding.astype(np.float32), top_k * 2
            )
            
            # Process results; FAISS pads indices with -1 when fewer than
            # top_k * 2 vectors are in the index, so skip invalid slots
            results = []
            for i, idx in enumerate(indices[0]):
                if idx < 0 or idx >= len(self.chunks) or scores[0][i] < min_score:
                    continue
                    
                chunk = self.chunks[idx].copy()
                chunk['score'] = float(scores[0][i])
                results.append(chunk)
            
            # Sort by score and take top_k
            results.sort(key=lambda x: x['score'], reverse=True)
            return results[:top_k]
            
        except Exception as e:
            logger.error(f"Error searching PDFs: {e}")
            raise
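

if __name__ == "__main__":
    # Usage sketch (illustrative only): wires PDFService to a
    # SentenceTransformer embedder and runs one search end to end. The
    # SimpleModelService stub, the model name, and the sample query are
    # assumptions for demonstration; the real application supplies its own
    # model_service, and settings.PDF_FOLDER must point at a folder of PDFs.
    from sentence_transformers import SentenceTransformer

    class SimpleModelService:
        """Hypothetical stand-in exposing the .embedder attribute PDFService expects."""
        def __init__(self):
            self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

    async def _demo():
        service = PDFService(SimpleModelService())
        await service.index_pdfs()
        results = await service.search("What does the report conclude?", top_k=3)
        for r in results:
            print(f"{r['source']} p.{r['page']} score={r['score']:.2f}: {r['text'][:80]}")

    asyncio.run(_demo())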