# services/pdf_service.py
from pathlib import Path
from typing import List, Dict, Any, Optional
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import numpy as np
import asyncio
from concurrent.futures import ThreadPoolExecutor
import logging
from datetime import datetime

from config.config import settings

logger = logging.getLogger(__name__)


class PDFService:
    def __init__(self, model_service):
        self.embedder = model_service.embedder
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=settings.CHUNK_SIZE,
            chunk_overlap=settings.CHUNK_OVERLAP,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        self.index = None        # FAISS index over chunk embeddings
        self.chunks = []         # chunk dicts, aligned with index rows
        self.last_update = None
        self.pdf_metadata = {}   # per-file metadata keyed by filename

    def process_pdf(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Process a single PDF file into text chunks (runs in a worker thread)."""
        try:
            reader = PdfReader(str(pdf_path))
            chunks = []

            # Extract document-level metadata; reader.metadata can be None
            info = reader.metadata or {}
            metadata = {
                'title': info.get('/Title', ''),
                'author': info.get('/Author', ''),
                'creation_date': info.get('/CreationDate', ''),
                'pages': len(reader.pages),
                'filename': pdf_path.name
            }
            self.pdf_metadata[pdf_path.name] = metadata

            # Process each page
            for page_num, page in enumerate(reader.pages):
                text = page.extract_text()
                if not text:
                    continue

                page_chunks = self.text_splitter.split_text(text)
                for i, chunk in enumerate(page_chunks):
                    chunks.append({
                        'text': chunk,
                        'source': pdf_path.name,
                        'page': page_num + 1,
                        'chunk_index': i,
                        'metadata': metadata,
                        'timestamp': datetime.now().isoformat()
                    })

            return chunks

        except Exception as e:
            logger.error(f"Error processing PDF {pdf_path}: {e}")
            return []

    async def index_pdfs(self, pdf_folder: Path = settings.PDF_FOLDER) -> None:
        """Index all PDFs in the specified folder"""
        try:
            pdf_files = list(pdf_folder.glob('*.pdf'))
            if not pdf_files:
                logger.warning(f"No PDF files found in {pdf_folder}")
                return

            # Process PDFs in parallel: parsing is blocking, so run each file
            # in a thread pool rather than awaiting it on the event loop
            loop = asyncio.get_running_loop()
            with ThreadPoolExecutor() as executor:
                tasks = [
                    loop.run_in_executor(executor, self.process_pdf, pdf_file)
                    for pdf_file in pdf_files
                ]
                chunk_lists = await asyncio.gather(*tasks)

            # Combine all chunks
            self.chunks = []
            for chunk_list in chunk_lists:
                self.chunks.extend(chunk_list)

            if not self.chunks:
                logger.warning("No text could be extracted from the PDFs")
                return

            # Create FAISS index
            texts = [chunk['text'] for chunk in self.chunks]
            embeddings = self.embedder.encode(
                texts,
                convert_to_tensor=True,
                show_progress_bar=True
            ).cpu().detach().numpy()
            embeddings = np.ascontiguousarray(embeddings, dtype='float32')  # FAISS expects float32

            dimension = embeddings.shape[1]
            self.index = faiss.IndexFlatL2(dimension)
            self.index.add(embeddings)

            self.last_update = datetime.now()
            logger.info(f"Indexed {len(self.chunks)} chunks from {len(pdf_files)} PDFs")

        except Exception as e:
            logger.error(f"Error indexing PDFs: {e}")
            raise

    async def search(
        self,
        query: str,
        top_k: int = 5,
        min_score: float = 0.5
    ) -> List[Dict[str, Any]]:
        """Search indexed PDFs"""
        if not self.index or not self.chunks:
            await self.index_pdfs()
        if not self.index or not self.chunks:
            # Indexing produced nothing to search
            return []

        try:
            # Get query embedding
            query_embedding = self.embedder.encode(
                [query],
                convert_to_tensor=True
            ).cpu().detach().numpy()
            query_embedding = np.ascontiguousarray(query_embedding, dtype='float32')

            # Search; fetch extra results so filtering cannot empty the list
            distances, indices = self.index.search(query_embedding, top_k * 2)

            # Process results; `min_score` acts as a maximum L2 distance here,
            # since smaller distances mean better matches
            results = []
            for i, idx in enumerate(indices[0]):
                if idx < 0 or idx >= len(self.chunks) or distances[0][i] > min_score:
                    continue

                chunk = self.chunks[idx].copy()
                chunk['score'] = float(1 - distances[0][i])  # Convert distance to similarity score
                results.append(chunk)

            # Sort by score and take top_k
            results.sort(key=lambda x: x['score'], reverse=True)
            return results[:top_k]

        except Exception as e:
            logger.error(f"Error searching PDFs: {e}")
            raise
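

# Example usage (a minimal sketch, not part of the service itself): it assumes
# the object passed as `model_service` exposes a sentence-transformers model as
# `.embedder`, and that `settings.PDF_FOLDER` points at a folder of PDFs. The
# model name, helper class, and query below are illustrative placeholders.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    class _DemoModelService:
        """Hypothetical stand-in for the real model service."""
        def __init__(self):
            self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

    async def _demo():
        service = PDFService(_DemoModelService())
        await service.index_pdfs()
        for hit in await service.search("example query", top_k=3):
            print(hit["source"], hit["page"], round(hit["score"], 3))

    asyncio.run(_demo())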