# services/pdf_service.py
import asyncio
import logging
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any, Dict, List, Optional

import faiss
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader

from config.config import settings
# Module-level logger named after this module, per stdlib logging convention.
logger = logging.getLogger(__name__)
class PDFService:
    """Index PDF files into chunk records and serve FAISS similarity search.

    PDFs are parsed with PyPDF2, split with the configured
    RecursiveCharacterTextSplitter, embedded with the shared sentence
    embedder, and searched via an L2 flat FAISS index built lazily on
    the first query.
    """

    def __init__(self, model_service):
        """Wire up the shared embedder and the text splitter.

        Args:
            model_service: project service exposing an ``embedder`` with a
                sentence-transformers-style ``encode()`` method.
        """
        self.embedder = model_service.embedder
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=settings.CHUNK_SIZE,
            chunk_overlap=settings.CHUNK_OVERLAP,
        )
        # Flat list of chunk records: {'text', 'source', 'metadata', 'chunk_index'}.
        self.pdf_chunks: List[Dict[str, Any]] = []
        # Built lazily on first search; None means "needs (re)build".
        self.faiss_index = None

    def _process_pdf(self, pdf_file: Path) -> List[Dict[str, Any]]:
        """Extract and chunk one PDF; synchronous on purpose.

        PyPDF2 is blocking, so this runs inside a worker thread (see
        index_pdfs). Returns [] on any parsing error so one corrupt file
        cannot abort a whole indexing run.
        """
        try:
            reader = PdfReader(str(pdf_file))
            metadata = reader.metadata
            # Extract each page exactly once — the original called
            # extract_text() twice per page (once to filter, once to join).
            page_texts = (page.extract_text() for page in reader.pages)
            full_text = " ".join(text for text in page_texts if text)
            chunks = self.text_splitter.split_text(full_text)
            return [{
                'text': chunk,
                'source': pdf_file.name,
                'metadata': metadata,
                'chunk_index': i,
            } for i, chunk in enumerate(chunks)]
        except Exception as e:
            logger.error(f"Error processing PDF {pdf_file}: {e}")
            return []

    async def index_pdfs(self, pdf_folder: Optional[Path] = None) -> List[Dict[str, Any]]:
        """Parse every *.pdf in ``pdf_folder`` into chunk records.

        Args:
            pdf_folder: directory to scan; defaults to settings.PDF_FOLDER
                (resolved at call time, not import time).

        Returns:
            The full list of chunk records, also stored on self.pdf_chunks.
        """
        if pdf_folder is None:
            pdf_folder = settings.PDF_FOLDER
        pdf_files = [f for f in pdf_folder.iterdir() if f.suffix.lower() == ".pdf"]

        # BUG FIX: ThreadPoolExecutor is a *synchronous* context manager;
        # ``async with`` raised AttributeError (__aenter__). Dispatch the
        # blocking PyPDF2 work to threads explicitly so the event loop
        # stays responsive while files are parsed in parallel.
        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor() as executor:
            results = await asyncio.gather(*[
                loop.run_in_executor(executor, self._process_pdf, pdf_file)
                for pdf_file in pdf_files
            ])

        all_texts: List[Dict[str, Any]] = []
        for result in results:
            all_texts.extend(result)
        self.pdf_chunks = all_texts
        # Invalidate any previously built index so the next search embeds
        # the new chunks instead of querying a stale index.
        self.faiss_index = None
        return all_texts

    async def search_pdfs(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Return up to ``top_k`` chunks most similar to ``query``.

        Lazily indexes the default PDF folder on first use and lazily
        builds the FAISS index from chunk embeddings. Each result is a
        copy of the chunk record with an added 'score' (L2 distance —
        lower is closer).
        """
        if not self.pdf_chunks:
            await self.index_pdfs()
        if not self.pdf_chunks:
            # Nothing indexed (empty folder / every PDF failed): no results
            # rather than crashing on encode([]) / shape[1] below.
            return []

        query_embedding = self.embedder.encode(
            [query], convert_to_tensor=True
        ).cpu().detach().numpy()

        # Build the vector index on first search after (re)indexing.
        if self.faiss_index is None:
            chunk_embeddings = self.embedder.encode(
                [chunk['text'] for chunk in self.pdf_chunks],
                convert_to_tensor=True,
            ).cpu().detach().numpy()
            d = chunk_embeddings.shape[1]
            self.faiss_index = faiss.IndexFlatL2(d)
            self.faiss_index.add(chunk_embeddings)

        distances, indices = self.faiss_index.search(query_embedding, top_k)
        results = []
        for rank, idx in enumerate(indices[0]):
            # FAISS pads with -1 when top_k exceeds the number of vectors;
            # a raw -1 lookup would silently return the wrong chunk.
            if idx < 0:
                continue
            chunk = self.pdf_chunks[idx].copy()
            chunk['score'] = float(distances[0][rank])
            results.append(chunk)
        return results