# services/pdf_service.py
import asyncio
import logging
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any, Dict, List, Optional

import faiss
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader

from config.config import settings
# Module-level logger named after this module, per stdlib logging convention.
logger = logging.getLogger(__name__)
class PDFService:
    """Index PDF files into chunk records and serve FAISS similarity search.

    PDFs are parsed with PyPDF2, split with the configured
    RecursiveCharacterTextSplitter, embedded with the shared sentence
    embedder, and searched via an L2 flat FAISS index built lazily on
    the first query.
    """

    def __init__(self, model_service):
        """Wire up the shared embedder and the text splitter.

        Args:
            model_service: project service exposing an ``embedder`` with a
                sentence-transformers-style ``encode()`` method.
        """
        self.embedder = model_service.embedder
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=settings.CHUNK_SIZE,
            chunk_overlap=settings.CHUNK_OVERLAP,
        )
        # Flat list of chunk records: {'text', 'source', 'metadata', 'chunk_index'}.
        self.pdf_chunks: List[Dict[str, Any]] = []
        # Built lazily on first search; None means "needs (re)build".
        self.faiss_index = None

    def _process_pdf(self, pdf_file: Path) -> List[Dict[str, Any]]:
        """Extract and chunk one PDF; synchronous on purpose.

        PyPDF2 is blocking, so this runs inside a worker thread (see
        index_pdfs). Returns [] on any parsing error so one corrupt file
        cannot abort a whole indexing run.
        """
        try:
            reader = PdfReader(str(pdf_file))
            metadata = reader.metadata
            # Extract each page exactly once — the original called
            # extract_text() twice per page (once to filter, once to join).
            page_texts = (page.extract_text() for page in reader.pages)
            full_text = " ".join(text for text in page_texts if text)
            chunks = self.text_splitter.split_text(full_text)
            return [{
                'text': chunk,
                'source': pdf_file.name,
                'metadata': metadata,
                'chunk_index': i,
            } for i, chunk in enumerate(chunks)]
        except Exception as e:
            logger.error(f"Error processing PDF {pdf_file}: {e}")
            return []

    async def index_pdfs(self, pdf_folder: Optional[Path] = None) -> List[Dict[str, Any]]:
        """Parse every *.pdf in ``pdf_folder`` into chunk records.

        Args:
            pdf_folder: directory to scan; defaults to settings.PDF_FOLDER
                (resolved at call time, not import time).

        Returns:
            The full list of chunk records, also stored on self.pdf_chunks.
        """
        if pdf_folder is None:
            pdf_folder = settings.PDF_FOLDER
        pdf_files = [f for f in pdf_folder.iterdir() if f.suffix.lower() == ".pdf"]

        # BUG FIX: ThreadPoolExecutor is a *synchronous* context manager;
        # ``async with`` raised AttributeError (__aenter__). Dispatch the
        # blocking PyPDF2 work to threads explicitly so the event loop
        # stays responsive while files are parsed in parallel.
        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor() as executor:
            results = await asyncio.gather(*[
                loop.run_in_executor(executor, self._process_pdf, pdf_file)
                for pdf_file in pdf_files
            ])

        all_texts: List[Dict[str, Any]] = []
        for result in results:
            all_texts.extend(result)
        self.pdf_chunks = all_texts
        # Invalidate any previously built index so the next search embeds
        # the new chunks instead of querying a stale index.
        self.faiss_index = None
        return all_texts

    async def search_pdfs(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Return up to ``top_k`` chunks most similar to ``query``.

        Lazily indexes the default PDF folder on first use and lazily
        builds the FAISS index from chunk embeddings. Each result is a
        copy of the chunk record with an added 'score' (L2 distance —
        lower is closer).
        """
        if not self.pdf_chunks:
            await self.index_pdfs()
        if not self.pdf_chunks:
            # Nothing indexed (empty folder / every PDF failed): no results
            # rather than crashing on encode([]) / shape[1] below.
            return []

        query_embedding = self.embedder.encode(
            [query], convert_to_tensor=True
        ).cpu().detach().numpy()

        # Build the vector index on first search after (re)indexing.
        if self.faiss_index is None:
            chunk_embeddings = self.embedder.encode(
                [chunk['text'] for chunk in self.pdf_chunks],
                convert_to_tensor=True,
            ).cpu().detach().numpy()
            d = chunk_embeddings.shape[1]
            self.faiss_index = faiss.IndexFlatL2(d)
            self.faiss_index.add(chunk_embeddings)

        distances, indices = self.faiss_index.search(query_embedding, top_k)
        results = []
        for rank, idx in enumerate(indices[0]):
            # FAISS pads with -1 when top_k exceeds the number of vectors;
            # a raw -1 lookup would silently return the wrong chunk.
            if idx < 0:
                continue
            chunk = self.pdf_chunks[idx].copy()
            chunk['score'] = float(distances[0][rank])
            results.append(chunk)
        return results