# services/pdf_service.py
from pathlib import Path
from typing import List, Dict, Any

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import numpy as np
import asyncio
import logging
from datetime import datetime

from config.config import settings

logger = logging.getLogger(__name__)

class PDFService:
    def __init__(self, model_service):
        self.embedder = model_service.embedder
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=settings.CHUNK_SIZE,
            chunk_overlap=settings.CHUNK_OVERLAP,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        self.index = None       # FAISS index, built lazily
        self.chunks = []        # chunk dicts, aligned with index rows
        self.last_update = None
        self.pdf_metadata = {}

    async def process_pdf(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Process a single PDF file into text chunks with metadata."""
        try:
            reader = PdfReader(str(pdf_path))
            chunks = []

            # Extract document-level metadata (reader.metadata may be None)
            info = reader.metadata or {}
            metadata = {
                'title': info.get('/Title', ''),
                'author': info.get('/Author', ''),
                'creation_date': info.get('/CreationDate', ''),
                'pages': len(reader.pages),
                'filename': pdf_path.name
            }
            self.pdf_metadata[pdf_path.name] = metadata

            # Split each page's text into overlapping chunks
            for page_num, page in enumerate(reader.pages):
                text = page.extract_text()
                if not text:
                    continue
                page_chunks = self.text_splitter.split_text(text)
                for i, chunk in enumerate(page_chunks):
                    chunks.append({
                        'text': chunk,
                        'source': pdf_path.name,
                        'page': page_num + 1,
                        'chunk_index': i,
                        'metadata': metadata,
                        'timestamp': datetime.now().isoformat()
                    })
            return chunks
        except Exception as e:
            logger.error(f"Error processing PDF {pdf_path}: {e}")
            return []

    async def index_pdfs(self, pdf_folder: Path = settings.PDF_FOLDER) -> None:
        """Index all PDFs in the specified folder."""
        try:
            pdf_files = list(pdf_folder.glob('*.pdf'))
            if not pdf_files:
                logger.warning(f"No PDF files found in {pdf_folder}")
                return

            # Schedule one task per file and gather the results.
            # (ThreadPoolExecutor is not an async context manager, and
            # process_pdf is already a coroutine, so plain gather suffices.)
            tasks = [
                asyncio.create_task(self.process_pdf(pdf_file))
                for pdf_file in pdf_files
            ]
            chunk_lists = await asyncio.gather(*tasks)

            # Combine all chunks
            self.chunks = []
            for chunk_list in chunk_lists:
                self.chunks.extend(chunk_list)
            if not self.chunks:
                logger.warning("No text could be extracted from the PDFs")
                return

            # Embed every chunk and build a FAISS index over the vectors
            texts = [chunk['text'] for chunk in self.chunks]
            embeddings = self.embedder.encode(
                texts,
                convert_to_tensor=True,
                show_progress_bar=True
            ).cpu().detach().numpy()
            # FAISS expects contiguous float32 input
            embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

            dimension = embeddings.shape[1]
            self.index = faiss.IndexFlatL2(dimension)
            self.index.add(embeddings)
            self.last_update = datetime.now()
            logger.info(f"Indexed {len(self.chunks)} chunks from {len(pdf_files)} PDFs")
        except Exception as e:
            logger.error(f"Error indexing PDFs: {e}")
            raise
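
    # Note: IndexFlatL2 ranks by L2 distance, so the `1 - distance` score
    # used in search() below is only a rough similarity proxy. A common
    # alternative (a sketch, not part of this service) is cosine similarity
    # via normalized embeddings and an inner-product index:
    #
    #   faiss.normalize_L2(embeddings)         # in-place, rows -> unit norm
    #   index = faiss.IndexFlatIP(dimension)   # inner product == cosine here
    #   index.add(embeddings)
    #   # ...and normalize the query vector the same way before searching,
    #   # so scores land in [-1, 1] and min_score has a clear meaning.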

    async def search(
        self,
        query: str,
        top_k: int = 5,
        min_score: float = 0.5
    ) -> List[Dict[str, Any]]:
        """Search indexed PDFs for chunks relevant to the query."""
        if not self.index or not self.chunks:
            await self.index_pdfs()
        if not self.index:
            return []
        try:
            # Embed the query with the same model used for indexing
            query_embedding = self.embedder.encode(
                [query],
                convert_to_tensor=True
            ).cpu().detach().numpy()
            query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)

            # Fetch extra results so filtering still leaves enough candidates
            distances, indices = self.index.search(query_embedding, top_k * 2)

            # Convert L2 distance to a similarity-style score, then filter,
            # so the threshold and the reported score use the same scale
            results = []
            for dist, idx in zip(distances[0], indices[0]):
                if idx < 0:  # FAISS pads missing results with -1
                    continue
                score = float(1 - dist)
                if score < min_score:
                    continue
                chunk = self.chunks[idx].copy()
                chunk['score'] = score
                results.append(chunk)

            # Sort by score and take the top_k
            results.sort(key=lambda x: x['score'], reverse=True)
            return results[:top_k]
        except Exception as e:
            logger.error(f"Error searching PDFs: {e}")
            raise
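
# Example usage (a minimal sketch): `model_service` is assumed to be this
# project's service exposing a sentence-transformers model as `.embedder`;
# its construction is not shown here.
#
#   import asyncio
#
#   async def main():
#       service = PDFService(model_service)
#       await service.index_pdfs()            # reads settings.PDF_FOLDER
#       hits = await service.search("refund policy", top_k=3)
#       for hit in hits:
#           print(f"{hit['source']} p.{hit['page']} score={hit['score']:.2f}")
#
#   asyncio.run(main())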