File size: 6,429 Bytes
68a1536 4920c28 68a1536 4920c28 68a1536 4920c28 68a1536 4920c28 68a1536 4920c28 68a1536 ccfe454 4920c28 b1c9c7e 4920c28 68a1536 4920c28 ccfe454 4920c28 ccfe454 4920c28 ccfe454 4920c28 68a1536 4920c28 68a1536 4920c28 68a1536 4920c28 cf6524f b1c9c7e 4920c28 68a1536 cf6524f 4920c28 1b7cafd cf6524f 939af78 1b7cafd 4edeecd cf6524f 939af78 4920c28 939af78 cf6524f d3d2c50 939af78 d3d2c50 cf6524f 4920c28 939af78 d3d2c50 4920c28 939af78 4920c28 939af78 b1c9c7e 939af78 4920c28 cf6524f 4920c28 939af78 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
# services/pdf_service.py
from pathlib import Path
from typing import List, Dict, Any, Optional
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import numpy as np
import asyncio
from concurrent.futures import ThreadPoolExecutor
import logging
from datetime import datetime
from config.config import settings
logger = logging.getLogger(__name__)
class PDFService:
    """Index PDF documents into a FAISS vector store and serve similarity search.

    Depends on an injected ``model_service`` for the sentence embedder and on
    ``settings`` for chunking parameters and the default PDF folder.
    """

    def __init__(self, model_service):
        # Shared embedding model; must provide .encode() returning a tensor.
        self.embedder = model_service.embedder
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=settings.CHUNK_SIZE,
            chunk_overlap=settings.CHUNK_OVERLAP,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        )
        self.index = None        # faiss.IndexFlatL2, built lazily by index_pdfs()
        self.chunks = []         # chunk dicts, positionally parallel to the FAISS vectors
        self.last_update = None  # datetime of the last successful indexing run
        self.pdf_metadata = {}   # per-file PDF metadata keyed by filename

    def process_pdf(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Extract text from one PDF and split it into annotated chunks.

        Args:
            pdf_path: Path to the PDF file to parse.

        Returns:
            A list of dicts with keys ``text``, ``source``, ``page``,
            ``chunk_index``, ``metadata``, ``timestamp``; an empty list on
            any parsing error (errors are logged, never raised).
        """
        try:
            reader = PdfReader(str(pdf_path))
            # PyPDF2 returns None for PDFs without a document info dictionary;
            # the original code would raise AttributeError here and lose the file.
            raw_meta = reader.metadata or {}
            metadata = {
                'title': raw_meta.get('/Title', ''),
                'author': raw_meta.get('/Author', ''),
                'creation_date': raw_meta.get('/CreationDate', ''),
                'pages': len(reader.pages),
                'filename': pdf_path.name,
            }
            self.pdf_metadata[pdf_path.name] = metadata

            chunks: List[Dict[str, Any]] = []
            for page_num, page in enumerate(reader.pages):
                text = page.extract_text()
                if not text:
                    continue  # image-only or empty page
                for i, chunk in enumerate(self.text_splitter.split_text(text)):
                    chunks.append({
                        'text': chunk,
                        'source': pdf_path.name,
                        'page': page_num + 1,  # 1-based page numbers for display
                        'chunk_index': i,
                        'metadata': metadata,
                        'timestamp': datetime.now().isoformat(),
                    })
            # Debug prints replaced by lazy logger call (no chunk dump at INFO).
            logger.debug("Extracted %d chunks from %s", len(chunks), pdf_path.name)
            return chunks
        except Exception as e:
            logger.error(f"Error processing PDF {pdf_path}: {e}")
            return []

    async def index_pdfs(self, pdf_folder: Path = settings.PDF_FOLDER) -> None:
        """Index all ``*.pdf`` files in ``pdf_folder`` into a fresh FAISS index.

        Parses PDFs concurrently on a thread pool, embeds all chunk texts,
        and replaces ``self.index`` / ``self.chunks``. Logs and returns early
        when the folder is empty or no text could be extracted; re-raises
        unexpected errors.
        """
        try:
            pdf_files = list(pdf_folder.glob('*.pdf'))
            if not pdf_files:
                logger.warning(f"No PDF files found in {pdf_folder}")
                return

            loop = asyncio.get_running_loop()
            # One executor task per file so parsing actually runs in parallel.
            # (The original submitted a single task that looped sequentially,
            # so the thread pool provided no concurrency.)
            with ThreadPoolExecutor() as executor:
                chunk_lists = await asyncio.gather(*[
                    loop.run_in_executor(executor, self.process_pdf, pdf_file)
                    for pdf_file in pdf_files
                ])

            # gather() preserves input order, so chunk order is deterministic.
            self.chunks = []
            for chunk_list in chunk_lists:
                self.chunks.extend(chunk_list)
            if not self.chunks:
                logger.warning("No text chunks extracted from PDFs")
                return

            # Embed all chunk texts off the event loop (encode() is CPU/GPU heavy).
            texts = [chunk['text'] for chunk in self.chunks]
            embeddings = await loop.run_in_executor(
                None,
                lambda: self.embedder.encode(
                    texts,
                    convert_to_tensor=True,
                    show_progress_bar=True,
                ).cpu().detach().numpy(),
            )

            dimension = embeddings.shape[1]
            self.index = faiss.IndexFlatL2(dimension)
            self.index.add(embeddings)
            self.last_update = datetime.now()
            logger.info(f"Indexed {len(self.chunks)} chunks from {len(pdf_files)} PDFs")
        except Exception as e:
            logger.error(f"Error indexing PDFs: {e}")
            raise

    async def search(
        self,
        query: str,
        top_k: int = 5,
        min_score: float = 0.5,
    ) -> List[Dict[str, Any]]:
        """Return up to ``top_k`` indexed chunks most similar to ``query``.

        Lazily (re)builds the index on first use. Results whose similarity
        score falls below ``min_score`` are dropped; each returned chunk dict
        carries an added ``score`` key.

        Raises:
            Exception: re-raised after logging on any search failure.
        """
        if not self.index or not self.chunks:
            await self.index_pdfs()
        if not self.index or not self.chunks:
            # Indexing produced nothing (empty folder / no extractable text):
            # the original would crash on self.index.search(None, ...).
            return []
        try:
            query_embedding = self.embedder.encode([query], convert_to_tensor=True)
            query_embedding_np = query_embedding.cpu().detach().numpy()

            distances, indices = self.index.search(query_embedding_np, top_k)
            logger.debug("search distances=%s indices=%s", distances, indices)

            results = []
            for i, idx in enumerate(indices[0]):
                # FAISS pads with -1 when fewer than top_k vectors exist;
                # the original bound check missed this (chunks[-1] aliasing).
                if idx < 0 or idx >= len(self.chunks):
                    continue
                # NOTE(review): 1 - L2 distance is only a rough similarity and
                # assumes normalized embeddings — confirm against the embedder.
                score = 1 - distances[0][i]
                if score < min_score:
                    continue  # reinstated: min_score was silently ignored before
                chunk = self.chunks[idx].copy()
                chunk['score'] = score
                results.append(chunk)

            # L2 ordering already implies score ordering, but sort defensively.
            results.sort(key=lambda x: x['score'], reverse=True)
            return results[:top_k]
        except Exception as e:
            logger.error(f"Error searching PDFs: {e}")
            raise
|