Chris4K committed · verified
Commit 4920c28 · 1 parent: fd8d102

Create services/pdf_service.py

Files changed (1)
  1. services/pdf_service.py  +117 -55
services/pdf_service.py CHANGED

@@ -1,11 +1,14 @@
 # services/pdf_service.py
 from pathlib import Path
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Optional
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+import faiss
+import numpy as np
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 import logging
+from datetime import datetime
 from config.config import settings
 
 logger = logging.getLogger(__name__)
@@ -15,70 +18,129 @@ class PDFService:
         self.embedder = model_service.embedder
         self.text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=settings.CHUNK_SIZE,
-            chunk_overlap=settings.CHUNK_OVERLAP
+            chunk_overlap=settings.CHUNK_OVERLAP,
+            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
         )
-        self.pdf_chunks = []
-        self.faiss_index = None
+        self.index = None
+        self.chunks = []
+        self.last_update = None
+        self.pdf_metadata = {}
 
-    async def index_pdfs(self, pdf_folder: Path = settings.PDF_FOLDER) -> List[Dict[str, Any]]:
-        all_texts = []
-
-        async def process_pdf(pdf_file: Path) -> List[Dict[str, Any]]:
-            try:
-                reader = PdfReader(str(pdf_file))
-                metadata = reader.metadata
-                full_text = " ".join([
-                    page.extract_text()
-                    for page in reader.pages
-                    if page.extract_text()
-                ])
-                chunks = self.text_splitter.split_text(full_text)
-                return [{
-                    'text': chunk,
-                    'source': pdf_file.name,
-                    'metadata': metadata,
-                    'chunk_index': i
-                } for i, chunk in enumerate(chunks)]
-            except Exception as e:
-                logger.error(f"Error processing PDF {pdf_file}: {e}")
-                return []
+    async def process_pdf(self, pdf_path: Path) -> List[Dict[str, Any]]:
+        """Process a single PDF file"""
+        try:
+            reader = PdfReader(str(pdf_path))
+            chunks = []
+
+            # Extract metadata
+            metadata = {
+                'title': reader.metadata.get('/Title', ''),
+                'author': reader.metadata.get('/Author', ''),
+                'creation_date': reader.metadata.get('/CreationDate', ''),
+                'pages': len(reader.pages),
+                'filename': pdf_path.name
+            }
+            self.pdf_metadata[pdf_path.name] = metadata
+
+            # Process each page
+            for page_num, page in enumerate(reader.pages):
+                text = page.extract_text()
+                if not text:
+                    continue
+
+                page_chunks = self.text_splitter.split_text(text)
+                for i, chunk in enumerate(page_chunks):
+                    chunks.append({
+                        'text': chunk,
+                        'source': pdf_path.name,
+                        'page': page_num + 1,
+                        'chunk_index': i,
+                        'metadata': metadata,
+                        'timestamp': datetime.now().isoformat()
+                    })
+
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Error processing PDF {pdf_path}: {e}")
+            return []
 
-        pdf_files = [f for f in pdf_folder.iterdir() if f.suffix.lower() == ".pdf"]
-
-        async with ThreadPoolExecutor() as executor:
-            tasks = [process_pdf(pdf_file) for pdf_file in pdf_files]
-            results = await asyncio.gather(*tasks)
+    async def index_pdfs(self, pdf_folder: Path = settings.PDF_FOLDER) -> None:
+        """Index all PDFs in the specified folder"""
+        try:
+            pdf_files = list(pdf_folder.glob('*.pdf'))
+            if not pdf_files:
+                logger.warning(f"No PDF files found in {pdf_folder}")
+                return
+
+            # Process PDFs in parallel
+            async with ThreadPoolExecutor() as executor:
+                tasks = [
+                    asyncio.create_task(self.process_pdf(pdf_file))
+                    for pdf_file in pdf_files
+                ]
+                chunk_lists = await asyncio.gather(*tasks)
+
+            # Combine all chunks
+            self.chunks = []
+            for chunk_list in chunk_lists:
+                self.chunks.extend(chunk_list)
+
+            # Create FAISS index
+            texts = [chunk['text'] for chunk in self.chunks]
+            embeddings = self.embedder.encode(
+                texts,
+                convert_to_tensor=True,
+                show_progress_bar=True
+            ).cpu().detach().numpy()
+
+            dimension = embeddings.shape[1]
+            self.index = faiss.IndexFlatL2(dimension)
+            self.index.add(embeddings)
 
-        for result in results:
-            all_texts.extend(result)
+            self.last_update = datetime.now()
 
-        self.pdf_chunks = all_texts
-        return all_texts
+            logger.info(f"Indexed {len(self.chunks)} chunks from {len(pdf_files)} PDFs")
+
+        except Exception as e:
+            logger.error(f"Error indexing PDFs: {e}")
+            raise
 
-    async def search_pdfs(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
-        if not self.pdf_chunks:
+    async def search(
+        self,
+        query: str,
+        top_k: int = 5,
+        min_score: float = 0.5
+    ) -> List[Dict[str, Any]]:
+        """Search indexed PDFs"""
+        if not self.index or not self.chunks:
             await self.index_pdfs()
-
-        query_embedding = self.embedder.encode([query], convert_to_tensor=True).cpu().detach().numpy()
 
-        # Create embeddings for chunks if not already done
-        if not self.faiss_index:
-            chunk_embeddings = self.embedder.encode(
-                [chunk['text'] for chunk in self.pdf_chunks],
+        try:
+            # Get query embedding
+            query_embedding = self.embedder.encode(
+                [query],
                 convert_to_tensor=True
            ).cpu().detach().numpy()
 
-            d = chunk_embeddings.shape[1]
-            self.faiss_index = faiss.IndexFlatL2(d)
-            self.faiss_index.add(chunk_embeddings)
-
-        distances, indices = self.faiss_index.search(query_embedding, top_k)
-
-        results = []
-        for i, idx in enumerate(indices[0]):
-            chunk = self.pdf_chunks[idx].copy()
-            chunk['score'] = float(distances[0][i])
-            results.append(chunk)
+            # Search
+            distances, indices = self.index.search(query_embedding, top_k * 2)  # Get extra results for filtering
+
+            # Process results
+            results = []
+            for i, idx in enumerate(indices[0]):
+                if idx >= len(self.chunks) or distances[0][i] > min_score:
+                    continue
+
+                chunk = self.chunks[idx].copy()
+                chunk['score'] = float(1 - distances[0][i])  # Convert distance to similarity score
+                results.append(chunk)
 
-        return results
+            # Sort by score and take top_k
+            results.sort(key=lambda x: x['score'], reverse=True)
+            return results[:top_k]
+
+        except Exception as e:
+            logger.error(f"Error searching PDFs: {e}")
+            raise
+
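
A note for anyone wiring this service up: ThreadPoolExecutor implements only the synchronous context-manager protocol, so "async with ThreadPoolExecutor() as executor:" raises a TypeError at runtime, and since process_pdf is a coroutine the executor would go unused anyway. Separately, faiss.IndexFlatL2 returns squared L2 distances, so comparing them against min_score and converting with 1 - distance only behaves like a similarity when the embeddings are unit-normalized, in which case d = 2 * (1 - cosine) and the similarity is 1 - d / 2. The sketch below shows one way to run the same index-and-search flow with those two points handled. It is a minimal stand-in, not the committed code; sentence-transformers is an assumed dependency, and "all-MiniLM-L6-v2" and ./pdfs are placeholders.

# sketch.py - minimal stand-in for PDFService.index_pdfs / search.
# Assumptions (not from the commit): sentence-transformers installed,
# "all-MiniLM-L6-v2" is a placeholder model, ./pdfs a placeholder folder.
import asyncio
from pathlib import Path
from typing import Any, Dict, List

import faiss
import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model

def parse_pdf(pdf_path: Path) -> List[Dict[str, Any]]:
    # Blocking PyPDF2 work; one chunk per non-empty page for brevity.
    reader = PdfReader(str(pdf_path))
    return [
        {"text": text, "source": pdf_path.name, "page": n + 1}
        for n, page in enumerate(reader.pages)
        if (text := page.extract_text())
    ]

async def index_folder(folder: Path):
    pdf_files = list(folder.glob("*.pdf"))
    # asyncio.to_thread hands the blocking parse to a worker thread, so no
    # executor context manager is needed; gather() fans the work out.
    chunk_lists = await asyncio.gather(
        *(asyncio.to_thread(parse_pdf, f) for f in pdf_files)
    )
    chunks = [c for cl in chunk_lists for c in cl]
    embeddings = np.ascontiguousarray(
        embedder.encode([c["text"] for c in chunks]), dtype="float32"
    )
    faiss.normalize_L2(embeddings)  # unit norm: L2 distance now tracks cosine
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks

async def search(index, chunks, query: str, top_k: int = 5):
    q = np.ascontiguousarray(embedder.encode([query]), dtype="float32")
    faiss.normalize_L2(q)
    distances, indices = index.search(q, top_k)
    # Squared L2 on unit vectors: d = 2 * (1 - cos), so cos = 1 - d / 2.
    return [
        {**chunks[i], "score": float(1 - d / 2)}
        for d, i in zip(distances[0], indices[0])
        if i != -1  # FAISS pads with -1 when fewer than top_k hits exist
    ]

if __name__ == "__main__":
    async def main():
        index, chunks = await index_folder(Path("./pdfs"))  # placeholder path
        for hit in await search(index, chunks, "example query"):
            print(f"{hit['score']:.3f}  {hit['source']} p.{hit['page']}")
    asyncio.run(main())

The same two adjustments port directly onto the class in this commit: drive process_pdf with a plain asyncio.gather (or push the parsing through asyncio.to_thread) instead of the "async with ThreadPoolExecutor()" block, and normalize embeddings before index.add so that min_score thresholds a quantity on a single, known scale.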