llm

Sleeping

App Files Community

Chris4K committed on Jan 13

Commit

ccfe454

verified ·

1 Parent(s): c0aba14

Update services/pdf_service.py

Browse files

Files changed (1) hide show

services/pdf_service.py +33 -23

services/pdf_service.py CHANGED Viewed

@@ -26,8 +26,8 @@ class PDFService:
         self.last_update = None
         self.pdf_metadata = {}
-    async def process_pdf(self, pdf_path: Path) -> List[Dict[str, Any]]:
-        """Process a single PDF file"""
         try:
             reader = PdfReader(str(pdf_path))
             chunks = []
@@ -73,26 +73,33 @@ class PDFService:
                 logger.warning(f"No PDF files found in {pdf_folder}")
                 return
-            # Process PDFs in parallel
-            async with ThreadPoolExecutor() as executor:
-                tasks = [
-                    asyncio.create_task(self.process_pdf(pdf_file))
-                    for pdf_file in pdf_files
-                ]
-                chunk_lists = await asyncio.gather(*tasks)
             # Combine all chunks
             self.chunks = []
             for chunk_list in chunk_lists:
                 self.chunks.extend(chunk_list)
             # Create FAISS index
             texts = [chunk['text'] for chunk in self.chunks]
-            embeddings = self.embedder.encode(
-                texts,
-                convert_to_tensor=True,
-                show_progress_bar=True
-            ).cpu().detach().numpy()
             dimension = embeddings.shape[1]
             self.index = faiss.IndexFlatL2(dimension)
@@ -117,14 +124,18 @@ class PDFService:
             await self.index_pdfs()
         try:
-            # Get query embedding
-            query_embedding = self.embedder.encode(
-                [query],
-                convert_to_tensor=True
-            ).cpu().detach().numpy()
             # Search
-            distances, indices = self.index.search(query_embedding, top_k * 2)  # Get extra results for filtering
             # Process results
             results = []
@@ -133,7 +144,7 @@ class PDFService:
                     continue
                 chunk = self.chunks[idx].copy()
-                chunk['score'] = float(1 - distances[0][i])  # Convert distance to similarity score
                 results.append(chunk)
             # Sort by score and take top_k
@@ -142,5 +153,4 @@ class PDFService:
         except Exception as e:
             logger.error(f"Error searching PDFs: {e}")
-            raise

         self.last_update = None
         self.pdf_metadata = {}
+    def process_pdf(self, pdf_path: Path) -> List[Dict[str, Any]]:
+        """Process a single PDF file - now synchronous"""
         try:
             reader = PdfReader(str(pdf_path))
             chunks = []
                 logger.warning(f"No PDF files found in {pdf_folder}")
                 return
+            # Process PDFs using thread pool
+            loop = asyncio.get_running_loop()
+            with ThreadPoolExecutor() as executor:
+                chunk_lists = await loop.run_in_executor(
+                    executor,
+                    lambda: [self.process_pdf(pdf_file) for pdf_file in pdf_files]
+                )
             # Combine all chunks
             self.chunks = []
             for chunk_list in chunk_lists:
                 self.chunks.extend(chunk_list)
+            if not self.chunks:
+                logger.warning("No text chunks extracted from PDFs")
+                return
             # Create FAISS index
             texts = [chunk['text'] for chunk in self.chunks]
+            embeddings = await loop.run_in_executor(
+                None,
+                lambda: self.embedder.encode(
+                    texts,
+                    convert_to_tensor=True,
+                    show_progress_bar=True
+                ).cpu().detach().numpy()
+            )
             dimension = embeddings.shape[1]
             self.index = faiss.IndexFlatL2(dimension)
             await self.index_pdfs()
         try:
+            # Get query embedding using thread pool
+            loop = asyncio.get_running_loop()
+            query_embedding = await loop.run_in_executor(
+                None,
+                lambda: self.embedder.encode(
+                    [query],
+                    convert_to_tensor=True
+                ).cpu().detach().numpy()
+            )
             # Search
+            distances, indices = self.index.search(query_embedding, top_k * 2)
             # Process results
             results = []
                     continue
                 chunk = self.chunks[idx].copy()
+                chunk['score'] = float(1 - distances[0][i])
                 results.append(chunk)
             # Sort by score and take top_k
         except Exception as e:
             logger.error(f"Error searching PDFs: {e}")
+            raise