Chris4K committed on
Commit
ccfe454
·
verified ·
1 Parent(s): c0aba14

Update services/pdf_service.py

Browse files
Files changed (1) hide show
  1. services/pdf_service.py +33 -23
services/pdf_service.py CHANGED
@@ -26,8 +26,8 @@ class PDFService:
26
  self.last_update = None
27
  self.pdf_metadata = {}
28
 
29
- async def process_pdf(self, pdf_path: Path) -> List[Dict[str, Any]]:
30
- """Process a single PDF file"""
31
  try:
32
  reader = PdfReader(str(pdf_path))
33
  chunks = []
@@ -73,26 +73,33 @@ class PDFService:
73
  logger.warning(f"No PDF files found in {pdf_folder}")
74
  return
75
 
76
- # Process PDFs in parallel
77
- async with ThreadPoolExecutor() as executor:
78
- tasks = [
79
- asyncio.create_task(self.process_pdf(pdf_file))
80
- for pdf_file in pdf_files
81
- ]
82
- chunk_lists = await asyncio.gather(*tasks)
83
 
84
  # Combine all chunks
85
  self.chunks = []
86
  for chunk_list in chunk_lists:
87
  self.chunks.extend(chunk_list)
88
 
 
 
 
 
89
  # Create FAISS index
90
  texts = [chunk['text'] for chunk in self.chunks]
91
- embeddings = self.embedder.encode(
92
- texts,
93
- convert_to_tensor=True,
94
- show_progress_bar=True
95
- ).cpu().detach().numpy()
 
 
 
96
 
97
  dimension = embeddings.shape[1]
98
  self.index = faiss.IndexFlatL2(dimension)
@@ -117,14 +124,18 @@ class PDFService:
117
  await self.index_pdfs()
118
 
119
  try:
120
- # Get query embedding
121
- query_embedding = self.embedder.encode(
122
- [query],
123
- convert_to_tensor=True
124
- ).cpu().detach().numpy()
 
 
 
 
125
 
126
  # Search
127
- distances, indices = self.index.search(query_embedding, top_k * 2) # Get extra results for filtering
128
 
129
  # Process results
130
  results = []
@@ -133,7 +144,7 @@ class PDFService:
133
  continue
134
 
135
  chunk = self.chunks[idx].copy()
136
- chunk['score'] = float(1 - distances[0][i]) # Convert distance to similarity score
137
  results.append(chunk)
138
 
139
  # Sort by score and take top_k
@@ -142,5 +153,4 @@ class PDFService:
142
 
143
  except Exception as e:
144
  logger.error(f"Error searching PDFs: {e}")
145
- raise
146
-
 
26
  self.last_update = None
27
  self.pdf_metadata = {}
28
 
29
+ def process_pdf(self, pdf_path: Path) -> List[Dict[str, Any]]:
30
+ """Process a single PDF file - now synchronous"""
31
  try:
32
  reader = PdfReader(str(pdf_path))
33
  chunks = []
 
73
  logger.warning(f"No PDF files found in {pdf_folder}")
74
  return
75
 
76
+ # Process PDFs using thread pool
77
+ loop = asyncio.get_running_loop()
78
+ with ThreadPoolExecutor() as executor:
79
+ chunk_lists = await loop.run_in_executor(
80
+ executor,
81
+ lambda: [self.process_pdf(pdf_file) for pdf_file in pdf_files]
82
+ )
83
 
84
  # Combine all chunks
85
  self.chunks = []
86
  for chunk_list in chunk_lists:
87
  self.chunks.extend(chunk_list)
88
 
89
+ if not self.chunks:
90
+ logger.warning("No text chunks extracted from PDFs")
91
+ return
92
+
93
  # Create FAISS index
94
  texts = [chunk['text'] for chunk in self.chunks]
95
+ embeddings = await loop.run_in_executor(
96
+ None,
97
+ lambda: self.embedder.encode(
98
+ texts,
99
+ convert_to_tensor=True,
100
+ show_progress_bar=True
101
+ ).cpu().detach().numpy()
102
+ )
103
 
104
  dimension = embeddings.shape[1]
105
  self.index = faiss.IndexFlatL2(dimension)
 
124
  await self.index_pdfs()
125
 
126
  try:
127
+ # Get query embedding using thread pool
128
+ loop = asyncio.get_running_loop()
129
+ query_embedding = await loop.run_in_executor(
130
+ None,
131
+ lambda: self.embedder.encode(
132
+ [query],
133
+ convert_to_tensor=True
134
+ ).cpu().detach().numpy()
135
+ )
136
 
137
  # Search
138
+ distances, indices = self.index.search(query_embedding, top_k * 2)
139
 
140
  # Process results
141
  results = []
 
144
  continue
145
 
146
  chunk = self.chunks[idx].copy()
147
+ chunk['score'] = float(1 - distances[0][i])
148
  results.append(chunk)
149
 
150
  # Sort by score and take top_k
 
153
 
154
  except Exception as e:
155
  logger.error(f"Error searching PDFs: {e}")
156
+ raise