Update services/pdf_service.py
Browse files
- services/pdf_service.py +16 -27
services/pdf_service.py
CHANGED
@@ -128,46 +128,35 @@ class PDFService:
|
|
128 |
await self.index_pdfs()
|
129 |
|
130 |
try:
|
131 |
-
# Get query embedding using thread pool
|
132 |
-
#loop = asyncio.get_running_loop()
|
133 |
-
#query_embedding = await loop.run_in_executor(
|
134 |
-
# None,
|
135 |
-
# lambda: self.embedder.encode(
|
136 |
-
# [query],
|
137 |
-
# convert_to_tensor=True
|
138 |
-
# ).cpu().detach().numpy()
|
139 |
-
#)
|
140 |
-
|
141 |
-
# Search
|
142 |
-
#distances, indices = self.index.search(query_embedding, top_k * 2)
|
143 |
-
|
144 |
# Create query embedding
|
145 |
query_embedding = self.embedder.encode([query], convert_to_tensor=True)
|
146 |
query_embedding_np = query_embedding.cpu().detach().numpy()
|
147 |
-
|
148 |
# Search in FAISS index
|
149 |
distances, indices = self.index.search(query_embedding_np, top_k)
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
# Process results
|
154 |
results = []
|
155 |
for i, idx in enumerate(indices[0]):
|
156 |
-
if idx >= len(self.chunks)
|
157 |
-
continue
|
158 |
-
|
|
|
|
|
|
|
|
|
159 |
chunk = self.chunks[idx].copy()
|
160 |
-
chunk['score'] =
|
161 |
results.append(chunk)
|
162 |
-
|
163 |
# Sort by score and take top_k
|
164 |
results.sort(key=lambda x: x['score'], reverse=True)
|
165 |
-
|
166 |
-
print("--------------------------- results
|
167 |
print(results)
|
168 |
-
|
169 |
return results[:top_k]
|
170 |
-
|
171 |
except Exception as e:
|
172 |
logger.error(f"Error searching PDFs: {e}")
|
173 |
-
raise
|
|
|
128 |
await self.index_pdfs()
|
129 |
|
130 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
# Create query embedding
|
132 |
query_embedding = self.embedder.encode([query], convert_to_tensor=True)
|
133 |
query_embedding_np = query_embedding.cpu().detach().numpy()
|
134 |
+
|
135 |
# Search in FAISS index
|
136 |
distances, indices = self.index.search(query_embedding_np, top_k)
|
137 |
+
|
|
|
|
|
138 |
# Process results
|
139 |
results = []
|
140 |
for i, idx in enumerate(indices[0]):
|
141 |
+
if idx >= len(self.chunks):
|
142 |
+
continue # Skip invalid indices
|
143 |
+
|
144 |
+
score = 1 - distances[0][i] # Calculate similarity score
|
145 |
+
if score < min_score:
|
146 |
+
continue # Skip low scores
|
147 |
+
|
148 |
chunk = self.chunks[idx].copy()
|
149 |
+
chunk['score'] = score
|
150 |
results.append(chunk)
|
151 |
+
|
152 |
# Sort by score and take top_k
|
153 |
results.sort(key=lambda x: x['score'], reverse=True)
|
154 |
+
|
155 |
+
print("--------------------------- results ----------------------------------")
|
156 |
print(results)
|
157 |
+
|
158 |
return results[:top_k]
|
159 |
+
|
160 |
except Exception as e:
|
161 |
logger.error(f"Error searching PDFs: {e}")
|
162 |
+
raise
|