Update services/pdf_service.py
Browse files
- services/pdf_service.py +10 -6
services/pdf_service.py
CHANGED
@@ -121,30 +121,34 @@ class PDFService:
|
|
121 |
top_k: int = 5,
|
122 |
min_score: float = 0.5
|
123 |
) -> List[Dict[str, Any]]:
|
124 |
-
"""Search indexed PDFs"""
|
125 |
print("--------------------------- query ----------------------------------")
|
126 |
print(query)
|
127 |
if not self.index or not self.chunks:
|
128 |
await self.index_pdfs()
|
129 |
-
|
130 |
try:
|
131 |
# Create query embedding
|
132 |
query_embedding = self.embedder.encode([query], convert_to_tensor=True)
|
133 |
query_embedding_np = query_embedding.cpu().detach().numpy()
|
|
|
134 |
|
135 |
# Search in FAISS index
|
136 |
distances, indices = self.index.search(query_embedding_np, top_k)
|
|
|
|
|
137 |
|
138 |
# Process results
|
139 |
results = []
|
140 |
for i, idx in enumerate(indices[0]):
|
141 |
if idx >= len(self.chunks):
|
142 |
continue # Skip invalid indices
|
143 |
-
|
144 |
-
score = 1 - distances[0][i] #
|
|
|
145 |
if score < min_score:
|
146 |
continue # Skip low scores
|
147 |
-
|
148 |
chunk = self.chunks[idx].copy()
|
149 |
chunk['score'] = score
|
150 |
results.append(chunk)
|
@@ -156,7 +160,7 @@ class PDFService:
|
|
156 |
print(results)
|
157 |
|
158 |
return results[:top_k]
|
159 |
-
|
160 |
except Exception as e:
|
161 |
logger.error(f"Error searching PDFs: {e}")
|
162 |
raise
|
|
|
121 |
top_k: int = 5,
|
122 |
min_score: float = 0.5
|
123 |
) -> List[Dict[str, Any]]:
|
124 |
+
"""Search indexed PDFs with debug logs"""
|
125 |
print("--------------------------- query ----------------------------------")
|
126 |
print(query)
|
127 |
if not self.index or not self.chunks:
|
128 |
await self.index_pdfs()
|
129 |
+
|
130 |
try:
|
131 |
# Create query embedding
|
132 |
query_embedding = self.embedder.encode([query], convert_to_tensor=True)
|
133 |
query_embedding_np = query_embedding.cpu().detach().numpy()
|
134 |
+
print("Query Embedding Shape:", query_embedding_np.shape)
|
135 |
|
136 |
# Search in FAISS index
|
137 |
distances, indices = self.index.search(query_embedding_np, top_k)
|
138 |
+
print("Distances:", distances)
|
139 |
+
print("Indices:", indices)
|
140 |
|
141 |
# Process results
|
142 |
results = []
|
143 |
for i, idx in enumerate(indices[0]):
|
144 |
if idx >= len(self.chunks):
|
145 |
continue # Skip invalid indices
|
146 |
+
|
147 |
+
score = 1 - distances[0][i] # Convert distance to similarity score
|
148 |
+
print(f"Chunk Index: {idx}, Distance: {distances[0][i]}, Score: {score}")
|
149 |
if score < min_score:
|
150 |
continue # Skip low scores
|
151 |
+
|
152 |
chunk = self.chunks[idx].copy()
|
153 |
chunk['score'] = score
|
154 |
results.append(chunk)
|
|
|
160 |
print(results)
|
161 |
|
162 |
return results[:top_k]
|
163 |
+
|
164 |
except Exception as e:
|
165 |
logger.error(f"Error searching PDFs: {e}")
|
166 |
raise
|