VanKee committed on
Commit
5937f6b
·
1 Parent(s): a4ca4d1

fix(customization): implement anti-hallucination filtering with Top-P sampling and similarity thresholds

Browse files

- Replace direct ANNOY search with proper threshold filtering in customization pipeline
- Add Top-P sampling (0.6) with minimum 30% similarity threshold to prevent low-quality results
- Implement fallback mechanism with 25% minimum threshold for robustness
- Limit hospital custom chunks to top 3 in generation module for quality control
- Add comprehensive logging for quality assessment and debugging

This addresses the issue where hospital customization was returning hundreds of
low-relevance results (8.7%, 6.6% similarity) that contributed to hallucination
in medical advice generation.

customization/customization_pipeline.py CHANGED
@@ -142,34 +142,85 @@ def retrieve_document_chunks(query: str, top_k: int = 5, llm_client=None) -> Lis
142
  print("❌ No relevant documents found")
143
  return []
144
 
145
- # Stage 2: Find relevant chunks within these documents using chunk ANNOY index
146
  print(f"πŸ” Stage 2: Finding relevant chunks within {len(relevant_docs)} documents")
147
- chunks, chunk_distances = annoy_manager.search_chunks_in_documents(
148
- query_embedding=query_embedding,
149
- document_names=relevant_docs,
150
- n_neighbors=top_k,
151
- include_distances=True
152
- )
153
-
154
- # Convert ANNOY distances to cosine similarities
155
- from indexing.annoy_manager import convert_angular_distance_to_cosine_similarity
156
 
157
- # Format results
158
- results = []
159
- for chunk, distance in zip(chunks, chunk_distances):
160
- # Convert angular distance to cosine similarity
161
- similarity = convert_angular_distance_to_cosine_similarity(distance)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
- results.append({
164
- 'document': chunk['document'],
165
- 'chunk_text': chunk['text'],
166
- 'score': similarity,
167
- 'metadata': {
168
- 'chunk_id': chunk['chunk_id'],
169
- 'start_char': chunk.get('start_char', 0),
170
- 'end_char': chunk.get('end_char', 0)
171
- }
172
- })
173
-
174
- print(f"βœ… Retrieved {len(results)} relevant chunks")
175
  return results
 
142
  print("❌ No relevant documents found")
143
  return []
144
 
145
+ # Stage 2: Find relevant chunks within these documents using proper threshold filtering
146
  print(f"πŸ” Stage 2: Finding relevant chunks within {len(relevant_docs)} documents")
 
 
 
 
 
 
 
 
 
147
 
148
+ # Use the proper chunk retrieval function with Top-P + minimum similarity filtering
149
+ try:
150
+ filtered_chunks = find_relevant_chunks_with_fallback(
151
+ query=search_query, # Use the processed search query (with keywords if available)
152
+ model=embedding_model,
153
+ relevant_docs=relevant_docs,
154
+ chunk_embeddings=chunk_embeddings,
155
+ annoy_manager=annoy_manager, # Pass the ANNOY manager for accelerated search
156
+ strategy="top_p",
157
+ top_p=0.6, # Top-P threshold: only include chunks that make up 60% of probability mass
158
+ min_similarity=0.3, # Minimum 30% similarity threshold
159
+ similarity_metric="angular" # Use angular similarity for consistency with ANNOY
160
+ )
161
+
162
+ if not filtered_chunks:
163
+ print("❌ No chunks found above similarity threshold (30%)")
164
+ return []
165
+
166
+ print(f"βœ… Retrieved {len(filtered_chunks)} high-quality chunks (Top-P=0.6, min_sim=0.3)")
167
+
168
+ # Format results to match expected output format
169
+ results = []
170
+ for chunk in filtered_chunks:
171
+ results.append({
172
+ 'document': chunk['document'],
173
+ 'chunk_text': chunk['text'],
174
+ 'score': chunk['similarity'], # This is already a similarity score (0-1)
175
+ 'metadata': {
176
+ 'chunk_id': chunk['chunk_id'],
177
+ 'start_char': chunk.get('start_char', 0),
178
+ 'end_char': chunk.get('end_char', 0)
179
+ }
180
+ })
181
+
182
+ print(f"πŸ“Š Quality summary:")
183
+ for i, result in enumerate(results[:3]): # Show top 3
184
+ print(f" {i+1}. {result['document']} (similarity: {result['score']:.3f})")
185
+ print(f" Preview: {result['chunk_text'][:100]}...")
186
+
187
+ except Exception as e:
188
+ print(f"❌ Error in chunk filtering: {e}")
189
+ print("πŸ”„ Falling back to direct ANNOY search without filtering...")
190
+
191
+ # Fallback: Direct ANNOY search (original behavior)
192
+ chunks, chunk_distances = annoy_manager.search_chunks_in_documents(
193
+ query_embedding=query_embedding,
194
+ document_names=relevant_docs,
195
+ n_neighbors=top_k,
196
+ include_distances=True
197
+ )
198
+
199
+ # Convert ANNOY distances to cosine similarities
200
+ from indexing.annoy_manager import convert_angular_distance_to_cosine_similarity
201
+
202
+ # Format results
203
+ results = []
204
+ for chunk, distance in zip(chunks, chunk_distances):
205
+ # Convert angular distance to cosine similarity
206
+ similarity = convert_angular_distance_to_cosine_similarity(distance)
207
+
208
+ # Apply minimum similarity threshold even in fallback
209
+ if similarity >= 0.25: # 25% minimum threshold for fallback
210
+ results.append({
211
+ 'document': chunk['document'],
212
+ 'chunk_text': chunk['text'],
213
+ 'score': similarity,
214
+ 'metadata': {
215
+ 'chunk_id': chunk['chunk_id'],
216
+ 'start_char': chunk.get('start_char', 0),
217
+ 'end_char': chunk.get('end_char', 0)
218
+ }
219
+ })
220
+
221
+ if not results:
222
+ print("❌ No chunks found above minimum similarity threshold (25%)")
223
+ return []
224
 
225
+ print(f"βœ… Fallback: Retrieved {len(results)} chunks above 25% similarity")
 
 
 
 
 
 
 
 
 
 
 
226
  return results
src/generation.py CHANGED
@@ -262,8 +262,8 @@ class MedicalAdviceGenerator:
262
  selected_chunks.extend(emergency_chunks[:priorities["emergency_subset"]])
263
  selected_chunks.extend(treatment_chunks[:priorities["treatment_subset"]])
264
 
265
- # Add hospital custom chunks alongside
266
- selected_chunks.extend(hospital_custom_chunks)
267
 
268
  # TODO: Future Dataset B integration
269
  # selected_chunks.extend(symptom_chunks[:priorities["symptom_subset"]])
 
262
  selected_chunks.extend(emergency_chunks[:priorities["emergency_subset"]])
263
  selected_chunks.extend(treatment_chunks[:priorities["treatment_subset"]])
264
 
265
+ # Add hospital custom chunks alongside (limit to top 3 for quality)
266
+ selected_chunks.extend(hospital_custom_chunks[:3])
267
 
268
  # TODO: Future Dataset B integration
269
  # selected_chunks.extend(symptom_chunks[:priorities["symptom_subset"]])