YanBoChen committed
Commit b0f56ec · 1 Parent(s): 40d39ed

Refactor relevance calculation and update thresholds in latency evaluator; enhance precision and MRR analyzer with angular distance metrics; increase timeout for primary generation in fallback configuration.

evaluation/latency_evaluator.py CHANGED
@@ -273,27 +273,25 @@ class ComprehensiveEvaluator:
 
         # METRIC 3: Retrieval Relevance Analysis
         if processed_results:
-            similarity_scores = []
+            relevance_scores = []
             for doc_result in processed_results:
-                similarity = (
-                    doc_result.get('distance', 0.0) or
-                    doc_result.get('similarity_score', 0.0) or
-                    doc_result.get('score', 0.0)
-                )
-                similarity_scores.append(similarity)
+                # Get angular distance and convert to relevance using correct formula
+                distance = doc_result.get('distance', 1.0)
+                relevance = 1.0 - (distance**2) / 2.0  # Correct mathematical conversion
+                relevance_scores.append(relevance)
 
-            average_relevance = sum(similarity_scores) / len(similarity_scores)
-            high_relevance_count = sum(1 for score in similarity_scores if score >= 0.2)
+            average_relevance = sum(relevance_scores) / len(relevance_scores)
+            high_relevance_count = sum(1 for score in relevance_scores if score >= 0.85)
 
             relevance_metrics = {
                 "average_relevance": average_relevance,
-                "max_relevance": max(similarity_scores),
-                "min_relevance": min(similarity_scores),
-                "similarity_scores": similarity_scores,
+                "max_relevance": max(relevance_scores),
+                "min_relevance": min(relevance_scores),
+                "relevance_scores": relevance_scores,
                 "high_relevance_count": high_relevance_count,
-                "high_relevance_ratio": high_relevance_count / len(similarity_scores),
+                "high_relevance_ratio": high_relevance_count / len(relevance_scores),
                 "retrieved_count": len(processed_results),
-                "meets_threshold": average_relevance >= 0.2,
+                "meets_threshold": average_relevance >= 0.85,
                 "retrieval_time": step3_time
             }
         else:
@@ -322,7 +320,7 @@ class ComprehensiveEvaluator:
             "latency_metrics": {
                 "total_latency": total_time,
                 "timing_details": timing_details,
-                "meets_target": total_time <= 30.0
+                "meets_target": total_time <= 60.0
             },
 
             # Metric 2: Condition Extraction - Success rate from user_prompt.py
@@ -411,7 +409,7 @@ class ComprehensiveEvaluator:
             "latency_metrics": {
                 "total_latency": total_time,
                 "timing_details": timing_details,
-                "meets_target": total_time <= 30.0
+                "meets_target": total_time <= 60.0
             },
 
             # Metric 2: Condition Extraction - Partial data may be available before failure
@@ -546,7 +544,7 @@ class ComprehensiveEvaluator:
                 "min_latency": min(latencies),
                 "max_latency": max(latencies),
                 "query_count": len(latencies),
-                "target_compliance": sum(1 for lat in latencies if lat <= 30.0) / len(latencies),
+                "target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies),
                 "individual_latencies": latencies
             }
         else:
@@ -661,7 +659,7 @@ class ComprehensiveEvaluator:
                 "max_latency": max(latencies),
                 "successful_queries": len(all_successful_results),
                 "total_queries": total_queries,
-                "target_compliance": sum(1 for lat in latencies if lat <= 30.0) / len(latencies)
+                "target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies)
             }
 
         elif metric_name == "extraction":
@@ -682,8 +680,8 @@ class ComprehensiveEvaluator:
                 "min_relevance": min(relevance_scores),
                 "successful_queries": len(all_successful_results),
                 "total_queries": total_queries,
-                "meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.2,
-                "target_compliance": (sum(relevance_scores) / len(relevance_scores)) >= 0.25
+                "meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.85,
+                "target_compliance": (sum(relevance_scores) / len(relevance_scores)) >= 0.7
             }
 
         elif metric_name == "coverage" and all_successful_results:
@@ -866,7 +864,7 @@ if __name__ == "__main__":
 
         if metric_name == "latency":
            print(f" Average: {overall_results['average_latency']:.2f}s (±{overall_results['std_deviation']:.2f})")
-           print(f" 30s Target: {'✅ Met' if overall_results['target_compliance'] >= 0.8 else '❌ Not Met'}")
+           print(f" 60s Target: {'✅ Met' if overall_results['target_compliance'] >= 0.8 else '❌ Not Met'}")
 
        elif metric_name == "extraction":
            print(f" Success Rate: {overall_results['success_rate']:.1%}")
evaluation/metric7_8_precision_MRR.py CHANGED
@@ -6,6 +6,12 @@ OnCall.ai System - Precision & MRR Analyzer (Metrics 7-8)
 Specialized analyzer for calculating Precision@K and Mean Reciprocal Rank (MRR)
 using data collected from latency_evaluator.py comprehensive evaluation.
 
+IMPORTANT CHANGES - Angular Distance & Relevance Calculation:
+- DISTANCE METRIC: Uses Angular Distance from Annoy index (range: 0.0-1.0, smaller = more relevant)
+- RELEVANCE CONVERSION: relevance = 1.0 - (angular_distance²) / 2.0 (mathematically correct formula)
+- THRESHOLD ALIGNMENT: Aligned with Metric 3 relevance calculation standards
+- DISPLAY UPDATE: Changed from "Relevance: X" to "Angular Distance: X" for clarity
+
 METRICS CALCULATED:
 7. Precision@K (retrieval precision) - Proportion of relevant results in top-K retrieval
 8. Mean Reciprocal Rank (average reciprocal rank) - Average reciprocal rank of first relevant result
@@ -18,6 +24,7 @@ DESIGN PRINCIPLE:
 
 Author: YanBo Chen
 Date: 2025-08-04
+Updated: 2025-08-04 (Angular Distance alignment)
 """
 
 import json
@@ -121,8 +128,8 @@ class PrecisionMRRAnalyzer:
        # Step 1: Determine query complexity
        is_complex = self._is_complex_query(query, processed_results)
 
-       # Step 2: Choose adaptive threshold
-       threshold = 0.15 if is_complex else 0.25
+       # Step 2: Choose adaptive threshold (aligned with Metric 3 relevance calculation)
+       threshold = 0.75 if is_complex else 0.8
 
        print(f" 🎯 Using relevance threshold: {threshold} ({'lenient' if is_complex else 'strict'})")
 
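
For orientation, a minimal sketch of how Precision@K and MRR fall out of the converted relevance scores under the commit's new thresholds (0.8 strict, 0.75 lenient); the function names are illustrative, not the analyzer's actual API:

```python
def precision_at_k(relevance_scores: list, threshold: float, k: int = 5) -> float:
    """Metric 7: fraction of the top-k results clearing the relevance threshold."""
    top_k = relevance_scores[:k]
    return sum(1 for s in top_k if s >= threshold) / len(top_k) if top_k else 0.0

def reciprocal_rank(relevance_scores: list, threshold: float) -> float:
    """Metric 8, per query: 1/rank of the first relevant result, 0.0 if none."""
    for rank, s in enumerate(relevance_scores, start=1):
        if s >= threshold:
            return 1.0 / rank
    return 0.0

# Scores already converted from angular distance, in retrieval order;
# MRR is the mean of reciprocal_rank over all evaluated queries.
scores = [0.91, 0.78, 0.86, 0.60]
print(precision_at_k(scores, threshold=0.8))   # 0.5  (2 of 4 clear the strict cutoff)
print(reciprocal_rank(scores, threshold=0.8))  # 1.0  (rank-1 result is relevant)
```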
src/generation.py CHANGED
@@ -30,7 +30,7 @@ logger = logging.getLogger(__name__)
 
 # Fallback Generation Configuration (Simplified Architecture)
 FALLBACK_TIMEOUTS = {
-    "primary": 30.0,    # Primary Med42-70B with full RAG context
+    "primary": 60.0,    # Primary Med42-70B increased timeout for stable evaluation
     "fallback_1": 1.0,  # RAG template generation (renamed from fallback_2)
     "fallback_2": 0.1   # Minimal template generation (instant)
 }
@@ -279,7 +279,7 @@ class MedicalAdviceGenerator:
 
            # Format each chunk with metadata
            context_part = f"""
-[Guideline {i}] (Source: {chunk_type.title()}, Relevance: {1-distance:.3f})
+[Guideline {i}] (Source: {chunk_type.title()}, Angular Distance: {distance:.3f})
 {chunk_text}
 """.strip()
 
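
The timeout bump only changes how long the primary tier may run before the cascade moves on. A sketch of one way such a tiered cascade could be wired; `generate_with_fallbacks` and the per-tier callables are assumptions for illustration, not code from this repo:

```python
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout
from typing import Callable, Dict

FALLBACK_TIMEOUTS = {
    "primary": 60.0,     # Med42-70B with full RAG context
    "fallback_1": 1.0,   # RAG template generation
    "fallback_2": 0.1,   # minimal template generation
}

def generate_with_fallbacks(generators: Dict[str, Callable[[], str]]) -> str:
    """Try each tier in order, giving up on a tier once its timeout elapses."""
    for tier, timeout in FALLBACK_TIMEOUTS.items():
        pool = ThreadPoolExecutor(max_workers=1)
        future = pool.submit(generators[tier])
        try:
            return future.result(timeout=timeout)
        except FuturesTimeout:
            # The abandoned call may still finish in its worker thread; return
            # control immediately so the next, cheaper tier can start.
            pool.shutdown(wait=False, cancel_futures=True)  # Python 3.9+
    raise RuntimeError("all generation tiers timed out")
```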