YanBoChen committed
Commit · b0f56ec · 1 Parent(s): 40d39ed
Refactor relevance calculation and update thresholds in latency evaluator; enhance precision and MRR analyzer with angular distance metrics; increase timeout for primary generation in fallback configuration.
Browse files
- evaluation/latency_evaluator.py +19 -21
- evaluation/metric7_8_precision_MRR.py +9 -2
- src/generation.py +2 -2
evaluation/latency_evaluator.py
CHANGED
@@ -273,27 +273,25 @@ class ComprehensiveEvaluator:
 
             # METRIC 3: Retrieval Relevance Analysis
             if processed_results:
-
+                relevance_scores = []
                 for doc_result in processed_results:
-
-
-
-
-                    )
-                    similarity_scores.append(similarity)
+                    # Get angular distance and convert to relevance using correct formula
+                    distance = doc_result.get('distance', 1.0)
+                    relevance = 1.0 - (distance**2) / 2.0  # Correct mathematical conversion
+                    relevance_scores.append(relevance)
 
-                average_relevance = sum(
-                high_relevance_count = sum(1 for score in
+                average_relevance = sum(relevance_scores) / len(relevance_scores)
+                high_relevance_count = sum(1 for score in relevance_scores if score >= 0.85)
 
                 relevance_metrics = {
                     "average_relevance": average_relevance,
-                    "max_relevance": max(
-                    "min_relevance": min(
-                    "
+                    "max_relevance": max(relevance_scores),
+                    "min_relevance": min(relevance_scores),
+                    "relevance_scores": relevance_scores,
                     "high_relevance_count": high_relevance_count,
-                    "high_relevance_ratio": high_relevance_count / len(
+                    "high_relevance_ratio": high_relevance_count / len(relevance_scores),
                     "retrieved_count": len(processed_results),
-                    "meets_threshold": average_relevance >= 0.
+                    "meets_threshold": average_relevance >= 0.85,
                     "retrieval_time": step3_time
                 }
             else:
@@ -322,7 +320,7 @@ class ComprehensiveEvaluator:
             "latency_metrics": {
                 "total_latency": total_time,
                 "timing_details": timing_details,
-                "meets_target": total_time <=
+                "meets_target": total_time <= 60.0
             },
 
             # Metric 2: Condition Extraction - Success rate from user_prompt.py
@@ -411,7 +409,7 @@ class ComprehensiveEvaluator:
             "latency_metrics": {
                 "total_latency": total_time,
                 "timing_details": timing_details,
-                "meets_target": total_time <=
+                "meets_target": total_time <= 60.0
             },
 
             # Metric 2: Condition Extraction - Partial data may be available before failure
@@ -546,7 +544,7 @@ class ComprehensiveEvaluator:
                 "min_latency": min(latencies),
                 "max_latency": max(latencies),
                 "query_count": len(latencies),
-                "target_compliance": sum(1 for lat in latencies if lat <=
+                "target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies),
                 "individual_latencies": latencies
             }
         else:
@@ -661,7 +659,7 @@ class ComprehensiveEvaluator:
                 "max_latency": max(latencies),
                 "successful_queries": len(all_successful_results),
                 "total_queries": total_queries,
-                "target_compliance": sum(1 for lat in latencies if lat <=
+                "target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies)
             }
 
         elif metric_name == "extraction":
@@ -682,8 +680,8 @@ class ComprehensiveEvaluator:
                 "min_relevance": min(relevance_scores),
                 "successful_queries": len(all_successful_results),
                 "total_queries": total_queries,
-                "meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.
-                "target_compliance": (sum(relevance_scores) / len(relevance_scores)) >= 0.
+                "meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.85,
+                "target_compliance": (sum(relevance_scores) / len(relevance_scores)) >= 0.7
             }
 
         elif metric_name == "coverage" and all_successful_results:
@@ -866,7 +864,7 @@ if __name__ == "__main__":
 
         if metric_name == "latency":
             print(f" Average: {overall_results['average_latency']:.2f}s (±{overall_results['std_deviation']:.2f})")
-            print(f"
+            print(f" 60s Target: {'✅ Met' if overall_results['target_compliance'] >= 0.8 else '❌ Not Met'}")
 
         elif metric_name == "extraction":
             print(f" Success Rate: {overall_results['success_rate']:.1%}")
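A note on the conversion used above: Annoy's angular metric returns d = sqrt(2 · (1 − cos θ)) for unit vectors, so relevance = 1 − d²/2 recovers cosine similarity exactly, which is why the commit calls it the mathematically correct formula. A minimal standalone sketch (the sample distances are hypothetical, not from the evaluation data):

import math

def relevance_from_angular_distance(distance: float) -> float:
    """Convert an Annoy angular distance to cosine-similarity relevance.

    For unit vectors Annoy returns d = sqrt(2 * (1 - cos(theta))),
    so 1 - d**2 / 2 recovers cos(theta) exactly.
    """
    return 1.0 - (distance ** 2) / 2.0

assert relevance_from_angular_distance(0.0) == 1.0                   # identical vectors
assert abs(relevance_from_angular_distance(math.sqrt(2.0))) < 1e-12  # orthogonal vectors

# Aggregation mirroring the hunk above, with the commit's 0.85 threshold
distances = [0.35, 0.52, 0.60]  # hypothetical Annoy distances
scores = [relevance_from_angular_distance(d) for d in distances]
average_relevance = sum(scores) / len(scores)
high_relevance_ratio = sum(1 for s in scores if s >= 0.85) / len(scores)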
evaluation/metric7_8_precision_MRR.py
CHANGED
@@ -6,6 +6,12 @@ OnCall.ai System - Precision & MRR Analyzer (Metrics 7-8)
 Specialized analyzer for calculating Precision@K and Mean Reciprocal Rank (MRR)
 using data collected from latency_evaluator.py comprehensive evaluation.
 
+IMPORTANT CHANGES - Angular Distance & Relevance Calculation:
+- DISTANCE METRIC: Uses Angular Distance from Annoy index (range: 0.0-1.0, smaller = more relevant)
+- RELEVANCE CONVERSION: relevance = 1.0 - (angular_distance²) / 2.0 (mathematically correct formula)
+- THRESHOLD ALIGNMENT: Aligned with Metric 3 relevance calculation standards
+- DISPLAY UPDATE: Changed from "Relevance: X" to "Angular Distance: X" for clarity
+
 METRICS CALCULATED:
 7. Precision@K (retrieval precision) - Proportion of relevant results in top-K retrieval
 8. Mean Reciprocal Rank (MRR) - Average reciprocal rank of first relevant result
@@ -18,6 +24,7 @@ DESIGN PRINCIPLE:
 
 Author: YanBo Chen
 Date: 2025-08-04
+Updated: 2025-08-04 (Angular Distance alignment)
 """
 
 import json
@@ -121,8 +128,8 @@ class PrecisionMRRAnalyzer:
         # Step 1: Determine query complexity
         is_complex = self._is_complex_query(query, processed_results)
 
-        # Step 2: Choose adaptive threshold
-        threshold = 0.
+        # Step 2: Choose adaptive threshold (aligned with Metric 3 relevance calculation)
+        threshold = 0.75 if is_complex else 0.8
 
         print(f" 🎯 Using relevance threshold: {threshold} ({'lenient' if is_complex else 'strict'})")
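For context on how the adaptive threshold is consumed downstream, here is a minimal sketch of Precision@K and reciprocal rank over per-result relevance scores; the helper names and sample scores are illustrative assumptions, and only the 0.75/0.8 thresholds come from this commit:

from typing import List

def precision_at_k(scores: List[float], threshold: float, k: int = 5) -> float:
    """Metric 7: proportion of relevant results among the top-k retrieved."""
    top_k = scores[:k]
    return sum(1 for s in top_k if s >= threshold) / len(top_k) if top_k else 0.0

def reciprocal_rank(scores: List[float], threshold: float) -> float:
    """Per-query piece of Metric 8: 1/rank of the first relevant result
    (0 if none); MRR is the mean of this value across queries."""
    for rank, score in enumerate(scores, start=1):
        if score >= threshold:
            return 1.0 / rank
    return 0.0

is_complex = True                              # in the analyzer this comes from _is_complex_query(...)
threshold = 0.75 if is_complex else 0.8        # thresholds introduced by this commit
scores = [0.91, 0.78, 0.74]                    # hypothetical relevance scores in rank order
print(precision_at_k(scores, threshold, k=3))  # 2 of 3 pass the lenient threshold
print(reciprocal_rank(scores, threshold))      # first relevant at rank 1 -> 1.0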
src/generation.py
CHANGED
@@ -30,7 +30,7 @@ logger = logging.getLogger(__name__)
 
 # Fallback Generation Configuration (Simplified Architecture)
 FALLBACK_TIMEOUTS = {
-    "primary":
+    "primary": 60.0,    # Primary Med42-70B increased timeout for stable evaluation
     "fallback_1": 1.0,  # RAG template generation (renamed from fallback_2)
     "fallback_2": 0.1   # Minimal template generation (instant)
 }
@@ -279,7 +279,7 @@ class MedicalAdviceGenerator:
 
             # Format each chunk with metadata
             context_part = f"""
-[Guideline {i}] (Source: {chunk_type.title()},
+[Guideline {i}] (Source: {chunk_type.title()}, Angular Distance: {distance:.3f})
 {chunk_text}
 """.strip()
 
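The timeout map above drives a tiered fallback chain: the primary Med42-70B call gets the full 60 s budget, after which generation degrades to cheaper templates. A minimal sketch of that control flow, assuming each tier is a callable that enforces its own timeout and raises on failure (the generator names are hypothetical, not repository functions):

import time

FALLBACK_TIMEOUTS = {
    "primary": 60.0,    # Med42-70B, raised by this commit
    "fallback_1": 1.0,  # RAG template generation
    "fallback_2": 0.1,  # minimal template, effectively instant
}

def generate_with_fallback(query: str, generators: dict) -> str:
    """Walk the tiers in order; a tier that raises or overruns its budget
    falls through to the next, cheaper one."""
    for tier, budget in FALLBACK_TIMEOUTS.items():
        start = time.time()
        try:
            result = generators[tier](query, timeout=budget)  # tier enforces its own timeout
        except Exception:
            continue  # timeout or model error -> try the next tier
        if time.time() - start <= budget:
            return result
    return "All generation tiers exhausted."

# Hypothetical wiring (call_med42_70b etc. are stand-ins):
# advice = generate_with_fallback("acute chest pain management", {
#     "primary": call_med42_70b,
#     "fallback_1": rag_template_answer,
#     "fallback_2": minimal_template_answer,
# })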