Spaces:
Sleeping
YanBoChen
committed on
Commit · a2aaea2
1 Parent(s): 8e02192
Before Running the 1st Evaluation: Add Precision & MRR Chart Generator and a sample test query
- Implemented a new Python script `precision_mrr_chart_generator.py` for generating comprehensive charts for Precision@K and Mean Reciprocal Rank (MRR) analysis from JSON results.
- The script includes functionality for creating various visualizations such as comparison charts, heatmaps, and detailed statistics tables.
- Added a sample test query in `single_test_query.txt` for evaluation purposes.
- evaluation/latency_evaluator.py +74 -16
- evaluation/{latency_chart_generator.py → metric1_latency_chart_generator.py} +0 -0
- evaluation/{extraction_chart_generator.py → metric2_extraction_chart_generator.py} +0 -0
- evaluation/{relevance_chart_generator.py → metric3_relevance_chart_generator.py} +0 -0
- evaluation/{coverage_chart_generator.py → metric4_coverage_chart_generator.py} +0 -0
- evaluation/{llm_judge_evaluator.py → metric5_6_llm_judge_evaluator.py} +0 -0
- evaluation/metric7_8_precision_MRR.py +391 -0
- evaluation/{evaluation_instruction.md → old/evaluation_instruction.md} +0 -0
- evaluation/{evaluation_instruction_customization.md → old/evaluation_instruction_customization.md} +0 -0
- evaluation/precision_mrr_chart_generator.py +586 -0
- evaluation/single_test_query.txt +1 -0
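For context, the files listed above chain together in the order the evaluators' own hint messages suggest. Below is a minimal sketch of that run order; it is not part of the commit itself, it assumes the commands are run from the evaluation/ directory, and the script arguments are taken from the diff below (everything else, such as the glob for the latest results file, is illustrative).

# Illustrative run order implied by this commit (a sketch, not repository code).
import glob
import subprocess

# Stage 1: collect metrics 1-4 and pipeline data for the sample test query
subprocess.run(["python", "latency_evaluator.py", "single_test_query.txt"], check=True)

# Stage 2a: metrics 5-6 via the third-party LLM judge
subprocess.run(["python", "metric5_6_llm_judge_evaluator.py", "rag"], check=True)

# Stage 2b: metrics 7-8 from the latest comprehensive_details_*.json
details = sorted(glob.glob("results/comprehensive_details_*.json"))[-1]
subprocess.run(["python", "metric7_8_precision_MRR.py", details], check=True)

# Stage 3: charts from the latest precision_mrr_analysis_*.json
subprocess.run(["python", "precision_mrr_chart_generator.py"], check=True)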
evaluation/latency_evaluator.py
CHANGED
@@ -1,21 +1,48 @@
 #!/usr/bin/env python3
 """
+OnCall.ai System - Comprehensive Evaluator (Metrics 1-8)
 ========================================================
 
+Single execution to collect all metrics 1-4 data from app.py pipeline.
+Generates foundation data for metrics 5-8 evaluation in downstream processors.
 
+COMPLETE METRICS OVERVIEW:
 
-1. Total Latency (總處理時長) - Complete pipeline timing
-2. Condition Extraction Success Rate (條件提取成功率) - user_prompt.py success
-3. Retrieval Relevance (檢索相關性) - cosine similarity from retrieval.py
-4. Retrieval Coverage (檢索覆蓋率) - advice utilization of retrieved content
+PIPELINE PERFORMANCE METRICS (Collected by this evaluator):
+1. Total Latency (總處理時長) - Complete pipeline processing time from query to response
+2. Condition Extraction Success Rate (條件提取成功率) - Success rate of user_prompt.py condition extraction
+3. Retrieval Relevance (檢索相關性) - Average cosine similarity scores from retrieval.py results
+4. Retrieval Coverage (檢索覆蓋率) - Medical keyword utilization rate between retrieved content and generated advice
 
+LLM JUDGE METRICS (Processed by metric5_6_llm_judge_evaluator.py):
+5. Clinical Actionability (臨床可操作性) - Third-party LLM evaluation of medical advice actionability (1-10 scale)
+   * Uses batch evaluation strategy with Llama3-70B as judge
+   * Measures: Can healthcare providers immediately act on this advice?
+   * Target threshold: ≥7.0/10 for acceptable actionability
+
+6. Clinical Evidence Quality (臨床證據品質) - Third-party LLM evaluation of evidence-based quality (1-10 scale)
+   * Uses same batch evaluation call as metric 5 for efficiency
+   * Measures: Is the advice evidence-based and follows medical standards?
+   * Target threshold: ≥7.5/10 for acceptable evidence quality
+
+RETRIEVAL PRECISION METRICS (Processed by metric7_8_precision_MRR.py):
+7. Precision@K (檢索精確率) - Proportion of relevant results in top-K retrieval results
+   * Uses adaptive threshold based on query complexity (0.15 for complex, 0.25 for simple queries)
+   * Query complexity determined by unique emergency keywords count (≥4 = complex)
+   * Measures: relevant_results / total_retrieved_results
+
+8. Mean Reciprocal Rank (平均倒數排名) - Average reciprocal rank of first relevant result
+   * Uses same adaptive threshold as Precision@K
+   * Measures: 1 / rank_of_first_relevant_result (0 if no relevant results)
+   * Higher MRR indicates relevant results appear earlier in ranking
+
+DATA FLOW ARCHITECTURE:
+1. latency_evaluator.py → comprehensive_details_*.json (metrics 1-4 + pipeline data)
+2. latency_evaluator.py → medical_outputs_*.json (medical advice for judge evaluation)
+3. metric5_6_llm_judge_evaluator.py → judge_evaluation_*.json (metrics 5-6)
+4. metric7_8_precision_MRR.py → precision_mrr_analysis_*.json (metrics 7-8)
+
+Note: This evaluator focuses on metrics 1-4 collection. Metrics 5-8 require separate downstream evaluation.
 
 Author: YanBo Chen
 Date: 2025-08-04
@@ -320,6 +347,31 @@ class ComprehensiveEvaluator:
             "timestamp": datetime.now().isoformat()
         }
 
+        # Validate data completeness for metrics 7-8 analysis
+        ready = True
+        data = comprehensive_result.get('pipeline_data', {})
+
+        # 1. Check retrieval results completeness for precision/MRR calculation
+        retr = data.get('retrieval_results', {}).get('processed_results', [])
+        if not retr or 'distance' not in retr[0]:
+            ready = False
+
+        # 2. Check condition extraction completeness for complexity analysis
+        cond = data.get('condition_result', {}).get('condition')
+        if not cond:
+            ready = False
+
+        # 3. Check overall execution status
+        if not comprehensive_result.get('overall_success', False):
+            ready = False
+
+        # 4. Check retrieval timing data completeness
+        if 'retrieval_time' not in comprehensive_result.get('relevance_metrics', {}):
+            ready = False
+
+        # Set metrics 7-8 readiness flag for downstream precision/MRR analysis
+        comprehensive_result['precision_mrr_ready'] = ready
+
         # Store result
         self.comprehensive_results.append(comprehensive_result)
 
@@ -386,8 +438,9 @@ class ComprehensiveEvaluator:
             },
 
             # Note: Metrics 5-6 (Clinical Actionability & Evidence Quality)
+            # are collected by metric5_6_llm_judge_evaluator.py using medical_outputs
+            # Metrics 7-8 (Precision@K & MRR) are collected by metric7_8_precision_MRR.py
+            # using comprehensive_details pipeline data
 
             "overall_success": False,
             "status": status,
@@ -395,6 +448,9 @@ class ComprehensiveEvaluator:
             "timestamp": datetime.now().isoformat()
         }
 
+        # For failed results, precision/MRR analysis data is not ready
+        failed_result['precision_mrr_ready'] = False
+
         self.comprehensive_results.append(failed_result)
         return failed_result
 
@@ -741,8 +797,8 @@ if __name__ == "__main__":
     if len(sys.argv) > 1:
         query_file = sys.argv[1]
     else:
+        # Default to evaluation/single_test_query.txt for initial testing
+        query_file = Path(__file__).parent / "single_test_query.txt"
 
     if not os.path.exists(query_file):
         print(f"Query file not found: {query_file}")
@@ -829,7 +885,9 @@ if __name__ == "__main__":
     print(f" {metric_name.capitalize()}: {filepath}")
     print(f" Medical Outputs: {outputs_path}")
    print(f" Comprehensive Details: {details_path}")
+    print(f"\nNext step: Run downstream evaluators for metrics 5-8")
+    print(f" python metric5_6_llm_judge_evaluator.py rag")
+    print(f" python metric7_8_precision_MRR.py {details_path}")
     print(f" python latency_chart_generator.py")
     print(f" python extraction_chart_generator.py # (create separately)")
     print(f" python relevance_chart_generator.py # (create separately)")
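The new precision_mrr_ready flag added above assumes a particular shape for each saved comprehensive result. As a minimal illustration, here is a sketch of the fields the four readiness checks read; the field names come from the diff, while the values are invented for the example.

# Hypothetical single entry of comprehensive_details_*.json; only the fields
# inspected by the precision_mrr_ready checks are shown, and all values are made up.
comprehensive_result = {
    "overall_success": True,                        # check 3: pipeline completed
    "relevance_metrics": {"retrieval_time": 1.8},   # check 4: timing data present
    "pipeline_data": {
        "condition_result": {"condition": "acute coronary syndrome"},   # check 2
        "retrieval_results": {
            "processed_results": [
                # check 1: at least one result carrying a 'distance' value;
                # 'type' and 'matched' are later used for complexity analysis
                {"distance": 0.22, "type": "emergency", "matched": "chest pain|dyspnea"},
            ]
        },
    },
}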
evaluation/{latency_chart_generator.py → metric1_latency_chart_generator.py}
RENAMED
File without changes
evaluation/{extraction_chart_generator.py → metric2_extraction_chart_generator.py}
RENAMED
File without changes
evaluation/{relevance_chart_generator.py → metric3_relevance_chart_generator.py}
RENAMED
File without changes
evaluation/{coverage_chart_generator.py → metric4_coverage_chart_generator.py}
RENAMED
File without changes
evaluation/{llm_judge_evaluator.py → metric5_6_llm_judge_evaluator.py}
RENAMED
File without changes
evaluation/metric7_8_precision_MRR.py
ADDED
@@ -0,0 +1,391 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Precision & MRR Analyzer (Metrics 7-8)
========================================================

Specialized analyzer for calculating Precision@K and Mean Reciprocal Rank (MRR)
using data collected from latency_evaluator.py comprehensive evaluation.

METRICS CALCULATED:
7. Precision@K (檢索精確率) - Proportion of relevant results in top-K retrieval
8. Mean Reciprocal Rank (平均倒數排名) - Average reciprocal rank of first relevant result

DESIGN PRINCIPLE:
- Reuses comprehensive_details_*.json from latency_evaluator.py
- Implements adaptive threshold based on query complexity
- Query complexity determined by actual matched emergency keywords count
- No additional LLM calls required

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any, Set
from datetime import datetime
from pathlib import Path
import re
import statistics

class PrecisionMRRAnalyzer:
    """Specialized analyzer for metrics 7-8 using existing comprehensive evaluation data"""

    def __init__(self):
        """Initialize analyzer"""
        print("Initializing Precision & MRR Analyzer...")
        self.analysis_results = []
        print("Analyzer initialization complete")

    def load_comprehensive_data(self, filepath: str) -> List[Dict]:
        """
        Load comprehensive evaluation data from latency_evaluator.py output

        Args:
            filepath: Path to comprehensive_details_*.json file

        Returns:
            List of comprehensive evaluation results
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            comprehensive_results = data.get('comprehensive_results', [])

            print(f"Loaded {len(comprehensive_results)} comprehensive evaluation results")
            print(f"Ready for precision/MRR analysis: {sum(1 for r in comprehensive_results if r.get('precision_mrr_ready'))}")

            return comprehensive_results

        except Exception as e:
            print(f"Failed to load comprehensive data: {e}")
            return []

    def _is_complex_query(self, query: str, processed_results: List[Dict]) -> bool:
        """
        Determine query complexity based on actual matched emergency keywords

        Args:
            query: Original query text
            processed_results: Retrieval results with matched keywords

        Returns:
            True if query is complex (should use lenient threshold)
        """
        # Collect unique emergency keywords actually found in retrieval results
        unique_emergency_keywords = set()

        for result in processed_results:
            if result.get('type') == 'emergency':
                matched_keywords = result.get('matched', '')
                if matched_keywords:
                    keywords = [kw.strip() for kw in matched_keywords.split('|') if kw.strip()]
                    unique_emergency_keywords.update(keywords)

        keyword_count = len(unique_emergency_keywords)

        # Business logic: 4+ different emergency keywords indicate complex case
        is_complex = keyword_count >= 4

        print(f"   Query complexity: {'Complex' if is_complex else 'Simple'} ({keyword_count} emergency keywords)")
        print(f"   Found keywords: {', '.join(list(unique_emergency_keywords)[:5])}")

        return is_complex

    def calculate_precision_mrr_single(self, query_data: Dict) -> Dict[str, Any]:
        """
        Calculate precision@K and MRR for single query

        Args:
            query_data: Single query's comprehensive evaluation result

        Returns:
            Precision and MRR metrics for this query
        """
        query = query_data['query']
        category = query_data['category']

        # Extract processed results from pipeline data
        pipeline_data = query_data.get('pipeline_data', {})
        retrieval_results = pipeline_data.get('retrieval_results', {})
        processed_results = retrieval_results.get('processed_results', [])

        print(f"Analyzing precision/MRR for: {query[:50]}...")
        print(f"Category: {category}, Results: {len(processed_results)}")

        if not processed_results:
            return self._create_empty_precision_mrr_result(query, category)

        # Step 1: Determine query complexity
        is_complex = self._is_complex_query(query, processed_results)

        # Step 2: Choose adaptive threshold
        threshold = 0.15 if is_complex else 0.25

        print(f"   Using relevance threshold: {threshold} ({'lenient' if is_complex else 'strict'})")

        # Step 3: Calculate relevance scores (1 - distance)
        relevance_scores = []
        for result in processed_results:
            distance = result.get('distance', 1.0)
            relevance = 1.0 - distance
            relevance_scores.append(relevance)

        # Step 4: Calculate Precision@K
        relevant_count = sum(1 for score in relevance_scores if score >= threshold)
        precision_at_k = relevant_count / len(processed_results)

        # Step 5: Calculate MRR
        first_relevant_rank = None
        for i, score in enumerate(relevance_scores, 1):
            if score >= threshold:
                first_relevant_rank = i
                break

        mrr_score = (1.0 / first_relevant_rank) if first_relevant_rank else 0.0

        # Detailed analysis
        result = {
            "query": query,
            "category": category,
            "query_complexity": "complex" if is_complex else "simple",
            "threshold_used": threshold,

            # Metric 7: Precision@K
            "precision_at_k": precision_at_k,
            "relevant_count": relevant_count,
            "total_results": len(processed_results),

            # Metric 8: MRR
            "mrr_score": mrr_score,
            "first_relevant_rank": first_relevant_rank,

            # Supporting data
            "relevance_scores": relevance_scores,
            "avg_relevance": sum(relevance_scores) / len(relevance_scores),
            "max_relevance": max(relevance_scores),
            "min_relevance": min(relevance_scores),

            "timestamp": datetime.now().isoformat()
        }

        print(f"   Precision@{len(processed_results)}: {precision_at_k:.3f} ({relevant_count}/{len(processed_results)} relevant)")
        print(f"   MRR: {mrr_score:.3f} (first relevant at rank {first_relevant_rank})")

        return result

    def _create_empty_precision_mrr_result(self, query: str, category: str) -> Dict[str, Any]:
        """Create empty result for failed queries"""
        return {
            "query": query,
            "category": category,
            "query_complexity": "unknown",
            "threshold_used": 0.0,
            "precision_at_k": 0.0,
            "relevant_count": 0,
            "total_results": 0,
            "mrr_score": 0.0,
            "first_relevant_rank": None,
            "relevance_scores": [],
            "timestamp": datetime.now().isoformat()
        }

    def analyze_all_queries(self, comprehensive_results: List[Dict]) -> List[Dict]:
        """
        Analyze precision/MRR for all queries in comprehensive evaluation

        Args:
            comprehensive_results: Results from latency_evaluator.py

        Returns:
            List of precision/MRR analysis results
        """
        print(f"\nAnalyzing Precision@K and MRR for {len(comprehensive_results)} queries...")

        analysis_results = []

        for i, query_data in enumerate(comprehensive_results):
            if not query_data.get('precision_mrr_ready'):
                print(f"Skipping query {i+1}: Not ready for precision/MRR analysis")
                continue

            if not query_data.get('overall_success'):
                print(f"Skipping query {i+1}: Pipeline failed")
                analysis_results.append(self._create_empty_precision_mrr_result(
                    query_data['query'],
                    query_data['category']
                ))
                continue

            # Analyze this query
            result = self.calculate_precision_mrr_single(query_data)
            analysis_results.append(result)

            print("")  # Spacing between queries

        self.analysis_results = analysis_results
        return analysis_results

    def calculate_statistics(self) -> Dict[str, Any]:
        """Calculate comprehensive statistics for metrics 7-8"""

        if not self.analysis_results:
            return {"error": "No analysis results available"}

        # Separate by complexity and category
        stats = {
            "overall_statistics": {},
            "by_complexity": {"simple": {}, "complex": {}},
            "by_category": {"diagnosis": {}, "treatment": {}, "mixed": {}},
            "timestamp": datetime.now().isoformat()
        }

        # Overall statistics
        all_precision = [r['precision_at_k'] for r in self.analysis_results]
        all_mrr = [r['mrr_score'] for r in self.analysis_results]

        stats["overall_statistics"] = {
            "total_queries": len(self.analysis_results),
            "avg_precision": statistics.mean(all_precision),
            "avg_mrr": statistics.mean(all_mrr),
            "precision_std": statistics.stdev(all_precision) if len(all_precision) > 1 else 0.0,
            "mrr_std": statistics.stdev(all_mrr) if len(all_mrr) > 1 else 0.0
        }

        # By complexity
        for complexity in ["simple", "complex"]:
            complexity_results = [r for r in self.analysis_results if r['query_complexity'] == complexity]
            if complexity_results:
                precision_scores = [r['precision_at_k'] for r in complexity_results]
                mrr_scores = [r['mrr_score'] for r in complexity_results]

                stats["by_complexity"][complexity] = {
                    "query_count": len(complexity_results),
                    "avg_precision": statistics.mean(precision_scores),
                    "avg_mrr": statistics.mean(mrr_scores),
                    "avg_threshold": statistics.mean([r['threshold_used'] for r in complexity_results])
                }

        # By category
        for category in ["diagnosis", "treatment", "mixed"]:
            category_results = [r for r in self.analysis_results if r['category'] == category]
            if category_results:
                precision_scores = [r['precision_at_k'] for r in category_results]
                mrr_scores = [r['mrr_score'] for r in category_results]

                stats["by_category"][category] = {
                    "query_count": len(category_results),
                    "avg_precision": statistics.mean(precision_scores),
                    "avg_mrr": statistics.mean(mrr_scores)
                }

        return stats

    def save_results(self, filename: str = None) -> str:
        """Save precision/MRR analysis results"""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"precision_mrr_analysis_{timestamp}.json"

        # Ensure results directory exists
        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)

        filepath = results_dir / filename

        # Create output data
        output_data = {
            "analysis_metadata": {
                "total_queries": len(self.analysis_results),
                "analysis_type": "precision_mrr_metrics_7_8",
                "timestamp": datetime.now().isoformat(),
                "adaptive_threshold": True
            },
            "detailed_results": self.analysis_results,
            "statistics": self.calculate_statistics()
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print(f"Precision/MRR analysis saved to: {filepath}")
        return str(filepath)


# Independent execution interface
if __name__ == "__main__":
    """Independent precision/MRR analysis interface"""

    print("OnCall.ai Precision & MRR Analyzer - Metrics 7-8")

    if len(sys.argv) > 1:
        comprehensive_file = sys.argv[1]
    else:
        # Look for latest comprehensive_details file
        results_dir = Path(__file__).parent / "results"
        if results_dir.exists():
            comprehensive_files = list(results_dir.glob("comprehensive_details_*.json"))
            if comprehensive_files:
                comprehensive_file = str(sorted(comprehensive_files)[-1])  # Latest file
                print(f"Using latest comprehensive file: {comprehensive_file}")
            else:
                print("No comprehensive_details_*.json files found")
                print("Please run latency_evaluator.py first to generate comprehensive data")
                sys.exit(1)
        else:
            print("Results directory not found")
            sys.exit(1)

    if not os.path.exists(comprehensive_file):
        print(f"Comprehensive file not found: {comprehensive_file}")
        print("Usage: python metric7_8_precision_MRR.py [comprehensive_details_file.json]")
        sys.exit(1)

    # Initialize analyzer
    analyzer = PrecisionMRRAnalyzer()

    # Load comprehensive data from latency_evaluator.py
    comprehensive_results = analyzer.load_comprehensive_data(comprehensive_file)

    if not comprehensive_results:
        print("No comprehensive data loaded")
        sys.exit(1)

    # Analyze precision/MRR for all queries
    analysis_results = analyzer.analyze_all_queries(comprehensive_results)

    # Calculate and display statistics
    statistics_result = analyzer.calculate_statistics()

    print(f"\n=== PRECISION & MRR ANALYSIS SUMMARY ===")

    overall_stats = statistics_result['overall_statistics']
    print(f"\nOVERALL METRICS:")
    print(f"  Precision@K: {overall_stats['avg_precision']:.3f} (±{overall_stats['precision_std']:.3f})")
    print(f"  MRR: {overall_stats['avg_mrr']:.3f} (±{overall_stats['mrr_std']:.3f})")
    print(f"  Total Queries: {overall_stats['total_queries']}")

    # Complexity-based statistics
    complexity_stats = statistics_result['by_complexity']
    print(f"\nBY COMPLEXITY:")
    for complexity, stats in complexity_stats.items():
        if stats:
            print(f"  {complexity.title()}: Precision={stats['avg_precision']:.3f}, MRR={stats['avg_mrr']:.3f} "
                  f"(threshold={stats['avg_threshold']:.2f}, n={stats['query_count']})")

    # Category-based statistics
    category_stats = statistics_result['by_category']
    print(f"\nBY CATEGORY:")
    for category, stats in category_stats.items():
        if stats:
            print(f"  {category.title()}: Precision={stats['avg_precision']:.3f}, MRR={stats['avg_mrr']:.3f} "
                  f"(n={stats['query_count']})")

    # Save results
    saved_path = analyzer.save_results()

    print(f"\nPrecision & MRR analysis complete!")
    print(f"Results saved to: {saved_path}")
    print(f"\nNext step: Create precision_mrr_chart_generator.py for visualization")
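To make the adaptive-threshold arithmetic above concrete, here is a small worked example consistent with calculate_precision_mrr_single; the distances are invented, everything else follows the code.

# Worked example of the Precision@K / MRR calculation used above (invented distances).
distances = [0.70, 0.82, 0.90]              # processed_results[*]["distance"]
relevance = [1.0 - d for d in distances]    # -> [0.30, 0.18, 0.10]
threshold = 0.25                            # simple query: fewer than 4 emergency keywords

relevant = [r >= threshold for r in relevance]    # [True, False, False]
precision_at_k = sum(relevant) / len(relevant)    # 1/3 ≈ 0.333
first_rank = next((i for i, ok in enumerate(relevant, 1) if ok), None)  # 1
mrr = 1.0 / first_rank if first_rank else 0.0     # 1.0

# A complex query (>= 4 unique emergency keywords) would use threshold 0.15 instead,
# under which the first two results count as relevant: precision ≈ 0.667, MRR still 1.0.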
evaluation/{evaluation_instruction.md → old/evaluation_instruction.md}
RENAMED
File without changes
evaluation/{evaluation_instruction_customization.md → old/evaluation_instruction_customization.md}
RENAMED
File without changes
evaluation/precision_mrr_chart_generator.py
ADDED
@@ -0,0 +1,586 @@
#!/usr/bin/env python3
"""
OnCall.ai System - Precision & MRR Chart Generator (Metrics 7-8)
===============================================================

Generates comprehensive Precision@K and MRR analysis charts from saved analysis results.
Reads JSON files produced by metric7_8_precision_MRR.py and creates visualizations.

Charts generated:
1. Precision@K comparison by category and complexity
2. MRR comparison by category and complexity
3. Combined metrics heatmap
4. Threshold impact analysis
5. Detailed statistics tables

No LLM calls - pure data visualization.

Author: YanBo Chen
Date: 2025-08-04
"""

import json
import os
import sys
from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import glob

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


class PrecisionMRRChartGenerator:
    """Generate charts from precision/MRR analysis results - no LLM dependency"""

    def __init__(self):
        """Initialize chart generator"""
        print("Initializing Precision & MRR Chart Generator...")

        # Set up professional chart style
        plt.style.use('default')
        sns.set_palette("husl")

        print("Chart Generator ready")

    def load_latest_analysis(self, results_dir: str = None) -> Dict[str, Any]:
        """
        Load the most recent precision/MRR analysis file

        Args:
            results_dir: Directory containing analysis files
        """
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"

        analysis_files = glob.glob(str(results_dir / "precision_mrr_analysis_*.json"))

        if not analysis_files:
            raise FileNotFoundError("No precision_mrr_analysis_*.json files found. Run metric7_8_precision_MRR.py first.")

        latest_file = max(analysis_files, key=os.path.getctime)
        print(f"Loading latest analysis: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def create_precision_comparison_chart(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create Precision@K comparison chart"""

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Chart 1: Precision by Category
        category_stats = analysis_data['statistics']['by_category']
        categories = []
        precisions = []

        for category, stats in category_stats.items():
            if stats:
                categories.append(category.title())
                precisions.append(stats['avg_precision'])

        if categories:
            bars1 = ax1.bar(categories, precisions, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728'])
            ax1.set_title('Precision@K by Query Category', fontweight='bold')
            ax1.set_ylabel('Precision@K')
            ax1.set_xlabel('Query Category')
            ax1.set_ylim(0, 1.0)
            ax1.grid(True, alpha=0.3)

            # Add value labels
            for bar, precision in zip(bars1, precisions):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{precision:.3f}', ha='center', va='bottom', fontweight='bold')

        # Chart 2: Precision by Complexity
        complexity_stats = analysis_data['statistics']['by_complexity']
        complexities = []
        comp_precisions = []

        for complexity, stats in complexity_stats.items():
            if stats:
                complexities.append(complexity.title())
                comp_precisions.append(stats['avg_precision'])

        if complexities:
            bars2 = ax2.bar(complexities, comp_precisions, alpha=0.8, color=['#2ca02c', '#d62728'])
            ax2.set_title('Precision@K by Query Complexity', fontweight='bold')
            ax2.set_ylabel('Precision@K')
            ax2.set_xlabel('Query Complexity')
            ax2.set_ylim(0, 1.0)
            ax2.grid(True, alpha=0.3)

            # Add value labels and threshold info
            for bar, precision, complexity in zip(bars2, comp_precisions, complexities):
                height = bar.get_height()
                threshold = 0.15 if complexity.lower() == 'complex' else 0.25
                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{precision:.3f}\n(T={threshold})', ha='center', va='bottom',
                         fontweight='bold', fontsize=9)

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"precision_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"Precision comparison chart saved: {save_path}")
        return str(save_path)

    def create_mrr_comparison_chart(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create MRR comparison chart"""

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Chart 1: MRR by Category
        category_stats = analysis_data['statistics']['by_category']
        categories = []
        mrr_scores = []

        for category, stats in category_stats.items():
            if stats:
                categories.append(category.title())
                mrr_scores.append(stats['avg_mrr'])

        if categories:
            bars1 = ax1.bar(categories, mrr_scores, alpha=0.8, color=['#9467bd', '#8c564b', '#e377c2'])
            ax1.set_title('Mean Reciprocal Rank by Query Category', fontweight='bold')
            ax1.set_ylabel('MRR Score')
            ax1.set_xlabel('Query Category')
            ax1.set_ylim(0, 1.0)
            ax1.grid(True, alpha=0.3)

            # Add value labels
            for bar, mrr in zip(bars1, mrr_scores):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{mrr:.3f}', ha='center', va='bottom', fontweight='bold')

        # Chart 2: MRR by Complexity
        complexity_stats = analysis_data['statistics']['by_complexity']
        complexities = []
        comp_mrr = []

        for complexity, stats in complexity_stats.items():
            if stats:
                complexities.append(complexity.title())
                comp_mrr.append(stats['avg_mrr'])

        if complexities:
            bars2 = ax2.bar(complexities, comp_mrr, alpha=0.8, color=['#17becf', '#bcbd22'])
            ax2.set_title('MRR by Query Complexity', fontweight='bold')
            ax2.set_ylabel('MRR Score')
            ax2.set_xlabel('Query Complexity')
            ax2.set_ylim(0, 1.0)
            ax2.grid(True, alpha=0.3)

            # Add value labels
            for bar, mrr in zip(bars2, comp_mrr):
                height = bar.get_height()
                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{mrr:.3f}', ha='center', va='bottom', fontweight='bold')

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"mrr_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"MRR comparison chart saved: {save_path}")
        return str(save_path)

    def create_combined_metrics_heatmap(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create combined precision/MRR heatmap"""

        # Prepare data for heatmap
        detailed_results = analysis_data.get('detailed_results', [])

        if not detailed_results:
            print("No detailed results for heatmap")
            return ""

        # Create DataFrame for heatmap
        heatmap_data = []
        for result in detailed_results:
            heatmap_data.append({
                'Category': result['category'].title(),
                'Complexity': result['query_complexity'].title(),
                'Precision@K': result['precision_at_k'],
                'MRR': result['mrr_score'],
                'Threshold': result['threshold_used']
            })

        df = pd.DataFrame(heatmap_data)

        # Create pivot table for heatmap
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

        # Precision heatmap
        precision_pivot = df.pivot_table(values='Precision@K', index='Category', columns='Complexity', aggfunc='mean')
        sns.heatmap(precision_pivot, annot=True, fmt='.3f', cmap='YlOrRd', ax=ax1,
                    cbar_kws={'label': 'Precision@K'}, vmin=0, vmax=1)
        ax1.set_title('Precision@K Heatmap\n(Category vs Complexity)', fontweight='bold')

        # MRR heatmap
        mrr_pivot = df.pivot_table(values='MRR', index='Category', columns='Complexity', aggfunc='mean')
        sns.heatmap(mrr_pivot, annot=True, fmt='.3f', cmap='YlGnBu', ax=ax2,
                    cbar_kws={'label': 'MRR Score'}, vmin=0, vmax=1)
        ax2.set_title('MRR Heatmap\n(Category vs Complexity)', fontweight='bold')

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"precision_mrr_heatmap_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"Combined metrics heatmap saved: {save_path}")
        return str(save_path)

    def create_threshold_impact_chart(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create threshold impact analysis chart"""

        detailed_results = analysis_data.get('detailed_results', [])

        if not detailed_results:
            print("No detailed results for threshold analysis")
            return ""

        # Group by complexity and calculate average relevance
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Prepare data
        simple_queries = [r for r in detailed_results if r['query_complexity'] == 'simple']
        complex_queries = [r for r in detailed_results if r['query_complexity'] == 'complex']

        # Chart 1: Relevance distribution for different complexities
        if simple_queries:
            simple_relevances = []
            for query in simple_queries:
                simple_relevances.extend(query.get('relevance_scores', []))

            ax1.hist(simple_relevances, bins=10, alpha=0.7, label=f'Simple (T=0.25)', color='#2ca02c', density=True)
            ax1.axvline(x=0.25, color='#2ca02c', linestyle='--', linewidth=2, label='Simple Threshold')

        if complex_queries:
            complex_relevances = []
            for query in complex_queries:
                complex_relevances.extend(query.get('relevance_scores', []))

            ax1.hist(complex_relevances, bins=10, alpha=0.7, label=f'Complex (T=0.15)', color='#d62728', density=True)
            ax1.axvline(x=0.15, color='#d62728', linestyle='--', linewidth=2, label='Complex Threshold')

        ax1.set_title('Relevance Score Distribution\nby Query Complexity', fontweight='bold')
        ax1.set_xlabel('Relevance Score')
        ax1.set_ylabel('Density')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Chart 2: Metrics comparison
        complexity_stats = analysis_data['statistics']['by_complexity']

        complexities = []
        precisions = []
        mrrs = []
        thresholds = []

        for complexity, stats in complexity_stats.items():
            if stats:
                complexities.append(complexity.title())
                precisions.append(stats['avg_precision'])
                mrrs.append(stats['avg_mrr'])
                thresholds.append(stats['avg_threshold'])

        x = np.arange(len(complexities))
        width = 0.35

        bars1 = ax2.bar(x - width/2, precisions, width, label='Precision@K', alpha=0.8, color='#ff7f0e')
        bars2 = ax2.bar(x + width/2, mrrs, width, label='MRR', alpha=0.8, color='#1f77b4')

        ax2.set_title('Metrics Comparison by Complexity\n(with Adaptive Thresholds)', fontweight='bold')
        ax2.set_ylabel('Score')
        ax2.set_xlabel('Query Complexity')
        ax2.set_xticks(x)
        ax2.set_xticklabels(complexities)
        ax2.legend()
        ax2.grid(True, alpha=0.3)
        ax2.set_ylim(0, 1.0)

        # Add value labels
        for bars, values, thresholds_vals in [(bars1, precisions, thresholds), (bars2, mrrs, thresholds)]:
            for bar, value, threshold in zip(bars, values, thresholds_vals):
                height = bar.get_height()
                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{value:.3f}', ha='center', va='bottom', fontweight='bold', fontsize=9)

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"threshold_impact_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"Threshold impact chart saved: {save_path}")
        return str(save_path)

    def create_detailed_analysis_table(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create detailed statistics table"""

        fig, ax = plt.subplots(figsize=(12, 8))
        ax.axis('tight')
        ax.axis('off')

        # Prepare table data
        table_data = []

        # Overall statistics
        overall_stats = analysis_data['statistics']['overall_statistics']
        table_data.append(['OVERALL METRICS', '', '', '', ''])
        table_data.append(['Total Queries', str(overall_stats['total_queries']), '', '', ''])
        table_data.append(['Avg Precision@K', f"{overall_stats['avg_precision']:.3f}",
                           f"±{overall_stats['precision_std']:.3f}", '', ''])
        table_data.append(['Avg MRR', f"{overall_stats['avg_mrr']:.3f}",
                           f"±{overall_stats['mrr_std']:.3f}", '', ''])
        table_data.append(['', '', '', '', ''])

        # By category
        table_data.append(['BY CATEGORY', 'Queries', 'Precision@K', 'MRR', 'Notes'])
        category_stats = analysis_data['statistics']['by_category']
        for category, stats in category_stats.items():
            if stats:
                table_data.append([
                    category.title(),
                    str(stats['query_count']),
                    f"{stats['avg_precision']:.3f}",
                    f"{stats['avg_mrr']:.3f}",
                    ''
                ])

        table_data.append(['', '', '', '', ''])

        # By complexity
        table_data.append(['BY COMPLEXITY', 'Queries', 'Precision@K', 'MRR', 'Threshold'])
        complexity_stats = analysis_data['statistics']['by_complexity']
        for complexity, stats in complexity_stats.items():
            if stats:
                table_data.append([
                    complexity.title(),
                    str(stats['query_count']),
                    f"{stats['avg_precision']:.3f}",
                    f"{stats['avg_mrr']:.3f}",
                    f"{stats['avg_threshold']:.2f}"
                ])

        # Create table
        table = ax.table(cellText=table_data,
                         colLabels=['Metric', 'Value 1', 'Value 2', 'Value 3', 'Value 4'],
                         cellLoc='center',
                         loc='center',
                         bbox=[0, 0, 1, 1])

        # Style the table
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.scale(1, 2)

        # Header styling
        for i in range(5):
            table[(0, i)].set_facecolor('#40466e')
            table[(0, i)].set_text_props(weight='bold', color='white')

        # Section headers styling
        for i, row in enumerate(table_data):
            if row[0] in ['OVERALL METRICS', 'BY CATEGORY', 'BY COMPLEXITY']:
                table[(i+1, 0)].set_facecolor('#1f77b4')
                table[(i+1, 0)].set_text_props(weight='bold', color='white')

        plt.title('Precision@K & MRR Detailed Analysis\nMetrics 7-8 Statistics',
                  fontweight='bold', fontsize=14, pad=20)

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"precision_mrr_table_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"Detailed analysis table saved: {save_path}")
        return str(save_path)

    def create_individual_query_analysis(self, analysis_data: Dict, save_path: str = None) -> str:
        """Create individual query analysis chart"""

        detailed_results = analysis_data.get('detailed_results', [])

        if not detailed_results:
            print("No detailed results for individual analysis")
            return ""

        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

        # Prepare data
        query_indices = []
        precisions = []
        mrrs = []
        colors = []
        labels = []

        for i, result in enumerate(detailed_results):
            query_indices.append(i + 1)
            precisions.append(result['precision_at_k'])
            mrrs.append(result['mrr_score'])

            # Color by complexity
            if result['query_complexity'] == 'complex':
                colors.append('#d62728')  # Red for complex
            else:
                colors.append('#2ca02c')  # Green for simple

            # Create short label
            query_short = result['query'][:30] + "..." if len(result['query']) > 30 else result['query']
            category = result['category'][:4].upper()
            labels.append(f"{category}\n{query_short}")

        # Chart 1: Precision@K for each query
        bars1 = ax1.bar(query_indices, precisions, color=colors, alpha=0.8)
        ax1.set_title('Precision@K by Individual Query', fontweight='bold')
        ax1.set_ylabel('Precision@K')
        ax1.set_xlabel('Query Index')
        ax1.set_ylim(0, 1.0)
        ax1.grid(True, alpha=0.3)

        # Add value labels
        for bar, precision in zip(bars1, precisions):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                     f'{precision:.2f}', ha='center', va='bottom', fontsize=8)

        # Chart 2: MRR for each query
        bars2 = ax2.bar(query_indices, mrrs, color=colors, alpha=0.8)
        ax2.set_title('MRR by Individual Query', fontweight='bold')
        ax2.set_ylabel('MRR Score')
        ax2.set_xlabel('Query Index')
        ax2.set_ylim(0, 1.0)
        ax2.grid(True, alpha=0.3)

        # Add value labels
        for bar, mrr in zip(bars2, mrrs):
            height = bar.get_height()
            ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                     f'{mrr:.2f}', ha='center', va='bottom', fontsize=8)

        # Add legend
        from matplotlib.patches import Patch
        legend_elements = [
            Patch(facecolor='#2ca02c', alpha=0.8, label='Simple Query (T=0.25)'),
            Patch(facecolor='#d62728', alpha=0.8, label='Complex Query (T=0.15)')
        ]
        ax1.legend(handles=legend_elements, loc='upper right')

        plt.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"individual_query_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"Individual query analysis saved: {save_path}")
        return str(save_path)

    def generate_all_charts(self, analysis_data: Dict = None) -> Dict[str, str]:
        """Generate all precision/MRR charts"""

        if analysis_data is None:
            analysis_data = self.load_latest_analysis()

        print(f"\nGenerating all Precision & MRR charts...")

        saved_charts = {}

        # Generate all chart types
        try:
            saved_charts['precision_comparison'] = self.create_precision_comparison_chart(analysis_data)
            saved_charts['mrr_comparison'] = self.create_mrr_comparison_chart(analysis_data)
            saved_charts['combined_heatmap'] = self.create_combined_metrics_heatmap(analysis_data)
            saved_charts['threshold_impact'] = self.create_threshold_impact_chart(analysis_data)
            saved_charts['individual_analysis'] = self.create_individual_query_analysis(analysis_data)

        except Exception as e:
            print(f"Error generating charts: {e}")
            return {"error": str(e)}

        print(f"\nAll precision/MRR charts generated successfully!")
        print(f"Charts saved to: evaluation/charts/")

        return saved_charts


# Independent execution interface
if __name__ == "__main__":
    """Generate precision/MRR charts from analysis results"""

    print("OnCall.ai Precision & MRR Chart Generator - Metrics 7-8")

    if len(sys.argv) > 1:
        analysis_file = sys.argv[1]

        if not os.path.exists(analysis_file):
            print(f"Analysis file not found: {analysis_file}")
            sys.exit(1)
    else:
        analysis_file = None  # Will use latest file

    # Initialize generator
    generator = PrecisionMRRChartGenerator()

    try:
        # Load analysis data
        if analysis_file:
            with open(analysis_file, 'r', encoding='utf-8') as f:
                analysis_data = json.load(f)
            print(f"Using specified analysis file: {analysis_file}")
        else:
            analysis_data = generator.load_latest_analysis()

        # Generate all charts
        saved_charts = generator.generate_all_charts(analysis_data)

        if 'error' not in saved_charts:
            print(f"\n=== PRECISION & MRR CHART GENERATION SUMMARY ===")
            for chart_type, filepath in saved_charts.items():
                print(f"  {chart_type.replace('_', ' ').title()}: {filepath}")

            print(f"\nCharts ready for analysis and presentation!")

    except Exception as e:
        print(f"Chart generation failed: {e}")
        sys.exit(1)
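The chart generator above only needs the "statistics" (and, for the per-query charts, "detailed_results") sections of the analyzer's JSON output. A minimal sketch of that contract follows; the keys mirror metric7_8_precision_MRR.py's save_results output, the numbers are invented, and the import assumes the module is available on the path.

# Illustrative data contract for the chart generator (invented values).
from precision_mrr_chart_generator import PrecisionMRRChartGenerator

analysis_data = {
    "statistics": {
        "overall_statistics": {"total_queries": 6, "avg_precision": 0.61,
                               "avg_mrr": 0.83, "precision_std": 0.12, "mrr_std": 0.20},
        "by_complexity": {
            "simple":  {"query_count": 3, "avg_precision": 0.55, "avg_mrr": 0.78, "avg_threshold": 0.25},
            "complex": {"query_count": 3, "avg_precision": 0.67, "avg_mrr": 0.89, "avg_threshold": 0.15},
        },
        "by_category": {
            "diagnosis": {"query_count": 2, "avg_precision": 0.60, "avg_mrr": 1.00},
            "treatment": {"query_count": 2, "avg_precision": 0.55, "avg_mrr": 0.75},
            "mixed":     {"query_count": 2, "avg_precision": 0.68, "avg_mrr": 0.75},
        },
    },
    "detailed_results": [],  # only required by the heatmap / per-query charts
}

generator = PrecisionMRRChartGenerator()
generator.create_precision_comparison_chart(analysis_data, save_path="charts/demo_precision.png")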
evaluation/single_test_query.txt
ADDED
@@ -0,0 +1 @@
1.diagnosis: 60-year-old patient with hypertension history, sudden chest pain. What are possible causes and how to assess?