Spaces:
Sleeping
Sleeping
Merge branch 'newbranchYB-newest' into Merged20250805
Browse files- evaluation/direct_llm_evaluator.py +419 -0
- evaluation/latency_evaluator.py +892 -0
- evaluation/metric1_latency_chart_generator.py +327 -0
- evaluation/metric2_extraction_chart_generator.py +216 -0
- evaluation/metric3_relevance_chart_generator.py +231 -0
- evaluation/metric4_coverage_chart_generator.py +222 -0
- evaluation/metric5_6_judge_evaluator_manual.md +303 -0
- evaluation/metric5_6_llm_judge_chart_generator.py +430 -0
- evaluation/metric5_6_llm_judge_evaluator.py +643 -0
- evaluation/metric7_8_precision_MRR.py +402 -0
- evaluation/metric7_8_precision_mrr_chart_generator.py +586 -0
- evaluation/old/coverage_evaluator.py +560 -0
- evaluation/{evaluation_instruction.md → old/evaluation_instruction.md} +455 -36
- evaluation/{evaluation_instruction_customization.md → old/evaluation_instruction_customization.md} +0 -0
- evaluation/old/extraction_evaluator.py +379 -0
- evaluation/old/relevance_evaluator.py +447 -0
- evaluation/pre_user_query_evaluate.txt +5 -0
- evaluation/single_test_query.txt +1 -0
- evaluation/user_query.txt +7 -7
- src/generation.py +6 -6
- src/llm_clients.py +241 -8
- src/medical_conditions.py +8 -0
- src/user_prompt.py +6 -4
evaluation/direct_llm_evaluator.py
ADDED
@@ -0,0 +1,419 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
OnCall.ai System - Direct LLM Evaluator (Med42-70B Only)
|
4 |
+
========================================================
|
5 |
+
|
6 |
+
Tests Med42-70B directly without RAG pipeline.
|
7 |
+
Only applicable metrics: 1 (Latency), 5 (Actionability), 6 (Evidence Quality)
|
8 |
+
|
9 |
+
Metrics 2-4 (Extraction, Relevance, Coverage) are not applicable for direct LLM.
|
10 |
+
|
11 |
+
Author: YanBo Chen
|
12 |
+
Date: 2025-08-04
|
13 |
+
"""
|
14 |
+
|
15 |
+
import time
|
16 |
+
import json
|
17 |
+
import os
|
18 |
+
import sys
|
19 |
+
from typing import Dict, List, Any
|
20 |
+
from datetime import datetime
|
21 |
+
from pathlib import Path
|
22 |
+
import re
|
23 |
+
|
24 |
+
# Add project path
|
25 |
+
current_dir = Path(__file__).parent
|
26 |
+
project_root = current_dir.parent
|
27 |
+
src_dir = project_root / "src"
|
28 |
+
sys.path.insert(0, str(src_dir))
|
29 |
+
|
30 |
+
# Import LLM client only (no retrieval system needed)
|
31 |
+
try:
|
32 |
+
from llm_clients import llm_Med42_70BClient
|
33 |
+
except ImportError as e:
|
34 |
+
print(f"❌ Import failed: {e}")
|
35 |
+
print("Please ensure running from project root directory")
|
36 |
+
sys.exit(1)
|
37 |
+
|
38 |
+
|
39 |
+
class DirectLLMEvaluator:
|
40 |
+
"""Direct LLM evaluation without RAG pipeline"""
|
41 |
+
|
42 |
+
def __init__(self):
    """Create the Med42-70B client and the empty per-run result stores."""
    print("🔧 Initializing Direct LLM Evaluator...")

    # Per-query metric records and judge-ready advice records; both are
    # filled by evaluate_direct_llm_query().
    self.medical_outputs = []
    self.direct_results = []

    # Only the LLM client is created here: no retrieval system and no
    # user-prompt processor are involved in the direct evaluation.
    self.llm_client = llm_Med42_70BClient()

    print("✅ Direct LLM Evaluator initialization complete")
|
54 |
+
|
55 |
+
def evaluate_direct_llm_query(self, query: str, category: str = "unknown") -> Dict[str, Any]:
    """
    Send one query straight to the LLM (no RAG pipeline) and record the outcome.

    Applicable metrics: 1 (latency, measured here) and 5-6 (scored later by
    an LLM judge from the stored medical advice). Metrics 2-4 are recorded
    as "not applicable" placeholders.

    Args:
        query: Medical query to test
        category: Query category (diagnosis/treatment/mixed)

    Returns:
        The record appended to ``self.direct_results``. On success it holds
        the latency metrics, the generated advice and ``overall_success=True``,
        and a matching entry is appended to ``self.medical_outputs``. On any
        exception (including an empty LLM response) it holds the elapsed
        time, the error text and ``overall_success=False``; nothing is added
        to ``self.medical_outputs`` in that case.
    """
    # Latency target in seconds; the same value is handed to the client as
    # its request timeout.
    latency_target_s = 60.0

    print(f"🔍 Direct LLM evaluation: {query[:50]}...")
    print(f"📋 Category: {category}")

    # perf_counter() is monotonic, so the measured durations cannot be
    # distorted by wall-clock adjustments the way time.time() deltas can.
    overall_start = time.perf_counter()

    try:
        # Create direct medical consultation prompt
        direct_prompt = f"""
You are a medical expert providing clinical guidance.

Patient Query: {query}

Please provide comprehensive medical advice including:
1. Differential diagnosis (if applicable)
2. Immediate assessment steps
3. Treatment recommendations
4. Clinical considerations

Provide evidence-based, actionable medical guidance.
"""

        # Direct LLM generation. The original author notes that max_tokens
        # matches the RAG system's primary setting for a fair comparison.
        llm_start = time.perf_counter()
        response = self.llm_client.analyze_medical_query(
            query=direct_prompt,
            max_tokens=1600,
            timeout=latency_target_s
        )
        # Stop the LLM timer before any post-processing of the response.
        llm_time = time.perf_counter() - llm_start

        # The client may return a dict (advice under 'raw_response', with
        # 'content' as fallback) or some other object that is stringified.
        if isinstance(response, dict):
            medical_advice = response.get('raw_response', '') or response.get('content', '')
        else:
            medical_advice = str(response)

        total_time = time.perf_counter() - overall_start

        # An empty or whitespace-only answer is treated as a failure.
        if not medical_advice or len(medical_advice.strip()) == 0:
            print(f"❌ Direct LLM returned empty response after {total_time:.2f}s")
            raise ValueError("Empty response from LLM - no content generated")

        result = {
            "query": query,
            "category": category,

            # Metric 1: Total Latency (direct LLM call time)
            "latency_metrics": {
                "total_latency": total_time,
                "llm_generation_time": llm_time,
                "meets_target": total_time <= latency_target_s
            },

            # Metrics 2-4: Not applicable for direct LLM
            "extraction_metrics": {
                "not_applicable": True,
                "reason": "No extraction pipeline in direct LLM"
            },
            "relevance_metrics": {
                "not_applicable": True,
                "reason": "No retrieval pipeline in direct LLM"
            },
            "coverage_metrics": {
                "not_applicable": True,
                "reason": "No retrieval content to cover"
            },

            # Medical advice for metrics 5-6 evaluation
            "medical_advice": medical_advice,
            "advice_length": len(medical_advice),

            "overall_success": True,
            "model_type": "Med42-70B_direct",
            "timestamp": datetime.now().isoformat()
        }
        self.direct_results.append(result)

        # Store medical output for LLM judge evaluation.
        # NOTE(review): query_id is identical for every query of the same
        # category - confirm the judge evaluator does not rely on it being
        # unique.
        self.medical_outputs.append({
            "query": query,
            "category": category,
            "medical_advice": medical_advice,
            "query_id": f"{category}_query_direct",
            "model_type": "Med42-70B_direct",
            "processing_time": total_time,
            "timestamp": datetime.now().isoformat()
        })

        print(f"✅ Direct LLM completed in {total_time:.2f}s")
        print(f"📝 Generated advice: {len(medical_advice)} characters")

        return result

    except Exception as e:
        # Deliberately broad: any failure of a single query is recorded and
        # returned so that a batch run can continue with the next query.
        total_time = time.perf_counter() - overall_start
        print(f"❌ Direct LLM evaluation failed after {total_time:.2f}s: {e}")

        error_result = {
            "query": query,
            "category": category,
            "latency_metrics": {
                "total_latency": total_time,
                "meets_target": False
            },
            "overall_success": False,
            "error": str(e),
            "model_type": "Med42-70B_direct",
            "timestamp": datetime.now().isoformat()
        }
        self.direct_results.append(error_result)

        # Failed queries are NOT added to medical_outputs: only successful
        # queries with valid medical advice should reach the judge.
        return error_result
|
188 |
+
|
189 |
+
def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
    """
    Load category-labelled queries from a UTF-8 text file.

    A usable line has the form ``<number>.<category>: <query text>`` where
    the category is ``diagnosis``, ``treatment``, ``mixed`` or
    ``mixed/complicated`` (case-insensitive); the last two are both filed
    under ``mixed``. Blank lines, lines that do not match, and queries of
    15 characters or fewer are ignored.

    Returns:
        A dict with the keys ``diagnosis``, ``treatment`` and ``mixed``, each
        mapping to a list of ``{"text": ..., "category": ...}`` entries. If
        anything raises (for example the file cannot be read), a dict with
        the single key ``"error"`` is returned instead.
    """
    print(f"📁 Reading queries from file: {filepath}")

    line_pattern = r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)'

    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            raw_lines = handle.read().strip().split('\n')

        parsed = {"diagnosis": [], "treatment": [], "mixed": []}

        for raw in raw_lines:
            entry = raw.strip()
            if not entry:
                continue

            found = re.match(line_pattern, entry, re.IGNORECASE)
            if found is None:
                continue

            label = found.group(1).lower()
            text = found.group(2).strip()

            # Both spellings of the combined category share one bucket.
            if label == 'mixed/complicated':
                label = 'mixed'

            if label not in parsed or len(text) <= 15:
                continue

            parsed[label].append({"text": text, "category": label})

        print(f"📋 Parsed queries by category:")
        for label, items in parsed.items():
            print(f" {label.capitalize()}: {len(items)} queries")

        return parsed

    except Exception as e:
        # Callers detect failure by checking for the "error" key.
        print(f"❌ Failed to read file: {e}")
        return {"error": f"Failed to read file: {e}"}
|
235 |
+
|
236 |
+
def calculate_direct_llm_statistics(self) -> Dict[str, Any]:
    """
    Summarise latency results of all queries evaluated so far.

    Only records with a truthy ``overall_success`` contribute latencies.
    Per-category figures are produced for ``diagnosis``, ``treatment`` and
    ``mixed``; successful records with any other category count towards the
    overall figures only.

    Returns:
        A dict with ``category_results`` (per-category average latency, query
        count and share of queries within the 60 s target),
        ``overall_results`` (average/min/max latency, success counts and
        rates), ``model_type`` and ``timestamp``. ``overall_results`` always
        has the same keys; when there are no successful queries the latency
        figures and rates are 0.0.
    """
    latency_target_s = 60.0
    categories = ("diagnosis", "treatment", "mixed")

    def _compliance(latencies):
        # Share of latencies within the target; 0.0 for an empty list.
        if not latencies:
            return 0.0
        return sum(1 for lat in latencies if lat <= latency_target_s) / len(latencies)

    def _average(latencies):
        return sum(latencies) / len(latencies) if latencies else 0.0

    successful_results = [r for r in self.direct_results if r.get('overall_success')]
    latencies = [r['latency_metrics']['total_latency'] for r in successful_results]

    # Category-wise statistics
    category_stats = {}
    for category in categories:
        cat_latencies = [
            r['latency_metrics']['total_latency']
            for r in successful_results
            if r.get('category', 'unknown') == category
        ]
        category_stats[category] = {
            "average_latency": _average(cat_latencies),
            "query_count": len(cat_latencies),
            "target_compliance": _compliance(cat_latencies)
        }

    # Overall statistics. min/max are reported as 0.0 when nothing succeeded
    # so that consumers see one schema regardless of the outcome.
    total_queries = len(self.direct_results)
    overall_stats = {
        "average_latency": _average(latencies),
        "min_latency": min(latencies) if latencies else 0.0,
        "max_latency": max(latencies) if latencies else 0.0,
        "successful_queries": len(successful_results),
        "total_queries": total_queries,
        "success_rate": len(successful_results) / total_queries if total_queries else 0.0,
        "target_compliance": _compliance(latencies)
    }

    return {
        "category_results": category_stats,
        "overall_results": overall_stats,
        "model_type": "Med42-70B_direct",
        "timestamp": datetime.now().isoformat()
    }
|
294 |
+
|
295 |
+
def save_direct_llm_statistics(self, filename: str = None) -> str:
    """
    Compute the current statistics and write them as JSON.

    The file is placed in a ``results`` directory next to this script; the
    directory is created if it does not exist.

    Args:
        filename: Target file name. When omitted, a name of the form
            ``direct_llm_statistics_<YYYYmmdd_HHMMSS>.json`` is generated.

    Returns:
        The path of the written file as a string.
    """
    stats = self.calculate_direct_llm_statistics()

    if filename is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = "direct_llm_statistics_" + stamp + ".json"

    out_dir = Path(__file__).parent / "results"
    out_dir.mkdir(exist_ok=True)
    target = out_dir / filename

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(target, 'w', encoding='utf-8') as out_file:
        json.dump(stats, out_file, indent=2, ensure_ascii=False)

    print(f"📊 Direct LLM statistics saved to: {target}")
    return str(target)
|
312 |
+
|
313 |
+
def save_direct_medical_outputs(self, filename: str = None) -> str:
    """
    Write the collected medical outputs as JSON for LLM judge evaluation.

    The file is placed in a ``results`` directory next to this script; the
    directory is created if it does not exist. The payload contains an
    ``evaluation_metadata`` section (output count, distinct categories,
    timestamp, model type) followed by the raw ``medical_outputs`` list.

    Args:
        filename: Target file name. When omitted, a name of the form
            ``medical_outputs_direct_<YYYYmmdd_HHMMSS>.json`` is generated.

    Returns:
        The path of the written file as a string.
    """
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"medical_outputs_direct_{timestamp}.json"

    results_dir = Path(__file__).parent / "results"
    results_dir.mkdir(exist_ok=True)
    filepath = results_dir / filename

    # Sorted so the saved file is reproducible: iterating a set of strings
    # gives an order that can differ between interpreter runs.
    categories = sorted({output['category'] for output in self.medical_outputs})

    output_data = {
        "evaluation_metadata": {
            "total_outputs": len(self.medical_outputs),
            "categories": categories,
            "timestamp": datetime.now().isoformat(),
            "model_type": "Med42-70B_direct"
        },
        "medical_outputs": self.medical_outputs
    }

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"📝 Direct medical outputs saved to: {filepath}")
    return str(filepath)
|
338 |
+
|
339 |
+
|
340 |
+
# Independent execution interface
|
341 |
+
if __name__ == "__main__":
    # Independent direct LLM evaluation interface.
    #
    # Usage: python direct_llm_evaluator.py [query_file.txt]
    # Reads category-labelled queries, sends each one straight to the LLM,
    # then saves the statistics and the judge-ready medical outputs.

    print("🚀 OnCall.ai Direct LLM Evaluator - Med42-70B Only")

    if len(sys.argv) > 1:
        query_file = sys.argv[1]
    else:
        # Default to evaluation/single_test_query.txt for consistency
        query_file = Path(__file__).parent / "single_test_query.txt"

    if not os.path.exists(query_file):
        print(f"❌ Query file not found: {query_file}")
        print("Usage: python direct_llm_evaluator.py [query_file.txt]")
        sys.exit(1)

    # Initialize evaluator
    evaluator = DirectLLMEvaluator()

    # Parse queries
    queries_by_category = evaluator.parse_queries_from_file(str(query_file))

    if "error" in queries_by_category:
        print(f"❌ Failed to parse queries: {queries_by_category['error']}")
        sys.exit(1)

    # Test direct LLM for each query
    print(f"\n🧪 Direct LLM Testing (No RAG Pipeline)")

    # Work only on categories that actually contain queries, so the pause
    # between categories is taken only when another batch really follows.
    pending = [(name, items) for name, items in queries_by_category.items() if items]

    for position, (category, queries) in enumerate(pending):
        print(f"\n📂 Testing {category.upper()} with direct Med42-70B:")

        for i, query_info in enumerate(queries):
            # Direct LLM evaluation (the evaluator stores the result itself)
            evaluator.evaluate_direct_llm_query(query_info['text'], category)

            # Pause between queries
            if i < len(queries) - 1:
                print(f" ⏳ Pausing 5s before next query...")
                time.sleep(5)

        # Pause between categories
        if position < len(pending) - 1:
            print(f"\n⏳ Pausing 10s before next category...")
            time.sleep(10)

    # Save results
    print(f"\n📊 Generating direct LLM analysis...")

    stats_path = evaluator.save_direct_llm_statistics()
    outputs_path = evaluator.save_direct_medical_outputs()

    # Print summary
    stats = evaluator.calculate_direct_llm_statistics()
    overall_results = stats['overall_results']

    print(f"\n📊 === DIRECT LLM EVALUATION SUMMARY ===")
    print(f"Overall Performance:")
    print(f" Average Latency: {overall_results['average_latency']:.2f}s")
    print(f" Success Rate: {overall_results['successful_queries']}/{overall_results['total_queries']}")
    print(f" 60s Target Compliance: {overall_results['target_compliance']:.1%}")

    print(f"\nApplicable Metrics:")
    print(f" ✅ Metric 1 (Latency): Measured")
    print(f" ❌ Metric 2 (Extraction): Not applicable - no extraction pipeline")
    print(f" ❌ Metric 3 (Relevance): Not applicable - no retrieval pipeline")
    print(f" ❌ Metric 4 (Coverage): Not applicable - no retrieval content")
    print(f" 🔄 Metric 5 (Actionability): Requires LLM judge evaluation")
    print(f" 🔄 Metric 6 (Evidence): Requires LLM judge evaluation")

    print(f"\n✅ Direct LLM evaluation complete!")
    print(f"📊 Statistics: {stats_path}")
    print(f"📝 Medical Outputs: {outputs_path}")
    print(f"\n💡 Next step: Run python metric5_6_llm_judge_evaluator.py rag,direct for metrics 5-6")
|
evaluation/latency_evaluator.py
ADDED
@@ -0,0 +1,892 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
OnCall.ai System - Comprehensive Evaluator (Metrics 1-8)
|
4 |
+
========================================================
|
5 |
+
|
6 |
+
Single execution to collect all metrics 1-4 data from app.py pipeline.
|
7 |
+
Generates foundation data for metrics 5-8 evaluation in downstream processors.
|
8 |
+
|
9 |
+
COMPLETE METRICS OVERVIEW:
|
10 |
+
|
11 |
+
PIPELINE PERFORMANCE METRICS (Collected by this evaluator):
|
12 |
+
1. Total Latency (總處理時長) - Complete pipeline processing time from query to response
|
13 |
+
2. Condition Extraction Success Rate (條件抽取成功率) - Success rate of user_prompt.py condition extraction
|
14 |
+
3. Retrieval Relevance (檢索相關性) - Average cosine similarity scores from retrieval.py results
|
15 |
+
4. Retrieval Coverage (檢索覆蓋率) - Medical keyword utilization rate between retrieved content and generated advice
|
16 |
+
|
17 |
+
LLM JUDGE METRICS (Processed by metric5_6_llm_judge_evaluator.py):
|
18 |
+
5. Clinical Actionability (臨床可操作性) - Third-party LLM evaluation of medical advice actionability (1-10 scale)
|
19 |
+
* Uses batch evaluation strategy with Llama3-70B as judge
|
20 |
+
* Measures: Can healthcare providers immediately act on this advice?
|
21 |
+
* Target threshold: ≥7.0/10 for acceptable actionability
|
22 |
+
|
23 |
+
6. Clinical Evidence Quality (臨床證據品質) - Third-party LLM evaluation of evidence-based quality (1-10 scale)
|
24 |
+
* Uses same batch evaluation call as metric 5 for efficiency
|
25 |
+
* Measures: Is the advice evidence-based and follows medical standards?
|
26 |
+
* Target threshold: ≥7.5/10 for acceptable evidence quality
|
27 |
+
|
28 |
+
RETRIEVAL PRECISION METRICS (Processed by metric7_8_precision_MRR.py):
|
29 |
+
7. Precision@K (檢索精確率) - Proportion of relevant results in top-K retrieval results
|
30 |
+
* Uses adaptive threshold based on query complexity (0.15 for complex, 0.25 for simple queries)
|
31 |
+
* Query complexity determined by unique emergency keywords count (≥4 = complex)
|
32 |
+
* Measures: relevant_results / total_retrieved_results
|
33 |
+
|
34 |
+
8. Mean Reciprocal Rank (平均倒數排名) - Average reciprocal rank of first relevant result
|
35 |
+
* Uses same adaptive threshold as Precision@K
|
36 |
+
* Measures: 1 / rank_of_first_relevant_result (0 if no relevant results)
|
37 |
+
* Higher MRR indicates relevant results appear earlier in ranking
|
38 |
+
|
39 |
+
DATA FLOW ARCHITECTURE:
|
40 |
+
1. latency_evaluator.py → comprehensive_details_*.json (metrics 1-4 + pipeline data)
|
41 |
+
2. latency_evaluator.py → medical_outputs_*.json (medical advice for judge evaluation)
|
42 |
+
3. metric5_6_llm_judge_evaluator.py → judge_evaluation_*.json (metrics 5-6)
|
43 |
+
4. metric7_8_precision_MRR.py → precision_mrr_analysis_*.json (metrics 7-8)
|
44 |
+
|
45 |
+
Note: This evaluator focuses on metrics 1-4 collection. Metrics 5-8 require separate downstream evaluation.
|
46 |
+
|
47 |
+
Author: YanBo Chen
|
48 |
+
Date: 2025-08-04
|
49 |
+
"""
|
50 |
+
|
51 |
+
import time
|
52 |
+
import json
|
53 |
+
import os
|
54 |
+
import sys
|
55 |
+
from typing import Dict, List, Any, Set
|
56 |
+
from datetime import datetime
|
57 |
+
from pathlib import Path
|
58 |
+
import re
|
59 |
+
|
60 |
+
# Add project path
|
61 |
+
current_dir = Path(__file__).parent
|
62 |
+
project_root = current_dir.parent
|
63 |
+
src_dir = project_root / "src"
|
64 |
+
sys.path.insert(0, str(src_dir))
|
65 |
+
|
66 |
+
# Import existing system components
|
67 |
+
try:
|
68 |
+
from user_prompt import UserPromptProcessor
|
69 |
+
from retrieval import BasicRetrievalSystem
|
70 |
+
from llm_clients import llm_Med42_70BClient
|
71 |
+
from generation import MedicalAdviceGenerator
|
72 |
+
except ImportError as e:
|
73 |
+
print(f"❌ Import failed: {e}")
|
74 |
+
print("Please ensure running from project root directory")
|
75 |
+
sys.exit(1)
|
76 |
+
|
77 |
+
|
78 |
+
class ComprehensiveEvaluator:
|
79 |
+
"""Comprehensive evaluator for metrics 1-4 - single execution approach"""
|
80 |
+
|
81 |
+
def __init__(self):
    """
    Build the pipeline components and the empty result stores.

    One LLM client and one retrieval system are created and shared: the
    user-prompt processor receives both, the advice generator receives the
    client. The original author notes this mirrors the set-up in app.py.
    """
    print("🔧 Initializing Comprehensive Evaluator...")

    client = llm_Med42_70BClient()
    retriever = BasicRetrievalSystem()

    self.llm_client = client
    self.retrieval_system = retriever
    self.user_prompt_processor = UserPromptProcessor(
        llm_client=client,
        retrieval_system=retriever
    )
    self.medical_generator = MedicalAdviceGenerator(llm_client=client)

    # Accumulators for the per-query metric records and medical outputs.
    self.comprehensive_results = []
    self.medical_outputs = []

    print("✅ Comprehensive Evaluator initialization complete")
|
99 |
+
|
100 |
+
def extract_medical_keywords(self, text: str) -> Set[str]:
    """
    Collect medical keywords from *text* for the coverage analysis.

    The text is lower-cased, then searched with a fixed list of regular
    expressions and for a fixed vocabulary of common terms. Vocabulary
    terms are found by plain substring search, so a term also counts when
    it occurs inside a longer word. Keywords of two characters or fewer and
    a few common English words are discarded.

    Args:
        text: Text to analyse; a falsy value yields an empty set.

    Returns:
        The set of lower-case keywords found.
    """
    if not text:
        return set()

    lowered = text.lower()

    regexes = (
        r'\b[a-z]+(?:osis|itis|pathy|emia|uria|gram|scopy)\b',  # Medical suffixes
        r'\b(?:cardio|neuro|pulmo|gastro|hepato|nephro)[a-z]+\b',  # Medical prefixes
        r'\b(?:diagnosis|treatment|therapy|intervention|management)\b',  # Medical actions
        r'\b(?:patient|symptom|condition|disease|disorder|syndrome)\b',  # Medical entities
        r'\b(?:acute|chronic|severe|mild|moderate|emergency)\b',  # Medical descriptors
        r'\b[a-z]+(?:al|ic|ous|ive)\s+(?:pain|failure|infection|injury)\b',  # Compound terms
        r'\b(?:ecg|ekg|ct|mri|x-ray|ultrasound|biopsy)\b',  # Medical procedures
        r'\b\d+\s*(?:mg|ml|units|hours|days|minutes)\b',  # Dosages and timeframes
    )
    vocabulary = (
        'blood', 'pressure', 'heart', 'chest', 'pain', 'stroke', 'seizure',
        'emergency', 'hospital', 'monitor', 'assess', 'evaluate', 'immediate',
        'protocol', 'guideline', 'recommendation', 'risk', 'factor',
    )
    stopwords = ['the', 'and', 'for', 'with', 'are', 'can', 'may']

    found = set()
    for rx in regexes:
        found |= {hit.strip() for hit in re.findall(rx, lowered)}
    found |= {word for word in vocabulary if word in lowered}

    # Drop very short terms and common words.
    return {kw for kw in found if len(kw) > 2 and kw not in stopwords}
|
142 |
+
|
143 |
+
def calculate_coverage_metrics(self, generated_advice: str, retrieval_results: List[Dict]) -> Dict[str, Any]:
    """
    Measure how many keywords of the retrieved documents appear in the advice.

    Keywords are extracted with ``self.extract_medical_keywords`` from the
    advice and from each document's ``content`` (falling back to ``text``).
    The coverage score is ``matched / source`` keywords, or 0.0 when there
    are no source keywords.

    Args:
        generated_advice: Generated medical advice text.
        retrieval_results: Retrieved documents as dicts.

    Returns:
        A dict with the score (also as a percentage), the matched, advice
        and source keyword lists with their counts, and ``meets_threshold``
        (score >= 0.4). The same keys are returned in every case; if the
        advice or the results are empty, all lists are empty and all
        numbers are zero. Keyword lists are sorted so that saved results
        are reproducible.
    """
    advice_keywords = set()
    source_keywords = set()

    # Without both advice and documents there is nothing to compare; the
    # empty sets then yield an all-zero result with the full key set.
    if generated_advice and retrieval_results:
        advice_keywords = self.extract_medical_keywords(generated_advice)
        for doc in retrieval_results:
            doc_content = doc.get('content', '') or doc.get('text', '')
            source_keywords.update(self.extract_medical_keywords(doc_content))

    matched_keywords = advice_keywords.intersection(source_keywords)
    coverage_score = len(matched_keywords) / len(source_keywords) if source_keywords else 0.0

    return {
        "coverage_score": coverage_score,
        "matched_keywords": sorted(matched_keywords),
        "advice_keywords": sorted(advice_keywords),
        "source_keywords": sorted(source_keywords),
        "advice_keywords_count": len(advice_keywords),
        "source_keywords_count": len(source_keywords),
        "matched_keywords_count": len(matched_keywords),
        "coverage_percentage": coverage_score * 100,
        "meets_threshold": coverage_score >= 0.4
    }
|
180 |
+
|
181 |
+
def evaluate_single_query_comprehensive(self, query: str, category: str = "unknown") -> Dict[str, Any]:
    """
    Comprehensive evaluation for single query - collects all metrics 1-4 data

    Runs condition extraction, confirmation, retrieval and advice generation
    in order, timing each step, then derives latency, extraction, relevance
    and coverage metrics from that single run. (The original author notes this
    is meant to replicate app.py's process_medical_query pipeline.)

    On success the result is appended to ``self.comprehensive_results`` and the
    generated advice to ``self.medical_outputs``.

    Args:
        query: Medical query to test
        category: Query category (diagnosis/treatment/mixed)

    Returns:
        The comprehensive result dict on success; otherwise the value returned
        by ``self._create_failed_result`` (query rejected as invalid or
        non-medical, no condition extracted, or an exception in any step).
    """
    print(f"🔍 Comprehensive evaluation: {query[:50]}...")
    print(f"📋 Category: {category}")

    overall_start = time.time()
    timing_details = {}

    try:
        # STEP 1: Query Processing and Condition Extraction
        step1_start = time.time()
        condition_result = self.user_prompt_processor.extract_condition_keywords(query)
        step1_time = time.time() - step1_start
        timing_details['step1_condition_extraction'] = step1_time

        print(f" Step 1 - Condition extraction: {step1_time:.3f}s")
        print(f" Extracted condition: {condition_result.get('condition', 'None')}")

        # Check if valid medical query
        if condition_result.get('query_status') in ['invalid_query', 'non_medical']:
            total_time = time.time() - overall_start
            return self._create_failed_result(query, category, total_time, timing_details,
                                              "non_medical", condition_result)

        # STEP 2: User Confirmation (simulate auto-confirmation).
        # The return value is not used; the call is kept so its cost is timed.
        step2_start = time.time()
        self.user_prompt_processor.handle_user_confirmation(condition_result)
        step2_time = time.time() - step2_start
        timing_details['step2_confirmation'] = step2_time

        if not condition_result.get('condition'):
            total_time = time.time() - overall_start
            return self._create_failed_result(query, category, total_time, timing_details,
                                              "no_condition", condition_result)

        # STEP 3: Medical Guidelines Retrieval
        step3_start = time.time()

        search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
        if not search_query:
            search_query = condition_result.get('condition', query)

        retrieval_results = self.retrieval_system.search(search_query, top_k=5)
        step3_time = time.time() - step3_start
        timing_details['step3_retrieval'] = step3_time

        processed_results = retrieval_results.get('processed_results', [])
        print(f" Step 3 - Retrieval: {step3_time:.3f}s ({len(processed_results)} results)")

        # STEP 4: Medical Advice Generation
        step4_start = time.time()

        intention = self._detect_query_intention(query)
        medical_advice_result = self.medical_generator.generate_medical_advice(
            user_query=query,
            retrieval_results=retrieval_results,
            intention=intention
        )
        step4_time = time.time() - step4_start
        timing_details['step4_generation'] = step4_time

        generated_advice = medical_advice_result.get('medical_advice', '')
        confidence_score = medical_advice_result.get('confidence_score', 0.0)

        print(f" Step 4 - Generation: {step4_time:.3f}s")

        total_time = time.time() - overall_start

        # METRIC 2: Condition Extraction Analysis
        extraction_success = bool(
            condition_result.get('condition') and
            condition_result.get('condition') != "unknown" and
            condition_result.get('query_status') not in ['invalid_query', 'non_medical']
        )

        extraction_metrics = {
            "extraction_success": extraction_success,
            "extracted_condition": condition_result.get('condition'),
            "query_status": condition_result.get('query_status'),
            "emergency_keywords": condition_result.get('emergency_keywords', []),
            "treatment_keywords": condition_result.get('treatment_keywords', []),
            "fallback_level": condition_result.get('fallback_level', 'unknown'),
            "extraction_time": step1_time
        }

        # METRIC 3: Retrieval Relevance Analysis
        if processed_results:
            # relevance = 1 - d^2/2, which equals cosine similarity when d is
            # the Euclidean distance between unit-normalized vectors.
            # NOTE(review): assumes 'distance' holds that quantity - confirm
            # against the retrieval system.
            relevance_scores = [
                1.0 - (doc_result.get('distance', 1.0) ** 2) / 2.0
                for doc_result in processed_results
            ]

            average_relevance = sum(relevance_scores) / len(relevance_scores)
            high_relevance_count = sum(1 for score in relevance_scores if score >= 0.85)

            relevance_metrics = {
                "average_relevance": average_relevance,
                "max_relevance": max(relevance_scores),
                "min_relevance": min(relevance_scores),
                "relevance_scores": relevance_scores,
                "high_relevance_count": high_relevance_count,
                "high_relevance_ratio": high_relevance_count / len(relevance_scores),
                "retrieved_count": len(processed_results),
                "meets_threshold": average_relevance >= 0.85,
                "retrieval_time": step3_time
            }
        else:
            # Bind average_relevance here too: the summary print below reads it,
            # and leaving it unbound turned an empty retrieval into a NameError
            # after the success record had already been stored.
            average_relevance = 0.0
            relevance_metrics = {
                "average_relevance": average_relevance,
                "max_relevance": 0.0,
                "min_relevance": 0.0,
                "relevance_scores": [],
                # Legacy key kept for consumers of the earlier empty schema.
                "similarity_scores": [],
                "high_relevance_count": 0,
                "high_relevance_ratio": 0.0,
                "retrieved_count": 0,
                "meets_threshold": False,
                "retrieval_time": step3_time
            }

        # METRIC 4: Retrieval Coverage Analysis
        coverage_metrics = self.calculate_coverage_metrics(generated_advice, processed_results)
        coverage_metrics["generation_time"] = step4_time

        # Create comprehensive result
        comprehensive_result = {
            "query": query,
            "category": category,

            # Metric 1: Total Latency - Complete pipeline processing time
            "latency_metrics": {
                "total_latency": total_time,
                "timing_details": timing_details,
                "meets_target": total_time <= 60.0
            },

            # Metric 2: Condition Extraction
            "extraction_metrics": extraction_metrics,

            # Metric 3: Retrieval Relevance
            "relevance_metrics": relevance_metrics,

            # Metric 4: Retrieval Coverage - Advice utilization of retrieved content
            "coverage_metrics": coverage_metrics,

            # Complete pipeline data (for debugging and detailed analysis)
            "pipeline_data": {
                "condition_result": condition_result,
                "retrieval_results": retrieval_results,
                "medical_advice_result": medical_advice_result,
                "search_query": search_query,
                "intention": intention
            },

            "overall_success": True,
            "timestamp": datetime.now().isoformat()
        }

        # Metrics 7-8 (precision/MRR) readiness: needs at least one retrieved
        # document carrying a 'distance' and an extracted condition. Overall
        # success and retrieval timing are always present on this path.
        comprehensive_result['precision_mrr_ready'] = bool(
            processed_results
            and 'distance' in processed_results[0]
            and condition_result.get('condition')
        )

        # Store result
        self.comprehensive_results.append(comprehensive_result)

        # Store medical output for model comparison
        medical_output = {
            "query": query,
            "category": category,
            "medical_advice": generated_advice,
            "confidence_score": confidence_score,
            "query_id": f"{category}_query",
            "processing_time": total_time,
            "timestamp": datetime.now().isoformat()
        }
        self.medical_outputs.append(medical_output)

        print(f"✅ Comprehensive evaluation completed in {total_time:.2f}s")
        print(f" 📊 Metrics: Latency={total_time:.2f}s, Extraction={'✅' if extraction_success else '❌'}, "
              f"Relevance={average_relevance:.3f}, Coverage={coverage_metrics['coverage_score']:.3f}")

        return comprehensive_result

    except Exception as e:
        # Top-level boundary for one query: record the failure and keep the
        # batch evaluation running.
        total_time = time.time() - overall_start
        print(f"❌ Comprehensive evaluation failed after {total_time:.2f}s: {e}")

        return self._create_failed_result(query, category, total_time, timing_details, "error", None, str(e))
|
399 |
+
|
400 |
+
def _create_failed_result(self, query: str, category: str, total_time: float,
|
401 |
+
timing_details: Dict, status: str, condition_result: Dict = None,
|
402 |
+
error: str = None) -> Dict[str, Any]:
|
403 |
+
"""Create standardized failed result"""
|
404 |
+
failed_result = {
|
405 |
+
"query": query,
|
406 |
+
"category": category,
|
407 |
+
|
408 |
+
# Metric 1: Total Latency - Always measurable even on failure
|
409 |
+
"latency_metrics": {
|
410 |
+
"total_latency": total_time,
|
411 |
+
"timing_details": timing_details,
|
412 |
+
"meets_target": total_time <= 60.0
|
413 |
+
},
|
414 |
+
|
415 |
+
# Metric 2: Condition Extraction - Partial data may be available before failure
|
416 |
+
"extraction_metrics": {
|
417 |
+
"extraction_success": False,
|
418 |
+
"extracted_condition": condition_result.get('condition') if condition_result else None,
|
419 |
+
"query_status": condition_result.get('query_status') if condition_result else status,
|
420 |
+
"extraction_time": timing_details.get('step1_condition_extraction', 0.0)
|
421 |
+
},
|
422 |
+
|
423 |
+
# Metric 3: Retrieval Relevance - Failed due to pipeline failure
|
424 |
+
"relevance_metrics": {
|
425 |
+
"average_relevance": 0.0,
|
426 |
+
"retrieved_count": 0,
|
427 |
+
"meets_threshold": False,
|
428 |
+
"retrieval_time": timing_details.get('step3_retrieval', 0.0)
|
429 |
+
},
|
430 |
+
|
431 |
+
# Metric 4: Retrieval Coverage - Failed due to pipeline failure
|
432 |
+
"coverage_metrics": {
|
433 |
+
"coverage_score": 0.0,
|
434 |
+
"meets_threshold": False,
|
435 |
+
"generation_time": timing_details.get('step4_generation', 0.0)
|
436 |
+
},
|
437 |
+
|
438 |
+
# Note: Metrics 5-6 (Clinical Actionability & Evidence Quality)
|
439 |
+
# are collected by metric5_6_llm_judge_evaluator.py using medical_outputs
|
440 |
+
# Metrics 7-8 (Precision@K & MRR) are collected by metric7_8_precision_MRR.py
|
441 |
+
# using comprehensive_details pipeline data
|
442 |
+
|
443 |
+
"overall_success": False,
|
444 |
+
"status": status,
|
445 |
+
"error": error,
|
446 |
+
"timestamp": datetime.now().isoformat()
|
447 |
+
}
|
448 |
+
|
449 |
+
# For failed results, precision/MRR analysis data is not ready
|
450 |
+
failed_result['precision_mrr_ready'] = False
|
451 |
+
|
452 |
+
self.comprehensive_results.append(failed_result)
|
453 |
+
return failed_result
|
454 |
+
|
455 |
+
def _detect_query_intention(self, query: str) -> str:
|
456 |
+
"""Simplified query intention detection (from app.py)"""
|
457 |
+
query_lower = query.lower()
|
458 |
+
|
459 |
+
if any(word in query_lower for word in ['diagnos', 'differential', 'possible', 'causes']):
|
460 |
+
return 'diagnosis'
|
461 |
+
elif any(word in query_lower for word in ['treat', 'manage', 'therapy', 'intervention']):
|
462 |
+
return 'treatment'
|
463 |
+
else:
|
464 |
+
return 'mixed'
|
465 |
+
def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
    """Read a query file and group its entries by category.

    Each usable line looks like ``1.diagnosis: query text``; the label may be
    ``diagnosis``, ``treatment``, ``mixed`` or ``mixed/complicated`` (any
    case), the last two both being stored under ``mixed``. Lines that do not
    match, and queries of 15 characters or fewer, are ignored.

    Args:
        filepath: Path of the UTF-8 text file to read.

    Returns:
        Mapping of category name to a list of ``{"text", "category"}`` dicts,
        or ``{"error": message}`` if reading or parsing raised.
    """
    print(f"📁 Reading queries from file: {filepath}")

    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        buckets = {
            "diagnosis": [],
            "treatment": [],
            "mixed": []
        }

        for raw_line in raw_text.strip().split('\n'):
            entry = raw_line.strip()
            if not entry:
                continue

            # Expected format: "1.diagnosis: query text"
            parsed = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', entry, re.IGNORECASE)
            if not parsed:
                continue

            label = parsed.group(1).lower()
            text = parsed.group(2).strip()

            # Both mixed spellings share one bucket.
            if label in ['mixed/complicated', 'mixed']:
                label = 'mixed'

            if label in buckets and len(text) > 15:
                buckets[label].append({
                    "text": text,
                    "category": label
                })

        print(f"📋 Parsed queries by category:")
        for label, items in buckets.items():
            print(f" {label.capitalize()}: {len(items)} queries")

        return buckets

    except Exception as e:
        print(f"❌ Failed to read file: {e}")
        return {"error": f"Failed to read file: {e}"}
|
514 |
+
|
515 |
+
def calculate_metric_statistics(self, metric_name: str) -> Dict[str, Any]:
    """Summarize one metric per category and overall from the stored results.

    Args:
        metric_name: One of "latency", "extraction", "relevance", "coverage".
            Any other name yields empty per-category results.

    Returns:
        Dict with ``category_results`` (keyed by diagnosis/treatment/mixed),
        ``overall_results`` (from ``self._calculate_overall_stats``) and an
        ISO ``timestamp``.
    """
    grouped = {
        "diagnosis": [],
        "treatment": [],
        "mixed": []
    }
    succeeded_overall = []

    # Results in unrecognized categories are left out of the per-category
    # groups but still count toward the overall successful set.
    for entry in self.comprehensive_results:
        label = entry.get('category', 'unknown')
        if label in grouped:
            grouped[label].append(entry)
        if entry.get('overall_success'):
            succeeded_overall.append(entry)

    per_category = {}
    for label, entries in grouped.items():
        succeeded = [e for e in entries if e.get('overall_success')]

        if metric_name == "latency":
            if not succeeded:
                per_category[label] = self._get_empty_latency_stats()
            else:
                latencies = [e['latency_metrics']['total_latency'] for e in succeeded]
                per_category[label] = {
                    "average_latency": sum(latencies) / len(latencies),
                    "std_deviation": self._calculate_std(latencies),
                    "min_latency": min(latencies),
                    "max_latency": max(latencies),
                    "query_count": len(latencies),
                    "target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies),
                    "individual_latencies": latencies
                }

        elif metric_name == "extraction":
            # Extraction is scored over every query, failed ones included.
            flags = [e['extraction_metrics']['extraction_success'] for e in entries]
            hits = sum(flags)
            total = len(entries)

            if entries:
                rate = hits / total
                mean_time = sum(e['extraction_metrics']['extraction_time'] for e in entries) / total
                passes = rate >= 0.8
            else:
                rate = 0.0
                mean_time = 0.0
                passes = False

            per_category[label] = {
                "success_rate": rate,
                "successful_count": hits,
                "total_count": total,
                "average_extraction_time": mean_time,
                "meets_threshold": passes
            }

        elif metric_name == "relevance":
            if not succeeded:
                per_category[label] = self._get_empty_relevance_stats(len(entries))
            else:
                scores = [e['relevance_metrics']['average_relevance'] for e in succeeded]
                per_category[label] = {
                    "average_relevance": sum(scores) / len(scores),
                    "max_relevance": max(scores),
                    "min_relevance": min(scores),
                    "successful_retrievals": len(succeeded),
                    "total_queries": len(entries),
                    "meets_threshold": (sum(scores) / len(scores)) >= 0.85,
                    "individual_relevance_scores": scores
                }

        elif metric_name == "coverage":
            if not succeeded:
                per_category[label] = self._get_empty_coverage_stats(len(entries))
            else:
                scores = [e['coverage_metrics']['coverage_score'] for e in succeeded]
                per_category[label] = {
                    "average_coverage": sum(scores) / len(scores),
                    "max_coverage": max(scores),
                    "min_coverage": min(scores),
                    "successful_evaluations": len(succeeded),
                    "total_queries": len(entries),
                    "meets_threshold": (sum(scores) / len(scores)) >= 0.4,
                    "individual_coverage_scores": scores
                }

    overall = self._calculate_overall_stats(metric_name, succeeded_overall)

    return {
        "category_results": per_category,
        "overall_results": overall,
        "timestamp": datetime.now().isoformat()
    }
|
603 |
+
|
604 |
+
def _calculate_std(self, values: List[float]) -> float:
|
605 |
+
"""Calculate standard deviation"""
|
606 |
+
if len(values) < 2:
|
607 |
+
return 0.0
|
608 |
+
|
609 |
+
mean = sum(values) / len(values)
|
610 |
+
variance = sum((x - mean) ** 2 for x in values) / len(values)
|
611 |
+
return variance ** 0.5
|
612 |
+
|
613 |
+
def _get_empty_latency_stats(self) -> Dict[str, Any]:
|
614 |
+
"""Return empty latency statistics"""
|
615 |
+
return {
|
616 |
+
"average_latency": 0.0,
|
617 |
+
"std_deviation": 0.0,
|
618 |
+
"min_latency": 0.0,
|
619 |
+
"max_latency": 0.0,
|
620 |
+
"query_count": 0,
|
621 |
+
"target_compliance": 0.0,
|
622 |
+
"individual_latencies": []
|
623 |
+
}
|
624 |
+
|
625 |
+
def _get_empty_relevance_stats(self, total_queries: int) -> Dict[str, Any]:
|
626 |
+
"""Return empty relevance statistics"""
|
627 |
+
return {
|
628 |
+
"average_relevance": 0.0,
|
629 |
+
"max_relevance": 0.0,
|
630 |
+
"min_relevance": 0.0,
|
631 |
+
"successful_retrievals": 0,
|
632 |
+
"total_queries": total_queries,
|
633 |
+
"meets_threshold": False,
|
634 |
+
"individual_relevance_scores": []
|
635 |
+
}
|
636 |
+
|
637 |
+
def _get_empty_coverage_stats(self, total_queries: int) -> Dict[str, Any]:
|
638 |
+
"""Return empty coverage statistics"""
|
639 |
+
return {
|
640 |
+
"average_coverage": 0.0,
|
641 |
+
"max_coverage": 0.0,
|
642 |
+
"min_coverage": 0.0,
|
643 |
+
"successful_evaluations": 0,
|
644 |
+
"total_queries": total_queries,
|
645 |
+
"meets_threshold": False,
|
646 |
+
"individual_coverage_scores": []
|
647 |
+
}
|
648 |
+
|
649 |
+
def _calculate_overall_stats(self, metric_name: str, all_successful_results: List[Dict]) -> Dict[str, Any]:
|
650 |
+
"""Calculate overall statistics for a specific metric"""
|
651 |
+
total_queries = len(self.comprehensive_results)
|
652 |
+
|
653 |
+
if metric_name == "latency" and all_successful_results:
|
654 |
+
latencies = [r['latency_metrics']['total_latency'] for r in all_successful_results]
|
655 |
+
return {
|
656 |
+
"average_latency": sum(latencies) / len(latencies),
|
657 |
+
"std_deviation": self._calculate_std(latencies),
|
658 |
+
"min_latency": min(latencies),
|
659 |
+
"max_latency": max(latencies),
|
660 |
+
"successful_queries": len(all_successful_results),
|
661 |
+
"total_queries": total_queries,
|
662 |
+
"target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies)
|
663 |
+
}
|
664 |
+
|
665 |
+
elif metric_name == "extraction":
|
666 |
+
all_extractions = [r['extraction_metrics']['extraction_success'] for r in self.comprehensive_results]
|
667 |
+
successful_extractions = sum(all_extractions)
|
668 |
+
return {
|
669 |
+
"success_rate": successful_extractions / len(all_extractions) if all_extractions else 0.0,
|
670 |
+
"successful_count": successful_extractions,
|
671 |
+
"total_count": len(all_extractions),
|
672 |
+
"target_compliance": (successful_extractions / len(all_extractions)) >= 0.8 if all_extractions else False
|
673 |
+
}
|
674 |
+
|
675 |
+
elif metric_name == "relevance" and all_successful_results:
|
676 |
+
relevance_scores = [r['relevance_metrics']['average_relevance'] for r in all_successful_results]
|
677 |
+
return {
|
678 |
+
"average_relevance": sum(relevance_scores) / len(relevance_scores),
|
679 |
+
"max_relevance": max(relevance_scores),
|
680 |
+
"min_relevance": min(relevance_scores),
|
681 |
+
"successful_queries": len(all_successful_results),
|
682 |
+
"total_queries": total_queries,
|
683 |
+
"meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.85,
|
684 |
+
"target_compliance": (sum(relevance_scores) / len(relevance_scores)) >= 0.7
|
685 |
+
}
|
686 |
+
|
687 |
+
elif metric_name == "coverage" and all_successful_results:
|
688 |
+
coverage_scores = [r['coverage_metrics']['coverage_score'] for r in all_successful_results]
|
689 |
+
return {
|
690 |
+
"average_coverage": sum(coverage_scores) / len(coverage_scores),
|
691 |
+
"max_coverage": max(coverage_scores),
|
692 |
+
"min_coverage": min(coverage_scores),
|
693 |
+
"successful_queries": len(all_successful_results),
|
694 |
+
"total_queries": total_queries,
|
695 |
+
"meets_threshold": (sum(coverage_scores) / len(coverage_scores)) >= 0.4
|
696 |
+
}
|
697 |
+
|
698 |
+
# Return empty stats for failed cases
|
699 |
+
return {
|
700 |
+
"average_value": 0.0,
|
701 |
+
"successful_queries": len(all_successful_results),
|
702 |
+
"total_queries": total_queries,
|
703 |
+
"meets_threshold": False
|
704 |
+
}
|
705 |
+
def save_all_metric_statistics(self) -> Dict[str, str]:
    """Write one timestamped JSON statistics file per metric.

    Files go to a ``results`` directory next to this module (created if
    missing) and share one timestamp.

    Returns:
        Mapping of metric name to the path of the file written for it.
    """
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Ensure results directory exists
    output_dir = Path(__file__).parent / "results"
    output_dir.mkdir(exist_ok=True)

    written = {}
    for metric in ("latency", "extraction", "relevance", "coverage"):
        summary = self.calculate_metric_statistics(metric)
        target = output_dir / f"{metric}_statistics_{run_stamp}.json"

        with open(target, 'w', encoding='utf-8') as out:
            json.dump(summary, out, indent=2, ensure_ascii=False)

        written[metric] = str(target)
        print(f"📊 {metric.capitalize()} statistics saved to: {target}")

    return written
|
728 |
+
|
729 |
+
def save_medical_outputs(self, filename: str = None) -> str:
    """Write the collected medical advice outputs to a JSON file.

    Args:
        filename: File name inside the ``results`` directory next to this
            module; defaults to ``medical_outputs_<timestamp>.json``.

    Returns:
        Path of the written file, as a string.
    """
    if filename is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"medical_outputs_{stamp}.json"

    # Ensure results directory exists
    output_dir = Path(__file__).parent / "results"
    output_dir.mkdir(exist_ok=True)
    target = output_dir / filename

    metadata = {
        "total_outputs": len(self.medical_outputs),
        "categories": list({item['category'] for item in self.medical_outputs}),
        "timestamp": datetime.now().isoformat(),
        "model_type": "Med42-70B_RAG_enhanced"  # For future comparison
    }
    payload = {
        "evaluation_metadata": metadata,
        "medical_outputs": self.medical_outputs
    }

    with open(target, 'w', encoding='utf-8') as out:
        json.dump(payload, out, indent=2, ensure_ascii=False)

    print(f"📝 Medical outputs saved to: {target}")
    return str(target)
|
757 |
+
|
758 |
+
def save_comprehensive_details(self, filename: str = None) -> str:
    """Write every stored per-query result, with summary metadata, to JSON.

    Args:
        filename: File name inside the ``results`` directory next to this
            module; defaults to ``comprehensive_details_<timestamp>.json``.

    Returns:
        Path of the written file, as a string.
    """
    if filename is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"comprehensive_details_{stamp}.json"

    # Ensure results directory exists
    output_dir = Path(__file__).parent / "results"
    output_dir.mkdir(exist_ok=True)
    target = output_dir / filename

    succeeded = [entry for entry in self.comprehensive_results if entry.get('overall_success')]
    metadata = {
        "total_queries": len(self.comprehensive_results),
        "successful_queries": len(succeeded),
        "timestamp": datetime.now().isoformat(),
        "evaluator_type": "comprehensive_metrics_1_to_4",
        "metrics_evaluated": ["latency", "extraction", "relevance", "coverage"]
    }
    payload = {
        "evaluation_metadata": metadata,
        "comprehensive_results": self.comprehensive_results
    }

    with open(target, 'w', encoding='utf-8') as out:
        json.dump(payload, out, indent=2, ensure_ascii=False)

    print(f"📋 Comprehensive details saved to: {target}")
    return str(target)
|
787 |
+
|
788 |
+
|
789 |
+
# Independent execution interface
|
790 |
+
if __name__ == "__main__":
    # Independent comprehensive evaluation interface: reads a query file,
    # evaluates every query once, then saves and summarizes metrics 1-4.

    print("🚀 OnCall.ai Comprehensive Evaluator - Metrics 1-4 in Single Run")

    if len(sys.argv) > 1:
        query_file = sys.argv[1]
    else:
        # Default to evaluation/single_test_query.txt for initial testing
        query_file = Path(__file__).parent / "single_test_query.txt"

    if not os.path.exists(query_file):
        print(f"❌ Query file not found: {query_file}")
        print("Usage: python latency_evaluator.py [query_file.txt]")
        sys.exit(1)

    # Initialize evaluator
    evaluator = ComprehensiveEvaluator()

    # Parse queries from file
    queries_by_category = evaluator.parse_queries_from_file(str(query_file))

    if "error" in queries_by_category:
        print(f"❌ Failed to parse queries: {queries_by_category['error']}")
        sys.exit(1)

    # Test each query comprehensively
    print(f"\n🧪 Comprehensive Evaluation - All Metrics in Single Run")
    print(f"📊 Collecting metrics 1-4 from single app.py pipeline execution")

    # Only categories that actually have queries take part, so the
    # between-category pause is never spent waiting for an empty category.
    pending_categories = [(name, items) for name, items in queries_by_category.items() if items]

    for category_index, (category, queries) in enumerate(pending_categories):
        print(f"\n📂 Testing {category.upper()} queries:")

        for i, query_info in enumerate(queries):
            query_text = query_info['text']
            print(f"\n🔍 Query {i+1}/{len(queries)} in {category} category:")
            print(f" Text: {query_text}")

            # Comprehensive evaluation (collects all metrics 1-4); the result
            # is stored on the evaluator, so the return value is not needed.
            evaluator.evaluate_single_query_comprehensive(query_text, category)

            # Pause between queries to avoid rate limits
            if i < len(queries) - 1:
                print(f" ⏳ Pausing 5s before next query...")
                time.sleep(5)

        # Longer pause between categories
        if category_index < len(pending_categories) - 1:
            print(f"\n⏳ Pausing 10s before next category...")
            time.sleep(10)

    # Generate and save all metric statistics
    print(f"\n📊 Generating comprehensive analysis for all metrics...")

    # Save separate statistics for each metric
    saved_stats = evaluator.save_all_metric_statistics()

    # Save medical outputs for model comparison
    outputs_path = evaluator.save_medical_outputs()

    # Save comprehensive details
    details_path = evaluator.save_comprehensive_details()

    # Print comprehensive summary
    print(f"\n📊 === COMPREHENSIVE EVALUATION SUMMARY ===")

    for metric_name in ["latency", "extraction", "relevance", "coverage"]:
        stats = evaluator.calculate_metric_statistics(metric_name)
        overall_results = stats['overall_results']

        print(f"\n{metric_name.upper()} METRICS:")

        # .get() with zero defaults: when no query succeeded the overall stats
        # may not carry the metric-specific keys, and the summary must still
        # print instead of dying with KeyError after all files were saved.
        if metric_name == "latency":
            average_latency = overall_results.get('average_latency', 0.0)
            std_deviation = overall_results.get('std_deviation', 0.0)
            print(f" Average: {average_latency:.2f}s (±{std_deviation:.2f})")
            print(f" 60s Target: {'✅ Met' if overall_results.get('target_compliance', 0.0) >= 0.8 else '❌ Not Met'}")

        elif metric_name == "extraction":
            print(f" Success Rate: {overall_results.get('success_rate', 0.0):.1%}")
            print(f" 80% Target: {'✅ Met' if overall_results.get('target_compliance', False) else '❌ Not Met'}")

        elif metric_name == "relevance":
            print(f" Average Relevance: {overall_results.get('average_relevance', 0.0):.3f}")
            print(f" 0.70 Target: {'✅ Met' if overall_results.get('target_compliance', False) else '❌ Not Met'}")

        elif metric_name == "coverage":
            average_coverage = overall_results.get('average_coverage', 0.0)
            print(f" Average Coverage: {average_coverage:.3f} ({average_coverage*100:.1f}%)")
            print(f" 40% Target: {'✅ Met' if overall_results.get('meets_threshold', False) else '❌ Not Met'}")

    print(f"\n✅ Comprehensive evaluation complete! Files saved:")
    for metric_name, filepath in saved_stats.items():
        print(f" 📊 {metric_name.capitalize()}: {filepath}")
    print(f" 📝 Medical Outputs: {outputs_path}")
    print(f" 📋 Comprehensive Details: {details_path}")
    print(f"\n💡 Next step: Run downstream evaluators for metrics 5-8")
    print(f" python metric5_6_llm_judge_evaluator.py rag")
    print(f" python metric7_8_precision_MRR.py {details_path}")
    # Chart generator names follow the files shipped alongside this evaluator.
    print(f" python metric1_latency_chart_generator.py")
    print(f" python metric2_extraction_chart_generator.py")
    print(f" python metric3_relevance_chart_generator.py")
    print(f" python metric4_coverage_chart_generator.py")
|
evaluation/metric1_latency_chart_generator.py
ADDED
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
OnCall.ai System - Latency Chart Generator
|
4 |
+
==========================================
|
5 |
+
|
6 |
+
Generates comprehensive latency analysis charts from saved statistics.
|
7 |
+
Reads JSON files produced by latency_evaluator.py and creates visualizations.
|
8 |
+
|
9 |
+
No LLM calls - pure data visualization.
|
10 |
+
|
11 |
+
Author: YanBo Chen
|
12 |
+
Date: 2025-08-04
|
13 |
+
"""
|
14 |
+
|
15 |
+
import json
|
16 |
+
import os
|
17 |
+
import sys
|
18 |
+
from typing import Dict, List, Any
|
19 |
+
from datetime import datetime
|
20 |
+
from pathlib import Path
|
21 |
+
import glob
|
22 |
+
|
23 |
+
# Visualization imports
|
24 |
+
import matplotlib.pyplot as plt
|
25 |
+
import seaborn as sns
|
26 |
+
import pandas as pd
|
27 |
+
import numpy as np
|
28 |
+
|
29 |
+
|
30 |
+
class LatencyChartGenerator:
    """Generate charts from latency evaluation statistics - no LLM dependency.

    The generator reads a statistics dictionary containing
    ``category_results`` and ``overall_results`` and renders a 2x2 figure
    (bar chart, scatter plot, summary table, distribution plot) into the
    ``results`` directory next to this module.
    """

    # Latency target in seconds, drawn as the reference line on every chart.
    # NOTE(review): the evaluator's console summary prints a "60s Target" for
    # latency - confirm which value is intended and adjust here if needed.
    TARGET_LATENCY_SECONDS = 30.0

    def __init__(self):
        """Initialize chart generator and apply the plotting style."""
        print("📈 Initializing Latency Chart Generator...")

        # Set up professional chart style
        plt.style.use('default')
        sns.set_palette("husl")

        print("✅ Chart Generator ready")

    @staticmethod
    def _display_name(category: str) -> str:
        """Turn a category key such as ``some_category`` into a chart label."""
        return category.replace('_', ' ').title()

    def load_latest_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """
        Load the most recent latency statistics file.

        Args:
            results_dir: Directory containing ``latency_statistics_*.json``
                files, given as a string or ``Path``. Defaults to the
                ``results`` directory next to this module.

        Returns:
            The parsed JSON content of the most recently modified file.

        Raises:
            FileNotFoundError: If no matching statistics file exists.
        """
        # Normalise to Path: a plain string argument does not support "/".
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"
        else:
            results_dir = Path(results_dir)

        # Find latest statistics file (by modification time)
        stat_files = glob.glob(str(results_dir / "latency_statistics_*.json"))
        if not stat_files:
            raise FileNotFoundError(f"No latency statistics files found in {results_dir}")

        latest_file = max(stat_files, key=os.path.getmtime)
        print(f"📊 Loading statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def generate_comprehensive_charts(self, stats: Dict[str, Any]) -> str:
        """
        Generate comprehensive 4-category latency analysis charts.

        Creates one figure showing:
        1. Category comparison bar chart
        2. Individual query scatter plot
        3. Statistical summary table
        4. Performance distribution box plot

        Args:
            stats: Statistics dictionary with ``category_results`` and
                ``overall_results`` entries.

        Returns:
            Path of the saved PNG as a string, or ``""`` if anything failed
            (the error is printed, not raised).
        """
        target = self.TARGET_LATENCY_SECONDS
        target_label = f'{target:.0f}s Target'
        palette = ['#1f77b4', '#ff7f0e', '#d62728']
        fig = None

        try:
            # Create figure with subplots
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Latency Analysis - Category Comparison',
                         fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Categories that actually contain queries, in file order.
            populated = [(name, cat) for name, cat in category_results.items()
                         if cat['query_count'] > 0]

            # Chart 1: Category Comparison Bar Chart
            ax1 = axes[0, 0]
            categories = [self._display_name(name) for name, _ in populated]
            avg_latencies = [cat['average_latency'] for _, cat in populated]
            std_devs = [cat['std_deviation'] for _, cat in populated]

            # Add overall
            categories.append('Overall')
            avg_latencies.append(overall_results['average_latency'])
            std_devs.append(overall_results['std_deviation'])

            # Create bar chart with error bars
            bars = ax1.bar(categories, avg_latencies, capsize=5, alpha=0.8,
                           color=palette + ['#2ca02c'])
            ax1.errorbar(categories, avg_latencies, yerr=std_devs, fmt='none',
                         color='black', capsize=3, capthick=1)

            ax1.set_title('Average Latency by Category', fontweight='bold')
            ax1.set_ylabel('Latency (seconds)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add value labels on bars, nudged upwards in proportion to the spread
            for bar, avg, std in zip(bars, avg_latencies, std_devs):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + std*0.1,
                         f'{avg:.1f}s', ha='center', va='bottom', fontweight='bold')

            # Add target line
            ax1.axhline(y=target, color='red', linestyle='--', alpha=0.7, label=target_label)
            ax1.legend()

            # Chart 2: Individual Query Performance
            ax2 = axes[0, 1]
            color_map = {'diagnosis': '#1f77b4', 'treatment': '#ff7f0e', 'mixed': '#d62728'}

            latencies = []
            point_colors = []
            for category, cat_stats in category_results.items():
                for latency in cat_stats['individual_latencies']:
                    latencies.append(latency)
                    point_colors.append(color_map.get(category, 'gray'))

            if latencies:
                ax2.scatter(range(len(latencies)), latencies, c=point_colors, alpha=0.7, s=100)
                ax2.set_title('Individual Query Performance', fontweight='bold')
                ax2.set_ylabel('Latency (seconds)')
                ax2.set_xlabel('Query Index')
                ax2.grid(True, alpha=0.3)

                # Add target line
                target_line = ax2.axhline(y=target, color='red', linestyle='--',
                                          alpha=0.7, label=target_label)

                # Add category legend; the target line is passed explicitly,
                # otherwise custom handles would hide its label.
                from matplotlib.patches import Patch
                legend_elements = [Patch(facecolor=color_map[cat], label=cat.title())
                                   for cat in color_map if cat in category_results]
                ax2.legend(handles=legend_elements + [target_line])
            else:
                ax2.text(0.5, 0.5, 'No latency data available',
                         ha='center', va='center', transform=ax2.transAxes)
                ax2.set_title('Individual Query Performance', fontweight='bold')

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            headers = ['Category', 'Avg (s)', 'Std (s)', 'Min (s)', 'Max (s)', 'Count']
            table_data = [
                [
                    self._display_name(name),
                    f"{cat['average_latency']:.2f}",
                    f"{cat['std_deviation']:.2f}",
                    f"{cat['min_latency']:.2f}",
                    f"{cat['max_latency']:.2f}",
                    str(cat['query_count'])
                ]
                for name, cat in populated
            ]

            # Add overall row
            table_data.append([
                'Overall',
                f"{overall_results['average_latency']:.2f}",
                f"{overall_results['std_deviation']:.2f}",
                f"{overall_results['min_latency']:.2f}",
                f"{overall_results['max_latency']:.2f}",
                str(overall_results['successful_queries'])
            ])

            table = ax3.table(cellText=table_data, colLabels=headers,
                              cellLoc='center', loc='center',
                              colWidths=[0.2, 0.15, 0.15, 0.15, 0.15, 0.1])
            table.auto_set_font_size(False)
            table.set_fontsize(10)
            table.scale(1, 2)

            # Style the table header (row 0 holds the column labels)
            for i in range(len(headers)):
                table[(0, i)].set_text_props(weight='bold', color='white')
                table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Statistical Summary', fontweight='bold', pad=20)

            # Chart 4: Performance Distribution
            ax4 = axes[1, 1]

            box_data = []
            box_labels = []
            for category, cat_stats in category_results.items():
                if cat_stats['individual_latencies']:
                    box_data.append(cat_stats['individual_latencies'])
                    box_labels.append(self._display_name(category))

            if box_data:
                # Labels are applied through the axis instead of boxplot's
                # ``labels=`` keyword, which newer Matplotlib deprecates.
                box_plot = ax4.boxplot(box_data, patch_artist=True)
                ax4.set_xticklabels(box_labels)

                # Color the boxes
                for patch, color in zip(box_plot['boxes'], palette):
                    patch.set_facecolor(color)
                    patch.set_alpha(0.7)

                ax4.set_title('Latency Distribution by Category', fontweight='bold')
                ax4.set_ylabel('Latency (seconds)')
                ax4.grid(True, alpha=0.3)

                # Add target line
                ax4.axhline(y=target, color='red', linestyle='--', alpha=0.7, label=target_label)
                ax4.legend()
            elif populated:
                # No per-query latencies stored: fall back to the category averages
                ax4.bar([self._display_name(name) for name, _ in populated],
                        [cat['average_latency'] for _, cat in populated],
                        alpha=0.7, color=palette[:len(populated)])
                ax4.set_title('Category Latency (Single Query Each)', fontweight='bold')
                ax4.set_ylabel('Latency (seconds)')
                ax4.grid(True, alpha=0.3)
                ax4.axhline(y=target, color='red', linestyle='--', alpha=0.7, label=target_label)
                ax4.legend()
            else:
                ax4.text(0.5, 0.5, 'No data available for distribution plot',
                         ha='center', va='center', transform=ax4.transAxes)
                ax4.set_title('Latency Distribution', fontweight='bold')

            # Adjust layout and save
            plt.tight_layout()

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"latency_analysis_charts_{timestamp}.png"

            # Ensure results directory exists
            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight',
                        facecolor='white', edgecolor='none')

            print(f"📈 Charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            print(f"❌ Chart generation failed: {e}")
            return ""
        finally:
            # Release the figure on success and on failure alike.
            if fig is not None:
                plt.close(fig)

    def print_statistics_summary(self, stats: Dict[str, Any]):
        """Print formatted statistics summary to console.

        Args:
            stats: Statistics dictionary with ``category_results`` and
                ``overall_results`` entries.
        """
        category_results = stats['category_results']
        overall_results = stats['overall_results']

        print("\n📊 === LATENCY ANALYSIS CHART SUMMARY ===")
        print("Overall Performance:")
        print(f" Average Latency: {overall_results['average_latency']:.2f}s (±{overall_results['std_deviation']:.2f})")
        print(f" Success Rate: {overall_results['successful_queries']}/{overall_results['total_queries']}")
        print(f" {self.TARGET_LATENCY_SECONDS:.0f}s Target Compliance: {overall_results['target_compliance']:.1%}")

        print("\nCategory Breakdown:")
        for category, cat_stats in category_results.items():
            # Categories without queries are skipped
            if cat_stats['query_count'] > 0:
                print(f" {category.capitalize()}: {cat_stats['average_latency']:.2f}s (±{cat_stats['std_deviation']:.2f}) [{cat_stats['query_count']} queries]")
|
299 |
+
|
300 |
+
|
301 |
+
# Independent execution interface
|
302 |
+
if __name__ == "__main__":
    # Independent chart generation interface: load the newest statistics
    # file, render the charts and print a console summary.
    print("📈 OnCall.ai Latency Chart Generator")

    # Initialize chart generator
    chart_gen = LatencyChartGenerator()

    try:
        # Load latest statistics
        stats = chart_gen.load_latest_statistics()

        # Generate charts
        chart_path = chart_gen.generate_comprehensive_charts(stats)

        # Print summary
        chart_gen.print_statistics_summary(stats)

        # generate_comprehensive_charts() signals failure by returning "",
        # so only announce success when a chart file was actually written.
        if chart_path:
            print("\n✅ Chart generation complete!")
            print(f"📈 Charts saved to: {chart_path}")
        else:
            print("\n❌ Chart generation did not produce an output file")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("💡 Please run latency_evaluator.py first to generate statistics data")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
|
evaluation/metric2_extraction_chart_generator.py
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
OnCall.ai System - Extraction Chart Generator
|
4 |
+
============================================
|
5 |
+
|
6 |
+
Generates extraction success rate charts from saved statistics.
|
7 |
+
Reads JSON files produced by comprehensive evaluator.
|
8 |
+
|
9 |
+
Author: YanBo Chen
|
10 |
+
Date: 2025-08-04
|
11 |
+
"""
|
12 |
+
|
13 |
+
import json
|
14 |
+
import os
|
15 |
+
import sys
|
16 |
+
from typing import Dict, List, Any
|
17 |
+
from datetime import datetime
|
18 |
+
from pathlib import Path
|
19 |
+
import glob
|
20 |
+
|
21 |
+
# Visualization imports
|
22 |
+
import matplotlib.pyplot as plt
|
23 |
+
import seaborn as sns
|
24 |
+
import pandas as pd
|
25 |
+
import numpy as np
|
26 |
+
|
27 |
+
|
28 |
+
class ExtractionChartGenerator:
    """Generate charts for condition extraction metrics.

    Reads a statistics dictionary containing ``category_results`` and
    ``overall_results`` and renders a 2x2 figure into the ``results``
    directory next to this module.
    """

    def __init__(self):
        """Initialize chart generator and apply the plotting style."""
        print("📈 Initializing Extraction Chart Generator...")
        plt.style.use('default')
        sns.set_palette("husl")
        print("✅ Chart Generator ready")

    def load_latest_extraction_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """Load the most recent extraction statistics file.

        Args:
            results_dir: Directory containing ``extraction_statistics_*.json``
                files, given as a string or ``Path``. Defaults to the
                ``results`` directory next to this module.

        Returns:
            The parsed JSON content of the most recently modified file.

        Raises:
            FileNotFoundError: If no matching statistics file exists.
        """
        # Normalise to Path: a plain string argument does not support "/".
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"
        else:
            results_dir = Path(results_dir)

        stat_files = glob.glob(str(results_dir / "extraction_statistics_*.json"))
        if not stat_files:
            raise FileNotFoundError(f"No extraction statistics files found in {results_dir}")

        latest_file = max(stat_files, key=os.path.getmtime)
        print(f"📊 Loading extraction statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def generate_extraction_charts(self, stats: Dict[str, Any]) -> str:
        """Generate extraction success rate analysis charts.

        The figure contains a success-rate bar chart, a success/total count
        chart, a summary table and an overall success pie chart.

        Args:
            stats: Statistics dictionary with ``category_results`` and
                ``overall_results`` entries.

        Returns:
            Path of the saved PNG as a string, or ``""`` if anything failed
            (the error is printed, not raised).
        """
        fig = None
        try:
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Extraction Success Rate Analysis', fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Categories that actually contain queries, in file order.
            populated = [(name, cat) for name, cat in category_results.items()
                         if cat['total_count'] > 0]

            # Chart 1: Success Rate by Category
            ax1 = axes[0, 0]
            categories = [name.replace('_', ' ').title() for name, _ in populated]
            success_rates = [cat['success_rate'] * 100 for _, cat in populated]

            categories.append('Overall')
            success_rates.append(overall_results['success_rate'] * 100)

            bars = ax1.bar(categories, success_rates, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
            ax1.set_title('Extraction Success Rate by Category', fontweight='bold')
            ax1.set_ylabel('Success Rate (%)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add target line
            ax1.axhline(y=80, color='red', linestyle='--', alpha=0.7, label='80% Target')
            ax1.legend()

            # Add value labels
            for bar, rate in zip(bars, success_rates):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
                         f'{rate:.1f}%', ha='center', va='bottom', fontweight='bold')

            # Chart 2: Success Count (same category order as chart 1)
            ax2 = axes[0, 1]
            successful_counts = [cat['successful_count'] for _, cat in populated]
            total_counts = [cat['total_count'] for _, cat in populated]

            successful_counts.append(overall_results['successful_count'])
            total_counts.append(overall_results['total_count'])

            x = np.arange(len(categories))
            width = 0.35

            # Two bars per category, centred on the tick position
            ax2.bar(x - width/2, successful_counts, width, label='Successful', alpha=0.8)
            ax2.bar(x + width/2, total_counts, width, label='Total', alpha=0.8)

            ax2.set_title('Extraction Success Count', fontweight='bold')
            ax2.set_ylabel('Query Count')
            ax2.set_xlabel('Query Category')
            ax2.set_xticks(x)
            ax2.set_xticklabels(categories)
            ax2.legend()
            ax2.grid(True, alpha=0.3)

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            headers = ['Category', 'Success Rate', 'Success/Total', 'Avg Time (s)', 'Target Met']
            table_data = [
                [
                    name.replace('_', ' ').title(),
                    f"{cat['success_rate']:.1%}",
                    f"{cat['successful_count']}/{cat['total_count']}",
                    f"{cat['average_extraction_time']:.3f}",
                    '✅' if cat.get('meets_threshold', False) else '❌'
                ]
                for name, cat in populated
            ]

            # The overall row has no average time column value
            table_data.append([
                'Overall',
                f"{overall_results['success_rate']:.1%}",
                f"{overall_results['successful_count']}/{overall_results['total_count']}",
                '-',
                '✅' if overall_results.get('target_compliance', False) else '❌'
            ])

            table = ax3.table(cellText=table_data, colLabels=headers,
                              cellLoc='center', loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(10)
            table.scale(1, 2)

            # Style header (row 0 holds the column labels)
            for i in range(len(headers)):
                table[(0, i)].set_text_props(weight='bold', color='white')
                table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Extraction Statistics Summary', fontweight='bold', pad=20)

            # Chart 4: Overall success vs. failure share
            ax4 = axes[1, 1]
            overall_rate = overall_results['success_rate'] * 100

            ax4.pie([overall_rate, 100-overall_rate],
                    labels=['Successful', 'Failed'],
                    autopct='%1.1f%%',
                    colors=['#2ca02c', '#ffcccc'],
                    startangle=90)

            ax4.set_title(f'Overall Extraction Success\n{overall_rate:.1f}% Success Rate', fontweight='bold')

            plt.tight_layout()

            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"extraction_analysis_charts_{timestamp}.png"

            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')

            print(f"📈 Extraction charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            print(f"❌ Extraction chart generation failed: {e}")
            return ""
        finally:
            # Release the figure on success and on failure alike.
            if fig is not None:
                plt.close(fig)
|
196 |
+
|
197 |
+
|
198 |
+
if __name__ == "__main__":
    # Independent extraction chart generation: load the newest statistics
    # file and render the charts.
    print("📈 OnCall.ai Extraction Chart Generator")

    chart_gen = ExtractionChartGenerator()

    try:
        stats = chart_gen.load_latest_extraction_statistics()
        chart_path = chart_gen.generate_extraction_charts(stats)

        # generate_extraction_charts() signals failure by returning "",
        # so only announce success when a chart file was actually written.
        if chart_path:
            print("\n✅ Extraction chart generation complete!")
            print(f"📈 Charts saved to: {chart_path}")
        else:
            print("\n❌ Extraction chart generation did not produce an output file")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("💡 Please run latency_evaluator.py first to generate extraction statistics data")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
|
evaluation/metric3_relevance_chart_generator.py
ADDED
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
OnCall.ai System - Relevance Chart Generator
|
4 |
+
============================================
|
5 |
+
|
6 |
+
Generates retrieval relevance charts from saved statistics.
|
7 |
+
Shows cosine similarity analysis and threshold compliance.
|
8 |
+
|
9 |
+
Author: YanBo Chen
|
10 |
+
Date: 2025-08-04
|
11 |
+
"""
|
12 |
+
|
13 |
+
import json
|
14 |
+
import os
|
15 |
+
import sys
|
16 |
+
from typing import Dict, List, Any
|
17 |
+
from datetime import datetime
|
18 |
+
from pathlib import Path
|
19 |
+
import glob
|
20 |
+
|
21 |
+
# Visualization imports
|
22 |
+
import matplotlib.pyplot as plt
|
23 |
+
import seaborn as sns
|
24 |
+
import pandas as pd
|
25 |
+
import numpy as np
|
26 |
+
|
27 |
+
|
28 |
+
class RelevanceChartGenerator:
    """Generate charts for retrieval relevance metrics.

    Reads a statistics dictionary containing ``category_results`` and
    ``overall_results`` and renders a 2x2 figure into the ``results``
    directory next to this module.
    """

    def __init__(self):
        """Initialize chart generator and apply the plotting style."""
        print("📈 Initializing Relevance Chart Generator...")
        plt.style.use('default')
        sns.set_palette("husl")
        print("✅ Chart Generator ready")

    def load_latest_relevance_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """Load the most recent relevance statistics file.

        Args:
            results_dir: Directory containing ``relevance_statistics_*.json``
                files, given as a string or ``Path``. Defaults to the
                ``results`` directory next to this module.

        Returns:
            The parsed JSON content of the most recently modified file.

        Raises:
            FileNotFoundError: If no matching statistics file exists.
        """
        # Normalise to Path: a plain string argument does not support "/".
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"
        else:
            results_dir = Path(results_dir)

        stat_files = glob.glob(str(results_dir / "relevance_statistics_*.json"))
        if not stat_files:
            raise FileNotFoundError(f"No relevance statistics files found in {results_dir}")

        latest_file = max(stat_files, key=os.path.getmtime)
        print(f"📊 Loading relevance statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def generate_relevance_charts(self, stats: Dict[str, Any]) -> str:
        """Generate relevance analysis charts.

        The figure contains an average-relevance bar chart, a histogram of
        individual scores, a summary table and a per-category box plot.

        Args:
            stats: Statistics dictionary with ``category_results`` and
                ``overall_results`` entries.

        Returns:
            Path of the saved PNG as a string, or ``""`` if anything failed
            (the error is printed, not raised).
        """
        fig = None
        try:
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Retrieval Relevance Analysis', fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Chart 1: Average Relevance by Category
            ax1 = axes[0, 0]
            categories = []
            avg_relevances = []

            for category, cat_stats in category_results.items():
                # Only categories with at least one successful retrieval are plotted
                if cat_stats['successful_retrievals'] > 0:
                    categories.append(category.replace('_', ' ').title())
                    avg_relevances.append(cat_stats['average_relevance'])

            categories.append('Overall')
            avg_relevances.append(overall_results['average_relevance'])

            bars = ax1.bar(categories, avg_relevances, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
            ax1.set_title('Average Relevance Score by Category', fontweight='bold')
            ax1.set_ylabel('Relevance Score (Cosine Similarity)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add threshold lines
            ax1.axhline(y=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
            ax1.axhline(y=0.70, color='red', linestyle='--', alpha=0.7, label='0.70 Target')
            ax1.legend()

            # Add value labels
            for bar, relevance in zip(bars, avg_relevances):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{relevance:.3f}', ha='center', va='bottom', fontweight='bold')

            # Per-category score lists, shared by the histogram and the box plot
            box_data = []
            box_labels = []
            for category, cat_stats in category_results.items():
                scores = cat_stats.get('individual_relevance_scores')
                if scores:
                    box_data.append(scores)
                    box_labels.append(category.replace('_', ' ').title())

            # Chart 2: Relevance Distribution
            ax2 = axes[0, 1]
            all_scores = [score for scores in box_data for score in scores]

            if all_scores:
                mean_score = np.mean(all_scores)

                # Create histogram
                ax2.hist(all_scores, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
                ax2.axvline(x=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
                ax2.axvline(x=0.70, color='red', linestyle='--', alpha=0.7, label='0.70 Target')
                ax2.axvline(x=mean_score, color='green', linestyle='-', alpha=0.8, label=f'Mean: {mean_score:.3f}')

                ax2.set_title('Relevance Score Distribution', fontweight='bold')
                ax2.set_xlabel('Relevance Score')
                ax2.set_ylabel('Frequency')
                ax2.legend()
                ax2.grid(True, alpha=0.3)
            else:
                ax2.text(0.5, 0.5, 'No relevance data available', ha='center', va='center', transform=ax2.transAxes)
                ax2.set_title('Relevance Score Distribution', fontweight='bold')

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            headers = ['Category', 'Avg Relevance', 'Min/Max', 'Success/Total', 'Threshold Met']
            table_data = [
                [
                    category.replace('_', ' ').title(),
                    f"{cat_stats['average_relevance']:.3f}",
                    f"{cat_stats['min_relevance']:.3f}/{cat_stats['max_relevance']:.3f}",
                    f"{cat_stats['successful_retrievals']}/{cat_stats['total_queries']}",
                    '✅' if cat_stats.get('meets_threshold', False) else '❌'
                ]
                for category, cat_stats in category_results.items()
                if cat_stats['total_queries'] > 0
            ]

            table_data.append([
                'Overall',
                f"{overall_results['average_relevance']:.3f}",
                f"{overall_results['min_relevance']:.3f}/{overall_results['max_relevance']:.3f}",
                f"{overall_results['successful_queries']}/{overall_results['total_queries']}",
                '✅' if overall_results.get('target_compliance', False) else '❌'
            ])

            table = ax3.table(cellText=table_data, colLabels=headers,
                              cellLoc='center', loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(10)
            table.scale(1, 2)

            # Style header (row 0 holds the column labels)
            for i in range(len(headers)):
                table[(0, i)].set_text_props(weight='bold', color='white')
                table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Relevance Statistics Summary', fontweight='bold', pad=20)

            # Chart 4: Category Comparison Box Plot
            ax4 = axes[1, 1]

            if box_data:
                # Labels are applied through the axis instead of boxplot's
                # ``labels=`` keyword, which newer Matplotlib deprecates.
                box_plot = ax4.boxplot(box_data, patch_artist=True)
                ax4.set_xticklabels(box_labels)

                for patch, color in zip(box_plot['boxes'], ['#1f77b4', '#ff7f0e', '#d62728']):
                    patch.set_facecolor(color)
                    patch.set_alpha(0.7)

                ax4.axhline(y=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
                ax4.axhline(y=0.70, color='red', linestyle='--', alpha=0.7, label='0.70 Target')
                ax4.set_title('Relevance Distribution by Category', fontweight='bold')
                ax4.set_ylabel('Relevance Score')
                ax4.legend()
                ax4.grid(True, alpha=0.3)
            else:
                ax4.text(0.5, 0.5, 'Insufficient data for box plot', ha='center', va='center', transform=ax4.transAxes)
                ax4.set_title('Relevance Distribution by Category', fontweight='bold')

            plt.tight_layout()

            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"relevance_analysis_charts_{timestamp}.png"

            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')

            print(f"📈 Relevance charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            print(f"❌ Relevance chart generation failed: {e}")
            return ""
        finally:
            # Release the figure on success and on failure alike.
            if fig is not None:
                plt.close(fig)
|
211 |
+
|
212 |
+
|
213 |
+
if __name__ == "__main__":
    # Independent relevance chart generation: load the newest statistics
    # file and render the charts.
    print("📈 OnCall.ai Relevance Chart Generator")

    chart_gen = RelevanceChartGenerator()

    try:
        stats = chart_gen.load_latest_relevance_statistics()
        chart_path = chart_gen.generate_relevance_charts(stats)

        # generate_relevance_charts() signals failure by returning "",
        # so only announce success when a chart file was actually written.
        if chart_path:
            print("\n✅ Relevance chart generation complete!")
            print(f"📈 Charts saved to: {chart_path}")
        else:
            print("\n❌ Relevance chart generation did not produce an output file")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("💡 Please run latency_evaluator.py first to generate relevance statistics data")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
|
evaluation/metric4_coverage_chart_generator.py
ADDED
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
OnCall.ai System - Coverage Chart Generator
|
4 |
+
===========================================
|
5 |
+
|
6 |
+
Generates retrieval coverage charts from saved statistics.
|
7 |
+
Shows how well generated advice utilizes retrieved content.
|
8 |
+
|
9 |
+
Author: YanBo Chen
|
10 |
+
Date: 2025-08-04
|
11 |
+
"""
|
12 |
+
|
13 |
+
import json
|
14 |
+
import os
|
15 |
+
import sys
|
16 |
+
from typing import Dict, List, Any
|
17 |
+
from datetime import datetime
|
18 |
+
from pathlib import Path
|
19 |
+
import glob
|
20 |
+
|
21 |
+
# Visualization imports
|
22 |
+
import matplotlib.pyplot as plt
|
23 |
+
import seaborn as sns
|
24 |
+
import pandas as pd
|
25 |
+
import numpy as np
|
26 |
+
|
27 |
+
|
28 |
+
class CoverageChartGenerator:
    """Generate charts for retrieval coverage metrics.

    Loads a saved ``coverage_statistics_*.json`` file and renders a 2x2
    figure: per-category average bars, a score histogram, a summary table
    and an overall-coverage gauge.
    """

    def __init__(self):
        """Initialize chart generator and apply the global plotting style."""
        print("📈 Initializing Coverage Chart Generator...")
        plt.style.use('default')
        sns.set_palette("husl")
        print("✅ Chart Generator ready")

    def load_latest_coverage_statistics(self, results_dir: str = None) -> Dict[str, Any]:
        """Load the most recent coverage statistics file.

        Args:
            results_dir: Directory to search, as a string or ``Path``.
                Defaults to the ``results`` directory next to this module.

        Returns:
            Parsed JSON content of the ``coverage_statistics_*.json`` file
            with the newest modification time.

        Raises:
            FileNotFoundError: If no matching file exists in ``results_dir``.
        """
        if results_dir is None:
            results_dir = Path(__file__).parent / "results"
        else:
            # The parameter is annotated as str, but the "/" join below only
            # works on Path objects; normalise so both kinds are accepted.
            results_dir = Path(results_dir)

        pattern = str(results_dir / "coverage_statistics_*.json")
        stat_files = glob.glob(pattern)

        if not stat_files:
            raise FileNotFoundError(f"No coverage statistics files found in {results_dir}")

        latest_file = max(stat_files, key=os.path.getmtime)
        print(f"📊 Loading coverage statistics from: {latest_file}")

        with open(latest_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def generate_coverage_charts(self, stats: Dict[str, Any]) -> str:
        """Generate coverage analysis charts.

        Args:
            stats: Statistics dict with ``category_results`` (per-category
                dicts) and ``overall_results``. Coverage values are
                multiplied by 100 for display.

        Returns:
            Path of the saved PNG as a string, or ``""`` if anything failed
            (the error is printed, not raised).
        """
        fig = None
        try:
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            fig.suptitle('OnCall.ai Retrieval Coverage Analysis', fontsize=16, fontweight='bold')

            category_results = stats['category_results']
            overall_results = stats['overall_results']

            # Chart 1: Average Coverage by Category
            ax1 = axes[0, 0]
            categories = []
            avg_coverages = []

            for category, cat_stats in category_results.items():
                if cat_stats['successful_evaluations'] > 0:
                    categories.append(category.replace('_', ' ').title())
                    avg_coverages.append(cat_stats['average_coverage'] * 100)  # Convert to percentage

            categories.append('Overall')
            avg_coverages.append(overall_results['average_coverage'] * 100)

            bars = ax1.bar(categories, avg_coverages, alpha=0.8,
                           color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
            ax1.set_title('Average Coverage Score by Category', fontweight='bold')
            ax1.set_ylabel('Coverage Score (%)')
            ax1.set_xlabel('Query Category')
            ax1.grid(True, alpha=0.3)

            # Add target line
            ax1.axhline(y=40, color='red', linestyle='--', alpha=0.7, label='40% Target')
            ax1.legend()

            # Add value labels just above each bar
            for bar, coverage in zip(bars, avg_coverages):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width() / 2., height + 1,
                         f'{coverage:.1f}%', ha='center', va='bottom', fontweight='bold')

            # Chart 2: Coverage Distribution
            ax2 = axes[0, 1]

            # Collect all individual coverage scores (as percentages)
            all_scores = []
            for cat_stats in category_results.values():
                if cat_stats.get('individual_coverage_scores'):
                    all_scores.extend(score * 100 for score in cat_stats['individual_coverage_scores'])

            if all_scores:
                mean_score = np.mean(all_scores)

                # Create histogram
                ax2.hist(all_scores, bins=15, alpha=0.7, color='lightcoral', edgecolor='black')
                ax2.axvline(x=40, color='red', linestyle='--', alpha=0.7, label='40% Target')
                ax2.axvline(x=mean_score, color='green', linestyle='-', alpha=0.8,
                            label=f'Mean: {mean_score:.1f}%')

                ax2.set_title('Coverage Score Distribution', fontweight='bold')
                ax2.set_xlabel('Coverage Score (%)')
                ax2.set_ylabel('Frequency')
                ax2.legend()
                ax2.grid(True, alpha=0.3)
            else:
                ax2.text(0.5, 0.5, 'No coverage data available', ha='center', va='center',
                         transform=ax2.transAxes)
                ax2.set_title('Coverage Score Distribution', fontweight='bold')

            # Chart 3: Statistical Summary Table
            ax3 = axes[1, 0]
            ax3.axis('tight')
            ax3.axis('off')

            table_data = []
            headers = ['Category', 'Avg Coverage', 'Min/Max', 'Success/Total', 'Target Met']

            for category, cat_stats in category_results.items():
                if cat_stats['total_queries'] > 0:
                    table_data.append([
                        category.replace('_', ' ').title(),
                        f"{cat_stats['average_coverage']:.3f}",
                        f"{cat_stats['min_coverage']:.3f}/{cat_stats['max_coverage']:.3f}",
                        f"{cat_stats['successful_evaluations']}/{cat_stats['total_queries']}",
                        '✅' if cat_stats.get('meets_threshold', False) else '❌'
                    ])

            table_data.append([
                'Overall',
                f"{overall_results['average_coverage']:.3f}",
                f"{overall_results['min_coverage']:.3f}/{overall_results['max_coverage']:.3f}",
                f"{overall_results['successful_queries']}/{overall_results['total_queries']}",
                '✅' if overall_results.get('meets_threshold', False) else '❌'
            ])

            if table_data:
                table = ax3.table(cellText=table_data, colLabels=headers,
                                  cellLoc='center', loc='center')
                table.auto_set_font_size(False)
                table.set_fontsize(10)
                table.scale(1, 2)

                # Style header (row 0 holds the column labels)
                for i in range(len(headers)):
                    table[(0, i)].set_text_props(weight='bold', color='white')
                    table[(0, i)].set_facecolor('#2E7D32')

            ax3.set_title('Coverage Statistics Summary', fontweight='bold', pad=20)

            # Chart 4: Coverage Performance Radar/Gauge
            ax4 = axes[1, 1]

            # Create gauge-like visualization for overall coverage
            overall_coverage_pct = overall_results['average_coverage'] * 100

            # Pie wedges must be non-negative: clamp only the drawn wedge so an
            # out-of-range average cannot abort the whole figure. The labels
            # below still show the real value.
            gauge_pct = min(max(overall_coverage_pct, 0.0), 100.0)

            # Pie chart as gauge
            sizes = [gauge_pct, 100 - gauge_pct]
            colors = ['#2ca02c' if overall_coverage_pct >= 40 else '#ff7f0e', '#f0f0f0']

            ax4.pie(sizes, labels=['Covered', 'Not Covered'],
                    autopct='%1.1f%%',
                    colors=colors,
                    startangle=90,
                    counterclock=False)

            # Add center text
            ax4.text(0, 0, f'{overall_coverage_pct:.1f}%\nCoverage',
                     ha='center', va='center', fontsize=14, fontweight='bold')

            ax4.set_title(f'Overall Coverage Performance\n{"✅ Target Met" if overall_coverage_pct >= 40 else "❌ Below Target"}',
                          fontweight='bold')

            plt.tight_layout()

            # Save chart
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chart_filename = f"coverage_analysis_charts_{timestamp}.png"

            results_dir = Path(__file__).parent / "results"
            results_dir.mkdir(exist_ok=True)
            chart_path = results_dir / chart_filename

            plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')

            print(f"📈 Coverage charts saved to: {chart_path}")
            return str(chart_path)

        except Exception as e:
            # Best-effort by design: callers get "" instead of an exception.
            print(f"❌ Coverage chart generation failed: {e}")
            return ""

        finally:
            # Release the figure on every path; previously a failure left it
            # registered with pyplot.
            if fig is not None:
                plt.close(fig)
|
202 |
+
|
203 |
+
|
204 |
+
if __name__ == "__main__":
    # Standalone entry point: build the coverage charts from the most
    # recently saved coverage statistics file.
    print("📈 OnCall.ai Coverage Chart Generator")

    chart_gen = CoverageChartGenerator()

    try:
        stats = chart_gen.load_latest_coverage_statistics()
        chart_path = chart_gen.generate_coverage_charts(stats)

        # generate_coverage_charts() prints its own error and returns ""
        # when rendering fails, so only announce success for a real path.
        if chart_path:
            print("\n✅ Coverage chart generation complete!")
            print(f"📈 Charts saved to: {chart_path}")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("💡 Please run latency_evaluator.py first to generate coverage statistics data")
    except Exception as e:
        # Top-level boundary of the script: report and fall through.
        print(f"❌ Chart generation failed: {e}")
|
evaluation/metric5_6_judge_evaluator_manual.md
ADDED
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Metric 5-6 LLM Judge Evaluator Manual
|
2 |
+
|
3 |
+
## Overview
|
4 |
+
|
5 |
+
`metric5_6_llm_judge_evaluator.py` is a multi-system evaluation tool that uses Llama3-70B as a third-party judge to assess the quality of medical advice produced by different AI systems. It supports both single-system evaluation and multi-system comparison; in comparison mode all systems are scored in a single LLM call for maximum consistency.
|
6 |
+
|
7 |
+
## Metrics Evaluated
|
8 |
+
|
9 |
+
**Metric 5: Clinical Actionability (臨床可操作性)**
|
10 |
+
- Scale: 1-10 (normalized to 0.0-1.0)
|
11 |
+
- Question: "Can healthcare providers immediately act on this advice?"
|
12 |
+
- Target: ≥7.0/10 for acceptable actionability
|
13 |
+
|
14 |
+
**Metric 6: Clinical Evidence Quality (臨床證據品質)**
|
15 |
+
- Scale: 1-10 (normalized to 0.0-1.0)
|
16 |
+
- Question: "Is the advice evidence-based and follows medical standards?"
|
17 |
+
- Target: ≥7.5/10 for acceptable evidence quality
|
18 |
+
|
19 |
+
## System Architecture
|
20 |
+
|
21 |
+
### Multi-System Support
|
22 |
+
The evaluator supports flexible system combinations:
|
23 |
+
- **Single System**: `rag` or `direct`
|
24 |
+
- **Two-System Comparison**: `rag,direct`
|
25 |
+
- **Future Extension**: `rag,direct,claude,gpt4` (any combination)
|
26 |
+
|
27 |
+
### Judge LLM
|
28 |
+
- **Model**: Llama3-70B-Instruct via Hugging Face API
|
29 |
+
- **Strategy**: Single batch call for all evaluations
|
30 |
+
- **Temperature**: 0.1 (low for consistent evaluation)
|
31 |
+
- **Max Tokens**: 2048 (sufficient for evaluation responses)
|
32 |
+
|
33 |
+
## Prerequisites
|
34 |
+
|
35 |
+
### 1. Environment Setup
|
36 |
+
```bash
|
37 |
+
# Ensure HF_TOKEN is set in your environment
|
38 |
+
export HF_TOKEN="your_huggingface_token"
|
39 |
+
|
40 |
+
# Or add to .env file
|
41 |
+
echo "HF_TOKEN=your_huggingface_token" >> .env
|
42 |
+
```
|
43 |
+
|
44 |
+
### 2. Required Data Files
|
45 |
+
Before running the judge evaluator, you must have medical outputs from your systems:
|
46 |
+
|
47 |
+
**For RAG System**:
|
48 |
+
```bash
|
49 |
+
python latency_evaluator.py single_test_query.txt
|
50 |
+
# Generates: results/medical_outputs_YYYYMMDD_HHMMSS.json
|
51 |
+
```
|
52 |
+
|
53 |
+
**For Direct LLM System**:
|
54 |
+
```bash
|
55 |
+
python direct_llm_evaluator.py single_test_query.txt
|
56 |
+
# Generates: results/medical_outputs_direct_YYYYMMDD_HHMMSS.json
|
57 |
+
```
|
58 |
+
|
59 |
+
## Usage
|
60 |
+
|
61 |
+
### Command Line Interface
|
62 |
+
|
63 |
+
#### Single System Evaluation
|
64 |
+
```bash
|
65 |
+
# Evaluate RAG system only
|
66 |
+
python metric5_6_llm_judge_evaluator.py rag
|
67 |
+
|
68 |
+
# Evaluate Direct LLM system only
|
69 |
+
python metric5_6_llm_judge_evaluator.py direct
|
70 |
+
```
|
71 |
+
|
72 |
+
#### Multi-System Comparison (Recommended)
|
73 |
+
```bash
|
74 |
+
# Compare RAG vs Direct systems
|
75 |
+
python metric5_6_llm_judge_evaluator.py rag,direct
|
76 |
+
|
77 |
+
# Future: Compare multiple systems
|
78 |
+
python metric5_6_llm_judge_evaluator.py rag,direct,claude
|
79 |
+
```
|
80 |
+
|
81 |
+
### Complete Workflow Example
|
82 |
+
|
83 |
+
```bash
|
84 |
+
# Step 1: Navigate to evaluation directory
|
85 |
+
cd /path/to/GenAI-OnCallAssistant/evaluation
|
86 |
+
|
87 |
+
# Step 2: Generate medical outputs from both systems
|
88 |
+
python latency_evaluator.py single_test_query.txt
|
89 |
+
python direct_llm_evaluator.py single_test_query.txt
|
90 |
+
|
91 |
+
# Step 3: Run comparative evaluation
|
92 |
+
python metric5_6_llm_judge_evaluator.py rag,direct
|
93 |
+
```
|
94 |
+
|
95 |
+
## Output Files
|
96 |
+
|
97 |
+
### Generated Files
|
98 |
+
- **Statistics**: `results/judge_evaluation_comparison_rag_vs_direct_YYYYMMDD_HHMMSS.json`
|
99 |
+
- **Detailed Results**: Stored in the evaluator's internal results array
|
100 |
+
|
101 |
+
### File Structure
|
102 |
+
```json
|
103 |
+
{
|
104 |
+
"comparison_metadata": {
|
105 |
+
"systems_compared": ["rag", "direct"],
|
106 |
+
"comparison_type": "multi_system",
|
107 |
+
"timestamp": "2025-08-04T22:00:00"
|
108 |
+
},
|
109 |
+
"category_results": {
|
110 |
+
"diagnosis": {
|
111 |
+
"average_actionability": 0.850,
|
112 |
+
"average_evidence": 0.780,
|
113 |
+
"query_count": 1,
|
114 |
+
"actionability_target_met": true,
|
115 |
+
"evidence_target_met": true
|
116 |
+
}
|
117 |
+
},
|
118 |
+
"overall_results": {
|
119 |
+
"average_actionability": 0.850,
|
120 |
+
"average_evidence": 0.780,
|
121 |
+
"successful_evaluations": 2,
|
122 |
+
"total_queries": 2,
|
123 |
+
"actionability_target_met": true,
|
124 |
+
"evidence_target_met": true
|
125 |
+
}
|
126 |
+
}
|
127 |
+
```
|
128 |
+
|
129 |
+
## Evaluation Process
|
130 |
+
|
131 |
+
### 1. File Discovery
|
132 |
+
The evaluator automatically finds the latest medical output files:
|
133 |
+
- **RAG**: `medical_outputs_*.json`
|
134 |
+
- **Direct**: `medical_outputs_direct_*.json`
|
135 |
+
- **Custom**: `medical_outputs_{system}_*.json`
|
136 |
+
|
137 |
+
### 2. Prompt Generation
|
138 |
+
For multi-system comparison, the evaluator creates a structured prompt:
|
139 |
+
```
|
140 |
+
You are a medical expert evaluating and comparing AI systems...
|
141 |
+
|
142 |
+
SYSTEM 1 (RAG): Uses medical guidelines + LLM for evidence-based advice
|
143 |
+
SYSTEM 2 (Direct): Uses LLM only without external guidelines
|
144 |
+
|
145 |
+
QUERY 1 (DIAGNOSIS):
|
146 |
+
Patient Query: 60-year-old patient with hypertension history...
|
147 |
+
|
148 |
+
SYSTEM 1 Response: For a 60-year-old patient with...
|
149 |
+
SYSTEM 2 Response: Based on the symptoms described...
|
150 |
+
|
151 |
+
RESPONSE FORMAT:
|
152 |
+
Query 1 System 1: Actionability=X, Evidence=Y
|
153 |
+
Query 1 System 2: Actionability=X, Evidence=Y
|
154 |
+
```
|
155 |
+
|
156 |
+
### 3. LLM Judge Evaluation
|
157 |
+
- **Single API Call**: All systems evaluated in one request for consistency
|
158 |
+
- **Response Parsing**: Automatic extraction of numerical scores
|
159 |
+
- **Error Handling**: Graceful handling of parsing failures
|
160 |
+
|
161 |
+
### 4. Results Analysis
|
162 |
+
- **System-Specific Statistics**: Individual performance metrics
|
163 |
+
- **Comparative Analysis**: Direct system-to-system comparison
|
164 |
+
- **Target Compliance**: Automatic threshold checking
|
165 |
+
|
166 |
+
## Expected Output
|
167 |
+
|
168 |
+
### Console Output Example
|
169 |
+
```
|
170 |
+
🧠 OnCall.ai LLM Judge Evaluator - Metrics 5-6 Multi-System Evaluation
|
171 |
+
|
172 |
+
🧪 Multi-System Comparison: RAG vs DIRECT
|
173 |
+
📊 Found rag outputs: results/medical_outputs_20250804_215917.json
|
174 |
+
📊 Found direct outputs: results/medical_outputs_direct_20250804_220000.json
|
175 |
+
📊 Comparing 2 systems with 1 queries each
|
176 |
+
🎯 Metrics: 5 (Actionability) + 6 (Evidence Quality)
|
177 |
+
⚡ Strategy: Single comparison call for maximum consistency
|
178 |
+
|
179 |
+
🧠 Multi-system comparison: rag, direct
|
180 |
+
📊 Evaluating 1 queries across 2 systems...
|
181 |
+
📝 Comparison prompt created (2150 characters)
|
182 |
+
🔄 Calling judge LLM for multi-system comparison...
|
183 |
+
✅ Judge LLM completed comparison evaluation in 45.3s
|
184 |
+
📄 Response length: 145 characters
|
185 |
+
📊 RAG: 1 evaluations parsed
|
186 |
+
📊 DIRECT: 1 evaluations parsed
|
187 |
+
|
188 |
+
📊 === LLM JUDGE EVALUATION SUMMARY ===
|
189 |
+
Systems Compared: RAG vs DIRECT
|
190 |
+
Overall Performance:
|
191 |
+
Average Actionability: 0.850 (8.5/10)
|
192 |
+
Average Evidence Quality: 0.780 (7.8/10)
|
193 |
+
Actionability Target (≥7.0): ✅ Met
|
194 |
+
Evidence Target (≥7.5): ✅ Met
|
195 |
+
|
196 |
+
System Breakdown:
|
197 |
+
RAG: Actionability=0.900, Evidence=0.850 [1 queries]
|
198 |
+
DIRECT: Actionability=0.800, Evidence=0.710 [1 queries]
|
199 |
+
|
200 |
+
✅ LLM judge evaluation complete!
|
201 |
+
📊 Statistics: results/judge_evaluation_comparison_rag_vs_direct_20250804_220000.json
|
202 |
+
⚡ Efficiency: 2 evaluations in 1 LLM call
|
203 |
+
```
|
204 |
+
|
205 |
+
## Key Features
|
206 |
+
|
207 |
+
### 1. Scientific Comparison Design
|
208 |
+
- **Single Judge Call**: All systems evaluated simultaneously for consistency
|
209 |
+
- **Eliminates Temporal Bias**: Same judge, same context, same standards
|
210 |
+
- **Direct System Comparison**: Side-by-side evaluation format
|
211 |
+
|
212 |
+
### 2. Flexible Architecture
|
213 |
+
- **Backward Compatible**: Single system evaluation still supported
|
214 |
+
- **Future Extensible**: Easy to add new systems (`claude`, `gpt4`, etc.)
|
215 |
+
- **Modular Design**: Clean separation of concerns
|
216 |
+
|
217 |
+
### 3. Robust Error Handling
|
218 |
+
- **File Validation**: Automatic detection of missing input files
|
219 |
+
- **Query Count Verification**: Warns if systems have different query counts
|
220 |
+
- **Graceful Degradation**: Continues operation despite partial failures
|
221 |
+
|
222 |
+
### 4. Comprehensive Reporting
|
223 |
+
- **System-Specific Metrics**: Individual performance analysis
|
224 |
+
- **Comparative Statistics**: Direct system-to-system comparison
|
225 |
+
- **Target Compliance**: Automatic benchmark checking
|
226 |
+
- **Detailed Metadata**: Full traceability of evaluation parameters
|
227 |
+
|
228 |
+
## Troubleshooting
|
229 |
+
|
230 |
+
### Common Issues
|
231 |
+
|
232 |
+
#### 1. Missing Input Files
|
233 |
+
```
|
234 |
+
❌ No medical outputs files found for rag system
|
235 |
+
💡 Please run evaluators first:
|
236 |
+
python latency_evaluator.py single_test_query.txt
|
237 |
+
```
|
238 |
+
**Solution**: Run the prerequisite evaluators to generate medical outputs.
|
239 |
+
|
240 |
+
#### 2. HF_TOKEN Not Set
|
241 |
+
```
|
242 |
+
❌ HF_TOKEN is missing from environment variables
|
243 |
+
```
|
244 |
+
**Solution**: Set your Hugging Face token in environment or `.env` file.
|
245 |
+
|
246 |
+
#### 3. Query Count Mismatch
|
247 |
+
```
|
248 |
+
⚠️ Warning: Systems have different query counts: {'rag': 3, 'direct': 1}
|
249 |
+
```
|
250 |
+
**Solution**: Ensure both systems processed the same input file.
|
251 |
+
|
252 |
+
#### 4. LLM API Timeout
|
253 |
+
```
|
254 |
+
❌ Multi-system evaluation failed: timeout
|
255 |
+
```
|
256 |
+
**Solution**: Check internet connection and Hugging Face API status.
|
257 |
+
|
258 |
+
### Debug Tips
|
259 |
+
|
260 |
+
1. **Check File Existence**: Verify medical output files in `results/` directory
|
261 |
+
2. **Validate JSON Format**: Ensure input files are properly formatted
|
262 |
+
3. **Monitor API Usage**: Check Hugging Face account limits
|
263 |
+
4. **Review Logs**: Examine detailed logging output for specific errors
|
264 |
+
|
265 |
+
## Future Extensions
|
266 |
+
|
267 |
+
### Phase 2: Generic Multi-System Framework
|
268 |
+
```bash
|
269 |
+
# Configuration-driven system comparison
|
270 |
+
python metric5_6_llm_judge_evaluator.py --config comparison_config.json
|
271 |
+
```
|
272 |
+
|
273 |
+
### Phase 3: Unlimited System Support
|
274 |
+
```bash
|
275 |
+
# Dynamic system registration
|
276 |
+
python metric5_6_llm_judge_evaluator.py med42,claude,gpt4,palm,llama2
|
277 |
+
```
|
278 |
+
|
279 |
+
### Integration with Chart Generators
|
280 |
+
```bash
|
281 |
+
# Generate comparison visualizations
|
282 |
+
python metric5_6_llm_judge_chart_generator.py rag,direct
|
283 |
+
```
|
284 |
+
|
285 |
+
## Best Practices
|
286 |
+
|
287 |
+
1. **Consistent Test Data**: Use the same query file for all systems
|
288 |
+
2. **Sequential Execution**: Complete data collection before evaluation
|
289 |
+
3. **Batch Processing**: Use multi-system mode for scientific comparison
|
290 |
+
4. **Result Verification**: Review detailed statistics files for accuracy
|
291 |
+
5. **Performance Monitoring**: Track evaluation latency and API costs
|
292 |
+
|
293 |
+
## Scientific Validity
|
294 |
+
|
295 |
+
The multi-system comparison approach provides superior scientific validity compared to separate evaluations:
|
296 |
+
|
297 |
+
- **Eliminates Judge Variability**: Same judge evaluates all systems
|
298 |
+
- **Reduces Temporal Effects**: All evaluations in single time window
|
299 |
+
- **Ensures Consistent Standards**: Identical evaluation criteria applied
|
300 |
+
- **Enables Direct Comparison**: Side-by-side system assessment
|
301 |
+
- **Maximizes Efficiency**: Single API call vs multiple separate calls
|
302 |
+
|
303 |
+
This design makes the evaluation results more reliable for research publications and system optimization decisions.
|
evaluation/metric5_6_llm_judge_chart_generator.py
ADDED
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
OnCall.ai System - LLM Judge Chart Generator (Metrics 5-6)
|
4 |
+
==========================================================
|
5 |
+
|
6 |
+
Generates comprehensive comparison charts for LLM judge evaluation results.
|
7 |
+
Supports both single-system and multi-system visualization with professional layouts.
|
8 |
+
|
9 |
+
Metrics visualized:
|
10 |
+
5. Clinical Actionability (臨床可操作性) - 1-10 scale
|
11 |
+
6. Clinical Evidence Quality (臨床證據品質) - 1-10 scale
|
12 |
+
|
13 |
+
Author: YanBo Chen
|
14 |
+
Date: 2025-08-04
|
15 |
+
"""
|
16 |
+
|
17 |
+
import json
|
18 |
+
import os
|
19 |
+
import sys
|
20 |
+
from typing import Dict, List, Any, Tuple
|
21 |
+
from datetime import datetime
|
22 |
+
from pathlib import Path
|
23 |
+
import glob
|
24 |
+
import numpy as np
|
25 |
+
|
26 |
+
# Visualization imports
|
27 |
+
import matplotlib.pyplot as plt
|
28 |
+
import seaborn as sns
|
29 |
+
import pandas as pd
|
30 |
+
from matplotlib.patches import Rectangle
|
31 |
+
|
32 |
+
|
33 |
+
class LLMJudgeChartGenerator:
|
34 |
+
"""Generate professional comparison charts for LLM judge evaluation results"""
|
35 |
+
|
36 |
+
def __init__(self):
    """Apply the global plot style and build the named color lookup in ``self.colors``."""
    print("📈 Initializing LLM Judge Chart Generator...")

    # Baseline matplotlib style plus seaborn's "husl" palette
    plt.style.use('default')
    sns.set_palette("husl")

    # One color per evaluated system
    system_colors = {
        'rag': '#2E8B57',      # Sea Green - represents evidence-based
        'direct': '#4682B4',   # Steel Blue - represents direct approach
        'claude': '#9370DB',   # Medium Purple - future extension
        'gpt4': '#DC143C',     # Crimson - future extension
    }
    # One color per judged metric
    metric_colors = {
        'actionability': '#FF6B6B',  # Coral Red
        'evidence': '#4ECDC4',       # Turquoise
    }
    # Chart furniture
    decoration_colors = {
        'target_line': '#FF4444',    # Red for target thresholds
        'grid': '#E0E0E0',           # Light gray for grid
    }

    # Merged in the same key order as before: systems, metrics, decoration
    self.colors = {**system_colors, **metric_colors, **decoration_colors}

    print("✅ Chart Generator ready with professional medical styling")
|
57 |
+
|
58 |
+
def load_latest_statistics(self, results_dir: str = None) -> Dict[str, Any]:
    """
    Load the most recent judge evaluation statistics file

    Args:
        results_dir: Directory containing statistics files, as a string or
            Path. Defaults to the "results" directory next to this module.

    Returns:
        Parsed JSON content of the ``judge_evaluation_comparison_*.json``
        file with the newest modification time.

    Raises:
        FileNotFoundError: If no matching file exists in ``results_dir``.
    """
    if results_dir is None:
        results_dir = Path(__file__).parent / "results"
    else:
        # The parameter is annotated as str, but the "/" join below only
        # works on Path objects; normalise so both kinds are accepted.
        results_dir = Path(results_dir)

    # Find latest comparison statistics file
    pattern = str(results_dir / "judge_evaluation_comparison_*.json")
    stat_files = glob.glob(pattern)

    if not stat_files:
        raise FileNotFoundError(f"No judge evaluation comparison files found in {results_dir}")

    # Get the most recent file
    latest_file = max(stat_files, key=os.path.getmtime)

    print(f"📊 Loading statistics from: {latest_file}")

    with open(latest_file, 'r', encoding='utf-8') as f:
        return json.load(f)
|
82 |
+
|
83 |
+
def generate_comparison_charts(self, stats: Dict[str, Any], save_path: str = None) -> str:
    """
    Generate comprehensive 4-panel comparison visualization

    Each panel is drawn by a ``self._create_*`` helper:
    1. System comparison radar chart
    2. Grouped bar chart comparison
    3. Actionability vs Evidence scatter plot
    4. Category-wise heatmap

    Args:
        stats: Parsed judge-evaluation statistics. Must contain
            'overall_results' and 'category_results'; 'comparison_metadata'
            is optional (systems default to ['rag', 'direct']).
        save_path: File name for the PNG, joined onto the "results"
            directory next to this module. A timestamped name is generated
            when omitted.

    Returns:
        Path of the saved PNG as a string.

    Raises:
        Exception: Any failure is printed and then re-raised unchanged.
    """
    fig = None
    try:
        # Create figure with subplots
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle(
            'Medical AI Systems Comparison - Clinical Quality Assessment\n'
            'Actionability (1-10): Can healthcare providers act immediately? | '
            'Evidence Quality (1-10): Is advice evidence-based?',
            fontsize=14, fontweight='bold', y=0.95
        )

        # Extract comparison metadata
        comparison_meta = stats.get('comparison_metadata', {})
        systems = comparison_meta.get('systems_compared', ['rag', 'direct'])

        overall_results = stats['overall_results']
        # Keep the original fail-fast behaviour for statistics that lack
        # per-category results, without holding an unused local.
        if 'category_results' not in stats:
            raise KeyError('category_results')

        # Chart 1: System Comparison Radar Chart
        self._create_radar_chart(axes[0, 0], stats, systems)

        # Chart 2: Grouped Bar Chart Comparison
        self._create_grouped_bar_chart(axes[0, 1], stats, systems)

        # Chart 3: Actionability vs Evidence Scatter Plot
        self._create_scatter_plot(axes[1, 0], stats, systems)

        # Chart 4: Category-wise Performance Heatmap
        self._create_heatmap(axes[1, 1], stats, systems)

        # Add method annotation at bottom
        method_text = (
            f"Evaluation: Llama3-70B judge | Targets: Actionability ≥7.0, Evidence ≥7.5 | "
            f"Systems: {', '.join([s.upper() for s in systems])} | "
            f"Queries: {overall_results.get('total_queries', 'N/A')}"
        )
        fig.text(0.5, 0.02, method_text, ha='center', fontsize=10,
                 style='italic', color='gray')

        # Adjust layout, leaving room for the suptitle and the footer text
        plt.tight_layout()
        plt.subplots_adjust(top=0.88, bottom=0.08)

        # Save the chart
        if save_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            systems_str = "_vs_".join(systems)
            save_path = f"judge_comparison_charts_{systems_str}_{timestamp}.png"

        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)
        full_path = results_dir / save_path

        plt.savefig(full_path, dpi=300, bbox_inches='tight')
        plt.show()

        print(f"📊 Comparison charts saved to: {full_path}")
        return str(full_path)

    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
        raise

    finally:
        # Release the figure on every path, as the other chart generators in
        # this package do after saving. Without this each call (and each
        # failure) leaves a figure registered with pyplot.
        # NOTE(review): in non-blocking interactive mode this closes the
        # window right after show() — confirm that is acceptable.
        if fig is not None:
            plt.close(fig)
|
154 |
+
|
155 |
+
def _create_radar_chart(self, ax, stats: Dict, systems: List[str]):
    """Draw the multi-dimensional system comparison radar chart on ``ax``.

    Five values are plotted per system: overall actionability, overall
    evidence, and one combined score per query category (diagnosis,
    treatment, mixed). A combined score is the mean of that category's
    average actionability and average evidence.

    Args:
        ax: Axes-like object to draw on.
            NOTE(review): the angular x data suggests a polar projection is
            expected -- confirm where the axes are created.
        stats: Statistics dict. ``stats['detailed_system_results'][system]``
            (keys ``results``, ``avg_actionability``, ``avg_evidence``) is
            used when present; otherwise ``stats['overall_results']`` is
            the fallback.
        systems: Names of the systems to plot, in drawing order.
    """
    ax.set_title('Multi-Dimensional System Comparison', fontweight='bold', pad=20)

    # Spoke labels; the last three correspond to ``category_keys`` below.
    categories = ['Overall Actionability', 'Overall Evidence', 'Diagnosis', 'Treatment', 'Mixed']
    category_keys = ('diagnosis', 'treatment', 'mixed')

    def _mean(values):
        """Return the arithmetic mean of a non-empty list."""
        return sum(values) / len(values)

    detailed_results = stats.get('detailed_system_results', {})
    system_data = {}

    for system in systems:
        if system in detailed_results:
            system_info = detailed_results[system]

            # Bucket the per-query scores by lower-cased category name.
            category_performance = {}
            for result in system_info['results']:
                category = result.get('category', 'unknown').lower()
                bucket = category_performance.setdefault(
                    category, {'actionability': [], 'evidence': []}
                )
                bucket['actionability'].append(result['actionability_score'])
                bucket['evidence'].append(result['evidence_score'])

            system_scores = [
                system_info['avg_actionability'],  # Overall Actionability
                system_info['avg_evidence'],       # Overall Evidence
            ]
            for key in category_keys:
                if key in category_performance:
                    bucket = category_performance[key]
                    system_scores.append(
                        (_mean(bucket['actionability']) + _mean(bucket['evidence'])) / 2
                    )
                else:
                    # No query of this category for this system: neutral placeholder.
                    system_scores.append(0.5)
            system_data[system] = system_scores
        else:
            # Fallback to overall stats if detailed results are not available.
            # NOTE(review): 0.7 / 0.6 / 0.5 are hard-coded placeholders, not
            # measured category scores -- confirm they should be charted.
            overall_results = stats['overall_results']
            system_data[system] = [
                overall_results['average_actionability'],
                overall_results['average_evidence'],
                0.7, 0.6, 0.5  # Placeholder for missing category data
            ]

    # One angle per spoke; repeat the first angle to close the polygon.
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    angles += angles[:1]

    for system in systems:
        values = system_data[system] + [system_data[system][0]]  # Close the polygon
        ax.plot(angles, values, 'o-', linewidth=2,
                label=f'{system.upper()} System', color=self.colors.get(system, 'gray'))
        ax.fill(angles, values, alpha=0.1, color=self.colors.get(system, 'gray'))

    # Target threshold circle (7.0 on the judge scale == 0.7 normalised).
    # It must be plotted BEFORE ax.legend(): the legend only lists artists
    # that exist when it is created, so plotting it afterwards (as the code
    # did before) left 'Target (7.0)' out of the legend.
    target_circle = [0.7] * (len(categories) + 1)
    ax.plot(angles, target_circle, '--', color=self.colors['target_line'],
            alpha=0.7, label='Target (7.0)')

    # Scores are on a 0-1 scale; tick labels show the 0-10 judge scale.
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, fontsize=9)
    ax.set_ylim(0, 1)
    ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
    ax.set_yticklabels(['2.0', '4.0', '6.0', '8.0', '10.0'])
    ax.grid(True, alpha=0.3)
    ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.0))
|
233 |
+
|
234 |
+
def _create_grouped_bar_chart(self, ax, stats: Dict, systems: List[str]):
    """Draw grouped bars comparing each system's two average scores.

    One group per metric (actionability, evidence quality) and one bar per
    system inside each group. Horizontal lines mark the 0.7 and 0.75
    targets.

    Args:
        ax: Axes-like object to draw on.
        stats: Statistics dict. Per-system averages are read from
            ``stats['detailed_system_results'][system]`` when present,
            otherwise from ``stats['overall_results']``.
        systems: Names of the systems to plot, in bar order.
    """
    metrics = ['Actionability', 'Evidence Quality']
    detailed_results = stats.get('detailed_system_results', {})

    # Two values per system, in the same order as ``metrics``.
    system_scores = {}
    for system in systems:
        if system in detailed_results:
            system_info = detailed_results[system]
            system_scores[system] = [
                system_info['avg_actionability'],
                system_info['avg_evidence'],
            ]
        else:
            # Fallback to the cross-system overall results.
            overall_results = stats['overall_results']
            system_scores[system] = [
                overall_results['average_actionability'],
                overall_results['average_evidence'],
            ]

    x = np.arange(len(metrics))
    n_systems = len(systems)
    # Widths for up to three systems are unchanged (0.35 for two, 0.25
    # otherwise). Beyond that a group is capped at 0.8 of the slot so bars
    # of neighbouring metric groups no longer overlap.
    width = 0.35 if n_systems == 2 else min(0.25, 0.8 / max(n_systems, 1))

    for i, system in enumerate(systems):
        # Centre the group of bars on the metric's tick position.
        offset = (i - n_systems / 2 + 0.5) * width
        bars = ax.bar(x + offset, system_scores[system], width,
                      label=f'{system.upper()}', color=self.colors.get(system, 'gray'),
                      alpha=0.8)

        # Value labels just above each bar.
        for bar, value in zip(bars, system_scores[system]):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2., height + 0.01,
                    f'{value:.3f}', ha='center', va='bottom', fontweight='bold')

    # Target threshold lines (7.0 and 7.5 on the judge scale, normalised).
    ax.axhline(y=0.7, color=self.colors['target_line'], linestyle='--',
               alpha=0.7, label='Actionability Target (7.0)')
    ax.axhline(y=0.75, color=self.colors['target_line'], linestyle=':',
               alpha=0.7, label='Evidence Target (7.5)')

    ax.set_xlabel('Evaluation Metrics')
    ax.set_ylabel('Score (0-1 scale)')
    # The title used to be set twice. The second call replaced the text and
    # dropped the bold/pad styling the other three charts use. Set it once,
    # keeping the text that was actually displayed.
    ax.set_title('System Performance Comparison', fontweight='bold', pad=20)
    ax.set_xticks(x)
    ax.set_xticklabels(metrics)
    ax.legend(loc='upper left')
    ax.grid(True, alpha=0.3, axis='y')
    ax.set_ylim(0, 1.0)
|
290 |
+
|
291 |
+
def _create_scatter_plot(self, ax, stats: Dict, systems: List[str]):
    """Scatter actionability (x) against evidence quality (y) per system.

    A system found in ``stats['detailed_system_results']`` contributes one
    point per entry of its ``results`` list. Any other system contributes a
    single point taken from the averages in ``stats['overall_results']``.
    Target lines at x=0.7 and y=0.75 and a shaded target zone are drawn on
    top.

    Args:
        ax: Axes-like object to draw on.
        stats: Statistics dict as described above.
        systems: Names of the systems to plot.
    """
    ax.set_title('Actionability vs Evidence Quality Analysis', fontweight='bold', pad=20)

    per_system = stats.get('detailed_system_results', {})

    for name in systems:
        if name in per_system:
            # Query-level points for this system.
            records = per_system[name]['results']
            xs = [record['actionability_score'] for record in records]
            ys = [record['evidence_score'] for record in records]
        else:
            # No detailed data: a single point from the overall averages.
            aggregate = stats['overall_results']
            xs = [aggregate['average_actionability']]
            ys = [aggregate['average_evidence']]

        ax.scatter(xs, ys,
                   label=f'{name.upper()}', color=self.colors.get(name, 'gray'),
                   alpha=0.7, s=100, edgecolors='white', linewidth=1)

    # Threshold lines for the two targets.
    target_color = self.colors['target_line']
    ax.axvline(x=0.7, color=target_color, linestyle='--',
               alpha=0.7, label='Actionability Target')
    ax.axhline(y=0.75, color=target_color, linestyle='--',
               alpha=0.7, label='Evidence Target')

    # Shaded rectangle covering the region where both targets are met.
    zone = Rectangle((0.7, 0.75), 0.3, 0.25, linewidth=1,
                     edgecolor=target_color, facecolor='green',
                     alpha=0.1, label='Target Zone')
    ax.add_patch(zone)

    # Axis cosmetics; both axes use the normalised 0-1 scale.
    ax.set_xlabel('Clinical Actionability (0-1 scale)')
    ax.set_ylabel('Clinical Evidence Quality (0-1 scale)')
    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.3)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
|
336 |
+
|
337 |
+
def _create_heatmap(self, ax, stats: Dict, systems: List[str]):
    """Draw the category-wise performance heatmap.

    Rows are ``system x metric`` (actionability, evidence); columns are the
    query categories (diagnosis, treatment, mixed).

    Each cell used to be read from ``stats['category_results']``, which is
    aggregated over all systems, so every system's rows were identical.
    Cells are now averaged from that system's own entries in
    ``stats['detailed_system_results']`` when available. The aggregate
    ``category_results`` is used only as a fallback for systems without
    detailed results.

    Args:
        ax: Axes-like object to draw on; the colorbar is attached through
            ``ax.figure``.
        stats: Statistics dict as described above.
        systems: Names of the systems to show, in row order.
    """
    ax.set_title('Category-wise Performance Matrix', fontweight='bold', pad=20)

    categories = ['Diagnosis', 'Treatment', 'Mixed']
    metrics = ['Actionability', 'Evidence']
    # metric label -> (key in a per-query result, key in an aggregate entry)
    score_keys = {
        'Actionability': ('actionability_score', 'average_actionability'),
        'Evidence': ('evidence_score', 'average_evidence'),
    }
    category_results = stats.get('category_results', {})
    detailed_results = stats.get('detailed_system_results', {})

    def _cell_value(system, metric, cat_key):
        """Return the mean score for one cell, or 0.5 when there is no data."""
        result_key, aggregate_key = score_keys[metric]
        if system in detailed_results:
            # Category names are lower-cased the same way the radar chart does.
            values = [
                result[result_key]
                for result in detailed_results[system]['results']
                if result.get('category', 'unknown').lower() == cat_key
            ]
            return sum(values) / len(values) if values else 0.5
        if cat_key in category_results and category_results[cat_key]['query_count'] > 0:
            return category_results[cat_key][aggregate_key]
        return 0.5  # Placeholder for missing data

    data_matrix = []
    row_labels = []
    for system in systems:
        for metric in metrics:
            data_matrix.append(
                [_cell_value(system, metric, category.lower()) for category in categories]
            )
            row_labels.append(f'{system.upper()}\n{metric}')

    # Fixed 0-1 colour range so charts from different runs are comparable.
    im = ax.imshow(data_matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)

    ax.set_xticks(np.arange(len(categories)))
    ax.set_yticks(np.arange(len(row_labels)))
    ax.set_xticklabels(categories)
    ax.set_yticklabels(row_labels, fontsize=9)

    # Print each value in its cell; white text on the dark (low) end.
    for i, row in enumerate(data_matrix):
        for j, value in enumerate(row):
            ax.text(j, i, f'{value:.3f}',
                    ha='center', va='center', fontweight='bold',
                    color='white' if value < 0.5 else 'black')

    # Attach the colorbar to the axes' own figure instead of pyplot's
    # "current figure", which may be a different figure.
    cbar = ax.figure.colorbar(im, ax=ax, shrink=0.6)
    cbar.set_label('Performance Score (0-1)', rotation=270, labelpad=15)

    ax.set_xlabel('Query Categories')
    ax.set_ylabel('System × Metric')
|
389 |
+
|
390 |
+
|
391 |
+
# Independent execution interface
|
392 |
+
if __name__ == "__main__":
    # Stand-alone entry point: load the latest judge statistics, render the
    # four-panel comparison figure and print a short summary. Failures are
    # reported on stdout only.

    print("📊 OnCall.ai LLM Judge Chart Generator - Metrics 5-6 Visualization")

    generator = LLMJudgeChartGenerator()

    try:
        stats = generator.load_latest_statistics()

        print("📈 Generating comparison charts...")

        chart_path = generator.generate_comparison_charts(stats)

        # Same defaults as the chart code: compare RAG against direct LLM
        # when the statistics file carries no comparison metadata.
        comparison_meta = stats.get('comparison_metadata', {})
        systems = comparison_meta.get('systems_compared', ['rag', 'direct'])
        overall_results = stats['overall_results']
        systems_label = ' vs '.join(name.upper() for name in systems)

        print("\n📊 === CHART GENERATION SUMMARY ===")
        print(f"Systems Visualized: {systems_label}")
        print(f"Overall Actionability: {overall_results['average_actionability']:.3f}")
        print(f"Overall Evidence Quality: {overall_results['average_evidence']:.3f}")
        print(f"Total Queries: {overall_results['total_queries']}")
        print("Chart Components: Radar Chart, Bar Chart, Scatter Plot, Heatmap")

        print("\n✅ Comprehensive visualization complete!")
        print(f"📊 Charts saved to: {chart_path}")
        print("💡 Tip: Charts optimized for research presentations and publications")

    except FileNotFoundError as e:
        # No statistics file yet: tell the user how to produce one.
        print(f"❌ {e}")
        print("💡 Please run judge evaluation first:")
        print("   python metric5_6_llm_judge_evaluator.py rag,direct")
    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
|
evaluation/metric5_6_llm_judge_evaluator.py
ADDED
@@ -0,0 +1,643 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
OnCall.ai System - LLM Judge Evaluator (Metrics 5-6)
|
4 |
+
====================================================
|
5 |
+
|
6 |
+
Uses Llama3-70B as third-party judge to evaluate medical advice quality.
|
7 |
+
Batch evaluation strategy: 1 call evaluates all queries for maximum efficiency.
|
8 |
+
|
9 |
+
Metrics evaluated:
|
10 |
+
5. Clinical Actionability (臨床可操作性)
|
11 |
+
6. Clinical Evidence Quality (臨床證據品質)
|
12 |
+
|
13 |
+
EVALUATION RUBRICS:
|
14 |
+
|
15 |
+
Metric 5: Clinical Actionability (1-10 scale)
|
16 |
+
1-2 points: Almost no actionable advice; extremely abstract or empty responses.
|
17 |
+
3-4 points: Provides some directional suggestions but too vague, lacks clear steps.
|
18 |
+
5-6 points: Offers basic executable steps but lacks details or insufficient explanation for key aspects.
|
19 |
+
7-8 points: Clear and complete steps that clinicians can follow, with occasional gaps needing supplementation.
|
20 |
+
9-10 points: Extremely actionable with precise, step-by-step executable guidance; can be used "as-is" immediately.
|
21 |
+
|
22 |
+
Metric 6: Clinical Evidence Quality (1-10 scale)
|
23 |
+
1-2 points: Almost no evidence support; cites completely irrelevant or unreliable sources.
|
24 |
+
3-4 points: References lower quality literature or guidelines, or sources lack authority.
|
25 |
+
5-6 points: Uses general quality literature/guidelines but lacks depth or currency.
|
26 |
+
7-8 points: References reliable, authoritative sources (renowned journals or authoritative guidelines) with accurate explanations.
|
27 |
+
9-10 points: Rich and high-quality evidence sources (systematic reviews, RCTs, etc.) combined with latest research; enhances recommendation credibility.
|
28 |
+
|
29 |
+
Author: YanBo Chen
|
30 |
+
Date: 2025-08-04
|
31 |
+
"""
|
32 |
+
|
33 |
+
import json
|
34 |
+
import os
|
35 |
+
import sys
|
36 |
+
import time
|
37 |
+
from typing import Dict, List, Any, Tuple
|
38 |
+
from datetime import datetime
|
39 |
+
from pathlib import Path
|
40 |
+
import glob
|
41 |
+
import re
|
42 |
+
|
43 |
+
# Evaluation Rubrics as programmable constants
|
44 |
+
# Rubric for Metric 5 (Clinical Actionability).
# Each key is an inclusive (low, high) score band on the 1-10 judge scale and
# each value is that band's description. print_evaluation_rubrics() prints
# these entries and get_rubric_description() looks a score up in them.
ACTIONABILITY_RUBRIC = {
    (1, 2): "Almost no actionable advice; extremely abstract or empty responses.",
    (3, 4): "Provides some directional suggestions but too vague, lacks clear steps.",
    (5, 6): "Offers basic executable steps but lacks details or insufficient explanation for key aspects.",
    (7, 8): "Clear and complete steps that clinicians can follow, with occasional gaps needing supplementation.",
    (9, 10): "Extremely actionable with precise, step-by-step executable guidance; can be used 'as-is' immediately."
}

# Rubric for Metric 6 (Clinical Evidence Quality); same key/value layout as
# ACTIONABILITY_RUBRIC.
EVIDENCE_RUBRIC = {
    (1, 2): "Almost no evidence support; cites completely irrelevant or unreliable sources.",
    (3, 4): "References lower quality literature or guidelines, or sources lack authority.",
    (5, 6): "Uses general quality literature/guidelines but lacks depth or currency.",
    (7, 8): "References reliable, authoritative sources (renowned journals or authoritative guidelines) with accurate explanations.",
    (9, 10): "Rich and high-quality evidence sources (systematic reviews, RCTs, etc.) combined with latest research; enhances recommendation credibility."
}
|
59 |
+
|
60 |
+
def print_evaluation_rubrics():
    """Print both scoring rubrics and the target thresholds to stdout.

    The score bands come from ``ACTIONABILITY_RUBRIC`` and
    ``EVIDENCE_RUBRIC`` and are printed in dictionary order.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 50

    print(heavy_rule)
    print("CLINICAL EVALUATION RUBRICS")
    print(heavy_rule)

    # (section heading, rubric) pairs, printed with identical layout.
    sections = (
        ("\n🎯 METRIC 5: Clinical Actionability (1-10 scale)", ACTIONABILITY_RUBRIC),
        ("\n📚 METRIC 6: Clinical Evidence Quality (1-10 scale)", EVIDENCE_RUBRIC),
    )
    for heading, rubric in sections:
        print(heading)
        print(light_rule)
        for (low, high), description in rubric.items():
            print(f"{low}–{high} points: {description}")

    print("\n" + heavy_rule)
    print("TARGET THRESHOLDS:")
    print("• Actionability: ≥7.0 (Acceptable clinical utility)")
    print("• Evidence Quality: ≥7.5 (Reliable evidence support)")
    print(heavy_rule)
|
81 |
+
|
82 |
+
def get_rubric_description(score: int, metric_type: str) -> str:
    """Return the rubric text for the band that contains ``score``.

    Args:
        score: Score to look up; band limits are inclusive.
        metric_type: ``"actionability"`` selects ``ACTIONABILITY_RUBRIC``;
            any other value selects ``EVIDENCE_RUBRIC``.

    Returns:
        The description of the matching band, or
        ``"Score out of valid range (1-10)"`` when no band contains the
        score.
    """
    if metric_type == "actionability":
        bands = ACTIONABILITY_RUBRIC
    else:
        bands = EVIDENCE_RUBRIC

    # First band (in dictionary order) whose inclusive limits contain the score.
    matching = (
        description
        for (low, high), description in bands.items()
        if low <= score <= high
    )
    return next(matching, "Score out of valid range (1-10)")
|
91 |
+
|
92 |
+
# Add project path
|
93 |
+
current_dir = Path(__file__).parent
|
94 |
+
project_root = current_dir.parent
|
95 |
+
src_dir = project_root / "src"
|
96 |
+
sys.path.insert(0, str(src_dir))
|
97 |
+
|
98 |
+
# Import LLM client for judge evaluation
|
99 |
+
try:
|
100 |
+
from llm_clients import llm_Llama3_70B_JudgeClient
|
101 |
+
except ImportError as e:
|
102 |
+
print(f"❌ Import failed: {e}")
|
103 |
+
print("Please ensure running from project root directory")
|
104 |
+
sys.exit(1)
|
105 |
+
|
106 |
+
|
107 |
+
class LLMJudgeEvaluator:
|
108 |
+
"""LLM judge evaluator using batch evaluation strategy"""
|
109 |
+
|
110 |
+
def __init__(self):
    """Create the judge LLM client and start with an empty result list."""
    print("🔧 Initializing LLM Judge Evaluator...")

    # Llama3-70B client that scores the medical advice.
    self.judge_llm = llm_Llama3_70B_JudgeClient()

    # Every evaluation record (successful or failed) is accumulated here.
    self.evaluation_results = list()

    print("✅ LLM Judge Evaluator initialization complete")
|
120 |
+
|
121 |
+
def load_medical_outputs(self, filepath: str) -> List[Dict[str, Any]]:
    """Read a JSON results file and return its ``medical_outputs`` list.

    Args:
        filepath: Path of a UTF-8 JSON file whose top level is an object.

    Returns:
        The value stored under ``"medical_outputs"``, or an empty list when
        that key is absent.
    """
    print(f"📁 Loading medical outputs from: {filepath}")

    with open(filepath, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    outputs = payload.get('medical_outputs', [])
    print(f"📋 Loaded {len(outputs)} medical outputs")
    return outputs
|
132 |
+
|
133 |
+
def find_medical_outputs_for_systems(self, systems: List[str]) -> Dict[str, str]:
    """Locate the most recently modified outputs file of each system.

    Files are searched in the ``results`` directory next to this module.

    Args:
        systems: System names, e.g. ``"rag"`` or ``"direct"``.

    Returns:
        Mapping of system name to the path of its newest outputs file
        (by modification time).

    Raises:
        FileNotFoundError: If a system has no matching file.
    """
    results_dir = Path(__file__).parent / "results"
    latest_by_system = {}

    for system in systems:
        if system == "rag":
            # RAG files carry no system tag, so require a digit right after
            # the prefix; this keeps "medical_outputs_direct_*" out.
            filename_glob = "medical_outputs_[0-9]*.json"
        else:
            # "direct" and any future system embed their name in the file name.
            filename_glob = f"medical_outputs_{system}_*.json"
        pattern = str(results_dir / filename_glob)

        print(f"🔍 Searching for {system} with pattern: {pattern}")
        output_files = glob.glob(pattern)
        print(f"🔍 Found files for {system}: {output_files}")

        if len(output_files) == 0:
            raise FileNotFoundError(f"No medical outputs files found for {system} system")

        newest = max(output_files, key=os.path.getmtime)
        latest_by_system[system] = newest
        print(f"📊 Found {system} outputs: {newest}")

    return latest_by_system
|
160 |
+
|
161 |
+
def create_comparison_evaluation_prompt(self, systems_outputs: Dict[str, List[Dict]]) -> str:
    """Build the single judge prompt that compares all systems on all queries.

    The prompt holds the rubrics, a one-line description of each system,
    every query with each system's answer, and the exact answer format the
    parser expects (``Query i System j: Actionability=X, Evidence=Y``).

    Queries are taken from the first system and the other systems are
    matched to them by position. A system that has no output at a position
    contributes an empty response instead of raising ``IndexError``.

    Args:
        systems_outputs: Dict mapping system names to their medical outputs.
            Each output is read through the ``query``, ``category`` and
            ``medical_advice`` keys.

    Returns:
        The prompt as one newline-joined string.

    Raises:
        ValueError: If ``systems_outputs`` is empty.
    """
    system_names = list(systems_outputs.keys())
    if not system_names:
        raise ValueError("systems_outputs must contain at least one system")

    prompt_parts = [
        "You are a medical expert evaluating and comparing AI systems for clinical advice quality.",
        f"Please evaluate {len(system_names)} different systems using the detailed rubrics below:",
        "",
        "EVALUATION RUBRICS:",
        "",
        "METRIC 1: Clinical Actionability (1-10 scale)",
        "Question: Can healthcare providers immediately act on this advice?",
        "1-2 points: Almost no actionable advice; extremely abstract or empty responses.",
        "3-4 points: Provides directional suggestions but too vague, lacks clear steps.",
        "5-6 points: Offers basic executable steps but lacks details for key aspects.",
        "7-8 points: Clear and complete steps that clinicians can follow with occasional gaps.",
        "9-10 points: Extremely actionable with precise, step-by-step executable guidance.",
        "",
        "METRIC 2: Clinical Evidence Quality (1-10 scale)",
        "Question: Is the advice evidence-based and follows medical standards?",
        "1-2 points: Almost no evidence support; cites irrelevant or unreliable sources.",
        "3-4 points: References lower quality literature or sources lack authority.",
        "5-6 points: Uses general quality literature/guidelines but lacks depth or currency.",
        "7-8 points: References reliable, authoritative sources with accurate explanations.",
        "9-10 points: Rich, high-quality evidence sources combined with latest research.",
        "",
        "TARGET THRESHOLDS: Actionability ≥7.0, Evidence Quality ≥7.5",
        ""
    ]

    # One description line per system, numbered the way the judge must cite them.
    for i, system in enumerate(system_names, 1):
        if system == "rag":
            prompt_parts.append(f"SYSTEM {i} (RAG): Uses medical guidelines + LLM for evidence-based advice")
        elif system == "direct":
            prompt_parts.append(f"SYSTEM {i} (Direct): Uses LLM only without external guidelines")
        else:
            prompt_parts.append(f"SYSTEM {i} ({system.upper()}): {system} medical AI system")

    prompt_parts.extend([
        "",
        "EVALUATION CRITERIA:",
        "1. Clinical Actionability (1-10): Can healthcare providers immediately act on this advice?",
        "2. Clinical Evidence Quality (1-10): Is the advice evidence-based and follows medical standards?",
        "",
        "QUERIES TO EVALUATE:",
        ""
    ])

    # The first system defines the query list (all systems are assumed to
    # have processed the same queries in the same order).
    queries = systems_outputs[system_names[0]]

    for i, query_data in enumerate(queries, 1):
        query = query_data.get('query', '')
        category = query_data.get('category', 'unknown')

        prompt_parts.extend([
            f"=== QUERY {i} ({category.upper()}) ===",
            f"Patient Query: {query}",
            ""
        ])

        for j, system in enumerate(system_names, 1):
            outputs = systems_outputs[system]
            # Tolerate a system with fewer outputs than the first one.
            system_query = outputs[i - 1] if i - 1 < len(outputs) else {}
            advice = system_query.get('medical_advice', '')

            prompt_parts.extend([
                f"SYSTEM {j} Response: {advice}",
                ""
            ])

    prompt_parts.extend([
        "RESPONSE FORMAT (provide exactly this format):",
        ""
    ])

    # One template line per (query, system) pair.
    for i in range(1, len(queries) + 1):
        for j in range(1, len(system_names) + 1):
            prompt_parts.append(f"Query {i} System {j}: Actionability=X, Evidence=Y")

    # Name every system in the closing note. It used to list only the first
    # two, and printed "System 2=N/A" when there was just one.
    system_legend = ", ".join(
        f"System {j}={name}" for j, name in enumerate(system_names, 1)
    )
    prompt_parts.extend([
        "",
        "Replace X and Y with numeric scores 1-10.",
        "Provide only the scores in the exact format above.",
        f"Note: {system_legend}"
    ])

    return "\n".join(prompt_parts)
|
258 |
+
|
259 |
+
def parse_comparison_evaluation_response(self, response: str, systems_outputs: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
    """Parse the judge's reply into per-system evaluation records.

    Recognised lines look like
    ``Query X System Y: Actionability=A, Evidence=B`` (case-insensitive,
    optional leading bullet or markup, integer or decimal scores). A line is
    ignored when:

    * it does not match the format,
    * X or Y does not refer to an existing query or system (this includes 0,
      which used to wrap to index -1 and credit the last query or system),
    * a score is above 10 (it would normalise to more than 1.0), or
    * the same (query, system) pair was already parsed; the first
      occurrence wins.

    Args:
        response: Raw text returned by the judge LLM.
        systems_outputs: Dict mapping system names to their medical outputs,
            in the same order that was used to build the prompt.

    Returns:
        Dict mapping every system name to its list of result dicts. Scores
        are stored both raw (``*_raw``, 0-10) and normalised (``*_score``,
        raw / 10).
    """
    system_names = list(systems_outputs.keys())
    results_by_system = {system: [] for system in system_names}

    score = r'(\d+(?:\.\d+)?)'
    line_pattern = re.compile(
        r'Query\s+(\d+)\s+System\s+(\d+):\s*Actionability\s*=\s*' + score
        + r'\s*,\s*Evidence\s*=\s*' + score,
        re.IGNORECASE,
    )

    def _to_number(token):
        """Keep integer tokens as int so raw scores stay ints as before."""
        return float(token) if '.' in token else int(token)

    seen = set()  # (system index, query index) pairs already recorded

    for line in response.strip().split('\n'):
        line = line.strip()
        if not line:
            continue

        # search() rather than match(): tolerate leading "- ", "* ", "1." etc.
        match = line_pattern.search(line)
        if not match:
            continue

        query_num = int(match.group(1)) - 1   # 0-based index
        system_num = int(match.group(2)) - 1  # 0-based index
        actionability_score = _to_number(match.group(3))
        evidence_score = _to_number(match.group(4))

        # Reject out-of-range indices explicitly; a negative index would
        # otherwise silently address the last element.
        if not 0 <= system_num < len(system_names):
            continue
        system_name = system_names[system_num]
        if not 0 <= query_num < len(systems_outputs[system_name]):
            continue

        # Scores above the 10-point scale are not trustworthy judge output.
        if actionability_score > 10 or evidence_score > 10:
            continue

        if (system_num, query_num) in seen:
            continue
        seen.add((system_num, query_num))

        output = systems_outputs[system_name][query_num]

        results_by_system[system_name].append({
            "query": output.get('query', ''),
            "category": output.get('category', 'unknown'),
            "system_type": system_name,
            "medical_advice": output.get('medical_advice', ''),

            # Metric 5: Clinical Actionability
            "actionability_score": actionability_score / 10.0,
            "actionability_raw": actionability_score,

            # Metric 6: Clinical Evidence Quality
            "evidence_score": evidence_score / 10.0,
            "evidence_raw": evidence_score,

            "evaluation_success": True,
            "timestamp": datetime.now().isoformat()
        })

    return results_by_system
|
309 |
+
|
310 |
+
def evaluate_multiple_systems(self, systems_outputs: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
|
311 |
+
"""
|
312 |
+
Evaluate multiple systems using single LLM call for comparison
|
313 |
+
|
314 |
+
Args:
|
315 |
+
systems_outputs: Dict mapping system names to their medical outputs
|
316 |
+
"""
|
317 |
+
system_names = list(systems_outputs.keys())
|
318 |
+
total_queries = len(systems_outputs[system_names[0]])
|
319 |
+
|
320 |
+
print(f"🧠 Multi-system comparison: {', '.join(system_names)}")
|
321 |
+
print(f"📊 Evaluating {total_queries} queries across {len(system_names)} systems...")
|
322 |
+
|
323 |
+
try:
|
324 |
+
# Create comparison evaluation prompt
|
325 |
+
comparison_prompt = self.create_comparison_evaluation_prompt(systems_outputs)
|
326 |
+
|
327 |
+
print(f"📝 Comparison prompt created ({len(comparison_prompt)} characters)")
|
328 |
+
print(f"🔄 Calling judge LLM for multi-system comparison...")
|
329 |
+
|
330 |
+
# Single LLM call for all systems comparison
|
331 |
+
eval_start = time.time()
|
332 |
+
response = self.judge_llm.batch_evaluate(comparison_prompt)
|
333 |
+
eval_time = time.time() - eval_start
|
334 |
+
|
335 |
+
# Extract response text
|
336 |
+
response_text = response.get('content', '') if isinstance(response, dict) else str(response)
|
337 |
+
|
338 |
+
print(f"✅ Judge LLM completed comparison evaluation in {eval_time:.2f}s")
|
339 |
+
print(f"📄 Response length: {len(response_text)} characters")
|
340 |
+
|
341 |
+
# Parse comparison response
|
342 |
+
results_by_system = self.parse_comparison_evaluation_response(response_text, systems_outputs)
|
343 |
+
|
344 |
+
# Combine all results for storage
|
345 |
+
all_results = []
|
346 |
+
for system_name, system_results in results_by_system.items():
|
347 |
+
all_results.extend(system_results)
|
348 |
+
print(f"📊 {system_name.upper()}: {len(system_results)} evaluations parsed")
|
349 |
+
|
350 |
+
self.evaluation_results.extend(all_results)
|
351 |
+
|
352 |
+
return results_by_system
|
353 |
+
|
354 |
+
except Exception as e:
|
355 |
+
print(f"❌ Multi-system evaluation failed: {e}")
|
356 |
+
|
357 |
+
# Create error results for all systems
|
358 |
+
error_results = {}
|
359 |
+
for system_name, outputs in systems_outputs.items():
|
360 |
+
error_results[system_name] = []
|
361 |
+
for output in outputs:
|
362 |
+
error_result = {
|
363 |
+
"query": output.get('query', ''),
|
364 |
+
"category": output.get('category', 'unknown'),
|
365 |
+
"system_type": system_name,
|
366 |
+
"actionability_score": 0.0,
|
367 |
+
"evidence_score": 0.0,
|
368 |
+
"evaluation_success": False,
|
369 |
+
"error": str(e),
|
370 |
+
"timestamp": datetime.now().isoformat()
|
371 |
+
}
|
372 |
+
error_results[system_name].append(error_result)
|
373 |
+
self.evaluation_results.extend(error_results[system_name])
|
374 |
+
|
375 |
+
return error_results
|
376 |
+
|
377 |
+
def calculate_judge_statistics(self) -> Dict[str, Any]:
    """Aggregate stored judge results into per-category and overall statistics.

    Only entries of ``self.evaluation_results`` whose ``evaluation_success``
    flag is truthy contribute to the averages. Successful entries whose
    ``category`` is not one of ``diagnosis`` / ``treatment`` / ``mixed`` are
    left out of the per-category breakdown but still count towards the
    overall figures.

    Returns:
        Dict with three keys:
          - ``category_results``: per-category averages, counts, target flags
            and the individual scores (empty dict when nothing succeeded).
          - ``overall_results``: averages, counts and target flags across all
            successful entries. The target flags are always present, so
            callers can index them even when every evaluation failed.
          - ``timestamp``: ISO timestamp of when the statistics were built.
    """
    # Targets are expressed on the same scale as the stored scores.
    actionability_target = 0.7
    evidence_target = 0.75

    def mean(values: List[float]) -> float:
        # An empty group reports 0.0 instead of dividing by zero.
        return sum(values) / len(values) if values else 0.0

    successful_results = [r for r in self.evaluation_results if r.get('evaluation_success')]
    total_queries = len(self.evaluation_results)

    if not successful_results:
        return {
            "category_results": {},
            "overall_results": {
                "average_actionability": 0.0,
                "average_evidence": 0.0,
                "successful_evaluations": 0,
                "total_queries": total_queries,
                # Same keys as the populated case so consumers never hit a KeyError.
                "actionability_target_met": False,
                "evidence_target_met": False
            },
            "timestamp": datetime.now().isoformat()
        }

    # Group by category
    results_by_category = {"diagnosis": [], "treatment": [], "mixed": []}
    for result in successful_results:
        category = result.get('category', 'unknown')
        if category in results_by_category:
            results_by_category[category].append(result)

    # Calculate category statistics (an empty category yields zeros / False / []).
    category_stats = {}
    for category, results in results_by_category.items():
        actionability_scores = [r['actionability_score'] for r in results]
        evidence_scores = [r['evidence_score'] for r in results]
        avg_actionability = mean(actionability_scores)
        avg_evidence = mean(evidence_scores)

        category_stats[category] = {
            "average_actionability": avg_actionability,
            "average_evidence": avg_evidence,
            "query_count": len(results),
            "actionability_target_met": avg_actionability >= actionability_target,
            "evidence_target_met": avg_evidence >= evidence_target,
            "individual_actionability_scores": actionability_scores,
            "individual_evidence_scores": evidence_scores
        }

    # Calculate overall statistics
    overall_actionability = mean([r['actionability_score'] for r in successful_results])
    overall_evidence = mean([r['evidence_score'] for r in successful_results])

    overall_stats = {
        "average_actionability": overall_actionability,
        "average_evidence": overall_evidence,
        "successful_evaluations": len(successful_results),
        "total_queries": total_queries,
        "actionability_target_met": overall_actionability >= actionability_target,
        "evidence_target_met": overall_evidence >= evidence_target
    }

    return {
        "category_results": category_stats,
        "overall_results": overall_stats,
        "timestamp": datetime.now().isoformat()
    }
|
446 |
+
|
447 |
+
def save_comparison_statistics(self, systems: List[str], filename: str = None) -> str:
    """Write the judge statistics plus a per-system breakdown to a JSON file.

    The statistics come from ``self.calculate_judge_statistics()`` and are
    extended with comparison metadata and, for each requested system, its
    successful results together with their count and average scores.

    Args:
        systems: Names of the systems being compared; each is matched against
            the ``system_type`` field of the stored evaluation results.
        filename: Output file name. When ``None`` a timestamped name that
            encodes the compared systems is generated.

    Returns:
        Path of the written JSON file, as a string. The file is placed in a
        ``results`` directory next to this module (created if missing).
    """
    report = self.calculate_judge_statistics()

    # Fall back to a timestamped name that encodes the compared systems.
    if filename is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"judge_evaluation_comparison_{'_vs_'.join(systems)}_{stamp}.json"

    output_dir = Path(__file__).parent / "results"
    output_dir.mkdir(exist_ok=True)
    target = output_dir / filename

    # Add comparison metadata
    report["comparison_metadata"] = {
        "systems_compared": systems,
        "comparison_type": "multi_system",
        "timestamp": datetime.now().isoformat()
    }

    def breakdown(name):
        # Successful results of one system plus their averages (0.0 when none).
        passed = [
            entry for entry in self.evaluation_results
            if entry.get('system_type') == name and entry.get('evaluation_success')
        ]
        if passed:
            mean_actionability = sum(entry['actionability_score'] for entry in passed) / len(passed)
            mean_evidence = sum(entry['evidence_score'] for entry in passed) / len(passed)
        else:
            mean_actionability = 0.0
            mean_evidence = 0.0
        return {
            "results": passed,
            "query_count": len(passed),
            "avg_actionability": mean_actionability,
            "avg_evidence": mean_evidence
        }

    # Detailed system-specific results, consumed by chart generation.
    report["detailed_system_results"] = {name: breakdown(name) for name in systems}

    with open(target, 'w', encoding='utf-8') as handle:
        json.dump(report, handle, indent=2, ensure_ascii=False)

    print(f"📊 Comparison evaluation statistics saved to: {target}")
    return str(target)
|
483 |
+
|
484 |
+
|
485 |
+
# Independent execution interface
|
486 |
+
if __name__ == "__main__":
    # Independent LLM judge evaluation interface with multi-system support.
    # Takes one argument: a system name or a comma-separated list of system
    # names. Exits with status 1 on usage errors, missing inputs or failures.

    print("🧠 OnCall.ai LLM Judge Evaluator - Metrics 5-6 Multi-System Evaluation")

    # Print evaluation rubrics for reference
    print_evaluation_rubrics()

    # Parse systems from command line: trim whitespace, drop empty entries
    # (e.g. from a trailing comma) and de-duplicate while keeping order, so
    # every system is loaded and evaluated at most once.
    systems_input = sys.argv[1] if len(sys.argv) > 1 else ""
    systems = list(dict.fromkeys(s.strip() for s in systems_input.split(',') if s.strip()))

    if not systems:
        print("Usage: python metric5_6_llm_judge_evaluator.py [system1] or [system1,system2,...]")
        print(" rag - Evaluate RAG system medical outputs")
        print(" direct - Evaluate direct LLM medical outputs")
        print(" rag,direct - Compare RAG vs Direct systems")
        print(" system1,system2,system3 - Compare multiple systems")
        sys.exit(1)

    # Initialize evaluator
    evaluator = LLMJudgeEvaluator()

    try:
        if len(systems) == 1:
            # Single system evaluation (legacy mode)
            system = systems[0]
            print(f"\n🧪 Single System LLM Judge Evaluation: {system.upper()}")

            # Find and load medical outputs for single system
            system_files = evaluator.find_medical_outputs_for_systems([system])
            medical_outputs = evaluator.load_medical_outputs(system_files[system])

            if not medical_outputs:
                print(f"❌ No medical outputs found for {system}")
                sys.exit(1)

            print(f"📊 Evaluating {len(medical_outputs)} medical advice outputs")
            print(f"🎯 Metrics: 5 (Actionability) + 6 (Evidence Quality)")

            # Convert to multi-system format for consistency
            systems_outputs = {system: medical_outputs}
            results_by_system = evaluator.evaluate_multiple_systems(systems_outputs)

            # Save results
            stats_path = evaluator.save_comparison_statistics([system])

        else:
            # Multi-system comparison evaluation
            print(f"\n🧪 Multi-System Comparison: {' vs '.join([s.upper() for s in systems])}")

            # Find and load medical outputs for all systems
            system_files = evaluator.find_medical_outputs_for_systems(systems)
            systems_outputs = {}

            for system in systems:
                outputs = evaluator.load_medical_outputs(system_files[system])
                if not outputs:
                    print(f"❌ No medical outputs found for {system}")
                    sys.exit(1)
                systems_outputs[system] = outputs

            # Validate all systems have same number of queries
            query_counts = [len(outputs) for outputs in systems_outputs.values()]
            if len(set(query_counts)) > 1:
                print(f"⚠️ Warning: Systems have different query counts: {dict(zip(systems, query_counts))}")

            # Validate systems processed same queries (for scientific comparison).
            # The first system is the reference every other system is compared to.
            print(f"🔍 Validating query consistency across systems...")
            first_system_queries = [q.get('query', '') for q in systems_outputs[systems[0]]]
            for system in systems[1:]:
                system_queries = [q.get('query', '') for q in systems_outputs[system]]

                if first_system_queries != system_queries:
                    print(f"⚠️ Warning: {systems[0]} and {system} processed different queries!")
                    # Show first difference (none is shown when only the lengths differ)
                    for j, (q1, q2) in enumerate(zip(first_system_queries, system_queries)):
                        if q1 != q2:
                            print(f" Query {j+1} differs:")
                            print(f" {systems[0]}: {q1[:50]}...")
                            print(f" {system}: {q2[:50]}...")
                            break
                else:
                    print(f"✅ {systems[0]} and {system} processed identical queries")

            # Validate systems have different model types
            model_types = set()
            for system, outputs in systems_outputs.items():
                if outputs:
                    model_type = outputs[0].get('model_type', 'unknown')
                    model_types.add(model_type)
                    print(f"🏷️ {system.upper()} system model_type: {model_type}")

            if len(model_types) == 1:
                print(f"⚠️ Warning: All systems have same model_type - this may not be a valid comparison!")
            else:
                print(f"✅ Systems have different model_types: {model_types}")

            print(f"📊 Comparing {len(systems)} systems with {min(query_counts)} queries each")
            print(f"🎯 Metrics: 5 (Actionability) + 6 (Evidence Quality)")
            print(f"⚡ Strategy: Single comparison call for maximum consistency")

            # Multi-system comparison evaluation
            results_by_system = evaluator.evaluate_multiple_systems(systems_outputs)

            # Save comparison results
            stats_path = evaluator.save_comparison_statistics(systems)

        # Print summary
        print(f"\n📊 Generating evaluation analysis...")
        stats = evaluator.calculate_judge_statistics()
        overall_results = stats['overall_results']

        print(f"\n📊 === LLM JUDGE EVALUATION SUMMARY ===")

        if len(systems) == 1:
            print(f"System: {systems[0].upper()}")
        else:
            print(f"Systems Compared: {' vs '.join([s.upper() for s in systems])}")

        print(f"Overall Performance:")
        # Stored averages are multiplied by 10 for the "/10" display.
        actionability_raw = overall_results['average_actionability'] * 10
        evidence_raw = overall_results['average_evidence'] * 10

        print(f" Average Actionability: {overall_results['average_actionability']:.3f} ({actionability_raw:.1f}/10)")
        print(f" • {get_rubric_description(int(actionability_raw), 'actionability')}")
        print(f" Average Evidence Quality: {overall_results['average_evidence']:.3f} ({evidence_raw:.1f}/10)")
        print(f" • {get_rubric_description(int(evidence_raw), 'evidence')}")
        # .get(): the target flags may be absent when no evaluation succeeded.
        print(f" Actionability Target (≥7.0): {'✅ Met' if overall_results.get('actionability_target_met', False) else '❌ Not Met'}")
        print(f" Evidence Target (≥7.5): {'✅ Met' if overall_results.get('evidence_target_met', False) else '❌ Not Met'}")

        # System-specific breakdown for multi-system comparison
        if len(systems) > 1:
            print(f"\nSystem Breakdown:")
            for system in systems:
                system_results = [r for r in evaluator.evaluation_results if r.get('system_type') == system and r.get('evaluation_success')]
                if system_results:
                    avg_action = sum(r['actionability_score'] for r in system_results) / len(system_results)
                    avg_evidence = sum(r['evidence_score'] for r in system_results) / len(system_results)
                    print(f" {system.upper()}: Actionability={avg_action:.3f}, Evidence={avg_evidence:.3f} [{len(system_results)} queries]")

        print(f"\n✅ LLM judge evaluation complete!")
        print(f"📊 Statistics: {stats_path}")
        print(f"⚡ Efficiency: {overall_results['total_queries']} evaluations in 1 LLM call")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        print(f"💡 Please run evaluators first:")
        for system in systems:
            if system == "rag":
                print(" python latency_evaluator.py single_test_query.txt")
            elif system == "direct":
                print(" python direct_llm_evaluator.py single_test_query.txt")
            else:
                print(f" python {system}_evaluator.py single_test_query.txt")
        # Signal the failure to the calling shell / CI job.
        sys.exit(1)
    except Exception as e:
        print(f"❌ Judge evaluation failed: {e}")
        # Top-level boundary: report and exit non-zero instead of looking successful.
        sys.exit(1)
|
evaluation/metric7_8_precision_MRR.py
ADDED
@@ -0,0 +1,402 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
OnCall.ai System - Precision & MRR Analyzer (Metrics 7-8)
|
4 |
+
========================================================
|
5 |
+
|
6 |
+
Specialized analyzer for calculating Precision@K and Mean Reciprocal Rank (MRR)
|
7 |
+
using data collected from latency_evaluator.py comprehensive evaluation.
|
8 |
+
|
9 |
+
IMPORTANT CHANGES - Angular Distance & Relevance Calculation:
|
10 |
+
- DISTANCE METRIC: Uses Angular Distance from Annoy index (range: 0.0-1.0, smaller = more relevant)
|
11 |
+
- RELEVANCE CONVERSION: relevance = 1.0 - (angular_distance²) / 2.0 (mathematically correct formula)
|
12 |
+
- THRESHOLD ALIGNMENT: Aligned with Metric 3 relevance calculation standards
|
13 |
+
- DISPLAY UPDATE: Changed from "Relevance: X" to "Angular Distance: X" for clarity
|
14 |
+
|
15 |
+
METRICS CALCULATED:
|
16 |
+
7. Precision@K (檢索精確率) - Proportion of relevant results in top-K retrieval
|
17 |
+
8. Mean Reciprocal Rank (平均倒數排名) - Average reciprocal rank of first relevant result
|
18 |
+
|
19 |
+
DESIGN PRINCIPLE:
|
20 |
+
- Reuses comprehensive_details_*.json from latency_evaluator.py
|
21 |
+
- Implements adaptive threshold based on query complexity
|
22 |
+
- Query complexity determined by actual matched emergency keywords count
|
23 |
+
- No additional LLM calls required
|
24 |
+
|
25 |
+
Author: YanBo Chen
|
26 |
+
Date: 2025-08-04
|
27 |
+
Updated: 2025-08-04 (Angular Distance alignment)
|
28 |
+
"""
|
29 |
+
|
30 |
+
import json
|
31 |
+
import os
|
32 |
+
import sys
|
33 |
+
from typing import Dict, List, Any, Set
|
34 |
+
from datetime import datetime
|
35 |
+
from pathlib import Path
|
36 |
+
import re
|
37 |
+
import statistics
|
38 |
+
|
39 |
+
# Minimum relevance score (computed as 1.0 - distance**2 / 2.0) a retrieval
# result must reach to count as "relevant" for Precision@K and MRR. The
# analyzer picks one of the two per query, depending on query complexity.
COMPLEX_QUERY_RELEVANCE_THRESHOLD = 0.65  # Lenient cut-off: queries matching 4+ distinct emergency keywords
SIMPLE_QUERY_RELEVANCE_THRESHOLD = 0.75  # Strict cut-off: all other queries
|
42 |
+
|
43 |
+
class PrecisionMRRAnalyzer:
    """Compute Precision@K and MRR (metrics 7-8) from saved comprehensive evaluation data.

    The analyzer makes no retrieval or LLM calls itself: it reads per-query
    records, converts each retrieval result's distance into a relevance
    score, and counts a result as relevant when that score reaches a
    threshold chosen from the query's complexity. Per-query results are kept
    in ``self.analysis_results``.
    """

    def __init__(self):
        """Create an analyzer with an empty result list."""
        print("🔧 Initializing Precision & MRR Analyzer...")
        self.analysis_results = []
        print("✅ Analyzer initialization complete")

    def load_comprehensive_data(self, filepath: str) -> List[Dict]:
        """
        Load comprehensive evaluation data from latency_evaluator.py output

        Args:
            filepath: Path to comprehensive_details_*.json file

        Returns:
            The list stored under ``comprehensive_results`` in the file, or an
            empty list when the file cannot be read or parsed.
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            comprehensive_results = data.get('comprehensive_results', [])

            ready_count = sum(1 for r in comprehensive_results if r.get('precision_mrr_ready'))
            print(f"📁 Loaded {len(comprehensive_results)} comprehensive evaluation results")
            print(f"📊 Ready for precision/MRR analysis: {ready_count}")

            return comprehensive_results

        except Exception as e:
            # Deliberately best-effort: any failure is reported and mapped to
            # "nothing loaded", which the caller handles.
            print(f"❌ Failed to load comprehensive data: {e}")
            return []

    def _is_complex_query(self, query: str, processed_results: List[Dict]) -> bool:
        """
        Determine query complexity based on actual matched emergency keywords

        Args:
            query: Original query text (not used by the classification itself)
            processed_results: Retrieval results with matched keywords

        Returns:
            True if query is complex (should use lenient threshold)
        """
        # Collect unique emergency keywords actually found in retrieval results
        unique_emergency_keywords = set()

        for result in processed_results:
            if result.get('type') != 'emergency':
                continue
            matched_keywords = result.get('matched', '')
            if not matched_keywords:
                continue
            # Normally a '|'-separated string; a list of keywords is tolerated
            # as well instead of crashing on .split().
            if isinstance(matched_keywords, str):
                candidates = matched_keywords.split('|')
            elif isinstance(matched_keywords, (list, tuple, set)):
                candidates = matched_keywords
            else:
                continue
            unique_emergency_keywords.update(
                kw.strip() for kw in candidates if isinstance(kw, str) and kw.strip()
            )

        keyword_count = len(unique_emergency_keywords)

        # Business logic: 4+ different emergency keywords indicate complex case
        is_complex = keyword_count >= 4

        print(f" 🧠 Query complexity: {'Complex' if is_complex else 'Simple'} ({keyword_count} emergency keywords)")
        # Sorted so the log line is reproducible (set iteration order is not).
        print(f" 🔑 Found keywords: {', '.join(sorted(unique_emergency_keywords)[:5])}")

        return is_complex

    @staticmethod
    def _distance_to_relevance(distance) -> float:
        """Convert a retrieval distance to a relevance score: 1 - distance² / 2.

        A missing (``None``) distance is treated as 1.0, the same default that
        is used when the field is absent from a result.
        """
        # NOTE(review): if distances are Annoy angular distances, this equals
        # cosine similarity — confirm against the index configuration.
        if distance is None:
            distance = 1.0
        return 1.0 - (distance ** 2) / 2.0

    def calculate_precision_mrr_single(self, query_data: Dict) -> Dict[str, Any]:
        """
        Calculate precision@K and MRR for single query

        Args:
            query_data: Single query's comprehensive evaluation result. Must
                contain ``query`` and ``category``; retrieval results are read
                from ``pipeline_data.retrieval_results.processed_results``.

        Returns:
            Precision and MRR metrics for this query. When there are no
            retrieval results, the zeroed "empty" result is returned.

        Raises:
            KeyError: If ``query`` or ``category`` is missing.
        """
        query = query_data['query']
        category = query_data['category']

        # Extract processed results from pipeline data
        pipeline_data = query_data.get('pipeline_data', {})
        retrieval_results = pipeline_data.get('retrieval_results', {})
        processed_results = retrieval_results.get('processed_results', [])

        print(f"🔍 Analyzing precision/MRR for: {query[:50]}...")
        print(f"📋 Category: {category}, Results: {len(processed_results)}")

        if not processed_results:
            return self._create_empty_precision_mrr_result(query, category)

        # Step 1: Determine query complexity
        is_complex = self._is_complex_query(query, processed_results)

        # Step 2: Choose adaptive threshold (lenient for complex queries)
        threshold = COMPLEX_QUERY_RELEVANCE_THRESHOLD if is_complex else SIMPLE_QUERY_RELEVANCE_THRESHOLD

        print(f" 🎯 Using relevance threshold: {threshold} ({'lenient' if is_complex else 'strict'})")

        # Step 3: Convert every result's distance into a relevance score
        relevance_scores = [
            self._distance_to_relevance(result.get('distance', 1.0))
            for result in processed_results
        ]

        # Step 4: Precision@K, where K is the number of results returned
        relevant_count = sum(1 for score in relevance_scores if score >= threshold)
        precision_at_k = relevant_count / len(processed_results)

        # Step 5: MRR from the 1-based rank of the first relevant result
        first_relevant_rank = next(
            (rank for rank, score in enumerate(relevance_scores, 1) if score >= threshold),
            None
        )
        mrr_score = (1.0 / first_relevant_rank) if first_relevant_rank else 0.0

        # Detailed analysis
        result = {
            "query": query,
            "category": category,
            "query_complexity": "complex" if is_complex else "simple",
            "threshold_used": threshold,

            # Metric 7: Precision@K
            "precision_at_k": precision_at_k,
            "relevant_count": relevant_count,
            "total_results": len(processed_results),

            # Metric 8: MRR
            "mrr_score": mrr_score,
            "first_relevant_rank": first_relevant_rank,

            # Supporting data
            "relevance_scores": relevance_scores,
            "avg_relevance": sum(relevance_scores) / len(relevance_scores),
            "max_relevance": max(relevance_scores),
            "min_relevance": min(relevance_scores),

            "timestamp": datetime.now().isoformat()
        }

        print(f" 📊 Precision@{len(processed_results)}: {precision_at_k:.3f} ({relevant_count}/{len(processed_results)} relevant)")
        print(f" 📊 MRR: {mrr_score:.3f} (first relevant at rank {first_relevant_rank})")

        return result

    def _create_empty_precision_mrr_result(self, query: str, category: str) -> Dict[str, Any]:
        """Create a zeroed result for a query that failed or returned no retrieval results."""
        return {
            "query": query,
            "category": category,
            "query_complexity": "unknown",
            "threshold_used": 0.0,
            "precision_at_k": 0.0,
            "relevant_count": 0,
            "total_results": 0,
            "mrr_score": 0.0,
            "first_relevant_rank": None,
            "relevance_scores": [],
            "timestamp": datetime.now().isoformat()
        }

    def analyze_all_queries(self, comprehensive_results: List[Dict]) -> List[Dict]:
        """
        Analyze precision/MRR for all queries in comprehensive evaluation

        Queries not flagged ``precision_mrr_ready`` are skipped entirely;
        ready queries whose pipeline failed get a zeroed result. The returned
        list is also stored in ``self.analysis_results``.

        Args:
            comprehensive_results: Results from latency_evaluator.py

        Returns:
            List of precision/MRR analysis results
        """
        print(f"\n📊 Analyzing Precision@K and MRR for {len(comprehensive_results)} queries...")

        analysis_results = []

        for i, query_data in enumerate(comprehensive_results):
            if not query_data.get('precision_mrr_ready'):
                print(f"⏭️ Skipping query {i+1}: Not ready for precision/MRR analysis")
                continue

            if not query_data.get('overall_success'):
                print(f"⏭️ Skipping query {i+1}: Pipeline failed")
                analysis_results.append(self._create_empty_precision_mrr_result(
                    query_data['query'],
                    query_data['category']
                ))
                continue

            # Analyze this query
            result = self.calculate_precision_mrr_single(query_data)
            analysis_results.append(result)

            print("")  # Spacing between queries

        self.analysis_results = analysis_results
        return analysis_results

    def calculate_statistics(self) -> Dict[str, Any]:
        """Calculate overall, per-complexity and per-category statistics for metrics 7-8.

        Returns:
            ``{"error": ...}`` when there are no analysis results; otherwise a
            dict with ``overall_statistics``, ``by_complexity``,
            ``by_category`` and ``timestamp``. Groups without any results
            stay as empty dicts.
        """
        if not self.analysis_results:
            return {"error": "No analysis results available"}

        # Separate by complexity and category
        stats = {
            "overall_statistics": {},
            "by_complexity": {"simple": {}, "complex": {}},
            "by_category": {"diagnosis": {}, "treatment": {}, "mixed": {}},
            "timestamp": datetime.now().isoformat()
        }

        # Overall statistics (zeroed results of failed queries are included)
        all_precision = [r['precision_at_k'] for r in self.analysis_results]
        all_mrr = [r['mrr_score'] for r in self.analysis_results]

        stats["overall_statistics"] = {
            "total_queries": len(self.analysis_results),
            "avg_precision": statistics.mean(all_precision),
            "avg_mrr": statistics.mean(all_mrr),
            # stdev needs at least two data points
            "precision_std": statistics.stdev(all_precision) if len(all_precision) > 1 else 0.0,
            "mrr_std": statistics.stdev(all_mrr) if len(all_mrr) > 1 else 0.0
        }

        # By complexity
        for complexity in ["simple", "complex"]:
            complexity_results = [r for r in self.analysis_results if r['query_complexity'] == complexity]
            if complexity_results:
                precision_scores = [r['precision_at_k'] for r in complexity_results]
                mrr_scores = [r['mrr_score'] for r in complexity_results]

                stats["by_complexity"][complexity] = {
                    "query_count": len(complexity_results),
                    "avg_precision": statistics.mean(precision_scores),
                    "avg_mrr": statistics.mean(mrr_scores),
                    "avg_threshold": statistics.mean([r['threshold_used'] for r in complexity_results])
                }

        # By category
        for category in ["diagnosis", "treatment", "mixed"]:
            category_results = [r for r in self.analysis_results if r['category'] == category]
            if category_results:
                precision_scores = [r['precision_at_k'] for r in category_results]
                mrr_scores = [r['mrr_score'] for r in category_results]

                stats["by_category"][category] = {
                    "query_count": len(category_results),
                    "avg_precision": statistics.mean(precision_scores),
                    "avg_mrr": statistics.mean(mrr_scores)
                }

        return stats

    def save_results(self, filename: str = None) -> str:
        """Save the per-query results and their statistics as JSON.

        Args:
            filename: Output file name; a timestamped default is used when
                ``None``.

        Returns:
            Path of the written file, as a string. The file is placed in a
            ``results`` directory next to this module (created if missing).
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"precision_mrr_analysis_{timestamp}.json"

        # Ensure results directory exists
        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)

        filepath = results_dir / filename

        # Create output data
        output_data = {
            "analysis_metadata": {
                "total_queries": len(self.analysis_results),
                "analysis_type": "precision_mrr_metrics_7_8",
                "timestamp": datetime.now().isoformat(),
                "adaptive_threshold": True
            },
            "detailed_results": self.analysis_results,
            "statistics": self.calculate_statistics()
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print(f"📊 Precision/MRR analysis saved to: {filepath}")
        return str(filepath)
|
326 |
+
|
327 |
+
|
328 |
+
# Independent execution interface
|
329 |
+
if __name__ == "__main__":
    # Independent precision/MRR analysis interface.
    # Takes an optional path to a comprehensive_details_*.json file; without
    # it, the last such file (by sorted name) in ./results is used.

    print("📊 OnCall.ai Precision & MRR Analyzer - Metrics 7-8")

    if len(sys.argv) > 1:
        comprehensive_file = sys.argv[1]
    else:
        # Look for latest comprehensive_details file
        results_dir = Path(__file__).parent / "results"
        if results_dir.exists():
            comprehensive_files = list(results_dir.glob("comprehensive_details_*.json"))
            if comprehensive_files:
                # Sorting by name; presumably the names embed a sortable timestamp — verify.
                comprehensive_file = str(sorted(comprehensive_files)[-1])  # Latest file
                print(f"📁 Using latest comprehensive file: {comprehensive_file}")
            else:
                print("❌ No comprehensive_details_*.json files found")
                print("Please run latency_evaluator.py first to generate comprehensive data")
                sys.exit(1)
        else:
            print("❌ Results directory not found")
            sys.exit(1)

    if not os.path.exists(comprehensive_file):
        print(f"❌ Comprehensive file not found: {comprehensive_file}")
        print("Usage: python metric7_8_precision_MRR.py [comprehensive_details_file.json]")
        sys.exit(1)

    # Initialize analyzer
    analyzer = PrecisionMRRAnalyzer()

    # Load comprehensive data from latency_evaluator.py
    comprehensive_results = analyzer.load_comprehensive_data(comprehensive_file)

    if not comprehensive_results:
        print("❌ No comprehensive data loaded")
        sys.exit(1)

    # Analyze precision/MRR for all queries
    analysis_results = analyzer.analyze_all_queries(comprehensive_results)

    # Calculate and display statistics
    statistics_result = analyzer.calculate_statistics()

    # calculate_statistics() returns only an "error" entry when every query
    # was skipped; stop here instead of failing on the missing sections below.
    if "error" in statistics_result:
        print(f"❌ {statistics_result['error']}")
        sys.exit(1)

    print(f"\n📊 === PRECISION & MRR ANALYSIS SUMMARY ===")

    overall_stats = statistics_result['overall_statistics']
    print(f"\nOVERALL METRICS:")
    print(f" Precision@K: {overall_stats['avg_precision']:.3f} (±{overall_stats['precision_std']:.3f})")
    print(f" MRR: {overall_stats['avg_mrr']:.3f} (±{overall_stats['mrr_std']:.3f})")
    print(f" Total Queries: {overall_stats['total_queries']}")

    # Complexity-based statistics (groups without results are empty dicts and skipped)
    complexity_stats = statistics_result['by_complexity']
    print(f"\nBY COMPLEXITY:")
    for complexity, stats in complexity_stats.items():
        if stats:
            print(f" {complexity.title()}: Precision={stats['avg_precision']:.3f}, MRR={stats['avg_mrr']:.3f} "
                  f"(threshold={stats['avg_threshold']:.2f}, n={stats['query_count']})")

    # Category-based statistics
    category_stats = statistics_result['by_category']
    print(f"\nBY CATEGORY:")
    for category, stats in category_stats.items():
        if stats:
            print(f" {category.title()}: Precision={stats['avg_precision']:.3f}, MRR={stats['avg_mrr']:.3f} "
                  f"(n={stats['query_count']})")

    # Save results
    saved_path = analyzer.save_results()

    print(f"\n✅ Precision & MRR analysis complete!")
    print(f"📁 Results saved to: {saved_path}")
    print(f"\n💡 Next step: Create precision_mrr_chart_generator.py for visualization")
|
evaluation/metric7_8_precision_mrr_chart_generator.py
ADDED
@@ -0,0 +1,586 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
OnCall.ai System - Precision & MRR Chart Generator (Metrics 7-8)
|
4 |
+
===============================================================
|
5 |
+
|
6 |
+
Generates comprehensive Precision@K and MRR analysis charts from saved analysis results.
|
7 |
+
Reads JSON files produced by metric7_8_precision_MRR.py and creates visualizations.
|
8 |
+
|
9 |
+
Charts generated:
|
10 |
+
1. Precision@K comparison by category and complexity
|
11 |
+
2. MRR comparison by category and complexity
|
12 |
+
3. Combined metrics heatmap
|
13 |
+
4. Threshold impact analysis
|
14 |
+
5. Detailed statistics tables
|
15 |
+
|
16 |
+
No LLM calls - pure data visualization.
|
17 |
+
|
18 |
+
Author: YanBo Chen
|
19 |
+
Date: 2025-08-04
|
20 |
+
"""
|
21 |
+
|
22 |
+
import json
|
23 |
+
import os
|
24 |
+
import sys
|
25 |
+
from typing import Dict, List, Any
|
26 |
+
from datetime import datetime
|
27 |
+
from pathlib import Path
|
28 |
+
import glob
|
29 |
+
|
30 |
+
# Visualization imports
|
31 |
+
import matplotlib.pyplot as plt
|
32 |
+
import seaborn as sns
|
33 |
+
import pandas as pd
|
34 |
+
import numpy as np
|
35 |
+
|
36 |
+
|
37 |
+
class PrecisionMRRChartGenerator:
|
38 |
+
"""Generate charts from precision/MRR analysis results - no LLM dependency"""
|
39 |
+
|
40 |
+
def __init__(self):
    """Set up the plotting defaults shared by every chart this class draws.

    No instance attributes are created; the constructor only prints progress
    messages and configures matplotlib/seaborn global state.
    """
    print("📈 Initializing Precision & MRR Chart Generator...")

    # Apply the matplotlib style first, then the seaborn palette on top of it.
    style_name, palette_name = 'default', "husl"
    plt.style.use(style_name)
    sns.set_palette(palette_name)

    print("✅ Chart Generator ready")
|
49 |
+
|
50 |
+
def load_latest_analysis(self, results_dir: str = None) -> Dict[str, Any]:
    """
    Load the most recent precision/MRR analysis file.

    Args:
        results_dir: Directory containing ``precision_mrr_analysis_*.json``
            files, given as a ``str`` or a ``Path``. Defaults to the
            ``results`` folder located next to this script.

    Returns:
        The parsed JSON content of the newest matching file.

    Raises:
        FileNotFoundError: If the directory holds no matching analysis file.
    """
    if results_dir is None:
        results_dir = Path(__file__).parent / "results"
    # Normalise to Path: the signature advertises `str`, and joining a plain
    # string with "/" would raise TypeError.
    results_dir = Path(results_dir)

    analysis_files = glob.glob(str(results_dir / "precision_mrr_analysis_*.json"))

    if not analysis_files:
        raise FileNotFoundError("No precision_mrr_analysis_*.json files found. Run metric7_8_precision_MRR.py first.")

    # Rank by modification time: on Unix, ctime is the last *metadata* change
    # (chmod, rename, ...), so it does not reliably identify the newest result.
    latest_file = max(analysis_files, key=os.path.getmtime)
    print(f"📁 Loading latest analysis: {latest_file}")

    with open(latest_file, 'r', encoding='utf-8') as f:
        return json.load(f)
|
70 |
+
|
71 |
+
def create_precision_comparison_chart(self, analysis_data: Dict, save_path: str = None) -> str:
    """
    Create the Precision@K comparison chart (by category and by complexity).

    Args:
        analysis_data: Parsed analysis JSON. Reads
            ``analysis_data['statistics']['by_category']`` and
            ``['by_complexity']``; each maps a group name to a stats dict
            providing ``avg_precision``. Falsy stats entries are skipped.
        save_path: Output image path. Defaults to a timestamped PNG inside
            the ``charts`` folder next to this script.

    Returns:
        The path of the saved chart, as a string.

    Raises:
        KeyError: If ``analysis_data`` lacks the ``statistics`` sections.
    """
    # Read the inputs before creating the figure so malformed data cannot
    # leave an orphaned figure behind.
    statistics = analysis_data['statistics']
    category_stats = statistics['by_category']
    complexity_stats = statistics['by_complexity']

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    try:
        # Chart 1: Precision by Category
        category_rows = [
            (category.title(), stats['avg_precision'])
            for category, stats in category_stats.items()
            if stats
        ]

        if category_rows:
            categories = [label for label, _ in category_rows]
            precisions = [value for _, value in category_rows]

            bars1 = ax1.bar(categories, precisions, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728'])
            ax1.set_title('Precision@K by Query Category', fontweight='bold')
            ax1.set_ylabel('Precision@K')
            ax1.set_xlabel('Query Category')
            ax1.set_ylim(0, 1.0)
            ax1.grid(True, alpha=0.3)

            # Add value labels
            for bar, precision in zip(bars1, precisions):
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{precision:.3f}', ha='center', va='bottom', fontweight='bold')

        # Chart 2: Precision by Complexity
        complexity_rows = []
        for complexity, stats in complexity_stats.items():
            if not stats:
                continue
            # Prefer the threshold recorded in the analysis file; fall back to
            # the previously hard-coded values only when it is absent.
            threshold = stats.get('avg_threshold')
            if threshold is None:
                threshold = 0.15 if complexity.lower() == 'complex' else 0.25
            complexity_rows.append((complexity.title(), stats['avg_precision'], threshold))

        if complexity_rows:
            complexities = [label for label, _, _ in complexity_rows]
            comp_precisions = [value for _, value, _ in complexity_rows]
            thresholds = [threshold for _, _, threshold in complexity_rows]

            bars2 = ax2.bar(complexities, comp_precisions, alpha=0.8, color=['#2ca02c', '#d62728'])
            ax2.set_title('Precision@K by Query Complexity', fontweight='bold')
            ax2.set_ylabel('Precision@K')
            ax2.set_xlabel('Query Complexity')
            ax2.set_ylim(0, 1.0)
            ax2.grid(True, alpha=0.3)

            # Add value labels and threshold info
            for bar, precision, threshold in zip(bars2, comp_precisions, thresholds):
                height = bar.get_height()
                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{precision:.3f}\n(T={threshold:.2f})', ha='center', va='bottom',
                         fontweight='bold', fontsize=9)

        fig.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"precision_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)

    print(f"📊 Precision comparison chart saved: {save_path}")
    return str(save_path)
|
139 |
+
|
140 |
+
def create_mrr_comparison_chart(self, analysis_data: Dict, save_path: str = None) -> str:
    """Draw two MRR bar charts side by side and save them as one PNG.

    The left panel uses ``analysis_data['statistics']['by_category']`` and the
    right panel ``['by_complexity']``; groups whose stats entry is falsy are
    left out. When ``save_path`` is omitted, a timestamped file in the
    ``charts`` folder next to this script is used. Returns the saved path as
    a string.
    """
    figure, (category_ax, complexity_ax) = plt.subplots(1, 2, figsize=(15, 6))

    def draw_panel(axis, grouped_stats, palette, title, x_label):
        """Plot one bar per non-empty group and write its MRR above the bar."""
        populated = [(group.title(), stats['avg_mrr'])
                     for group, stats in grouped_stats.items() if stats]
        if not populated:
            return

        names = [name for name, _ in populated]
        scores = [score for _, score in populated]

        rects = axis.bar(names, scores, alpha=0.8, color=palette)
        axis.set_title(title, fontweight='bold')
        axis.set_ylabel('MRR Score')
        axis.set_xlabel(x_label)
        axis.set_ylim(0, 1.0)
        axis.grid(True, alpha=0.3)

        for rect, score in zip(rects, scores):
            axis.text(rect.get_x() + rect.get_width()/2., rect.get_height() + 0.01,
                      f'{score:.3f}', ha='center', va='bottom', fontweight='bold')

    # Left panel: MRR per query category
    draw_panel(category_ax,
               analysis_data['statistics']['by_category'],
               ['#9467bd', '#8c564b', '#e377c2'],
               'Mean Reciprocal Rank by Query Category',
               'Query Category')

    # Right panel: MRR per query complexity
    draw_panel(complexity_ax,
               analysis_data['statistics']['by_complexity'],
               ['#17becf', '#bcbd22'],
               'MRR by Query Complexity',
               'Query Complexity')

    plt.tight_layout()

    # Resolve the output location, creating the folder if needed
    if save_path is not None:
        target = Path(save_path)
    else:
        target = Path(__file__).parent / "charts" / f"mrr_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
    target.parent.mkdir(parents=True, exist_ok=True)

    plt.savefig(target, dpi=300, bbox_inches='tight')
    plt.close()

    print(f"📊 MRR comparison chart saved: {target}")
    return str(target)
|
206 |
+
|
207 |
+
def create_combined_metrics_heatmap(self, analysis_data: Dict, save_path: str = None) -> str:
    """Render Precision@K and MRR heatmaps (category x complexity) into one PNG.

    Each cell is the mean of the metric over the entries of
    ``analysis_data['detailed_results']`` sharing that category/complexity
    pair. Returns the saved path as a string, or ``""`` (after printing a
    warning) when there are no detailed results.
    """
    per_query = analysis_data.get('detailed_results', [])
    if not per_query:
        print("⚠️ No detailed results for heatmap")
        return ""

    # One DataFrame row per evaluated query
    frame = pd.DataFrame([
        {
            'Category': entry['category'].title(),
            'Complexity': entry['query_complexity'].title(),
            'Precision@K': entry['precision_at_k'],
            'MRR': entry['mrr_score'],
            'Threshold': entry['threshold_used'],
        }
        for entry in per_query
    ])

    figure, axes = plt.subplots(1, 2, figsize=(16, 6))

    # (metric column, colormap, colorbar label, panel title), left to right
    panels = (
        ('Precision@K', 'YlOrRd', 'Precision@K', 'Precision@K Heatmap\n(Category vs Complexity)'),
        ('MRR', 'YlGnBu', 'MRR Score', 'MRR Heatmap\n(Category vs Complexity)'),
    )
    for axis, (metric, colormap, bar_label, title) in zip(axes, panels):
        grid = frame.pivot_table(values=metric, index='Category', columns='Complexity', aggfunc='mean')
        sns.heatmap(grid, annot=True, fmt='.3f', cmap=colormap, ax=axis,
                    cbar_kws={'label': bar_label}, vmin=0, vmax=1)
        axis.set_title(title, fontweight='bold')

    plt.tight_layout()

    # Resolve the output location, creating the folder if needed
    if save_path is not None:
        target = Path(save_path)
    else:
        target = Path(__file__).parent / "charts" / f"precision_mrr_heatmap_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
    target.parent.mkdir(parents=True, exist_ok=True)

    plt.savefig(target, dpi=300, bbox_inches='tight')
    plt.close()

    print(f"📊 Combined metrics heatmap saved: {target}")
    return str(target)
|
258 |
+
|
259 |
+
def create_threshold_impact_chart(self, analysis_data: Dict, save_path: str = None) -> str:
    """
    Create the threshold impact analysis chart.

    The left panel shows the distribution of per-document relevance scores for
    'simple' and 'complex' queries together with each group's threshold line;
    the right panel compares average Precision@K and MRR per complexity.

    Args:
        analysis_data: Parsed analysis JSON. Reads ``detailed_results``
            (entries with ``query_complexity`` and optional
            ``relevance_scores``) and ``statistics['by_complexity']``.
        save_path: Output image path. Defaults to a timestamped PNG inside
            the ``charts`` folder next to this script.

    Returns:
        The saved chart path as a string, or ``""`` when there are no
        detailed results.
    """
    detailed_results = analysis_data.get('detailed_results', [])

    if not detailed_results:
        print("⚠️ No detailed results for threshold analysis")
        return ""

    complexity_stats = analysis_data['statistics']['by_complexity']

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    try:
        # Chart 1: Relevance distribution for different complexities.
        # (complexity key, fallback threshold, colour)
        groups = (('simple', 0.25, '#2ca02c'), ('complex', 0.15, '#d62728'))
        for name, fallback_threshold, color in groups:
            queries = [r for r in detailed_results if r['query_complexity'] == name]
            if not queries:
                continue

            # Use the threshold recorded by the analysis when available rather
            # than assuming the hard-coded defaults.
            threshold = (complexity_stats.get(name) or {}).get('avg_threshold', fallback_threshold)

            relevances = []
            for query in queries:
                relevances.extend(query.get('relevance_scores') or [])

            # A density histogram of an empty sample divides by zero, so only
            # draw it when there is at least one score.
            if relevances:
                ax1.hist(relevances, bins=10, alpha=0.7, label=f'{name.title()} (T={threshold:.2f})',
                         color=color, density=True)
            ax1.axvline(x=threshold, color=color, linestyle='--', linewidth=2,
                        label=f'{name.title()} Threshold')

        ax1.set_title('Relevance Score Distribution\nby Query Complexity', fontweight='bold')
        ax1.set_xlabel('Relevance Score')
        ax1.set_ylabel('Density')
        if ax1.get_legend_handles_labels()[0]:
            ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Chart 2: Metrics comparison
        rows = [
            (complexity.title(), stats['avg_precision'], stats['avg_mrr'])
            for complexity, stats in complexity_stats.items()
            if stats
        ]
        complexities = [label for label, _, _ in rows]
        precisions = [precision for _, precision, _ in rows]
        mrrs = [mrr for _, _, mrr in rows]

        x = np.arange(len(complexities))
        width = 0.35

        bars1 = ax2.bar(x - width/2, precisions, width, label='Precision@K', alpha=0.8, color='#ff7f0e')
        bars2 = ax2.bar(x + width/2, mrrs, width, label='MRR', alpha=0.8, color='#1f77b4')

        ax2.set_title('Metrics Comparison by Complexity\n(with Adaptive Thresholds)', fontweight='bold')
        ax2.set_ylabel('Score')
        ax2.set_xlabel('Query Complexity')
        ax2.set_xticks(x)
        ax2.set_xticklabels(complexities)
        ax2.legend()
        ax2.grid(True, alpha=0.3)
        ax2.set_ylim(0, 1.0)

        # Add value labels
        for bars, values in ((bars1, precisions), (bars2, mrrs)):
            for bar, value in zip(bars, values):
                height = bar.get_height()
                ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                         f'{value:.3f}', ha='center', va='bottom', fontweight='bold', fontsize=9)

        fig.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"threshold_impact_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)

    print(f"📊 Threshold impact chart saved: {save_path}")
    return str(save_path)
|
348 |
+
|
349 |
+
def create_detailed_analysis_table(self, analysis_data: Dict, save_path: str = None) -> str:
    """Render the overall / per-category / per-complexity statistics as a table image.

    Reads ``overall_statistics``, ``by_category`` and ``by_complexity`` from
    ``analysis_data['statistics']``; groups whose stats entry is falsy are
    omitted. When ``save_path`` is not given, a timestamped PNG in the
    ``charts`` folder next to this script is written. Returns the saved path
    as a string.
    """
    figure, axis = plt.subplots(figsize=(12, 8))
    axis.axis('tight')
    axis.axis('off')

    section_titles = ('OVERALL METRICS', 'BY CATEGORY', 'BY COMPLEXITY')

    # --- Overall section ---
    overall = analysis_data['statistics']['overall_statistics']
    rows = [
        ['OVERALL METRICS', '', '', '', ''],
        ['Total Queries', str(overall['total_queries']), '', '', ''],
        ['Avg Precision@K', f"{overall['avg_precision']:.3f}", f"±{overall['precision_std']:.3f}", '', ''],
        ['Avg MRR', f"{overall['avg_mrr']:.3f}", f"±{overall['mrr_std']:.3f}", '', ''],
        ['', '', '', '', ''],
        ['BY CATEGORY', 'Queries', 'Precision@K', 'MRR', 'Notes'],
    ]

    # --- Per-category section ---
    per_category = analysis_data['statistics']['by_category']
    rows.extend(
        [name.title(), str(entry['query_count']), f"{entry['avg_precision']:.3f}", f"{entry['avg_mrr']:.3f}", '']
        for name, entry in per_category.items()
        if entry
    )

    rows.append(['', '', '', '', ''])

    # --- Per-complexity section ---
    rows.append(['BY COMPLEXITY', 'Queries', 'Precision@K', 'MRR', 'Threshold'])
    per_complexity = analysis_data['statistics']['by_complexity']
    rows.extend(
        [name.title(), str(entry['query_count']), f"{entry['avg_precision']:.3f}",
         f"{entry['avg_mrr']:.3f}", f"{entry['avg_threshold']:.2f}"]
        for name, entry in per_complexity.items()
        if entry
    )

    grid = axis.table(cellText=rows,
                      colLabels=['Metric', 'Value 1', 'Value 2', 'Value 3', 'Value 4'],
                      cellLoc='center',
                      loc='center',
                      bbox=[0, 0, 1, 1])

    grid.auto_set_font_size(False)
    grid.set_fontsize(10)
    grid.scale(1, 2)

    # Column-label row (table row 0): dark background, bold white text
    for column in range(5):
        header_cell = grid[(0, column)]
        header_cell.set_facecolor('#40466e')
        header_cell.set_text_props(weight='bold', color='white')

    # Section title cells: data rows start at table row 1, hence start=1
    for table_row, row in enumerate(rows, start=1):
        if row[0] in section_titles:
            title_cell = grid[(table_row, 0)]
            title_cell.set_facecolor('#1f77b4')
            title_cell.set_text_props(weight='bold', color='white')

    plt.title('Precision@K & MRR Detailed Analysis\nMetrics 7-8 Statistics',
              fontweight='bold', fontsize=14, pad=20)

    # Resolve the output location, creating the folder if needed
    if save_path is not None:
        target = Path(save_path)
    else:
        target = Path(__file__).parent / "charts" / f"precision_mrr_table_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
    target.parent.mkdir(parents=True, exist_ok=True)

    plt.savefig(target, dpi=300, bbox_inches='tight')
    plt.close()

    print(f"📊 Detailed analysis table saved: {target}")
    return str(target)
|
434 |
+
|
435 |
+
def create_individual_query_analysis(self, analysis_data: Dict, save_path: str = None) -> str:
    """
    Create the individual query analysis chart.

    Draws one bar per entry of ``analysis_data['detailed_results']`` in two
    stacked panels (Precision@K on top, MRR below). Bars are red for queries
    whose ``query_complexity`` is ``'complex'`` and green otherwise.

    Args:
        analysis_data: Parsed analysis JSON; each detailed result must provide
            ``precision_at_k``, ``mrr_score`` and ``query_complexity``.
        save_path: Output image path. Defaults to a timestamped PNG inside
            the ``charts`` folder next to this script.

    Returns:
        The saved chart path as a string, or ``""`` when there are no
        detailed results.
    """
    detailed_results = analysis_data.get('detailed_results', [])

    if not detailed_results:
        print("⚠️ No detailed results for individual analysis")
        return ""

    from matplotlib.patches import Patch

    # Prepare data. The per-query text labels the previous version assembled
    # were never drawn, so they are no longer built (and can no longer fail on
    # results lacking a 'query' field).
    query_indices = list(range(1, len(detailed_results) + 1))
    precisions = [result['precision_at_k'] for result in detailed_results]
    mrrs = [result['mrr_score'] for result in detailed_results]
    colors = [
        '#d62728' if result['query_complexity'] == 'complex' else '#2ca02c'  # red: complex, green: simple
        for result in detailed_results
    ]

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

    try:
        panels = (
            (ax1, precisions, 'Precision@K by Individual Query', 'Precision@K'),
            (ax2, mrrs, 'MRR by Individual Query', 'MRR Score'),
        )
        for ax, values, title, ylabel in panels:
            bars = ax.bar(query_indices, values, color=colors, alpha=0.8)
            ax.set_title(title, fontweight='bold')
            ax.set_ylabel(ylabel)
            ax.set_xlabel('Query Index')
            ax.set_ylim(0, 1.0)
            ax.grid(True, alpha=0.3)

            # Add value labels
            for bar, value in zip(bars, values):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                        f'{value:.2f}', ha='center', va='bottom', fontsize=8)

        # Add legend
        legend_elements = [
            Patch(facecolor='#2ca02c', alpha=0.8, label='Simple Query (T=0.25)'),
            Patch(facecolor='#d62728', alpha=0.8, label='Complex Query (T=0.15)')
        ]
        ax1.legend(handles=legend_elements, loc='upper right')

        fig.tight_layout()

        # Save chart
        if save_path is None:
            save_path = Path(__file__).parent / "charts" / f"individual_query_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)

    print(f"📊 Individual query analysis saved: {save_path}")
    return str(save_path)
|
518 |
+
|
519 |
+
def generate_all_charts(self, analysis_data: Dict = None) -> Dict[str, str]:
    """
    Generate all precision/MRR charts.

    Args:
        analysis_data: Parsed analysis JSON. When ``None``, the latest
            analysis is loaded through ``self.load_latest_analysis()``.

    Returns:
        A dict mapping chart type to the path returned by the matching
        ``create_*`` method, or ``{"error": "<message>"}`` if any chart
        raised. Exceptions from chart creation are reported, not re-raised.
    """
    if analysis_data is None:
        analysis_data = self.load_latest_analysis()

    print(f"\n📈 Generating all Precision & MRR charts...")

    # (result key, bound chart method), in generation order. The detailed
    # statistics table was previously never produced even though this method
    # is documented as generating every chart.
    chart_builders = (
        ('precision_comparison', self.create_precision_comparison_chart),
        ('mrr_comparison', self.create_mrr_comparison_chart),
        ('combined_heatmap', self.create_combined_metrics_heatmap),
        ('threshold_impact', self.create_threshold_impact_chart),
        ('detailed_table', self.create_detailed_analysis_table),
        ('individual_analysis', self.create_individual_query_analysis),
    )

    saved_charts = {}

    # Generate all chart types
    try:
        for chart_key, build_chart in chart_builders:
            saved_charts[chart_key] = build_chart(analysis_data)

    except Exception as e:
        # Best-effort boundary: callers check for the "error" key.
        print(f"❌ Error generating charts: {e}")
        return {"error": str(e)}

    print(f"\n✅ All precision/MRR charts generated successfully!")
    print(f"📁 Charts saved to: evaluation/charts/")

    return saved_charts
|
545 |
+
|
546 |
+
|
547 |
+
# Independent execution interface
|
548 |
+
if __name__ == "__main__":
    # Generate precision/MRR charts from analysis results.
    # Usage: python metric7_8_precision_mrr_chart_generator.py [analysis_file.json]
    # Without an argument the most recent analysis file is used.

    print("📈 OnCall.ai Precision & MRR Chart Generator - Metrics 7-8")

    if len(sys.argv) > 1:
        analysis_file = sys.argv[1]

        if not os.path.exists(analysis_file):
            print(f"❌ Analysis file not found: {analysis_file}")
            sys.exit(1)
    else:
        analysis_file = None  # Will use latest file

    # Initialize generator
    generator = PrecisionMRRChartGenerator()

    try:
        # Load analysis data
        if analysis_file:
            with open(analysis_file, 'r', encoding='utf-8') as f:
                analysis_data = json.load(f)
            print(f"📁 Using specified analysis file: {analysis_file}")
        else:
            analysis_data = generator.load_latest_analysis()

        # Generate all charts
        saved_charts = generator.generate_all_charts(analysis_data)

        if 'error' in saved_charts:
            # generate_all_charts has already printed the failure; make sure
            # the process does not report success to the shell.
            # SystemExit is not an Exception subclass, so the handler below
            # does not intercept it.
            sys.exit(1)

        print(f"\n📊 === PRECISION & MRR CHART GENERATION SUMMARY ===")
        for chart_type, filepath in saved_charts.items():
            print(f" 📈 {chart_type.replace('_', ' ').title()}: {filepath}")

        print(f"\n💡 Charts ready for analysis and presentation!")

    except Exception as e:
        print(f"❌ Chart generation failed: {e}")
        sys.exit(1)
|
evaluation/old/coverage_evaluator.py
ADDED
@@ -0,0 +1,560 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
OnCall.ai System - Retrieval Coverage Evaluator (Metric 4)
|
4 |
+
==========================================================
|
5 |
+
|
6 |
+
Evaluates how well generated medical advice utilizes retrieved content
|
7 |
+
Automatic evaluation using keyword overlap analysis with optional LLM sampling
|
8 |
+
|
9 |
+
Author: YanBo Chen
|
10 |
+
Date: 2025-08-04
|
11 |
+
"""
|
12 |
+
|
13 |
+
import json
|
14 |
+
import os
|
15 |
+
import sys
|
16 |
+
from typing import Dict, List, Any, Set
|
17 |
+
from datetime import datetime
|
18 |
+
from pathlib import Path
|
19 |
+
import re
|
20 |
+
|
21 |
+
# Add project path
|
22 |
+
current_dir = Path(__file__).parent
|
23 |
+
project_root = current_dir.parent
|
24 |
+
src_dir = project_root / "src"
|
25 |
+
sys.path.insert(0, str(src_dir))
|
26 |
+
|
27 |
+
# Import existing system components
|
28 |
+
try:
|
29 |
+
from user_prompt import UserPromptProcessor
|
30 |
+
from retrieval import BasicRetrievalSystem
|
31 |
+
from llm_clients import llm_Med42_70BClient
|
32 |
+
from generation import MedicalAdviceGenerator
|
33 |
+
except ImportError as e:
|
34 |
+
print(f"❌ Import failed: {e}")
|
35 |
+
print("Please ensure running from project root directory")
|
36 |
+
sys.exit(1)
|
37 |
+
|
38 |
+
|
39 |
+
class CoverageEvaluator:
|
40 |
+
"""Retrieval coverage evaluator using keyword overlap analysis"""
|
41 |
+
|
42 |
+
def __init__(self):
    """Build the pipeline objects the coverage evaluation relies on.

    Creates the LLM client, the retrieval system, the user-prompt processor
    (wired to both) and the advice generator (wired to the LLM client), and
    starts with an empty ``coverage_results`` list.
    """
    print("🔧 Initializing Coverage Evaluator...")

    # Full pipeline components (needed for advice generation); the client and
    # retriever are shared with the objects constructed after them.
    llm = self.llm_client = llm_Med42_70BClient()
    retriever = self.retrieval_system = BasicRetrievalSystem()
    self.user_prompt_processor = UserPromptProcessor(llm_client=llm, retrieval_system=retriever)
    self.medical_generator = MedicalAdviceGenerator(llm_client=llm)

    # Accumulates per-query coverage results
    self.coverage_results = []

    print("✅ Coverage Evaluator initialization complete")
|
59 |
+
|
60 |
+
def extract_medical_keywords(self, text: str) -> Set[str]:
    """
    Extract medical keywords from text for coverage analysis.

    The text is lower-cased, matched against a fixed list of regular
    expressions (medical suffixes/prefixes, actions, entities, descriptors,
    compound terms, procedures, dosages/timeframes), and then checked for a
    fixed list of common medical terms.

    Args:
        text: Free text to analyse; ``None`` or an empty string is accepted.

    Returns:
        The set of lower-case keywords found (empty for empty input).
    """
    if not text:
        return set()

    text_lower = text.lower()

    # Medical terminology patterns
    patterns = [
        r'\b[a-z]+(?:osis|itis|pathy|emia|uria|gram|scopy)\b',  # Medical suffixes
        r'\b(?:cardio|neuro|pulmo|gastro|hepato|nephro)[a-z]+\b',  # Medical prefixes
        r'\b(?:diagnosis|treatment|therapy|intervention|management)\b',  # Medical actions
        r'\b(?:patient|symptom|condition|disease|disorder|syndrome)\b',  # Medical entities
        r'\b(?:acute|chronic|severe|mild|moderate|emergency)\b',  # Medical descriptors
        r'\b[a-z]+(?:al|ic|ous|ive)\s+(?:pain|failure|infection|injury)\b',  # Compound terms
        r'\b(?:ecg|ekg|ct|mri|x-ray|ultrasound|biopsy)\b',  # Medical procedures
        r'\b\d+\s*(?:mg|ml|units|hours|days|minutes)\b',  # Dosages and timeframes
    ]

    medical_keywords = set()
    for pattern in patterns:
        medical_keywords.update(match.strip() for match in re.findall(pattern, text_lower))

    # Additional common medical terms
    common_medical_terms = [
        'blood', 'pressure', 'heart', 'chest', 'pain', 'stroke', 'seizure',
        'emergency', 'hospital', 'monitor', 'assess', 'evaluate', 'immediate',
        'protocol', 'guideline', 'recommendation', 'risk', 'factor'
    ]

    # NOTE(review): this is a substring test, so 'monitor' also fires on
    # 'monitoring' -- and 'pain' on 'paint'. Kept as-is; confirm whether
    # whole-word matching is wanted before tightening it.
    medical_keywords.update(term for term in common_medical_terms if term in text_lower)

    # Filter out very short terms and common words. Two-letter abbreviations
    # that the procedures pattern targets on purpose (currently only 'ct')
    # are exempt; the bare length check used to discard them every time.
    short_abbreviations = {'ct'}
    stop_words = {'the', 'and', 'for', 'with', 'are', 'can', 'may'}

    return {
        kw for kw in medical_keywords
        if (len(kw) > 2 or kw in short_abbreviations) and kw not in stop_words
    }
|
106 |
+
|
107 |
+
def calculate_coverage_score(self, generated_advice: str, retrieval_results: List[Dict],
                             threshold: float = 0.6) -> Dict[str, Any]:
    """
    Calculate coverage score based on keyword overlap between advice and retrieved docs.

    The score is the share of keywords found in the retrieved documents
    that also appear in the generated advice.

    Args:
        generated_advice: Generated medical advice text
        retrieval_results: List of retrieved documents; each document's text
            is read from its 'content' key, falling back to 'text'
        threshold: Minimum coverage score for ``meets_threshold`` to be True

    Returns:
        Dict with the overall score ('coverage_score', 'coverage_percentage',
        'meets_threshold'), the keyword lists and their counts, and one
        'coverage_details' entry per document. The same keys are present
        when either input is empty (score 0.0, threshold not met).
    """
    if not generated_advice or not retrieval_results:
        # Same shape as the regular result so callers can read every key
        # without special-casing the empty case.
        return {
            "coverage_score": 0.0,
            "matched_keywords": [],
            "advice_keywords": [],
            "source_keywords": [],
            "advice_keywords_count": 0,
            "source_keywords_count": 0,
            "matched_keywords_count": 0,
            "coverage_percentage": 0.0,
            "meets_threshold": False,
            "coverage_details": []
        }

    # Extract keywords from generated advice
    advice_keywords = self.extract_medical_keywords(generated_advice)

    # Extract keywords from all retrieved documents
    all_source_keywords = set()
    coverage_details = []

    for i, doc in enumerate(retrieval_results):
        # Trailing `or ''` guards against documents whose fields are None.
        doc_content = doc.get('content') or doc.get('text') or ''
        doc_keywords = self.extract_medical_keywords(doc_content)
        all_source_keywords.update(doc_keywords)

        # Calculate overlap for this specific document
        doc_overlap = advice_keywords.intersection(doc_keywords)
        doc_coverage = len(doc_overlap) / len(doc_keywords) if doc_keywords else 0.0

        coverage_details.append({
            "doc_index": i,
            "doc_snippet": doc_content[:100] + "...",
            "doc_keywords_count": len(doc_keywords),
            "matched_keywords_count": len(doc_overlap),
            "doc_coverage_ratio": doc_coverage,
            # Sorted so the truncated sample is reproducible between runs
            "matched_keywords": sorted(doc_overlap)[:10]  # Limit for readability
        })

    # Calculate overall coverage
    matched_keywords = advice_keywords.intersection(all_source_keywords)
    coverage_score = len(matched_keywords) / len(all_source_keywords) if all_source_keywords else 0.0

    # Keyword lists are sorted: sets have no stable order, which would make
    # the saved JSON differ between otherwise identical runs.
    return {
        "coverage_score": coverage_score,
        "matched_keywords": sorted(matched_keywords),
        "advice_keywords": sorted(advice_keywords),
        "source_keywords": sorted(all_source_keywords),
        "advice_keywords_count": len(advice_keywords),
        "source_keywords_count": len(all_source_keywords),
        "matched_keywords_count": len(matched_keywords),
        "coverage_percentage": coverage_score * 100,
        "meets_threshold": coverage_score >= threshold,
        "coverage_details": coverage_details
    }
|
165 |
+
|
166 |
+
def evaluate_single_coverage(self, query: str, category: str = "unknown") -> Dict[str, Any]:
    """
    Evaluate retrieval coverage for a single query.

    Runs the full pipeline: condition extraction -> retrieval -> advice
    generation -> coverage analysis. Every outcome, successful or not, is
    appended to ``self.coverage_results`` and also returned.

    Args:
        query: Medical query to test
        category: Query category (diagnosis/treatment/mixed)

    Returns:
        Result dict. On success it carries 'pipeline_success': True, the
        timings, the coverage analysis and its score. On failure (no
        retrieval results, no generated advice, or any exception raised by
        a pipeline step) it carries 'pipeline_success': False, a
        'coverage_score' of 0.0 and an 'error' message.
    """
    import time  # local import: used only for the monotonic timers below

    def record_failure(error: str, notice: str, **context: Any) -> Dict[str, Any]:
        """Build a failed-pipeline result, store it, print the notice, return it."""
        failure = {
            "query": query,
            "category": category,
            **context,
            "pipeline_success": False,
            "coverage_score": 0.0,
            "error": error,
            "timestamp": datetime.now().isoformat()
        }
        self.coverage_results.append(failure)
        print(notice)
        return failure

    print(f"🔍 Testing coverage for: {query[:50]}...")
    print(f"📋 Category: {category}")

    try:
        # Step 1: Extract condition
        condition_result = self.user_prompt_processor.extract_condition_keywords(query) or {}

        # Step 2: Perform retrieval
        # `or ''` keeps a None-valued field from being rendered as the
        # literal text "None" inside the search query.
        emergency_keywords = condition_result.get('emergency_keywords') or ''
        treatment_keywords = condition_result.get('treatment_keywords') or ''
        search_query = f"{emergency_keywords} {treatment_keywords}".strip()
        if not search_query:
            # Fall back to the raw query when the condition is missing or empty too.
            search_query = condition_result.get('condition') or query

        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments.
        retrieval_start = time.perf_counter()
        retrieval_results = self.retrieval_system.search(search_query, top_k=5)
        retrieval_time = time.perf_counter() - retrieval_start

        processed_results = retrieval_results.get('processed_results', [])

        if not processed_results:
            return record_failure(
                "No retrieval results",
                f" ❌ No retrieval results for coverage analysis",
                search_query=search_query
            )

        # Step 3: Generate medical advice
        generation_start = time.perf_counter()
        intention = self._detect_query_intention(query)
        medical_advice_result = self.medical_generator.generate_medical_advice(
            user_query=query,
            retrieval_results=retrieval_results,
            intention=intention
        )
        generation_time = time.perf_counter() - generation_start

        generated_advice = medical_advice_result.get('medical_advice', '')

        if not generated_advice:
            return record_failure(
                "No generated advice",
                f" ❌ No generated advice for coverage analysis",
                search_query=search_query
            )

        # Step 4: Calculate coverage
        coverage_analysis = self.calculate_coverage_score(generated_advice, processed_results)

        result = {
            "query": query,
            "category": category,
            "search_query": search_query,
            "pipeline_success": True,
            "retrieval_time": retrieval_time,
            "generation_time": generation_time,
            "retrieved_docs_count": len(processed_results),
            "generated_advice_length": len(generated_advice),
            "coverage_analysis": coverage_analysis,
            "coverage_score": coverage_analysis['coverage_score'],
            "meets_threshold": coverage_analysis['meets_threshold'],
            "timestamp": datetime.now().isoformat()
        }

        # Store result
        self.coverage_results.append(result)

        print(f" ✅ Pipeline: Complete")
        print(f" 📊 Coverage Score: {coverage_analysis['coverage_score']:.3f} ({coverage_analysis['coverage_percentage']:.1f}%)")
        print(f" 📝 Keywords: {coverage_analysis['matched_keywords_count']}/{coverage_analysis['source_keywords_count']} matched")
        print(f" 🎯 Threshold: {'✅ Met' if result['meets_threshold'] else '❌ Not Met'}")
        print(f" ⏱️ Times: Retrieval={retrieval_time:.2f}s, Generation={generation_time:.2f}s")

        return result

    except Exception as e:
        # Deliberately broad: one failing query is recorded as a failed
        # evaluation instead of aborting the whole evaluation run.
        return record_failure(str(e), f" ❌ Coverage evaluation failed: {e}")
|
279 |
+
|
280 |
+
def _detect_query_intention(self, query: str) -> str:
|
281 |
+
"""Simplified query intention detection (from app.py)"""
|
282 |
+
query_lower = query.lower()
|
283 |
+
|
284 |
+
if any(word in query_lower for word in ['diagnos', 'differential', 'possible', 'causes']):
|
285 |
+
return 'diagnosis'
|
286 |
+
elif any(word in query_lower for word in ['treat', 'manage', 'therapy', 'intervention']):
|
287 |
+
return 'treatment'
|
288 |
+
else:
|
289 |
+
return 'mixed'
|
290 |
+
|
291 |
+
def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
    """
    Parse queries from file with category labels.

    Each non-empty line is expected to look like "1.diagnosis: query text".
    Whitespace after the number's dot and around the colon is tolerated.
    Accepted labels (case-insensitive) are diagnosis, treatment, mixed and
    mixed/complicated; the last two are both stored under 'mixed'. Lines
    that do not match, and queries of 15 characters or fewer, are skipped.

    Args:
        filepath: Path of the UTF-8 text file to read

    Returns:
        Dict mapping 'diagnosis' / 'treatment' / 'mixed' to a list of
        {"text": ..., "category": ...} dicts. If the file cannot be read,
        a dict with a single 'error' key holding the message is returned
        instead of raising.
    """
    print(f"📁 Reading queries from file: {filepath}")

    # Only the read can fail for external reasons, so only the read is
    # guarded. The catch stays broad to keep the error-dict contract for
    # every kind of read failure (missing file, bad encoding, bad path).
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception as e:
        print(f"❌ Failed to read file: {e}")
        return {"error": f"Failed to read file: {e}"}

    # Parse queries with category labels
    queries_by_category = {
        "diagnosis": [],
        "treatment": [],
        "mixed": []
    }

    # Parse format: "1.diagnosis: query text"; optional whitespace is
    # accepted so that "1. diagnosis : query text" is not silently dropped.
    line_pattern = re.compile(
        r'^\d+\.\s*(diagnosis|treatment|mixed/complicated|mixed)\s*:\s*(.+)',
        re.IGNORECASE
    )

    for line in content.strip().split('\n'):
        line = line.strip()
        if not line:
            continue

        match = line_pattern.match(line)
        if not match:
            continue

        category_raw = match.group(1).lower()
        query_text = match.group(2).strip()

        # Normalize category name
        if category_raw in ['mixed/complicated', 'mixed']:
            category = 'mixed'
        else:
            category = category_raw

        if category in queries_by_category and len(query_text) > 15:
            queries_by_category[category].append({
                "text": query_text,
                "category": category
            })

    print(f"📋 Parsed queries by category:")
    for category, category_queries in queries_by_category.items():
        print(f" {category.capitalize()}: {len(category_queries)} queries")

    return queries_by_category
|
340 |
+
|
341 |
+
def calculate_coverage_statistics(self) -> Dict[str, Any]:
    """
    Calculate coverage statistics by category and overall.

    Reads ``self.coverage_results``. Per-category statistics are produced
    for 'diagnosis', 'treatment' and 'mixed'. Overall statistics consider
    every recorded result, including results whose category is none of
    those three, so that the success count and 'total_queries' describe
    the same population.

    Returns:
        Dict with 'category_results' (one stats dict per category),
        'overall_results' and an ISO 'timestamp'. Averages, extremes and
        rates are 0.0 and 'meets_threshold' is False where there is no
        successful evaluation.
    """
    threshold = 0.6

    def base_stats(successes: List[Dict], total: int):
        """Return (scores, stats shared by category and overall summaries)."""
        scores = [r['coverage_score'] for r in successes]
        if not scores:
            return scores, {
                "average_coverage": 0.0,
                "max_coverage": 0.0,
                "min_coverage": 0.0,
                "successful_evaluations": 0,
                "total_queries": total,
                "success_rate": 0.0,
            }
        # `total` cannot be 0 here: every success is also counted in it.
        return scores, {
            "average_coverage": sum(scores) / len(scores),
            "max_coverage": max(scores),
            "min_coverage": min(scores),
            "successful_evaluations": len(scores),
            "total_queries": total,
            "success_rate": len(scores) / total,
        }

    # Group results by category
    results_by_category = {
        "diagnosis": [],
        "treatment": [],
        "mixed": []
    }
    all_successful_results = []

    for result in self.coverage_results:
        category = result.get('category', 'unknown')
        if category in results_by_category:
            results_by_category[category].append(result)
        # Collected independently of the category check: the overall
        # denominator below is len(self.coverage_results), i.e. all results.
        if result.get('pipeline_success'):
            all_successful_results.append(result)

    # Calculate statistics for each category
    category_stats = {}
    for category, results in results_by_category.items():
        successful_results = [r for r in results if r.get('pipeline_success')]
        coverage_scores, stats = base_stats(successful_results, len(results))

        success_count = len(successful_results)
        if success_count:
            stats["average_retrieval_time"] = (
                sum(r.get('retrieval_time', 0) for r in successful_results) / success_count
            )
            stats["average_generation_time"] = (
                sum(r.get('generation_time', 0) for r in successful_results) / success_count
            )
            stats["meets_threshold"] = stats["average_coverage"] >= threshold
        else:
            stats["average_retrieval_time"] = 0.0
            stats["average_generation_time"] = 0.0
            stats["meets_threshold"] = False
        stats["individual_coverage_scores"] = coverage_scores

        category_stats[category] = stats

    # Calculate overall statistics
    all_coverage_scores, overall_stats = base_stats(
        all_successful_results, len(self.coverage_results)
    )
    overall_meets = bool(all_coverage_scores) and overall_stats["average_coverage"] >= threshold
    overall_stats["meets_threshold"] = overall_meets
    overall_stats["target_compliance"] = overall_meets

    return {
        "category_results": category_stats,
        "overall_results": overall_stats,
        "timestamp": datetime.now().isoformat()
    }
|
426 |
+
|
427 |
+
def save_coverage_statistics(self, filename: str = None) -> str:
    """
    Save coverage statistics for chart generation.

    Writes the output of ``calculate_coverage_statistics()`` as indented
    UTF-8 JSON into a "results" directory next to this file, creating the
    directory if needed.

    Args:
        filename: Output file name; defaults to
            "coverage_statistics_<YYYYmmdd_HHMMSS>.json"

    Returns:
        Path of the written file, as a string
    """
    statistics = self.calculate_coverage_statistics()

    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"coverage_statistics_{timestamp}.json"

    # Ensure results directory exists
    output_dir = Path(__file__).parent / "results"
    output_dir.mkdir(exist_ok=True)
    target = output_dir / filename

    with target.open('w', encoding='utf-8') as handle:
        json.dump(statistics, handle, indent=2, ensure_ascii=False)

    print(f"📊 Coverage statistics saved to: {target}")
    return str(target)
|
446 |
+
|
447 |
+
def save_coverage_details(self, filename: str = None) -> str:
    """
    Save detailed coverage results.

    Writes every entry of ``self.coverage_results`` together with a small
    metadata header as indented UTF-8 JSON into a "results" directory next
    to this file, creating the directory if needed.

    Args:
        filename: Output file name; defaults to
            "coverage_details_<YYYYmmdd_HHMMSS>.json"

    Returns:
        Path of the written file, as a string
    """
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"coverage_details_{timestamp}.json"

    # Ensure results directory exists
    output_dir = Path(__file__).parent / "results"
    output_dir.mkdir(exist_ok=True)
    target = output_dir / filename

    # Create comprehensive coverage data
    successful_count = sum(
        1 for entry in self.coverage_results if entry.get('pipeline_success')
    )
    metadata = {
        "total_queries": len(self.coverage_results),
        "successful_evaluations": successful_count,
        "timestamp": datetime.now().isoformat(),
        "evaluator_type": "retrieval_coverage",
        "threshold_used": 0.6
    }
    payload = {
        "evaluation_metadata": metadata,
        "coverage_results": self.coverage_results
    }

    with target.open('w', encoding='utf-8') as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)

    print(f"📝 Coverage details saved to: {target}")
    return str(target)
|
476 |
+
|
477 |
+
|
478 |
+
# Independent execution interface
|
479 |
+
if __name__ == "__main__":
    # Independent coverage evaluation interface.
    # Usage: python coverage_evaluator.py [query_file.txt]
    # Runs the full pipeline for every query in the file, pausing between
    # queries and between categories, then saves and prints the statistics.
    import time

    print("📈 OnCall.ai Coverage Evaluator - Retrieval Coverage Analysis")

    if len(sys.argv) > 1:
        query_file = sys.argv[1]
    else:
        # Default to evaluation/pre_user_query_evaluate.txt
        query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"

    if not os.path.exists(query_file):
        print(f"❌ Query file not found: {query_file}")
        print("Usage: python coverage_evaluator.py [query_file.txt]")
        sys.exit(1)

    # Initialize evaluator
    evaluator = CoverageEvaluator()

    # Parse queries from file
    queries_by_category = evaluator.parse_queries_from_file(str(query_file))

    if "error" in queries_by_category:
        print(f"❌ Failed to parse queries: {queries_by_category['error']}")
        sys.exit(1)

    # Test coverage for each query (requires full pipeline)
    print(f"\n🧪 Retrieval Coverage Testing (Full Pipeline Required)")
    print(f"⚠️ Note: This evaluator requires LLM calls for advice generation")

    # Only categories that actually contain queries are scheduled, so the
    # inter-category pause is taken only when more work really follows.
    pending_categories = [
        (category, queries)
        for category, queries in queries_by_category.items()
        if queries
    ]

    for position, (category, queries) in enumerate(pending_categories):
        print(f"\n📂 Testing {category.upper()} coverage:")

        for i, query_info in enumerate(queries):
            # Test coverage (requires full pipeline); the result is stored
            # on the evaluator, so the return value is not needed here.
            evaluator.evaluate_single_coverage(query_info['text'], category)

            # Pause between queries to avoid rate limits
            if i < len(queries) - 1:
                print(f" ⏳ Pausing 5s before next query...")
                time.sleep(5)

        # Longer pause between categories
        if position < len(pending_categories) - 1:
            print(f"\n⏳ Pausing 10s before next category...")
            time.sleep(10)

    # Generate and save results
    print(f"\n📊 Generating coverage analysis...")

    # Save statistics and details
    stats_path = evaluator.save_coverage_statistics()
    details_path = evaluator.save_coverage_details()

    # Print final summary
    stats = evaluator.calculate_coverage_statistics()
    category_results = stats['category_results']
    overall_results = stats['overall_results']

    print(f"\n📊 === COVERAGE EVALUATION SUMMARY ===")
    print(f"Overall Performance:")
    print(f" Average Coverage: {overall_results['average_coverage']:.3f} ({overall_results['average_coverage']*100:.1f}%)")
    print(f" Pipeline Success Rate: {overall_results['success_rate']:.1%}")
    print(f" 60% Threshold: {'✅ Met' if overall_results['meets_threshold'] else '❌ Not Met'}")

    print(f"\nCategory Breakdown:")
    for category, cat_stats in category_results.items():
        if cat_stats['total_queries'] > 0:
            print(f" {category.capitalize()}: {cat_stats['average_coverage']:.3f} "
                  f"({cat_stats['successful_evaluations']}/{cat_stats['total_queries']}) "
                  f"[R:{cat_stats['average_retrieval_time']:.2f}s, G:{cat_stats['average_generation_time']:.2f}s]")

    print(f"\n✅ Coverage evaluation complete!")
    print(f"📊 Statistics: {stats_path}")
    print(f"📝 Details: {details_path}")
|
evaluation/{evaluation_instruction.md → old/evaluation_instruction.md}
RENAMED
@@ -1,4 +1,5 @@
|
|
1 |
# Model use
|
|
|
2 |
llm model: (for comparison) with our-own version.
|
3 |
https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B
|
4 |
https://huggingface.co/m42-health/Llama3-Med42-70B
|
@@ -12,59 +13,59 @@ https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct
|
|
12 |
"""
|
13 |
```
|
14 |
|
15 |
-
|
16 |
### 評估執行流程
|
|
|
17 |
```python
|
18 |
def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str, Any]:
|
19 |
"""執行完整的六項指標評估"""
|
20 |
-
|
21 |
results = {
|
22 |
"model": model_name,
|
23 |
"metrics": {},
|
24 |
"detailed_results": []
|
25 |
}
|
26 |
-
|
27 |
total_latencies = []
|
28 |
extraction_successes = []
|
29 |
relevance_scores = []
|
30 |
coverage_scores = []
|
31 |
actionability_scores = []
|
32 |
evidence_scores = []
|
33 |
-
|
34 |
for query in test_cases:
|
35 |
# 運行模型並測量所有指標
|
36 |
start_time = time.time()
|
37 |
-
|
38 |
# 1. 總處理時長
|
39 |
latency_result = measure_total_latency(query)
|
40 |
total_latencies.append(latency_result['total_latency'])
|
41 |
-
|
42 |
# 2. 條件抽取成功率
|
43 |
extraction_result = evaluate_condition_extraction([query])
|
44 |
extraction_successes.append(extraction_result['success_rate'])
|
45 |
-
|
46 |
# 3 & 4. 檢索相關性和覆蓋率(需要實際檢索結果)
|
47 |
retrieval_results = get_retrieval_results(query)
|
48 |
relevance_result = evaluate_retrieval_relevance(retrieval_results)
|
49 |
relevance_scores.append(relevance_result['average_relevance'])
|
50 |
-
|
51 |
generated_advice = get_generated_advice(query, retrieval_results)
|
52 |
coverage_result = evaluate_retrieval_coverage(generated_advice, retrieval_results)
|
53 |
coverage_scores.append(coverage_result['coverage'])
|
54 |
-
|
55 |
# 5 & 6. LLM 評估(需要完整回應)
|
56 |
response_data = {
|
57 |
'query': query,
|
58 |
'advice': generated_advice,
|
59 |
'retrieval_results': retrieval_results
|
60 |
}
|
61 |
-
|
62 |
actionability_result = evaluate_clinical_actionability([response_data])
|
63 |
actionability_scores.append(actionability_result[0]['overall_score'])
|
64 |
-
|
65 |
evidence_result = evaluate_clinical_evidence([response_data])
|
66 |
evidence_scores.append(evidence_result[0]['overall_score'])
|
67 |
-
|
68 |
# 記錄詳細結果
|
69 |
results["detailed_results"].append({
|
70 |
"query": query,
|
@@ -75,7 +76,7 @@ def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str,
|
|
75 |
"actionability": actionability_result[0],
|
76 |
"evidence": evidence_result[0]
|
77 |
})
|
78 |
-
|
79 |
# 計算平均指標
|
80 |
results["metrics"] = {
|
81 |
"average_latency": sum(total_latencies) / len(total_latencies),
|
@@ -85,7 +86,7 @@ def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str,
|
|
85 |
"average_actionability": sum(actionability_scores) / len(actionability_scores),
|
86 |
"average_evidence_score": sum(evidence_scores) / len(evidence_scores)
|
87 |
}
|
88 |
-
|
89 |
return results
|
90 |
```
|
91 |
|
@@ -94,41 +95,43 @@ def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str,
|
|
94 |
## 📈 評估結果分析框架
|
95 |
|
96 |
### 統計分析
|
|
|
97 |
```python
|
98 |
def analyze_evaluation_results(results_A: Dict, results_B: Dict, results_C: Dict) -> Dict:
|
99 |
"""比較三個模型的評估結果"""
|
100 |
-
|
101 |
models = ['Med42-70B_direct', 'RAG_enhanced', 'OpenBioLLM-70B']
|
102 |
metrics = ['latency', 'extraction_success_rate', 'relevance', 'coverage', 'actionability', 'evidence_score']
|
103 |
-
|
104 |
comparison = {}
|
105 |
-
|
106 |
for metric in metrics:
|
107 |
comparison[metric] = {
|
108 |
models[0]: results_A['metrics'][f'average_{metric}'],
|
109 |
models[1]: results_B['metrics'][f'average_{metric}'],
|
110 |
models[2]: results_C['metrics'][f'average_{metric}']
|
111 |
}
|
112 |
-
|
113 |
# 計算相對改進
|
114 |
baseline = comparison[metric][models[0]]
|
115 |
rag_improvement = ((comparison[metric][models[1]] - baseline) / baseline) * 100
|
116 |
-
|
117 |
comparison[metric]['rag_improvement_percent'] = rag_improvement
|
118 |
-
|
119 |
return comparison
|
120 |
```
|
121 |
|
122 |
### 報告生成
|
|
|
123 |
```python
|
124 |
def generate_evaluation_report(comparison_results: Dict) -> str:
|
125 |
"""生成評估報告"""
|
126 |
-
|
127 |
report = f"""
|
128 |
# OnCall.ai 系統評估報告
|
129 |
-
|
130 |
## 評估摘要
|
131 |
-
|
132 |
| 指標 | Med42-70B | RAG增強版 | OpenBioLLM | RAG改進% |
|
133 |
|------|-----------|-----------|------------|----------|
|
134 |
| 處理時長 | {comparison_results['latency']['Med42-70B_direct']:.2f}s | {comparison_results['latency']['RAG_enhanced']:.2f}s | {comparison_results['latency']['OpenBioLLM-70B']:.2f}s | {comparison_results['latency']['rag_improvement_percent']:+.1f}% |
|
@@ -137,9 +140,9 @@ def generate_evaluation_report(comparison_results: Dict) -> str:
|
|
137 |
| 檢索覆蓋率 | - | {comparison_results['coverage']['RAG_enhanced']:.1%} | - | - |
|
138 |
| 臨床可操作性 | {comparison_results['actionability']['Med42-70B_direct']:.1f}/10 | {comparison_results['actionability']['RAG_enhanced']:.1f}/10 | {comparison_results['actionability']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['actionability']['rag_improvement_percent']:+.1f}% |
|
139 |
| 臨床證據評分 | {comparison_results['evidence_score']['Med42-70B_direct']:.1f}/10 | {comparison_results['evidence_score']['RAG_enhanced']:.1f}/10 | {comparison_results['evidence_score']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['evidence_score']['rag_improvement_percent']:+.1f}% |
|
140 |
-
|
141 |
"""
|
142 |
-
|
143 |
return report
|
144 |
```
|
145 |
|
@@ -148,6 +151,7 @@ def generate_evaluation_report(comparison_results: Dict) -> str:
|
|
148 |
## 🔧 實驗執行步驟
|
149 |
|
150 |
### 1. 環境準備
|
|
|
151 |
```bash
|
152 |
# 設置 HuggingFace token(用於 Inference Providers)
|
153 |
export HF_TOKEN=your_huggingface_token
|
@@ -157,48 +161,49 @@ export ONCALL_EVAL_MODE=true
|
|
157 |
```
|
158 |
|
159 |
### 2. 實驗執行腳本框架
|
|
|
160 |
```python
|
161 |
# evaluation/run_evaluation.py
|
162 |
def main():
|
163 |
"""主要評估執行函數"""
|
164 |
-
|
165 |
# 加載測試用例
|
166 |
test_cases = MEDICAL_TEST_CASES
|
167 |
-
|
168 |
# 實驗 A: YanBo 系統評估
|
169 |
print("🔬 開始實驗 A: YanBo 系統評估")
|
170 |
results_med42_direct = run_complete_evaluation("Med42-70B_direct", test_cases)
|
171 |
-
results_general_rag = run_complete_evaluation("Med42-70B_general_RAG", test_cases)
|
172 |
results_openbio = run_complete_evaluation("OpenBioLLM-70B", test_cases)
|
173 |
-
|
174 |
# 分析和報告
|
175 |
comparison_A = analyze_evaluation_results(results_med42_direct, results_general_rag, results_openbio)
|
176 |
report_A = generate_evaluation_report(comparison_A)
|
177 |
-
|
178 |
# 保存結果
|
179 |
save_results("evaluation/results/yanbo_evaluation.json", {
|
180 |
"comparison": comparison_A,
|
181 |
"detailed_results": [results_med42_direct, results_general_rag, results_openbio]
|
182 |
})
|
183 |
-
|
184 |
print("✅ 實驗 A 完成,結果已保存")
|
185 |
-
|
186 |
# 實驗 B: Jeff 系統評估
|
187 |
print("🔬 開始實驗 B: Jeff 系統評估")
|
188 |
results_med42_direct_b = run_complete_evaluation("Med42-70B_direct", test_cases)
|
189 |
results_customized_rag = run_complete_evaluation("Med42-70B_customized_RAG", test_cases)
|
190 |
results_openbio_b = run_complete_evaluation("OpenBioLLM-70B", test_cases)
|
191 |
-
|
192 |
# 分析和報告
|
193 |
comparison_B = analyze_evaluation_results(results_med42_direct_b, results_customized_rag, results_openbio_b)
|
194 |
report_B = generate_evaluation_report(comparison_B)
|
195 |
-
|
196 |
# 保存結果
|
197 |
save_results("evaluation/results/jeff_evaluation.json", {
|
198 |
"comparison": comparison_B,
|
199 |
"detailed_results": [results_med42_direct_b, results_customized_rag, results_openbio_b]
|
200 |
})
|
201 |
-
|
202 |
print("✅ 實驗 B 完成,結果已保存")
|
203 |
|
204 |
if __name__ == "__main__":
|
@@ -206,6 +211,7 @@ if __name__ == "__main__":
|
|
206 |
```
|
207 |
|
208 |
### 3. 預期評估時間
|
|
|
209 |
```
|
210 |
總評估時間估算:
|
211 |
├── 每個查詢處理時間:~30秒(包含LLM評估)
|
@@ -219,10 +225,11 @@ if __name__ == "__main__":
|
|
219 |
## 📊 評估成功標準
|
220 |
|
221 |
### 系統性能目標
|
|
|
222 |
```
|
223 |
✅ 達標條件:
|
224 |
1. 總處理時長 ≤ 30秒
|
225 |
-
2. 條件抽取成功率 ≥ 80%
|
226 |
3. 檢索相關性 ≥ 0.2
|
227 |
4. 檢索覆蓋率 ≥ 60%
|
228 |
5. 臨床可操作性 ≥ 7.0/10
|
@@ -234,6 +241,7 @@ if __name__ == "__main__":
|
|
234 |
```
|
235 |
|
236 |
### 比較分析重點
|
|
|
237 |
```
|
238 |
重點分析維度:
|
239 |
├── RAG 對處理時間的影響(可能增加延遲)
|
@@ -247,6 +255,7 @@ if __name__ == "__main__":
|
|
247 |
## 🛠️ 實施建議
|
248 |
|
249 |
### 分階段實施
|
|
|
250 |
```
|
251 |
階段1: 基礎指標實現(1-4項)
|
252 |
├── 利用現有 app.py 中的時間測量
|
@@ -268,6 +277,7 @@ if __name__ == "__main__":
|
|
268 |
```
|
269 |
|
270 |
### 實施注意事項
|
|
|
271 |
```
|
272 |
⚠️ 重要提醒:
|
273 |
1. 所有評估代碼應獨立於現有系統,避免影響正常運行
|
@@ -280,3 +290,412 @@ if __name__ == "__main__":
|
|
280 |
---
|
281 |
|
282 |
**評估指南完成。請根據此指南實施評估實驗。**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# Model use
|
2 |
+
|
3 |
llm model: (for comparison) with our-own version.
|
4 |
https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B
|
5 |
https://huggingface.co/m42-health/Llama3-Med42-70B
|
|
|
13 |
"""
|
14 |
```
|
15 |
|
|
|
16 |
### 評估執行流程
|
17 |
+
|
18 |
```python
|
19 |
def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str, Any]:
|
20 |
"""執行完整的六項指標評估"""
|
21 |
+
|
22 |
results = {
|
23 |
"model": model_name,
|
24 |
"metrics": {},
|
25 |
"detailed_results": []
|
26 |
}
|
27 |
+
|
28 |
total_latencies = []
|
29 |
extraction_successes = []
|
30 |
relevance_scores = []
|
31 |
coverage_scores = []
|
32 |
actionability_scores = []
|
33 |
evidence_scores = []
|
34 |
+
|
35 |
for query in test_cases:
|
36 |
# 運行模型並測量所有指標
|
37 |
start_time = time.time()
|
38 |
+
|
39 |
# 1. 總處理時長
|
40 |
latency_result = measure_total_latency(query)
|
41 |
total_latencies.append(latency_result['total_latency'])
|
42 |
+
|
43 |
# 2. 條件抽取成功率
|
44 |
extraction_result = evaluate_condition_extraction([query])
|
45 |
extraction_successes.append(extraction_result['success_rate'])
|
46 |
+
|
47 |
# 3 & 4. 檢索相關性和覆蓋率(需要實際檢索結果)
|
48 |
retrieval_results = get_retrieval_results(query)
|
49 |
relevance_result = evaluate_retrieval_relevance(retrieval_results)
|
50 |
relevance_scores.append(relevance_result['average_relevance'])
|
51 |
+
|
52 |
generated_advice = get_generated_advice(query, retrieval_results)
|
53 |
coverage_result = evaluate_retrieval_coverage(generated_advice, retrieval_results)
|
54 |
coverage_scores.append(coverage_result['coverage'])
|
55 |
+
|
56 |
# 5 & 6. LLM 評估(需要完整回應)
|
57 |
response_data = {
|
58 |
'query': query,
|
59 |
'advice': generated_advice,
|
60 |
'retrieval_results': retrieval_results
|
61 |
}
|
62 |
+
|
63 |
actionability_result = evaluate_clinical_actionability([response_data])
|
64 |
actionability_scores.append(actionability_result[0]['overall_score'])
|
65 |
+
|
66 |
evidence_result = evaluate_clinical_evidence([response_data])
|
67 |
evidence_scores.append(evidence_result[0]['overall_score'])
|
68 |
+
|
69 |
# 記錄詳細結果
|
70 |
results["detailed_results"].append({
|
71 |
"query": query,
|
|
|
76 |
"actionability": actionability_result[0],
|
77 |
"evidence": evidence_result[0]
|
78 |
})
|
79 |
+
|
80 |
# 計算平均指標
|
81 |
results["metrics"] = {
|
82 |
"average_latency": sum(total_latencies) / len(total_latencies),
|
|
|
86 |
"average_actionability": sum(actionability_scores) / len(actionability_scores),
|
87 |
"average_evidence_score": sum(evidence_scores) / len(evidence_scores)
|
88 |
}
|
89 |
+
|
90 |
return results
|
91 |
```
|
92 |
|
|
|
95 |
## 📈 評估結果分析框架
|
96 |
|
97 |
### 統計分析
|
98 |
+
|
99 |
```python
|
100 |
def analyze_evaluation_results(results_A: Dict, results_B: Dict, results_C: Dict) -> Dict:
    """Compare the averaged metrics of the three evaluated models.

    Args:
        results_A: Result dict of the baseline model (reported as
            'Med42-70B_direct'); must contain a 'metrics' mapping.
        results_B: Result dict of the RAG model (reported as 'RAG_enhanced').
        results_C: Result dict of the third model (reported as 'OpenBioLLM-70B').

    Returns:
        A dict keyed by metric name. Each entry maps the three model labels to
        their metric value and adds 'rag_improvement_percent', the relative
        change of the RAG model against the baseline in percent.

    Raises:
        KeyError: If a metric is stored under neither 'average_<metric>' nor
            '<metric>' in one of the 'metrics' mappings.
    """
    models = ['Med42-70B_direct', 'RAG_enhanced', 'OpenBioLLM-70B']
    metrics = ['latency', 'extraction_success_rate', 'relevance', 'coverage', 'actionability', 'evidence_score']

    def _metric_value(results: Dict, metric: str):
        # Most metrics are stored as 'average_<metric>', but the extraction
        # rate is stored under its bare name, so accept both spellings.
        stored = results['metrics']
        prefixed = f'average_{metric}'
        return stored[prefixed] if prefixed in stored else stored[metric]

    comparison = {}

    for metric in metrics:
        per_model = {
            label: _metric_value(results, metric)
            for label, results in zip(models, (results_A, results_B, results_C))
        }

        # Relative improvement of the RAG model over the baseline.
        baseline = per_model[models[0]]
        if baseline:
            rag_improvement = ((per_model[models[1]] - baseline) / baseline) * 100
        else:
            # A zero baseline has no defined relative change; report 0.0 so the
            # value stays numeric instead of raising ZeroDivisionError.
            rag_improvement = 0.0

        per_model['rag_improvement_percent'] = rag_improvement
        comparison[metric] = per_model

    return comparison
|
122 |
```
|
123 |
|
124 |
### 報告生成
|
125 |
+
|
126 |
```python
|
127 |
def generate_evaluation_report(comparison_results: Dict) -> str:
|
128 |
"""生成評估報告"""
|
129 |
+
|
130 |
report = f"""
|
131 |
# OnCall.ai 系統評估報告
|
132 |
+
|
133 |
## 評估摘要
|
134 |
+
|
135 |
| 指標 | Med42-70B | RAG增強版 | OpenBioLLM | RAG改進% |
|
136 |
|------|-----------|-----------|------------|----------|
|
137 |
| 處理時長 | {comparison_results['latency']['Med42-70B_direct']:.2f}s | {comparison_results['latency']['RAG_enhanced']:.2f}s | {comparison_results['latency']['OpenBioLLM-70B']:.2f}s | {comparison_results['latency']['rag_improvement_percent']:+.1f}% |
|
|
|
140 |
| 檢索覆蓋率 | - | {comparison_results['coverage']['RAG_enhanced']:.1%} | - | - |
|
141 |
| 臨床可操作性 | {comparison_results['actionability']['Med42-70B_direct']:.1f}/10 | {comparison_results['actionability']['RAG_enhanced']:.1f}/10 | {comparison_results['actionability']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['actionability']['rag_improvement_percent']:+.1f}% |
|
142 |
| 臨床證據評分 | {comparison_results['evidence_score']['Med42-70B_direct']:.1f}/10 | {comparison_results['evidence_score']['RAG_enhanced']:.1f}/10 | {comparison_results['evidence_score']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['evidence_score']['rag_improvement_percent']:+.1f}% |
|
143 |
+
|
144 |
"""
|
145 |
+
|
146 |
return report
|
147 |
```
|
148 |
|
|
|
151 |
## 🔧 實驗執行步驟
|
152 |
|
153 |
### 1. 環境準備
|
154 |
+
|
155 |
```bash
|
156 |
# 設置 HuggingFace token(用於 Inference Providers)
|
157 |
export HF_TOKEN=your_huggingface_token
|
|
|
161 |
```
|
162 |
|
163 |
### 2. 實驗執行腳本框架
|
164 |
+
|
165 |
```python
|
166 |
# evaluation/run_evaluation.py
|
167 |
def main():
    """Run experiment A and experiment B back to back and save their results.

    Each experiment evaluates the direct Med42-70B model, one RAG variant and
    OpenBioLLM-70B on the same test cases, compares the three result sets and
    writes the comparison plus the raw results to a JSON file.
    """
    # Load the test cases
    test_cases = MEDICAL_TEST_CASES

    # (start message, RAG model under test, output path, completion message)
    experiments = (
        (
            "🔬 開始實驗 A: YanBo 系統評估",
            "Med42-70B_general_RAG",
            "evaluation/results/yanbo_evaluation.json",
            "✅ 實驗 A 完成,結果已保存",
        ),
        (
            "🔬 開始實驗 B: Jeff 系統評估",
            "Med42-70B_customized_RAG",
            "evaluation/results/jeff_evaluation.json",
            "✅ 實驗 B 完成,結果已保存",
        ),
    )

    for start_message, rag_model, output_path, done_message in experiments:
        print(start_message)

        # Evaluation order: baseline, RAG variant, OpenBioLLM.
        model_results = [
            run_complete_evaluation(model_name, test_cases)
            for model_name in ("Med42-70B_direct", rag_model, "OpenBioLLM-70B")
        ]

        # Analysis and report; the report text is generated but not stored.
        comparison = analyze_evaluation_results(*model_results)
        generate_evaluation_report(comparison)

        # Save the results
        save_results(output_path, {
            "comparison": comparison,
            "detailed_results": model_results
        })

        print(done_message)
|
208 |
|
209 |
if __name__ == "__main__":
|
|
|
211 |
```
|
212 |
|
213 |
### 3. 預期評估時間
|
214 |
+
|
215 |
```
|
216 |
總評估時間估算:
|
217 |
├── 每個查詢處理時間:~30秒(包含LLM評估)
|
|
|
225 |
## 📊 評估成功標準
|
226 |
|
227 |
### 系統性能目標
|
228 |
+
|
229 |
```
|
230 |
✅ 達標條件:
|
231 |
1. 總處理時長 ≤ 30秒
|
232 |
+
2. 條件抽取成功率 ≥ 80%
|
233 |
3. 檢索相關性 ≥ 0.2
|
234 |
4. 檢索覆蓋率 ≥ 60%
|
235 |
5. 臨床可操作性 ≥ 7.0/10
|
|
|
241 |
```
|
242 |
|
243 |
### 比較分析重點
|
244 |
+
|
245 |
```
|
246 |
重點分析維度:
|
247 |
├── RAG 對處理時間的影響(可能增加延遲)
|
|
|
255 |
## 🛠️ 實施建議
|
256 |
|
257 |
### 分階段實施
|
258 |
+
|
259 |
```
|
260 |
階段1: 基礎指標實現(1-4項)
|
261 |
├── 利用現有 app.py 中的時間測量
|
|
|
277 |
```
|
278 |
|
279 |
### 實施注意事項
|
280 |
+
|
281 |
```
|
282 |
⚠️ 重要提醒:
|
283 |
1. 所有評估代碼應獨立於現有系統,避免影響正常運行
|
|
|
290 |
---
|
291 |
|
292 |
**評估指南完成。請根據此指南實施評估實驗。**
|
293 |
+
|
294 |
+
## Phase 1: Initial Assessment
|
295 |
+
|
296 |
+
### Step 1.1 - 分析您的說明
|
297 |
+
|
298 |
+
我重新理解了您的意思!
|
299 |
+
|
300 |
+
### Step 1.2 - 無不清楚的需求
|
301 |
+
|
302 |
+
### Step 1.3 - 確認理解(中文)
|
303 |
+
|
304 |
+
完全明白了!您的評估架構是:
|
305 |
+
|
306 |
+
## 🎯 **評估指標的測試層級**
|
307 |
+
|
308 |
+
### **單獨測試 Retrieval(指標 2, 3, 4)**
|
309 |
+
|
310 |
+
```python
|
311 |
+
# 只測試 RAG 系統,因為只有 RAG 系統有這些組件:
|
312 |
+
retrieval_only_metrics = [
|
313 |
+
"條件抽取成功率", # 只有您的系統有 user_prompt.py
|
314 |
+
"檢索相關性", # 只有 RAG 系統有檢索結果
|
315 |
+
"檢索覆蓋率" # 只有 RAG 系統有檢索→生成的映射
|
316 |
+
]
|
317 |
+
|
318 |
+
# 測試對象:
|
319 |
+
- Med42-70B_general_RAG(您的系統)✅
|
320 |
+
- Med42-70B_customized_RAG(Jeff系統)✅
|
321 |
+
- Med42-70B_direct(無RAG)❌ 沒有檢索組件
|
322 |
+
- OpenBioLLM-70B(無RAG)❌ 沒有檢索組件
|
323 |
+
```
|
324 |
+
|
325 |
+
### **三模型一起測試(指標 1, 5, 6)**
|
326 |
+
|
327 |
+
```python
|
328 |
+
# 所有模型都能測試的指標:
|
329 |
+
universal_metrics = [
|
330 |
+
"總處理時長", # 所有模型都有響應時間
|
331 |
+
"臨床可操作性", # Llama3-70B 評估所有模型的輸出
|
332 |
+
"臨床證據評分" # Llama3-70B 評估所有模型的輸出
|
333 |
+
]
|
334 |
+
|
335 |
+
# 測試對象:
|
336 |
+
- Med42-70B_direct ✅
|
337 |
+
- Med42-70B_general_RAG ✅
|
338 |
+
- OpenBioLLM-70B ✅
|
339 |
+
```
|
340 |
+
|
341 |
+
## 📊 **分層評估策略**
|
342 |
+
|
343 |
+
### **階段 1:RAG 系統內部評估**
|
344 |
+
|
345 |
+
```python
|
346 |
+
# 只測試有 RAG 的系統
|
347 |
+
rag_systems = ["Med42-70B_general_RAG", "Med42-70B_customized_RAG"]
|
348 |
+
|
349 |
+
# 測試 RAG 特有指標
|
350 |
+
for system in rag_systems:
|
351 |
+
evaluate_retrieval_metrics(system) # 指標 2, 3, 4
|
352 |
+
```
|
353 |
+
|
354 |
+
### **階段 2:全模型比較評估**
|
355 |
+
|
356 |
+
```python
|
357 |
+
# 測試所有三個模型
|
358 |
+
all_systems = ["Med42-70B_direct", "Med42-70B_general_RAG", "OpenBioLLM-70B"]
|
359 |
+
|
360 |
+
# 測試通用指標
|
361 |
+
for system in all_systems:
|
362 |
+
evaluate_universal_metrics(system) # 指標 1, 5, 6
|
363 |
+
```
|
364 |
+
|
365 |
+
### **階段 3:綜合分析**
|
366 |
+
|
367 |
+
```python
|
368 |
+
# 合併兩階段結果,生成完整報告
|
369 |
+
combine_evaluation_results()
|
370 |
+
```
|
371 |
+
|
372 |
+
### Step 1.4 - 無視覺修改確認 ✅
|
373 |
+
|
374 |
+
此為評估策略討論,無涉及代碼修改。
|
375 |
+
|
376 |
+
**您的理解完全正確!RAG 特有的指標只能在 RAG 系統內部測試,而通用指標可以跨所有模型比較。這樣的分層評估策略非常合理!**
|
377 |
+
|
378 |
+
---
|
379 |
+
|
380 |
+
## 📊 第七個評估指標(YanBo系統特有)
|
381 |
+
|
382 |
+
### 7. 多層級 Fallback 效率(早期攔截率)
|
383 |
+
|
384 |
+
**定義:** 系統通過多層級 Fallback 機制在早期層級成功處理查詢的效率
|
385 |
+
|
386 |
+
**測量位置:** `src/user_prompt.py` 的 `extract_condition_keywords` 多層級處理邏輯
|
387 |
+
|
388 |
+
**計算公式:**
|
389 |
+
```
|
390 |
+
Early_Interception_Rate = (Level1_Success + Level2_Success) / Total_Queries
|
391 |
+
|
392 |
+
其中:
|
393 |
+
- Level1_Success = 在預定義映射中直接找到條件的查詢數
|
394 |
+
- Level2_Success = 通過LLM抽取成功的查詢數
|
395 |
+
- Total_Queries = 測試查詢總數
|
396 |
+
|
397 |
+
時間節省效果:
|
398 |
+
Time_Savings = (Late_Avg_Time - Early_Avg_Time) / Late_Avg_Time
|
399 |
+
|
400 |
+
早期攔截效率:
|
401 |
+
Efficiency_Score = Early_Interception_Rate × (1 + Time_Savings)
|
402 |
+
```
|
403 |
+
|
404 |
+
**ASCII 流程圖:**
|
405 |
+
```
|
406 |
+
多層級 Fallback 效率示意圖:
|
407 |
+
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
408 |
+
│ 用戶查詢 │───▶│ Level 1 │───▶│ 直接成功 │
|
409 |
+
│ "胸痛診斷" │ │ 預定義映射 │ │ 35% (快) │
|
410 |
+
└─────────────┘ └─────────────┘ └─────────────┘
|
411 |
+
│
|
412 |
+
▼ (失敗)
|
413 |
+
┌─────────────┐ ┌─────────────┐
|
414 |
+
│ Level 2 │───▶│ LLM抽取成功 │
|
415 |
+
│ LLM 條件抽取│ │ 40% (中等) │
|
416 |
+
└─────────────┘ └─────────────┘
|
417 |
+
│
|
418 |
+
▼ (失敗)
|
419 |
+
┌─────────────┐ ┌─────────────┐
|
420 |
+
│ Level 3-5 │───▶│ 後備成功 │
|
421 |
+
│ 後續層級 │ │ 20% (慢) │
|
422 |
+
└─────────────┘ └─────────────┘
|
423 |
+
│
|
424 |
+
▼ (失敗)
|
425 |
+
┌─────────────┐
|
426 |
+
│ 完全失敗 │
|
427 |
+
│ 5% (錯誤) │
|
428 |
+
└─────────────┘
|
429 |
+
|
430 |
+
早期攔截率 = 35% + 40% = 75% ✅ 目標 ≥ 70%
|
431 |
+
```
|
432 |
+
|
433 |
+
**實現框架:**
|
434 |
+
```python
|
435 |
+
# 基於 user_prompt.py 的多層級處理邏輯
|
436 |
+
def evaluate_early_interception_efficiency(test_queries: List[str]) -> Dict[str, float]:
    """Measure how many queries are resolved by the first two fallback levels.

    Every query is passed to ``track_query_success_level``; levels 1 and 2
    count as "early" successes, levels 3-5 as "late" successes and anything
    else as a failure.

    Args:
        test_queries: Queries to evaluate. An empty list yields all-zero rates
            instead of raising ZeroDivisionError.

    Returns:
        A dict with the early interception rate, per-level success rates,
        average early/late processing times, the time savings rate, overall
        success/miss rates, the combined efficiency score and the raw counts
        under 'success_distribution'.
    """
    level1_success = 0   # Level 1: predefined mapping succeeded
    level2_success = 0   # Level 2: LLM extraction succeeded
    later_success = 0    # Levels 3-5: a later fallback level succeeded
    total_failures = 0   # No level succeeded

    early_times = []     # Processing times of early (level 1-2) successes
    late_times = []      # Processing times of late (level 3-5) successes

    for query in test_queries:
        # Track the level at which this query succeeded and how long it took.
        success_level, processing_time = track_query_success_level(query)

        if success_level == 1:
            level1_success += 1
            early_times.append(processing_time)
        elif success_level == 2:
            level2_success += 1
            early_times.append(processing_time)
        elif success_level in (3, 4, 5):
            later_success += 1
            late_times.append(processing_time)
        else:
            total_failures += 1

    total_queries = len(test_queries)
    early_success_count = level1_success + level2_success

    def _share(count: int) -> float:
        # Fraction of all queries; 0.0 when there were no queries at all.
        return count / total_queries if total_queries else 0.0

    # Time savings of early successes relative to late successes.
    early_avg_time = sum(early_times) / len(early_times) if early_times else 0
    late_avg_time = sum(late_times) / len(late_times) if late_times else 0
    time_savings = (late_avg_time - early_avg_time) / late_avg_time if late_avg_time > 0 else 0

    # Combined efficiency score.
    early_interception_rate = _share(early_success_count)
    efficiency_score = early_interception_rate * (1 + time_savings)

    return {
        # Core metrics
        "early_interception_rate": early_interception_rate,
        "level1_success_rate": _share(level1_success),
        "level2_success_rate": _share(level2_success),

        # Time efficiency
        "early_avg_time": early_avg_time,
        "late_avg_time": late_avg_time,
        "time_savings_rate": time_savings,

        # System health
        "total_success_rate": _share(total_queries - total_failures),
        "miss_rate": _share(total_failures),

        # Combined efficiency
        "overall_efficiency_score": efficiency_score,

        # Raw distribution
        "success_distribution": {
            "level1": level1_success,
            "level2": level2_success,
            "later_levels": later_success,
            "failures": total_failures
        }
    }
|
501 |
+
|
502 |
+
def track_query_success_level(query: str) -> Tuple[int, float]:
    """Report the first fallback level that handles *query* and the time taken.

    The levels are tried in order and the first one whose check is truthy
    wins. Level 0 is returned when no level succeeds or when any check raises.

    Args:
        query: Test query.

    Returns:
        Tuple of (success_level, processing_time), where processing_time is
        the wall-clock seconds elapsed since the function was entered.
    """
    started_at = time.time()

    def elapsed() -> float:
        return time.time() - started_at

    # Ordered (level, check) pairs mirroring the layered logic of
    # user_prompt.py; each check is only evaluated when reached.
    level_checks = (
        # Level 1: predefined mapping
        (1, lambda: check_predefined_mapping(query)),
        # Level 2: LLM condition extraction
        (2, lambda: llm_client.analyze_medical_query(query).get('extracted_condition')),
        # Level 3: semantic search
        (3, lambda: semantic_search_fallback(query)),
        # Level 4: medical validation; a falsy result is treated as "passed"
        (4, lambda: not validate_medical_query(query)),
        # Level 5: generic search
        (5, lambda: generic_medical_search(query)),
    )

    try:
        for level, succeeded in level_checks:
            if succeeded():
                return (level, elapsed())
    except Exception:
        # Any error during a check counts as a complete failure.
        return (0, elapsed())

    # Complete failure: no level handled the query.
    return (0, elapsed())
|
552 |
+
|
553 |
+
def check_predefined_mapping(query: str) -> bool:
    """Return True when any predefined keyword occurs in *query*.

    The comparison is case-insensitive and substring-based, using the keyword
    lists of ``CONDITION_KEYWORD_MAPPING`` from medical_conditions.py.
    """
    from medical_conditions import CONDITION_KEYWORD_MAPPING

    lowered_query = query.lower()
    return any(
        keyword.lower() in lowered_query
        for keyword_group in CONDITION_KEYWORD_MAPPING.values()
        for keyword in keyword_group
    )
|
563 |
+
```
|
564 |
+
|
565 |
+
**目標閾值:**
|
566 |
+
- 早期攔截率 ≥ 70%(前兩層解決)
|
567 |
+
- 時間節省率 ≥ 60%(早期比後期快)
|
568 |
+
- 總成功率 ≥ 95%(漏接率 < 5%)
|
569 |
+
|
570 |
+
---
|
571 |
+
|
572 |
+
## 🧪 更新的完整評估流程
|
573 |
+
|
574 |
+
### 測試用例設計
|
575 |
+
```python
|
576 |
+
# Test set designed around the example queries in readme.md
MEDICAL_TEST_CASES = [
    # Expected to succeed at Level 1 (predefined mapping)
    "患者胸痛怎麼處理?",
    "心肌梗死的診斷方法?",

    # Expected to succeed at Level 2 (LLM extraction)
    "60歲男性,有高血壓病史,突發胸痛。可能的原因和評估方法?",
    "30歲患者突發嚴重頭痛和頸部僵硬。鑑別診斷?",

    # Expected to succeed at Level 3 or later (complex queries)
    "患者急性呼吸困難和腿部水腫。應該考慮什麼?",
    "20歲女性,無病史,突發癲癇。可能原因和完整處理流程?",

    # Boundary test
    "疑似急性出血性中風。下一步處理?"
]
|
593 |
+
```
|
594 |
+
|
595 |
+
### 更新的評估執行流程
|
596 |
+
```python
|
597 |
+
def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str, Any]:
    """Run the seven-metric evaluation over *test_cases* and average the scores.

    Args:
        model_name: Label stored in the result. The fallback-efficiency metric
            is only measured when it equals "Med42-70B_general_RAG".
        test_cases: Queries to evaluate. An empty list yields 0.0 for every
            average instead of raising ZeroDivisionError.

    Returns:
        A dict with 'model', the averaged 'metrics' and 'detailed_results'
        (left empty here; per-query recording is not implemented).
    """
    results = {
        "model": model_name,
        "metrics": {},
        "detailed_results": []
    }

    total_latencies = []
    extraction_successes = []
    relevance_scores = []
    coverage_scores = []
    actionability_scores = []
    evidence_scores = []
    fallback_efficiency_scores = []

    def _mean(values: List[float]) -> float:
        # Average that tolerates an empty list.
        return sum(values) / len(values) if values else 0.0

    for query in test_cases:
        # 1. Total latency
        latency_result = measure_total_latency(query)
        total_latencies.append(latency_result['total_latency'])

        # 2. Condition extraction success rate
        extraction_result = evaluate_condition_extraction([query])
        extraction_successes.append(extraction_result['success_rate'])

        # 3 & 4. Retrieval relevance and coverage
        retrieval_results = get_retrieval_results(query)
        relevance_result = evaluate_retrieval_relevance(retrieval_results)
        relevance_scores.append(relevance_result['average_relevance'])

        generated_advice = get_generated_advice(query, retrieval_results)
        coverage_result = evaluate_retrieval_coverage(generated_advice, retrieval_results)
        coverage_scores.append(coverage_result['coverage'])

        # 5 & 6. LLM-judged metrics
        response_data = {
            'query': query,
            'advice': generated_advice,
            'retrieval_results': retrieval_results
        }

        actionability_result = evaluate_clinical_actionability([response_data])
        actionability_scores.append(actionability_result[0]['overall_score'])

        evidence_result = evaluate_clinical_evidence([response_data])
        evidence_scores.append(evidence_result[0]['overall_score'])

        # 7. Multi-level fallback efficiency (measured for the YanBo system only)
        if model_name == "Med42-70B_general_RAG":
            fallback_result = evaluate_early_interception_efficiency([query])
            fallback_efficiency_scores.append(fallback_result['overall_efficiency_score'])

        # TODO: record per-query detailed results (elided in the original guide).

    # Averaged metrics
    results["metrics"] = {
        "average_latency": _mean(total_latencies),
        "extraction_success_rate": _mean(extraction_successes),
        "average_relevance": _mean(relevance_scores),
        "average_coverage": _mean(coverage_scores),
        "average_actionability": _mean(actionability_scores),
        "average_evidence_score": _mean(evidence_scores),
        # Only populated for the model checked above; 0.0 otherwise.
        "average_fallback_efficiency": _mean(fallback_efficiency_scores)
    }

    return results
|
667 |
+
```
|
668 |
+
|
669 |
+
---
|
670 |
+
|
671 |
+
## 📊 更新的系統成功標準
|
672 |
+
|
673 |
+
### 系統性能目標(七個指標)
|
674 |
+
```
|
675 |
+
✅ 達標條件:
|
676 |
+
1. 總處理時長 ≤ 30秒
|
677 |
+
2. 條件抽取成功率 ≥ 80%
|
678 |
+
3. 檢索相關性 ≥ 0.25(基於實際醫學數據)
|
679 |
+
4. 檢索覆蓋率 ≥ 60%
|
680 |
+
5. 臨床可操作性 ≥ 7.0/10
|
681 |
+
6. 臨床證據評分 ≥ 7.5/10
|
682 |
+
7. 早期攔截率 ≥ 70%(多層級 Fallback 效率)
|
683 |
+
|
684 |
+
🎯 YanBo RAG 系統成功標準:
|
685 |
+
- RAG增強版在 5-7 項指標上優於基線 Med42-70B
|
686 |
+
- 早期攔截率體現多層級設計的優勢
|
687 |
+
- 整體提升幅度 ≥ 15%
|
688 |
+
```
|
689 |
+
|
690 |
+
### YanBo 系統特有優勢分析
|
691 |
+
```
|
692 |
+
多層級 Fallback 優勢:
|
693 |
+
├── 漏接防護:通過多層級降低失敗率至 < 5%
|
694 |
+
├── 時間優化:70%+ 查詢在前兩層快速解決
|
695 |
+
├── 系統穩定:即使某層級失敗,後續層級提供保障
|
696 |
+
└── 智能分流:不同複雜度查詢自動分配到合適層級
|
697 |
+
```
|
698 |
+
|
699 |
+
---
|
700 |
+
|
701 |
+
**第七個指標已添加完成,專注測量您的多層級 Fallback 系統的早期攔截效率和時間節省效果。**
|
evaluation/{evaluation_instruction_customization.md → old/evaluation_instruction_customization.md}
RENAMED
File without changes
|
evaluation/old/extraction_evaluator.py
ADDED
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
OnCall.ai System - Condition Extraction Evaluator (Metric 2)
|
4 |
+
============================================================
|
5 |
+
|
6 |
+
Evaluates condition extraction success rate from user_prompt.py
|
7 |
+
Pure automatic evaluation based on extract_condition_keywords() results
|
8 |
+
|
9 |
+
Author: YanBo Chen
|
10 |
+
Date: 2025-08-04
|
11 |
+
"""
|
12 |
+
|
13 |
+
import json
|
14 |
+
import os
|
15 |
+
import sys
|
16 |
+
from typing import Dict, List, Any
|
17 |
+
from datetime import datetime
|
18 |
+
from pathlib import Path
|
19 |
+
import re
|
20 |
+
|
21 |
+
# Add project path
|
22 |
+
current_dir = Path(__file__).parent
|
23 |
+
project_root = current_dir.parent
|
24 |
+
src_dir = project_root / "src"
|
25 |
+
sys.path.insert(0, str(src_dir))
|
26 |
+
|
27 |
+
# Import existing system components
|
28 |
+
try:
|
29 |
+
from user_prompt import UserPromptProcessor
|
30 |
+
from retrieval import BasicRetrievalSystem
|
31 |
+
from llm_clients import llm_Med42_70BClient
|
32 |
+
except ImportError as e:
|
33 |
+
print(f"❌ Import failed: {e}")
|
34 |
+
print("Please ensure running from project root directory")
|
35 |
+
sys.exit(1)
|
36 |
+
|
37 |
+
|
38 |
+
class ExtractionEvaluator:
    """Condition extraction success rate evaluator - pure automatic evaluation.

    Each evaluated query is passed to
    ``UserPromptProcessor.extract_condition_keywords`` and its outcome is
    appended to ``self.extraction_results``. The accumulated results can be
    summarised per category and written to JSON files in a ``results``
    directory next to this file.
    """

    # Categories that get their own entry in the statistics output.
    CATEGORIES = ("diagnosis", "treatment", "mixed")

    # Query statuses that mark an extraction as failed.
    REJECTED_STATUSES = ('invalid_query', 'non_medical')

    # Minimum overall success rate for "target_compliance".
    TARGET_SUCCESS_RATE = 0.8

    def __init__(self):
        """Initialize system components for extraction testing"""
        print("🔧 Initializing Extraction Evaluator...")

        # Initialize required components for extraction
        self.llm_client = llm_Med42_70BClient()
        self.retrieval_system = BasicRetrievalSystem()
        self.user_prompt_processor = UserPromptProcessor(
            llm_client=self.llm_client,
            retrieval_system=self.retrieval_system
        )

        # Results accumulation
        self.extraction_results = []

        print("✅ Extraction Evaluator initialization complete")

    def evaluate_single_extraction(self, query: str, category: str = "unknown") -> Dict[str, Any]:
        """
        Evaluate condition extraction success for a single query

        Tests user_prompt.py extract_condition_keywords() method. The result
        is appended to ``self.extraction_results`` and also returned.

        Args:
            query: Medical query to test
            category: Query category (diagnosis/treatment/mixed)

        Returns:
            Result dict whose "extraction_success" is always a bool. When the
            extraction call raises, the dict carries an "error" message and
            "extraction_success" is False.
        """
        print(f"🔍 Testing extraction for: {query[:50]}...")
        print(f"📋 Category: {category}")

        try:
            # Call the actual extraction method from user_prompt.py
            extraction_start = datetime.now()
            condition_result = self.user_prompt_processor.extract_condition_keywords(query)
            extraction_time = (datetime.now() - extraction_start).total_seconds()

            # Analyze extraction success
            extracted_condition = condition_result.get('condition')
            query_status = condition_result.get('query_status')
            emergency_keywords = condition_result.get('emergency_keywords', [])
            treatment_keywords = condition_result.get('treatment_keywords', [])
            fallback_level = condition_result.get('fallback_level', 'unknown')

            # Success needs a non-blank condition other than "unknown" and a
            # status that is not rejected. bool() keeps the stored value a real
            # boolean; the bare `and` chain would yield None or "" on failure.
            is_successful = bool(
                extracted_condition
                and extracted_condition.strip()
                and extracted_condition != "unknown"
                and query_status not in self.REJECTED_STATUSES
            )

            result = {
                "query": query,
                "category": category,
                "extraction_success": is_successful,
                "extraction_time": extraction_time,
                "extracted_condition": extracted_condition,
                "query_status": query_status,
                "emergency_keywords": emergency_keywords,
                "treatment_keywords": treatment_keywords,
                "fallback_level": fallback_level,
                "full_condition_result": condition_result,
                "timestamp": datetime.now().isoformat()
            }

            # Store result
            self.extraction_results.append(result)

            print(f" ✅ Extraction: {'Success' if is_successful else 'Failed'}")
            print(f" 📝 Condition: {extracted_condition}")
            print(f" 🎯 Status: {query_status}")
            print(f" ⏱️ Time: {extraction_time:.3f}s")
            print(f" 🔄 Fallback Level: {fallback_level}")

            return result

        except Exception as e:
            # Evaluation boundary: a failing query is recorded, not propagated,
            # so the remaining queries can still be evaluated.
            error_result = {
                "query": query,
                "category": category,
                "extraction_success": False,
                "extraction_time": 0.0,
                "error": str(e),
                "timestamp": datetime.now().isoformat()
            }

            self.extraction_results.append(error_result)
            print(f" ❌ Extraction failed: {e}")

            return error_result

    def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
        """Parse queries from file with category labels

        Lines are expected in the form "1.diagnosis: query text"; the labels
        "mixed/complicated" and "mixed" are both stored under "mixed". Lines
        that do not match, and queries of 15 characters or fewer, are ignored.

        Args:
            filepath: Path of the UTF-8 text file to read.

        Returns:
            Dict mapping "diagnosis", "treatment" and "mixed" to lists of
            {"text", "category"} dicts, or {"error": message} when the file
            cannot be read.
        """
        print(f"📁 Reading queries from file: {filepath}")

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"❌ Failed to read file: {e}")
            return {"error": f"Failed to read file: {e}"}

        # Parse queries with category labels
        queries_by_category = {category: [] for category in self.CATEGORIES}

        for line in content.strip().split('\n'):
            line = line.strip()
            if not line:
                continue

            # Parse format: "1.diagnosis: query text"
            match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
            if not match:
                continue

            category_raw = match.group(1).lower()
            query_text = match.group(2).strip()

            # Normalize category name
            category = 'mixed' if category_raw in ('mixed/complicated', 'mixed') else category_raw

            if category in queries_by_category and len(query_text) > 15:
                queries_by_category[category].append({
                    "text": query_text,
                    "category": category
                })

        print(f"📋 Parsed queries by category:")
        for category, category_queries in queries_by_category.items():
            print(f" {category.capitalize()}: {len(category_queries)} queries")

        return queries_by_category

    @staticmethod
    def _summarize(results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Return success rate, counts and average time for *results* (zeros when empty)."""
        total = len(results)
        successful = sum(1 for r in results if r.get('extraction_success'))
        total_time = sum(r.get('extraction_time', 0) for r in results)
        return {
            "success_rate": successful / total if total else 0.0,
            "successful_count": successful,
            "total_count": total,
            "average_extraction_time": total_time / total if total else 0.0,
        }

    def calculate_extraction_statistics(self) -> Dict[str, Any]:
        """Calculate extraction success statistics by category

        Returns:
            Dict with "category_results" (one entry per known category),
            "overall_results" and a "timestamp". The overall figures cover
            every recorded result, including those stored under a category
            outside the known three (such as the default "unknown"), so that
            the total matches the number of evaluated queries.
        """
        category_stats = {}
        for category in self.CATEGORIES:
            in_category = [
                r for r in self.extraction_results
                if r.get('category', 'unknown') == category
            ]
            summary = self._summarize(in_category)
            summary["fallback_levels"] = [r.get('fallback_level') for r in in_category]
            category_stats[category] = summary

        overall_stats = self._summarize(self.extraction_results)
        overall_stats["target_compliance"] = (
            bool(self.extraction_results)
            and overall_stats["success_rate"] >= self.TARGET_SUCCESS_RATE
        )

        return {
            "category_results": category_stats,
            "overall_results": overall_stats,
            "timestamp": datetime.now().isoformat()
        }

    def _write_results_file(self, payload: Dict[str, Any], filename, default_prefix: str) -> Path:
        """Write *payload* as JSON into the results directory and return its path."""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{default_prefix}_{timestamp}.json"

        # Ensure results directory exists
        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(exist_ok=True)

        filepath = results_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)

        return filepath

    def save_extraction_statistics(self, filename: str = None) -> str:
        """Save extraction statistics for chart generation

        Args:
            filename: Target file name inside the results directory; a
                timestamped "extraction_statistics_*.json" name is generated
                when omitted.

        Returns:
            Path of the written file as a string.
        """
        stats = self.calculate_extraction_statistics()
        filepath = self._write_results_file(stats, filename, "extraction_statistics")

        print(f"📊 Extraction statistics saved to: {filepath}")
        return str(filepath)

    def save_extraction_details(self, filename: str = None) -> str:
        """Save detailed extraction results

        Args:
            filename: Target file name inside the results directory; a
                timestamped "extraction_details_*.json" name is generated
                when omitted.

        Returns:
            Path of the written file as a string.
        """
        # Create comprehensive extraction data
        extraction_data = {
            "evaluation_metadata": {
                "total_queries": len(self.extraction_results),
                "timestamp": datetime.now().isoformat(),
                "evaluator_type": "condition_extraction"
            },
            "extraction_results": self.extraction_results
        }

        filepath = self._write_results_file(extraction_data, filename, "extraction_details")

        print(f"📝 Extraction details saved to: {filepath}")
        return str(filepath)
|
295 |
+
|
296 |
+
|
297 |
+
# Independent execution interface
|
298 |
+
if __name__ == "__main__":
    # Stand-alone entry point: evaluates every query of a query file, pausing
    # between queries and categories, then saves and prints the statistics.
    import time

    print("🔍 OnCall.ai Extraction Evaluator - Condition Extraction Success Rate")

    # The query file comes from the first CLI argument, falling back to
    # evaluation/pre_user_query_evaluate.txt next to this script.
    query_file = (
        sys.argv[1]
        if len(sys.argv) > 1
        else Path(__file__).parent / "pre_user_query_evaluate.txt"
    )

    if not os.path.exists(query_file):
        print(f"❌ Query file not found: {query_file}")
        print("Usage: python extraction_evaluator.py [query_file.txt]")
        sys.exit(1)

    # Initialize evaluator
    evaluator = ExtractionEvaluator()

    # Parse queries from file
    queries_by_category = evaluator.parse_queries_from_file(str(query_file))

    if "error" in queries_by_category:
        print(f"❌ Failed to parse queries: {queries_by_category['error']}")
        sys.exit(1)

    # Test extraction for each query
    print(f"\n🧪 Condition Extraction Testing")

    # No pause is needed after the final category.
    last_category = next(reversed(queries_by_category), None)

    for category, queries in queries_by_category.items():
        if not queries:
            continue

        print(f"\n📂 Testing {category.upper()} extraction:")

        final_index = len(queries) - 1
        for position, query_info in enumerate(queries):
            evaluator.evaluate_single_extraction(query_info['text'], category)

            # Pause between queries to avoid rate limits (if needed)
            if position < final_index:
                print(f" ⏳ Pausing 3s before next query...")
                time.sleep(3)

        # Pause between categories
        if category != last_category:
            print(f"\n⏳ Pausing 5s before next category...")
            time.sleep(5)

    # Generate and save results
    print(f"\n📊 Generating extraction analysis...")

    # Save statistics and details
    stats_path = evaluator.save_extraction_statistics()
    details_path = evaluator.save_extraction_details()

    # Print final summary
    stats = evaluator.calculate_extraction_statistics()
    category_results = stats['category_results']
    overall_results = stats['overall_results']

    print(f"\n📊 === EXTRACTION EVALUATION SUMMARY ===")
    print(f"Overall Performance:")
    print(f" Success Rate: {overall_results['success_rate']:.1%}")
    print(f" Successful Extractions: {overall_results['successful_count']}/{overall_results['total_count']}")
    print(f" Average Extraction Time: {overall_results['average_extraction_time']:.3f}s")
    print(f" 80% Target Compliance: {'✅ Met' if overall_results['target_compliance'] else '❌ Not Met'}")

    print(f"\nCategory Breakdown:")
    for category, cat_stats in category_results.items():
        if cat_stats['total_count'] <= 0:
            continue
        print(f" {category.capitalize()}: {cat_stats['success_rate']:.1%} "
              f"({cat_stats['successful_count']}/{cat_stats['total_count']}) "
              f"[{cat_stats['average_extraction_time']:.3f}s avg]")

    print(f"\n✅ Extraction evaluation complete!")
    print(f"📊 Statistics: {stats_path}")
    print(f"📝 Details: {details_path}")
|
evaluation/old/relevance_evaluator.py
ADDED
@@ -0,0 +1,447 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
OnCall.ai System - Retrieval Relevance Evaluator (Metric 3)
|
4 |
+
===========================================================
|
5 |
+
|
6 |
+
Evaluates retrieval relevance using cosine similarity from retrieval.py
|
7 |
+
Automatic evaluation based on existing similarity scores with optional LLM sampling
|
8 |
+
|
9 |
+
Author: YanBo Chen
|
10 |
+
Date: 2025-08-04
|
11 |
+
"""
|
12 |
+
|
13 |
+
import json
|
14 |
+
import os
|
15 |
+
import sys
|
16 |
+
from typing import Dict, List, Any
|
17 |
+
from datetime import datetime
|
18 |
+
from pathlib import Path
|
19 |
+
import re
|
20 |
+
import numpy as np
|
21 |
+
|
22 |
+
# Add project path
|
23 |
+
current_dir = Path(__file__).parent
|
24 |
+
project_root = current_dir.parent
|
25 |
+
src_dir = project_root / "src"
|
26 |
+
sys.path.insert(0, str(src_dir))
|
27 |
+
|
28 |
+
# Import existing system components
|
29 |
+
try:
|
30 |
+
from user_prompt import UserPromptProcessor
|
31 |
+
from retrieval import BasicRetrievalSystem
|
32 |
+
from llm_clients import llm_Med42_70BClient
|
33 |
+
except ImportError as e:
|
34 |
+
print(f"❌ Import failed: {e}")
|
35 |
+
print("Please ensure running from project root directory")
|
36 |
+
sys.exit(1)
|
37 |
+
|
38 |
+
|
39 |
+
class RelevanceEvaluator:
    """Retrieval relevance evaluator using cosine similarity - automatic evaluation.

    For every query the evaluator asks ``UserPromptProcessor`` for condition
    keywords, runs ``BasicRetrievalSystem.search`` with them and aggregates
    the scores attached to the returned documents. Per-query results are
    accumulated in ``self.relevance_results`` and can be summarised by
    category or written to JSON files.
    """

    # Categories accepted in query files and reported in the statistics.
    CATEGORIES = ("diagnosis", "treatment", "mixed")
    # Minimum acceptable relevance score (per-document and averaged).
    RELEVANCE_THRESHOLD = 0.2
    # Stricter overall target reported as "target_compliance".
    TARGET_RELEVANCE = 0.25
    # Query line format: "1.diagnosis: query text"
    _QUERY_LINE = re.compile(
        r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)',
        re.IGNORECASE,
    )

    def __init__(self):
        """Initialize system components for relevance testing"""
        print("🔧 Initializing Relevance Evaluator...")

        # Initialize required components
        self.llm_client = llm_Med42_70BClient()
        self.retrieval_system = BasicRetrievalSystem()
        self.user_prompt_processor = UserPromptProcessor(
            llm_client=self.llm_client,
            retrieval_system=self.retrieval_system
        )

        # Results accumulation
        self.relevance_results = []

        print("✅ Relevance Evaluator initialization complete")

    def evaluate_single_relevance(self, query: str, category: str = "unknown") -> Dict[str, Any]:
        """
        Evaluate retrieval relevance for a single query

        Uses the scores already attached to the retrieval results.

        Args:
            query: Medical query to test
            category: Query category (diagnosis/treatment/mixed)

        Returns:
            Result dictionary for this query. It is also appended to
            ``self.relevance_results``. On any failure the dictionary has
            ``retrieval_success`` set to False and an ``error`` message.
        """
        print(f"🔍 Testing relevance for: {query[:50]}...")
        print(f"📋 Category: {category}")

        try:
            # Step 1: Extract condition for search query construction
            condition_result = self.user_prompt_processor.extract_condition_keywords(query)

            # Step 2: Build the search query from the extracted keywords.
            # Missing/None keywords are skipped so they never end up as the
            # literal text "None" in the search query.
            keyword_parts = (
                condition_result.get('emergency_keywords', ''),
                condition_result.get('treatment_keywords', ''),
            )
            search_query = " ".join(str(part) for part in keyword_parts if part).strip()
            if not search_query:
                # Fall back to the condition, then to the raw query when the
                # condition is missing or empty.
                search_query = condition_result.get('condition') or query

            retrieval_start = datetime.now()
            retrieval_results = self.retrieval_system.search(search_query, top_k=5)
            retrieval_time = (datetime.now() - retrieval_start).total_seconds()

            # Step 3: Extract scores from retrieval results
            processed_results = retrieval_results.get('processed_results', [])

            if not processed_results:
                result = {
                    "query": query,
                    "category": category,
                    "search_query": search_query,
                    "retrieval_success": False,
                    "average_relevance": 0.0,
                    "relevance_scores": [],
                    "retrieved_count": 0,
                    "retrieval_time": retrieval_time,
                    "error": "No retrieval results",
                    "timestamp": datetime.now().isoformat()
                }

                self.relevance_results.append(result)
                print(f" ❌ No retrieval results found")
                return result

            similarity_scores = []
            retrieval_details = []

            for i, doc_result in enumerate(processed_results):
                # Take the first non-zero value among the keys the score may
                # be stored under.
                # NOTE(review): 'distance' is used directly as a similarity
                # here; if the retrieval layer reports a true distance
                # (smaller = closer) this needs a conversion - confirm
                # against retrieval.py before trusting the metric.
                similarity = (
                    doc_result.get('distance', 0.0) or
                    doc_result.get('similarity_score', 0.0) or
                    doc_result.get('score', 0.0)
                )

                similarity_scores.append(similarity)

                retrieval_details.append({
                    "doc_index": i,
                    "similarity_score": similarity,
                    "content_snippet": doc_result.get('content', '')[:100] + "...",
                    "doc_type": doc_result.get('type', 'unknown'),
                    "source": doc_result.get('source', 'unknown')
                })

            # Calculate relevance metrics
            retrieved_count = len(similarity_scores)
            average_relevance = sum(similarity_scores) / retrieved_count
            high_relevance_count = sum(
                1 for score in similarity_scores if score >= self.RELEVANCE_THRESHOLD
            )
            high_relevance_ratio = high_relevance_count / retrieved_count

            result = {
                "query": query,
                "category": category,
                "search_query": search_query,
                "retrieval_success": True,
                "average_relevance": average_relevance,
                "max_relevance": max(similarity_scores),
                "min_relevance": min(similarity_scores),
                "relevance_scores": similarity_scores,
                "high_relevance_count": high_relevance_count,
                "high_relevance_ratio": high_relevance_ratio,
                "retrieved_count": retrieved_count,
                "retrieval_time": retrieval_time,
                "retrieval_details": retrieval_details,
                "meets_threshold": average_relevance >= self.RELEVANCE_THRESHOLD,
                "timestamp": datetime.now().isoformat()
            }

            # Store result
            self.relevance_results.append(result)

            print(f" ✅ Retrieval: {len(processed_results)} documents")
            print(f" 📊 Average Relevance: {average_relevance:.3f}")
            print(f" 📈 High Relevance (≥0.2): {high_relevance_count}/{len(processed_results)} ({high_relevance_ratio:.1%})")
            print(f" 🎯 Threshold: {'✅ Met' if result['meets_threshold'] else '❌ Not Met'}")
            print(f" ⏱️ Retrieval Time: {retrieval_time:.3f}s")

            return result

        except Exception as e:
            # Best-effort evaluation: one failing query must not abort the
            # whole run, so the failure is recorded and reported instead.
            error_result = {
                "query": query,
                "category": category,
                "retrieval_success": False,
                "average_relevance": 0.0,
                "relevance_scores": [],
                "retrieved_count": 0,
                "error": str(e),
                "timestamp": datetime.now().isoformat()
            }

            self.relevance_results.append(error_result)
            print(f" ❌ Relevance evaluation failed: {e}")

            return error_result

    def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
        """Parse queries from file with category labels

        Each non-empty line is expected to look like ``1.diagnosis: query text``.
        ``mixed/complicated`` is normalised to ``mixed``. Lines that do not
        match the format, and queries of 15 characters or fewer, are ignored.

        Args:
            filepath: Path of the UTF-8 query file

        Returns:
            Mapping of category name to a list of ``{"text", "category"}``
            dictionaries, or ``{"error": message}`` when the file cannot be
            read.
        """
        print(f"📁 Reading queries from file: {filepath}")

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            print(f"❌ Failed to read file: {e}")
            return {"error": f"Failed to read file: {e}"}

        queries_by_category = {name: [] for name in self.CATEGORIES}

        for raw_line in content.strip().split('\n'):
            match = self._QUERY_LINE.match(raw_line.strip())
            if not match:
                continue

            category = match.group(1).lower()
            if category == 'mixed/complicated':
                category = 'mixed'
            query_text = match.group(2).strip()

            if category in queries_by_category and len(query_text) > 15:
                queries_by_category[category].append({
                    "text": query_text,
                    "category": category
                })

        print(f"📋 Parsed queries by category:")
        for category, category_queries in queries_by_category.items():
            print(f" {category.capitalize()}: {len(category_queries)} queries")

        return queries_by_category

    def calculate_relevance_statistics(self) -> Dict[str, Any]:
        """Calculate relevance statistics by category

        Returns:
            Dictionary with ``category_results`` (one entry per known
            category), ``overall_results`` (all recorded queries, including
            those with an unrecognised category) and a ``timestamp``.
        """
        # Group results by category; unrecognised categories only contribute
        # to the overall figures.
        results_by_category = {name: [] for name in self.CATEGORIES}
        for result in self.relevance_results:
            bucket = results_by_category.get(result.get('category', 'unknown'))
            if bucket is not None:
                bucket.append(result)

        category_stats = {}
        for category, results in results_by_category.items():
            successful = [r for r in results if r.get('retrieval_success')]

            if successful:
                scores = [r['average_relevance'] for r in successful]
                avg_relevance = sum(scores) / len(scores)
                avg_retrieval_time = (
                    sum(r.get('retrieval_time', 0) for r in successful) / len(successful)
                )
                category_stats[category] = {
                    "average_relevance": avg_relevance,
                    "max_relevance": max(scores),
                    "min_relevance": min(scores),
                    "successful_retrievals": len(successful),
                    "total_queries": len(results),
                    "success_rate": len(successful) / len(results),
                    "average_retrieval_time": avg_retrieval_time,
                    "meets_threshold": avg_relevance >= self.RELEVANCE_THRESHOLD,
                    "individual_relevance_scores": scores
                }
            else:
                category_stats[category] = {
                    "average_relevance": 0.0,
                    "max_relevance": 0.0,
                    "min_relevance": 0.0,
                    "successful_retrievals": 0,
                    "total_queries": len(results),
                    "success_rate": 0.0,
                    "average_retrieval_time": 0.0,
                    "meets_threshold": False,
                    "individual_relevance_scores": []
                }

        # Overall statistics use every recorded result, so numerator and
        # denominator of success_rate cover the same set of queries.
        all_successful = [r for r in self.relevance_results if r.get('retrieval_success')]
        total_queries = len(self.relevance_results)

        if all_successful:
            all_scores = [r['average_relevance'] for r in all_successful]
            overall_average = sum(all_scores) / len(all_scores)
            overall_stats = {
                "average_relevance": overall_average,
                "max_relevance": max(all_scores),
                "min_relevance": min(all_scores),
                "successful_retrievals": len(all_successful),
                "total_queries": total_queries,
                "success_rate": len(all_successful) / total_queries,
                "meets_threshold": overall_average >= self.RELEVANCE_THRESHOLD,
                "target_compliance": overall_average >= self.TARGET_RELEVANCE
            }
        else:
            overall_stats = {
                "average_relevance": 0.0,
                "max_relevance": 0.0,
                "min_relevance": 0.0,
                "successful_retrievals": 0,
                "total_queries": total_queries,
                "success_rate": 0.0,
                "meets_threshold": False,
                "target_compliance": False
            }

        return {
            "category_results": category_stats,
            "overall_results": overall_stats,
            "timestamp": datetime.now().isoformat()
        }

    def _write_results_json(self, payload: Dict[str, Any], filename, prefix: str) -> Path:
        """Write ``payload`` as JSON into the ``results`` directory next to this file.

        When ``filename`` is None a timestamped ``<prefix>_YYYYmmdd_HHMMSS.json``
        name is generated. Returns the path that was written.
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{prefix}_{timestamp}.json"

        # Ensure results directory exists
        results_dir = Path(__file__).parent / "results"
        results_dir.mkdir(parents=True, exist_ok=True)

        filepath = results_dir / filename
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)

        return filepath

    def save_relevance_statistics(self, filename: str = None) -> str:
        """Save relevance statistics for chart generation

        Args:
            filename: Output file name; a timestamped name is used when None

        Returns:
            Path of the written file as a string
        """
        stats = self.calculate_relevance_statistics()
        filepath = self._write_results_json(stats, filename, "relevance_statistics")

        print(f"📊 Relevance statistics saved to: {filepath}")
        return str(filepath)

    def save_relevance_details(self, filename: str = None) -> str:
        """Save detailed relevance results

        Args:
            filename: Output file name; a timestamped name is used when None

        Returns:
            Path of the written file as a string
        """
        # Create comprehensive relevance data
        relevance_data = {
            "evaluation_metadata": {
                "total_queries": len(self.relevance_results),
                "successful_retrievals": len([r for r in self.relevance_results if r.get('retrieval_success')]),
                "timestamp": datetime.now().isoformat(),
                "evaluator_type": "retrieval_relevance",
                "threshold_used": 0.2
            },
            "relevance_results": self.relevance_results
        }
        filepath = self._write_results_json(relevance_data, filename, "relevance_details")

        print(f"📝 Relevance details saved to: {filepath}")
        return str(filepath)
|
363 |
+
|
364 |
+
|
365 |
+
# Independent execution interface
|
366 |
+
if __name__ == "__main__":
    # Independent relevance evaluation interface.
    #
    # Usage: python relevance_evaluator.py [query_file.txt]
    # Runs every query from the file through retrieval, saves the
    # statistics/details JSON files and prints a summary.

    # Imported once here (the script section is the only user) instead of
    # being re-imported inside the loops below.
    import time

    print("📊 OnCall.ai Relevance Evaluator - Retrieval Relevance Analysis")

    if len(sys.argv) > 1:
        query_file = sys.argv[1]
    else:
        # Default to evaluation/pre_user_query_evaluate.txt
        query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"

    if not os.path.exists(query_file):
        print(f"❌ Query file not found: {query_file}")
        print("Usage: python relevance_evaluator.py [query_file.txt]")
        sys.exit(1)

    # Initialize evaluator
    evaluator = RelevanceEvaluator()

    # Parse queries from file
    queries_by_category = evaluator.parse_queries_from_file(str(query_file))

    if "error" in queries_by_category:
        print(f"❌ Failed to parse queries: {queries_by_category['error']}")
        sys.exit(1)

    # Test relevance for each query
    print(f"\n🧪 Retrieval Relevance Testing")

    # Only categories that actually contain queries are scheduled, so the
    # inter-category pause is never spent waiting for an empty category.
    pending = [
        (category, queries)
        for category, queries in queries_by_category.items()
        if queries
    ]

    for position, (category, queries) in enumerate(pending):
        print(f"\n📂 Testing {category.upper()} relevance:")

        for i, query_info in enumerate(queries):
            # Results are accumulated inside the evaluator; the return value
            # is not needed here.
            evaluator.evaluate_single_relevance(query_info['text'], category)

            # Pause between queries to avoid rate limits
            if i < len(queries) - 1:
                print(f" ⏳ Pausing 3s before next query...")
                time.sleep(3)

        # Pause between categories (skipped after the last one)
        if position < len(pending) - 1:
            print(f"\n⏳ Pausing 5s before next category...")
            time.sleep(5)

    # Generate and save results
    print(f"\n📊 Generating relevance analysis...")

    # Save statistics and details
    stats_path = evaluator.save_relevance_statistics()
    details_path = evaluator.save_relevance_details()

    # Print final summary
    stats = evaluator.calculate_relevance_statistics()
    category_results = stats['category_results']
    overall_results = stats['overall_results']

    print(f"\n📊 === RELEVANCE EVALUATION SUMMARY ===")
    print(f"Overall Performance:")
    print(f" Average Relevance: {overall_results['average_relevance']:.3f}")
    print(f" Retrieval Success Rate: {overall_results['success_rate']:.1%}")
    print(f" 0.2 Threshold: {'✅ Met' if overall_results['meets_threshold'] else '❌ Not Met'}")
    print(f" 0.25 Target: {'✅ Met' if overall_results['target_compliance'] else '❌ Not Met'}")

    print(f"\nCategory Breakdown:")
    for category, cat_stats in category_results.items():
        if cat_stats['total_queries'] > 0:
            print(f" {category.capitalize()}: {cat_stats['average_relevance']:.3f} "
                  f"({cat_stats['successful_retrievals']}/{cat_stats['total_queries']}) "
                  f"[{cat_stats['average_retrieval_time']:.3f}s avg]")

    print(f"\n✅ Relevance evaluation complete!")
    print(f"📊 Statistics: {stats_path}")
    print(f"📝 Details: {details_path}")
|
evaluation/pre_user_query_evaluate.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
1.diagnosis: 60-year-old patient with hypertension history, sudden chest pain. What are possible causes and how to assess?
|
2 |
+
|
3 |
+
2.treatment: Suspected acute ischemic stroke. Tell me the next steps to take
|
4 |
+
|
5 |
+
3.mixed/complicated: 20 y/f , porphyria, sudden seizure. What are possible causes and complete management workflow?
|
evaluation/single_test_query.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
1.diagnosis: 60-year-old patient with hypertension history, sudden chest pain. What are possible causes and how to assess?
|
evaluation/user_query.txt
CHANGED
@@ -17,18 +17,18 @@ Suspected acute ischemic stroke. Tell me the next steps to take
|
|
17 |
|
18 |
### 一、Diagnosis-Focused(診斷為主)
|
19 |
|
20 |
-
1. I have a 68-year-old man with atrial fibrillation presenting with sudden slurred speech and right-sided weakness
|
21 |
-
2. A 40-year-old woman reports fever, urinary frequency, and dysuria
|
22 |
-
3. A 50-year-old patient has progressive dyspnea on exertion and orthopnea over two weeks
|
23 |
|
24 |
### 二、Treatment-Focused(治療為主)
|
25 |
|
26 |
-
4. ECG shows a suspected acute STEMI
|
27 |
-
5. I have a patient diagnosed with bacterial meningitis
|
28 |
6. A patient is in septic shock with BP 80/50 mmHg and HR 120 bpm—what fluid resuscitation and vasopressor strategy would you recommend?
|
29 |
|
30 |
### 三、Mixed(診斷+治療綜合)
|
31 |
|
32 |
7. A 75-year-old diabetic presents with a non-healing foot ulcer and fever—what differential for osteomyelitis, diagnostic workup, and management plan do you suggest?
|
33 |
-
8. A 60-year-old COPD patient has worsening dyspnea and hypercapnia on ABG
|
34 |
-
9. A 28-year-old woman is experiencing postpartum hemorrhage
|
|
|
17 |
|
18 |
### 一、Diagnosis-Focused(診斷為主)
|
19 |
|
20 |
+
1. I have a 68-year-old man with atrial fibrillation presenting with sudden slurred speech and right-sided weakness. What are the possible diagnoses, and how would you evaluate them?
|
21 |
+
2. A 40-year-old woman reports fever, urinary frequency, and dysuria. What differential diagnoses should I consider, and which tests would you order?
|
22 |
+
3. A 50-year-old patient has progressive dyspnea on exertion and orthopnea over two weeks. What are the likely causes, and what diagnostic steps should I take?
|
23 |
|
24 |
### 二、Treatment-Focused(治療為主)
|
25 |
|
26 |
+
4. ECG shows a suspected acute STEMI. What immediate interventions should I initiate in the next five minutes?
|
27 |
+
5. I have a patient diagnosed with bacterial meningitis. What empiric antibiotic regimen and supportive measures should I implement?
|
28 |
6. A patient is in septic shock with BP 80/50 mmHg and HR 120 bpm—what fluid resuscitation and vasopressor strategy would you recommend?
|
29 |
|
30 |
### 三、Mixed(診斷+治療綜合)
|
31 |
|
32 |
7. A 75-year-old diabetic presents with a non-healing foot ulcer and fever—what differential for osteomyelitis, diagnostic workup, and management plan do you suggest?
|
33 |
+
8. A 60-year-old COPD patient has worsening dyspnea and hypercapnia on ABG. How would you confirm the diagnosis, and what is your stepwise treatment approach?
|
34 |
+
9. A 28-year-old woman is experiencing postpartum hemorrhage. What are the possible causes, what immediate resuscitation steps should I take, and how would you proceed with definitive management?
|
src/generation.py
CHANGED
@@ -30,7 +30,7 @@ logger = logging.getLogger(__name__)
|
|
30 |
|
31 |
# Fallback Generation Configuration (Simplified Architecture)
|
32 |
FALLBACK_TIMEOUTS = {
|
33 |
-
"primary":
|
34 |
"fallback_1": 1.0, # RAG template generation (renamed from fallback_2)
|
35 |
"fallback_2": 0.1 # Minimal template generation (instant)
|
36 |
}
|
@@ -308,14 +308,14 @@ class MedicalAdviceGenerator:
|
|
308 |
# Special formatting for hospital-specific guidelines
|
309 |
source_label = "Hospital Protocol"
|
310 |
context_part = f"""
|
311 |
-
[Guideline {i}] (Source: {source_label}, Relevance: {1-distance:.3f})
|
312 |
-
📋 {chunk.get('matched', 'Hospital Document')}
|
313 |
-
{chunk_text}
|
314 |
""".strip()
|
315 |
else:
|
316 |
context_part = f"""
|
317 |
-
[Guideline {i}] (Source: {chunk_type.title()},
|
318 |
-
{chunk_text}
|
319 |
""".strip()
|
320 |
|
321 |
context_parts.append(context_part)
|
|
|
30 |
|
31 |
# Fallback Generation Configuration (Simplified Architecture)
|
32 |
FALLBACK_TIMEOUTS = {
|
33 |
+
"primary": 60.0, # Primary Med42-70B increased timeout for stable evaluation
|
34 |
"fallback_1": 1.0, # RAG template generation (renamed from fallback_2)
|
35 |
"fallback_2": 0.1 # Minimal template generation (instant)
|
36 |
}
|
|
|
308 |
# Special formatting for hospital-specific guidelines
|
309 |
source_label = "Hospital Protocol"
|
310 |
context_part = f"""
|
311 |
+
[Guideline {i}] (Source: {source_label}, Relevance: {1-distance:.3f})
|
312 |
+
📋 {chunk.get('matched', 'Hospital Document')}
|
313 |
+
{chunk_text}
|
314 |
""".strip()
|
315 |
else:
|
316 |
context_part = f"""
|
317 |
+
[Guideline {i}] (Source: {chunk_type.title()}, Angular Distance: {distance:.3f})
|
318 |
+
{chunk_text}
|
319 |
""".strip()
|
320 |
|
321 |
context_parts.append(context_part)
|
src/llm_clients.py
CHANGED
@@ -9,6 +9,8 @@ Date: 2025-07-29
|
|
9 |
|
10 |
import logging
|
11 |
import os
|
|
|
|
|
12 |
from typing import Dict, Optional, Union, List
|
13 |
from huggingface_hub import InferenceClient
|
14 |
from dotenv import load_dotenv
|
@@ -68,6 +70,91 @@ class llm_Med42_70BClient:
|
|
68 |
self.logger.error(f"Detailed Error: {repr(e)}")
|
69 |
raise ValueError(f"Failed to initialize Medical LLM client: {str(e)}") from e
|
70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
def analyze_medical_query(
|
72 |
self,
|
73 |
query: str,
|
@@ -138,6 +225,13 @@ class llm_Med42_70BClient:
|
|
138 |
self.logger.info(f"Raw LLM Response: {response_text}")
|
139 |
self.logger.info(f"Query Latency: {latency:.4f} seconds")
|
140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
# Detect abnormal response
|
142 |
if self._is_abnormal_response(response_text):
|
143 |
self.logger.error(f"❌ Abnormal LLM response detected: {response_text[:50]}...")
|
@@ -149,15 +243,12 @@ class llm_Med42_70BClient:
|
|
149 |
'latency': latency
|
150 |
}
|
151 |
|
152 |
-
# Extract condition from response
|
153 |
-
extracted_condition = self._extract_condition(response_text)
|
154 |
-
|
155 |
# Log the extracted condition
|
156 |
self.logger.info(f"Extracted Condition: {extracted_condition}")
|
157 |
|
158 |
return {
|
159 |
'extracted_condition': extracted_condition,
|
160 |
-
'confidence':
|
161 |
'raw_response': response_text,
|
162 |
'latency': latency # Add latency to the return dictionary
|
163 |
}
|
@@ -264,7 +355,7 @@ Focus on: conditions, symptoms, procedures, body systems."""
|
|
264 |
|
265 |
def _extract_condition(self, response: str) -> str:
|
266 |
"""
|
267 |
-
Extract medical condition from model response.
|
268 |
|
269 |
Args:
|
270 |
response: Full model-generated text
|
@@ -272,18 +363,29 @@ Focus on: conditions, symptoms, procedures, body systems."""
|
|
272 |
Returns:
|
273 |
Extracted medical condition or empty string if non-medical
|
274 |
"""
|
|
|
|
|
275 |
# Check if this is a rejection response first
|
276 |
if self._is_rejection_response(response):
|
277 |
return ""
|
278 |
|
279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
280 |
|
281 |
-
#
|
282 |
for condition in CONDITION_KEYWORD_MAPPING.keys():
|
283 |
if condition.lower() in response.lower():
|
284 |
return condition
|
285 |
|
286 |
-
return
|
287 |
|
288 |
def _is_abnormal_response(self, response: str) -> bool:
|
289 |
"""
|
@@ -439,5 +541,136 @@ def main():
|
|
439 |
'total_execution_time': total_execution_time
|
440 |
}
|
441 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
442 |
if __name__ == "__main__":
|
443 |
main()
|
|
|
9 |
|
10 |
import logging
|
11 |
import os
|
12 |
+
import json
|
13 |
+
import re
|
14 |
from typing import Dict, Optional, Union, List
|
15 |
from huggingface_hub import InferenceClient
|
16 |
from dotenv import load_dotenv
|
|
|
70 |
self.logger.error(f"Detailed Error: {repr(e)}")
|
71 |
raise ValueError(f"Failed to initialize Medical LLM client: {str(e)}") from e
|
72 |
|
73 |
+
def fix_json_formatting(self, response_text: str) -> str:
    """
    Fix common JSON formatting errors

    Heuristic, regex-based repair intended for text that already failed
    ``json.loads``. It inserts missing commas between adjacent strings,
    drops trailing commas and quotes bare (unquoted) values. The result is
    not guaranteed to be valid JSON; callers must still handle a parse
    failure.

    Args:
        response_text: Raw response text that may contain JSON errors

    Returns:
        Fixed JSON string
    """
    def _quote_if_bare(match):
        # Quote an unquoted value unless it already is a JSON literal
        # (number, true, false, null), which must stay unquoted.
        token = match.group(2).strip()
        if not token:
            return match.group(0)
        try:
            json.loads(token)
        except ValueError:
            # json.dumps escapes anything that would be illegal inside a
            # JSON string (backslashes, control characters).
            token = json.dumps(token, ensure_ascii=False)
        return f"{match.group(1)}{token}{match.group(3)}"

    # 1. Fix missing commas between key-value pairs on separate lines:
    #    a closing quote, a line break, then an opening quote.
    fixed = re.sub(r'"\s*\n\s*"', '",\n "', response_text)

    # 2. Fix missing commas between a value and the next key/array on the
    #    same line. At least one whitespace character is required so an
    #    empty string ("") is not mistaken for two adjacent strings, and
    #    the replacement is a raw string so \1 is the captured group.
    fixed = re.sub(r'"\s+(["\[])', r'",\1', fixed)

    # 3. Remove trailing commas before a closing brace/bracket
    fixed = re.sub(r',\s*([}\]])', r'\1', fixed)

    # 4. Ensure string values are properly quoted. The colon must directly
    #    follow a key's closing quote so colons inside string values are
    #    left alone.
    fixed = re.sub(r'("\s*:\s*)([^",{}\[\]]+)\s*([,}])', _quote_if_bare, fixed)

    return fixed
|
98 |
+
|
99 |
+
def parse_medical_response(self, response_text: str) -> Dict:
    """
    Enhanced JSON parsing logic with error recovery

    Tries, in order: strict ``json.loads``; ``json.loads`` on the output of
    ``fix_json_formatting``; regex-based ``extract_partial_medical_info``.
    If even the last step raises, a fixed "parsing_error" dictionary is
    returned instead of propagating the error.

    Args:
        response_text: Raw response text from Med42-70B

    Returns:
        Parsed response dictionary
    """
    try:
        return json.loads(response_text)
    except json.JSONDecodeError as e:
        # The exception name is unbound once this handler exits, so keep
        # the message for the final fallback below.
        initial_error = str(e)
        self.logger.warning(f"Initial JSON parsing failed: {e}")

    # Attempt to fix common JSON errors
    try:
        fixed_response = self.fix_json_formatting(response_text)
        self.logger.info("Attempting to parse fixed JSON")
        return json.loads(fixed_response)
    except json.JSONDecodeError as e2:
        self.logger.error(f"Fixed JSON parsing also failed: {e2}")

    # Try to extract partial information
    try:
        return self.extract_partial_medical_info(response_text)
    except Exception:
        # Deliberate best-effort boundary: any extraction failure falls
        # back to a fixed result. ``Exception`` (not a bare except) so that
        # KeyboardInterrupt/SystemExit still propagate.
        return {
            "extracted_condition": "parsing_error",
            "confidence": "0.0",
            "is_medical": True,
            "raw_response": response_text,
            "error": initial_error
        }
|
134 |
+
|
135 |
+
def extract_partial_medical_info(self, response_text: str) -> Dict:
    """
    Extract partial medical information from malformed response

    Each field is located independently with a regular expression, so the
    text does not need to be valid JSON as a whole.

    Args:
        response_text: Malformed response text

    Returns:
        Dictionary with extracted information:
            extracted_condition: quoted value, or "unknown" if absent
            confidence: value as a string, or "0.0" if absent
            is_medical: bool, defaulting to True if absent
            raw_response: the unmodified input text
            parsing_method: always "partial_extraction"
    """
    condition_match = re.search(
        r'"extracted_condition":\s*"([^"]*)"', response_text
    )
    # Accept both a quoted value ("0.9") and a bare JSON number (0.9);
    # previously a bare number was silently reported as "0.0".
    confidence_match = re.search(
        r'"confidence":\s*(?:"([^"]*)"|(-?\d+(?:\.\d+)?))', response_text
    )
    # Case-insensitive so Python-style True/False is recognised too.
    medical_match = re.search(
        r'"is_medical":\s*(true|false)', response_text, re.IGNORECASE
    )

    confidence = "0.0"
    if confidence_match:
        quoted, bare = confidence_match.groups()
        confidence = quoted if quoted is not None else bare

    return {
        "extracted_condition": condition_match.group(1) if condition_match else "unknown",
        "confidence": confidence,
        "is_medical": medical_match.group(1).lower() == "true" if medical_match else True,
        "raw_response": response_text,
        "parsing_method": "partial_extraction"
    }
|
157 |
+
|
158 |
def analyze_medical_query(
|
159 |
self,
|
160 |
query: str,
|
|
|
225 |
self.logger.info(f"Raw LLM Response: {response_text}")
|
226 |
self.logger.info(f"Query Latency: {latency:.4f} seconds")
|
227 |
|
228 |
+
# Direct text extraction - system prompt expects plain text response
|
229 |
+
# Since the system prompt instructs LLM to "Return ONLY the primary condition name",
|
230 |
+
# we should directly extract from text instead of attempting JSON parsing
|
231 |
+
extracted_condition = self._extract_condition(response_text)
|
232 |
+
confidence = '0.8'
|
233 |
+
self.logger.info(f"Extracted condition from text: {extracted_condition}")
|
234 |
+
|
235 |
# Detect abnormal response
|
236 |
if self._is_abnormal_response(response_text):
|
237 |
self.logger.error(f"❌ Abnormal LLM response detected: {response_text[:50]}...")
|
|
|
243 |
'latency': latency
|
244 |
}
|
245 |
|
|
|
|
|
|
|
246 |
# Log the extracted condition
|
247 |
self.logger.info(f"Extracted Condition: {extracted_condition}")
|
248 |
|
249 |
return {
|
250 |
'extracted_condition': extracted_condition,
|
251 |
+
'confidence': confidence,
|
252 |
'raw_response': response_text,
|
253 |
'latency': latency # Add latency to the return dictionary
|
254 |
}
|
|
|
355 |
|
356 |
def _extract_condition(self, response: str) -> str:
    """
    Extract medical condition from model response with support for multiple formats.

    Resolution order:
        1. If ``self._is_rejection_response(response)`` is true, return "".
        2. A ``CONDITION: <value>`` line, falling back to
           ``Primary condition: <value>``. The placeholder values
           NONE / UNKNOWN / empty are not returned.
        3. The first key of ``CONDITION_KEYWORD_MAPPING`` that occurs as a
           case-insensitive substring of the response.

    Args:
        response: Full model-generated text

    Returns:
        Extracted medical condition, or empty string if the response is a
        rejection or no condition could be identified
    """
    from medical_conditions import CONDITION_KEYWORD_MAPPING

    # Check if this is a rejection response first
    if self._is_rejection_response(response):
        return ""

    # Try CONDITION: format first (primary format for structured responses).
    # NOTE: this pattern is unanchored and case-insensitive, so it also
    # matches the tail of "Primary condition: ..."; the explicit fallback
    # below is kept for clarity.
    match = re.search(r"CONDITION:\s*(.+)", response, re.IGNORECASE)
    if not match:
        # Try Primary condition: format as fallback
        match = re.search(r"Primary condition:\s*(.+)", response, re.IGNORECASE)

    if match:
        value = match.group(1).strip()
        if value.upper() not in ["NONE", "", "UNKNOWN"]:
            return value

    # Final fallback to keyword mapping for backward compatibility.
    # Lowercase the response once instead of once per mapping key.
    response_lower = response.lower()
    for condition in CONDITION_KEYWORD_MAPPING:
        if condition.lower() in response_lower:
            return condition

    return ""
|
389 |
|
390 |
def _is_abnormal_response(self, response: str) -> bool:
|
391 |
"""
|
|
|
541 |
'total_execution_time': total_execution_time
|
542 |
}
|
543 |
|
544 |
+
|
545 |
+
class llm_Llama3_70B_JudgeClient:
    """
    Llama3-70B client specifically for LLM judge evaluation.
    Used for metrics 5-6 evaluation: Clinical Actionability & Evidence Quality.
    """

    def __init__(
        self,
        model_name: str = "meta-llama/Meta-Llama-3-70B-Instruct",
        timeout: float = 60.0
    ):
        """
        Initialize Llama3-70B judge client for evaluation tasks.

        Args:
            model_name: Hugging Face model name for Llama3-70B
            timeout: API call timeout duration (longer for judge evaluation)

        Raises:
            ValueError: If the HF_TOKEN environment variable is not set.
            Exception: Any error raised while constructing the inference
                client is logged and re-raised unchanged.

        Note: This client is specifically designed for third-party evaluation,
              not for medical advice generation.
        """
        self.logger = logging.getLogger(__name__)
        self.timeout = timeout
        self.model_name = model_name

        # Get Hugging Face token from environment
        hf_token = os.getenv('HF_TOKEN')
        if not hf_token:
            self.logger.error("HF_TOKEN is missing from environment variables.")
            raise ValueError(
                "HF_TOKEN not found in environment variables. "
                "Please set HF_TOKEN in your .env file or environment."
            )

        # Initialize Hugging Face Inference Client for judge evaluation
        try:
            self.client = InferenceClient(
                provider="auto",
                api_key=hf_token,
                # Forward the configured timeout; previously it was stored
                # on the instance but never applied to any request.
                timeout=timeout,
            )
            self.logger.info(f"Llama3-70B judge client initialized with model: {model_name}")
            self.logger.info("Judge LLM: Evaluation tool only. Not for medical advice generation.")

        except Exception as e:
            self.logger.error(f"Failed to initialize Llama3-70B judge client: {e}")
            raise

    def generate_completion(self, prompt: str) -> Dict[str, Union[str, float]]:
        """
        Generate completion using Llama3-70B for judge evaluation.

        Failures are not raised: any exception during the call is logged and
        reported through the returned dictionary.

        Args:
            prompt: Evaluation prompt for medical advice assessment

        Returns:
            Dict containing response content and timing information.
            On success: ``content``, ``latency``, ``model``, ``timestamp``.
            On failure: the same keys plus ``error``; ``content`` then holds
            a "Judge evaluation error: ..." message.
        """
        import time

        # perf_counter is monotonic, so the measured latency cannot be
        # distorted by system clock adjustments during the call.
        start_time = time.perf_counter()

        try:
            self.logger.info(f"Calling Llama3-70B Judge with evaluation prompt ({len(prompt)} chars)")

            # Call Llama3-70B for judge evaluation
            completion = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                max_tokens=2048,  # Sufficient for evaluation responses
                temperature=0.1,  # Low temperature for consistent evaluation
            )

            # Extract response content
            response_content = completion.choices[0].message.content

            latency = time.perf_counter() - start_time

            self.logger.info(f"Llama3-70B Judge Response: {response_content[:100]}...")
            self.logger.info(f"Judge Evaluation Latency: {latency:.4f} seconds")

            return {
                'content': response_content,
                'latency': latency,
                'model': self.model_name,
                'timestamp': time.time()  # wall-clock time of completion
            }

        except Exception as e:
            error_latency = time.perf_counter() - start_time

            self.logger.error(f"Llama3-70B judge evaluation failed: {e}")
            self.logger.error(f"Error occurred after {error_latency:.4f} seconds")

            return {
                'content': f"Judge evaluation error: {str(e)}",
                'latency': error_latency,
                'error': str(e),
                'model': self.model_name,
                'timestamp': time.time()
            }

    def batch_evaluate(self, evaluation_prompt: str) -> Dict[str, Union[str, float]]:
        """
        Specialized method for batch evaluation of medical advice.
        Alias for generate_completion with judge-specific logging.

        Args:
            evaluation_prompt: Batch evaluation prompt containing multiple queries

        Returns:
            Dict containing batch evaluation results and timing (the
            unmodified result of ``generate_completion``)
        """
        self.logger.info("Starting batch judge evaluation...")
        result = self.generate_completion(evaluation_prompt)

        if 'error' not in result:
            self.logger.info(f"Batch evaluation completed successfully in {result['latency']:.2f}s")
        else:
            self.logger.error(f"Batch evaluation failed: {result.get('error', 'Unknown error')}")

        return result
|
673 |
+
|
674 |
+
|
675 |
if __name__ == "__main__":
|
676 |
main()
|
src/medical_conditions.py
CHANGED
@@ -63,6 +63,14 @@ CONDITION_KEYWORD_MAPPING: Dict[str, Dict[str, str]] = {
|
|
63 |
"seizure disorder": {
|
64 |
"emergency": "seizure|status epilepticus|postictal state",
|
65 |
"treatment": "antiepileptic drugs|EEG monitoring|neurology consult"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
}
|
67 |
}
|
68 |
|
|
|
63 |
"seizure disorder": {
|
64 |
"emergency": "seizure|status epilepticus|postictal state",
|
65 |
"treatment": "antiepileptic drugs|EEG monitoring|neurology consult"
|
66 |
+
},
|
67 |
+
"postpartum hemorrhage": {
|
68 |
+
"emergency": "postpartum hemorrhage|uterine atony|placental retention|vaginal laceration",
|
69 |
+
"treatment": "uterine massage|IV oxytocin infusion|blood transfusion|surgical intervention"
|
70 |
+
},
|
71 |
+
"bacterial meningitis": {
|
72 |
+
"emergency": "bacterial meningitis|fever|headache|neck stiffness|altered mental status|meningitis|meningeal signs",
|
73 |
+
"treatment": "empiric antibiotics|ceftriaxone|vancomycin|dexamethasone|lumbar puncture"
|
74 |
}
|
75 |
}
|
76 |
|
src/user_prompt.py
CHANGED
@@ -255,13 +255,15 @@ Return ONLY the specified format."""
|
|
255 |
timeout=12.0 # Single call timeout
|
256 |
)
|
257 |
|
|
|
|
|
258 |
response_text = llama_response.get('extracted_condition', '').strip()
|
259 |
logger.info(f"🤖 Combined L2+4 result: {response_text}")
|
260 |
|
261 |
-
# Parse structured response
|
262 |
-
medical_status = self._extract_field(
|
263 |
-
condition_name = self._extract_field(
|
264 |
-
confidence = self._extract_field(
|
265 |
|
266 |
# Non-medical query detection
|
267 |
if medical_status == 'NO':
|
|
|
255 |
timeout=12.0 # Single call timeout
|
256 |
)
|
257 |
|
258 |
+
# Get both raw response and extracted condition
|
259 |
+
raw_response = llama_response.get('raw_response', '').strip()
|
260 |
response_text = llama_response.get('extracted_condition', '').strip()
|
261 |
logger.info(f"🤖 Combined L2+4 result: {response_text}")
|
262 |
|
263 |
+
# Parse structured response from raw LLM output (not extracted condition)
|
264 |
+
medical_status = self._extract_field(raw_response, 'MEDICAL')
|
265 |
+
condition_name = self._extract_field(raw_response, 'CONDITION')
|
266 |
+
confidence = self._extract_field(raw_response, 'CONFIDENCE')
|
267 |
|
268 |
# Non-medical query detection
|
269 |
if medical_status == 'NO':
|