YanBoChen committed · Commit 88e76fd
1 Parent(s): 3e2ffcb
Add extraction and relevance evaluators for condition extraction and retrieval relevance analysis
- Implemented `extraction_evaluator.py` to evaluate condition extraction success rates using the `UserPromptProcessor`.
- Added functionality to parse queries from a file, evaluate extractions, and calculate statistics.
- Created methods to save extraction statistics and detailed results in JSON format.
- Implemented `relevance_evaluator.py` to assess retrieval relevance using cosine similarity scores.
- Included methods for parsing queries, evaluating relevance, and generating statistics.
- Both evaluators support independent execution and provide detailed output for analysis.
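
Both evaluators follow the same independent-execution pattern as the comprehensive evaluator in the diff below: a `__main__` block that parses a query file, runs each query through the app.py pipeline, and writes JSON statistics. A minimal sketch of that flow (the import path and the query-file path are illustrative assumptions, not part of this commit; the `'text'` field matches the parsed-query layout shown in the diff):

# Sketch only: module path and file name below are hypothetical.
from latency_evaluator import ComprehensiveEvaluator  # hypothetical import path

evaluator = ComprehensiveEvaluator()
queries_by_category = evaluator.parse_queries_from_file("evaluation/queries.txt")  # hypothetical path
for category, queries in queries_by_category.items():
    for query_info in queries:
        evaluator.evaluate_single_query_comprehensive(query_info['text'], category)

evaluator.save_all_metric_statistics()  # one JSON statistics file per metric
evaluator.save_medical_outputs()        # inputs for the separate metric 5-6 LLM evaluation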
evaluation/latency_evaluator.py
CHANGED
Old side of the diff (deletions and context; several deleted lines are truncated in this view):

@@ -1,10 +1,21 @@
#!/usr/bin/env python3
"""
-OnCall.ai System -
-
-
-
Author: YanBo Chen
Date: 2025-08-04
@@ -14,7 +25,7 @@ import time
import json
import os
import sys
-from typing import Dict, List, Any
from datetime import datetime
from pathlib import Path
import re
@@ -37,12 +48,12 @@ except ImportError as e:
    sys.exit(1)


-class
-    """
    def __init__(self):
-        """Initialize
-        print("🔧 Initializing
        # Initialize existing system components (same as app.py)
        self.llm_client = llm_Med42_70BClient()
@@ -53,66 +64,137 @@ class LatencyEvaluator:
        )
        self.medical_generator = MedicalAdviceGenerator(llm_client=self.llm_client)

-        # Results accumulation for
-        self.
-
-
-
        }

-
-
-
-    def
        """
-
-        Replicates app.py's process_medical_query

        Args:
            query: Medical query to test
            category: Query category (diagnosis/treatment/mixed)
        """
-        print(f"
        print(f"📋 Category: {category}")

        overall_start = time.time()
        timing_details = {}

        try:
-            # STEP 1: Condition
            step1_start = time.time()
            condition_result = self.user_prompt_processor.extract_condition_keywords(query)
-

-            print(f"   Step 1 - Condition extraction: {
            print(f"   Extracted condition: {condition_result.get('condition', 'None')}")

            # Check if valid medical query
            if condition_result.get('query_status') in ['invalid_query', 'non_medical']:
                total_time = time.time() - overall_start
-
-
-                    "query": query,
-                    "category": category,
-                    "total_latency": total_time,
-                    "timing_details": timing_details,
-                    "status": "non_medical",
-                    "condition_result": condition_result,
-                    "success": False,
-                    "timestamp": datetime.now().isoformat()
-                }

-            # STEP 2: User
            step2_start = time.time()
            confirmation = self.user_prompt_processor.handle_user_confirmation(condition_result)
-

-

-            # STEP 3:
            step3_start = time.time()

            search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
@@ -120,12 +202,13 @@ class LatencyEvaluator:
                search_query = condition_result.get('condition', query)

            retrieval_results = self.retrieval_system.search(search_query, top_k=5)
-

-
-            print(f"   Step 3 - Retrieval: {

-            # STEP 4:
            step4_start = time.time()

            intention = self._detect_query_intention(query)
@@ -134,68 +217,199 @@ class LatencyEvaluator:
                retrieval_results=retrieval_results,
                intention=intention
            )
-

-            print(f"   Step 4 - Generation: {

            total_time = time.time() - overall_start

-            #
-
-

-
                "query": query,
                "category": category,
-
-
-                "
-
-
-
-
                "timestamp": datetime.now().isoformat()
            }

-            # Store
            medical_output = {
                "query": query,
                "category": category,
-                "medical_advice":
                "confidence_score": confidence_score,
                "query_id": f"{category}_query",
                "processing_time": total_time,
                "timestamp": datetime.now().isoformat()
            }
-
            self.medical_outputs.append(medical_output)

-            print(f"✅
-            print(f"

-            return

        except Exception as e:
            total_time = time.time() - overall_start
-            print(f"❌

-            return
-
-
                "total_latency": total_time,
                "timing_details": timing_details,
-                "
-
-
-
-

-    def
-        """
-

-
-
        print(f"📁 Reading queries from file: {filepath}")

        try:
@@ -237,8 +451,6 @@ class LatencyEvaluator:
        print(f"📊 Parsed queries by category:")
        for category, category_queries in queries_by_category.items():
            print(f"   {category.capitalize()}: {len(category_queries)} queries")
-            for i, query_info in enumerate(category_queries):
-                print(f"      {i+1}. {query_info['text'][:60]}...")

        return queries_by_category

@@ -246,23 +458,225 @@ class LatencyEvaluator:
            print(f"❌ Failed to read file: {e}")
            return {"error": f"Failed to read file: {e}"}

-    def
-        """
-

-
-
-
-
-
-

-    def
-        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
-            filename = f"latency_{category}_{timestamp}.json"

        # Ensure results directory exists
        results_dir = Path(__file__).parent / "results"
@@ -270,18 +684,59 @@ class LatencyEvaluator:

        filepath = results_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
-            json.dump(

-        print(f"
        return str(filepath)


# Independent execution interface
if __name__ == "__main__":
-    """Independent

-    print("🚀 OnCall.ai

    if len(sys.argv) > 1:
        query_file = sys.argv[1]
@@ -295,18 +750,18 @@ if __name__ == "__main__":
        sys.exit(1)

    # Initialize evaluator
-    evaluator =

    # Parse queries from file
-    queries_by_category = evaluator.

    if "error" in queries_by_category:
        print(f"❌ Failed to parse queries: {queries_by_category['error']}")
        sys.exit(1)

-    # Test each
-    print(f"\n🧪
-    print(f"

    for category, queries in queries_by_category.items():
        if not queries:
@@ -319,178 +774,63 @@ if __name__ == "__main__":
            print(f"\n📋 Query {i+1}/{len(queries)} in {category} category:")
            print(f"   Text: {query_text}")

-            #
-            result = evaluator.
-
-            # Add to accumulator for chart generation
-            evaluator.add_result_to_accumulator(result)
-
-            # Save individual result
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-            filename = f"latency_{category}_query{i+1}_{timestamp}.json"
-            saved_path = evaluator.save_single_result(result, filename)
-
-            # Show summary
-            if result.get('success'):
-                print(f"   ✅ Success: {result['total_latency']:.2f}s total")
-                print(f"   Breakdown: Extract={result['timing_details']['step1_condition_extraction']:.2f}s, "
-                      f"Retrieve={result['timing_details']['step3_retrieval']:.2f}s, "
-                      f"Generate={result['timing_details']['step4_generation']:.2f}s")
-            else:
-                print(f"   ❌ Failed: {result.get('status')} - {result.get('error', 'Unknown error')}")

            # Pause between queries to avoid rate limits
-            if i < len(queries) - 1:
                print(f"   ⏳ Pausing 5s before next query...")
                time.sleep(5)

        # Longer pause between categories
-        if category != list(queries_by_category.keys())[-1]:
            print(f"\n⏳ Pausing 10s before next category...")
            time.sleep(10)

-    # Generate
-    print(f"\n📊 Generating comprehensive
-
-    # Calculate category statistics
-    final_stats = evaluator.calculate_category_statistics()

-    # Save statistics for
-

    # Save medical outputs for model comparison
    outputs_path = evaluator.save_medical_outputs()

-    #
-
-    category_results = final_stats['category_results']
-    overall_results = final_stats['overall_results']
-
-    print(f"Overall Performance:")
-    print(f"   Average Latency: {overall_results['average_latency']:.2f}s (±{overall_results['std_deviation']:.2f})")
-    print(f"   Success Rate: {overall_results['successful_queries']}/{overall_results['total_queries']}")
-    print(f"   30s Target Compliance: {overall_results['target_compliance']:.1%}")
-
-    print(f"\nCategory Breakdown:")
-    for category, stats in category_results.items():
-        if stats['query_count'] > 0:
-            print(f"   {category.capitalize()}: {stats['average_latency']:.2f}s (±{stats['std_deviation']:.2f}) [{stats['query_count']} queries]")
-
-    print(f"\n✅ Data collection complete! Files saved:")
-    print(f"   📊 Statistics: {stats_path}")
-    print(f"   📄 Medical Outputs: {outputs_path}")
-    print(f"   📁 Individual results: {Path(__file__).parent / 'results'}")
-    print(f"\n💡 Next step: Run latency_chart_generator.py to create visualizations")
-
-    def add_result_to_accumulator(self, result: Dict[str, Any]):
-        """Add successful result to category accumulator"""
-        if result.get('success') and result.get('category') in self.accumulated_results:
-            category = result['category']
-            self.accumulated_results[category].append(result)
-            print(f"📊 Added result to {category} category. Total: {len(self.accumulated_results[category])}")

-
-
-        stats = self.calculate_category_statistics()
-
-        if filename is None:
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-            filename = f"latency_statistics_{timestamp}.json"
-
-        # Ensure results directory exists
-        results_dir = Path(__file__).parent / "results"
-        results_dir.mkdir(exist_ok=True)
-
-        filepath = results_dir / filename
-
-        with open(filepath, 'w', encoding='utf-8') as f:
-            json.dump(stats, f, indent=2, ensure_ascii=False)
-
-        print(f"📊 Statistics saved to: {filepath}")
-        return str(filepath)

-
-
-
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"medical_outputs_{timestamp}.json"

-
-        results_dir = Path(__file__).parent / "results"
-        results_dir.mkdir(exist_ok=True)

-

-
-
-            "
-            "total_outputs": len(self.medical_outputs),
-            "categories": list(set(output['category'] for output in self.medical_outputs)),
-            "timestamp": datetime.now().isoformat(),
-            "model_type": "Med42-70B_RAG_enhanced"  # For future comparison
-            },
-            "medical_outputs": self.medical_outputs
-        }

-
-
-
-        print(f"📄 Medical outputs saved to: {filepath}")
-        print(f"   Total outputs: {len(self.medical_outputs)}")
-        print(f"   Categories: {', '.join(set(output['category'] for output in self.medical_outputs))}")

-

-
-
-
-
-
-
-
-
-
-                "average_latency": sum(latencies) / len(latencies),
-                "std_deviation": self._calculate_std(latencies),
-                "min_latency": min(latencies),
-                "max_latency": max(latencies),
-                "query_count": len(latencies),
-                "individual_latencies": latencies
-            }
-            all_successful_latencies.extend(latencies)
-        else:
-            category_stats[category] = {
-                "average_latency": 0.0,
-                "std_deviation": 0.0,
-                "min_latency": 0.0,
-                "max_latency": 0.0,
-                "query_count": 0,
-                "individual_latencies": []
-            }
-
-        # Calculate overall statistics
-        overall_stats = {
-            "average_latency": sum(all_successful_latencies) / len(all_successful_latencies) if all_successful_latencies else 0.0,
-            "std_deviation": self._calculate_std(all_successful_latencies),
-            "min_latency": min(all_successful_latencies) if all_successful_latencies else 0.0,
-            "max_latency": max(all_successful_latencies) if all_successful_latencies else 0.0,
-            "total_queries": sum(len(results) for results in self.accumulated_results.values()),
-            "successful_queries": len(all_successful_latencies),
-            "target_compliance": sum(1 for lat in all_successful_latencies if lat <= 30.0) / len(all_successful_latencies) if all_successful_latencies else 0.0
-        }
-
-        return {
-            "category_results": category_stats,
-            "overall_results": overall_stats,
-            "timestamp": datetime.now().isoformat()
-        }
-
-    def _calculate_std(self, values: List[float]) -> float:
-        """Calculate standard deviation"""
-        if len(values) < 2:
-            return 0.0
-
-        mean = sum(values) / len(values)
-        variance = sum((x - mean) ** 2 for x in values) / len(values)
-        return variance ** 0.5
New side of the diff (additions and context):

#!/usr/bin/env python3
"""
+OnCall.ai System - Comprehensive Evaluator (Metrics 1-6)
+========================================================
+
+Single execution to collect all metrics 1-6 data from app.py pipeline:
+
+RETRIEVAL METRICS (Only available for RAG systems):
+1. Total Latency (總處理時長) - Complete pipeline timing
+2. Condition Extraction Success Rate (條件抽取成功率) - user_prompt.py success
+3. Retrieval Relevance (檢索相關性) - cosine similarity from retrieval.py
+4. Retrieval Coverage (檢索覆蓋率) - advice utilization of retrieved content
+
+LLM EVALUATION METRICS (Available for all systems):
+5. Clinical Actionability (臨床可操作性) - Third-party LLM evaluation
+6. Clinical Evidence Quality (臨床證據品質) - Third-party LLM evaluation
+
+Note: This evaluator focuses on metrics 1-4. Metrics 5-6 require separate LLM evaluation.

Author: YanBo Chen
Date: 2025-08-04

import json
import os
import sys
+from typing import Dict, List, Any, Set
from datetime import datetime
from pathlib import Path
import re

    sys.exit(1)


+class ComprehensiveEvaluator:
+    """Comprehensive evaluator for metrics 1-4 - single execution approach"""

    def __init__(self):
+        """Initialize system components (identical to app.py)"""
+        print("🔧 Initializing Comprehensive Evaluator...")

        # Initialize existing system components (same as app.py)
        self.llm_client = llm_Med42_70BClient()

        )
        self.medical_generator = MedicalAdviceGenerator(llm_client=self.llm_client)

+        # Results accumulation for all metrics
+        self.comprehensive_results = []
+        self.medical_outputs = []
+
+        print("✅ Comprehensive Evaluator initialization complete")
+
+    def extract_medical_keywords(self, text: str) -> Set[str]:
+        """Extract medical keywords for coverage analysis"""
+        if not text:
+            return set()
+
+        medical_keywords = set()
+        text_lower = text.lower()
+
+        # Medical terminology patterns
+        patterns = [
+            r'\b[a-z]+(?:osis|itis|pathy|emia|uria|gram|scopy)\b',  # Medical suffixes
+            r'\b(?:cardio|neuro|pulmo|gastro|hepato|nephro)[a-z]+\b',  # Medical prefixes
+            r'\b(?:diagnosis|treatment|therapy|intervention|management)\b',  # Medical actions
+            r'\b(?:patient|symptom|condition|disease|disorder|syndrome)\b',  # Medical entities
+            r'\b(?:acute|chronic|severe|mild|moderate|emergency)\b',  # Medical descriptors
+            r'\b[a-z]+(?:al|ic|ous|ive)\s+(?:pain|failure|infection|injury)\b',  # Compound terms
+            r'\b(?:ecg|ekg|ct|mri|x-ray|ultrasound|biopsy)\b',  # Medical procedures
+            r'\b\d+\s*(?:mg|ml|units|hours|days|minutes)\b',  # Dosages and timeframes
+        ]
+
+        for pattern in patterns:
+            matches = re.findall(pattern, text_lower)
+            medical_keywords.update(match.strip() for match in matches)
+
+        # Additional common medical terms
+        common_medical_terms = [
+            'blood', 'pressure', 'heart', 'chest', 'pain', 'stroke', 'seizure',
+            'emergency', 'hospital', 'monitor', 'assess', 'evaluate', 'immediate',
+            'protocol', 'guideline', 'recommendation', 'risk', 'factor'
+        ]
+
+        for term in common_medical_terms:
+            if term in text_lower:
+                medical_keywords.add(term)
+
+        # Filter out very short terms and common words
+        filtered_keywords = {
+            kw for kw in medical_keywords
+            if len(kw) > 2 and kw not in ['the', 'and', 'for', 'with', 'are', 'can', 'may']
        }

+        return filtered_keywords
+
+    def calculate_coverage_metrics(self, generated_advice: str, retrieval_results: List[Dict]) -> Dict[str, Any]:
+        """Calculate coverage metrics from generated advice and retrieval results"""
+        if not generated_advice or not retrieval_results:
+            return {
+                "coverage_score": 0.0,
+                "matched_keywords": [],
+                "advice_keywords": [],
+                "source_keywords": [],
+                "coverage_percentage": 0.0,
+                "meets_threshold": False
+            }
+
+        # Extract keywords from generated advice
+        advice_keywords = self.extract_medical_keywords(generated_advice)
+
+        # Extract keywords from all retrieved documents
+        all_source_keywords = set()
+        for doc in retrieval_results:
+            doc_content = doc.get('content', '') or doc.get('text', '')
+            doc_keywords = self.extract_medical_keywords(doc_content)
+            all_source_keywords.update(doc_keywords)
+
+        # Calculate coverage
+        matched_keywords = advice_keywords.intersection(all_source_keywords)
+        coverage_score = len(matched_keywords) / len(all_source_keywords) if all_source_keywords else 0.0

+        return {
+            "coverage_score": coverage_score,
+            "matched_keywords": list(matched_keywords),
+            "advice_keywords": list(advice_keywords),
+            "source_keywords": list(all_source_keywords),
+            "advice_keywords_count": len(advice_keywords),
+            "source_keywords_count": len(all_source_keywords),
+            "matched_keywords_count": len(matched_keywords),
+            "coverage_percentage": coverage_score * 100,
+            "meets_threshold": coverage_score >= 0.6
+        }

+    def evaluate_single_query_comprehensive(self, query: str, category: str = "unknown") -> Dict[str, Any]:
        """
+        Comprehensive evaluation for single query - collects all metrics 1-4 data

+        Replicates app.py's process_medical_query pipeline exactly

        Args:
            query: Medical query to test
            category: Query category (diagnosis/treatment/mixed)
        """
+        print(f"🔍 Comprehensive evaluation: {query[:50]}...")
        print(f"📋 Category: {category}")

        overall_start = time.time()
        timing_details = {}

        try:
+            # STEP 1: Query Processing and Condition Extraction (identical to app.py)
            step1_start = time.time()
            condition_result = self.user_prompt_processor.extract_condition_keywords(query)
+            step1_time = time.time() - step1_start
+            timing_details['step1_condition_extraction'] = step1_time

+            print(f"   Step 1 - Condition extraction: {step1_time:.3f}s")
            print(f"   Extracted condition: {condition_result.get('condition', 'None')}")

            # Check if valid medical query
            if condition_result.get('query_status') in ['invalid_query', 'non_medical']:
                total_time = time.time() - overall_start
+                return self._create_failed_result(query, category, total_time, timing_details,
+                                                  "non_medical", condition_result)

+            # STEP 2: User Confirmation (simulate auto-confirmation)
            step2_start = time.time()
            confirmation = self.user_prompt_processor.handle_user_confirmation(condition_result)
+            step2_time = time.time() - step2_start
+            timing_details['step2_confirmation'] = step2_time

+            if not condition_result.get('condition'):
+                total_time = time.time() - overall_start
+                return self._create_failed_result(query, category, total_time, timing_details,
+                                                  "no_condition", condition_result)

+            # STEP 3: Medical Guidelines Retrieval (identical to app.py)
            step3_start = time.time()

            search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()

                search_query = condition_result.get('condition', query)

            retrieval_results = self.retrieval_system.search(search_query, top_k=5)
+            step3_time = time.time() - step3_start
+            timing_details['step3_retrieval'] = step3_time

+            processed_results = retrieval_results.get('processed_results', [])
+            print(f"   Step 3 - Retrieval: {step3_time:.3f}s ({len(processed_results)} results)")

+            # STEP 4: Medical Advice Generation (identical to app.py)
            step4_start = time.time()

            intention = self._detect_query_intention(query)

                retrieval_results=retrieval_results,
                intention=intention
            )
+            step4_time = time.time() - step4_start
+            timing_details['step4_generation'] = step4_time
+
+            generated_advice = medical_advice_result.get('medical_advice', '')
+            confidence_score = medical_advice_result.get('confidence_score', 0.0)

+            print(f"   Step 4 - Generation: {step4_time:.3f}s")

            total_time = time.time() - overall_start

+            # METRIC 2: Condition Extraction Analysis
+            extraction_success = (
+                condition_result.get('condition') and
+                condition_result.get('condition') != "unknown" and
+                condition_result.get('query_status') not in ['invalid_query', 'non_medical']
+            )
+
+            extraction_metrics = {
+                "extraction_success": extraction_success,
+                "extracted_condition": condition_result.get('condition'),
+                "query_status": condition_result.get('query_status'),
+                "emergency_keywords": condition_result.get('emergency_keywords', []),
+                "treatment_keywords": condition_result.get('treatment_keywords', []),
+                "fallback_level": condition_result.get('fallback_level', 'unknown'),
+                "extraction_time": step1_time
+            }
+
+            # METRIC 3: Retrieval Relevance Analysis
+            if processed_results:
+                similarity_scores = []
+                for doc_result in processed_results:
+                    similarity = (
+                        doc_result.get('distance', 0.0) or
+                        doc_result.get('similarity_score', 0.0) or
+                        doc_result.get('score', 0.0)
+                    )
+                    similarity_scores.append(similarity)
+
+                average_relevance = sum(similarity_scores) / len(similarity_scores)
+                high_relevance_count = sum(1 for score in similarity_scores if score >= 0.2)
+
+                relevance_metrics = {
+                    "average_relevance": average_relevance,
+                    "max_relevance": max(similarity_scores),
+                    "min_relevance": min(similarity_scores),
+                    "similarity_scores": similarity_scores,
+                    "high_relevance_count": high_relevance_count,
+                    "high_relevance_ratio": high_relevance_count / len(similarity_scores),
+                    "retrieved_count": len(processed_results),
+                    "meets_threshold": average_relevance >= 0.2,
+                    "retrieval_time": step3_time
+                }
+            else:
+                relevance_metrics = {
+                    "average_relevance": 0.0,
+                    "max_relevance": 0.0,
+                    "min_relevance": 0.0,
+                    "similarity_scores": [],
+                    "high_relevance_count": 0,
+                    "high_relevance_ratio": 0.0,
+                    "retrieved_count": 0,
+                    "meets_threshold": False,
+                    "retrieval_time": step3_time
+                }
+
+            # METRIC 4: Retrieval Coverage Analysis
+            coverage_metrics = self.calculate_coverage_metrics(generated_advice, processed_results)
+            coverage_metrics["generation_time"] = step4_time

+            # Create comprehensive result
+            comprehensive_result = {
                "query": query,
                "category": category,
+
+                # Metric 1: Total Latency - Complete pipeline processing time
+                "latency_metrics": {
+                    "total_latency": total_time,
+                    "timing_details": timing_details,
+                    "meets_target": total_time <= 30.0
+                },
+
+                # Metric 2: Condition Extraction - Success rate from user_prompt.py
+                "extraction_metrics": extraction_metrics,
+
+                # Metric 3: Retrieval Relevance - Cosine similarity from retrieval.py
+                "relevance_metrics": relevance_metrics,
+
+                # Metric 4: Retrieval Coverage - Advice utilization of retrieved content
+                "coverage_metrics": coverage_metrics,
+
+                # Complete pipeline data (for debugging and detailed analysis)
+                "pipeline_data": {
+                    "condition_result": condition_result,
+                    "retrieval_results": retrieval_results,
+                    "medical_advice_result": medical_advice_result,
+                    "search_query": search_query,
+                    "intention": intention
+                },
+
+                "overall_success": True,
                "timestamp": datetime.now().isoformat()
            }

+            # Store result
+            self.comprehensive_results.append(comprehensive_result)
+
+            # Store medical output for model comparison
            medical_output = {
                "query": query,
                "category": category,
+                "medical_advice": generated_advice,
                "confidence_score": confidence_score,
                "query_id": f"{category}_query",
                "processing_time": total_time,
                "timestamp": datetime.now().isoformat()
            }
            self.medical_outputs.append(medical_output)

+            print(f"✅ Comprehensive evaluation completed in {total_time:.2f}s")
+            print(f"   📊 Metrics: Latency={total_time:.2f}s, Extraction={'✅' if extraction_success else '❌'}, "
+                  f"Relevance={average_relevance:.3f}, Coverage={coverage_metrics['coverage_score']:.3f}")

+            return comprehensive_result

        except Exception as e:
            total_time = time.time() - overall_start
+            print(f"❌ Comprehensive evaluation failed after {total_time:.2f}s: {e}")

+            return self._create_failed_result(query, category, total_time, timing_details, "error", None, str(e))
+
+    def _create_failed_result(self, query: str, category: str, total_time: float,
+                              timing_details: Dict, status: str, condition_result: Dict = None,
+                              error: str = None) -> Dict[str, Any]:
+        """Create standardized failed result"""
+        failed_result = {
+            "query": query,
+            "category": category,
+
+            # Metric 1: Total Latency - Always measurable even on failure
+            "latency_metrics": {
                "total_latency": total_time,
                "timing_details": timing_details,
+                "meets_target": total_time <= 30.0
+            },
+
+            # Metric 2: Condition Extraction - Partial data may be available before failure
+            "extraction_metrics": {
+                "extraction_success": False,
+                "extracted_condition": condition_result.get('condition') if condition_result else None,
+                "query_status": condition_result.get('query_status') if condition_result else status,
+                "extraction_time": timing_details.get('step1_condition_extraction', 0.0)
+            },
+
+            # Metric 3: Retrieval Relevance - Failed due to pipeline failure
+            "relevance_metrics": {
+                "average_relevance": 0.0,
+                "retrieved_count": 0,
+                "meets_threshold": False,
+                "retrieval_time": timing_details.get('step3_retrieval', 0.0)
+            },
+
+            # Metric 4: Retrieval Coverage - Failed due to pipeline failure
+            "coverage_metrics": {
+                "coverage_score": 0.0,
+                "meets_threshold": False,
+                "generation_time": timing_details.get('step4_generation', 0.0)
+            },
+
+            # Note: Metrics 5-6 (Clinical Actionability & Evidence Quality)
+            # are not collected here - they require separate LLM evaluation
+            # using the medical_outputs saved by this evaluator
+
+            "overall_success": False,
+            "status": status,
+            "error": error,
+            "timestamp": datetime.now().isoformat()
+        }
+
+        self.comprehensive_results.append(failed_result)
+        return failed_result

+    def _detect_query_intention(self, query: str) -> str:
+        """Simplified query intention detection (from app.py)"""
+        query_lower = query.lower()

+        if any(word in query_lower for word in ['diagnos', 'differential', 'possible', 'causes']):
+            return 'diagnosis'
+        elif any(word in query_lower for word in ['treat', 'manage', 'therapy', 'intervention']):
+            return 'treatment'
+        else:
+            return 'mixed'
+    def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
+        """Parse queries from file with category labels"""
        print(f"📁 Reading queries from file: {filepath}")

        try:

        print(f"📊 Parsed queries by category:")
        for category, category_queries in queries_by_category.items():
            print(f"   {category.capitalize()}: {len(category_queries)} queries")

        return queries_by_category

            print(f"❌ Failed to read file: {e}")
            return {"error": f"Failed to read file: {e}"}

+    def calculate_metric_statistics(self, metric_name: str) -> Dict[str, Any]:
+        """Calculate statistics for a specific metric across all results"""
+        category_stats = {}
+        all_successful_results = []

+        # Group results by category
+        results_by_category = {
+            "diagnosis": [],
+            "treatment": [],
+            "mixed": []
+        }
+
+        for result in self.comprehensive_results:
+            category = result.get('category', 'unknown')
+            if category in results_by_category:
+                results_by_category[category].append(result)
+                if result.get('overall_success'):
+                    all_successful_results.append(result)
+
+        # Calculate statistics for each category based on metric type
+        for category, results in results_by_category.items():
+            successful_results = [r for r in results if r.get('overall_success')]
+
+            if metric_name == "latency":
+                if successful_results:
+                    latencies = [r['latency_metrics']['total_latency'] for r in successful_results]
+                    category_stats[category] = {
+                        "average_latency": sum(latencies) / len(latencies),
+                        "std_deviation": self._calculate_std(latencies),
+                        "min_latency": min(latencies),
+                        "max_latency": max(latencies),
+                        "query_count": len(latencies),
+                        "target_compliance": sum(1 for lat in latencies if lat <= 30.0) / len(latencies),
+                        "individual_latencies": latencies
+                    }
+                else:
+                    category_stats[category] = self._get_empty_latency_stats()
+
+            elif metric_name == "extraction":
+                extraction_successes = [r['extraction_metrics']['extraction_success'] for r in results]
+                successful_extractions = sum(extraction_successes)
+
+                category_stats[category] = {
+                    "success_rate": successful_extractions / len(results) if results else 0.0,
+                    "successful_count": successful_extractions,
+                    "total_count": len(results),
+                    "average_extraction_time": sum(r['extraction_metrics']['extraction_time'] for r in results) / len(results) if results else 0.0,
+                    "meets_threshold": (successful_extractions / len(results)) >= 0.8 if results else False
+                }
+
+            elif metric_name == "relevance":
+                if successful_results:
+                    relevance_scores = [r['relevance_metrics']['average_relevance'] for r in successful_results]
+                    category_stats[category] = {
+                        "average_relevance": sum(relevance_scores) / len(relevance_scores),
+                        "max_relevance": max(relevance_scores),
+                        "min_relevance": min(relevance_scores),
+                        "successful_retrievals": len(successful_results),
+                        "total_queries": len(results),
+                        "meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.2,
+                        "individual_relevance_scores": relevance_scores
+                    }
+                else:
+                    category_stats[category] = self._get_empty_relevance_stats(len(results))
+
+            elif metric_name == "coverage":
+                if successful_results:
+                    coverage_scores = [r['coverage_metrics']['coverage_score'] for r in successful_results]
+                    category_stats[category] = {
+                        "average_coverage": sum(coverage_scores) / len(coverage_scores),
+                        "max_coverage": max(coverage_scores),
+                        "min_coverage": min(coverage_scores),
+                        "successful_evaluations": len(successful_results),
+                        "total_queries": len(results),
+                        "meets_threshold": (sum(coverage_scores) / len(coverage_scores)) >= 0.6,
+                        "individual_coverage_scores": coverage_scores
+                    }
+                else:
+                    category_stats[category] = self._get_empty_coverage_stats(len(results))
+
+        # Calculate overall statistics
+        overall_stats = self._calculate_overall_stats(metric_name, all_successful_results)
+
+        return {
+            "category_results": category_stats,
+            "overall_results": overall_stats,
+            "timestamp": datetime.now().isoformat()
+        }
+
+    def _calculate_std(self, values: List[float]) -> float:
+        """Calculate standard deviation"""
+        if len(values) < 2:
+            return 0.0
+
+        mean = sum(values) / len(values)
+        variance = sum((x - mean) ** 2 for x in values) / len(values)
+        return variance ** 0.5

+    def _get_empty_latency_stats(self) -> Dict[str, Any]:
+        """Return empty latency statistics"""
+        return {
+            "average_latency": 0.0,
+            "std_deviation": 0.0,
+            "min_latency": 0.0,
+            "max_latency": 0.0,
+            "query_count": 0,
+            "target_compliance": 0.0,
+            "individual_latencies": []
+        }
+
+    def _get_empty_relevance_stats(self, total_queries: int) -> Dict[str, Any]:
+        """Return empty relevance statistics"""
+        return {
+            "average_relevance": 0.0,
+            "max_relevance": 0.0,
+            "min_relevance": 0.0,
+            "successful_retrievals": 0,
+            "total_queries": total_queries,
+            "meets_threshold": False,
+            "individual_relevance_scores": []
+        }
+
+    def _get_empty_coverage_stats(self, total_queries: int) -> Dict[str, Any]:
+        """Return empty coverage statistics"""
+        return {
+            "average_coverage": 0.0,
+            "max_coverage": 0.0,
+            "min_coverage": 0.0,
+            "successful_evaluations": 0,
+            "total_queries": total_queries,
+            "meets_threshold": False,
+            "individual_coverage_scores": []
+        }
+
+    def _calculate_overall_stats(self, metric_name: str, all_successful_results: List[Dict]) -> Dict[str, Any]:
+        """Calculate overall statistics for a specific metric"""
+        total_queries = len(self.comprehensive_results)
+
+        if metric_name == "latency" and all_successful_results:
+            latencies = [r['latency_metrics']['total_latency'] for r in all_successful_results]
+            return {
+                "average_latency": sum(latencies) / len(latencies),
+                "std_deviation": self._calculate_std(latencies),
+                "min_latency": min(latencies),
+                "max_latency": max(latencies),
+                "successful_queries": len(all_successful_results),
+                "total_queries": total_queries,
+                "target_compliance": sum(1 for lat in latencies if lat <= 30.0) / len(latencies)
+            }
+
+        elif metric_name == "extraction":
+            all_extractions = [r['extraction_metrics']['extraction_success'] for r in self.comprehensive_results]
+            successful_extractions = sum(all_extractions)
+            return {
+                "success_rate": successful_extractions / len(all_extractions) if all_extractions else 0.0,
+                "successful_count": successful_extractions,
+                "total_count": len(all_extractions),
+                "target_compliance": (successful_extractions / len(all_extractions)) >= 0.8 if all_extractions else False
+            }
+
+        elif metric_name == "relevance" and all_successful_results:
+            relevance_scores = [r['relevance_metrics']['average_relevance'] for r in all_successful_results]
+            return {
+                "average_relevance": sum(relevance_scores) / len(relevance_scores),
+                "max_relevance": max(relevance_scores),
+                "min_relevance": min(relevance_scores),
+                "successful_queries": len(all_successful_results),
+                "total_queries": total_queries,
+                "meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.2,
+                "target_compliance": (sum(relevance_scores) / len(relevance_scores)) >= 0.25
+            }
+
+        elif metric_name == "coverage" and all_successful_results:
+            coverage_scores = [r['coverage_metrics']['coverage_score'] for r in all_successful_results]
+            return {
+                "average_coverage": sum(coverage_scores) / len(coverage_scores),
+                "max_coverage": max(coverage_scores),
+                "min_coverage": min(coverage_scores),
+                "successful_queries": len(all_successful_results),
+                "total_queries": total_queries,
+                "meets_threshold": (sum(coverage_scores) / len(coverage_scores)) >= 0.6
+            }
+
+        # Return empty stats for failed cases
+        return {
+            "average_value": 0.0,
+            "successful_queries": len(all_successful_results),
+            "total_queries": total_queries,
+            "meets_threshold": False
+        }
+    def save_all_metric_statistics(self) -> Dict[str, str]:
+        """Save separate statistics files for each metric"""
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+        # Ensure results directory exists
+        results_dir = Path(__file__).parent / "results"
+        results_dir.mkdir(exist_ok=True)
+
+        saved_files = {}
+
+        # Save statistics for each metric
+        for metric_name in ["latency", "extraction", "relevance", "coverage"]:
+            stats = self.calculate_metric_statistics(metric_name)
+            filename = f"{metric_name}_statistics_{timestamp}.json"
+            filepath = results_dir / filename
+
+            with open(filepath, 'w', encoding='utf-8') as f:
+                json.dump(stats, f, indent=2, ensure_ascii=False)
+
+            saved_files[metric_name] = str(filepath)
+            print(f"📊 {metric_name.capitalize()} statistics saved to: {filepath}")
+
+        return saved_files
+
+    def save_medical_outputs(self, filename: str = None) -> str:
+        """Save medical advice outputs for model comparison"""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            filename = f"medical_outputs_{timestamp}.json"

        # Ensure results directory exists
        results_dir = Path(__file__).parent / "results"

        filepath = results_dir / filename

+        # Create comprehensive output data
+        output_data = {
+            "evaluation_metadata": {
+                "total_outputs": len(self.medical_outputs),
+                "categories": list(set(output['category'] for output in self.medical_outputs)),
+                "timestamp": datetime.now().isoformat(),
+                "model_type": "Med42-70B_RAG_enhanced"  # For future comparison
+            },
+            "medical_outputs": self.medical_outputs
+        }
+
        with open(filepath, 'w', encoding='utf-8') as f:
+            json.dump(output_data, f, indent=2, ensure_ascii=False)

+        print(f"📄 Medical outputs saved to: {filepath}")
+        return str(filepath)
+
+    def save_comprehensive_details(self, filename: str = None) -> str:
+        """Save comprehensive detailed results"""
+        if filename is None:
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            filename = f"comprehensive_details_{timestamp}.json"
+
+        # Ensure results directory exists
+        results_dir = Path(__file__).parent / "results"
+        results_dir.mkdir(exist_ok=True)
+
+        filepath = results_dir / filename
+
+        # Create comprehensive evaluation data
+        comprehensive_data = {
+            "evaluation_metadata": {
+                "total_queries": len(self.comprehensive_results),
+                "successful_queries": len([r for r in self.comprehensive_results if r.get('overall_success')]),
+                "timestamp": datetime.now().isoformat(),
+                "evaluator_type": "comprehensive_metrics_1_to_4",
+                "metrics_evaluated": ["latency", "extraction", "relevance", "coverage"]
+            },
+            "comprehensive_results": self.comprehensive_results
+        }
+
+        with open(filepath, 'w', encoding='utf-8') as f:
+            json.dump(comprehensive_data, f, indent=2, ensure_ascii=False)
+
+        print(f"📄 Comprehensive details saved to: {filepath}")
        return str(filepath)


# Independent execution interface
if __name__ == "__main__":
+    """Independent comprehensive evaluation interface"""

+    print("🚀 OnCall.ai Comprehensive Evaluator - Metrics 1-4 in Single Run")

    if len(sys.argv) > 1:
        query_file = sys.argv[1]

        sys.exit(1)

    # Initialize evaluator
+    evaluator = ComprehensiveEvaluator()

    # Parse queries from file
+    queries_by_category = evaluator.parse_queries_from_file(str(query_file))

    if "error" in queries_by_category:
        print(f"❌ Failed to parse queries: {queries_by_category['error']}")
        sys.exit(1)

+    # Test each query comprehensively
+    print(f"\n🧪 Comprehensive Evaluation - All Metrics in Single Run")
+    print(f"📊 Collecting metrics 1-4 from single app.py pipeline execution")

    for category, queries in queries_by_category.items():
        if not queries:

            print(f"\n📋 Query {i+1}/{len(queries)} in {category} category:")
            print(f"   Text: {query_text}")

+            # Comprehensive evaluation (collects all metrics 1-4)
+            result = evaluator.evaluate_single_query_comprehensive(query_text, category)

            # Pause between queries to avoid rate limits
+            if i < len(queries) - 1:
                print(f"   ⏳ Pausing 5s before next query...")
                time.sleep(5)

        # Longer pause between categories
+        if category != list(queries_by_category.keys())[-1]:
            print(f"\n⏳ Pausing 10s before next category...")
            time.sleep(10)

+    # Generate and save all metric statistics
+    print(f"\n📊 Generating comprehensive analysis for all metrics...")

+    # Save separate statistics for each metric
+    saved_stats = evaluator.save_all_metric_statistics()

    # Save medical outputs for model comparison
    outputs_path = evaluator.save_medical_outputs()

+    # Save comprehensive details
+    details_path = evaluator.save_comprehensive_details()

+    # Print comprehensive summary
+    print(f"\n📊 === COMPREHENSIVE EVALUATION SUMMARY ===")

+    for metric_name in ["latency", "extraction", "relevance", "coverage"]:
+        stats = evaluator.calculate_metric_statistics(metric_name)
+        overall_results = stats['overall_results']

+        print(f"\n{metric_name.upper()} METRICS:")

+        if metric_name == "latency":
+            print(f"   Average: {overall_results['average_latency']:.2f}s (±{overall_results['std_deviation']:.2f})")
+            print(f"   30s Target: {'✅ Met' if overall_results['target_compliance'] >= 0.8 else '❌ Not Met'}")

+        elif metric_name == "extraction":
+            print(f"   Success Rate: {overall_results['success_rate']:.1%}")
+            print(f"   80% Target: {'✅ Met' if overall_results['target_compliance'] else '❌ Not Met'}")

+        elif metric_name == "relevance":
+            print(f"   Average Relevance: {overall_results['average_relevance']:.3f}")
+            print(f"   0.25 Target: {'✅ Met' if overall_results.get('target_compliance', False) else '❌ Not Met'}")

+        elif metric_name == "coverage":
+            print(f"   Average Coverage: {overall_results['average_coverage']:.3f} ({overall_results['average_coverage']*100:.1f}%)")
+            print(f"   60% Target: {'✅ Met' if overall_results['meets_threshold'] else '❌ Not Met'}")

+    print(f"\n✅ Comprehensive evaluation complete! Files saved:")
+    for metric_name, filepath in saved_stats.items():
+        print(f"   📊 {metric_name.capitalize()}: {filepath}")
+    print(f"   📄 Medical Outputs: {outputs_path}")
+    print(f"   📄 Comprehensive Details: {details_path}")
+    print(f"\n💡 Next step: Run chart generators for individual metrics")
+    print(f"   python latency_chart_generator.py")
+    print(f"   python extraction_chart_generator.py  # (create separately)")
+    print(f"   python relevance_chart_generator.py   # (create separately)")
+    print(f"   python coverage_chart_generator.py    # (create separately)")
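For reference, Metric 4 above reduces to a set-intersection ratio: the share of source keywords that reappear in the generated advice. A self-contained sketch of that core computation, with toy keyword sets (the strings are illustrative, not real system output):

# Toy illustration of the coverage calculation in calculate_coverage_metrics.
advice_keywords = {"stroke", "ct", "monitor", "blood"}             # from extract_medical_keywords(advice)
source_keywords = {"stroke", "ct", "seizure", "risk", "protocol"}  # union over retrieved docs

matched = advice_keywords & source_keywords           # {"stroke", "ct"}
coverage_score = len(matched) / len(source_keywords)  # 2 / 5 = 0.4
print(f"coverage {coverage_score:.0%}, meets 60% threshold: {coverage_score >= 0.6}")

Note the denominator is the source keyword set, so the score measures how much of the retrieved content the advice actually uses, not how well-grounded each advice sentence is.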
evaluation/{coverage_evaluator.py → old/coverage_evaluator.py}
RENAMED
File without changes

evaluation/{extraction_evaluator.py → old/extraction_evaluator.py}
RENAMED
File without changes

evaluation/{relevance_evaluator.py → old/relevance_evaluator.py}
RENAMED
File without changes
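Downstream chart generators read the per-metric JSON files written by save_all_metric_statistics(). A sketch of consuming one of them (filenames follow f"{metric_name}_statistics_{timestamp}.json" under evaluation/results/; the timestamped name below is hypothetical):

import json
from pathlib import Path

# Hypothetical timestamp; pick the newest latency_statistics_*.json in practice.
stats = json.loads(Path("evaluation/results/latency_statistics_20250804_120000.json").read_text())
overall = stats["overall_results"]
print(f"avg latency {overall['average_latency']:.2f}s, "
      f"30s compliance {overall['target_compliance']:.1%}")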