YanBoChen committed on
Commit
88e76fd
·
1 Parent(s): 3e2ffcb

Add extraction and relevance evaluators for condition extraction and retrieval relevance analysis


- Implemented `extraction_evaluator.py` to evaluate condition extraction success rates using the `UserPromptProcessor`.
- Added functionality to parse queries from a file, evaluate extractions, and calculate statistics.
- Created methods to save extraction statistics and detailed results in JSON format.
- Implemented `relevance_evaluator.py` to assess retrieval relevance using cosine similarity scores.
- Included methods for parsing queries, evaluating relevance, and generating statistics.
- Both evaluators support independent execution and provide detailed output for analysis (a brief usage sketch follows below).
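A minimal usage sketch for the two new evaluators. The query-file path and the assumption that each script takes the file as its first positional argument (mirroring `latency_evaluator.py`) are illustrative, not confirmed by this commit:

```python
# Hypothetical driver for the two standalone evaluators described above.
# Assumption: like latency_evaluator.py, each script accepts the query file
# path as sys.argv[1] and writes its JSON statistics/details under
# evaluation/results/.
import subprocess
import sys

QUERY_FILE = "evaluation/test_queries.txt"  # placeholder path

for script in ("evaluation/extraction_evaluator.py", "evaluation/relevance_evaluator.py"):
    subprocess.run([sys.executable, script, QUERY_FILE], check=True)
```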

evaluation/latency_evaluator.py CHANGED
@@ -1,10 +1,21 @@
1
  #!/usr/bin/env python3
2
  """
3
- OnCall.ai System - Latency Evaluator (Single Query Test Mode)
4
- ============================================================
5
 
6
- Test latency for individual queries to avoid rate limits.
7
- Based on existing system flow: app.py -> user_prompt.py -> retrieval.py -> generation.py
8
 
9
  Author: YanBo Chen
10
  Date: 2025-08-04
@@ -14,7 +25,7 @@ import time
14
  import json
15
  import os
16
  import sys
17
- from typing import Dict, List, Any
18
  from datetime import datetime
19
  from pathlib import Path
20
  import re
@@ -37,12 +48,12 @@ except ImportError as e:
37
  sys.exit(1)
38
 
39
 
40
- class LatencyEvaluator:
41
- """Pure latency measurement and medical advice output recording - no visualization"""
42
 
43
  def __init__(self):
44
- """Initialize existing system components"""
45
- print("πŸ”§ Initializing Latency Evaluator...")
46
 
47
  # Initialize existing system components (same as app.py)
48
  self.llm_client = llm_Med42_70BClient()
@@ -53,66 +64,137 @@ class LatencyEvaluator:
53
  )
54
  self.medical_generator = MedicalAdviceGenerator(llm_client=self.llm_client)
55
 
56
- # Results accumulation for summary statistics
57
- self.accumulated_results = {
58
- "diagnosis": [],
59
- "treatment": [],
60
- "mixed": []
61
  }
62
 
63
- # Medical advice outputs for model comparison
64
- self.medical_outputs = []
65
 
66
- print("βœ… Latency Evaluator initialization complete")
67
 
68
- def measure_single_query_latency(self, query: str, category: str = "unknown") -> Dict[str, Any]:
69
  """
70
- Measure complete processing time for a single query
71
 
72
- Replicates app.py's process_medical_query flow with timing focus
73
 
74
  Args:
75
  query: Medical query to test
76
  category: Query category (diagnosis/treatment/mixed)
77
  """
78
- print(f"⏱️ Measuring query latency: {query[:50]}...")
79
  print(f"πŸ“‹ Category: {category}")
80
 
81
  overall_start = time.time()
82
  timing_details = {}
83
 
84
  try:
85
- # STEP 1: Condition extraction (user_prompt.py)
86
  step1_start = time.time()
87
  condition_result = self.user_prompt_processor.extract_condition_keywords(query)
88
- timing_details['step1_condition_extraction'] = time.time() - step1_start
 
89
 
90
- print(f" Step 1 - Condition extraction: {timing_details['step1_condition_extraction']:.3f}s")
91
  print(f" Extracted condition: {condition_result.get('condition', 'None')}")
92
 
93
  # Check if valid medical query
94
  if condition_result.get('query_status') in ['invalid_query', 'non_medical']:
95
  total_time = time.time() - overall_start
96
- print(f" ⚠️ Non-medical query detected")
97
- return {
98
- "query": query,
99
- "category": category,
100
- "total_latency": total_time,
101
- "timing_details": timing_details,
102
- "status": "non_medical",
103
- "condition_result": condition_result,
104
- "success": False,
105
- "timestamp": datetime.now().isoformat()
106
- }
107
 
108
- # STEP 2: User confirmation (simulate auto-confirmation)
109
  step2_start = time.time()
110
  confirmation = self.user_prompt_processor.handle_user_confirmation(condition_result)
111
- timing_details['step2_confirmation'] = time.time() - step2_start
 
112
 
113
- print(f" Step 2 - User confirmation: {timing_details['step2_confirmation']:.3f}s")
114
 
115
- # STEP 3: Retrieve relevant guidelines (retrieval.py)
116
  step3_start = time.time()
117
 
118
  search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
@@ -120,12 +202,13 @@ class LatencyEvaluator:
120
  search_query = condition_result.get('condition', query)
121
 
122
  retrieval_results = self.retrieval_system.search(search_query, top_k=5)
123
- timing_details['step3_retrieval'] = time.time() - step3_start
 
124
 
125
- retrieved_count = len(retrieval_results.get('processed_results', []))
126
- print(f" Step 3 - Retrieval: {timing_details['step3_retrieval']:.3f}s ({retrieved_count} results)")
127
 
128
- # STEP 4: Generate medical advice (generation.py)
129
  step4_start = time.time()
130
 
131
  intention = self._detect_query_intention(query)
@@ -134,68 +217,199 @@ class LatencyEvaluator:
134
  retrieval_results=retrieval_results,
135
  intention=intention
136
  )
137
- timing_details['step4_generation'] = time.time() - step4_start
138
 
139
- print(f" Step 4 - Generation: {timing_details['step4_generation']:.3f}s")
140
 
141
  total_time = time.time() - overall_start
142
 
143
- # Extract medical advice output for future model comparison
144
- medical_advice_text = medical_advice_result.get('medical_advice', '')
145
- confidence_score = medical_advice_result.get('confidence_score', 0.0)
146
 
147
- result = {
 
148
  "query": query,
149
  "category": category,
150
- "total_latency": total_time,
151
- "timing_details": timing_details,
152
- "condition_result": condition_result,
153
- "retrieval_results": retrieval_results,
154
- "medical_advice_result": medical_advice_result,
155
- "status": "success",
156
- "success": True,
157
  "timestamp": datetime.now().isoformat()
158
  }
159
 
160
- # Store medical output separately for model comparison
161
  medical_output = {
162
  "query": query,
163
  "category": category,
164
- "medical_advice": medical_advice_text,
165
  "confidence_score": confidence_score,
166
  "query_id": f"{category}_query",
167
  "processing_time": total_time,
168
  "timestamp": datetime.now().isoformat()
169
  }
170
-
171
  self.medical_outputs.append(medical_output)
172
 
173
- print(f"βœ… Query completed successfully in {total_time:.2f}s")
174
- print(f"πŸ“ Medical advice recorded ({len(medical_advice_text)} characters)")
 
175
 
176
- return result
177
 
178
  except Exception as e:
179
  total_time = time.time() - overall_start
180
- print(f"❌ Query failed after {total_time:.2f}s: {e}")
181
 
182
- return {
183
- "query": query,
184
- "category": category,
185
  "total_latency": total_time,
186
  "timing_details": timing_details,
187
- "error": str(e),
188
- "status": "error",
189
- "success": False,
190
- "timestamp": datetime.now().isoformat()
191
- }
192
 
193
- def test_individual_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
194
- """
195
- Parse queries from file and return them for individual testing
196
 
197
- Returns categorized queries for separate testing
198
- """
199
  print(f"πŸ“ Reading queries from file: {filepath}")
200
 
201
  try:
@@ -237,8 +451,6 @@ class LatencyEvaluator:
237
  print(f"πŸ“‹ Parsed queries by category:")
238
  for category, category_queries in queries_by_category.items():
239
  print(f" {category.capitalize()}: {len(category_queries)} queries")
240
- for i, query_info in enumerate(category_queries):
241
- print(f" {i+1}. {query_info['text'][:60]}...")
242
 
243
  return queries_by_category
244
 
@@ -246,23 +458,225 @@ class LatencyEvaluator:
246
  print(f"❌ Failed to read file: {e}")
247
  return {"error": f"Failed to read file: {e}"}
248
 
249
- def _detect_query_intention(self, query: str) -> str:
250
- """Simplified query intention detection (from app.py)"""
251
- query_lower = query.lower()
 
252
 
253
- if any(word in query_lower for word in ['diagnos', 'differential', 'possible', 'causes']):
254
- return 'diagnosis'
255
- elif any(word in query_lower for word in ['treat', 'manage', 'therapy', 'intervention']):
256
- return 'treatment'
257
- else:
258
- return 'mixed'
259
 
260
- def save_single_result(self, result: Dict[str, Any], filename: str = None) -> str:
261
- """Save single query evaluation result"""
262
  if filename is None:
263
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
264
- category = result.get('category', 'unknown')
265
- filename = f"latency_{category}_{timestamp}.json"
266
 
267
  # Ensure results directory exists
268
  results_dir = Path(__file__).parent / "results"
@@ -270,18 +684,59 @@ class LatencyEvaluator:
270
 
271
  filepath = results_dir / filename
272
 
273
  with open(filepath, 'w', encoding='utf-8') as f:
274
- json.dump(result, f, indent=2, ensure_ascii=False)
275
 
276
- print(f"πŸ’Ύ Result saved to: {filepath}")
277
  return str(filepath)
278
 
279
 
280
  # Independent execution interface
281
  if __name__ == "__main__":
282
- """Independent test interface for single queries"""
283
 
284
- print("πŸš€ OnCall.ai Latency Evaluator - Single Query Test Mode")
285
 
286
  if len(sys.argv) > 1:
287
  query_file = sys.argv[1]
@@ -295,18 +750,18 @@ if __name__ == "__main__":
295
  sys.exit(1)
296
 
297
  # Initialize evaluator
298
- evaluator = LatencyEvaluator()
299
 
300
  # Parse queries from file
301
- queries_by_category = evaluator.test_individual_queries_from_file(str(query_file))
302
 
303
  if "error" in queries_by_category:
304
  print(f"❌ Failed to parse queries: {queries_by_category['error']}")
305
  sys.exit(1)
306
 
307
- # Test each category individually
308
- print(f"\nπŸ§ͺ Individual Query Testing Mode with Result Accumulation")
309
- print(f"πŸ“ Test each query separately to avoid rate limits")
310
 
311
  for category, queries in queries_by_category.items():
312
  if not queries:
@@ -319,178 +774,63 @@ if __name__ == "__main__":
319
  print(f"\nπŸ” Query {i+1}/{len(queries)} in {category} category:")
320
  print(f" Text: {query_text}")
321
 
322
- # Test single query
323
- result = evaluator.measure_single_query_latency(query_text, category)
324
-
325
- # Add to accumulator for chart generation
326
- evaluator.add_result_to_accumulator(result)
327
-
328
- # Save individual result
329
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
330
- filename = f"latency_{category}_query{i+1}_{timestamp}.json"
331
- saved_path = evaluator.save_single_result(result, filename)
332
-
333
- # Show summary
334
- if result.get('success'):
335
- print(f" βœ… Success: {result['total_latency']:.2f}s total")
336
- print(f" Breakdown: Extract={result['timing_details']['step1_condition_extraction']:.2f}s, "
337
- f"Retrieve={result['timing_details']['step3_retrieval']:.2f}s, "
338
- f"Generate={result['timing_details']['step4_generation']:.2f}s")
339
- else:
340
- print(f" ❌ Failed: {result.get('status')} - {result.get('error', 'Unknown error')}")
341
 
342
  # Pause between queries to avoid rate limits
343
- if i < len(queries) - 1: # Not the last query in category
344
  print(f" ⏳ Pausing 5s before next query...")
345
  time.sleep(5)
346
 
347
  # Longer pause between categories
348
- if category != list(queries_by_category.keys())[-1]: # Not the last category
349
  print(f"\n⏳ Pausing 10s before next category...")
350
  time.sleep(10)
351
 
352
- # Generate comprehensive analysis (no charts - pure data)
353
- print(f"\nπŸ“Š Generating comprehensive statistical summary...")
354
-
355
- # Calculate category statistics
356
- final_stats = evaluator.calculate_category_statistics()
357
 
358
- # Save statistics for chart generation
359
- stats_path = evaluator.save_statistics_summary()
360
 
361
  # Save medical outputs for model comparison
362
  outputs_path = evaluator.save_medical_outputs()
363
 
364
- # Print final summary
365
- print(f"\nπŸ“Š === FINAL LATENCY ANALYSIS SUMMARY ===")
366
- category_results = final_stats['category_results']
367
- overall_results = final_stats['overall_results']
368
-
369
- print(f"Overall Performance:")
370
- print(f" Average Latency: {overall_results['average_latency']:.2f}s (Β±{overall_results['std_deviation']:.2f})")
371
- print(f" Success Rate: {overall_results['successful_queries']}/{overall_results['total_queries']}")
372
- print(f" 30s Target Compliance: {overall_results['target_compliance']:.1%}")
373
-
374
- print(f"\nCategory Breakdown:")
375
- for category, stats in category_results.items():
376
- if stats['query_count'] > 0:
377
- print(f" {category.capitalize()}: {stats['average_latency']:.2f}s (Β±{stats['std_deviation']:.2f}) [{stats['query_count']} queries]")
378
-
379
- print(f"\nβœ… Data collection complete! Files saved:")
380
- print(f" πŸ“Š Statistics: {stats_path}")
381
- print(f" πŸ“ Medical Outputs: {outputs_path}")
382
- print(f" πŸ“ Individual results: {Path(__file__).parent / 'results'}")
383
- print(f"\nπŸ’‘ Next step: Run latency_chart_generator.py to create visualizations")
384
-
385
- def add_result_to_accumulator(self, result: Dict[str, Any]):
386
- """Add successful result to category accumulator"""
387
- if result.get('success') and result.get('category') in self.accumulated_results:
388
- category = result['category']
389
- self.accumulated_results[category].append(result)
390
- print(f"πŸ“Š Added result to {category} category. Total: {len(self.accumulated_results[category])}")
391
 
392
- def save_statistics_summary(self, filename: str = None) -> str:
393
- """Save statistical summary for chart generation"""
394
- stats = self.calculate_category_statistics()
395
-
396
- if filename is None:
397
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
398
- filename = f"latency_statistics_{timestamp}.json"
399
-
400
- # Ensure results directory exists
401
- results_dir = Path(__file__).parent / "results"
402
- results_dir.mkdir(exist_ok=True)
403
-
404
- filepath = results_dir / filename
405
-
406
- with open(filepath, 'w', encoding='utf-8') as f:
407
- json.dump(stats, f, indent=2, ensure_ascii=False)
408
-
409
- print(f"πŸ“Š Statistics saved to: {filepath}")
410
- return str(filepath)
411
 
412
- def save_medical_outputs(self, filename: str = None) -> str:
413
- """Save medical advice outputs for model comparison"""
414
- if filename is None:
415
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
416
- filename = f"medical_outputs_{timestamp}.json"
417
 
418
- # Ensure results directory exists
419
- results_dir = Path(__file__).parent / "results"
420
- results_dir.mkdir(exist_ok=True)
421
 
422
- filepath = results_dir / filename
 
 
423
 
424
- # Create comprehensive output data
425
- output_data = {
426
- "evaluation_metadata": {
427
- "total_outputs": len(self.medical_outputs),
428
- "categories": list(set(output['category'] for output in self.medical_outputs)),
429
- "timestamp": datetime.now().isoformat(),
430
- "model_type": "Med42-70B_RAG_enhanced" # For future comparison
431
- },
432
- "medical_outputs": self.medical_outputs
433
- }
434
 
435
- with open(filepath, 'w', encoding='utf-8') as f:
436
- json.dump(output_data, f, indent=2, ensure_ascii=False)
437
-
438
- print(f"πŸ“ Medical outputs saved to: {filepath}")
439
- print(f" Total outputs: {len(self.medical_outputs)}")
440
- print(f" Categories: {', '.join(set(output['category'] for output in self.medical_outputs))}")
441
 
442
- return str(filepath)
 
 
443
 
444
- def calculate_category_statistics(self) -> Dict[str, Any]:
445
- """Calculate statistics for each category and overall"""
446
- category_stats = {}
447
- all_successful_latencies = []
448
-
449
- for category, results in self.accumulated_results.items():
450
- latencies = [r['total_latency'] for r in results if r.get('success')]
451
-
452
- if latencies:
453
- category_stats[category] = {
454
- "average_latency": sum(latencies) / len(latencies),
455
- "std_deviation": self._calculate_std(latencies),
456
- "min_latency": min(latencies),
457
- "max_latency": max(latencies),
458
- "query_count": len(latencies),
459
- "individual_latencies": latencies
460
- }
461
- all_successful_latencies.extend(latencies)
462
- else:
463
- category_stats[category] = {
464
- "average_latency": 0.0,
465
- "std_deviation": 0.0,
466
- "min_latency": 0.0,
467
- "max_latency": 0.0,
468
- "query_count": 0,
469
- "individual_latencies": []
470
- }
471
-
472
- # Calculate overall statistics
473
- overall_stats = {
474
- "average_latency": sum(all_successful_latencies) / len(all_successful_latencies) if all_successful_latencies else 0.0,
475
- "std_deviation": self._calculate_std(all_successful_latencies),
476
- "min_latency": min(all_successful_latencies) if all_successful_latencies else 0.0,
477
- "max_latency": max(all_successful_latencies) if all_successful_latencies else 0.0,
478
- "total_queries": sum(len(results) for results in self.accumulated_results.values()),
479
- "successful_queries": len(all_successful_latencies),
480
- "target_compliance": sum(1 for lat in all_successful_latencies if lat <= 30.0) / len(all_successful_latencies) if all_successful_latencies else 0.0
481
- }
482
-
483
- return {
484
- "category_results": category_stats,
485
- "overall_results": overall_stats,
486
- "timestamp": datetime.now().isoformat()
487
- }
488
-
489
- def _calculate_std(self, values: List[float]) -> float:
490
- """Calculate standard deviation"""
491
- if len(values) < 2:
492
- return 0.0
493
-
494
- mean = sum(values) / len(values)
495
- variance = sum((x - mean) ** 2 for x in values) / len(values)
496
- return variance ** 0.5
 
1
  #!/usr/bin/env python3
2
  """
3
+ OnCall.ai System - Comprehensive Evaluator (Metrics 1-6)
4
+ ========================================================
5
 
6
+ Single execution to collect all metrics 1-6 data from app.py pipeline:
7
+
8
+ RETRIEVAL METRICS (Only available for RAG systems):
9
+ 1. Total Latency (總處理時長) - Complete pipeline timing
10
+ 2. Condition Extraction Success Rate (條件抽取成功率) - user_prompt.py success
11
+ 3. Retrieval Relevance (檢索相關性) - cosine similarity from retrieval.py
12
+ 4. Retrieval Coverage (檢索覆蓋率) - advice utilization of retrieved content
13
+
14
+ LLM EVALUATION METRICS (Available for all systems):
15
+ 5. Clinical Actionability (臨床可操作性) - Third-party LLM evaluation
16
+ 6. Clinical Evidence Quality (臨床證據品質) - Third-party LLM evaluation
17
+
18
+ Note: This evaluator focuses on metrics 1-4. Metrics 5-6 require separate LLM evaluation.
19
 
20
  Author: YanBo Chen
21
  Date: 2025-08-04
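Note: metric 3 above is driven by cosine similarity between the query embedding and each retrieved guideline chunk. A self-contained illustration of the score itself (the vectors below are made up; the real values come from retrieval.py):

```python
# Illustration only: cosine similarity between two embedding vectors.
import math

def cosine_similarity(a, b):
    """Cosine of the angle between vectors a and b (1.0 = identical direction)."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

query_vec = [0.12, 0.40, 0.31]   # hypothetical query embedding
chunk_vec = [0.10, 0.38, 0.35]   # hypothetical guideline-chunk embedding
print(f"relevance ~ {cosine_similarity(query_vec, chunk_vec):.3f}")  # ~0.996
```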
 
25
  import json
26
  import os
27
  import sys
28
+ from typing import Dict, List, Any, Set
29
  from datetime import datetime
30
  from pathlib import Path
31
  import re
 
48
  sys.exit(1)
49
 
50
 
51
+ class ComprehensiveEvaluator:
52
+ """Comprehensive evaluator for metrics 1-4 - single execution approach"""
53
 
54
  def __init__(self):
55
+ """Initialize system components (identical to app.py)"""
56
+ print("πŸ”§ Initializing Comprehensive Evaluator...")
57
 
58
  # Initialize existing system components (same as app.py)
59
  self.llm_client = llm_Med42_70BClient()
 
64
  )
65
  self.medical_generator = MedicalAdviceGenerator(llm_client=self.llm_client)
66
 
67
+ # Results accumulation for all metrics
68
+ self.comprehensive_results = []
69
+ self.medical_outputs = []
70
+
71
+ print("βœ… Comprehensive Evaluator initialization complete")
72
+
73
+ def extract_medical_keywords(self, text: str) -> Set[str]:
74
+ """Extract medical keywords for coverage analysis"""
75
+ if not text:
76
+ return set()
77
+
78
+ medical_keywords = set()
79
+ text_lower = text.lower()
80
+
81
+ # Medical terminology patterns
82
+ patterns = [
83
+ r'\b[a-z]+(?:osis|itis|pathy|emia|uria|gram|scopy)\b', # Medical suffixes
84
+ r'\b(?:cardio|neuro|pulmo|gastro|hepato|nephro)[a-z]+\b', # Medical prefixes
85
+ r'\b(?:diagnosis|treatment|therapy|intervention|management)\b', # Medical actions
86
+ r'\b(?:patient|symptom|condition|disease|disorder|syndrome)\b', # Medical entities
87
+ r'\b(?:acute|chronic|severe|mild|moderate|emergency)\b', # Medical descriptors
88
+ r'\b[a-z]+(?:al|ic|ous|ive)\s+(?:pain|failure|infection|injury)\b', # Compound terms
89
+ r'\b(?:ecg|ekg|ct|mri|x-ray|ultrasound|biopsy)\b', # Medical procedures
90
+ r'\b\d+\s*(?:mg|ml|units|hours|days|minutes)\b', # Dosages and timeframes
91
+ ]
92
+
93
+ for pattern in patterns:
94
+ matches = re.findall(pattern, text_lower)
95
+ medical_keywords.update(match.strip() for match in matches)
96
+
97
+ # Additional common medical terms
98
+ common_medical_terms = [
99
+ 'blood', 'pressure', 'heart', 'chest', 'pain', 'stroke', 'seizure',
100
+ 'emergency', 'hospital', 'monitor', 'assess', 'evaluate', 'immediate',
101
+ 'protocol', 'guideline', 'recommendation', 'risk', 'factor'
102
+ ]
103
+
104
+ for term in common_medical_terms:
105
+ if term in text_lower:
106
+ medical_keywords.add(term)
107
+
108
+ # Filter out very short terms and common words
109
+ filtered_keywords = {
110
+ kw for kw in medical_keywords
111
+ if len(kw) > 2 and kw not in ['the', 'and', 'for', 'with', 'are', 'can', 'may']
112
  }
113
 
114
+ return filtered_keywords
115
+
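Note: a quick standalone check of the kind of terms the regex patterns in `extract_medical_keywords` are meant to capture. Only three of the patterns are repeated here, and the sample sentence is illustrative:

```python
import re

sample = "Patient with acute myocarditis: order an ECG, start treatment and monitor blood pressure."
patterns = [
    r'\b[a-z]+(?:osis|itis|pathy|emia|uria|gram|scopy)\b',            # suffixes   -> 'myocarditis'
    r'\b(?:diagnosis|treatment|therapy|intervention|management)\b',   # actions    -> 'treatment'
    r'\b(?:ecg|ekg|ct|mri|x-ray|ultrasound|biopsy)\b',                # procedures -> 'ecg'
]
found = set()
for pattern in patterns:
    found.update(re.findall(pattern, sample.lower()))
print(found)  # {'myocarditis', 'treatment', 'ecg'}
```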
116
+ def calculate_coverage_metrics(self, generated_advice: str, retrieval_results: List[Dict]) -> Dict[str, Any]:
117
+ """Calculate coverage metrics from generated advice and retrieval results"""
118
+ if not generated_advice or not retrieval_results:
119
+ return {
120
+ "coverage_score": 0.0,
121
+ "matched_keywords": [],
122
+ "advice_keywords": [],
123
+ "source_keywords": [],
124
+ "coverage_percentage": 0.0,
125
+ "meets_threshold": False
126
+ }
127
+
128
+ # Extract keywords from generated advice
129
+ advice_keywords = self.extract_medical_keywords(generated_advice)
130
+
131
+ # Extract keywords from all retrieved documents
132
+ all_source_keywords = set()
133
+ for doc in retrieval_results:
134
+ doc_content = doc.get('content', '') or doc.get('text', '')
135
+ doc_keywords = self.extract_medical_keywords(doc_content)
136
+ all_source_keywords.update(doc_keywords)
137
+
138
+ # Calculate coverage
139
+ matched_keywords = advice_keywords.intersection(all_source_keywords)
140
+ coverage_score = len(matched_keywords) / len(all_source_keywords) if all_source_keywords else 0.0
141
 
142
+ return {
143
+ "coverage_score": coverage_score,
144
+ "matched_keywords": list(matched_keywords),
145
+ "advice_keywords": list(advice_keywords),
146
+ "source_keywords": list(all_source_keywords),
147
+ "advice_keywords_count": len(advice_keywords),
148
+ "source_keywords_count": len(all_source_keywords),
149
+ "matched_keywords_count": len(matched_keywords),
150
+ "coverage_percentage": coverage_score * 100,
151
+ "meets_threshold": coverage_score >= 0.6
152
+ }
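Note: the coverage score computed above is a plain keyword-set overlap, |advice ∩ sources| / |sources|. A toy walk-through with made-up keyword sets:

```python
# Toy numbers only; the real sets come from extract_medical_keywords().
advice_keywords = {"aspirin", "ecg", "chest", "pain", "monitor"}
source_keywords = {"aspirin", "ecg", "chest", "pain", "troponin",
                   "oxygen", "monitor", "nitroglycerin"}

matched = advice_keywords & source_keywords        # 5 shared terms
coverage = len(matched) / len(source_keywords)     # 5 / 8 = 0.625
print(f"coverage = {coverage:.3f}, meets 0.6 threshold: {coverage >= 0.6}")  # 0.625, True
```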
153
 
154
+ def evaluate_single_query_comprehensive(self, query: str, category: str = "unknown") -> Dict[str, Any]:
155
  """
156
+ Comprehensive evaluation for single query - collects all metrics 1-4 data
157
 
158
+ Replicates app.py's process_medical_query pipeline exactly
159
 
160
  Args:
161
  query: Medical query to test
162
  category: Query category (diagnosis/treatment/mixed)
163
  """
164
+ print(f"πŸ” Comprehensive evaluation: {query[:50]}...")
165
  print(f"πŸ“‹ Category: {category}")
166
 
167
  overall_start = time.time()
168
  timing_details = {}
169
 
170
  try:
171
+ # STEP 1: Query Processing and Condition Extraction (identical to app.py)
172
  step1_start = time.time()
173
  condition_result = self.user_prompt_processor.extract_condition_keywords(query)
174
+ step1_time = time.time() - step1_start
175
+ timing_details['step1_condition_extraction'] = step1_time
176
 
177
+ print(f" Step 1 - Condition extraction: {step1_time:.3f}s")
178
  print(f" Extracted condition: {condition_result.get('condition', 'None')}")
179
 
180
  # Check if valid medical query
181
  if condition_result.get('query_status') in ['invalid_query', 'non_medical']:
182
  total_time = time.time() - overall_start
183
+ return self._create_failed_result(query, category, total_time, timing_details,
184
+ "non_medical", condition_result)
185
 
186
+ # STEP 2: User Confirmation (simulate auto-confirmation)
187
  step2_start = time.time()
188
  confirmation = self.user_prompt_processor.handle_user_confirmation(condition_result)
189
+ step2_time = time.time() - step2_start
190
+ timing_details['step2_confirmation'] = step2_time
191
 
192
+ if not condition_result.get('condition'):
193
+ total_time = time.time() - overall_start
194
+ return self._create_failed_result(query, category, total_time, timing_details,
195
+ "no_condition", condition_result)
196
 
197
+ # STEP 3: Medical Guidelines Retrieval (identical to app.py)
198
  step3_start = time.time()
199
 
200
  search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
 
202
  search_query = condition_result.get('condition', query)
203
 
204
  retrieval_results = self.retrieval_system.search(search_query, top_k=5)
205
+ step3_time = time.time() - step3_start
206
+ timing_details['step3_retrieval'] = step3_time
207
 
208
+ processed_results = retrieval_results.get('processed_results', [])
209
+ print(f" Step 3 - Retrieval: {step3_time:.3f}s ({len(processed_results)} results)")
210
 
211
+ # STEP 4: Medical Advice Generation (identical to app.py)
212
  step4_start = time.time()
213
 
214
  intention = self._detect_query_intention(query)
 
217
  retrieval_results=retrieval_results,
218
  intention=intention
219
  )
220
+ step4_time = time.time() - step4_start
221
+ timing_details['step4_generation'] = step4_time
222
+
223
+ generated_advice = medical_advice_result.get('medical_advice', '')
224
+ confidence_score = medical_advice_result.get('confidence_score', 0.0)
225
 
226
+ print(f" Step 4 - Generation: {step4_time:.3f}s")
227
 
228
  total_time = time.time() - overall_start
229
 
230
+ # METRIC 2: Condition Extraction Analysis
231
+ extraction_success = (
232
+ condition_result.get('condition') and
233
+ condition_result.get('condition') != "unknown" and
234
+ condition_result.get('query_status') not in ['invalid_query', 'non_medical']
235
+ )
236
+
237
+ extraction_metrics = {
238
+ "extraction_success": extraction_success,
239
+ "extracted_condition": condition_result.get('condition'),
240
+ "query_status": condition_result.get('query_status'),
241
+ "emergency_keywords": condition_result.get('emergency_keywords', []),
242
+ "treatment_keywords": condition_result.get('treatment_keywords', []),
243
+ "fallback_level": condition_result.get('fallback_level', 'unknown'),
244
+ "extraction_time": step1_time
245
+ }
246
+
247
+ # METRIC 3: Retrieval Relevance Analysis
248
+ if processed_results:
249
+ similarity_scores = []
250
+ for doc_result in processed_results:
251
+ similarity = (
252
+ doc_result.get('distance', 0.0) or
253
+ doc_result.get('similarity_score', 0.0) or
254
+ doc_result.get('score', 0.0)
255
+ )
256
+ similarity_scores.append(similarity)
257
+
258
+ average_relevance = sum(similarity_scores) / len(similarity_scores)
259
+ high_relevance_count = sum(1 for score in similarity_scores if score >= 0.2)
260
+
261
+ relevance_metrics = {
262
+ "average_relevance": average_relevance,
263
+ "max_relevance": max(similarity_scores),
264
+ "min_relevance": min(similarity_scores),
265
+ "similarity_scores": similarity_scores,
266
+ "high_relevance_count": high_relevance_count,
267
+ "high_relevance_ratio": high_relevance_count / len(similarity_scores),
268
+ "retrieved_count": len(processed_results),
269
+ "meets_threshold": average_relevance >= 0.2,
270
+ "retrieval_time": step3_time
271
+ }
272
+ else:
273
+ relevance_metrics = {
274
+ "average_relevance": 0.0,
275
+ "max_relevance": 0.0,
276
+ "min_relevance": 0.0,
277
+ "similarity_scores": [],
278
+ "high_relevance_count": 0,
279
+ "high_relevance_ratio": 0.0,
280
+ "retrieved_count": 0,
281
+ "meets_threshold": False,
282
+ "retrieval_time": step3_time
283
+ }
284
+
285
+ # METRIC 4: Retrieval Coverage Analysis
286
+ coverage_metrics = self.calculate_coverage_metrics(generated_advice, processed_results)
287
+ coverage_metrics["generation_time"] = step4_time
288
 
289
+ # Create comprehensive result
290
+ comprehensive_result = {
291
  "query": query,
292
  "category": category,
293
+
294
+ # Metric 1: Total Latency - Complete pipeline processing time
295
+ "latency_metrics": {
296
+ "total_latency": total_time,
297
+ "timing_details": timing_details,
298
+ "meets_target": total_time <= 30.0
299
+ },
300
+
301
+ # Metric 2: Condition Extraction - Success rate from user_prompt.py
302
+ "extraction_metrics": extraction_metrics,
303
+
304
+ # Metric 3: Retrieval Relevance - Cosine similarity from retrieval.py
305
+ "relevance_metrics": relevance_metrics,
306
+
307
+ # Metric 4: Retrieval Coverage - Advice utilization of retrieved content
308
+ "coverage_metrics": coverage_metrics,
309
+
310
+ # Complete pipeline data (for debugging and detailed analysis)
311
+ "pipeline_data": {
312
+ "condition_result": condition_result,
313
+ "retrieval_results": retrieval_results,
314
+ "medical_advice_result": medical_advice_result,
315
+ "search_query": search_query,
316
+ "intention": intention
317
+ },
318
+
319
+ "overall_success": True,
320
  "timestamp": datetime.now().isoformat()
321
  }
322
 
323
+ # Store result
324
+ self.comprehensive_results.append(comprehensive_result)
325
+
326
+ # Store medical output for model comparison
327
  medical_output = {
328
  "query": query,
329
  "category": category,
330
+ "medical_advice": generated_advice,
331
  "confidence_score": confidence_score,
332
  "query_id": f"{category}_query",
333
  "processing_time": total_time,
334
  "timestamp": datetime.now().isoformat()
335
  }
 
336
  self.medical_outputs.append(medical_output)
337
 
338
+ print(f"βœ… Comprehensive evaluation completed in {total_time:.2f}s")
339
+ print(f" πŸ“Š Metrics: Latency={total_time:.2f}s, Extraction={'βœ…' if extraction_success else '❌'}, "
340
+ f"Relevance={average_relevance:.3f}, Coverage={coverage_metrics['coverage_score']:.3f}")
341
 
342
+ return comprehensive_result
343
 
344
  except Exception as e:
345
  total_time = time.time() - overall_start
346
+ print(f"❌ Comprehensive evaluation failed after {total_time:.2f}s: {e}")
347
 
348
+ return self._create_failed_result(query, category, total_time, timing_details, "error", None, str(e))
349
+
350
+ def _create_failed_result(self, query: str, category: str, total_time: float,
351
+ timing_details: Dict, status: str, condition_result: Dict = None,
352
+ error: str = None) -> Dict[str, Any]:
353
+ """Create standardized failed result"""
354
+ failed_result = {
355
+ "query": query,
356
+ "category": category,
357
+
358
+ # Metric 1: Total Latency - Always measurable even on failure
359
+ "latency_metrics": {
360
  "total_latency": total_time,
361
  "timing_details": timing_details,
362
+ "meets_target": total_time <= 30.0
363
+ },
364
+
365
+ # Metric 2: Condition Extraction - Partial data may be available before failure
366
+ "extraction_metrics": {
367
+ "extraction_success": False,
368
+ "extracted_condition": condition_result.get('condition') if condition_result else None,
369
+ "query_status": condition_result.get('query_status') if condition_result else status,
370
+ "extraction_time": timing_details.get('step1_condition_extraction', 0.0)
371
+ },
372
+
373
+ # Metric 3: Retrieval Relevance - Failed due to pipeline failure
374
+ "relevance_metrics": {
375
+ "average_relevance": 0.0,
376
+ "retrieved_count": 0,
377
+ "meets_threshold": False,
378
+ "retrieval_time": timing_details.get('step3_retrieval', 0.0)
379
+ },
380
+
381
+ # Metric 4: Retrieval Coverage - Failed due to pipeline failure
382
+ "coverage_metrics": {
383
+ "coverage_score": 0.0,
384
+ "meets_threshold": False,
385
+ "generation_time": timing_details.get('step4_generation', 0.0)
386
+ },
387
+
388
+ # Note: Metrics 5-6 (Clinical Actionability & Evidence Quality)
389
+ # are not collected here - they require separate LLM evaluation
390
+ # using the medical_outputs saved by this evaluator
391
+
392
+ "overall_success": False,
393
+ "status": status,
394
+ "error": error,
395
+ "timestamp": datetime.now().isoformat()
396
+ }
397
+
398
+ self.comprehensive_results.append(failed_result)
399
+ return failed_result
400
 
401
+ def _detect_query_intention(self, query: str) -> str:
402
+ """Simplified query intention detection (from app.py)"""
403
+ query_lower = query.lower()
404
 
405
+ if any(word in query_lower for word in ['diagnos', 'differential', 'possible', 'causes']):
406
+ return 'diagnosis'
407
+ elif any(word in query_lower for word in ['treat', 'manage', 'therapy', 'intervention']):
408
+ return 'treatment'
409
+ else:
410
+ return 'mixed'
411
+ def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
412
+ """Parse queries from file with category labels"""
413
  print(f"πŸ“ Reading queries from file: {filepath}")
414
 
415
  try:
 
451
  print(f"πŸ“‹ Parsed queries by category:")
452
  for category, category_queries in queries_by_category.items():
453
  print(f" {category.capitalize()}: {len(category_queries)} queries")
 
 
454
 
455
  return queries_by_category
456
 
 
458
  print(f"❌ Failed to read file: {e}")
459
  return {"error": f"Failed to read file: {e}"}
460
 
461
+ def calculate_metric_statistics(self, metric_name: str) -> Dict[str, Any]:
462
+ """Calculate statistics for a specific metric across all results"""
463
+ category_stats = {}
464
+ all_successful_results = []
465
 
466
+ # Group results by category
467
+ results_by_category = {
468
+ "diagnosis": [],
469
+ "treatment": [],
470
+ "mixed": []
471
+ }
472
+
473
+ for result in self.comprehensive_results:
474
+ category = result.get('category', 'unknown')
475
+ if category in results_by_category:
476
+ results_by_category[category].append(result)
477
+ if result.get('overall_success'):
478
+ all_successful_results.append(result)
479
+
480
+ # Calculate statistics for each category based on metric type
481
+ for category, results in results_by_category.items():
482
+ successful_results = [r for r in results if r.get('overall_success')]
483
+
484
+ if metric_name == "latency":
485
+ if successful_results:
486
+ latencies = [r['latency_metrics']['total_latency'] for r in successful_results]
487
+ category_stats[category] = {
488
+ "average_latency": sum(latencies) / len(latencies),
489
+ "std_deviation": self._calculate_std(latencies),
490
+ "min_latency": min(latencies),
491
+ "max_latency": max(latencies),
492
+ "query_count": len(latencies),
493
+ "target_compliance": sum(1 for lat in latencies if lat <= 30.0) / len(latencies),
494
+ "individual_latencies": latencies
495
+ }
496
+ else:
497
+ category_stats[category] = self._get_empty_latency_stats()
498
+
499
+ elif metric_name == "extraction":
500
+ extraction_successes = [r['extraction_metrics']['extraction_success'] for r in results]
501
+ successful_extractions = sum(extraction_successes)
502
+
503
+ category_stats[category] = {
504
+ "success_rate": successful_extractions / len(results) if results else 0.0,
505
+ "successful_count": successful_extractions,
506
+ "total_count": len(results),
507
+ "average_extraction_time": sum(r['extraction_metrics']['extraction_time'] for r in results) / len(results) if results else 0.0,
508
+ "meets_threshold": (successful_extractions / len(results)) >= 0.8 if results else False
509
+ }
510
+
511
+ elif metric_name == "relevance":
512
+ if successful_results:
513
+ relevance_scores = [r['relevance_metrics']['average_relevance'] for r in successful_results]
514
+ category_stats[category] = {
515
+ "average_relevance": sum(relevance_scores) / len(relevance_scores),
516
+ "max_relevance": max(relevance_scores),
517
+ "min_relevance": min(relevance_scores),
518
+ "successful_retrievals": len(successful_results),
519
+ "total_queries": len(results),
520
+ "meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.2,
521
+ "individual_relevance_scores": relevance_scores
522
+ }
523
+ else:
524
+ category_stats[category] = self._get_empty_relevance_stats(len(results))
525
+
526
+ elif metric_name == "coverage":
527
+ if successful_results:
528
+ coverage_scores = [r['coverage_metrics']['coverage_score'] for r in successful_results]
529
+ category_stats[category] = {
530
+ "average_coverage": sum(coverage_scores) / len(coverage_scores),
531
+ "max_coverage": max(coverage_scores),
532
+ "min_coverage": min(coverage_scores),
533
+ "successful_evaluations": len(successful_results),
534
+ "total_queries": len(results),
535
+ "meets_threshold": (sum(coverage_scores) / len(coverage_scores)) >= 0.6,
536
+ "individual_coverage_scores": coverage_scores
537
+ }
538
+ else:
539
+ category_stats[category] = self._get_empty_coverage_stats(len(results))
540
+
541
+ # Calculate overall statistics
542
+ overall_stats = self._calculate_overall_stats(metric_name, all_successful_results)
543
+
544
+ return {
545
+ "category_results": category_stats,
546
+ "overall_results": overall_stats,
547
+ "timestamp": datetime.now().isoformat()
548
+ }
549
+
550
+ def _calculate_std(self, values: List[float]) -> float:
551
+ """Calculate standard deviation"""
552
+ if len(values) < 2:
553
+ return 0.0
554
+
555
+ mean = sum(values) / len(values)
556
+ variance = sum((x - mean) ** 2 for x in values) / len(values)
557
+ return variance ** 0.5
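Note: `_calculate_std` divides by `len(values)`, i.e. it computes the population standard deviation. A quick cross-check against the standard library (the sample values are arbitrary):

```python
import statistics

values = [12.1, 15.4, 9.8, 13.0]                        # arbitrary latencies in seconds
mean = sum(values) / len(values)
manual = (sum((x - mean) ** 2 for x in values) / len(values)) ** 0.5
assert abs(manual - statistics.pstdev(values)) < 1e-9   # matches pstdev, not stdev
print(round(manual, 3))  # ~2.005
```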
558
 
559
+ def _get_empty_latency_stats(self) -> Dict[str, Any]:
560
+ """Return empty latency statistics"""
561
+ return {
562
+ "average_latency": 0.0,
563
+ "std_deviation": 0.0,
564
+ "min_latency": 0.0,
565
+ "max_latency": 0.0,
566
+ "query_count": 0,
567
+ "target_compliance": 0.0,
568
+ "individual_latencies": []
569
+ }
570
+
571
+ def _get_empty_relevance_stats(self, total_queries: int) -> Dict[str, Any]:
572
+ """Return empty relevance statistics"""
573
+ return {
574
+ "average_relevance": 0.0,
575
+ "max_relevance": 0.0,
576
+ "min_relevance": 0.0,
577
+ "successful_retrievals": 0,
578
+ "total_queries": total_queries,
579
+ "meets_threshold": False,
580
+ "individual_relevance_scores": []
581
+ }
582
+
583
+ def _get_empty_coverage_stats(self, total_queries: int) -> Dict[str, Any]:
584
+ """Return empty coverage statistics"""
585
+ return {
586
+ "average_coverage": 0.0,
587
+ "max_coverage": 0.0,
588
+ "min_coverage": 0.0,
589
+ "successful_evaluations": 0,
590
+ "total_queries": total_queries,
591
+ "meets_threshold": False,
592
+ "individual_coverage_scores": []
593
+ }
594
+
595
+ def _calculate_overall_stats(self, metric_name: str, all_successful_results: List[Dict]) -> Dict[str, Any]:
596
+ """Calculate overall statistics for a specific metric"""
597
+ total_queries = len(self.comprehensive_results)
598
+
599
+ if metric_name == "latency" and all_successful_results:
600
+ latencies = [r['latency_metrics']['total_latency'] for r in all_successful_results]
601
+ return {
602
+ "average_latency": sum(latencies) / len(latencies),
603
+ "std_deviation": self._calculate_std(latencies),
604
+ "min_latency": min(latencies),
605
+ "max_latency": max(latencies),
606
+ "successful_queries": len(all_successful_results),
607
+ "total_queries": total_queries,
608
+ "target_compliance": sum(1 for lat in latencies if lat <= 30.0) / len(latencies)
609
+ }
610
+
611
+ elif metric_name == "extraction":
612
+ all_extractions = [r['extraction_metrics']['extraction_success'] for r in self.comprehensive_results]
613
+ successful_extractions = sum(all_extractions)
614
+ return {
615
+ "success_rate": successful_extractions / len(all_extractions) if all_extractions else 0.0,
616
+ "successful_count": successful_extractions,
617
+ "total_count": len(all_extractions),
618
+ "target_compliance": (successful_extractions / len(all_extractions)) >= 0.8 if all_extractions else False
619
+ }
620
+
621
+ elif metric_name == "relevance" and all_successful_results:
622
+ relevance_scores = [r['relevance_metrics']['average_relevance'] for r in all_successful_results]
623
+ return {
624
+ "average_relevance": sum(relevance_scores) / len(relevance_scores),
625
+ "max_relevance": max(relevance_scores),
626
+ "min_relevance": min(relevance_scores),
627
+ "successful_queries": len(all_successful_results),
628
+ "total_queries": total_queries,
629
+ "meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.2,
630
+ "target_compliance": (sum(relevance_scores) / len(relevance_scores)) >= 0.25
631
+ }
632
+
633
+ elif metric_name == "coverage" and all_successful_results:
634
+ coverage_scores = [r['coverage_metrics']['coverage_score'] for r in all_successful_results]
635
+ return {
636
+ "average_coverage": sum(coverage_scores) / len(coverage_scores),
637
+ "max_coverage": max(coverage_scores),
638
+ "min_coverage": min(coverage_scores),
639
+ "successful_queries": len(all_successful_results),
640
+ "total_queries": total_queries,
641
+ "meets_threshold": (sum(coverage_scores) / len(coverage_scores)) >= 0.6
642
+ }
643
+
644
+ # Return empty stats for failed cases
645
+ return {
646
+ "average_value": 0.0,
647
+ "successful_queries": len(all_successful_results),
648
+ "total_queries": total_queries,
649
+ "meets_threshold": False
650
+ }
651
+ def save_all_metric_statistics(self) -> Dict[str, str]:
652
+ """Save separate statistics files for each metric"""
653
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
654
+
655
+ # Ensure results directory exists
656
+ results_dir = Path(__file__).parent / "results"
657
+ results_dir.mkdir(exist_ok=True)
658
+
659
+ saved_files = {}
660
+
661
+ # Save statistics for each metric
662
+ for metric_name in ["latency", "extraction", "relevance", "coverage"]:
663
+ stats = self.calculate_metric_statistics(metric_name)
664
+ filename = f"{metric_name}_statistics_{timestamp}.json"
665
+ filepath = results_dir / filename
666
+
667
+ with open(filepath, 'w', encoding='utf-8') as f:
668
+ json.dump(stats, f, indent=2, ensure_ascii=False)
669
+
670
+ saved_files[metric_name] = str(filepath)
671
+ print(f"πŸ“Š {metric_name.capitalize()} statistics saved to: {filepath}")
672
+
673
+ return saved_files
674
+
675
+ def save_medical_outputs(self, filename: str = None) -> str:
676
+ """Save medical advice outputs for model comparison"""
677
  if filename is None:
678
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
679
+ filename = f"medical_outputs_{timestamp}.json"
 
680
 
681
  # Ensure results directory exists
682
  results_dir = Path(__file__).parent / "results"
 
684
 
685
  filepath = results_dir / filename
686
 
687
+ # Create comprehensive output data
688
+ output_data = {
689
+ "evaluation_metadata": {
690
+ "total_outputs": len(self.medical_outputs),
691
+ "categories": list(set(output['category'] for output in self.medical_outputs)),
692
+ "timestamp": datetime.now().isoformat(),
693
+ "model_type": "Med42-70B_RAG_enhanced" # For future comparison
694
+ },
695
+ "medical_outputs": self.medical_outputs
696
+ }
697
+
698
  with open(filepath, 'w', encoding='utf-8') as f:
699
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
700
 
701
+ print(f"πŸ“ Medical outputs saved to: {filepath}")
702
+ return str(filepath)
703
+
704
+ def save_comprehensive_details(self, filename: str = None) -> str:
705
+ """Save comprehensive detailed results"""
706
+ if filename is None:
707
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
708
+ filename = f"comprehensive_details_{timestamp}.json"
709
+
710
+ # Ensure results directory exists
711
+ results_dir = Path(__file__).parent / "results"
712
+ results_dir.mkdir(exist_ok=True)
713
+
714
+ filepath = results_dir / filename
715
+
716
+ # Create comprehensive evaluation data
717
+ comprehensive_data = {
718
+ "evaluation_metadata": {
719
+ "total_queries": len(self.comprehensive_results),
720
+ "successful_queries": len([r for r in self.comprehensive_results if r.get('overall_success')]),
721
+ "timestamp": datetime.now().isoformat(),
722
+ "evaluator_type": "comprehensive_metrics_1_to_4",
723
+ "metrics_evaluated": ["latency", "extraction", "relevance", "coverage"]
724
+ },
725
+ "comprehensive_results": self.comprehensive_results
726
+ }
727
+
728
+ with open(filepath, 'w', encoding='utf-8') as f:
729
+ json.dump(comprehensive_data, f, indent=2, ensure_ascii=False)
730
+
731
+ print(f"πŸ“‹ Comprehensive details saved to: {filepath}")
732
  return str(filepath)
733
 
734
 
735
  # Independent execution interface
736
  if __name__ == "__main__":
737
+ """Independent comprehensive evaluation interface"""
738
 
739
+ print("πŸš€ OnCall.ai Comprehensive Evaluator - Metrics 1-4 in Single Run")
740
 
741
  if len(sys.argv) > 1:
742
  query_file = sys.argv[1]
 
750
  sys.exit(1)
751
 
752
  # Initialize evaluator
753
+ evaluator = ComprehensiveEvaluator()
754
 
755
  # Parse queries from file
756
+ queries_by_category = evaluator.parse_queries_from_file(str(query_file))
757
 
758
  if "error" in queries_by_category:
759
  print(f"❌ Failed to parse queries: {queries_by_category['error']}")
760
  sys.exit(1)
761
 
762
+ # Test each query comprehensively
763
+ print(f"\nπŸ§ͺ Comprehensive Evaluation - All Metrics in Single Run")
764
+ print(f"πŸ“Š Collecting metrics 1-4 from single app.py pipeline execution")
765
 
766
  for category, queries in queries_by_category.items():
767
  if not queries:
 
774
  print(f"\nπŸ” Query {i+1}/{len(queries)} in {category} category:")
775
  print(f" Text: {query_text}")
776
 
777
+ # Comprehensive evaluation (collects all metrics 1-4)
778
+ result = evaluator.evaluate_single_query_comprehensive(query_text, category)
779
 
780
  # Pause between queries to avoid rate limits
781
+ if i < len(queries) - 1:
782
  print(f" ⏳ Pausing 5s before next query...")
783
  time.sleep(5)
784
 
785
  # Longer pause between categories
786
+ if category != list(queries_by_category.keys())[-1]:
787
  print(f"\n⏳ Pausing 10s before next category...")
788
  time.sleep(10)
789
 
790
+ # Generate and save all metric statistics
791
+ print(f"\nπŸ“Š Generating comprehensive analysis for all metrics...")
792
 
793
+ # Save separate statistics for each metric
794
+ saved_stats = evaluator.save_all_metric_statistics()
795
 
796
  # Save medical outputs for model comparison
797
  outputs_path = evaluator.save_medical_outputs()
798
 
799
+ # Save comprehensive details
800
+ details_path = evaluator.save_comprehensive_details()
801
 
802
+ # Print comprehensive summary
803
+ print(f"\nπŸ“Š === COMPREHENSIVE EVALUATION SUMMARY ===")
804
 
805
+ for metric_name in ["latency", "extraction", "relevance", "coverage"]:
806
+ stats = evaluator.calculate_metric_statistics(metric_name)
807
+ overall_results = stats['overall_results']
 
 
808
 
809
+ print(f"\n{metric_name.upper()} METRICS:")
 
 
810
 
811
+ if metric_name == "latency":
812
+ print(f" Average: {overall_results['average_latency']:.2f}s (Β±{overall_results['std_deviation']:.2f})")
813
+ print(f" 30s Target: {'βœ… Met' if overall_results['target_compliance'] >= 0.8 else '❌ Not Met'}")
814
 
815
+ elif metric_name == "extraction":
816
+ print(f" Success Rate: {overall_results['success_rate']:.1%}")
817
+ print(f" 80% Target: {'βœ… Met' if overall_results['target_compliance'] else '❌ Not Met'}")
818
 
819
+ elif metric_name == "relevance":
820
+ print(f" Average Relevance: {overall_results['average_relevance']:.3f}")
821
+ print(f" 0.25 Target: {'βœ… Met' if overall_results.get('target_compliance', False) else '❌ Not Met'}")
822
 
823
+ elif metric_name == "coverage":
824
+ print(f" Average Coverage: {overall_results['average_coverage']:.3f} ({overall_results['average_coverage']*100:.1f}%)")
825
+ print(f" 60% Target: {'βœ… Met' if overall_results['meets_threshold'] else '❌ Not Met'}")
826
 
827
+ print(f"\nβœ… Comprehensive evaluation complete! Files saved:")
828
+ for metric_name, filepath in saved_stats.items():
829
+ print(f" πŸ“Š {metric_name.capitalize()}: {filepath}")
830
+ print(f" πŸ“ Medical Outputs: {outputs_path}")
831
+ print(f" πŸ“‹ Comprehensive Details: {details_path}")
832
+ print(f"\nπŸ’‘ Next step: Run chart generators for individual metrics")
833
+ print(f" python latency_chart_generator.py")
834
+ print(f" python extraction_chart_generator.py # (create separately)")
835
+ print(f" python relevance_chart_generator.py # (create separately)")
836
+ print(f" python coverage_chart_generator.py # (create separately)")
 
evaluation/{coverage_evaluator.py → old/coverage_evaluator.py} RENAMED
File without changes
evaluation/{extraction_evaluator.py → old/extraction_evaluator.py} RENAMED
File without changes
evaluation/{relevance_evaluator.py → old/relevance_evaluator.py} RENAMED
File without changes