Jeff Ma committed on
Commit 0e255cb · 2 Parent(s): f3eba79 03afbd6

Merge pull request #12 from YanBoChen0928/Merged20250805
evaluation/direct_llm_evaluator.py ADDED
@@ -0,0 +1,419 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Direct LLM Evaluator (Med42-70B Only)
4
+ ========================================================
5
+
6
+ Tests Med42-70B directly without RAG pipeline.
7
+ Only applicable metrics: 1 (Latency), 5 (Actionability), 6 (Evidence Quality)
8
+
9
+ Metrics 2-4 (Extraction, Relevance, Coverage) are not applicable for direct LLM.
10
+
11
+ Author: YanBo Chen
12
+ Date: 2025-08-04
13
+ """
14
+
15
+ import time
16
+ import json
17
+ import os
18
+ import sys
19
+ from typing import Dict, List, Any
20
+ from datetime import datetime
21
+ from pathlib import Path
22
+ import re
23
+
24
+ # Add project path
25
+ current_dir = Path(__file__).parent
26
+ project_root = current_dir.parent
27
+ src_dir = project_root / "src"
28
+ sys.path.insert(0, str(src_dir))
29
+
30
+ # Import LLM client only (no retrieval system needed)
31
+ try:
32
+ from llm_clients import llm_Med42_70BClient
33
+ except ImportError as e:
34
+ print(f"❌ Import failed: {e}")
35
+ print("Please ensure running from project root directory")
36
+ sys.exit(1)
37
+
38
+
39
+ class DirectLLMEvaluator:
40
+ """Direct LLM evaluation without RAG pipeline"""
41
+
42
+ def __init__(self):
43
+ """Initialize direct LLM client only"""
44
+ print("🔧 Initializing Direct LLM Evaluator...")
45
+
46
+ # Initialize only LLM client (no retrieval, no user_prompt processing)
47
+ self.llm_client = llm_Med42_70BClient()
48
+
49
+ # Results accumulation
50
+ self.direct_results = []
51
+ self.medical_outputs = []
52
+
53
+ print("✅ Direct LLM Evaluator initialization complete")
54
+
55
+ def evaluate_direct_llm_query(self, query: str, category: str = "unknown") -> Dict[str, Any]:
56
+ """
57
+ Direct LLM evaluation for single query
58
+
59
+ Only tests direct LLM response without RAG pipeline
60
+ Applicable metrics: 1 (Latency), 5-6 (via medical output)
61
+
62
+ Args:
63
+ query: Medical query to test
64
+ category: Query category (diagnosis/treatment/mixed)
65
+ """
66
+ print(f"🔍 Direct LLM evaluation: {query[:50]}...")
67
+ print(f"📋 Category: {category}")
68
+
69
+ overall_start = time.time()
70
+
71
+ try:
72
+ # Direct LLM call without any RAG processing
73
+ llm_start = time.time()
74
+
75
+ # Create direct medical consultation prompt
76
+ direct_prompt = f"""
77
+ You are a medical expert providing clinical guidance.
78
+
79
+ Patient Query: {query}
80
+
81
+ Please provide comprehensive medical advice including:
82
+ 1. Differential diagnosis (if applicable)
83
+ 2. Immediate assessment steps
84
+ 3. Treatment recommendations
85
+ 4. Clinical considerations
86
+
87
+ Provide evidence-based, actionable medical guidance.
88
+ """
89
+
90
+ # Direct LLM generation (same parameters as RAG system for fair comparison)
91
+ response = self.llm_client.analyze_medical_query(
92
+ query=direct_prompt,
93
+ max_tokens=1600, # Same as RAG system primary setting
94
+ timeout=60.0 # Increased timeout for stable evaluation
95
+ )
96
+ # Extract medical advice from response (Med42 client returns dict with 'raw_response')
97
+ if isinstance(response, dict):
98
+ medical_advice = response.get('raw_response', '') or response.get('content', '')
99
+ else:
100
+ medical_advice = str(response)
101
+
102
+ llm_time = time.time() - llm_start
103
+ total_time = time.time() - overall_start
104
+
105
+ # Check if response is valid (not empty) - focus on content, not timeout
106
+ if not medical_advice or len(medical_advice.strip()) == 0:
107
+ print(f"❌ Direct LLM returned empty response after {total_time:.2f}s")
108
+ raise ValueError("Empty response from LLM - no content generated")
109
+
110
+ # Create result
111
+ result = {
112
+ "query": query,
113
+ "category": category,
114
+
115
+ # Metric 1: Total Latency (direct LLM call time)
116
+ "latency_metrics": {
117
+ "total_latency": total_time,
118
+ "llm_generation_time": llm_time,
119
+ "meets_target": total_time <= 60.0
120
+ },
121
+
122
+ # Metrics 2-4: Not applicable for direct LLM
123
+ "extraction_metrics": {
124
+ "not_applicable": True,
125
+ "reason": "No extraction pipeline in direct LLM"
126
+ },
127
+ "relevance_metrics": {
128
+ "not_applicable": True,
129
+ "reason": "No retrieval pipeline in direct LLM"
130
+ },
131
+ "coverage_metrics": {
132
+ "not_applicable": True,
133
+ "reason": "No retrieval content to cover"
134
+ },
135
+
136
+ # Medical advice for metrics 5-6 evaluation
137
+ "medical_advice": medical_advice,
138
+ "advice_length": len(medical_advice),
139
+
140
+ "overall_success": True,
141
+ "model_type": "Med42-70B_direct",
142
+ "timestamp": datetime.now().isoformat()
143
+ }
144
+
145
+ # Store result
146
+ self.direct_results.append(result)
147
+
148
+ # Store medical output for LLM judge evaluation
149
+ medical_output = {
150
+ "query": query,
151
+ "category": category,
152
+ "medical_advice": medical_advice,
153
+ "query_id": f"{category}_query_direct",
154
+ "model_type": "Med42-70B_direct",
155
+ "processing_time": total_time,
156
+ "timestamp": datetime.now().isoformat()
157
+ }
158
+ self.medical_outputs.append(medical_output)
159
+
160
+ print(f"✅ Direct LLM completed in {total_time:.2f}s")
161
+ print(f"📝 Generated advice: {len(medical_advice)} characters")
162
+
163
+ return result
164
+
165
+ except Exception as e:
166
+ total_time = time.time() - overall_start
167
+ print(f"❌ Direct LLM evaluation failed after {total_time:.2f}s: {e}")
168
+
169
+ error_result = {
170
+ "query": query,
171
+ "category": category,
172
+ "latency_metrics": {
173
+ "total_latency": total_time,
174
+ "meets_target": False
175
+ },
176
+ "overall_success": False,
177
+ "error": str(e),
178
+ "model_type": "Med42-70B_direct",
179
+ "timestamp": datetime.now().isoformat()
180
+ }
181
+
182
+ self.direct_results.append(error_result)
183
+
184
+ # Do NOT add failed queries to medical_outputs for judge evaluation
185
+ # Only successful queries with valid medical advice should be evaluated
186
+
187
+ return error_result
188
+
189
+ def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
190
+ """Parse queries from file with category labels"""
191
+ print(f"📁 Reading queries from file: {filepath}")
192
+
193
+ try:
194
+ with open(filepath, 'r', encoding='utf-8') as f:
195
+ content = f.read()
196
+
197
+ queries_by_category = {
198
+ "diagnosis": [],
199
+ "treatment": [],
200
+ "mixed": []
201
+ }
202
+
203
+ lines = content.strip().split('\n')
204
+
205
+ for line in lines:
206
+ line = line.strip()
207
+ if not line:
208
+ continue
209
+
210
+ match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
211
+ if match:
212
+ category_raw = match.group(1).lower()
213
+ query_text = match.group(2).strip()
214
+
215
+ if category_raw in ['mixed/complicated', 'mixed']:
216
+ category = 'mixed'
217
+ else:
218
+ category = category_raw
219
+
220
+ if category in queries_by_category and len(query_text) > 15:
221
+ queries_by_category[category].append({
222
+ "text": query_text,
223
+ "category": category
224
+ })
225
+
226
+ print(f"📋 Parsed queries by category:")
227
+ for category, category_queries in queries_by_category.items():
228
+ print(f" {category.capitalize()}: {len(category_queries)} queries")
229
+
230
+ return queries_by_category
231
+
232
+ except Exception as e:
233
+ print(f"❌ Failed to read file: {e}")
234
+ return {"error": f"Failed to read file: {e}"}
235
+
236
+ def calculate_direct_llm_statistics(self) -> Dict[str, Any]:
237
+ """Calculate statistics for direct LLM evaluation"""
238
+ successful_results = [r for r in self.direct_results if r.get('overall_success')]
239
+
240
+ if successful_results:
241
+ latencies = [r['latency_metrics']['total_latency'] for r in successful_results]
242
+
243
+ # Category-wise statistics
244
+ category_stats = {}
245
+ results_by_category = {"diagnosis": [], "treatment": [], "mixed": []}
246
+
247
+ for result in successful_results:
248
+ category = result.get('category', 'unknown')
249
+ if category in results_by_category:
250
+ results_by_category[category].append(result)
251
+
252
+ for category, results in results_by_category.items():
253
+ if results:
254
+ cat_latencies = [r['latency_metrics']['total_latency'] for r in results]
255
+ category_stats[category] = {
256
+ "average_latency": sum(cat_latencies) / len(cat_latencies),
257
+ "query_count": len(cat_latencies),
258
+ "target_compliance": sum(1 for lat in cat_latencies if lat <= 60.0) / len(cat_latencies)
259
+ }
260
+ else:
261
+ category_stats[category] = {
262
+ "average_latency": 0.0,
263
+ "query_count": 0,
264
+ "target_compliance": 0.0
265
+ }
266
+
267
+ # Overall statistics
268
+ overall_stats = {
269
+ "average_latency": sum(latencies) / len(latencies),
270
+ "min_latency": min(latencies),
271
+ "max_latency": max(latencies),
272
+ "successful_queries": len(successful_results),
273
+ "total_queries": len(self.direct_results),
274
+ "success_rate": len(successful_results) / len(self.direct_results),
275
+ "target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies)
276
+ }
277
+ else:
278
+ category_stats = {cat: {"average_latency": 0.0, "query_count": 0, "target_compliance": 0.0}
279
+ for cat in ["diagnosis", "treatment", "mixed"]}
280
+ overall_stats = {
281
+ "average_latency": 0.0,
282
+ "successful_queries": 0,
283
+ "total_queries": len(self.direct_results),
284
+ "success_rate": 0.0,
285
+ "target_compliance": 0.0
286
+ }
287
+
288
+ return {
289
+ "category_results": category_stats,
290
+ "overall_results": overall_stats,
291
+ "model_type": "Med42-70B_direct",
292
+ "timestamp": datetime.now().isoformat()
293
+ }
294
+
295
+ def save_direct_llm_statistics(self, filename: str = None) -> str:
296
+ """Save direct LLM statistics"""
297
+ stats = self.calculate_direct_llm_statistics()
298
+
299
+ if filename is None:
300
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
301
+ filename = f"direct_llm_statistics_{timestamp}.json"
302
+
303
+ results_dir = Path(__file__).parent / "results"
304
+ results_dir.mkdir(exist_ok=True)
305
+ filepath = results_dir / filename
306
+
307
+ with open(filepath, 'w', encoding='utf-8') as f:
308
+ json.dump(stats, f, indent=2, ensure_ascii=False)
309
+
310
+ print(f"📊 Direct LLM statistics saved to: {filepath}")
311
+ return str(filepath)
312
+
313
+ def save_direct_medical_outputs(self, filename: str = None) -> str:
314
+ """Save medical outputs for LLM judge evaluation"""
315
+ if filename is None:
316
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
317
+ filename = f"medical_outputs_direct_{timestamp}.json"
318
+
319
+ results_dir = Path(__file__).parent / "results"
320
+ results_dir.mkdir(exist_ok=True)
321
+ filepath = results_dir / filename
322
+
323
+ output_data = {
324
+ "evaluation_metadata": {
325
+ "total_outputs": len(self.medical_outputs),
326
+ "categories": list(set(output['category'] for output in self.medical_outputs)),
327
+ "timestamp": datetime.now().isoformat(),
328
+ "model_type": "Med42-70B_direct"
329
+ },
330
+ "medical_outputs": self.medical_outputs
331
+ }
332
+
333
+ with open(filepath, 'w', encoding='utf-8') as f:
334
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
335
+
336
+ print(f"📝 Direct medical outputs saved to: {filepath}")
337
+ return str(filepath)
338
+
339
+
340
+ # Independent execution interface
341
+ if __name__ == "__main__":
342
+ """Independent direct LLM evaluation interface"""
343
+
344
+ print("🚀 OnCall.ai Direct LLM Evaluator - Med42-70B Only")
345
+
346
+ if len(sys.argv) > 1:
347
+ query_file = sys.argv[1]
348
+ else:
349
+ # Default to evaluation/single_test_query.txt for consistency
350
+ query_file = Path(__file__).parent / "single_test_query.txt"
351
+
352
+ if not os.path.exists(query_file):
353
+ print(f"❌ Query file not found: {query_file}")
354
+ print("Usage: python direct_llm_evaluator.py [query_file.txt]")
355
+ sys.exit(1)
356
+
357
+ # Initialize evaluator
358
+ evaluator = DirectLLMEvaluator()
359
+
360
+ # Parse queries
361
+ queries_by_category = evaluator.parse_queries_from_file(str(query_file))
362
+
363
+ if "error" in queries_by_category:
364
+ print(f"❌ Failed to parse queries: {queries_by_category['error']}")
365
+ sys.exit(1)
366
+
367
+ # Test direct LLM for each query
368
+ print(f"\n🧪 Direct LLM Testing (No RAG Pipeline)")
369
+
370
+ for category, queries in queries_by_category.items():
371
+ if not queries:
372
+ continue
373
+
374
+ print(f"\n📂 Testing {category.upper()} with direct Med42-70B:")
375
+
376
+ for i, query_info in enumerate(queries):
377
+ query_text = query_info['text']
378
+
379
+ # Direct LLM evaluation
380
+ result = evaluator.evaluate_direct_llm_query(query_text, category)
381
+
382
+ # Pause between queries
383
+ if i < len(queries) - 1:
384
+ print(f" ⏳ Pausing 5s before next query...")
385
+ time.sleep(5)
386
+
387
+ # Pause between categories
388
+ if category != list(queries_by_category.keys())[-1]:
389
+ print(f"\n⏳ Pausing 10s before next category...")
390
+ time.sleep(10)
391
+
392
+ # Save results
393
+ print(f"\n📊 Generating direct LLM analysis...")
394
+
395
+ stats_path = evaluator.save_direct_llm_statistics()
396
+ outputs_path = evaluator.save_direct_medical_outputs()
397
+
398
+ # Print summary
399
+ stats = evaluator.calculate_direct_llm_statistics()
400
+ overall_results = stats['overall_results']
401
+
402
+ print(f"\n📊 === DIRECT LLM EVALUATION SUMMARY ===")
403
+ print(f"Overall Performance:")
404
+ print(f" Average Latency: {overall_results['average_latency']:.2f}s")
405
+ print(f" Success Rate: {overall_results['successful_queries']}/{overall_results['total_queries']}")
406
+ print(f" 60s Target Compliance: {overall_results['target_compliance']:.1%}")
407
+
408
+ print(f"\nApplicable Metrics:")
409
+ print(f" ✅ Metric 1 (Latency): Measured")
410
+ print(f" ❌ Metric 2 (Extraction): Not applicable - no extraction pipeline")
411
+ print(f" ❌ Metric 3 (Relevance): Not applicable - no retrieval pipeline")
412
+ print(f" ❌ Metric 4 (Coverage): Not applicable - no retrieval content")
413
+ print(f" 🔄 Metric 5 (Actionability): Requires LLM judge evaluation")
414
+ print(f" 🔄 Metric 6 (Evidence): Requires LLM judge evaluation")
415
+
416
+ print(f"\n✅ Direct LLM evaluation complete!")
417
+ print(f"📊 Statistics: {stats_path}")
418
+ print(f"📝 Medical Outputs: {outputs_path}")
419
+ print(f"\n💡 Next step: Run python metric5_6_llm_judge_evaluator.py rag,direct for metrics 5-6")
evaluation/latency_evaluator.py ADDED
@@ -0,0 +1,892 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Comprehensive Evaluator (Metrics 1-8)
4
+ ========================================================
5
+
6
+ Single execution to collect all metrics 1-4 data from app.py pipeline.
7
+ Generates foundation data for metrics 5-8 evaluation in downstream processors.
8
+
9
+ COMPLETE METRICS OVERVIEW:
10
+
11
+ PIPELINE PERFORMANCE METRICS (Collected by this evaluator):
12
+ 1. Total Latency - Complete pipeline processing time from query to response
13
+ 2. Condition Extraction Success Rate - Success rate of user_prompt.py condition extraction
14
+ 3. Retrieval Relevance - Average cosine similarity scores from retrieval.py results
15
+ 4. Retrieval Coverage - Medical keyword utilization rate between retrieved content and generated advice
16
+
17
+ LLM JUDGE METRICS (Processed by metric5_6_llm_judge_evaluator.py):
18
+ 5. Clinical Actionability - Third-party LLM evaluation of medical advice actionability (1-10 scale)
19
+ * Uses batch evaluation strategy with Llama3-70B as judge
20
+ * Measures: Can healthcare providers immediately act on this advice?
21
+ * Target threshold: ≥7.0/10 for acceptable actionability
22
+
23
+ 6. Clinical Evidence Quality - Third-party LLM evaluation of evidence-based quality (1-10 scale)
24
+ * Uses same batch evaluation call as metric 5 for efficiency
25
+ * Measures: Is the advice evidence-based and follows medical standards?
26
+ * Target threshold: ≥7.5/10 for acceptable evidence quality
27
+
28
+ RETRIEVAL PRECISION METRICS (Processed by metric7_8_precision_MRR.py):
29
+ 7. Precision@K - Proportion of relevant results in top-K retrieval results
30
+ * Uses adaptive threshold based on query complexity (0.15 for complex, 0.25 for simple queries)
31
+ * Query complexity determined by unique emergency keywords count (≥4 = complex)
32
+ * Measures: relevant_results / total_retrieved_results
33
+
34
+ 8. Mean Reciprocal Rank - Average reciprocal rank of first relevant result
35
+ * Uses same adaptive threshold as Precision@K
36
+ * Measures: 1 / rank_of_first_relevant_result (0 if no relevant results)
37
+ * Higher MRR indicates relevant results appear earlier in ranking
38
+
39
+ DATA FLOW ARCHITECTURE:
40
+ 1. latency_evaluator.py → comprehensive_details_*.json (metrics 1-4 + pipeline data)
41
+ 2. latency_evaluator.py → medical_outputs_*.json (medical advice for judge evaluation)
42
+ 3. metric5_6_llm_judge_evaluator.py → judge_evaluation_*.json (metrics 5-6)
43
+ 4. metric7_8_precision_MRR.py → precision_mrr_analysis_*.json (metrics 7-8)
44
+
45
+ Note: This evaluator focuses on metrics 1-4 collection. Metrics 5-8 require separate downstream evaluation.
46
+
47
+ Author: YanBo Chen
48
+ Date: 2025-08-04
49
+ """
50
+
51
+ import time
52
+ import json
53
+ import os
54
+ import sys
55
+ from typing import Dict, List, Any, Set
56
+ from datetime import datetime
57
+ from pathlib import Path
58
+ import re
59
+
60
+ # Add project path
61
+ current_dir = Path(__file__).parent
62
+ project_root = current_dir.parent
63
+ src_dir = project_root / "src"
64
+ sys.path.insert(0, str(src_dir))
65
+
66
+ # Import existing system components
67
+ try:
68
+ from user_prompt import UserPromptProcessor
69
+ from retrieval import BasicRetrievalSystem
70
+ from llm_clients import llm_Med42_70BClient
71
+ from generation import MedicalAdviceGenerator
72
+ except ImportError as e:
73
+ print(f"❌ Import failed: {e}")
74
+ print("Please ensure running from project root directory")
75
+ sys.exit(1)
76
+
77
+
78
+ class ComprehensiveEvaluator:
79
+ """Comprehensive evaluator for metrics 1-4 - single execution approach"""
80
+
81
+ def __init__(self):
82
+ """Initialize system components (identical to app.py)"""
83
+ print("🔧 Initializing Comprehensive Evaluator...")
84
+
85
+ # Initialize existing system components (same as app.py)
86
+ self.llm_client = llm_Med42_70BClient()
87
+ self.retrieval_system = BasicRetrievalSystem()
88
+ self.user_prompt_processor = UserPromptProcessor(
89
+ llm_client=self.llm_client,
90
+ retrieval_system=self.retrieval_system
91
+ )
92
+ self.medical_generator = MedicalAdviceGenerator(llm_client=self.llm_client)
93
+
94
+ # Results accumulation for all metrics
95
+ self.comprehensive_results = []
96
+ self.medical_outputs = []
97
+
98
+ print("✅ Comprehensive Evaluator initialization complete")
99
+
100
+ def extract_medical_keywords(self, text: str) -> Set[str]:
101
+ """Extract medical keywords for coverage analysis"""
102
+ if not text:
103
+ return set()
104
+
105
+ medical_keywords = set()
106
+ text_lower = text.lower()
107
+
108
+ # Medical terminology patterns
109
+ patterns = [
110
+ r'\b[a-z]+(?:osis|itis|pathy|emia|uria|gram|scopy)\b', # Medical suffixes
111
+ r'\b(?:cardio|neuro|pulmo|gastro|hepato|nephro)[a-z]+\b', # Medical prefixes
112
+ r'\b(?:diagnosis|treatment|therapy|intervention|management)\b', # Medical actions
113
+ r'\b(?:patient|symptom|condition|disease|disorder|syndrome)\b', # Medical entities
114
+ r'\b(?:acute|chronic|severe|mild|moderate|emergency)\b', # Medical descriptors
115
+ r'\b[a-z]+(?:al|ic|ous|ive)\s+(?:pain|failure|infection|injury)\b', # Compound terms
116
+ r'\b(?:ecg|ekg|ct|mri|x-ray|ultrasound|biopsy)\b', # Medical procedures
117
+ r'\b\d+\s*(?:mg|ml|units|hours|days|minutes)\b', # Dosages and timeframes
118
+ ]
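+ # Illustrative example (not from a real run): "acute cardiomyopathy treatment with 500 mg"
+ # would yield keywords such as {"cardiomyopathy", "acute", "treatment", "500 mg"};
+ # exact matches depend on the regexes above.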
119
+
120
+ for pattern in patterns:
121
+ matches = re.findall(pattern, text_lower)
122
+ medical_keywords.update(match.strip() for match in matches)
123
+
124
+ # Additional common medical terms
125
+ common_medical_terms = [
126
+ 'blood', 'pressure', 'heart', 'chest', 'pain', 'stroke', 'seizure',
127
+ 'emergency', 'hospital', 'monitor', 'assess', 'evaluate', 'immediate',
128
+ 'protocol', 'guideline', 'recommendation', 'risk', 'factor'
129
+ ]
130
+
131
+ for term in common_medical_terms:
132
+ if term in text_lower:
133
+ medical_keywords.add(term)
134
+
135
+ # Filter out very short terms and common words
136
+ filtered_keywords = {
137
+ kw for kw in medical_keywords
138
+ if len(kw) > 2 and kw not in ['the', 'and', 'for', 'with', 'are', 'can', 'may']
139
+ }
140
+
141
+ return filtered_keywords
142
+
143
+ def calculate_coverage_metrics(self, generated_advice: str, retrieval_results: List[Dict]) -> Dict[str, Any]:
144
+ """Calculate coverage metrics from generated advice and retrieval results"""
145
+ if not generated_advice or not retrieval_results:
146
+ return {
147
+ "coverage_score": 0.0,
148
+ "matched_keywords": [],
149
+ "advice_keywords": [],
150
+ "source_keywords": [],
151
+ "coverage_percentage": 0.0,
152
+ "meets_threshold": False
153
+ }
154
+
155
+ # Extract keywords from generated advice
156
+ advice_keywords = self.extract_medical_keywords(generated_advice)
157
+
158
+ # Extract keywords from all retrieved documents
159
+ all_source_keywords = set()
160
+ for doc in retrieval_results:
161
+ doc_content = doc.get('content', '') or doc.get('text', '')
162
+ doc_keywords = self.extract_medical_keywords(doc_content)
163
+ all_source_keywords.update(doc_keywords)
164
+
165
+ # Calculate coverage
166
+ matched_keywords = advice_keywords.intersection(all_source_keywords)
167
+ coverage_score = len(matched_keywords) / len(all_source_keywords) if all_source_keywords else 0.0
168
+
169
+ return {
170
+ "coverage_score": coverage_score,
171
+ "matched_keywords": list(matched_keywords),
172
+ "advice_keywords": list(advice_keywords),
173
+ "source_keywords": list(all_source_keywords),
174
+ "advice_keywords_count": len(advice_keywords),
175
+ "source_keywords_count": len(all_source_keywords),
176
+ "matched_keywords_count": len(matched_keywords),
177
+ "coverage_percentage": coverage_score * 100,
178
+ "meets_threshold": coverage_score >= 0.4
179
+ }
180
+
181
+ def evaluate_single_query_comprehensive(self, query: str, category: str = "unknown") -> Dict[str, Any]:
182
+ """
183
+ Comprehensive evaluation for single query - collects all metrics 1-4 data
184
+
185
+ Replicates app.py's process_medical_query pipeline exactly
186
+
187
+ Args:
188
+ query: Medical query to test
189
+ category: Query category (diagnosis/treatment/mixed)
190
+ """
191
+ print(f"🔍 Comprehensive evaluation: {query[:50]}...")
192
+ print(f"📋 Category: {category}")
193
+
194
+ overall_start = time.time()
195
+ timing_details = {}
196
+
197
+ try:
198
+ # STEP 1: Query Processing and Condition Extraction (identical to app.py)
199
+ step1_start = time.time()
200
+ condition_result = self.user_prompt_processor.extract_condition_keywords(query)
201
+ step1_time = time.time() - step1_start
202
+ timing_details['step1_condition_extraction'] = step1_time
203
+
204
+ print(f" Step 1 - Condition extraction: {step1_time:.3f}s")
205
+ print(f" Extracted condition: {condition_result.get('condition', 'None')}")
206
+
207
+ # Check if valid medical query
208
+ if condition_result.get('query_status') in ['invalid_query', 'non_medical']:
209
+ total_time = time.time() - overall_start
210
+ return self._create_failed_result(query, category, total_time, timing_details,
211
+ "non_medical", condition_result)
212
+
213
+ # STEP 2: User Confirmation (simulate auto-confirmation)
214
+ step2_start = time.time()
215
+ confirmation = self.user_prompt_processor.handle_user_confirmation(condition_result)
216
+ step2_time = time.time() - step2_start
217
+ timing_details['step2_confirmation'] = step2_time
218
+
219
+ if not condition_result.get('condition'):
220
+ total_time = time.time() - overall_start
221
+ return self._create_failed_result(query, category, total_time, timing_details,
222
+ "no_condition", condition_result)
223
+
224
+ # STEP 3: Medical Guidelines Retrieval (identical to app.py)
225
+ step3_start = time.time()
226
+
227
+ search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
228
+ if not search_query:
229
+ search_query = condition_result.get('condition', query)
230
+
231
+ retrieval_results = self.retrieval_system.search(search_query, top_k=5)
232
+ step3_time = time.time() - step3_start
233
+ timing_details['step3_retrieval'] = step3_time
234
+
235
+ processed_results = retrieval_results.get('processed_results', [])
236
+ print(f" Step 3 - Retrieval: {step3_time:.3f}s ({len(processed_results)} results)")
237
+
238
+ # STEP 4: Medical Advice Generation (identical to app.py)
239
+ step4_start = time.time()
240
+
241
+ intention = self._detect_query_intention(query)
242
+ medical_advice_result = self.medical_generator.generate_medical_advice(
243
+ user_query=query,
244
+ retrieval_results=retrieval_results,
245
+ intention=intention
246
+ )
247
+ step4_time = time.time() - step4_start
248
+ timing_details['step4_generation'] = step4_time
249
+
250
+ generated_advice = medical_advice_result.get('medical_advice', '')
251
+ confidence_score = medical_advice_result.get('confidence_score', 0.0)
252
+
253
+ print(f" Step 4 - Generation: {step4_time:.3f}s")
254
+
255
+ total_time = time.time() - overall_start
256
+
257
+ # METRIC 2: Condition Extraction Analysis
258
+ extraction_success = (
259
+ condition_result.get('condition') and
260
+ condition_result.get('condition') != "unknown" and
261
+ condition_result.get('query_status') not in ['invalid_query', 'non_medical']
262
+ )
263
+
264
+ extraction_metrics = {
265
+ "extraction_success": extraction_success,
266
+ "extracted_condition": condition_result.get('condition'),
267
+ "query_status": condition_result.get('query_status'),
268
+ "emergency_keywords": condition_result.get('emergency_keywords', []),
269
+ "treatment_keywords": condition_result.get('treatment_keywords', []),
270
+ "fallback_level": condition_result.get('fallback_level', 'unknown'),
271
+ "extraction_time": step1_time
272
+ }
273
+
274
+ # METRIC 3: Retrieval Relevance Analysis
275
+ if processed_results:
276
+ relevance_scores = []
277
+ for doc_result in processed_results:
278
+ # Get angular distance and convert to relevance using correct formula
279
+ distance = doc_result.get('distance', 1.0)
280
+ relevance = 1.0 - (distance**2) / 2.0 # Correct mathematical conversion
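+ # (This equals cosine similarity when 'distance' is the Euclidean distance between
+ #  L2-normalized embeddings: cos = 1 - d**2 / 2. That reading of 'distance' is an
+ #  assumption here, not verified against retrieval.py.)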
281
+ relevance_scores.append(relevance)
282
+
283
+ average_relevance = sum(relevance_scores) / len(relevance_scores)
284
+ high_relevance_count = sum(1 for score in relevance_scores if score >= 0.85)
285
+
286
+ relevance_metrics = {
287
+ "average_relevance": average_relevance,
288
+ "max_relevance": max(relevance_scores),
289
+ "min_relevance": min(relevance_scores),
290
+ "relevance_scores": relevance_scores,
291
+ "high_relevance_count": high_relevance_count,
292
+ "high_relevance_ratio": high_relevance_count / len(relevance_scores),
293
+ "retrieved_count": len(processed_results),
294
+ "meets_threshold": average_relevance >= 0.85,
295
+ "retrieval_time": step3_time
296
+ }
297
+ else:
298
+ relevance_metrics = {
299
+ "average_relevance": 0.0,
300
+ "max_relevance": 0.0,
301
+ "min_relevance": 0.0,
302
+ "similarity_scores": [],
303
+ "high_relevance_count": 0,
304
+ "high_relevance_ratio": 0.0,
305
+ "retrieved_count": 0,
306
+ "meets_threshold": False,
307
+ "retrieval_time": step3_time
308
+ }
309
+
310
+ # METRIC 4: Retrieval Coverage Analysis
311
+ coverage_metrics = self.calculate_coverage_metrics(generated_advice, processed_results)
312
+ coverage_metrics["generation_time"] = step4_time
313
+
314
+ # Create comprehensive result
315
+ comprehensive_result = {
316
+ "query": query,
317
+ "category": category,
318
+
319
+ # Metric 1: Total Latency - Complete pipeline processing time
320
+ "latency_metrics": {
321
+ "total_latency": total_time,
322
+ "timing_details": timing_details,
323
+ "meets_target": total_time <= 60.0
324
+ },
325
+
326
+ # Metric 2: Condition Extraction - Success rate from user_prompt.py
327
+ "extraction_metrics": extraction_metrics,
328
+
329
+ # Metric 3: Retrieval Relevance - Cosine similarity from retrieval.py
330
+ "relevance_metrics": relevance_metrics,
331
+
332
+ # Metric 4: Retrieval Coverage - Advice utilization of retrieved content
333
+ "coverage_metrics": coverage_metrics,
334
+
335
+ # Complete pipeline data (for debugging and detailed analysis)
336
+ "pipeline_data": {
337
+ "condition_result": condition_result,
338
+ "retrieval_results": retrieval_results,
339
+ "medical_advice_result": medical_advice_result,
340
+ "search_query": search_query,
341
+ "intention": intention
342
+ },
343
+
344
+ "overall_success": True,
345
+ "timestamp": datetime.now().isoformat()
346
+ }
347
+
348
+ # Validate data completeness for metrics 7-8 analysis
349
+ ready = True
350
+ data = comprehensive_result.get('pipeline_data', {})
351
+
352
+ # 1. Check retrieval results completeness for precision/MRR calculation
353
+ retr = data.get('retrieval_results', {}).get('processed_results', [])
354
+ if not retr or 'distance' not in retr[0]:
355
+ ready = False
356
+
357
+ # 2. Check condition extraction completeness for complexity analysis
358
+ cond = data.get('condition_result', {}).get('condition')
359
+ if not cond:
360
+ ready = False
361
+
362
+ # 3. Check overall execution status
363
+ if not comprehensive_result.get('overall_success', False):
364
+ ready = False
365
+
366
+ # 4. Check retrieval timing data completeness
367
+ if 'retrieval_time' not in comprehensive_result.get('relevance_metrics', {}):
368
+ ready = False
369
+
370
+ # Set metrics 7-8 readiness flag for downstream precision/MRR analysis
371
+ comprehensive_result['precision_mrr_ready'] = ready
372
+
373
+ # Store result
374
+ self.comprehensive_results.append(comprehensive_result)
375
+
376
+ # Store medical output for model comparison
377
+ medical_output = {
378
+ "query": query,
379
+ "category": category,
380
+ "medical_advice": generated_advice,
381
+ "confidence_score": confidence_score,
382
+ "query_id": f"{category}_query",
383
+ "processing_time": total_time,
384
+ "timestamp": datetime.now().isoformat()
385
+ }
386
+ self.medical_outputs.append(medical_output)
387
+
388
+ print(f"✅ Comprehensive evaluation completed in {total_time:.2f}s")
389
+ print(f" 📊 Metrics: Latency={total_time:.2f}s, Extraction={'✅' if extraction_success else '❌'}, "
390
+ f"Relevance={average_relevance:.3f}, Coverage={coverage_metrics['coverage_score']:.3f}")
391
+
392
+ return comprehensive_result
393
+
394
+ except Exception as e:
395
+ total_time = time.time() - overall_start
396
+ print(f"❌ Comprehensive evaluation failed after {total_time:.2f}s: {e}")
397
+
398
+ return self._create_failed_result(query, category, total_time, timing_details, "error", None, str(e))
399
+
400
+ def _create_failed_result(self, query: str, category: str, total_time: float,
401
+ timing_details: Dict, status: str, condition_result: Dict = None,
402
+ error: str = None) -> Dict[str, Any]:
403
+ """Create standardized failed result"""
404
+ failed_result = {
405
+ "query": query,
406
+ "category": category,
407
+
408
+ # Metric 1: Total Latency - Always measurable even on failure
409
+ "latency_metrics": {
410
+ "total_latency": total_time,
411
+ "timing_details": timing_details,
412
+ "meets_target": total_time <= 60.0
413
+ },
414
+
415
+ # Metric 2: Condition Extraction - Partial data may be available before failure
416
+ "extraction_metrics": {
417
+ "extraction_success": False,
418
+ "extracted_condition": condition_result.get('condition') if condition_result else None,
419
+ "query_status": condition_result.get('query_status') if condition_result else status,
420
+ "extraction_time": timing_details.get('step1_condition_extraction', 0.0)
421
+ },
422
+
423
+ # Metric 3: Retrieval Relevance - Failed due to pipeline failure
424
+ "relevance_metrics": {
425
+ "average_relevance": 0.0,
426
+ "retrieved_count": 0,
427
+ "meets_threshold": False,
428
+ "retrieval_time": timing_details.get('step3_retrieval', 0.0)
429
+ },
430
+
431
+ # Metric 4: Retrieval Coverage - Failed due to pipeline failure
432
+ "coverage_metrics": {
433
+ "coverage_score": 0.0,
434
+ "meets_threshold": False,
435
+ "generation_time": timing_details.get('step4_generation', 0.0)
436
+ },
437
+
438
+ # Note: Metrics 5-6 (Clinical Actionability & Evidence Quality)
439
+ # are collected by metric5_6_llm_judge_evaluator.py using medical_outputs
440
+ # Metrics 7-8 (Precision@K & MRR) are collected by metric7_8_precision_MRR.py
441
+ # using comprehensive_details pipeline data
442
+
443
+ "overall_success": False,
444
+ "status": status,
445
+ "error": error,
446
+ "timestamp": datetime.now().isoformat()
447
+ }
448
+
449
+ # For failed results, precision/MRR analysis data is not ready
450
+ failed_result['precision_mrr_ready'] = False
451
+
452
+ self.comprehensive_results.append(failed_result)
453
+ return failed_result
454
+
455
+ def _detect_query_intention(self, query: str) -> str:
456
+ """Simplified query intention detection (from app.py)"""
457
+ query_lower = query.lower()
458
+
459
+ if any(word in query_lower for word in ['diagnos', 'differential', 'possible', 'causes']):
460
+ return 'diagnosis'
461
+ elif any(word in query_lower for word in ['treat', 'manage', 'therapy', 'intervention']):
462
+ return 'treatment'
463
+ else:
464
+ return 'mixed'
465
+ def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
466
+ """Parse queries from file with category labels"""
467
+ print(f"📁 Reading queries from file: {filepath}")
468
+
469
+ try:
470
+ with open(filepath, 'r', encoding='utf-8') as f:
471
+ content = f.read()
472
+
473
+ # Parse queries with category labels
474
+ queries_by_category = {
475
+ "diagnosis": [],
476
+ "treatment": [],
477
+ "mixed": []
478
+ }
479
+
480
+ lines = content.strip().split('\n')
481
+
482
+ for line in lines:
483
+ line = line.strip()
484
+ if not line:
485
+ continue
486
+
487
+ # Parse format: "1.diagnosis: query text"
488
+ match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
489
+ if match:
490
+ category_raw = match.group(1).lower()
491
+ query_text = match.group(2).strip()
492
+
493
+ # Normalize category name
494
+ if category_raw in ['mixed/complicated', 'mixed']:
495
+ category = 'mixed'
496
+ else:
497
+ category = category_raw
498
+
499
+ if category in queries_by_category and len(query_text) > 15:
500
+ queries_by_category[category].append({
501
+ "text": query_text,
502
+ "category": category
503
+ })
504
+
505
+ print(f"📋 Parsed queries by category:")
506
+ for category, category_queries in queries_by_category.items():
507
+ print(f" {category.capitalize()}: {len(category_queries)} queries")
508
+
509
+ return queries_by_category
510
+
511
+ except Exception as e:
512
+ print(f"❌ Failed to read file: {e}")
513
+ return {"error": f"Failed to read file: {e}"}
514
+
515
+ def calculate_metric_statistics(self, metric_name: str) -> Dict[str, Any]:
516
+ """Calculate statistics for a specific metric across all results"""
517
+ category_stats = {}
518
+ all_successful_results = []
519
+
520
+ # Group results by category
521
+ results_by_category = {
522
+ "diagnosis": [],
523
+ "treatment": [],
524
+ "mixed": []
525
+ }
526
+
527
+ for result in self.comprehensive_results:
528
+ category = result.get('category', 'unknown')
529
+ if category in results_by_category:
530
+ results_by_category[category].append(result)
531
+ if result.get('overall_success'):
532
+ all_successful_results.append(result)
533
+
534
+ # Calculate statistics for each category based on metric type
535
+ for category, results in results_by_category.items():
536
+ successful_results = [r for r in results if r.get('overall_success')]
537
+
538
+ if metric_name == "latency":
539
+ if successful_results:
540
+ latencies = [r['latency_metrics']['total_latency'] for r in successful_results]
541
+ category_stats[category] = {
542
+ "average_latency": sum(latencies) / len(latencies),
543
+ "std_deviation": self._calculate_std(latencies),
544
+ "min_latency": min(latencies),
545
+ "max_latency": max(latencies),
546
+ "query_count": len(latencies),
547
+ "target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies),
548
+ "individual_latencies": latencies
549
+ }
550
+ else:
551
+ category_stats[category] = self._get_empty_latency_stats()
552
+
553
+ elif metric_name == "extraction":
554
+ extraction_successes = [r['extraction_metrics']['extraction_success'] for r in results]
555
+ successful_extractions = sum(extraction_successes)
556
+
557
+ category_stats[category] = {
558
+ "success_rate": successful_extractions / len(results) if results else 0.0,
559
+ "successful_count": successful_extractions,
560
+ "total_count": len(results),
561
+ "average_extraction_time": sum(r['extraction_metrics']['extraction_time'] for r in results) / len(results) if results else 0.0,
562
+ "meets_threshold": (successful_extractions / len(results)) >= 0.8 if results else False
563
+ }
564
+
565
+ elif metric_name == "relevance":
566
+ if successful_results:
567
+ relevance_scores = [r['relevance_metrics']['average_relevance'] for r in successful_results]
568
+ category_stats[category] = {
569
+ "average_relevance": sum(relevance_scores) / len(relevance_scores),
570
+ "max_relevance": max(relevance_scores),
571
+ "min_relevance": min(relevance_scores),
572
+ "successful_retrievals": len(successful_results),
573
+ "total_queries": len(results),
574
+ "meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.85,
575
+ "individual_relevance_scores": relevance_scores
576
+ }
577
+ else:
578
+ category_stats[category] = self._get_empty_relevance_stats(len(results))
579
+
580
+ elif metric_name == "coverage":
581
+ if successful_results:
582
+ coverage_scores = [r['coverage_metrics']['coverage_score'] for r in successful_results]
583
+ category_stats[category] = {
584
+ "average_coverage": sum(coverage_scores) / len(coverage_scores),
585
+ "max_coverage": max(coverage_scores),
586
+ "min_coverage": min(coverage_scores),
587
+ "successful_evaluations": len(successful_results),
588
+ "total_queries": len(results),
589
+ "meets_threshold": (sum(coverage_scores) / len(coverage_scores)) >= 0.4,
590
+ "individual_coverage_scores": coverage_scores
591
+ }
592
+ else:
593
+ category_stats[category] = self._get_empty_coverage_stats(len(results))
594
+
595
+ # Calculate overall statistics
596
+ overall_stats = self._calculate_overall_stats(metric_name, all_successful_results)
597
+
598
+ return {
599
+ "category_results": category_stats,
600
+ "overall_results": overall_stats,
601
+ "timestamp": datetime.now().isoformat()
602
+ }
603
+
604
+ def _calculate_std(self, values: List[float]) -> float:
605
+ """Calculate standard deviation"""
606
+ if len(values) < 2:
607
+ return 0.0
608
+
609
+ mean = sum(values) / len(values)
610
+ variance = sum((x - mean) ** 2 for x in values) / len(values)
611
+ return variance ** 0.5
612
+
613
+ def _get_empty_latency_stats(self) -> Dict[str, Any]:
614
+ """Return empty latency statistics"""
615
+ return {
616
+ "average_latency": 0.0,
617
+ "std_deviation": 0.0,
618
+ "min_latency": 0.0,
619
+ "max_latency": 0.0,
620
+ "query_count": 0,
621
+ "target_compliance": 0.0,
622
+ "individual_latencies": []
623
+ }
624
+
625
+ def _get_empty_relevance_stats(self, total_queries: int) -> Dict[str, Any]:
626
+ """Return empty relevance statistics"""
627
+ return {
628
+ "average_relevance": 0.0,
629
+ "max_relevance": 0.0,
630
+ "min_relevance": 0.0,
631
+ "successful_retrievals": 0,
632
+ "total_queries": total_queries,
633
+ "meets_threshold": False,
634
+ "individual_relevance_scores": []
635
+ }
636
+
637
+ def _get_empty_coverage_stats(self, total_queries: int) -> Dict[str, Any]:
638
+ """Return empty coverage statistics"""
639
+ return {
640
+ "average_coverage": 0.0,
641
+ "max_coverage": 0.0,
642
+ "min_coverage": 0.0,
643
+ "successful_evaluations": 0,
644
+ "total_queries": total_queries,
645
+ "meets_threshold": False,
646
+ "individual_coverage_scores": []
647
+ }
648
+
649
+ def _calculate_overall_stats(self, metric_name: str, all_successful_results: List[Dict]) -> Dict[str, Any]:
650
+ """Calculate overall statistics for a specific metric"""
651
+ total_queries = len(self.comprehensive_results)
652
+
653
+ if metric_name == "latency" and all_successful_results:
654
+ latencies = [r['latency_metrics']['total_latency'] for r in all_successful_results]
655
+ return {
656
+ "average_latency": sum(latencies) / len(latencies),
657
+ "std_deviation": self._calculate_std(latencies),
658
+ "min_latency": min(latencies),
659
+ "max_latency": max(latencies),
660
+ "successful_queries": len(all_successful_results),
661
+ "total_queries": total_queries,
662
+ "target_compliance": sum(1 for lat in latencies if lat <= 60.0) / len(latencies)
663
+ }
664
+
665
+ elif metric_name == "extraction":
666
+ all_extractions = [r['extraction_metrics']['extraction_success'] for r in self.comprehensive_results]
667
+ successful_extractions = sum(all_extractions)
668
+ return {
669
+ "success_rate": successful_extractions / len(all_extractions) if all_extractions else 0.0,
670
+ "successful_count": successful_extractions,
671
+ "total_count": len(all_extractions),
672
+ "target_compliance": (successful_extractions / len(all_extractions)) >= 0.8 if all_extractions else False
673
+ }
674
+
675
+ elif metric_name == "relevance" and all_successful_results:
676
+ relevance_scores = [r['relevance_metrics']['average_relevance'] for r in all_successful_results]
677
+ return {
678
+ "average_relevance": sum(relevance_scores) / len(relevance_scores),
679
+ "max_relevance": max(relevance_scores),
680
+ "min_relevance": min(relevance_scores),
681
+ "successful_queries": len(all_successful_results),
682
+ "total_queries": total_queries,
683
+ "meets_threshold": (sum(relevance_scores) / len(relevance_scores)) >= 0.85,
684
+ "target_compliance": (sum(relevance_scores) / len(relevance_scores)) >= 0.7
685
+ }
686
+
687
+ elif metric_name == "coverage" and all_successful_results:
688
+ coverage_scores = [r['coverage_metrics']['coverage_score'] for r in all_successful_results]
689
+ return {
690
+ "average_coverage": sum(coverage_scores) / len(coverage_scores),
691
+ "max_coverage": max(coverage_scores),
692
+ "min_coverage": min(coverage_scores),
693
+ "successful_queries": len(all_successful_results),
694
+ "total_queries": total_queries,
695
+ "meets_threshold": (sum(coverage_scores) / len(coverage_scores)) >= 0.4
696
+ }
697
+
698
+ # Return empty stats for failed cases
699
+ return {
700
+ "average_value": 0.0,
701
+ "successful_queries": len(all_successful_results),
702
+ "total_queries": total_queries,
703
+ "meets_threshold": False
704
+ }
705
+ def save_all_metric_statistics(self) -> Dict[str, str]:
706
+ """Save separate statistics files for each metric"""
707
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
708
+
709
+ # Ensure results directory exists
710
+ results_dir = Path(__file__).parent / "results"
711
+ results_dir.mkdir(exist_ok=True)
712
+
713
+ saved_files = {}
714
+
715
+ # Save statistics for each metric
716
+ for metric_name in ["latency", "extraction", "relevance", "coverage"]:
717
+ stats = self.calculate_metric_statistics(metric_name)
718
+ filename = f"{metric_name}_statistics_{timestamp}.json"
719
+ filepath = results_dir / filename
720
+
721
+ with open(filepath, 'w', encoding='utf-8') as f:
722
+ json.dump(stats, f, indent=2, ensure_ascii=False)
723
+
724
+ saved_files[metric_name] = str(filepath)
725
+ print(f"📊 {metric_name.capitalize()} statistics saved to: {filepath}")
726
+
727
+ return saved_files
728
+
729
+ def save_medical_outputs(self, filename: str = None) -> str:
730
+ """Save medical advice outputs for model comparison"""
731
+ if filename is None:
732
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
733
+ filename = f"medical_outputs_{timestamp}.json"
734
+
735
+ # Ensure results directory exists
736
+ results_dir = Path(__file__).parent / "results"
737
+ results_dir.mkdir(exist_ok=True)
738
+
739
+ filepath = results_dir / filename
740
+
741
+ # Create comprehensive output data
742
+ output_data = {
743
+ "evaluation_metadata": {
744
+ "total_outputs": len(self.medical_outputs),
745
+ "categories": list(set(output['category'] for output in self.medical_outputs)),
746
+ "timestamp": datetime.now().isoformat(),
747
+ "model_type": "Med42-70B_RAG_enhanced" # For future comparison
748
+ },
749
+ "medical_outputs": self.medical_outputs
750
+ }
751
+
752
+ with open(filepath, 'w', encoding='utf-8') as f:
753
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
754
+
755
+ print(f"📝 Medical outputs saved to: {filepath}")
756
+ return str(filepath)
757
+
758
+ def save_comprehensive_details(self, filename: str = None) -> str:
759
+ """Save comprehensive detailed results"""
760
+ if filename is None:
761
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
762
+ filename = f"comprehensive_details_{timestamp}.json"
763
+
764
+ # Ensure results directory exists
765
+ results_dir = Path(__file__).parent / "results"
766
+ results_dir.mkdir(exist_ok=True)
767
+
768
+ filepath = results_dir / filename
769
+
770
+ # Create comprehensive evaluation data
771
+ comprehensive_data = {
772
+ "evaluation_metadata": {
773
+ "total_queries": len(self.comprehensive_results),
774
+ "successful_queries": len([r for r in self.comprehensive_results if r.get('overall_success')]),
775
+ "timestamp": datetime.now().isoformat(),
776
+ "evaluator_type": "comprehensive_metrics_1_to_4",
777
+ "metrics_evaluated": ["latency", "extraction", "relevance", "coverage"]
778
+ },
779
+ "comprehensive_results": self.comprehensive_results
780
+ }
781
+
782
+ with open(filepath, 'w', encoding='utf-8') as f:
783
+ json.dump(comprehensive_data, f, indent=2, ensure_ascii=False)
784
+
785
+ print(f"📋 Comprehensive details saved to: {filepath}")
786
+ return str(filepath)
787
+
788
+
789
+ # Independent execution interface
790
+ if __name__ == "__main__":
791
+ """Independent comprehensive evaluation interface"""
792
+
793
+ print("🚀 OnCall.ai Comprehensive Evaluator - Metrics 1-4 in Single Run")
794
+
795
+ if len(sys.argv) > 1:
796
+ query_file = sys.argv[1]
797
+ else:
798
+ # Default to evaluation/single_test_query.txt for initial testing
799
+ query_file = Path(__file__).parent / "single_test_query.txt"
800
+
801
+ if not os.path.exists(query_file):
802
+ print(f"❌ Query file not found: {query_file}")
803
+ print("Usage: python latency_evaluator.py [query_file.txt]")
804
+ sys.exit(1)
805
+
806
+ # Initialize evaluator
807
+ evaluator = ComprehensiveEvaluator()
808
+
809
+ # Parse queries from file
810
+ queries_by_category = evaluator.parse_queries_from_file(str(query_file))
811
+
812
+ if "error" in queries_by_category:
813
+ print(f"❌ Failed to parse queries: {queries_by_category['error']}")
814
+ sys.exit(1)
815
+
816
+ # Test each query comprehensively
817
+ print(f"\n🧪 Comprehensive Evaluation - All Metrics in Single Run")
818
+ print(f"📊 Collecting metrics 1-4 from single app.py pipeline execution")
819
+
820
+ for category, queries in queries_by_category.items():
821
+ if not queries:
822
+ continue
823
+
824
+ print(f"\n📂 Testing {category.upper()} queries:")
825
+
826
+ for i, query_info in enumerate(queries):
827
+ query_text = query_info['text']
828
+ print(f"\n🔍 Query {i+1}/{len(queries)} in {category} category:")
829
+ print(f" Text: {query_text}")
830
+
831
+ # Comprehensive evaluation (collects all metrics 1-4)
832
+ result = evaluator.evaluate_single_query_comprehensive(query_text, category)
833
+
834
+ # Pause between queries to avoid rate limits
835
+ if i < len(queries) - 1:
836
+ print(f" ⏳ Pausing 5s before next query...")
837
+ time.sleep(5)
838
+
839
+ # Longer pause between categories
840
+ if category != list(queries_by_category.keys())[-1]:
841
+ print(f"\n⏳ Pausing 10s before next category...")
842
+ time.sleep(10)
843
+
844
+ # Generate and save all metric statistics
845
+ print(f"\n📊 Generating comprehensive analysis for all metrics...")
846
+
847
+ # Save separate statistics for each metric
848
+ saved_stats = evaluator.save_all_metric_statistics()
849
+
850
+ # Save medical outputs for model comparison
851
+ outputs_path = evaluator.save_medical_outputs()
852
+
853
+ # Save comprehensive details
854
+ details_path = evaluator.save_comprehensive_details()
855
+
856
+ # Print comprehensive summary
857
+ print(f"\n📊 === COMPREHENSIVE EVALUATION SUMMARY ===")
858
+
859
+ for metric_name in ["latency", "extraction", "relevance", "coverage"]:
860
+ stats = evaluator.calculate_metric_statistics(metric_name)
861
+ overall_results = stats['overall_results']
862
+
863
+ print(f"\n{metric_name.upper()} METRICS:")
864
+
865
+ if metric_name == "latency":
866
+ print(f" Average: {overall_results['average_latency']:.2f}s (±{overall_results['std_deviation']:.2f})")
867
+ print(f" 60s Target: {'✅ Met' if overall_results['target_compliance'] >= 0.8 else '❌ Not Met'}")
868
+
869
+ elif metric_name == "extraction":
870
+ print(f" Success Rate: {overall_results['success_rate']:.1%}")
871
+ print(f" 80% Target: {'✅ Met' if overall_results['target_compliance'] else '❌ Not Met'}")
872
+
873
+ elif metric_name == "relevance":
874
+ print(f" Average Relevance: {overall_results['average_relevance']:.3f}")
875
+ print(f" 0.70 Target: {'✅ Met' if overall_results.get('target_compliance', False) else '❌ Not Met'}")
876
+
877
+ elif metric_name == "coverage":
878
+ print(f" Average Coverage: {overall_results['average_coverage']:.3f} ({overall_results['average_coverage']*100:.1f}%)")
879
+ print(f" 40% Target: {'✅ Met' if overall_results['meets_threshold'] else '❌ Not Met'}")
880
+
881
+ print(f"\n✅ Comprehensive evaluation complete! Files saved:")
882
+ for metric_name, filepath in saved_stats.items():
883
+ print(f" 📊 {metric_name.capitalize()}: {filepath}")
884
+ print(f" 📝 Medical Outputs: {outputs_path}")
885
+ print(f" 📋 Comprehensive Details: {details_path}")
886
+ print(f"\n💡 Next step: Run downstream evaluators for metrics 5-8")
887
+ print(f" python metric5_6_llm_judge_evaluator.py rag")
888
+ print(f" python metric7_8_precision_MRR.py {details_path}")
889
+ print(f" python latency_chart_generator.py")
890
+ print(f" python extraction_chart_generator.py # (create separately)")
891
+ print(f" python relevance_chart_generator.py # (create separately)")
892
+ print(f" python coverage_chart_generator.py # (create separately)")
evaluation/metric1_latency_chart_generator.py ADDED
@@ -0,0 +1,327 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Latency Chart Generator
4
+ ==========================================
5
+
6
+ Generates comprehensive latency analysis charts from saved statistics.
7
+ Reads JSON files produced by latency_evaluator.py and creates visualizations.
8
+
9
+ No LLM calls - pure data visualization.
10
+
11
+ Author: YanBo Chen
12
+ Date: 2025-08-04
13
+ """
14
+
15
+ import json
16
+ import os
17
+ import sys
18
+ from typing import Dict, List, Any
19
+ from datetime import datetime
20
+ from pathlib import Path
21
+ import glob
22
+
23
+ # Visualization imports
24
+ import matplotlib.pyplot as plt
25
+ import seaborn as sns
26
+ import pandas as pd
27
+ import numpy as np
28
+
29
+
30
+ class LatencyChartGenerator:
31
+ """Generate charts from latency evaluation statistics - no LLM dependency"""
32
+
33
+ def __init__(self):
34
+ """Initialize chart generator"""
35
+ print("📈 Initializing Latency Chart Generator...")
36
+
37
+ # Set up professional chart style
38
+ plt.style.use('default')
39
+ sns.set_palette("husl")
40
+
41
+ print("✅ Chart Generator ready")
42
+
43
+ def load_latest_statistics(self, results_dir: str = None) -> Dict[str, Any]:
44
+ """
45
+ Load the most recent latency statistics file
46
+
47
+ Args:
48
+ results_dir: Directory containing statistics files
49
+ """
50
+ if results_dir is None:
51
+ results_dir = Path(__file__).parent / "results"
52
+
53
+ # Find latest statistics file
54
+ pattern = str(results_dir / "latency_statistics_*.json")
55
+ stat_files = glob.glob(pattern)
56
+
57
+ if not stat_files:
58
+ raise FileNotFoundError(f"No latency statistics files found in {results_dir}")
59
+
60
+ # Get the most recent file
61
+ latest_file = max(stat_files, key=os.path.getmtime)
62
+
63
+ print(f"📊 Loading statistics from: {latest_file}")
64
+
65
+ with open(latest_file, 'r', encoding='utf-8') as f:
66
+ stats = json.load(f)
67
+
68
+ return stats
69
+
70
+ def generate_comprehensive_charts(self, stats: Dict[str, Any]) -> str:
71
+ """
72
+ Generate comprehensive 4-category latency analysis charts
73
+
74
+ Creates professional charts showing:
75
+ 1. Category comparison bar chart
76
+ 2. Individual query scatter plot
77
+ 3. Statistical summary table
78
+ 4. Performance distribution box plot
79
+ """
80
+ try:
81
+ # Create figure with subplots
82
+ fig, axes = plt.subplots(2, 2, figsize=(16, 12))
83
+ fig.suptitle('OnCall.ai Latency Analysis - Category Comparison',
84
+ fontsize=16, fontweight='bold')
85
+
86
+ category_results = stats['category_results']
87
+ overall_results = stats['overall_results']
88
+
89
+ # Chart 1: Category Comparison Bar Chart
90
+ ax1 = axes[0, 0]
91
+ categories = []
92
+ avg_latencies = []
93
+ std_devs = []
94
+
95
+ # Collect category data
96
+ for category, cat_stats in category_results.items():
97
+ if cat_stats['query_count'] > 0:
98
+ categories.append(category.replace('_', ' ').title())
99
+ avg_latencies.append(cat_stats['average_latency'])
100
+ std_devs.append(cat_stats['std_deviation'])
101
+
102
+ # Add overall
103
+ categories.append('Overall')
104
+ avg_latencies.append(overall_results['average_latency'])
105
+ std_devs.append(overall_results['std_deviation'])
106
+
107
+ # Create bar chart with error bars
108
+ bars = ax1.bar(categories, avg_latencies, capsize=5, alpha=0.8,
109
+ color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
110
+ ax1.errorbar(categories, avg_latencies, yerr=std_devs, fmt='none',
111
+ color='black', capsize=3, capthick=1)
112
+
113
+ ax1.set_title('Average Latency by Category', fontweight='bold')
114
+ ax1.set_ylabel('Latency (seconds)')
115
+ ax1.set_xlabel('Query Category')
116
+ ax1.grid(True, alpha=0.3)
117
+
118
+ # Add value labels on bars
119
+ for bar, avg, std in zip(bars, avg_latencies, std_devs):
120
+ height = bar.get_height()
121
+ ax1.text(bar.get_x() + bar.get_width()/2., height + std*0.1,
122
+ f'{avg:.1f}s', ha='center', va='bottom', fontweight='bold')
123
+
124
+ # Add target line
125
+ ax1.axhline(y=30.0, color='red', linestyle='--', alpha=0.7, label='30s Target')
126
+ ax1.legend()
127
+
128
+ # Chart 2: Individual Query Performance
129
+ ax2 = axes[0, 1]
130
+
131
+ query_indices = []
132
+ latencies = []
133
+ colors = []
134
+
135
+ color_map = {'diagnosis': '#1f77b4', 'treatment': '#ff7f0e', 'mixed': '#d62728'}
136
+ query_idx = 0
137
+
138
+ for category, cat_stats in category_results.items():
139
+ for latency in cat_stats['individual_latencies']:
140
+ query_indices.append(query_idx)
141
+ latencies.append(latency)
142
+ colors.append(color_map.get(category, 'gray'))
143
+ query_idx += 1
144
+
145
+ if latencies:
146
+ ax2.scatter(query_indices, latencies, c=colors, alpha=0.7, s=100)
147
+ ax2.set_title('Individual Query Performance', fontweight='bold')
148
+ ax2.set_ylabel('Latency (seconds)')
149
+ ax2.set_xlabel('Query Index')
150
+ ax2.grid(True, alpha=0.3)
151
+
152
+ # Add target line
153
+ ax2.axhline(y=30.0, color='red', linestyle='--', alpha=0.7, label='30s Target')
154
+
155
+ # Add category legend
156
+ from matplotlib.patches import Patch
157
+ legend_elements = [Patch(facecolor=color_map[cat], label=cat.title())
158
+ for cat in color_map.keys() if cat in category_results.keys()]
159
+ ax2.legend(handles=legend_elements)
160
+ else:
161
+ ax2.text(0.5, 0.5, 'No latency data available',
162
+ ha='center', va='center', transform=ax2.transAxes)
163
+ ax2.set_title('Individual Query Performance', fontweight='bold')
164
+
165
+ # Chart 3: Statistical Summary Table
166
+ ax3 = axes[1, 0]
167
+ ax3.axis('tight')
168
+ ax3.axis('off')
169
+
170
+ # Create summary table
171
+ table_data = []
172
+ headers = ['Category', 'Avg (s)', 'Std (s)', 'Min (s)', 'Max (s)', 'Count']
173
+
174
+ for category, cat_stats in category_results.items():
175
+ if cat_stats['query_count'] > 0:
176
+ table_data.append([
177
+ category.replace('_', ' ').title(),
178
+ f"{cat_stats['average_latency']:.2f}",
179
+ f"{cat_stats['std_deviation']:.2f}",
180
+ f"{cat_stats['min_latency']:.2f}",
181
+ f"{cat_stats['max_latency']:.2f}",
182
+ str(cat_stats['query_count'])
183
+ ])
184
+
185
+ # Add overall row
186
+ table_data.append([
187
+ 'Overall',
188
+ f"{overall_results['average_latency']:.2f}",
189
+ f"{overall_results['std_deviation']:.2f}",
190
+ f"{overall_results['min_latency']:.2f}",
191
+ f"{overall_results['max_latency']:.2f}",
192
+ str(overall_results['successful_queries'])
193
+ ])
194
+
195
+ if table_data:
196
+ table = ax3.table(cellText=table_data, colLabels=headers,
197
+ cellLoc='center', loc='center',
198
+ colWidths=[0.2, 0.15, 0.15, 0.15, 0.15, 0.1])
199
+ table.auto_set_font_size(False)
200
+ table.set_fontsize(10)
201
+ table.scale(1, 2)
202
+
203
+ # Style the table header
204
+ for i in range(len(headers)):
205
+ table[(0, i)].set_text_props(weight='bold', color='white')
206
+ table[(0, i)].set_facecolor('#2E7D32')
207
+
208
+ ax3.set_title('Statistical Summary', fontweight='bold', pad=20)
209
+
210
+ # Chart 4: Performance Distribution
211
+ ax4 = axes[1, 1]
212
+
213
+ # Create box plot if we have multiple data points
214
+ box_data = []
215
+ box_labels = []
216
+
217
+ for category, cat_stats in category_results.items():
218
+ if cat_stats['individual_latencies'] and len(cat_stats['individual_latencies']) > 0:
219
+ box_data.append(cat_stats['individual_latencies'])
220
+ box_labels.append(category.replace('_', ' ').title())
221
+
222
+ if box_data and len(box_data) > 0:
223
+ box_plot = ax4.boxplot(box_data, labels=box_labels, patch_artist=True)
224
+
225
+ # Color the boxes
226
+ colors = ['#1f77b4', '#ff7f0e', '#d62728']
227
+ for patch, color in zip(box_plot['boxes'], colors[:len(box_plot['boxes'])]):
228
+ patch.set_facecolor(color)
229
+ patch.set_alpha(0.7)
230
+
231
+ ax4.set_title('Latency Distribution by Category', fontweight='bold')
232
+ ax4.set_ylabel('Latency (seconds)')
233
+ ax4.grid(True, alpha=0.3)
234
+
235
+ # Add target line
236
+ ax4.axhline(y=30.0, color='red', linestyle='--', alpha=0.7, label='30s Target')
237
+ ax4.legend()
238
+ else:
239
+ # For single data points, show a simple bar chart
240
+ single_categories = []
241
+ single_latencies = []
242
+
243
+ for category, cat_stats in category_results.items():
244
+ if cat_stats['query_count'] > 0:
245
+ single_categories.append(category.replace('_', ' ').title())
246
+ single_latencies.append(cat_stats['average_latency'])
247
+
248
+ if single_categories:
249
+ ax4.bar(single_categories, single_latencies, alpha=0.7,
250
+ color=['#1f77b4', '#ff7f0e', '#d62728'][:len(single_categories)])
251
+ ax4.set_title('Category Latency (Single Query Each)', fontweight='bold')
252
+ ax4.set_ylabel('Latency (seconds)')
253
+ ax4.grid(True, alpha=0.3)
254
+ ax4.axhline(y=30.0, color='red', linestyle='--', alpha=0.7, label='30s Target')
255
+ ax4.legend()
256
+ else:
257
+ ax4.text(0.5, 0.5, 'No data available for distribution plot',
258
+ ha='center', va='center', transform=ax4.transAxes)
259
+ ax4.set_title('Latency Distribution', fontweight='bold')
260
+
261
+ # Adjust layout and save
262
+ plt.tight_layout()
263
+
264
+ # Save chart
265
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
266
+ chart_filename = f"latency_analysis_charts_{timestamp}.png"
267
+
268
+ # Ensure results directory exists
269
+ results_dir = Path(__file__).parent / "results"
270
+ results_dir.mkdir(exist_ok=True)
271
+ chart_path = results_dir / chart_filename
272
+
273
+ plt.savefig(chart_path, dpi=300, bbox_inches='tight',
274
+ facecolor='white', edgecolor='none')
275
+ plt.close()
276
+
277
+ print(f"📈 Charts saved to: {chart_path}")
278
+ return str(chart_path)
279
+
280
+ except Exception as e:
281
+ print(f"❌ Chart generation failed: {e}")
282
+ return ""
283
+
284
+ def print_statistics_summary(self, stats: Dict[str, Any]):
285
+ """Print formatted statistics summary to console"""
286
+ category_results = stats['category_results']
287
+ overall_results = stats['overall_results']
288
+
289
+ print(f"\n📊 === LATENCY ANALYSIS CHART SUMMARY ===")
290
+ print(f"Overall Performance:")
291
+ print(f" Average Latency: {overall_results['average_latency']:.2f}s (±{overall_results['std_deviation']:.2f})")
292
+ print(f" Success Rate: {overall_results['successful_queries']}/{overall_results['total_queries']}")
293
+ print(f" 30s Target Compliance: {overall_results['target_compliance']:.1%}")
294
+
295
+ print(f"\nCategory Breakdown:")
296
+ for category, cat_stats in category_results.items():
297
+ if cat_stats['query_count'] > 0:
298
+ print(f" {category.capitalize()}: {cat_stats['average_latency']:.2f}s (±{cat_stats['std_deviation']:.2f}) [{cat_stats['query_count']} queries]")
299
+
300
+
301
+ # Independent execution interface
302
+ if __name__ == "__main__":
303
+ """Independent chart generation interface"""
304
+
305
+ print("📈 OnCall.ai Latency Chart Generator")
306
+
307
+ # Initialize chart generator
308
+ chart_gen = LatencyChartGenerator()
309
+
310
+ try:
311
+ # Load latest statistics
312
+ stats = chart_gen.load_latest_statistics()
313
+
314
+ # Generate charts
315
+ chart_path = chart_gen.generate_comprehensive_charts(stats)
316
+
317
+ # Print summary
318
+ chart_gen.print_statistics_summary(stats)
319
+
320
+ print(f"\n✅ Chart generation complete!")
321
+ print(f"📈 Charts saved to: {chart_path}")
322
+
323
+ except FileNotFoundError as e:
324
+ print(f"❌ {e}")
325
+ print("💡 Please run latency_evaluator.py first to generate statistics data")
326
+ except Exception as e:
327
+ print(f"❌ Chart generation failed: {e}")
evaluation/metric2_extraction_chart_generator.py ADDED
@@ -0,0 +1,216 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Extraction Chart Generator
4
+ ============================================
5
+
6
+ Generates extraction success rate charts from saved statistics.
7
+ Reads JSON files produced by comprehensive evaluator.
8
+
9
+ Author: YanBo Chen
10
+ Date: 2025-08-04
11
+ """
12
+
13
+ import json
14
+ import os
15
+ import sys
16
+ from typing import Dict, List, Any
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ import glob
20
+
21
+ # Visualization imports
22
+ import matplotlib.pyplot as plt
23
+ import seaborn as sns
24
+ import pandas as pd
25
+ import numpy as np
26
+
27
+
28
+ class ExtractionChartGenerator:
29
+ """Generate charts for condition extraction metrics"""
30
+
31
+ def __init__(self):
32
+ """Initialize chart generator"""
33
+ print("📈 Initializing Extraction Chart Generator...")
34
+ plt.style.use('default')
35
+ sns.set_palette("husl")
36
+ print("✅ Chart Generator ready")
37
+
38
+ def load_latest_extraction_statistics(self, results_dir: str = None) -> Dict[str, Any]:
39
+ """Load the most recent extraction statistics file"""
40
+ if results_dir is None:
41
+ results_dir = Path(__file__).parent / "results"
42
+
43
+ pattern = str(results_dir / "extraction_statistics_*.json")
44
+ stat_files = glob.glob(pattern)
45
+
46
+ if not stat_files:
47
+ raise FileNotFoundError(f"No extraction statistics files found in {results_dir}")
48
+
49
+ latest_file = max(stat_files, key=os.path.getmtime)
50
+ print(f"📊 Loading extraction statistics from: {latest_file}")
51
+
52
+ with open(latest_file, 'r', encoding='utf-8') as f:
53
+ stats = json.load(f)
54
+
55
+ return stats
56
+
57
+ def generate_extraction_charts(self, stats: Dict[str, Any]) -> str:
58
+ """Generate extraction success rate analysis charts"""
59
+ try:
60
+ fig, axes = plt.subplots(2, 2, figsize=(16, 12))
61
+ fig.suptitle('OnCall.ai Extraction Success Rate Analysis', fontsize=16, fontweight='bold')
62
+
63
+ category_results = stats['category_results']
64
+ overall_results = stats['overall_results']
65
+
66
+ # Chart 1: Success Rate by Category
67
+ ax1 = axes[0, 0]
68
+ categories = []
69
+ success_rates = []
70
+
71
+ for category, cat_stats in category_results.items():
72
+ if cat_stats['total_count'] > 0:
73
+ categories.append(category.replace('_', ' ').title())
74
+ success_rates.append(cat_stats['success_rate'] * 100)
75
+
76
+ categories.append('Overall')
77
+ success_rates.append(overall_results['success_rate'] * 100)
78
+
79
+ bars = ax1.bar(categories, success_rates, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
80
+ ax1.set_title('Extraction Success Rate by Category', fontweight='bold')
81
+ ax1.set_ylabel('Success Rate (%)')
82
+ ax1.set_xlabel('Query Category')
83
+ ax1.grid(True, alpha=0.3)
84
+
85
+ # Add target line
86
+ ax1.axhline(y=80, color='red', linestyle='--', alpha=0.7, label='80% Target')
87
+ ax1.legend()
88
+
89
+ # Add value labels
90
+ for bar, rate in zip(bars, success_rates):
91
+ height = bar.get_height()
92
+ ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
93
+ f'{rate:.1f}%', ha='center', va='bottom', fontweight='bold')
94
+
95
+ # Chart 2: Success Count
96
+ ax2 = axes[0, 1]
97
+ successful_counts = []
98
+ total_counts = []
99
+
100
+ for category, cat_stats in category_results.items():
101
+ if cat_stats['total_count'] > 0:
102
+ successful_counts.append(cat_stats['successful_count'])
103
+ total_counts.append(cat_stats['total_count'])
104
+
105
+ successful_counts.append(overall_results['successful_count'])
106
+ total_counts.append(overall_results['total_count'])
107
+
108
+ x = np.arange(len(categories))
109
+ width = 0.35
110
+
111
+ ax2.bar(x - width/2, successful_counts, width, label='Successful', alpha=0.8)
112
+ ax2.bar(x + width/2, total_counts, width, label='Total', alpha=0.8)
113
+
114
+ ax2.set_title('Extraction Success Count', fontweight='bold')
115
+ ax2.set_ylabel('Query Count')
116
+ ax2.set_xlabel('Query Category')
117
+ ax2.set_xticks(x)
118
+ ax2.set_xticklabels(categories)
119
+ ax2.legend()
120
+ ax2.grid(True, alpha=0.3)
121
+
122
+ # Chart 3: Statistical Summary Table
123
+ ax3 = axes[1, 0]
124
+ ax3.axis('tight')
125
+ ax3.axis('off')
126
+
127
+ table_data = []
128
+ headers = ['Category', 'Success Rate', 'Success/Total', 'Avg Time (s)', 'Target Met']
129
+
130
+ for category, cat_stats in category_results.items():
131
+ if cat_stats['total_count'] > 0:
132
+ table_data.append([
133
+ category.replace('_', ' ').title(),
134
+ f"{cat_stats['success_rate']:.1%}",
135
+ f"{cat_stats['successful_count']}/{cat_stats['total_count']}",
136
+ f"{cat_stats['average_extraction_time']:.3f}",
137
+ '✅' if cat_stats.get('meets_threshold', False) else '❌'
138
+ ])
139
+
140
+ table_data.append([
141
+ 'Overall',
142
+ f"{overall_results['success_rate']:.1%}",
143
+ f"{overall_results['successful_count']}/{overall_results['total_count']}",
144
+ '-',
145
+ '✅' if overall_results.get('target_compliance', False) else '❌'
146
+ ])
147
+
148
+ if table_data:
149
+ table = ax3.table(cellText=table_data, colLabels=headers,
150
+ cellLoc='center', loc='center')
151
+ table.auto_set_font_size(False)
152
+ table.set_fontsize(10)
153
+ table.scale(1, 2)
154
+
155
+ # Style header
156
+ for i in range(len(headers)):
157
+ table[(0, i)].set_text_props(weight='bold', color='white')
158
+ table[(0, i)].set_facecolor('#2E7D32')
159
+
160
+ ax3.set_title('Extraction Statistics Summary', fontweight='bold', pad=20)
161
+
162
+ # Chart 4: Performance visualization
163
+ ax4 = axes[1, 1]
164
+
165
+ # Simple performance indicator
166
+ overall_rate = overall_results['success_rate'] * 100
167
+ success_color = '#2ca02c' if overall_rate >= 80 else '#d62728'  # green when the 80% target is met, red otherwise
168
+
169
+ wedges, texts, autotexts = ax4.pie([overall_rate, 100-overall_rate],
170
+ labels=['Successful', 'Failed'],
171
+ autopct='%1.1f%%',
172
+ colors=[success_color, '#ffcccc'],
173
+ startangle=90)
174
+
175
+ ax4.set_title(f'Overall Extraction Success\n{overall_rate:.1f}% Success Rate', fontweight='bold')
176
+
177
+ plt.tight_layout()
178
+
179
+ # Save chart
180
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
181
+ chart_filename = f"extraction_analysis_charts_{timestamp}.png"
182
+
183
+ results_dir = Path(__file__).parent / "results"
184
+ results_dir.mkdir(exist_ok=True)
185
+ chart_path = results_dir / chart_filename
186
+
187
+ plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')
188
+ plt.close()
189
+
190
+ print(f"📈 Extraction charts saved to: {chart_path}")
191
+ return str(chart_path)
192
+
193
+ except Exception as e:
194
+ print(f"❌ Extraction chart generation failed: {e}")
195
+ return ""
196
+
197
+
198
+ if __name__ == "__main__":
199
+ """Independent extraction chart generation"""
200
+
201
+ print("📈 OnCall.ai Extraction Chart Generator")
202
+
203
+ chart_gen = ExtractionChartGenerator()
204
+
205
+ try:
206
+ stats = chart_gen.load_latest_extraction_statistics()
207
+ chart_path = chart_gen.generate_extraction_charts(stats)
208
+
209
+ print(f"\n✅ Extraction chart generation complete!")
210
+ print(f"📈 Charts saved to: {chart_path}")
211
+
212
+ except FileNotFoundError as e:
213
+ print(f"❌ {e}")
214
+ print("💡 Please run latency_evaluator.py first to generate extraction statistics data")
215
+ except Exception as e:
216
+ print(f"❌ Chart generation failed: {e}")
evaluation/metric3_relevance_chart_generator.py ADDED
@@ -0,0 +1,231 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Relevance Chart Generator
4
+ ============================================
5
+
6
+ Generates retrieval relevance charts from saved statistics.
7
+ Shows cosine similarity analysis and threshold compliance.
8
+
9
+ Author: YanBo Chen
10
+ Date: 2025-08-04
11
+ """
12
+
13
+ import json
14
+ import os
15
+ import sys
16
+ from typing import Dict, List, Any
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ import glob
20
+
21
+ # Visualization imports
22
+ import matplotlib.pyplot as plt
23
+ import seaborn as sns
24
+ import pandas as pd
25
+ import numpy as np
26
+
27
+
28
+ class RelevanceChartGenerator:
29
+ """Generate charts for retrieval relevance metrics"""
30
+
31
+ def __init__(self):
32
+ """Initialize chart generator"""
33
+ print("📈 Initializing Relevance Chart Generator...")
34
+ plt.style.use('default')
35
+ sns.set_palette("husl")
36
+ print("✅ Chart Generator ready")
37
+
38
+ def load_latest_relevance_statistics(self, results_dir: str = None) -> Dict[str, Any]:
39
+ """Load the most recent relevance statistics file"""
40
+ if results_dir is None:
41
+ results_dir = Path(__file__).parent / "results"
42
+
43
+ pattern = str(results_dir / "relevance_statistics_*.json")
44
+ stat_files = glob.glob(pattern)
45
+
46
+ if not stat_files:
47
+ raise FileNotFoundError(f"No relevance statistics files found in {results_dir}")
48
+
49
+ latest_file = max(stat_files, key=os.path.getmtime)
50
+ print(f"📊 Loading relevance statistics from: {latest_file}")
51
+
52
+ with open(latest_file, 'r', encoding='utf-8') as f:
53
+ stats = json.load(f)
54
+
55
+ return stats
56
+
57
+ def generate_relevance_charts(self, stats: Dict[str, Any]) -> str:
58
+ """Generate relevance analysis charts"""
59
+ try:
60
+ fig, axes = plt.subplots(2, 2, figsize=(16, 12))
61
+ fig.suptitle('OnCall.ai Retrieval Relevance Analysis', fontsize=16, fontweight='bold')
62
+
63
+ category_results = stats['category_results']
64
+ overall_results = stats['overall_results']
65
+
66
+ # Chart 1: Average Relevance by Category
67
+ ax1 = axes[0, 0]
68
+ categories = []
69
+ avg_relevances = []
70
+
71
+ for category, cat_stats in category_results.items():
72
+ if cat_stats['successful_retrievals'] > 0:
73
+ categories.append(category.replace('_', ' ').title())
74
+ avg_relevances.append(cat_stats['average_relevance'])
75
+
76
+ categories.append('Overall')
77
+ avg_relevances.append(overall_results['average_relevance'])
78
+
79
+ bars = ax1.bar(categories, avg_relevances, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
80
+ ax1.set_title('Average Relevance Score by Category', fontweight='bold')
81
+ ax1.set_ylabel('Relevance Score (Cosine Similarity)')
82
+ ax1.set_xlabel('Query Category')
83
+ ax1.grid(True, alpha=0.3)
84
+
85
+ # Add threshold lines
86
+ ax1.axhline(y=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
87
+ ax1.axhline(y=0.70, color='red', linestyle='--', alpha=0.7, label='0.70 Target')
88
+ ax1.legend()
89
+
90
+ # Add value labels
91
+ for bar, relevance in zip(bars, avg_relevances):
92
+ height = bar.get_height()
93
+ ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
94
+ f'{relevance:.3f}', ha='center', va='bottom', fontweight='bold')
95
+
96
+ # Chart 2: Relevance Distribution
97
+ ax2 = axes[0, 1]
98
+
99
+ # Collect all individual relevance scores
100
+ all_scores = []
101
+ category_labels = []
102
+
103
+ for category, cat_stats in category_results.items():
104
+ if cat_stats.get('individual_relevance_scores'):
105
+ all_scores.extend(cat_stats['individual_relevance_scores'])
106
+ category_labels.extend([category] * len(cat_stats['individual_relevance_scores']))
107
+
108
+ if all_scores:
109
+ # Create histogram
110
+ ax2.hist(all_scores, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
111
+ ax2.axvline(x=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
112
+ ax2.axvline(x=0.70, color='red', linestyle='--', alpha=0.7, label='0.70 Target')
113
+ ax2.axvline(x=np.mean(all_scores), color='green', linestyle='-', alpha=0.8, label=f'Mean: {np.mean(all_scores):.3f}')
114
+
115
+ ax2.set_title('Relevance Score Distribution', fontweight='bold')
116
+ ax2.set_xlabel('Relevance Score')
117
+ ax2.set_ylabel('Frequency')
118
+ ax2.legend()
119
+ ax2.grid(True, alpha=0.3)
120
+ else:
121
+ ax2.text(0.5, 0.5, 'No relevance data available', ha='center', va='center', transform=ax2.transAxes)
122
+ ax2.set_title('Relevance Score Distribution', fontweight='bold')
123
+
124
+ # Chart 3: Statistical Summary Table
125
+ ax3 = axes[1, 0]
126
+ ax3.axis('tight')
127
+ ax3.axis('off')
128
+
129
+ table_data = []
130
+ headers = ['Category', 'Avg Relevance', 'Min/Max', 'Success/Total', 'Threshold Met']
131
+
132
+ for category, cat_stats in category_results.items():
133
+ if cat_stats['total_queries'] > 0:
134
+ table_data.append([
135
+ category.replace('_', ' ').title(),
136
+ f"{cat_stats['average_relevance']:.3f}",
137
+ f"{cat_stats['min_relevance']:.3f}/{cat_stats['max_relevance']:.3f}",
138
+ f"{cat_stats['successful_retrievals']}/{cat_stats['total_queries']}",
139
+ '✅' if cat_stats.get('meets_threshold', False) else '❌'
140
+ ])
141
+
142
+ table_data.append([
143
+ 'Overall',
144
+ f"{overall_results['average_relevance']:.3f}",
145
+ f"{overall_results['min_relevance']:.3f}/{overall_results['max_relevance']:.3f}",
146
+ f"{overall_results['successful_queries']}/{overall_results['total_queries']}",
147
+ '✅' if overall_results.get('target_compliance', False) else '❌'
148
+ ])
149
+
150
+ if table_data:
151
+ table = ax3.table(cellText=table_data, colLabels=headers,
152
+ cellLoc='center', loc='center')
153
+ table.auto_set_font_size(False)
154
+ table.set_fontsize(10)
155
+ table.scale(1, 2)
156
+
157
+ # Style header
158
+ for i in range(len(headers)):
159
+ table[(0, i)].set_text_props(weight='bold', color='white')
160
+ table[(0, i)].set_facecolor('#2E7D32')
161
+
162
+ ax3.set_title('Relevance Statistics Summary', fontweight='bold', pad=20)
163
+
164
+ # Chart 4: Category Comparison Box Plot
165
+ ax4 = axes[1, 1]
166
+
167
+ box_data = []
168
+ box_labels = []
169
+
170
+ for category, cat_stats in category_results.items():
171
+ if cat_stats.get('individual_relevance_scores'):
172
+ box_data.append(cat_stats['individual_relevance_scores'])
173
+ box_labels.append(category.replace('_', ' ').title())
174
+
175
+ if box_data:
176
+ box_plot = ax4.boxplot(box_data, labels=box_labels, patch_artist=True)
177
+ colors = ['#1f77b4', '#ff7f0e', '#d62728']
178
+ for patch, color in zip(box_plot['boxes'], colors[:len(box_plot['boxes'])]):
179
+ patch.set_facecolor(color)
180
+ patch.set_alpha(0.7)
181
+
182
+ ax4.axhline(y=0.2, color='orange', linestyle='--', alpha=0.7, label='0.2 Threshold')
183
+ ax4.axhline(y=0.70, color='red', linestyle='--', alpha=0.7, label='0.70 Target')
184
+ ax4.set_title('Relevance Distribution by Category', fontweight='bold')
185
+ ax4.set_ylabel('Relevance Score')
186
+ ax4.legend()
187
+ ax4.grid(True, alpha=0.3)
188
+ else:
189
+ ax4.text(0.5, 0.5, 'Insufficient data for box plot', ha='center', va='center', transform=ax4.transAxes)
190
+ ax4.set_title('Relevance Distribution by Category', fontweight='bold')
191
+
192
+ plt.tight_layout()
193
+
194
+ # Save chart
195
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
196
+ chart_filename = f"relevance_analysis_charts_{timestamp}.png"
197
+
198
+ results_dir = Path(__file__).parent / "results"
199
+ results_dir.mkdir(exist_ok=True)
200
+ chart_path = results_dir / chart_filename
201
+
202
+ plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')
203
+ plt.close()
204
+
205
+ print(f"📈 Relevance charts saved to: {chart_path}")
206
+ return str(chart_path)
207
+
208
+ except Exception as e:
209
+ print(f"❌ Relevance chart generation failed: {e}")
210
+ return ""
211
+
212
+
213
+ if __name__ == "__main__":
214
+ """Independent relevance chart generation"""
215
+
216
+ print("📈 OnCall.ai Relevance Chart Generator")
217
+
218
+ chart_gen = RelevanceChartGenerator()
219
+
220
+ try:
221
+ stats = chart_gen.load_latest_relevance_statistics()
222
+ chart_path = chart_gen.generate_relevance_charts(stats)
223
+
224
+ print(f"\n✅ Relevance chart generation complete!")
225
+ print(f"📈 Charts saved to: {chart_path}")
226
+
227
+ except FileNotFoundError as e:
228
+ print(f"❌ {e}")
229
+ print("💡 Please run latency_evaluator.py first to generate relevance statistics data")
230
+ except Exception as e:
231
+ print(f"❌ Chart generation failed: {e}")
evaluation/metric4_coverage_chart_generator.py ADDED
@@ -0,0 +1,222 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Coverage Chart Generator
4
+ ===========================================
5
+
6
+ Generates retrieval coverage charts from saved statistics.
7
+ Shows how well generated advice utilizes retrieved content.
8
+
9
+ Author: YanBo Chen
10
+ Date: 2025-08-04
11
+ """
12
+
13
+ import json
14
+ import os
15
+ import sys
16
+ from typing import Dict, List, Any
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ import glob
20
+
21
+ # Visualization imports
22
+ import matplotlib.pyplot as plt
23
+ import seaborn as sns
24
+ import pandas as pd
25
+ import numpy as np
26
+
27
+
28
+ class CoverageChartGenerator:
29
+ """Generate charts for retrieval coverage metrics"""
30
+
31
+ def __init__(self):
32
+ """Initialize chart generator"""
33
+ print("📈 Initializing Coverage Chart Generator...")
34
+ plt.style.use('default')
35
+ sns.set_palette("husl")
36
+ print("✅ Chart Generator ready")
37
+
38
+ def load_latest_coverage_statistics(self, results_dir: str = None) -> Dict[str, Any]:
39
+ """Load the most recent coverage statistics file"""
40
+ if results_dir is None:
41
+ results_dir = Path(__file__).parent / "results"
42
+
43
+ pattern = str(results_dir / "coverage_statistics_*.json")
44
+ stat_files = glob.glob(pattern)
45
+
46
+ if not stat_files:
47
+ raise FileNotFoundError(f"No coverage statistics files found in {results_dir}")
48
+
49
+ latest_file = max(stat_files, key=os.path.getmtime)
50
+ print(f"📊 Loading coverage statistics from: {latest_file}")
51
+
52
+ with open(latest_file, 'r', encoding='utf-8') as f:
53
+ stats = json.load(f)
54
+
55
+ return stats
56
+
57
+ def generate_coverage_charts(self, stats: Dict[str, Any]) -> str:
58
+ """Generate coverage analysis charts"""
59
+ try:
60
+ fig, axes = plt.subplots(2, 2, figsize=(16, 12))
61
+ fig.suptitle('OnCall.ai Retrieval Coverage Analysis', fontsize=16, fontweight='bold')
62
+
63
+ category_results = stats['category_results']
64
+ overall_results = stats['overall_results']
65
+
66
+ # Chart 1: Average Coverage by Category
67
+ ax1 = axes[0, 0]
68
+ categories = []
69
+ avg_coverages = []
70
+
71
+ for category, cat_stats in category_results.items():
72
+ if cat_stats['successful_evaluations'] > 0:
73
+ categories.append(category.replace('_', ' ').title())
74
+ avg_coverages.append(cat_stats['average_coverage'] * 100) # Convert to percentage
75
+
76
+ categories.append('Overall')
77
+ avg_coverages.append(overall_results['average_coverage'] * 100)
78
+
79
+ bars = ax1.bar(categories, avg_coverages, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
80
+ ax1.set_title('Average Coverage Score by Category', fontweight='bold')
81
+ ax1.set_ylabel('Coverage Score (%)')
82
+ ax1.set_xlabel('Query Category')
83
+ ax1.grid(True, alpha=0.3)
84
+
85
+ # Add target line
86
+ ax1.axhline(y=40, color='red', linestyle='--', alpha=0.7, label='40% Target')
87
+ ax1.legend()
88
+
89
+ # Add value labels
90
+ for bar, coverage in zip(bars, avg_coverages):
91
+ height = bar.get_height()
92
+ ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
93
+ f'{coverage:.1f}%', ha='center', va='bottom', fontweight='bold')
94
+
95
+ # Chart 2: Coverage Distribution
96
+ ax2 = axes[0, 1]
97
+
98
+ # Collect all individual coverage scores
99
+ all_scores = []
100
+
101
+ for category, cat_stats in category_results.items():
102
+ if cat_stats.get('individual_coverage_scores'):
103
+ all_scores.extend([score * 100 for score in cat_stats['individual_coverage_scores']])
104
+
105
+ if all_scores:
106
+ # Create histogram
107
+ ax2.hist(all_scores, bins=15, alpha=0.7, color='lightcoral', edgecolor='black')
108
+ ax2.axvline(x=40, color='red', linestyle='--', alpha=0.7, label='40% Target')
109
+ ax2.axvline(x=np.mean(all_scores), color='green', linestyle='-', alpha=0.8, label=f'Mean: {np.mean(all_scores):.1f}%')
110
+
111
+ ax2.set_title('Coverage Score Distribution', fontweight='bold')
112
+ ax2.set_xlabel('Coverage Score (%)')
113
+ ax2.set_ylabel('Frequency')
114
+ ax2.legend()
115
+ ax2.grid(True, alpha=0.3)
116
+ else:
117
+ ax2.text(0.5, 0.5, 'No coverage data available', ha='center', va='center', transform=ax2.transAxes)
118
+ ax2.set_title('Coverage Score Distribution', fontweight='bold')
119
+
120
+ # Chart 3: Statistical Summary Table
121
+ ax3 = axes[1, 0]
122
+ ax3.axis('tight')
123
+ ax3.axis('off')
124
+
125
+ table_data = []
126
+ headers = ['Category', 'Avg Coverage', 'Min/Max', 'Success/Total', 'Target Met']
127
+
128
+ for category, cat_stats in category_results.items():
129
+ if cat_stats['total_queries'] > 0:
130
+ table_data.append([
131
+ category.replace('_', ' ').title(),
132
+ f"{cat_stats['average_coverage']:.3f}",
133
+ f"{cat_stats['min_coverage']:.3f}/{cat_stats['max_coverage']:.3f}",
134
+ f"{cat_stats['successful_evaluations']}/{cat_stats['total_queries']}",
135
+ '✅' if cat_stats.get('meets_threshold', False) else '❌'
136
+ ])
137
+
138
+ table_data.append([
139
+ 'Overall',
140
+ f"{overall_results['average_coverage']:.3f}",
141
+ f"{overall_results['min_coverage']:.3f}/{overall_results['max_coverage']:.3f}",
142
+ f"{overall_results['successful_queries']}/{overall_results['total_queries']}",
143
+ '✅' if overall_results.get('meets_threshold', False) else '❌'
144
+ ])
145
+
146
+ if table_data:
147
+ table = ax3.table(cellText=table_data, colLabels=headers,
148
+ cellLoc='center', loc='center')
149
+ table.auto_set_font_size(False)
150
+ table.set_fontsize(10)
151
+ table.scale(1, 2)
152
+
153
+ # Style header
154
+ for i in range(len(headers)):
155
+ table[(0, i)].set_text_props(weight='bold', color='white')
156
+ table[(0, i)].set_facecolor('#2E7D32')
157
+
158
+ ax3.set_title('Coverage Statistics Summary', fontweight='bold', pad=20)
159
+
160
+ # Chart 4: Coverage Performance Radar/Gauge
161
+ ax4 = axes[1, 1]
162
+
163
+ # Create gauge-like visualization for overall coverage
164
+ overall_coverage_pct = overall_results['average_coverage'] * 100
165
+
166
+ # Pie chart as gauge
167
+ sizes = [overall_coverage_pct, 100 - overall_coverage_pct]
168
+ colors = ['#2ca02c' if overall_coverage_pct >= 40 else '#ff7f0e', '#f0f0f0']
169
+
170
+ wedges, texts, autotexts = ax4.pie(sizes, labels=['Covered', 'Not Covered'],
171
+ autopct='%1.1f%%',
172
+ colors=colors,
173
+ startangle=90,
174
+ counterclock=False)
175
+
176
+ # Add center text
177
+ ax4.text(0, 0, f'{overall_coverage_pct:.1f}%\nCoverage',
178
+ ha='center', va='center', fontsize=14, fontweight='bold')
179
+
180
+ ax4.set_title(f'Overall Coverage Performance\n{"✅ Target Met" if overall_coverage_pct >= 40 else "❌ Below Target"}',
181
+ fontweight='bold')
182
+
183
+ plt.tight_layout()
184
+
185
+ # Save chart
186
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
187
+ chart_filename = f"coverage_analysis_charts_{timestamp}.png"
188
+
189
+ results_dir = Path(__file__).parent / "results"
190
+ results_dir.mkdir(exist_ok=True)
191
+ chart_path = results_dir / chart_filename
192
+
193
+ plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')
194
+ plt.close()
195
+
196
+ print(f"📈 Coverage charts saved to: {chart_path}")
197
+ return str(chart_path)
198
+
199
+ except Exception as e:
200
+ print(f"❌ Coverage chart generation failed: {e}")
201
+ return ""
202
+
203
+
204
+ if __name__ == "__main__":
205
+ """Independent coverage chart generation"""
206
+
207
+ print("📈 OnCall.ai Coverage Chart Generator")
208
+
209
+ chart_gen = CoverageChartGenerator()
210
+
211
+ try:
212
+ stats = chart_gen.load_latest_coverage_statistics()
213
+ chart_path = chart_gen.generate_coverage_charts(stats)
214
+
215
+ print(f"\n✅ Coverage chart generation complete!")
216
+ print(f"📈 Charts saved to: {chart_path}")
217
+
218
+ except FileNotFoundError as e:
219
+ print(f"❌ {e}")
220
+ print("💡 Please run latency_evaluator.py first to generate coverage statistics data")
221
+ except Exception as e:
222
+ print(f"❌ Chart generation failed: {e}")
evaluation/metric5_6_judge_evaluator_manual.md ADDED
@@ -0,0 +1,303 @@
1
+ # Metric 5-6 LLM Judge Evaluator Manual
2
+
3
+ ## Overview
4
+
5
+ The `metric5_6_llm_judge_evaluator.py` is a multi-system evaluation tool that uses Llama3-70B as a third-party judge to assess medical advice quality across different AI systems. It supports both single-system evaluation and multi-system comparison with a single LLM call for maximum consistency.
6
+
7
+ ## Metrics Evaluated
8
+
9
+ **Metric 5: Clinical Actionability**
10
+ - Scale: 1-10 (normalized to 0.0-1.0)
11
+ - Question: "Can healthcare providers immediately act on this advice?"
12
+ - Target: ≥7.0/10 for acceptable actionability
13
+
14
+ **Metric 6: Clinical Evidence Quality**
15
+ - Scale: 1-10 (normalized to 0.0-1.0)
16
+ - Question: "Is the advice evidence-based and follows medical standards?"
17
+ - Target: ≥7.5/10 for acceptable evidence quality
18
+
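+ A minimal sketch of the scale handling described above (function and variable names are illustrative, not the evaluator's actual API):
+
+ ```python
+ def normalize_judge_score(raw_score: float) -> float:
+     """Map a 1-10 judge score to the 0.0-1.0 scale used in the output JSON."""
+     return round(raw_score / 10.0, 3)
+
+ actionability = normalize_judge_score(8.5)   # 0.85 -> meets the >=7.0/10 target
+ evidence = normalize_judge_score(7.2)        # 0.72 -> below the >=7.5/10 target
+ ```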
19
+ ## System Architecture
20
+
21
+ ### Multi-System Support
22
+ The evaluator supports flexible system combinations:
23
+ - **Single System**: `rag` or `direct`
24
+ - **Two-System Comparison**: `rag,direct`
25
+ - **Future Extension**: `rag,direct,claude,gpt4` (any combination)
26
+
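+ The comma-separated argument maps naturally onto a small parsing step; a minimal sketch, assuming the script reads `sys.argv` directly (the actual implementation may differ):
+
+ ```python
+ import sys
+
+ # "rag,direct" -> ["rag", "direct"]; a single name yields a one-element list
+ systems = [s.strip().lower() for s in sys.argv[1].split(",") if s.strip()]
+ comparison_mode = "multi_system" if len(systems) > 1 else "single_system"
+ ```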
27
+ ### Judge LLM
28
+ - **Model**: Llama3-70B-Instruct via Hugging Face API
29
+ - **Strategy**: Single batch call for all evaluations
30
+ - **Temperature**: 0.1 (low for consistent evaluation)
31
+ - **Max Tokens**: 2048 (sufficient for evaluation responses)
32
+
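+ For reference, a judge call with these settings might look like the sketch below, using `huggingface_hub.InferenceClient`; the exact client wrapper and model ID used by the evaluator may differ:
+
+ ```python
+ import os
+ from huggingface_hub import InferenceClient
+
+ comparison_prompt = "..."  # the batch evaluation prompt described under "Prompt Generation" below
+
+ client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct", token=os.environ["HF_TOKEN"])
+ response = client.chat_completion(
+     messages=[{"role": "user", "content": comparison_prompt}],
+     temperature=0.1,   # low temperature for consistent scoring
+     max_tokens=2048,
+ )
+ judge_text = response.choices[0].message.content
+ ```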
33
+ ## Prerequisites
34
+
35
+ ### 1. Environment Setup
36
+ ```bash
37
+ # Ensure HF_TOKEN is set in your environment
38
+ export HF_TOKEN="your_huggingface_token"
39
+
40
+ # Or add to .env file
41
+ echo "HF_TOKEN=your_huggingface_token" >> .env
42
+ ```
43
+
44
+ ### 2. Required Data Files
45
+ Before running the judge evaluator, you must have medical outputs from your systems:
46
+
47
+ **For RAG System**:
48
+ ```bash
49
+ python latency_evaluator.py single_test_query.txt
50
+ # Generates: results/medical_outputs_YYYYMMDD_HHMMSS.json
51
+ ```
52
+
53
+ **For Direct LLM System**:
54
+ ```bash
55
+ python direct_llm_evaluator.py single_test_query.txt
56
+ # Generates: results/medical_outputs_direct_YYYYMMDD_HHMMSS.json
57
+ ```
58
+
59
+ ## Usage
60
+
61
+ ### Command Line Interface
62
+
63
+ #### Single System Evaluation
64
+ ```bash
65
+ # Evaluate RAG system only
66
+ python metric5_6_llm_judge_evaluator.py rag
67
+
68
+ # Evaluate Direct LLM system only
69
+ python metric5_6_llm_judge_evaluator.py direct
70
+ ```
71
+
72
+ #### Multi-System Comparison (Recommended)
73
+ ```bash
74
+ # Compare RAG vs Direct systems
75
+ python metric5_6_llm_judge_evaluator.py rag,direct
76
+
77
+ # Future: Compare multiple systems
78
+ python metric5_6_llm_judge_evaluator.py rag,direct,claude
79
+ ```
80
+
81
+ ### Complete Workflow Example
82
+
83
+ ```bash
84
+ # Step 1: Navigate to evaluation directory
85
+ cd /path/to/GenAI-OnCallAssistant/evaluation
86
+
87
+ # Step 2: Generate medical outputs from both systems
88
+ python latency_evaluator.py single_test_query.txt
89
+ python direct_llm_evaluator.py single_test_query.txt
90
+
91
+ # Step 3: Run comparative evaluation
92
+ python metric5_6_llm_judge_evaluator.py rag,direct
93
+ ```
94
+
95
+ ## Output Files
96
+
97
+ ### Generated Files
98
+ - **Statistics**: `results/judge_evaluation_comparison_rag_vs_direct_YYYYMMDD_HHMMSS.json`
99
+ - **Detailed Results**: Stored in the evaluator's internal results array
100
+
101
+ ### File Structure
102
+ ```json
103
+ {
104
+ "comparison_metadata": {
105
+ "systems_compared": ["rag", "direct"],
106
+ "comparison_type": "multi_system",
107
+ "timestamp": "2025-08-04T22:00:00"
108
+ },
109
+ "category_results": {
110
+ "diagnosis": {
111
+ "average_actionability": 0.850,
112
+ "average_evidence": 0.780,
113
+ "query_count": 1,
114
+ "actionability_target_met": true,
115
+ "evidence_target_met": true
116
+ }
117
+ },
118
+ "overall_results": {
119
+ "average_actionability": 0.850,
120
+ "average_evidence": 0.780,
121
+ "successful_evaluations": 2,
122
+ "total_queries": 2,
123
+ "actionability_target_met": true,
124
+ "evidence_target_met": true
125
+ }
126
+ }
127
+ ```
128
+
129
+ ## Evaluation Process
130
+
131
+ ### 1. File Discovery
132
+ The evaluator automatically finds the latest medical output files:
133
+ - **RAG**: `medical_outputs_*.json`
134
+ - **Direct**: `medical_outputs_direct_*.json`
135
+ - **Custom**: `medical_outputs_{system}_*.json`
136
+
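+ Discovery amounts to a glob over the `results/` directory plus a modification-time check; a minimal sketch consistent with the patterns above (helper name is illustrative):
+
+ ```python
+ import glob, os
+
+ def find_latest_outputs(system: str, results_dir: str = "results") -> str:
+     suffix = "" if system == "rag" else f"_{system}"
+     pattern = os.path.join(results_dir, f"medical_outputs{suffix}_*.json")
+     files = glob.glob(pattern)
+     if not files:
+         raise FileNotFoundError(f"No medical outputs files found for {system} system")
+     return max(files, key=os.path.getmtime)   # newest file wins
+ ```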
137
+ ### 2. Prompt Generation
138
+ For multi-system comparison, the evaluator creates a structured prompt:
139
+ ```
140
+ You are a medical expert evaluating and comparing AI systems...
141
+
142
+ SYSTEM 1 (RAG): Uses medical guidelines + LLM for evidence-based advice
143
+ SYSTEM 2 (Direct): Uses LLM only without external guidelines
144
+
145
+ QUERY 1 (DIAGNOSIS):
146
+ Patient Query: 60-year-old patient with hypertension history...
147
+
148
+ SYSTEM 1 Response: For a 60-year-old patient with...
149
+ SYSTEM 2 Response: Based on the symptoms described...
150
+
151
+ RESPONSE FORMAT:
152
+ Query 1 System 1: Actionability=X, Evidence=Y
153
+ Query 1 System 2: Actionability=X, Evidence=Y
154
+ ```
155
+
156
+ ### 3. LLM Judge Evaluation
157
+ - **Single API Call**: All systems evaluated in one request for consistency
158
+ - **Response Parsing**: Automatic extraction of numerical scores
159
+ - **Error Handling**: Graceful handling of parsing failures
160
+
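+ Given the fixed response format shown above, score extraction reduces to a regex pass; a minimal sketch (the evaluator's own parser may be more defensive):
+
+ ```python
+ import re
+
+ LINE_RE = re.compile(
+     r"Query\s*(\d+)\s*System\s*(\d+):\s*Actionability\s*=\s*(\d+(?:\.\d+)?)\s*,\s*Evidence\s*=\s*(\d+(?:\.\d+)?)"
+ )
+
+ def parse_judge_response(text: str):
+     """Yield (query_idx, system_idx, actionability, evidence) with scores normalized to 0-1."""
+     for q, s, act, ev in LINE_RE.findall(text):
+         yield int(q), int(s), float(act) / 10.0, float(ev) / 10.0
+
+ # parse_judge_response("Query 1 System 1: Actionability=9, Evidence=8.5")
+ # -> (1, 1, 0.9, 0.85)
+ ```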
161
+ ### 4. Results Analysis
162
+ - **System-Specific Statistics**: Individual performance metrics
163
+ - **Comparative Analysis**: Direct system-to-system comparison
164
+ - **Target Compliance**: Automatic threshold checking
165
+
166
+ ## Expected Output
167
+
168
+ ### Console Output Example
169
+ ```
170
+ 🧠 OnCall.ai LLM Judge Evaluator - Metrics 5-6 Multi-System Evaluation
171
+
172
+ 🧪 Multi-System Comparison: RAG vs DIRECT
173
+ 📊 Found rag outputs: results/medical_outputs_20250804_215917.json
174
+ 📊 Found direct outputs: results/medical_outputs_direct_20250804_220000.json
175
+ 📊 Comparing 2 systems with 1 queries each
176
+ 🎯 Metrics: 5 (Actionability) + 6 (Evidence Quality)
177
+ ⚡ Strategy: Single comparison call for maximum consistency
178
+
179
+ 🧠 Multi-system comparison: rag, direct
180
+ 📊 Evaluating 1 queries across 2 systems...
181
+ 📝 Comparison prompt created (2150 characters)
182
+ 🔄 Calling judge LLM for multi-system comparison...
183
+ ✅ Judge LLM completed comparison evaluation in 45.3s
184
+ 📄 Response length: 145 characters
185
+ 📊 RAG: 1 evaluations parsed
186
+ 📊 DIRECT: 1 evaluations parsed
187
+
188
+ 📊 === LLM JUDGE EVALUATION SUMMARY ===
189
+ Systems Compared: RAG vs DIRECT
190
+ Overall Performance:
191
+ Average Actionability: 0.850 (8.5/10)
192
+ Average Evidence Quality: 0.780 (7.8/10)
193
+ Actionability Target (≥7.0): ✅ Met
194
+ Evidence Target (≥7.5): ✅ Met
195
+
196
+ System Breakdown:
197
+ RAG: Actionability=0.900, Evidence=0.850 [1 queries]
198
+ DIRECT: Actionability=0.800, Evidence=0.710 [1 queries]
199
+
200
+ ✅ LLM judge evaluation complete!
201
+ 📊 Statistics: results/judge_evaluation_comparison_rag_vs_direct_20250804_220000.json
202
+ ⚡ Efficiency: 2 evaluations in 1 LLM call
203
+ ```
204
+
205
+ ## Key Features
206
+
207
+ ### 1. Scientific Comparison Design
208
+ - **Single Judge Call**: All systems evaluated simultaneously for consistency
209
+ - **Eliminates Temporal Bias**: Same judge, same context, same standards
210
+ - **Direct System Comparison**: Side-by-side evaluation format
211
+
212
+ ### 2. Flexible Architecture
213
+ - **Backward Compatible**: Single system evaluation still supported
214
+ - **Future Extensible**: Easy to add new systems (`claude`, `gpt4`, etc.)
215
+ - **Modular Design**: Clean separation of concerns
216
+
217
+ ### 3. Robust Error Handling
218
+ - **File Validation**: Automatic detection of missing input files
219
+ - **Query Count Verification**: Warns if systems have different query counts
220
+ - **Graceful Degradation**: Continues operation despite partial failures
221
+
222
+ ### 4. Comprehensive Reporting
223
+ - **System-Specific Metrics**: Individual performance analysis
224
+ - **Comparative Statistics**: Direct system-to-system comparison
225
+ - **Target Compliance**: Automatic benchmark checking
226
+ - **Detailed Metadata**: Full traceability of evaluation parameters
227
+
228
+ ## Troubleshooting
229
+
230
+ ### Common Issues
231
+
232
+ #### 1. Missing Input Files
233
+ ```
234
+ ❌ No medical outputs files found for rag system
235
+ 💡 Please run evaluators first:
236
+ python latency_evaluator.py single_test_query.txt
237
+ ```
238
+ **Solution**: Run the prerequisite evaluators to generate medical outputs.
239
+
240
+ #### 2. HF_TOKEN Not Set
241
+ ```
242
+ ❌ HF_TOKEN is missing from environment variables
243
+ ```
244
+ **Solution**: Set your Hugging Face token in environment or `.env` file.
245
+
246
+ #### 3. Query Count Mismatch
247
+ ```
248
+ ⚠️ Warning: Systems have different query counts: {'rag': 3, 'direct': 1}
249
+ ```
250
+ **Solution**: Ensure both systems processed the same input file.
251
+
252
+ #### 4. LLM API Timeout
253
+ ```
254
+ ❌ Multi-system evaluation failed: timeout
255
+ ```
256
+ **Solution**: Check internet connection and Hugging Face API status.
257
+
258
+ ### Debug Tips
259
+
260
+ 1. **Check File Existence**: Verify medical output files in `results/` directory
261
+ 2. **Validate JSON Format**: Ensure input files are properly formatted
262
+ 3. **Monitor API Usage**: Check Hugging Face account limits
263
+ 4. **Review Logs**: Examine detailed logging output for specific errors
264
+
265
+ ## Future Extensions
266
+
267
+ ### Phase 2: Generic Multi-System Framework
268
+ ```bash
269
+ # Configuration-driven system comparison
270
+ python metric5_6_llm_judge_evaluator.py --config comparison_config.json
271
+ ```
272
+
273
+ ### Phase 3: Unlimited System Support
274
+ ```bash
275
+ # Dynamic system registration
276
+ python metric5_6_llm_judge_evaluator.py med42,claude,gpt4,palm,llama2
277
+ ```
278
+
279
+ ### Integration with Chart Generators
280
+ ```bash
281
+ # Generate comparison visualizations
282
+ python metric5_6_llm_judge_chart_generator.py rag,direct
283
+ ```
284
+
285
+ ## Best Practices
286
+
287
+ 1. **Consistent Test Data**: Use the same query file for all systems
288
+ 2. **Sequential Execution**: Complete data collection before evaluation
289
+ 3. **Batch Processing**: Use multi-system mode for scientific comparison
290
+ 4. **Result Verification**: Review detailed statistics files for accuracy
291
+ 5. **Performance Monitoring**: Track evaluation latency and API costs
292
+
293
+ ## Scientific Validity
294
+
295
+ The multi-system comparison approach provides superior scientific validity compared to separate evaluations:
296
+
297
+ - **Eliminates Judge Variability**: Same judge evaluates all systems
298
+ - **Reduces Temporal Effects**: All evaluations in single time window
299
+ - **Ensures Consistent Standards**: Identical evaluation criteria applied
300
+ - **Enables Direct Comparison**: Side-by-side system assessment
301
+ - **Maximizes Efficiency**: Single API call vs multiple separate calls
302
+
303
+ This design makes the evaluation results more reliable for research publications and system optimization decisions.
evaluation/metric5_6_llm_judge_chart_generator.py ADDED
@@ -0,0 +1,430 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - LLM Judge Chart Generator (Metrics 5-6)
4
+ ==========================================================
5
+
6
+ Generates comprehensive comparison charts for LLM judge evaluation results.
7
+ Supports both single-system and multi-system visualization with professional layouts.
8
+
9
+ Metrics visualized:
10
+ 5. Clinical Actionability - 1-10 scale
11
+ 6. Clinical Evidence Quality - 1-10 scale
12
+
13
+ Author: YanBo Chen
14
+ Date: 2025-08-04
15
+ """
16
+
17
+ import json
18
+ import os
19
+ import sys
20
+ from typing import Dict, List, Any, Tuple
21
+ from datetime import datetime
22
+ from pathlib import Path
23
+ import glob
24
+ import numpy as np
25
+
26
+ # Visualization imports
27
+ import matplotlib.pyplot as plt
28
+ import seaborn as sns
29
+ import pandas as pd
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ class LLMJudgeChartGenerator:
34
+ """Generate professional comparison charts for LLM judge evaluation results"""
35
+
36
+ def __init__(self):
37
+ """Initialize chart generator with professional styling"""
38
+ print("📈 Initializing LLM Judge Chart Generator...")
39
+
40
+ # Set up professional chart style
41
+ plt.style.use('default')
42
+ sns.set_palette("husl")
43
+
44
+ # Professional color scheme for medical evaluation
45
+ self.colors = {
46
+ 'rag': '#2E8B57', # Sea Green - represents evidence-based
47
+ 'direct': '#4682B4', # Steel Blue - represents direct approach
48
+ 'claude': '#9370DB', # Medium Purple - future extension
49
+ 'gpt4': '#DC143C', # Crimson - future extension
50
+ 'actionability': '#FF6B6B', # Coral Red
51
+ 'evidence': '#4ECDC4', # Turquoise
52
+ 'target_line': '#FF4444', # Red for target thresholds
53
+ 'grid': '#E0E0E0' # Light gray for grid
54
+ }
55
+
56
+ print("✅ Chart Generator ready with professional medical styling")
57
+
58
+ def load_latest_statistics(self, results_dir: str = None) -> Dict[str, Any]:
59
+ """
60
+ Load the most recent judge evaluation statistics file
61
+
62
+ Args:
63
+ results_dir: Directory containing statistics files
64
+ """
65
+ if results_dir is None:
66
+ results_dir = Path(__file__).parent / "results"
67
+
68
+ # Find latest comparison statistics file
69
+ pattern = str(results_dir / "judge_evaluation_comparison_*.json")
70
+ stat_files = glob.glob(pattern)
71
+
72
+ if not stat_files:
73
+ raise FileNotFoundError(f"No judge evaluation comparison files found in {results_dir}")
74
+
75
+ # Get the most recent file
76
+ latest_file = max(stat_files, key=os.path.getmtime)
77
+
78
+ print(f"📊 Loading statistics from: {latest_file}")
79
+
80
+ with open(latest_file, 'r', encoding='utf-8') as f:
81
+ return json.load(f)
82
+
83
+ def generate_comparison_charts(self, stats: Dict[str, Any], save_path: str = None) -> str:
84
+ """
85
+ Generate comprehensive 4-panel comparison visualization
86
+
87
+ Creates professional charts showing:
88
+ 1. System comparison radar chart
89
+ 2. Grouped bar chart comparison
90
+ 3. Actionability vs Evidence scatter plot
91
+ 4. Category-wise heatmap
92
+ """
93
+ try:
94
+ # Create figure with subplots
95
+ fig, axes = plt.subplots(2, 2, figsize=(16, 12))
96
+ fig.suptitle(
97
+ 'Medical AI Systems Comparison - Clinical Quality Assessment\n'
98
+ 'Actionability (1-10): Can healthcare providers act immediately? | '
99
+ 'Evidence Quality (1-10): Is advice evidence-based?',
100
+ fontsize=14, fontweight='bold', y=0.95
101
+ )
102
+
103
+ # Extract comparison metadata
104
+ comparison_meta = stats.get('comparison_metadata', {})
105
+ systems = comparison_meta.get('systems_compared', ['rag', 'direct'])
106
+
107
+ overall_results = stats['overall_results']
108
+ category_results = stats['category_results']
109
+
110
+ # Chart 1: System Comparison Radar Chart
111
+ # Radar plots need a polar projection, so replace the Cartesian axes created by plt.subplots()
+ axes[0, 0].remove()
+ axes[0, 0] = fig.add_subplot(2, 2, 1, projection='polar')
+ self._create_radar_chart(axes[0, 0], stats, systems)
112
+
113
+ # Chart 2: Grouped Bar Chart Comparison
114
+ self._create_grouped_bar_chart(axes[0, 1], stats, systems)
115
+
116
+ # Chart 3: Actionability vs Evidence Scatter Plot
117
+ self._create_scatter_plot(axes[1, 0], stats, systems)
118
+
119
+ # Chart 4: Category-wise Performance Heatmap
120
+ self._create_heatmap(axes[1, 1], stats, systems)
121
+
122
+ # Add method annotation at bottom
123
+ method_text = (
124
+ f"Evaluation: Llama3-70B judge | Targets: Actionability ≥7.0, Evidence ≥7.5 | "
125
+ f"Systems: {', '.join([s.upper() for s in systems])} | "
126
+ f"Queries: {overall_results.get('total_queries', 'N/A')}"
127
+ )
128
+ fig.text(0.5, 0.02, method_text, ha='center', fontsize=10,
129
+ style='italic', color='gray')
130
+
131
+ # Adjust layout
132
+ plt.tight_layout()
133
+ plt.subplots_adjust(top=0.88, bottom=0.08)
134
+
135
+ # Save the chart
136
+ if save_path is None:
137
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
138
+ systems_str = "_vs_".join(systems)
139
+ save_path = f"judge_comparison_charts_{systems_str}_{timestamp}.png"
140
+
141
+ results_dir = Path(__file__).parent / "results"
142
+ results_dir.mkdir(exist_ok=True)
143
+ full_path = results_dir / save_path
144
+
145
+ plt.savefig(full_path, dpi=300, bbox_inches='tight')
146
+ plt.show()
147
+
148
+ print(f"📊 Comparison charts saved to: {full_path}")
149
+ return str(full_path)
150
+
151
+ except Exception as e:
152
+ print(f"❌ Chart generation failed: {e}")
153
+ raise
154
+
155
+ def _create_radar_chart(self, ax, stats: Dict, systems: List[str]):
156
+ """Create radar chart for multi-dimensional system comparison"""
157
+ ax.set_title('Multi-Dimensional System Comparison', fontweight='bold', pad=20)
158
+
159
+ # Prepare data for radar chart using real system-specific data
160
+ categories = ['Overall Actionability', 'Overall Evidence', 'Diagnosis', 'Treatment', 'Mixed']
161
+
162
+ # Extract real system-specific metrics
163
+ detailed_results = stats.get('detailed_system_results', {})
164
+ system_data = {}
165
+
166
+ for system in systems:
167
+ if system in detailed_results:
168
+ system_info = detailed_results[system]
169
+ system_results = system_info['results']
170
+
171
+ # Calculate category-specific performance
172
+ category_performance = {}
173
+ for result in system_results:
174
+ category = result.get('category', 'unknown').lower()
175
+ if category not in category_performance:
176
+ category_performance[category] = {'actionability': [], 'evidence': []}
177
+ category_performance[category]['actionability'].append(result['actionability_score'])
178
+ category_performance[category]['evidence'].append(result['evidence_score'])
179
+
180
+ # Build radar chart data
181
+ system_scores = [
182
+ system_info['avg_actionability'], # Overall Actionability
183
+ system_info['avg_evidence'], # Overall Evidence
184
+ # Category-specific scores (average of actionability and evidence)
185
+ (sum(category_performance.get('diagnosis', {}).get('actionability', [0])) /
186
+ len(category_performance.get('diagnosis', {}).get('actionability', [1])) +
187
+ sum(category_performance.get('diagnosis', {}).get('evidence', [0])) /
188
+ len(category_performance.get('diagnosis', {}).get('evidence', [1]))) / 2 if 'diagnosis' in category_performance else 0.5,
189
+
190
+ (sum(category_performance.get('treatment', {}).get('actionability', [0])) /
191
+ len(category_performance.get('treatment', {}).get('actionability', [1])) +
192
+ sum(category_performance.get('treatment', {}).get('evidence', [0])) /
193
+ len(category_performance.get('treatment', {}).get('evidence', [1]))) / 2 if 'treatment' in category_performance else 0.5,
194
+
195
+ (sum(category_performance.get('mixed', {}).get('actionability', [0])) /
196
+ len(category_performance.get('mixed', {}).get('actionability', [1])) +
197
+ sum(category_performance.get('mixed', {}).get('evidence', [0])) /
198
+ len(category_performance.get('mixed', {}).get('evidence', [1]))) / 2 if 'mixed' in category_performance else 0.5
199
+ ]
200
+ system_data[system] = system_scores
201
+ else:
202
+ # Fallback to overall stats if detailed results not available
203
+ overall_results = stats['overall_results']
204
+ system_data[system] = [
205
+ overall_results['average_actionability'],
206
+ overall_results['average_evidence'],
207
+ 0.7, 0.6, 0.5 # Placeholder for missing category data
208
+ ]
209
+
210
+ # Create radar chart
211
+ angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
212
+ angles += angles[:1] # Complete the circle
213
+
214
+ for system in systems:
215
+ values = system_data[system] + [system_data[system][0]] # Complete the circle
216
+ ax.plot(angles, values, 'o-', linewidth=2,
217
+ label=f'{system.upper()} System', color=self.colors.get(system, 'gray'))
218
+ ax.fill(angles, values, alpha=0.1, color=self.colors.get(system, 'gray'))
219
+
220
+ # Customize radar chart
221
+ ax.set_xticks(angles[:-1])
222
+ ax.set_xticklabels(categories, fontsize=9)
223
+ ax.set_ylim(0, 1)
224
+ ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
225
+ ax.set_yticklabels(['2.0', '4.0', '6.0', '8.0', '10.0'])
226
+ ax.grid(True, alpha=0.3)
227
+ ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.0))
228
+
229
+ # Add target threshold circle
230
+ target_circle = [0.7] * (len(categories) + 1) # 7.0 threshold
231
+ ax.plot(angles, target_circle, '--', color=self.colors['target_line'],
232
+ alpha=0.7, label='Target (7.0)')
233
+
234
+ def _create_grouped_bar_chart(self, ax, stats: Dict, systems: List[str]):
235
+ """Create grouped bar chart for direct metric comparison"""
236
+ ax.set_title('Direct Metric Comparison', fontweight='bold', pad=20)
237
+
238
+ # Prepare data using real system-specific metrics
239
+ metrics = ['Actionability', 'Evidence Quality']
240
+ detailed_results = stats.get('detailed_system_results', {})
241
+
242
+ # Extract real system-specific data
243
+ system_scores = {}
244
+ for system in systems:
245
+ if system in detailed_results:
246
+ system_info = detailed_results[system]
247
+ system_scores[system] = [
248
+ system_info['avg_actionability'],
249
+ system_info['avg_evidence']
250
+ ]
251
+ else:
252
+ # Fallback to overall results
253
+ overall_results = stats['overall_results']
254
+ system_scores[system] = [
255
+ overall_results['average_actionability'],
256
+ overall_results['average_evidence']
257
+ ]
258
+
259
+ # Create grouped bar chart
260
+ x = np.arange(len(metrics))
261
+ width = 0.35 if len(systems) == 2 else 0.25
262
+
263
+ for i, system in enumerate(systems):
264
+ offset = (i - len(systems)/2 + 0.5) * width
265
+ bars = ax.bar(x + offset, system_scores[system], width,
266
+ label=f'{system.upper()}', color=self.colors.get(system, 'gray'),
267
+ alpha=0.8)
268
+
269
+ # Add value labels on bars
270
+ for bar, value in zip(bars, system_scores[system]):
271
+ height = bar.get_height()
272
+ ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
273
+ f'{value:.3f}', ha='center', va='bottom', fontweight='bold')
274
+
275
+ # Add target threshold lines
276
+ ax.axhline(y=0.7, color=self.colors['target_line'], linestyle='--',
277
+ alpha=0.7, label='Actionability Target (7.0)')
278
+ ax.axhline(y=0.75, color=self.colors['target_line'], linestyle=':',
279
+ alpha=0.7, label='Evidence Target (7.5)')
280
+
281
+ # Customize chart
282
+ ax.set_xlabel('Evaluation Metrics')
283
+ ax.set_ylabel('Score (0-1 scale)')
284
+ ax.set_title('System Performance Comparison')
285
+ ax.set_xticks(x)
286
+ ax.set_xticklabels(metrics)
287
+ ax.legend(loc='upper left')
288
+ ax.grid(True, alpha=0.3, axis='y')
289
+ ax.set_ylim(0, 1.0)
290
+
291
+ def _create_scatter_plot(self, ax, stats: Dict, systems: List[str]):
292
+ """Create scatter plot for actionability vs evidence quality analysis"""
293
+ ax.set_title('Actionability vs Evidence Quality Analysis', fontweight='bold', pad=20)
294
+
295
+ # Extract real query-level data from detailed results
296
+ detailed_results = stats.get('detailed_system_results', {})
297
+
298
+ for system in systems:
299
+ if system in detailed_results:
300
+ system_results = detailed_results[system]['results']
301
+
302
+ # Extract real actionability and evidence scores for each query
303
+ actionability_scores = [r['actionability_score'] for r in system_results]
304
+ evidence_scores = [r['evidence_score'] for r in system_results]
305
+
306
+ ax.scatter(actionability_scores, evidence_scores,
307
+ label=f'{system.upper()}', color=self.colors.get(system, 'gray'),
308
+ alpha=0.7, s=100, edgecolors='white', linewidth=1)
309
+ else:
310
+ # Fallback: create single point from overall averages
311
+ overall_results = stats['overall_results']
312
+ ax.scatter([overall_results['average_actionability']],
313
+ [overall_results['average_evidence']],
314
+ label=f'{system.upper()}', color=self.colors.get(system, 'gray'),
315
+ alpha=0.7, s=100, edgecolors='white', linewidth=1)
316
+
317
+ # Add target threshold lines
318
+ ax.axvline(x=0.7, color=self.colors['target_line'], linestyle='--',
319
+ alpha=0.7, label='Actionability Target')
320
+ ax.axhline(y=0.75, color=self.colors['target_line'], linestyle='--',
321
+ alpha=0.7, label='Evidence Target')
322
+
323
+ # Add target zone
324
+ target_rect = Rectangle((0.7, 0.75), 0.3, 0.25, linewidth=1,
325
+ edgecolor=self.colors['target_line'], facecolor='green',
326
+ alpha=0.1, label='Target Zone')
327
+ ax.add_patch(target_rect)
328
+
329
+ # Customize chart
330
+ ax.set_xlabel('Clinical Actionability (0-1 scale)')
331
+ ax.set_ylabel('Clinical Evidence Quality (0-1 scale)')
332
+ ax.legend(loc='lower right')
333
+ ax.grid(True, alpha=0.3)
334
+ ax.set_xlim(0, 1)
335
+ ax.set_ylim(0, 1)
336
+
337
+ def _create_heatmap(self, ax, stats: Dict, systems: List[str]):
338
+ """Create heatmap for category-wise performance matrix"""
339
+ ax.set_title('Category-wise Performance Matrix', fontweight='bold', pad=20)
340
+
341
+ # Prepare data
342
+ categories = ['Diagnosis', 'Treatment', 'Mixed']
343
+ metrics = ['Actionability', 'Evidence']
344
+ category_results = stats['category_results']
345
+
346
+ # Create data matrix
347
+ data_matrix = []
348
+ row_labels = []
349
+
350
+ for system in systems:
351
+ for metric in metrics:
352
+ row_data = []
353
+ for category in categories:
354
+ cat_key = category.lower()
355
+ if cat_key in category_results and category_results[cat_key]['query_count'] > 0:
356
+ if metric == 'Actionability':
357
+ value = category_results[cat_key]['average_actionability']
358
+ else:
359
+ value = category_results[cat_key]['average_evidence']
360
+ else:
361
+ value = 0.5 # Placeholder for missing data
362
+ row_data.append(value)
363
+
364
+ data_matrix.append(row_data)
365
+ row_labels.append(f'{system.upper()}\n{metric}')
366
+
367
+ # Create heatmap
368
+ im = ax.imshow(data_matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
369
+
370
+ # Set ticks and labels
371
+ ax.set_xticks(np.arange(len(categories)))
372
+ ax.set_yticks(np.arange(len(row_labels)))
373
+ ax.set_xticklabels(categories)
374
+ ax.set_yticklabels(row_labels, fontsize=9)
375
+
376
+ # Add text annotations
377
+ for i in range(len(row_labels)):
378
+ for j in range(len(categories)):
379
+ text = ax.text(j, i, f'{data_matrix[i][j]:.3f}',
380
+ ha='center', va='center', fontweight='bold',
381
+ color='white' if data_matrix[i][j] < 0.5 else 'black')
382
+
383
+ # Add colorbar
384
+ cbar = plt.colorbar(im, ax=ax, shrink=0.6)
385
+ cbar.set_label('Performance Score (0-1)', rotation=270, labelpad=15)
386
+
387
+ ax.set_xlabel('Query Categories')
388
+ ax.set_ylabel('System × Metric')
389
+
390
+
391
+ # Independent execution interface
392
+ if __name__ == "__main__":
393
+ """Independent chart generation interface"""
394
+
395
+ print("📊 OnCall.ai LLM Judge Chart Generator - Metrics 5-6 Visualization")
396
+
397
+ # Initialize generator
398
+ generator = LLMJudgeChartGenerator()
399
+
400
+ try:
401
+ # Load latest statistics
402
+ stats = generator.load_latest_statistics()
403
+
404
+ print(f"📈 Generating comparison charts...")
405
+
406
+ # Generate comprehensive comparison charts
407
+ chart_path = generator.generate_comparison_charts(stats)
408
+
409
+ # Print summary
410
+ comparison_meta = stats.get('comparison_metadata', {})
411
+ systems = comparison_meta.get('systems_compared', ['rag', 'direct'])
412
+ overall_results = stats['overall_results']
413
+
414
+ print(f"\n📊 === CHART GENERATION SUMMARY ===")
415
+ print(f"Systems Visualized: {' vs '.join([s.upper() for s in systems])}")
416
+ print(f"Overall Actionability: {overall_results['average_actionability']:.3f}")
417
+ print(f"Overall Evidence Quality: {overall_results['average_evidence']:.3f}")
418
+ print(f"Total Queries: {overall_results['total_queries']}")
419
+ print(f"Chart Components: Radar Chart, Bar Chart, Scatter Plot, Heatmap")
420
+
421
+ print(f"\n✅ Comprehensive visualization complete!")
422
+ print(f"📊 Charts saved to: {chart_path}")
423
+ print(f"💡 Tip: Charts optimized for research presentations and publications")
424
+
425
+ except FileNotFoundError as e:
426
+ print(f"❌ {e}")
427
+ print(f"💡 Please run judge evaluation first:")
428
+ print(" python metric5_6_llm_judge_evaluator.py rag,direct")
429
+ except Exception as e:
430
+ print(f"❌ Chart generation failed: {e}")
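For readers wiring these scripts together, the sketch below shows the rough shape of the judge-statistics JSON this chart generator consumes. Key names mirror save_comparison_statistics in the judge evaluator added below; all numeric values are purely illustrative.

    # Illustrative structure of judge_evaluation_comparison_*.json (values are made up)
    example_stats = {
        "overall_results": {
            "average_actionability": 0.78,
            "average_evidence": 0.72,
            "successful_evaluations": 6,
            "total_queries": 6,
            "actionability_target_met": True,
            "evidence_target_met": False,
        },
        "category_results": {
            "diagnosis": {"average_actionability": 0.80, "average_evidence": 0.70, "query_count": 2},
            "treatment": {"average_actionability": 0.75, "average_evidence": 0.72, "query_count": 2},
            "mixed": {"average_actionability": 0.78, "average_evidence": 0.74, "query_count": 2},
        },
        "comparison_metadata": {"systems_compared": ["rag", "direct"], "comparison_type": "multi_system"},
        "detailed_system_results": {
            "rag": {"results": [], "query_count": 3, "avg_actionability": 0.82, "avg_evidence": 0.76},
            "direct": {"results": [], "query_count": 3, "avg_actionability": 0.74, "avg_evidence": 0.68},
        },
    }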
evaluation/metric5_6_llm_judge_evaluator.py ADDED
@@ -0,0 +1,643 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - LLM Judge Evaluator (Metrics 5-6)
4
+ ====================================================
5
+
6
+ Uses Llama3-70B as a third-party judge to evaluate medical advice quality.
7
+ Batch evaluation strategy: 1 call evaluates all queries for maximum efficiency.
8
+
9
+ Metrics evaluated:
10
+ 5. Clinical Actionability
11
+ 6. Clinical Evidence Quality
12
+
13
+ EVALUATION RUBRICS:
14
+
15
+ Metric 5: Clinical Actionability (1-10 scale)
16
+ 1-2 points: Almost no actionable advice; extremely abstract or empty responses.
17
+ 3-4 points: Provides some directional suggestions but too vague, lacks clear steps.
18
+ 5-6 points: Offers basic executable steps but lacks details or insufficient explanation for key aspects.
19
+ 7-8 points: Clear and complete steps that clinicians can follow, with occasional gaps needing supplementation.
20
+ 9-10 points: Extremely actionable with precise, step-by-step executable guidance; can be used "as-is" immediately.
21
+
22
+ Metric 6: Clinical Evidence Quality (1-10 scale)
23
+ 1-2 points: Almost no evidence support; cites completely irrelevant or unreliable sources.
24
+ 3-4 points: References lower quality literature or guidelines, or sources lack authority.
25
+ 5-6 points: Uses general quality literature/guidelines but lacks depth or currency.
26
+ 7-8 points: References reliable, authoritative sources (renowned journals or authoritative guidelines) with accurate explanations.
27
+ 9-10 points: Rich and high-quality evidence sources (systematic reviews, RCTs, etc.) combined with latest research; enhances recommendation credibility.
28
+
29
+ Author: YanBo Chen
30
+ Date: 2025-08-04
31
+ """
32
+
33
+ import json
34
+ import os
35
+ import sys
36
+ import time
37
+ from typing import Dict, List, Any, Tuple
38
+ from datetime import datetime
39
+ from pathlib import Path
40
+ import glob
41
+ import re
42
+
43
+ # Evaluation Rubrics as programmable constants
44
+ ACTIONABILITY_RUBRIC = {
45
+ (1, 2): "Almost no actionable advice; extremely abstract or empty responses.",
46
+ (3, 4): "Provides some directional suggestions but too vague, lacks clear steps.",
47
+ (5, 6): "Offers basic executable steps but lacks details or insufficient explanation for key aspects.",
48
+ (7, 8): "Clear and complete steps that clinicians can follow, with occasional gaps needing supplementation.",
49
+ (9, 10): "Extremely actionable with precise, step-by-step executable guidance; can be used 'as-is' immediately."
50
+ }
51
+
52
+ EVIDENCE_RUBRIC = {
53
+ (1, 2): "Almost no evidence support; cites completely irrelevant or unreliable sources.",
54
+ (3, 4): "References lower quality literature or guidelines, or sources lack authority.",
55
+ (5, 6): "Uses general quality literature/guidelines but lacks depth or currency.",
56
+ (7, 8): "References reliable, authoritative sources (renowned journals or authoritative guidelines) with accurate explanations.",
57
+ (9, 10): "Rich and high-quality evidence sources (systematic reviews, RCTs, etc.) combined with latest research; enhances recommendation credibility."
58
+ }
59
+
60
+ def print_evaluation_rubrics():
61
+ """Print detailed evaluation rubrics for reference"""
62
+ print("=" * 60)
63
+ print("CLINICAL EVALUATION RUBRICS")
64
+ print("=" * 60)
65
+
66
+ print("\n🎯 METRIC 5: Clinical Actionability (1-10 scale)")
67
+ print("-" * 50)
68
+ for score_range, description in ACTIONABILITY_RUBRIC.items():
69
+ print(f"{score_range[0]}–{score_range[1]} points: {description}")
70
+
71
+ print("\n📚 METRIC 6: Clinical Evidence Quality (1-10 scale)")
72
+ print("-" * 50)
73
+ for score_range, description in EVIDENCE_RUBRIC.items():
74
+ print(f"{score_range[0]}–{score_range[1]} points: {description}")
75
+
76
+ print("\n" + "=" * 60)
77
+ print("TARGET THRESHOLDS:")
78
+ print("• Actionability: ≥7.0 (Acceptable clinical utility)")
79
+ print("• Evidence Quality: ≥7.5 (Reliable evidence support)")
80
+ print("=" * 60)
81
+
82
+ def get_rubric_description(score: int, metric_type: str) -> str:
83
+ """Get rubric description for a given score and metric type"""
84
+ rubric = ACTIONABILITY_RUBRIC if metric_type == "actionability" else EVIDENCE_RUBRIC
85
+
86
+ for score_range, description in rubric.items():
87
+ if score_range[0] <= score <= score_range[1]:
88
+ return description
89
+
90
+ return "Score out of valid range (1-10)"
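A minimal usage sketch of the rubric helpers defined above; the returned strings are the band descriptions from the rubric dictionaries.

    # Example (illustrative): print the rubrics, then look up band descriptions
    print_evaluation_rubrics()
    print(get_rubric_description(8, "actionability"))
    # -> "Clear and complete steps that clinicians can follow, with occasional gaps needing supplementation."
    print(get_rubric_description(6, "evidence"))
    # -> "Uses general quality literature/guidelines but lacks depth or currency."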
91
+
92
+ # Add project path
93
+ current_dir = Path(__file__).parent
94
+ project_root = current_dir.parent
95
+ src_dir = project_root / "src"
96
+ sys.path.insert(0, str(src_dir))
97
+
98
+ # Import LLM client for judge evaluation
99
+ try:
100
+ from llm_clients import llm_Llama3_70B_JudgeClient
101
+ except ImportError as e:
102
+ print(f"❌ Import failed: {e}")
103
+ print("Please ensure running from project root directory")
104
+ sys.exit(1)
105
+
106
+
107
+ class LLMJudgeEvaluator:
108
+ """LLM judge evaluator using batch evaluation strategy"""
109
+
110
+ def __init__(self):
111
+ """Initialize judge LLM client"""
112
+ print("🔧 Initializing LLM Judge Evaluator...")
113
+
114
+ # Initialize Llama3-70B as judge LLM
115
+ self.judge_llm = llm_Llama3_70B_JudgeClient()
116
+
117
+ self.evaluation_results = []
118
+
119
+ print("✅ LLM Judge Evaluator initialization complete")
120
+
121
+ def load_medical_outputs(self, filepath: str) -> List[Dict[str, Any]]:
122
+ """Load medical outputs from file"""
123
+ print(f"📁 Loading medical outputs from: {filepath}")
124
+
125
+ with open(filepath, 'r', encoding='utf-8') as f:
126
+ data = json.load(f)
127
+
128
+ medical_outputs = data.get('medical_outputs', [])
129
+ print(f"📋 Loaded {len(medical_outputs)} medical outputs")
130
+
131
+ return medical_outputs
132
+
133
+ def find_medical_outputs_for_systems(self, systems: List[str]) -> Dict[str, str]:
134
+ """Find medical outputs files for multiple systems"""
135
+ results_dir = Path(__file__).parent / "results"
136
+ system_files = {}
137
+
138
+ for system in systems:
139
+ if system == "rag":
140
+ # Use more specific pattern to exclude direct files
141
+ pattern = str(results_dir / "medical_outputs_[0-9]*.json")
142
+ elif system == "direct":
143
+ pattern = str(results_dir / "medical_outputs_direct_*.json")
144
+ else:
145
+ # Future extension: support other systems
146
+ pattern = str(results_dir / f"medical_outputs_{system}_*.json")
147
+
148
+ print(f"🔍 Searching for {system} with pattern: {pattern}")
149
+ output_files = glob.glob(pattern)
150
+ print(f"🔍 Found files for {system}: {output_files}")
151
+
152
+ if not output_files:
153
+ raise FileNotFoundError(f"No medical outputs files found for {system} system")
154
+
155
+ latest_file = max(output_files, key=os.path.getmtime)
156
+ system_files[system] = latest_file
157
+ print(f"📊 Found {system} outputs: {latest_file}")
158
+
159
+ return system_files
160
+
161
+ def create_comparison_evaluation_prompt(self, systems_outputs: Dict[str, List[Dict]]) -> str:
162
+ """
163
+ Create comparison evaluation prompt for multiple systems
164
+
165
+ Args:
166
+ systems_outputs: Dict mapping system names to their medical outputs
167
+ """
168
+ system_names = list(systems_outputs.keys())
169
+
170
+ prompt_parts = [
171
+ "You are a medical expert evaluating and comparing AI systems for clinical advice quality.",
172
+ f"Please evaluate {len(system_names)} different systems using the detailed rubrics below:",
173
+ "",
174
+ "EVALUATION RUBRICS:",
175
+ "",
176
+ "METRIC 1: Clinical Actionability (1-10 scale)",
177
+ "Question: Can healthcare providers immediately act on this advice?",
178
+ "1-2 points: Almost no actionable advice; extremely abstract or empty responses.",
179
+ "3-4 points: Provides directional suggestions but too vague, lacks clear steps.",
180
+ "5-6 points: Offers basic executable steps but lacks details for key aspects.",
181
+ "7-8 points: Clear and complete steps that clinicians can follow with occasional gaps.",
182
+ "9-10 points: Extremely actionable with precise, step-by-step executable guidance.",
183
+ "",
184
+ "METRIC 2: Clinical Evidence Quality (1-10 scale)",
185
+ "Question: Is the advice evidence-based and follows medical standards?",
186
+ "1-2 points: Almost no evidence support; cites irrelevant or unreliable sources.",
187
+ "3-4 points: References lower quality literature or sources lack authority.",
188
+ "5-6 points: Uses general quality literature/guidelines but lacks depth or currency.",
189
+ "7-8 points: References reliable, authoritative sources with accurate explanations.",
190
+ "9-10 points: Rich, high-quality evidence sources combined with latest research.",
191
+ "",
192
+ "TARGET THRESHOLDS: Actionability ≥7.0, Evidence Quality ≥7.5",
193
+ ""
194
+ ]
195
+
196
+ # Add system descriptions
197
+ for i, system in enumerate(system_names, 1):
198
+ if system == "rag":
199
+ prompt_parts.append(f"SYSTEM {i} (RAG): Uses medical guidelines + LLM for evidence-based advice")
200
+ elif system == "direct":
201
+ prompt_parts.append(f"SYSTEM {i} (Direct): Uses LLM only without external guidelines")
202
+ else:
203
+ prompt_parts.append(f"SYSTEM {i} ({system.upper()}): {system} medical AI system")
204
+
205
+ prompt_parts.extend([
206
+ "",
207
+ "EVALUATION CRITERIA:",
208
+ "1. Clinical Actionability (1-10): Can healthcare providers immediately act on this advice?",
209
+ "2. Clinical Evidence Quality (1-10): Is the advice evidence-based and follows medical standards?",
210
+ "",
211
+ "QUERIES TO EVALUATE:",
212
+ ""
213
+ ])
214
+
215
+ # Get all queries (assuming all systems processed same queries)
216
+ first_system = system_names[0]
217
+ queries = systems_outputs[first_system]
218
+
219
+ # Add each query with all system responses
220
+ for i, query_data in enumerate(queries, 1):
221
+ query = query_data.get('query', '')
222
+ category = query_data.get('category', 'unknown')
223
+
224
+ prompt_parts.extend([
225
+ f"=== QUERY {i} ({category.upper()}) ===",
226
+ f"Patient Query: {query}",
227
+ ""
228
+ ])
229
+
230
+ # Add each system's response
231
+ for j, system in enumerate(system_names, 1):
232
+ system_query = systems_outputs[system][i-1] # Get corresponding query from this system
233
+ advice = system_query.get('medical_advice', '')
234
+
235
+ prompt_parts.extend([
236
+ f"SYSTEM {j} Response: {advice}",
237
+ ""
238
+ ])
239
+
240
+ prompt_parts.extend([
241
+ "RESPONSE FORMAT (provide exactly this format):",
242
+ ""
243
+ ])
244
+
245
+ # Add response format template
246
+ for i in range(1, len(queries) + 1):
247
+ for j, system in enumerate(system_names, 1):
248
+ prompt_parts.append(f"Query {i} System {j}: Actionability=X, Evidence=Y")
249
+
250
+ prompt_parts.extend([
251
+ "",
252
+ "Replace X and Y with numeric scores 1-10.",
253
+ "Provide only the scores in the exact format above.",
254
+ f"Note: System 1={system_names[0]}, System 2={system_names[1] if len(system_names) > 1 else 'N/A'}"
255
+ ])
256
+
257
+ return "\n".join(prompt_parts)
258
+
259
+ def parse_comparison_evaluation_response(self, response: str, systems_outputs: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
260
+ """Parse comparison evaluation response into results by system"""
261
+ results_by_system = {}
262
+ system_names = list(systems_outputs.keys())
263
+
264
+ # Initialize results for each system
265
+ for system in system_names:
266
+ results_by_system[system] = []
267
+
268
+ lines = response.strip().split('\n')
269
+
270
+ for line in lines:
271
+ line = line.strip()
272
+ if not line:
273
+ continue
274
+
275
+ # Parse format: "Query X System Y: Actionability=A, Evidence=B"
276
+ match = re.match(r'Query\s+(\d+)\s+System\s+(\d+):\s*Actionability\s*=\s*(\d+)\s*,\s*Evidence\s*=\s*(\d+)', line, re.IGNORECASE)
277
+
278
+ if match:
279
+ query_num = int(match.group(1)) - 1 # 0-based index
280
+ system_num = int(match.group(2)) - 1 # 0-based index
281
+ actionability_score = int(match.group(3))
282
+ evidence_score = int(match.group(4))
283
+
284
+ if system_num < len(system_names) and query_num < len(systems_outputs[system_names[system_num]]):
285
+ system_name = system_names[system_num]
286
+ output = systems_outputs[system_name][query_num]
287
+
288
+ result = {
289
+ "query": output.get('query', ''),
290
+ "category": output.get('category', 'unknown'),
291
+ "system_type": system_name,
292
+ "medical_advice": output.get('medical_advice', ''),
293
+
294
+ # Metric 5: Clinical Actionability
295
+ "actionability_score": actionability_score / 10.0,
296
+ "actionability_raw": actionability_score,
297
+
298
+ # Metric 6: Clinical Evidence Quality
299
+ "evidence_score": evidence_score / 10.0,
300
+ "evidence_raw": evidence_score,
301
+
302
+ "evaluation_success": True,
303
+ "timestamp": datetime.now().isoformat()
304
+ }
305
+
306
+ results_by_system[system_name].append(result)
307
+
308
+ return results_by_system
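As a sanity check of the parsing logic above, here is a standalone sketch of the expected judge output line and the same regex applied to it (the sample text is illustrative).

    import re

    sample_line = "Query 1 System 2: Actionability=8, Evidence=7"
    match = re.match(
        r'Query\s+(\d+)\s+System\s+(\d+):\s*Actionability\s*=\s*(\d+)\s*,\s*Evidence\s*=\s*(\d+)',
        sample_line,
        re.IGNORECASE,
    )
    assert match is not None
    query_idx = int(match.group(1)) - 1        # 0-based query index -> 0
    system_idx = int(match.group(2)) - 1       # 0-based system index -> 1
    actionability = int(match.group(3)) / 10.0  # normalized to 0-1 -> 0.8
    evidence = int(match.group(4)) / 10.0       # normalized to 0-1 -> 0.7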
309
+
310
+ def evaluate_multiple_systems(self, systems_outputs: Dict[str, List[Dict]]) -> Dict[str, List[Dict]]:
311
+ """
312
+ Evaluate multiple systems using single LLM call for comparison
313
+
314
+ Args:
315
+ systems_outputs: Dict mapping system names to their medical outputs
316
+ """
317
+ system_names = list(systems_outputs.keys())
318
+ total_queries = len(systems_outputs[system_names[0]])
319
+
320
+ print(f"🧠 Multi-system comparison: {', '.join(system_names)}")
321
+ print(f"📊 Evaluating {total_queries} queries across {len(system_names)} systems...")
322
+
323
+ try:
324
+ # Create comparison evaluation prompt
325
+ comparison_prompt = self.create_comparison_evaluation_prompt(systems_outputs)
326
+
327
+ print(f"📝 Comparison prompt created ({len(comparison_prompt)} characters)")
328
+ print(f"🔄 Calling judge LLM for multi-system comparison...")
329
+
330
+ # Single LLM call for all systems comparison
331
+ eval_start = time.time()
332
+ response = self.judge_llm.batch_evaluate(comparison_prompt)
333
+ eval_time = time.time() - eval_start
334
+
335
+ # Extract response text
336
+ response_text = response.get('content', '') if isinstance(response, dict) else str(response)
337
+
338
+ print(f"✅ Judge LLM completed comparison evaluation in {eval_time:.2f}s")
339
+ print(f"📄 Response length: {len(response_text)} characters")
340
+
341
+ # Parse comparison response
342
+ results_by_system = self.parse_comparison_evaluation_response(response_text, systems_outputs)
343
+
344
+ # Combine all results for storage
345
+ all_results = []
346
+ for system_name, system_results in results_by_system.items():
347
+ all_results.extend(system_results)
348
+ print(f"📊 {system_name.upper()}: {len(system_results)} evaluations parsed")
349
+
350
+ self.evaluation_results.extend(all_results)
351
+
352
+ return results_by_system
353
+
354
+ except Exception as e:
355
+ print(f"❌ Multi-system evaluation failed: {e}")
356
+
357
+ # Create error results for all systems
358
+ error_results = {}
359
+ for system_name, outputs in systems_outputs.items():
360
+ error_results[system_name] = []
361
+ for output in outputs:
362
+ error_result = {
363
+ "query": output.get('query', ''),
364
+ "category": output.get('category', 'unknown'),
365
+ "system_type": system_name,
366
+ "actionability_score": 0.0,
367
+ "evidence_score": 0.0,
368
+ "evaluation_success": False,
369
+ "error": str(e),
370
+ "timestamp": datetime.now().isoformat()
371
+ }
372
+ error_results[system_name].append(error_result)
373
+ self.evaluation_results.extend(error_results[system_name])
374
+
375
+ return error_results
376
+
377
+ def calculate_judge_statistics(self) -> Dict[str, Any]:
378
+ """Calculate statistics for LLM judge evaluation"""
379
+ successful_results = [r for r in self.evaluation_results if r.get('evaluation_success')]
380
+
381
+ if not successful_results:
382
+ return {
383
+ "category_results": {},
384
+ "overall_results": {
385
+ "average_actionability": 0.0,
386
+ "average_evidence": 0.0,
387
+ "successful_evaluations": 0,
388
+ "total_queries": len(self.evaluation_results)
389
+ },
390
+ "timestamp": datetime.now().isoformat()
391
+ }
392
+
393
+ # Group by category
394
+ results_by_category = {"diagnosis": [], "treatment": [], "mixed": []}
395
+
396
+ for result in successful_results:
397
+ category = result.get('category', 'unknown')
398
+ if category in results_by_category:
399
+ results_by_category[category].append(result)
400
+
401
+ # Calculate category statistics
402
+ category_stats = {}
403
+ for category, results in results_by_category.items():
404
+ if results:
405
+ actionability_scores = [r['actionability_score'] for r in results]
406
+ evidence_scores = [r['evidence_score'] for r in results]
407
+
408
+ category_stats[category] = {
409
+ "average_actionability": sum(actionability_scores) / len(actionability_scores),
410
+ "average_evidence": sum(evidence_scores) / len(evidence_scores),
411
+ "query_count": len(results),
412
+ "actionability_target_met": (sum(actionability_scores) / len(actionability_scores)) >= 0.7,
413
+ "evidence_target_met": (sum(evidence_scores) / len(evidence_scores)) >= 0.75,
414
+ "individual_actionability_scores": actionability_scores,
415
+ "individual_evidence_scores": evidence_scores
416
+ }
417
+ else:
418
+ category_stats[category] = {
419
+ "average_actionability": 0.0,
420
+ "average_evidence": 0.0,
421
+ "query_count": 0,
422
+ "actionability_target_met": False,
423
+ "evidence_target_met": False,
424
+ "individual_actionability_scores": [],
425
+ "individual_evidence_scores": []
426
+ }
427
+
428
+ # Calculate overall statistics
429
+ all_actionability = [r['actionability_score'] for r in successful_results]
430
+ all_evidence = [r['evidence_score'] for r in successful_results]
431
+
432
+ overall_stats = {
433
+ "average_actionability": sum(all_actionability) / len(all_actionability),
434
+ "average_evidence": sum(all_evidence) / len(all_evidence),
435
+ "successful_evaluations": len(successful_results),
436
+ "total_queries": len(self.evaluation_results),
437
+ "actionability_target_met": (sum(all_actionability) / len(all_actionability)) >= 0.7,
438
+ "evidence_target_met": (sum(all_evidence) / len(all_evidence)) >= 0.75
439
+ }
440
+
441
+ return {
442
+ "category_results": category_stats,
443
+ "overall_results": overall_stats,
444
+ "timestamp": datetime.now().isoformat()
445
+ }
446
+
447
+ def save_comparison_statistics(self, systems: List[str], filename: str = None) -> str:
448
+ """Save comparison evaluation statistics for multiple systems"""
449
+ stats = self.calculate_judge_statistics()
450
+
451
+ if filename is None:
452
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
453
+ systems_str = "_vs_".join(systems)
454
+ filename = f"judge_evaluation_comparison_{systems_str}_{timestamp}.json"
455
+
456
+ results_dir = Path(__file__).parent / "results"
457
+ results_dir.mkdir(exist_ok=True)
458
+ filepath = results_dir / filename
459
+
460
+ # Add comparison metadata
461
+ stats["comparison_metadata"] = {
462
+ "systems_compared": systems,
463
+ "comparison_type": "multi_system",
464
+ "timestamp": datetime.now().isoformat()
465
+ }
466
+
467
+ # Add detailed system-specific results for chart generation
468
+ stats["detailed_system_results"] = {}
469
+ for system in systems:
470
+ system_results = [r for r in self.evaluation_results if r.get('system_type') == system and r.get('evaluation_success')]
471
+ stats["detailed_system_results"][system] = {
472
+ "results": system_results,
473
+ "query_count": len(system_results),
474
+ "avg_actionability": sum(r['actionability_score'] for r in system_results) / len(system_results) if system_results else 0.0,
475
+ "avg_evidence": sum(r['evidence_score'] for r in system_results) / len(system_results) if system_results else 0.0
476
+ }
477
+
478
+ with open(filepath, 'w', encoding='utf-8') as f:
479
+ json.dump(stats, f, indent=2, ensure_ascii=False)
480
+
481
+ print(f"📊 Comparison evaluation statistics saved to: {filepath}")
482
+ return str(filepath)
483
+
484
+
485
+ # Independent execution interface
486
+ if __name__ == "__main__":
487
+ """Independent LLM judge evaluation interface with multi-system support"""
488
+
489
+ print("🧠 OnCall.ai LLM Judge Evaluator - Metrics 5-6 Multi-System Evaluation")
490
+
491
+ # Print evaluation rubrics for reference
492
+ print_evaluation_rubrics()
493
+
494
+ if len(sys.argv) < 2:
495
+ print("Usage: python metric5_6_llm_judge_evaluator.py [system1] or [system1,system2,...]")
496
+ print(" rag - Evaluate RAG system medical outputs")
497
+ print(" direct - Evaluate direct LLM medical outputs")
498
+ print(" rag,direct - Compare RAG vs Direct systems")
499
+ print(" system1,system2,system3 - Compare multiple systems")
500
+ sys.exit(1)
501
+
502
+ # Parse systems from command line
503
+ systems_input = sys.argv[1]
504
+ systems = [s.strip() for s in systems_input.split(',')]
505
+
506
+ # Initialize evaluator
507
+ evaluator = LLMJudgeEvaluator()
508
+
509
+ try:
510
+ if len(systems) == 1:
511
+ # Single system evaluation (legacy mode)
512
+ system = systems[0]
513
+ print(f"\n🧪 Single System LLM Judge Evaluation: {system.upper()}")
514
+
515
+ # Find and load medical outputs for single system
516
+ system_files = evaluator.find_medical_outputs_for_systems([system])
517
+ medical_outputs = evaluator.load_medical_outputs(system_files[system])
518
+
519
+ if not medical_outputs:
520
+ print(f"❌ No medical outputs found for {system}")
521
+ sys.exit(1)
522
+
523
+ print(f"📊 Evaluating {len(medical_outputs)} medical advice outputs")
524
+ print(f"🎯 Metrics: 5 (Actionability) + 6 (Evidence Quality)")
525
+
526
+ # Convert to multi-system format for consistency
527
+ systems_outputs = {system: medical_outputs}
528
+ results_by_system = evaluator.evaluate_multiple_systems(systems_outputs)
529
+
530
+ # Save results
531
+ stats_path = evaluator.save_comparison_statistics([system])
532
+
533
+ else:
534
+ # Multi-system comparison evaluation
535
+ print(f"\n🧪 Multi-System Comparison: {' vs '.join([s.upper() for s in systems])}")
536
+
537
+ # Find and load medical outputs for all systems
538
+ system_files = evaluator.find_medical_outputs_for_systems(systems)
539
+ systems_outputs = {}
540
+
541
+ for system in systems:
542
+ outputs = evaluator.load_medical_outputs(system_files[system])
543
+ if not outputs:
544
+ print(f"❌ No medical outputs found for {system}")
545
+ sys.exit(1)
546
+ systems_outputs[system] = outputs
547
+
548
+ # Validate all systems have same number of queries
549
+ query_counts = [len(outputs) for outputs in systems_outputs.values()]
550
+ if len(set(query_counts)) > 1:
551
+ print(f"⚠️ Warning: Systems have different query counts: {dict(zip(systems, query_counts))}")
552
+
553
+ # Validate systems processed same queries (for scientific comparison)
554
+ print(f"🔍 Validating query consistency across systems...")
555
+ if len(systems) > 1:
556
+ first_system_queries = [q['query'] for q in systems_outputs[systems[0]]]
557
+ for i, system in enumerate(systems[1:], 1):
558
+ system_queries = [q['query'] for q in systems_outputs[system]]
559
+
560
+ if first_system_queries != system_queries:
561
+ print(f"⚠️ Warning: {systems[0]} and {system} processed different queries!")
562
+ # Show first difference
563
+ for j, (q1, q2) in enumerate(zip(first_system_queries, system_queries)):
564
+ if q1 != q2:
565
+ print(f" Query {j+1} differs:")
566
+ print(f" {systems[0]}: {q1[:50]}...")
567
+ print(f" {system}: {q2[:50]}...")
568
+ break
569
+ else:
570
+ print(f"✅ {systems[0]} and {system} processed identical queries")
571
+
572
+ # Validate systems have different model types
573
+ model_types = set()
574
+ for system, outputs in systems_outputs.items():
575
+ if outputs:
576
+ model_type = outputs[0].get('model_type', 'unknown')
577
+ model_types.add(model_type)
578
+ print(f"🏷️ {system.upper()} system model_type: {model_type}")
579
+
580
+ if len(model_types) == 1:
581
+ print(f"⚠️ Warning: All systems have same model_type - this may not be a valid comparison!")
582
+ else:
583
+ print(f"✅ Systems have different model_types: {model_types}")
584
+
585
+ print(f"📊 Comparing {len(systems)} systems with {min(query_counts)} queries each")
586
+ print(f"🎯 Metrics: 5 (Actionability) + 6 (Evidence Quality)")
587
+ print(f"⚡ Strategy: Single comparison call for maximum consistency")
588
+
589
+ # Multi-system comparison evaluation
590
+ results_by_system = evaluator.evaluate_multiple_systems(systems_outputs)
591
+
592
+ # Save comparison results
593
+ stats_path = evaluator.save_comparison_statistics(systems)
594
+
595
+ # Print summary
596
+ print(f"\n📊 Generating evaluation analysis...")
597
+ stats = evaluator.calculate_judge_statistics()
598
+ overall_results = stats['overall_results']
599
+
600
+ print(f"\n📊 === LLM JUDGE EVALUATION SUMMARY ===")
601
+
602
+ if len(systems) == 1:
603
+ print(f"System: {systems[0].upper()}")
604
+ else:
605
+ print(f"Systems Compared: {' vs '.join([s.upper() for s in systems])}")
606
+
607
+ print(f"Overall Performance:")
608
+ actionability_raw = overall_results['average_actionability'] * 10
609
+ evidence_raw = overall_results['average_evidence'] * 10
610
+
611
+ print(f" Average Actionability: {overall_results['average_actionability']:.3f} ({actionability_raw:.1f}/10)")
612
+ print(f" • {get_rubric_description(int(actionability_raw), 'actionability')}")
613
+ print(f" Average Evidence Quality: {overall_results['average_evidence']:.3f} ({evidence_raw:.1f}/10)")
614
+ print(f" • {get_rubric_description(int(evidence_raw), 'evidence')}")
615
+ print(f" Actionability Target (≥7.0): {'✅ Met' if overall_results['actionability_target_met'] else '❌ Not Met'}")
616
+ print(f" Evidence Target (≥7.5): {'✅ Met' if overall_results['evidence_target_met'] else '❌ Not Met'}")
617
+
618
+ # System-specific breakdown for multi-system comparison
619
+ if len(systems) > 1:
620
+ print(f"\nSystem Breakdown:")
621
+ for system in systems:
622
+ system_results = [r for r in evaluator.evaluation_results if r.get('system_type') == system and r.get('evaluation_success')]
623
+ if system_results:
624
+ avg_action = sum(r['actionability_score'] for r in system_results) / len(system_results)
625
+ avg_evidence = sum(r['evidence_score'] for r in system_results) / len(system_results)
626
+ print(f" {system.upper()}: Actionability={avg_action:.3f}, Evidence={avg_evidence:.3f} [{len(system_results)} queries]")
627
+
628
+ print(f"\n✅ LLM judge evaluation complete!")
629
+ print(f"📊 Statistics: {stats_path}")
630
+ print(f"⚡ Efficiency: {overall_results['total_queries']} evaluations in 1 LLM call")
631
+
632
+ except FileNotFoundError as e:
633
+ print(f"❌ {e}")
634
+ print(f"💡 Please run evaluators first:")
635
+ for system in systems:
636
+ if system == "rag":
637
+ print(" python latency_evaluator.py single_test_query.txt")
638
+ elif system == "direct":
639
+ print(" python direct_llm_evaluator.py single_test_query.txt")
640
+ else:
641
+ print(f" python {system}_evaluator.py single_test_query.txt")
642
+ except Exception as e:
643
+ print(f"❌ Judge evaluation failed: {e}")
evaluation/metric7_8_precision_MRR.py ADDED
@@ -0,0 +1,402 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Precision & MRR Analyzer (Metrics 7-8)
4
+ ========================================================
5
+
6
+ Specialized analyzer for calculating Precision@K and Mean Reciprocal Rank (MRR)
7
+ using data collected from latency_evaluator.py comprehensive evaluation.
8
+
9
+ IMPORTANT CHANGES - Angular Distance & Relevance Calculation:
10
+ - DISTANCE METRIC: Uses Angular Distance from Annoy index (range: 0.0-1.0, smaller = more relevant)
11
+ - RELEVANCE CONVERSION: relevance = 1.0 - (angular_distance²) / 2.0 (mathematical correct formula)
12
+ - THRESHOLD ALIGNMENT: Aligned with Metric 3 relevance calculation standards
13
+ - DISPLAY UPDATE: Changed from "Relevance: X" to "Angular Distance: X" for clarity
14
+
15
+ METRICS CALCULATED:
16
+ 7. Precision@K (檢索精確率) - Proportion of relevant results in top-K retrieval
17
+ 8. Mean Reciprocal Rank (平均倒數排名) - Average reciprocal rank of first relevant result
18
+
19
+ DESIGN PRINCIPLE:
20
+ - Reuses comprehensive_details_*.json from latency_evaluator.py
21
+ - Implements adaptive threshold based on query complexity
22
+ - Query complexity determined by actual matched emergency keywords count
23
+ - No additional LLM calls required
24
+
25
+ Author: YanBo Chen
26
+ Date: 2025-08-04
27
+ Updated: 2025-08-04 (Angular Distance alignment)
28
+ """
29
+
30
+ import json
31
+ import os
32
+ import sys
33
+ from typing import Dict, List, Any, Set
34
+ from datetime import datetime
35
+ from pathlib import Path
36
+ import re
37
+ import statistics
38
+
39
+ # Relevance threshold constants for adaptive query complexity handling
40
+ COMPLEX_QUERY_RELEVANCE_THRESHOLD = 0.65 # For queries with multiple emergency keywords
41
+ SIMPLE_QUERY_RELEVANCE_THRESHOLD = 0.75 # For straightforward diagnostic queries
42
+
43
+ class PrecisionMRRAnalyzer:
44
+ """Specialized analyzer for metrics 7-8 using existing comprehensive evaluation data"""
45
+
46
+ def __init__(self):
47
+ """Initialize analyzer"""
48
+ print("🔧 Initializing Precision & MRR Analyzer...")
49
+ self.analysis_results = []
50
+ print("✅ Analyzer initialization complete")
51
+
52
+ def load_comprehensive_data(self, filepath: str) -> List[Dict]:
53
+ """
54
+ Load comprehensive evaluation data from latency_evaluator.py output
55
+
56
+ Args:
57
+ filepath: Path to comprehensive_details_*.json file
58
+
59
+ Returns:
60
+ List of comprehensive evaluation results
61
+ """
62
+ try:
63
+ with open(filepath, 'r', encoding='utf-8') as f:
64
+ data = json.load(f)
65
+
66
+ comprehensive_results = data.get('comprehensive_results', [])
67
+
68
+ print(f"📁 Loaded {len(comprehensive_results)} comprehensive evaluation results")
69
+ print(f"📊 Ready for precision/MRR analysis: {sum(1 for r in comprehensive_results if r.get('precision_mrr_ready'))}")
70
+
71
+ return comprehensive_results
72
+
73
+ except Exception as e:
74
+ print(f"❌ Failed to load comprehensive data: {e}")
75
+ return []
76
+
77
+ def _is_complex_query(self, query: str, processed_results: List[Dict]) -> bool:
78
+ """
79
+ Determine query complexity based on actual matched emergency keywords
80
+
81
+ Args:
82
+ query: Original query text
83
+ processed_results: Retrieval results with matched keywords
84
+
85
+ Returns:
86
+ True if query is complex (should use lenient threshold)
87
+ """
88
+ # Collect unique emergency keywords actually found in retrieval results
89
+ unique_emergency_keywords = set()
90
+
91
+ for result in processed_results:
92
+ if result.get('type') == 'emergency':
93
+ matched_keywords = result.get('matched', '')
94
+ if matched_keywords:
95
+ keywords = [kw.strip() for kw in matched_keywords.split('|') if kw.strip()]
96
+ unique_emergency_keywords.update(keywords)
97
+
98
+ keyword_count = len(unique_emergency_keywords)
99
+
100
+ # Business logic: 4+ different emergency keywords indicate complex case
101
+ is_complex = keyword_count >= 4
102
+
103
+ print(f" 🧠 Query complexity: {'Complex' if is_complex else 'Simple'} ({keyword_count} emergency keywords)")
104
+ print(f" 🔑 Found keywords: {', '.join(list(unique_emergency_keywords)[:5])}")
105
+
106
+ return is_complex
107
+
108
+ def calculate_precision_mrr_single(self, query_data: Dict) -> Dict[str, Any]:
109
+ """
110
+ Calculate precision@K and MRR for single query
111
+
112
+ Args:
113
+ query_data: Single query's comprehensive evaluation result
114
+
115
+ Returns:
116
+ Precision and MRR metrics for this query
117
+ """
118
+ query = query_data['query']
119
+ category = query_data['category']
120
+
121
+ # Extract processed results from pipeline data
122
+ pipeline_data = query_data.get('pipeline_data', {})
123
+ retrieval_results = pipeline_data.get('retrieval_results', {})
124
+ processed_results = retrieval_results.get('processed_results', [])
125
+
126
+ print(f"🔍 Analyzing precision/MRR for: {query[:50]}...")
127
+ print(f"📋 Category: {category}, Results: {len(processed_results)}")
128
+
129
+ if not processed_results:
130
+ return self._create_empty_precision_mrr_result(query, category)
131
+
132
+ # Step 1: Determine query complexity
133
+ is_complex = self._is_complex_query(query, processed_results)
134
+
135
+ # Step 2: Choose adaptive threshold (aligned with Metric 3 relevance standards)
136
+ threshold = COMPLEX_QUERY_RELEVANCE_THRESHOLD if is_complex else SIMPLE_QUERY_RELEVANCE_THRESHOLD # Updated thresholds for complex/simple queries
137
+
138
+ print(f" 🎯 Using relevance threshold: {threshold} ({'lenient' if is_complex else 'strict'})")
139
+
140
+ # Step 3: Calculate relevance scores using correct angular distance formula
141
+ relevance_scores = []
142
+ for result in processed_results:
143
+ distance = result.get('distance', 1.0)
144
+ relevance = 1.0 - (distance**2) / 2.0 # Correct mathematical conversion
145
+ relevance_scores.append(relevance)
146
+
147
+ # Step 4: Calculate Precision@K (aligned with Metric 3 thresholds)
148
+ relevant_count = sum(1 for score in relevance_scores if score >= threshold)
149
+ precision_at_k = relevant_count / len(processed_results)
150
+
151
+ # Step 5: Calculate MRR
152
+ first_relevant_rank = None
153
+ for i, score in enumerate(relevance_scores, 1):
154
+ if score >= threshold:
155
+ first_relevant_rank = i
156
+ break
157
+
158
+ mrr_score = (1.0 / first_relevant_rank) if first_relevant_rank else 0.0
159
+
160
+ # Detailed analysis
161
+ result = {
162
+ "query": query,
163
+ "category": category,
164
+ "query_complexity": "complex" if is_complex else "simple",
165
+ "threshold_used": threshold,
166
+
167
+ # Metric 7: Precision@K
168
+ "precision_at_k": precision_at_k,
169
+ "relevant_count": relevant_count,
170
+ "total_results": len(processed_results),
171
+
172
+ # Metric 8: MRR
173
+ "mrr_score": mrr_score,
174
+ "first_relevant_rank": first_relevant_rank,
175
+
176
+ # Supporting data
177
+ "relevance_scores": relevance_scores,
178
+ "avg_relevance": sum(relevance_scores) / len(relevance_scores),
179
+ "max_relevance": max(relevance_scores),
180
+ "min_relevance": min(relevance_scores),
181
+
182
+ "timestamp": datetime.now().isoformat()
183
+ }
184
+
185
+ print(f" 📊 Precision@{len(processed_results)}: {precision_at_k:.3f} ({relevant_count}/{len(processed_results)} relevant)")
186
+ print(f" 📊 MRR: {mrr_score:.3f} (first relevant at rank {first_relevant_rank})")
187
+
188
+ return result
189
+
190
+ def _create_empty_precision_mrr_result(self, query: str, category: str) -> Dict[str, Any]:
191
+ """Create empty result for failed queries"""
192
+ return {
193
+ "query": query,
194
+ "category": category,
195
+ "query_complexity": "unknown",
196
+ "threshold_used": 0.0,
197
+ "precision_at_k": 0.0,
198
+ "relevant_count": 0,
199
+ "total_results": 0,
200
+ "mrr_score": 0.0,
201
+ "first_relevant_rank": None,
202
+ "relevance_scores": [],
203
+ "timestamp": datetime.now().isoformat()
204
+ }
205
+
206
+ def analyze_all_queries(self, comprehensive_results: List[Dict]) -> List[Dict]:
207
+ """
208
+ Analyze precision/MRR for all queries in comprehensive evaluation
209
+
210
+ Args:
211
+ comprehensive_results: Results from latency_evaluator.py
212
+
213
+ Returns:
214
+ List of precision/MRR analysis results
215
+ """
216
+ print(f"\n📊 Analyzing Precision@K and MRR for {len(comprehensive_results)} queries...")
217
+
218
+ analysis_results = []
219
+
220
+ for i, query_data in enumerate(comprehensive_results):
221
+ if not query_data.get('precision_mrr_ready'):
222
+ print(f"⏭️ Skipping query {i+1}: Not ready for precision/MRR analysis")
223
+ continue
224
+
225
+ if not query_data.get('overall_success'):
226
+ print(f"⏭️ Skipping query {i+1}: Pipeline failed")
227
+ analysis_results.append(self._create_empty_precision_mrr_result(
228
+ query_data['query'],
229
+ query_data['category']
230
+ ))
231
+ continue
232
+
233
+ # Analyze this query
234
+ result = self.calculate_precision_mrr_single(query_data)
235
+ analysis_results.append(result)
236
+
237
+ print("") # Spacing between queries
238
+
239
+ self.analysis_results = analysis_results
240
+ return analysis_results
241
+
242
+ def calculate_statistics(self) -> Dict[str, Any]:
243
+ """Calculate comprehensive statistics for metrics 7-8"""
244
+
245
+ if not self.analysis_results:
246
+ return {"error": "No analysis results available"}
247
+
248
+ # Separate by complexity and category
249
+ stats = {
250
+ "overall_statistics": {},
251
+ "by_complexity": {"simple": {}, "complex": {}},
252
+ "by_category": {"diagnosis": {}, "treatment": {}, "mixed": {}},
253
+ "timestamp": datetime.now().isoformat()
254
+ }
255
+
256
+ # Overall statistics
257
+ all_precision = [r['precision_at_k'] for r in self.analysis_results]
258
+ all_mrr = [r['mrr_score'] for r in self.analysis_results]
259
+
260
+ stats["overall_statistics"] = {
261
+ "total_queries": len(self.analysis_results),
262
+ "avg_precision": statistics.mean(all_precision),
263
+ "avg_mrr": statistics.mean(all_mrr),
264
+ "precision_std": statistics.stdev(all_precision) if len(all_precision) > 1 else 0.0,
265
+ "mrr_std": statistics.stdev(all_mrr) if len(all_mrr) > 1 else 0.0
266
+ }
267
+
268
+ # By complexity
269
+ for complexity in ["simple", "complex"]:
270
+ complexity_results = [r for r in self.analysis_results if r['query_complexity'] == complexity]
271
+ if complexity_results:
272
+ precision_scores = [r['precision_at_k'] for r in complexity_results]
273
+ mrr_scores = [r['mrr_score'] for r in complexity_results]
274
+
275
+ stats["by_complexity"][complexity] = {
276
+ "query_count": len(complexity_results),
277
+ "avg_precision": statistics.mean(precision_scores),
278
+ "avg_mrr": statistics.mean(mrr_scores),
279
+ "avg_threshold": statistics.mean([r['threshold_used'] for r in complexity_results])
280
+ }
281
+
282
+ # By category
283
+ for category in ["diagnosis", "treatment", "mixed"]:
284
+ category_results = [r for r in self.analysis_results if r['category'] == category]
285
+ if category_results:
286
+ precision_scores = [r['precision_at_k'] for r in category_results]
287
+ mrr_scores = [r['mrr_score'] for r in category_results]
288
+
289
+ stats["by_category"][category] = {
290
+ "query_count": len(category_results),
291
+ "avg_precision": statistics.mean(precision_scores),
292
+ "avg_mrr": statistics.mean(mrr_scores)
293
+ }
294
+
295
+ return stats
296
+
297
+ def save_results(self, filename: str = None) -> str:
298
+ """Save precision/MRR analysis results"""
299
+ if filename is None:
300
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
301
+ filename = f"precision_mrr_analysis_{timestamp}.json"
302
+
303
+ # Ensure results directory exists
304
+ results_dir = Path(__file__).parent / "results"
305
+ results_dir.mkdir(exist_ok=True)
306
+
307
+ filepath = results_dir / filename
308
+
309
+ # Create output data
310
+ output_data = {
311
+ "analysis_metadata": {
312
+ "total_queries": len(self.analysis_results),
313
+ "analysis_type": "precision_mrr_metrics_7_8",
314
+ "timestamp": datetime.now().isoformat(),
315
+ "adaptive_threshold": True
316
+ },
317
+ "detailed_results": self.analysis_results,
318
+ "statistics": self.calculate_statistics()
319
+ }
320
+
321
+ with open(filepath, 'w', encoding='utf-8') as f:
322
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
323
+
324
+ print(f"📊 Precision/MRR analysis saved to: {filepath}")
325
+ return str(filepath)
326
+
327
+
328
+ # Independent execution interface
329
+ if __name__ == "__main__":
330
+ """Independent precision/MRR analysis interface"""
331
+
332
+ print("📊 OnCall.ai Precision & MRR Analyzer - Metrics 7-8")
333
+
334
+ if len(sys.argv) > 1:
335
+ comprehensive_file = sys.argv[1]
336
+ else:
337
+ # Look for latest comprehensive_details file
338
+ results_dir = Path(__file__).parent / "results"
339
+ if results_dir.exists():
340
+ comprehensive_files = list(results_dir.glob("comprehensive_details_*.json"))
341
+ if comprehensive_files:
342
+ comprehensive_file = str(sorted(comprehensive_files)[-1]) # Latest file
343
+ print(f"📁 Using latest comprehensive file: {comprehensive_file}")
344
+ else:
345
+ print("❌ No comprehensive_details_*.json files found")
346
+ print("Please run latency_evaluator.py first to generate comprehensive data")
347
+ sys.exit(1)
348
+ else:
349
+ print("❌ Results directory not found")
350
+ sys.exit(1)
351
+
352
+ if not os.path.exists(comprehensive_file):
353
+ print(f"❌ Comprehensive file not found: {comprehensive_file}")
354
+ print("Usage: python precision_MRR.py [comprehensive_details_file.json]")
355
+ sys.exit(1)
356
+
357
+ # Initialize analyzer
358
+ analyzer = PrecisionMRRAnalyzer()
359
+
360
+ # Load comprehensive data from latency_evaluator.py
361
+ comprehensive_results = analyzer.load_comprehensive_data(comprehensive_file)
362
+
363
+ if not comprehensive_results:
364
+ print("❌ No comprehensive data loaded")
365
+ sys.exit(1)
366
+
367
+ # Analyze precision/MRR for all queries
368
+ analysis_results = analyzer.analyze_all_queries(comprehensive_results)
369
+
370
+ # Calculate and display statistics
371
+ statistics_result = analyzer.calculate_statistics()
372
+
373
+ print(f"\n📊 === PRECISION & MRR ANALYSIS SUMMARY ===")
374
+
375
+ overall_stats = statistics_result['overall_statistics']
376
+ print(f"\nOVERALL METRICS:")
377
+ print(f" Precision@K: {overall_stats['avg_precision']:.3f} (±{overall_stats['precision_std']:.3f})")
378
+ print(f" MRR: {overall_stats['avg_mrr']:.3f} (±{overall_stats['mrr_std']:.3f})")
379
+ print(f" Total Queries: {overall_stats['total_queries']}")
380
+
381
+ # Complexity-based statistics
382
+ complexity_stats = statistics_result['by_complexity']
383
+ print(f"\nBY COMPLEXITY:")
384
+ for complexity, stats in complexity_stats.items():
385
+ if stats:
386
+ print(f" {complexity.title()}: Precision={stats['avg_precision']:.3f}, MRR={stats['avg_mrr']:.3f} "
387
+ f"(threshold={stats['avg_threshold']:.2f}, n={stats['query_count']})")
388
+
389
+ # Category-based statistics
390
+ category_stats = statistics_result['by_category']
391
+ print(f"\nBY CATEGORY:")
392
+ for category, stats in category_stats.items():
393
+ if stats:
394
+ print(f" {category.title()}: Precision={stats['avg_precision']:.3f}, MRR={stats['avg_mrr']:.3f} "
395
+ f"(n={stats['query_count']})")
396
+
397
+ # Save results
398
+ saved_path = analyzer.save_results()
399
+
400
+ print(f"\n✅ Precision & MRR analysis complete!")
401
+ print(f"📁 Results saved to: {saved_path}")
402
+ print(f"\n💡 Next step: Create precision_mrr_chart_generator.py for visualization")
evaluation/metric7_8_precision_mrr_chart_generator.py ADDED
@@ -0,0 +1,586 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Precision & MRR Chart Generator (Metrics 7-8)
4
+ ===============================================================
5
+
6
+ Generates comprehensive Precision@K and MRR analysis charts from saved analysis results.
7
+ Reads JSON files produced by metric7_8_precision_MRR.py and creates visualizations.
8
+
9
+ Charts generated:
10
+ 1. Precision@K comparison by category and complexity
11
+ 2. MRR comparison by category and complexity
12
+ 3. Combined metrics heatmap
13
+ 4. Threshold impact analysis
14
+ 5. Detailed statistics tables
15
+
16
+ No LLM calls - pure data visualization.
17
+
18
+ Author: YanBo Chen
19
+ Date: 2025-08-04
20
+ """
21
+
22
+ import json
23
+ import os
24
+ import sys
25
+ from typing import Dict, List, Any
26
+ from datetime import datetime
27
+ from pathlib import Path
28
+ import glob
29
+
30
+ # Visualization imports
31
+ import matplotlib.pyplot as plt
32
+ import seaborn as sns
33
+ import pandas as pd
34
+ import numpy as np
35
+
36
+
37
+ class PrecisionMRRChartGenerator:
38
+ """Generate charts from precision/MRR analysis results - no LLM dependency"""
39
+
40
+ def __init__(self):
41
+ """Initialize chart generator"""
42
+ print("📈 Initializing Precision & MRR Chart Generator...")
43
+
44
+ # Set up professional chart style
45
+ plt.style.use('default')
46
+ sns.set_palette("husl")
47
+
48
+ print("✅ Chart Generator ready")
49
+
50
+ def load_latest_analysis(self, results_dir: str = None) -> Dict[str, Any]:
51
+ """
52
+ Load the most recent precision/MRR analysis file
53
+
54
+ Args:
55
+ results_dir: Directory containing analysis files
56
+ """
57
+ if results_dir is None:
58
+ results_dir = Path(__file__).parent / "results"
59
+
60
+ analysis_files = glob.glob(str(results_dir / "precision_mrr_analysis_*.json"))
61
+
62
+ if not analysis_files:
63
+ raise FileNotFoundError("No precision_mrr_analysis_*.json files found. Run metric7_8_precision_MRR.py first.")
64
+
65
+ latest_file = max(analysis_files, key=os.path.getctime)
66
+ print(f"📁 Loading latest analysis: {latest_file}")
67
+
68
+ with open(latest_file, 'r', encoding='utf-8') as f:
69
+ return json.load(f)
70
+
71
+ def create_precision_comparison_chart(self, analysis_data: Dict, save_path: str = None) -> str:
72
+ """Create Precision@K comparison chart"""
73
+
74
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
75
+
76
+ # Chart 1: Precision by Category
77
+ category_stats = analysis_data['statistics']['by_category']
78
+ categories = []
79
+ precisions = []
80
+
81
+ for category, stats in category_stats.items():
82
+ if stats:
83
+ categories.append(category.title())
84
+ precisions.append(stats['avg_precision'])
85
+
86
+ if categories:
87
+ bars1 = ax1.bar(categories, precisions, alpha=0.8, color=['#1f77b4', '#ff7f0e', '#d62728'])
88
+ ax1.set_title('Precision@K by Query Category', fontweight='bold')
89
+ ax1.set_ylabel('Precision@K')
90
+ ax1.set_xlabel('Query Category')
91
+ ax1.set_ylim(0, 1.0)
92
+ ax1.grid(True, alpha=0.3)
93
+
94
+ # Add value labels
95
+ for bar, precision in zip(bars1, precisions):
96
+ height = bar.get_height()
97
+ ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
98
+ f'{precision:.3f}', ha='center', va='bottom', fontweight='bold')
99
+
100
+ # Chart 2: Precision by Complexity
101
+ complexity_stats = analysis_data['statistics']['by_complexity']
102
+ complexities = []
103
+ comp_precisions = []
104
+
105
+ for complexity, stats in complexity_stats.items():
106
+ if stats:
107
+ complexities.append(complexity.title())
108
+ comp_precisions.append(stats['avg_precision'])
109
+
110
+ if complexities:
111
+ bars2 = ax2.bar(complexities, comp_precisions, alpha=0.8, color=['#2ca02c', '#d62728'])
112
+ ax2.set_title('Precision@K by Query Complexity', fontweight='bold')
113
+ ax2.set_ylabel('Precision@K')
114
+ ax2.set_xlabel('Query Complexity')
115
+ ax2.set_ylim(0, 1.0)
116
+ ax2.grid(True, alpha=0.3)
117
+
118
+ # Add value labels and threshold info
119
+ for bar, precision, complexity in zip(bars2, comp_precisions, complexities):
120
+ height = bar.get_height()
121
+ threshold = 0.15 if complexity.lower() == 'complex' else 0.25
122
+ ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
123
+ f'{precision:.3f}\n(T={threshold})', ha='center', va='bottom',
124
+ fontweight='bold', fontsize=9)
125
+
126
+ plt.tight_layout()
127
+
128
+ # Save chart
129
+ if save_path is None:
130
+ save_path = Path(__file__).parent / "charts" / f"precision_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
131
+
132
+ save_path = Path(save_path)
133
+ save_path.parent.mkdir(parents=True, exist_ok=True)
134
+ plt.savefig(save_path, dpi=300, bbox_inches='tight')
135
+ plt.close()
136
+
137
+ print(f"📊 Precision comparison chart saved: {save_path}")
138
+ return str(save_path)
139
+
140
+ def create_mrr_comparison_chart(self, analysis_data: Dict, save_path: str = None) -> str:
141
+ """Create MRR comparison chart"""
142
+
143
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
144
+
145
+ # Chart 1: MRR by Category
146
+ category_stats = analysis_data['statistics']['by_category']
147
+ categories = []
148
+ mrr_scores = []
149
+
150
+ for category, stats in category_stats.items():
151
+ if stats:
152
+ categories.append(category.title())
153
+ mrr_scores.append(stats['avg_mrr'])
154
+
155
+ if categories:
156
+ bars1 = ax1.bar(categories, mrr_scores, alpha=0.8, color=['#9467bd', '#8c564b', '#e377c2'])
157
+ ax1.set_title('Mean Reciprocal Rank by Query Category', fontweight='bold')
158
+ ax1.set_ylabel('MRR Score')
159
+ ax1.set_xlabel('Query Category')
160
+ ax1.set_ylim(0, 1.0)
161
+ ax1.grid(True, alpha=0.3)
162
+
163
+ # Add value labels
164
+ for bar, mrr in zip(bars1, mrr_scores):
165
+ height = bar.get_height()
166
+ ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
167
+ f'{mrr:.3f}', ha='center', va='bottom', fontweight='bold')
168
+
169
+ # Chart 2: MRR by Complexity
170
+ complexity_stats = analysis_data['statistics']['by_complexity']
171
+ complexities = []
172
+ comp_mrr = []
173
+
174
+ for complexity, stats in complexity_stats.items():
175
+ if stats:
176
+ complexities.append(complexity.title())
177
+ comp_mrr.append(stats['avg_mrr'])
178
+
179
+ if complexities:
180
+ bars2 = ax2.bar(complexities, comp_mrr, alpha=0.8, color=['#17becf', '#bcbd22'])
181
+ ax2.set_title('MRR by Query Complexity', fontweight='bold')
182
+ ax2.set_ylabel('MRR Score')
183
+ ax2.set_xlabel('Query Complexity')
184
+ ax2.set_ylim(0, 1.0)
185
+ ax2.grid(True, alpha=0.3)
186
+
187
+ # Add value labels
188
+ for bar, mrr in zip(bars2, comp_mrr):
189
+ height = bar.get_height()
190
+ ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
191
+ f'{mrr:.3f}', ha='center', va='bottom', fontweight='bold')
192
+
193
+ plt.tight_layout()
194
+
195
+ # Save chart
196
+ if save_path is None:
197
+ save_path = Path(__file__).parent / "charts" / f"mrr_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
198
+
199
+ save_path = Path(save_path)
200
+ save_path.parent.mkdir(parents=True, exist_ok=True)
201
+ plt.savefig(save_path, dpi=300, bbox_inches='tight')
202
+ plt.close()
203
+
204
+ print(f"📊 MRR comparison chart saved: {save_path}")
205
+ return str(save_path)
206
+
207
+ def create_combined_metrics_heatmap(self, analysis_data: Dict, save_path: str = None) -> str:
208
+ """Create combined precision/MRR heatmap"""
209
+
210
+ # Prepare data for heatmap
211
+ detailed_results = analysis_data.get('detailed_results', [])
212
+
213
+ if not detailed_results:
214
+ print("⚠️ No detailed results for heatmap")
215
+ return ""
216
+
217
+ # Create DataFrame for heatmap
218
+ heatmap_data = []
219
+ for result in detailed_results:
220
+ heatmap_data.append({
221
+ 'Category': result['category'].title(),
222
+ 'Complexity': result['query_complexity'].title(),
223
+ 'Precision@K': result['precision_at_k'],
224
+ 'MRR': result['mrr_score'],
225
+ 'Threshold': result['threshold_used']
226
+ })
227
+
228
+ df = pd.DataFrame(heatmap_data)
229
+
230
+ # Create pivot table for heatmap
231
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
232
+
233
+ # Precision heatmap
234
+ precision_pivot = df.pivot_table(values='Precision@K', index='Category', columns='Complexity', aggfunc='mean')
235
+ sns.heatmap(precision_pivot, annot=True, fmt='.3f', cmap='YlOrRd', ax=ax1,
236
+ cbar_kws={'label': 'Precision@K'}, vmin=0, vmax=1)
237
+ ax1.set_title('Precision@K Heatmap\n(Category vs Complexity)', fontweight='bold')
238
+
239
+ # MRR heatmap
240
+ mrr_pivot = df.pivot_table(values='MRR', index='Category', columns='Complexity', aggfunc='mean')
241
+ sns.heatmap(mrr_pivot, annot=True, fmt='.3f', cmap='YlGnBu', ax=ax2,
242
+ cbar_kws={'label': 'MRR Score'}, vmin=0, vmax=1)
243
+ ax2.set_title('MRR Heatmap\n(Category vs Complexity)', fontweight='bold')
244
+
245
+ plt.tight_layout()
246
+
247
+ # Save chart
248
+ if save_path is None:
249
+ save_path = Path(__file__).parent / "charts" / f"precision_mrr_heatmap_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
250
+
251
+ save_path = Path(save_path)
252
+ save_path.parent.mkdir(parents=True, exist_ok=True)
253
+ plt.savefig(save_path, dpi=300, bbox_inches='tight')
254
+ plt.close()
255
+
256
+ print(f"📊 Combined metrics heatmap saved: {save_path}")
257
+ return str(save_path)
258
+
259
+ def create_threshold_impact_chart(self, analysis_data: Dict, save_path: str = None) -> str:
260
+ """Create threshold impact analysis chart"""
261
+
262
+ detailed_results = analysis_data.get('detailed_results', [])
263
+
264
+ if not detailed_results:
265
+ print("⚠️ No detailed results for threshold analysis")
266
+ return ""
267
+
268
+ # Group by complexity and calculate average relevance
269
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
270
+
271
+ # Prepare data
272
+ simple_queries = [r for r in detailed_results if r['query_complexity'] == 'simple']
273
+ complex_queries = [r for r in detailed_results if r['query_complexity'] == 'complex']
274
+
275
+ # Chart 1: Relevance distribution for different complexities
276
+ if simple_queries:
277
+ simple_relevances = []
278
+ for query in simple_queries:
279
+ simple_relevances.extend(query.get('relevance_scores', []))
280
+
281
+ ax1.hist(simple_relevances, bins=10, alpha=0.7, label='Simple (T=0.25)', color='#2ca02c', density=True)
282
+ ax1.axvline(x=0.25, color='#2ca02c', linestyle='--', linewidth=2, label='Simple Threshold')
283
+
284
+ if complex_queries:
285
+ complex_relevances = []
286
+ for query in complex_queries:
287
+ complex_relevances.extend(query.get('relevance_scores', []))
288
+
289
+ ax1.hist(complex_relevances, bins=10, alpha=0.7, label='Complex (T=0.15)', color='#d62728', density=True)
290
+ ax1.axvline(x=0.15, color='#d62728', linestyle='--', linewidth=2, label='Complex Threshold')
291
+
292
+ ax1.set_title('Relevance Score Distribution\nby Query Complexity', fontweight='bold')
293
+ ax1.set_xlabel('Relevance Score')
294
+ ax1.set_ylabel('Density')
295
+ ax1.legend()
296
+ ax1.grid(True, alpha=0.3)
297
+
298
+ # Chart 2: Metrics comparison
299
+ complexity_stats = analysis_data['statistics']['by_complexity']
300
+
301
+ complexities = []
302
+ precisions = []
303
+ mrrs = []
304
+ thresholds = []
305
+
306
+ for complexity, stats in complexity_stats.items():
307
+ if stats:
308
+ complexities.append(complexity.title())
309
+ precisions.append(stats['avg_precision'])
310
+ mrrs.append(stats['avg_mrr'])
311
+ thresholds.append(stats['avg_threshold'])
312
+
313
+ x = np.arange(len(complexities))
314
+ width = 0.35
315
+
316
+ bars1 = ax2.bar(x - width/2, precisions, width, label='Precision@K', alpha=0.8, color='#ff7f0e')
317
+ bars2 = ax2.bar(x + width/2, mrrs, width, label='MRR', alpha=0.8, color='#1f77b4')
318
+
319
+ ax2.set_title('Metrics Comparison by Complexity\n(with Adaptive Thresholds)', fontweight='bold')
320
+ ax2.set_ylabel('Score')
321
+ ax2.set_xlabel('Query Complexity')
322
+ ax2.set_xticks(x)
323
+ ax2.set_xticklabels(complexities)
324
+ ax2.legend()
325
+ ax2.grid(True, alpha=0.3)
326
+ ax2.set_ylim(0, 1.0)
327
+
328
+ # Add value labels
329
+ for bars, values, thresholds_vals in [(bars1, precisions, thresholds), (bars2, mrrs, thresholds)]:
330
+ for bar, value, threshold in zip(bars, values, thresholds_vals):
331
+ height = bar.get_height()
332
+ ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
333
+ f'{value:.3f}', ha='center', va='bottom', fontweight='bold', fontsize=9)
334
+
335
+ plt.tight_layout()
336
+
337
+ # Save chart
338
+ if save_path is None:
339
+ save_path = Path(__file__).parent / "charts" / f"threshold_impact_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
340
+
341
+ save_path = Path(save_path)
342
+ save_path.parent.mkdir(parents=True, exist_ok=True)
343
+ plt.savefig(save_path, dpi=300, bbox_inches='tight')
344
+ plt.close()
345
+
346
+ print(f"📊 Threshold impact chart saved: {save_path}")
347
+ return str(save_path)
348
+
349
+ def create_detailed_analysis_table(self, analysis_data: Dict, save_path: str = None) -> str:
350
+ """Create detailed statistics table"""
351
+
352
+ fig, ax = plt.subplots(figsize=(12, 8))
353
+ ax.axis('tight')
354
+ ax.axis('off')
355
+
356
+ # Prepare table data
357
+ table_data = []
358
+
359
+ # Overall statistics
360
+ overall_stats = analysis_data['statistics']['overall_statistics']
361
+ table_data.append(['OVERALL METRICS', '', '', '', ''])
362
+ table_data.append(['Total Queries', str(overall_stats['total_queries']), '', '', ''])
363
+ table_data.append(['Avg Precision@K', f"{overall_stats['avg_precision']:.3f}",
364
+ f"±{overall_stats['precision_std']:.3f}", '', ''])
365
+ table_data.append(['Avg MRR', f"{overall_stats['avg_mrr']:.3f}",
366
+ f"±{overall_stats['mrr_std']:.3f}", '', ''])
367
+ table_data.append(['', '', '', '', ''])
368
+
369
+ # By category
370
+ table_data.append(['BY CATEGORY', 'Queries', 'Precision@K', 'MRR', 'Notes'])
371
+ category_stats = analysis_data['statistics']['by_category']
372
+ for category, stats in category_stats.items():
373
+ if stats:
374
+ table_data.append([
375
+ category.title(),
376
+ str(stats['query_count']),
377
+ f"{stats['avg_precision']:.3f}",
378
+ f"{stats['avg_mrr']:.3f}",
379
+ ''
380
+ ])
381
+
382
+ table_data.append(['', '', '', '', ''])
383
+
384
+ # By complexity
385
+ table_data.append(['BY COMPLEXITY', 'Queries', 'Precision@K', 'MRR', 'Threshold'])
386
+ complexity_stats = analysis_data['statistics']['by_complexity']
387
+ for complexity, stats in complexity_stats.items():
388
+ if stats:
389
+ table_data.append([
390
+ complexity.title(),
391
+ str(stats['query_count']),
392
+ f"{stats['avg_precision']:.3f}",
393
+ f"{stats['avg_mrr']:.3f}",
394
+ f"{stats['avg_threshold']:.2f}"
395
+ ])
396
+
397
+ # Create table
398
+ table = ax.table(cellText=table_data,
399
+ colLabels=['Metric', 'Value 1', 'Value 2', 'Value 3', 'Value 4'],
400
+ cellLoc='center',
401
+ loc='center',
402
+ bbox=[0, 0, 1, 1])
403
+
404
+ # Style the table
405
+ table.auto_set_font_size(False)
406
+ table.set_fontsize(10)
407
+ table.scale(1, 2)
408
+
409
+ # Header styling
410
+ for i in range(5):
411
+ table[(0, i)].set_facecolor('#40466e')
412
+ table[(0, i)].set_text_props(weight='bold', color='white')
413
+
414
+ # Section headers styling
415
+ for i, row in enumerate(table_data):
416
+ if row[0] in ['OVERALL METRICS', 'BY CATEGORY', 'BY COMPLEXITY']:
417
+ table[(i+1, 0)].set_facecolor('#1f77b4')
418
+ table[(i+1, 0)].set_text_props(weight='bold', color='white')
419
+
420
+ plt.title('Precision@K & MRR Detailed Analysis\nMetrics 7-8 Statistics',
421
+ fontweight='bold', fontsize=14, pad=20)
422
+
423
+ # Save chart
424
+ if save_path is None:
425
+ save_path = Path(__file__).parent / "charts" / f"precision_mrr_table_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
426
+
427
+ save_path = Path(save_path)
428
+ save_path.parent.mkdir(parents=True, exist_ok=True)
429
+ plt.savefig(save_path, dpi=300, bbox_inches='tight')
430
+ plt.close()
431
+
432
+ print(f"📊 Detailed analysis table saved: {save_path}")
433
+ return str(save_path)
434
+
435
+ def create_individual_query_analysis(self, analysis_data: Dict, save_path: str = None) -> str:
436
+ """Create individual query analysis chart"""
437
+
438
+ detailed_results = analysis_data.get('detailed_results', [])
439
+
440
+ if not detailed_results:
441
+ print("⚠️ No detailed results for individual analysis")
442
+ return ""
443
+
444
+ fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))
445
+
446
+ # Prepare data
447
+ query_indices = []
448
+ precisions = []
449
+ mrrs = []
450
+ colors = []
451
+ labels = []
452
+
453
+ for i, result in enumerate(detailed_results):
454
+ query_indices.append(i + 1)
455
+ precisions.append(result['precision_at_k'])
456
+ mrrs.append(result['mrr_score'])
457
+
458
+ # Color by complexity
459
+ if result['query_complexity'] == 'complex':
460
+ colors.append('#d62728') # Red for complex
461
+ else:
462
+ colors.append('#2ca02c') # Green for simple
463
+
464
+ # Create short label
465
+ query_short = result['query'][:30] + "..." if len(result['query']) > 30 else result['query']
466
+ category = result['category'][:4].upper()
467
+ labels.append(f"{category}\n{query_short}")
468
+
469
+ # Chart 1: Precision@K for each query
470
+ bars1 = ax1.bar(query_indices, precisions, color=colors, alpha=0.8)
471
+ ax1.set_title('Precision@K by Individual Query', fontweight='bold')
472
+ ax1.set_ylabel('Precision@K')
473
+ ax1.set_xlabel('Query Index')
474
+ ax1.set_ylim(0, 1.0)
475
+ ax1.grid(True, alpha=0.3)
476
+
477
+ # Add value labels
478
+ for bar, precision in zip(bars1, precisions):
479
+ height = bar.get_height()
480
+ ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
481
+ f'{precision:.2f}', ha='center', va='bottom', fontsize=8)
482
+
483
+ # Chart 2: MRR for each query
484
+ bars2 = ax2.bar(query_indices, mrrs, color=colors, alpha=0.8)
485
+ ax2.set_title('MRR by Individual Query', fontweight='bold')
486
+ ax2.set_ylabel('MRR Score')
487
+ ax2.set_xlabel('Query Index')
488
+ ax2.set_ylim(0, 1.0)
489
+ ax2.grid(True, alpha=0.3)
490
+
491
+ # Add value labels
492
+ for bar, mrr in zip(bars2, mrrs):
493
+ height = bar.get_height()
494
+ ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
495
+ f'{mrr:.2f}', ha='center', va='bottom', fontsize=8)
496
+
497
+ # Add legend
498
+ from matplotlib.patches import Patch
499
+ legend_elements = [
500
+ Patch(facecolor='#2ca02c', alpha=0.8, label='Simple Query (T=0.25)'),
501
+ Patch(facecolor='#d62728', alpha=0.8, label='Complex Query (T=0.15)')
502
+ ]
503
+ ax1.legend(handles=legend_elements, loc='upper right')
504
+
505
+ plt.tight_layout()
506
+
507
+ # Save chart
508
+ if save_path is None:
509
+ save_path = Path(__file__).parent / "charts" / f"individual_query_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
510
+
511
+ save_path = Path(save_path)
512
+ save_path.parent.mkdir(parents=True, exist_ok=True)
513
+ plt.savefig(save_path, dpi=300, bbox_inches='tight')
514
+ plt.close()
515
+
516
+ print(f"📊 Individual query analysis saved: {save_path}")
517
+ return str(save_path)
518
+
519
+ def generate_all_charts(self, analysis_data: Dict = None) -> Dict[str, str]:
520
+ """Generate all precision/MRR charts"""
521
+
522
+ if analysis_data is None:
523
+ analysis_data = self.load_latest_analysis()
524
+
525
+ print(f"\n📈 Generating all Precision & MRR charts...")
526
+
527
+ saved_charts = {}
528
+
529
+ # Generate all chart types
530
+ try:
531
+ saved_charts['precision_comparison'] = self.create_precision_comparison_chart(analysis_data)
532
+ saved_charts['mrr_comparison'] = self.create_mrr_comparison_chart(analysis_data)
533
+ saved_charts['combined_heatmap'] = self.create_combined_metrics_heatmap(analysis_data)
534
+ saved_charts['threshold_impact'] = self.create_threshold_impact_chart(analysis_data)
535
+ saved_charts['individual_analysis'] = self.create_individual_query_analysis(analysis_data)
+ saved_charts['detailed_table'] = self.create_detailed_analysis_table(analysis_data)  # chart type 5 from the module docstring
536
+
537
+ except Exception as e:
538
+ print(f"❌ Error generating charts: {e}")
539
+ return {"error": str(e)}
540
+
541
+ print(f"\n✅ All precision/MRR charts generated successfully!")
542
+ print(f"📁 Charts saved to: evaluation/charts/")
543
+
544
+ return saved_charts
545
+
546
+
547
+ # Independent execution interface
548
+ if __name__ == "__main__":
549
+ """Generate precision/MRR charts from analysis results"""
550
+
551
+ print("📈 OnCall.ai Precision & MRR Chart Generator - Metrics 7-8")
552
+
553
+ if len(sys.argv) > 1:
554
+ analysis_file = sys.argv[1]
555
+
556
+ if not os.path.exists(analysis_file):
557
+ print(f"❌ Analysis file not found: {analysis_file}")
558
+ sys.exit(1)
559
+ else:
560
+ analysis_file = None # Will use latest file
561
+
562
+ # Initialize generator
563
+ generator = PrecisionMRRChartGenerator()
564
+
565
+ try:
566
+ # Load analysis data
567
+ if analysis_file:
568
+ with open(analysis_file, 'r', encoding='utf-8') as f:
569
+ analysis_data = json.load(f)
570
+ print(f"📁 Using specified analysis file: {analysis_file}")
571
+ else:
572
+ analysis_data = generator.load_latest_analysis()
573
+
574
+ # Generate all charts
575
+ saved_charts = generator.generate_all_charts(analysis_data)
576
+
577
+ if 'error' not in saved_charts:
578
+ print(f"\n📊 === PRECISION & MRR CHART GENERATION SUMMARY ===")
579
+ for chart_type, filepath in saved_charts.items():
580
+ print(f" 📈 {chart_type.replace('_', ' ').title()}: {filepath}")
581
+
582
+ print(f"\n💡 Charts ready for analysis and presentation!")
583
+
584
+ except Exception as e:
585
+ print(f"❌ Chart generation failed: {e}")
586
+ sys.exit(1)
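
A minimal usage sketch for the chart generator above (assuming an `evaluation/results/precision_mrr_analysis_*.json` file already exists, as produced by `metric7_8_precision_MRR.py`; the call sequence mirrors the `__main__` block and is illustrative only):

```python
# Sketch: generate all Metric 7-8 charts from the latest saved analysis.
generator = PrecisionMRRChartGenerator()
analysis_data = generator.load_latest_analysis()        # newest results/precision_mrr_analysis_*.json
saved_charts = generator.generate_all_charts(analysis_data)
for chart_type, filepath in saved_charts.items():
    print(f"{chart_type}: {filepath}")                  # PNGs written under evaluation/charts/
```
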
evaluation/old/coverage_evaluator.py ADDED
@@ -0,0 +1,560 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Retrieval Coverage Evaluator (Metric 4)
4
+ ==========================================================
5
+
6
+ Evaluates how well generated medical advice utilizes retrieved content
7
+ Automatic evaluation using keyword overlap analysis with optional LLM sampling
8
+
9
+ Author: YanBo Chen
10
+ Date: 2025-08-04
11
+ """
12
+
13
+ import json
14
+ import os
15
+ import sys
16
+ from typing import Dict, List, Any, Set
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ import re
20
+
21
+ # Add project path
22
+ current_dir = Path(__file__).parent
23
+ project_root = current_dir.parent
24
+ src_dir = project_root / "src"
25
+ sys.path.insert(0, str(src_dir))
26
+
27
+ # Import existing system components
28
+ try:
29
+ from user_prompt import UserPromptProcessor
30
+ from retrieval import BasicRetrievalSystem
31
+ from llm_clients import llm_Med42_70BClient
32
+ from generation import MedicalAdviceGenerator
33
+ except ImportError as e:
34
+ print(f"❌ Import failed: {e}")
35
+ print("Please ensure running from project root directory")
36
+ sys.exit(1)
37
+
38
+
39
+ class CoverageEvaluator:
40
+ """Retrieval coverage evaluator using keyword overlap analysis"""
41
+
42
+ def __init__(self):
43
+ """Initialize system components for coverage testing"""
44
+ print("🔧 Initializing Coverage Evaluator...")
45
+
46
+ # Initialize full pipeline components (needed for advice generation)
47
+ self.llm_client = llm_Med42_70BClient()
48
+ self.retrieval_system = BasicRetrievalSystem()
49
+ self.user_prompt_processor = UserPromptProcessor(
50
+ llm_client=self.llm_client,
51
+ retrieval_system=self.retrieval_system
52
+ )
53
+ self.medical_generator = MedicalAdviceGenerator(llm_client=self.llm_client)
54
+
55
+ # Results accumulation
56
+ self.coverage_results = []
57
+
58
+ print("✅ Coverage Evaluator initialization complete")
59
+
60
+ def extract_medical_keywords(self, text: str) -> Set[str]:
61
+ """
62
+ Extract medical keywords from text for coverage analysis
63
+
64
+ Uses medical terminology patterns and common medical terms
65
+ """
66
+ if not text:
67
+ return set()
68
+
69
+ medical_keywords = set()
70
+ text_lower = text.lower()
71
+
72
+ # Medical terminology patterns
73
+ patterns = [
74
+ r'\b[a-z]+(?:osis|itis|pathy|emia|uria|gram|scopy)\b', # Medical suffixes
75
+ r'\b(?:cardio|neuro|pulmo|gastro|hepato|nephro)[a-z]+\b', # Medical prefixes
76
+ r'\b(?:diagnosis|treatment|therapy|intervention|management)\b', # Medical actions
77
+ r'\b(?:patient|symptom|condition|disease|disorder|syndrome)\b', # Medical entities
78
+ r'\b(?:acute|chronic|severe|mild|moderate|emergency)\b', # Medical descriptors
79
+ r'\b[a-z]+(?:al|ic|ous|ive)\s+(?:pain|failure|infection|injury)\b', # Compound terms
80
+ r'\b(?:ecg|ekg|ct|mri|x-ray|ultrasound|biopsy)\b', # Medical procedures
81
+ r'\b\d+\s*(?:mg|ml|units|hours|days|minutes)\b', # Dosages and timeframes
82
+ ]
83
+
84
+ for pattern in patterns:
85
+ matches = re.findall(pattern, text_lower)
86
+ medical_keywords.update(match.strip() for match in matches)
87
+
88
+ # Additional common medical terms
89
+ common_medical_terms = [
90
+ 'blood', 'pressure', 'heart', 'chest', 'pain', 'stroke', 'seizure',
91
+ 'emergency', 'hospital', 'monitor', 'assess', 'evaluate', 'immediate',
92
+ 'protocol', 'guideline', 'recommendation', 'risk', 'factor'
93
+ ]
94
+
95
+ for term in common_medical_terms:
96
+ if term in text_lower:
97
+ medical_keywords.add(term)
98
+
99
+ # Filter out very short terms and common words
100
+ filtered_keywords = {
101
+ kw for kw in medical_keywords
102
+ if len(kw) > 2 and kw not in ['the', 'and', 'for', 'with', 'are', 'can', 'may']
103
+ }
104
+
105
+ return filtered_keywords
106
+
107
+ def calculate_coverage_score(self, generated_advice: str, retrieval_results: List[Dict]) -> Dict[str, Any]:
108
+ """
109
+ Calculate coverage score based on keyword overlap between advice and retrieved docs
110
+
111
+ Args:
112
+ generated_advice: Generated medical advice text
113
+ retrieval_results: List of retrieved documents
114
+ """
115
+ if not generated_advice or not retrieval_results:
116
+ return {
117
+ "coverage_score": 0.0,
118
+ "matched_keywords": [],
119
+ "advice_keywords": [],
120
+ "source_keywords": [],
121
+ "coverage_details": []
122
+ }
123
+
124
+ # Extract keywords from generated advice
125
+ advice_keywords = self.extract_medical_keywords(generated_advice)
126
+
127
+ # Extract keywords from all retrieved documents
128
+ all_source_keywords = set()
129
+ coverage_details = []
130
+
131
+ for i, doc in enumerate(retrieval_results):
132
+ doc_content = doc.get('content', '') or doc.get('text', '')
133
+ doc_keywords = self.extract_medical_keywords(doc_content)
134
+ all_source_keywords.update(doc_keywords)
135
+
136
+ # Calculate overlap for this specific document
137
+ doc_overlap = advice_keywords.intersection(doc_keywords)
138
+ doc_coverage = len(doc_overlap) / len(doc_keywords) if doc_keywords else 0.0
139
+
140
+ coverage_details.append({
141
+ "doc_index": i,
142
+ "doc_snippet": doc_content[:100] + "...",
143
+ "doc_keywords_count": len(doc_keywords),
144
+ "matched_keywords_count": len(doc_overlap),
145
+ "doc_coverage_ratio": doc_coverage,
146
+ "matched_keywords": list(doc_overlap)[:10] # Limit for readability
147
+ })
148
+
149
+ # Calculate overall coverage
150
+ matched_keywords = advice_keywords.intersection(all_source_keywords)
151
+ coverage_score = len(matched_keywords) / len(all_source_keywords) if all_source_keywords else 0.0
152
+
153
+ return {
154
+ "coverage_score": coverage_score,
155
+ "matched_keywords": list(matched_keywords),
156
+ "advice_keywords": list(advice_keywords),
157
+ "source_keywords": list(all_source_keywords),
158
+ "advice_keywords_count": len(advice_keywords),
159
+ "source_keywords_count": len(all_source_keywords),
160
+ "matched_keywords_count": len(matched_keywords),
161
+ "coverage_percentage": coverage_score * 100,
162
+ "meets_threshold": coverage_score >= 0.6,
163
+ "coverage_details": coverage_details
164
+ }
165
+
166
+ def evaluate_single_coverage(self, query: str, category: str = "unknown") -> Dict[str, Any]:
167
+ """
168
+ Evaluate retrieval coverage for a single query
169
+
170
+ Requires full pipeline: extraction → retrieval → generation → coverage analysis
171
+
172
+ Args:
173
+ query: Medical query to test
174
+ category: Query category (diagnosis/treatment/mixed)
175
+ """
176
+ print(f"🔍 Testing coverage for: {query[:50]}...")
177
+ print(f"📋 Category: {category}")
178
+
179
+ try:
180
+ # Step 1: Extract condition
181
+ condition_result = self.user_prompt_processor.extract_condition_keywords(query)
182
+
183
+ # Step 2: Perform retrieval
184
+ search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
185
+ if not search_query:
186
+ search_query = condition_result.get('condition', query)
187
+
188
+ retrieval_start = datetime.now()
189
+ retrieval_results = self.retrieval_system.search(search_query, top_k=5)
190
+ retrieval_time = (datetime.now() - retrieval_start).total_seconds()
191
+
192
+ processed_results = retrieval_results.get('processed_results', [])
193
+
194
+ if not processed_results:
195
+ result = {
196
+ "query": query,
197
+ "category": category,
198
+ "search_query": search_query,
199
+ "pipeline_success": False,
200
+ "coverage_score": 0.0,
201
+ "error": "No retrieval results",
202
+ "timestamp": datetime.now().isoformat()
203
+ }
204
+
205
+ self.coverage_results.append(result)
206
+ print(f" ❌ No retrieval results for coverage analysis")
207
+ return result
208
+
209
+ # Step 3: Generate medical advice
210
+ generation_start = datetime.now()
211
+ intention = self._detect_query_intention(query)
212
+ medical_advice_result = self.medical_generator.generate_medical_advice(
213
+ user_query=query,
214
+ retrieval_results=retrieval_results,
215
+ intention=intention
216
+ )
217
+ generation_time = (datetime.now() - generation_start).total_seconds()
218
+
219
+ generated_advice = medical_advice_result.get('medical_advice', '')
220
+
221
+ if not generated_advice:
222
+ result = {
223
+ "query": query,
224
+ "category": category,
225
+ "search_query": search_query,
226
+ "pipeline_success": False,
227
+ "coverage_score": 0.0,
228
+ "error": "No generated advice",
229
+ "timestamp": datetime.now().isoformat()
230
+ }
231
+
232
+ self.coverage_results.append(result)
233
+ print(f" ❌ No generated advice for coverage analysis")
234
+ return result
235
+
236
+ # Step 4: Calculate coverage
237
+ coverage_analysis = self.calculate_coverage_score(generated_advice, processed_results)
238
+
239
+ result = {
240
+ "query": query,
241
+ "category": category,
242
+ "search_query": search_query,
243
+ "pipeline_success": True,
244
+ "retrieval_time": retrieval_time,
245
+ "generation_time": generation_time,
246
+ "retrieved_docs_count": len(processed_results),
247
+ "generated_advice_length": len(generated_advice),
248
+ "coverage_analysis": coverage_analysis,
249
+ "coverage_score": coverage_analysis['coverage_score'],
250
+ "meets_threshold": coverage_analysis['meets_threshold'],
251
+ "timestamp": datetime.now().isoformat()
252
+ }
253
+
254
+ # Store result
255
+ self.coverage_results.append(result)
256
+
257
+ print(f" ✅ Pipeline: Complete")
258
+ print(f" 📊 Coverage Score: {coverage_analysis['coverage_score']:.3f} ({coverage_analysis['coverage_percentage']:.1f}%)")
259
+ print(f" 📝 Keywords: {coverage_analysis['matched_keywords_count']}/{coverage_analysis['source_keywords_count']} matched")
260
+ print(f" 🎯 Threshold: {'✅ Met' if result['meets_threshold'] else '❌ Not Met'}")
261
+ print(f" ⏱️ Times: Retrieval={retrieval_time:.2f}s, Generation={generation_time:.2f}s")
262
+
263
+ return result
264
+
265
+ except Exception as e:
266
+ error_result = {
267
+ "query": query,
268
+ "category": category,
269
+ "pipeline_success": False,
270
+ "coverage_score": 0.0,
271
+ "error": str(e),
272
+ "timestamp": datetime.now().isoformat()
273
+ }
274
+
275
+ self.coverage_results.append(error_result)
276
+ print(f" ❌ Coverage evaluation failed: {e}")
277
+
278
+ return error_result
279
+
280
+ def _detect_query_intention(self, query: str) -> str:
281
+ """Simplified query intention detection (from app.py)"""
282
+ query_lower = query.lower()
283
+
284
+ if any(word in query_lower for word in ['diagnos', 'differential', 'possible', 'causes']):
285
+ return 'diagnosis'
286
+ elif any(word in query_lower for word in ['treat', 'manage', 'therapy', 'intervention']):
287
+ return 'treatment'
288
+ else:
289
+ return 'mixed'
290
+
291
+ def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
292
+ """Parse queries from file with category labels"""
293
+ print(f"📁 Reading queries from file: {filepath}")
294
+
295
+ try:
296
+ with open(filepath, 'r', encoding='utf-8') as f:
297
+ content = f.read()
298
+
299
+ # Parse queries with category labels
300
+ queries_by_category = {
301
+ "diagnosis": [],
302
+ "treatment": [],
303
+ "mixed": []
304
+ }
305
+
306
+ lines = content.strip().split('\n')
307
+
308
+ for line in lines:
309
+ line = line.strip()
310
+ if not line:
311
+ continue
312
+
313
+ # Parse format: "1.diagnosis: query text"
314
+ match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
315
+ if match:
316
+ category_raw = match.group(1).lower()
317
+ query_text = match.group(2).strip()
318
+
319
+ # Normalize category name
320
+ if category_raw in ['mixed/complicated', 'mixed']:
321
+ category = 'mixed'
322
+ else:
323
+ category = category_raw
324
+
325
+ if category in queries_by_category and len(query_text) > 15:
326
+ queries_by_category[category].append({
327
+ "text": query_text,
328
+ "category": category
329
+ })
330
+
331
+ print(f"📋 Parsed queries by category:")
332
+ for category, category_queries in queries_by_category.items():
333
+ print(f" {category.capitalize()}: {len(category_queries)} queries")
334
+
335
+ return queries_by_category
336
+
337
+ except Exception as e:
338
+ print(f"❌ Failed to read file: {e}")
339
+ return {"error": f"Failed to read file: {e}"}
340
+
341
+ def calculate_coverage_statistics(self) -> Dict[str, Any]:
342
+ """Calculate coverage statistics by category"""
343
+ category_stats = {}
344
+ all_successful_results = []
345
+
346
+ # Group results by category
347
+ results_by_category = {
348
+ "diagnosis": [],
349
+ "treatment": [],
350
+ "mixed": []
351
+ }
352
+
353
+ for result in self.coverage_results:
354
+ category = result.get('category', 'unknown')
355
+ if category in results_by_category:
356
+ results_by_category[category].append(result)
357
+ if result.get('pipeline_success'):
358
+ all_successful_results.append(result)
359
+
360
+ # Calculate statistics for each category
361
+ for category, results in results_by_category.items():
362
+ successful_results = [r for r in results if r.get('pipeline_success')]
363
+
364
+ if successful_results:
365
+ coverage_scores = [r['coverage_score'] for r in successful_results]
366
+ avg_coverage = sum(coverage_scores) / len(coverage_scores)
367
+ avg_retrieval_time = sum(r.get('retrieval_time', 0) for r in successful_results) / len(successful_results)
368
+ avg_generation_time = sum(r.get('generation_time', 0) for r in successful_results) / len(successful_results)
369
+
370
+ category_stats[category] = {
371
+ "average_coverage": avg_coverage,
372
+ "max_coverage": max(coverage_scores),
373
+ "min_coverage": min(coverage_scores),
374
+ "successful_evaluations": len(successful_results),
375
+ "total_queries": len(results),
376
+ "success_rate": len(successful_results) / len(results),
377
+ "average_retrieval_time": avg_retrieval_time,
378
+ "average_generation_time": avg_generation_time,
379
+ "meets_threshold": avg_coverage >= 0.6,
380
+ "individual_coverage_scores": coverage_scores
381
+ }
382
+ else:
383
+ category_stats[category] = {
384
+ "average_coverage": 0.0,
385
+ "max_coverage": 0.0,
386
+ "min_coverage": 0.0,
387
+ "successful_evaluations": 0,
388
+ "total_queries": len(results),
389
+ "success_rate": 0.0,
390
+ "average_retrieval_time": 0.0,
391
+ "average_generation_time": 0.0,
392
+ "meets_threshold": False,
393
+ "individual_coverage_scores": []
394
+ }
395
+
396
+ # Calculate overall statistics
397
+ if all_successful_results:
398
+ all_coverage_scores = [r['coverage_score'] for r in all_successful_results]
399
+ overall_stats = {
400
+ "average_coverage": sum(all_coverage_scores) / len(all_coverage_scores),
401
+ "max_coverage": max(all_coverage_scores),
402
+ "min_coverage": min(all_coverage_scores),
403
+ "successful_evaluations": len(all_successful_results),
404
+ "total_queries": len(self.coverage_results),
405
+ "success_rate": len(all_successful_results) / len(self.coverage_results),
406
+ "meets_threshold": (sum(all_coverage_scores) / len(all_coverage_scores)) >= 0.6,
407
+ "target_compliance": (sum(all_coverage_scores) / len(all_coverage_scores)) >= 0.6
408
+ }
409
+ else:
410
+ overall_stats = {
411
+ "average_coverage": 0.0,
412
+ "max_coverage": 0.0,
413
+ "min_coverage": 0.0,
414
+ "successful_evaluations": 0,
415
+ "total_queries": len(self.coverage_results),
416
+ "success_rate": 0.0,
417
+ "meets_threshold": False,
418
+ "target_compliance": False
419
+ }
420
+
421
+ return {
422
+ "category_results": category_stats,
423
+ "overall_results": overall_stats,
424
+ "timestamp": datetime.now().isoformat()
425
+ }
426
+
427
+ def save_coverage_statistics(self, filename: str = None) -> str:
428
+ """Save coverage statistics for chart generation"""
429
+ stats = self.calculate_coverage_statistics()
430
+
431
+ if filename is None:
432
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
433
+ filename = f"coverage_statistics_{timestamp}.json"
434
+
435
+ # Ensure results directory exists
436
+ results_dir = Path(__file__).parent / "results"
437
+ results_dir.mkdir(exist_ok=True)
438
+
439
+ filepath = results_dir / filename
440
+
441
+ with open(filepath, 'w', encoding='utf-8') as f:
442
+ json.dump(stats, f, indent=2, ensure_ascii=False)
443
+
444
+ print(f"📊 Coverage statistics saved to: {filepath}")
445
+ return str(filepath)
446
+
447
+ def save_coverage_details(self, filename: str = None) -> str:
448
+ """Save detailed coverage results"""
449
+ if filename is None:
450
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
451
+ filename = f"coverage_details_{timestamp}.json"
452
+
453
+ # Ensure results directory exists
454
+ results_dir = Path(__file__).parent / "results"
455
+ results_dir.mkdir(exist_ok=True)
456
+
457
+ filepath = results_dir / filename
458
+
459
+ # Create comprehensive coverage data
460
+ coverage_data = {
461
+ "evaluation_metadata": {
462
+ "total_queries": len(self.coverage_results),
463
+ "successful_evaluations": len([r for r in self.coverage_results if r.get('pipeline_success')]),
464
+ "timestamp": datetime.now().isoformat(),
465
+ "evaluator_type": "retrieval_coverage",
466
+ "threshold_used": 0.6
467
+ },
468
+ "coverage_results": self.coverage_results
469
+ }
470
+
471
+ with open(filepath, 'w', encoding='utf-8') as f:
472
+ json.dump(coverage_data, f, indent=2, ensure_ascii=False)
473
+
474
+ print(f"📝 Coverage details saved to: {filepath}")
475
+ return str(filepath)
476
+
477
+
478
+ # Independent execution interface
479
+ if __name__ == "__main__":
480
+ """Independent coverage evaluation interface"""
481
+
482
+ print("📈 OnCall.ai Coverage Evaluator - Retrieval Coverage Analysis")
483
+
484
+ if len(sys.argv) > 1:
485
+ query_file = sys.argv[1]
486
+ else:
487
+ # Default to evaluation/pre_user_query_evaluate.txt
488
+ query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"
489
+
490
+ if not os.path.exists(query_file):
491
+ print(f"❌ Query file not found: {query_file}")
492
+ print("Usage: python coverage_evaluator.py [query_file.txt]")
493
+ sys.exit(1)
494
+
495
+ # Initialize evaluator
496
+ evaluator = CoverageEvaluator()
497
+
498
+ # Parse queries from file
499
+ queries_by_category = evaluator.parse_queries_from_file(str(query_file))
500
+
501
+ if "error" in queries_by_category:
502
+ print(f"❌ Failed to parse queries: {queries_by_category['error']}")
503
+ sys.exit(1)
504
+
505
+ # Test coverage for each query (requires full pipeline)
506
+ print(f"\n🧪 Retrieval Coverage Testing (Full Pipeline Required)")
507
+ print(f"⚠️ Note: This evaluator requires LLM calls for advice generation")
508
+
509
+ for category, queries in queries_by_category.items():
510
+ if not queries:
511
+ continue
512
+
513
+ print(f"\n📂 Testing {category.upper()} coverage:")
514
+
515
+ for i, query_info in enumerate(queries):
516
+ query_text = query_info['text']
517
+
518
+ # Test coverage (requires full pipeline)
519
+ result = evaluator.evaluate_single_coverage(query_text, category)
520
+
521
+ # Pause between queries to avoid rate limits
522
+ if i < len(queries) - 1:
523
+ print(f" ⏳ Pausing 5s before next query...")
524
+ import time
525
+ time.sleep(5)
526
+
527
+ # Longer pause between categories
528
+ if category != list(queries_by_category.keys())[-1]:
529
+ print(f"\n⏳ Pausing 10s before next category...")
530
+ import time
531
+ time.sleep(10)
532
+
533
+ # Generate and save results
534
+ print(f"\n📊 Generating coverage analysis...")
535
+
536
+ # Save statistics and details
537
+ stats_path = evaluator.save_coverage_statistics()
538
+ details_path = evaluator.save_coverage_details()
539
+
540
+ # Print final summary
541
+ stats = evaluator.calculate_coverage_statistics()
542
+ category_results = stats['category_results']
543
+ overall_results = stats['overall_results']
544
+
545
+ print(f"\n📊 === COVERAGE EVALUATION SUMMARY ===")
546
+ print(f"Overall Performance:")
547
+ print(f" Average Coverage: {overall_results['average_coverage']:.3f} ({overall_results['average_coverage']*100:.1f}%)")
548
+ print(f" Pipeline Success Rate: {overall_results['success_rate']:.1%}")
549
+ print(f" 60% Threshold: {'✅ Met' if overall_results['meets_threshold'] else '❌ Not Met'}")
550
+
551
+ print(f"\nCategory Breakdown:")
552
+ for category, cat_stats in category_results.items():
553
+ if cat_stats['total_queries'] > 0:
554
+ print(f" {category.capitalize()}: {cat_stats['average_coverage']:.3f} "
555
+ f"({cat_stats['successful_evaluations']}/{cat_stats['total_queries']}) "
556
+ f"[R:{cat_stats['average_retrieval_time']:.2f}s, G:{cat_stats['average_generation_time']:.2f}s]")
557
+
558
+ print(f"\n✅ Coverage evaluation complete!")
559
+ print(f"📊 Statistics: {stats_path}")
560
+ print(f"📝 Details: {details_path}")
evaluation/{evaluation_instruction.md → old/evaluation_instruction.md} RENAMED
@@ -1,4 +1,5 @@
1
  # Model use
 
2
  llm model: (for comparison) with our-own version.
3
  https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B
4
  https://huggingface.co/m42-health/Llama3-Med42-70B
@@ -12,59 +13,59 @@ https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct
12
  """
13
  ```
14
 
15
-
16
  ### 評估執行流程
 
17
  ```python
18
  def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str, Any]:
19
  """執行完整的六項指標評估"""
20
-
21
  results = {
22
  "model": model_name,
23
  "metrics": {},
24
  "detailed_results": []
25
  }
26
-
27
  total_latencies = []
28
  extraction_successes = []
29
  relevance_scores = []
30
  coverage_scores = []
31
  actionability_scores = []
32
  evidence_scores = []
33
-
34
  for query in test_cases:
35
  # 運行模型並測量所有指標
36
  start_time = time.time()
37
-
38
  # 1. 總處理時長
39
  latency_result = measure_total_latency(query)
40
  total_latencies.append(latency_result['total_latency'])
41
-
42
  # 2. 條件抽取成功率
43
  extraction_result = evaluate_condition_extraction([query])
44
  extraction_successes.append(extraction_result['success_rate'])
45
-
46
  # 3 & 4. 檢索相關性和覆蓋率(需要實際檢索結果)
47
  retrieval_results = get_retrieval_results(query)
48
  relevance_result = evaluate_retrieval_relevance(retrieval_results)
49
  relevance_scores.append(relevance_result['average_relevance'])
50
-
51
  generated_advice = get_generated_advice(query, retrieval_results)
52
  coverage_result = evaluate_retrieval_coverage(generated_advice, retrieval_results)
53
  coverage_scores.append(coverage_result['coverage'])
54
-
55
  # 5 & 6. LLM 評估(需要完整回應)
56
  response_data = {
57
  'query': query,
58
  'advice': generated_advice,
59
  'retrieval_results': retrieval_results
60
  }
61
-
62
  actionability_result = evaluate_clinical_actionability([response_data])
63
  actionability_scores.append(actionability_result[0]['overall_score'])
64
-
65
  evidence_result = evaluate_clinical_evidence([response_data])
66
  evidence_scores.append(evidence_result[0]['overall_score'])
67
-
68
  # 記錄詳細結果
69
  results["detailed_results"].append({
70
  "query": query,
@@ -75,7 +76,7 @@ def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str,
75
  "actionability": actionability_result[0],
76
  "evidence": evidence_result[0]
77
  })
78
-
79
  # 計算平均指標
80
  results["metrics"] = {
81
  "average_latency": sum(total_latencies) / len(total_latencies),
@@ -85,7 +86,7 @@ def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str,
85
  "average_actionability": sum(actionability_scores) / len(actionability_scores),
86
  "average_evidence_score": sum(evidence_scores) / len(evidence_scores)
87
  }
88
-
89
  return results
90
  ```
91
 
@@ -94,41 +95,43 @@ def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str,
94
  ## 📈 評估結果分析框架
95
 
96
  ### 統計分析
 
97
  ```python
98
  def analyze_evaluation_results(results_A: Dict, results_B: Dict, results_C: Dict) -> Dict:
99
  """比較三個模型的評估結果"""
100
-
101
  models = ['Med42-70B_direct', 'RAG_enhanced', 'OpenBioLLM-70B']
102
  metrics = ['latency', 'extraction_success_rate', 'relevance', 'coverage', 'actionability', 'evidence_score']
103
-
104
  comparison = {}
105
-
106
  for metric in metrics:
107
  comparison[metric] = {
108
  models[0]: results_A['metrics'][f'average_{metric}'],
109
  models[1]: results_B['metrics'][f'average_{metric}'],
110
  models[2]: results_C['metrics'][f'average_{metric}']
111
  }
112
-
113
  # 計算相對改進
114
  baseline = comparison[metric][models[0]]
115
  rag_improvement = ((comparison[metric][models[1]] - baseline) / baseline) * 100
116
-
117
  comparison[metric]['rag_improvement_percent'] = rag_improvement
118
-
119
  return comparison
120
  ```
121
 
122
  ### 報告生成
 
123
  ```python
124
  def generate_evaluation_report(comparison_results: Dict) -> str:
125
  """生成評估報告"""
126
-
127
  report = f"""
128
  # OnCall.ai 系統評估報告
129
-
130
  ## 評估摘要
131
-
132
  | 指標 | Med42-70B | RAG增強版 | OpenBioLLM | RAG改進% |
133
  |------|-----------|-----------|------------|----------|
134
  | 處理時長 | {comparison_results['latency']['Med42-70B_direct']:.2f}s | {comparison_results['latency']['RAG_enhanced']:.2f}s | {comparison_results['latency']['OpenBioLLM-70B']:.2f}s | {comparison_results['latency']['rag_improvement_percent']:+.1f}% |
@@ -137,9 +140,9 @@ def generate_evaluation_report(comparison_results: Dict) -> str:
137
  | 檢索覆蓋率 | - | {comparison_results['coverage']['RAG_enhanced']:.1%} | - | - |
138
  | 臨床可操作性 | {comparison_results['actionability']['Med42-70B_direct']:.1f}/10 | {comparison_results['actionability']['RAG_enhanced']:.1f}/10 | {comparison_results['actionability']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['actionability']['rag_improvement_percent']:+.1f}% |
139
  | 臨床證據評分 | {comparison_results['evidence_score']['Med42-70B_direct']:.1f}/10 | {comparison_results['evidence_score']['RAG_enhanced']:.1f}/10 | {comparison_results['evidence_score']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['evidence_score']['rag_improvement_percent']:+.1f}% |
140
-
141
  """
142
-
143
  return report
144
  ```
145
 
@@ -148,6 +151,7 @@ def generate_evaluation_report(comparison_results: Dict) -> str:
148
  ## 🔧 實驗執行步驟
149
 
150
  ### 1. 環境準備
 
151
  ```bash
152
  # 設置 HuggingFace token(用於 Inference Providers)
153
  export HF_TOKEN=your_huggingface_token
@@ -157,48 +161,49 @@ export ONCALL_EVAL_MODE=true
157
  ```
158
 
159
  ### 2. 實驗執行腳本框架
 
160
  ```python
161
  # evaluation/run_evaluation.py
162
  def main():
163
  """主要評估執行函數"""
164
-
165
  # 加載測試用例
166
  test_cases = MEDICAL_TEST_CASES
167
-
168
  # 實驗 A: YanBo 系統評估
169
  print("🔬 開始實驗 A: YanBo 系統評估")
170
  results_med42_direct = run_complete_evaluation("Med42-70B_direct", test_cases)
171
- results_general_rag = run_complete_evaluation("Med42-70B_general_RAG", test_cases)
172
  results_openbio = run_complete_evaluation("OpenBioLLM-70B", test_cases)
173
-
174
  # 分析和報告
175
  comparison_A = analyze_evaluation_results(results_med42_direct, results_general_rag, results_openbio)
176
  report_A = generate_evaluation_report(comparison_A)
177
-
178
  # 保存結果
179
  save_results("evaluation/results/yanbo_evaluation.json", {
180
  "comparison": comparison_A,
181
  "detailed_results": [results_med42_direct, results_general_rag, results_openbio]
182
  })
183
-
184
  print("✅ 實驗 A 完成,結果已保存")
185
-
186
  # 實驗 B: Jeff 系統評估
187
  print("🔬 開始實驗 B: Jeff 系統評估")
188
  results_med42_direct_b = run_complete_evaluation("Med42-70B_direct", test_cases)
189
  results_customized_rag = run_complete_evaluation("Med42-70B_customized_RAG", test_cases)
190
  results_openbio_b = run_complete_evaluation("OpenBioLLM-70B", test_cases)
191
-
192
  # 分析和報告
193
  comparison_B = analyze_evaluation_results(results_med42_direct_b, results_customized_rag, results_openbio_b)
194
  report_B = generate_evaluation_report(comparison_B)
195
-
196
  # 保存結果
197
  save_results("evaluation/results/jeff_evaluation.json", {
198
  "comparison": comparison_B,
199
  "detailed_results": [results_med42_direct_b, results_customized_rag, results_openbio_b]
200
  })
201
-
202
  print("✅ 實驗 B 完成,結果已保存")
203
 
204
  if __name__ == "__main__":
@@ -206,6 +211,7 @@ if __name__ == "__main__":
206
  ```
207
 
208
  ### 3. 預期評估時間
 
209
  ```
210
  總評估時間估算:
211
  ├── 每個查詢處理時間:~30秒(包含LLM評估)
@@ -219,10 +225,11 @@ if __name__ == "__main__":
219
  ## 📊 評估成功標準
220
 
221
  ### 系統性能目標
 
222
  ```
223
  ✅ 達標條件:
224
  1. 總處理時長 ≤ 30秒
225
- 2. 條件抽取成功率 ≥ 80%
226
  3. 檢索相關性 ≥ 0.2
227
  4. 檢索覆蓋率 ≥ 60%
228
  5. 臨床可操作性 ≥ 7.0/10
@@ -234,6 +241,7 @@ if __name__ == "__main__":
234
  ```
235
 
236
  ### 比較分析重點
 
237
  ```
238
  重點分析維度:
239
  ├── RAG 對處理時間的影響(可能增加延遲)
@@ -247,6 +255,7 @@ if __name__ == "__main__":
247
  ## 🛠️ 實施建議
248
 
249
  ### 分階段實施
 
250
  ```
251
  階段1: 基礎指標實現(1-4項)
252
  ├── 利用現有 app.py 中的時間測量
@@ -268,6 +277,7 @@ if __name__ == "__main__":
268
  ```
269
 
270
  ### 實施注意事項
 
271
  ```
272
  ⚠️ 重要提醒:
273
  1. 所有評估代碼應獨立於現有系統,避免影響正常運行
@@ -280,3 +290,412 @@ if __name__ == "__main__":
280
  ---
281
 
282
  **評估指南完成。請根據此指南實施評估實驗。**
 
1
  # Model use
2
+
3
  llm model: (for comparison) with our-own version.
4
  https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B
5
  https://huggingface.co/m42-health/Llama3-Med42-70B
 
13
  """
14
  ```
15
 
 
16
  ### 評估執行流程
17
+
18
  ```python
19
  def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str, Any]:
20
  """執行完整的六項指標評估"""
21
+
22
  results = {
23
  "model": model_name,
24
  "metrics": {},
25
  "detailed_results": []
26
  }
27
+
28
  total_latencies = []
29
  extraction_successes = []
30
  relevance_scores = []
31
  coverage_scores = []
32
  actionability_scores = []
33
  evidence_scores = []
34
+
35
  for query in test_cases:
36
  # 運行模型並測量所有指標
37
  start_time = time.time()
38
+
39
  # 1. 總處理時長
40
  latency_result = measure_total_latency(query)
41
  total_latencies.append(latency_result['total_latency'])
42
+
43
  # 2. 條件抽取成功率
44
  extraction_result = evaluate_condition_extraction([query])
45
  extraction_successes.append(extraction_result['success_rate'])
46
+
47
  # 3 & 4. 檢索相關性和覆蓋率(需要實際檢索結果)
48
  retrieval_results = get_retrieval_results(query)
49
  relevance_result = evaluate_retrieval_relevance(retrieval_results)
50
  relevance_scores.append(relevance_result['average_relevance'])
51
+
52
  generated_advice = get_generated_advice(query, retrieval_results)
53
  coverage_result = evaluate_retrieval_coverage(generated_advice, retrieval_results)
54
  coverage_scores.append(coverage_result['coverage'])
55
+
56
  # 5 & 6. LLM 評估(需要完整回應)
57
  response_data = {
58
  'query': query,
59
  'advice': generated_advice,
60
  'retrieval_results': retrieval_results
61
  }
62
+
63
  actionability_result = evaluate_clinical_actionability([response_data])
64
  actionability_scores.append(actionability_result[0]['overall_score'])
65
+
66
  evidence_result = evaluate_clinical_evidence([response_data])
67
  evidence_scores.append(evidence_result[0]['overall_score'])
68
+
69
  # 記錄詳細結果
70
  results["detailed_results"].append({
71
  "query": query,
 
76
  "actionability": actionability_result[0],
77
  "evidence": evidence_result[0]
78
  })
79
+
80
  # 計算平均指標
81
  results["metrics"] = {
82
  "average_latency": sum(total_latencies) / len(total_latencies),
 
86
  "average_actionability": sum(actionability_scores) / len(actionability_scores),
87
  "average_evidence_score": sum(evidence_scores) / len(evidence_scores)
88
  }
89
+
90
  return results
91
  ```
92
 
 
95
  ## 📈 評估結果分析框架
96
 
97
  ### 統計分析
98
+
99
  ```python
100
  def analyze_evaluation_results(results_A: Dict, results_B: Dict, results_C: Dict) -> Dict:
101
  """比較三個模型的評估結果"""
102
+
103
  models = ['Med42-70B_direct', 'RAG_enhanced', 'OpenBioLLM-70B']
104
  metrics = ['latency', 'extraction_success_rate', 'relevance', 'coverage', 'actionability', 'evidence_score']
105
+
106
  comparison = {}
107
+
108
  for metric in metrics:
109
  comparison[metric] = {
110
  models[0]: results_A['metrics'][f'average_{metric}'],
111
  models[1]: results_B['metrics'][f'average_{metric}'],
112
  models[2]: results_C['metrics'][f'average_{metric}']
113
  }
114
+
115
  # 計算相對改進
116
  baseline = comparison[metric][models[0]]
117
  rag_improvement = ((comparison[metric][models[1]] - baseline) / baseline) * 100
118
+
119
  comparison[metric]['rag_improvement_percent'] = rag_improvement
120
+
121
  return comparison
122
  ```
123
 
124
  ### 報告生成
125
+
126
  ```python
127
  def generate_evaluation_report(comparison_results: Dict) -> str:
128
  """生成評估報告"""
129
+
130
  report = f"""
131
  # OnCall.ai 系統評估報告
132
+
133
  ## 評估摘要
134
+
135
  | 指標 | Med42-70B | RAG增強版 | OpenBioLLM | RAG改進% |
136
  |------|-----------|-----------|------------|----------|
137
  | 處理時長 | {comparison_results['latency']['Med42-70B_direct']:.2f}s | {comparison_results['latency']['RAG_enhanced']:.2f}s | {comparison_results['latency']['OpenBioLLM-70B']:.2f}s | {comparison_results['latency']['rag_improvement_percent']:+.1f}% |
 
140
  | 檢索覆蓋率 | - | {comparison_results['coverage']['RAG_enhanced']:.1%} | - | - |
141
  | 臨床可操作性 | {comparison_results['actionability']['Med42-70B_direct']:.1f}/10 | {comparison_results['actionability']['RAG_enhanced']:.1f}/10 | {comparison_results['actionability']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['actionability']['rag_improvement_percent']:+.1f}% |
142
  | 臨床證據評分 | {comparison_results['evidence_score']['Med42-70B_direct']:.1f}/10 | {comparison_results['evidence_score']['RAG_enhanced']:.1f}/10 | {comparison_results['evidence_score']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['evidence_score']['rag_improvement_percent']:+.1f}% |
143
+
144
  """
145
+
146
  return report
147
  ```
148
 
 
151
  ## 🔧 實驗執行步驟
152
 
153
  ### 1. 環境準備
154
+
155
  ```bash
156
  # 設置 HuggingFace token(用於 Inference Providers)
157
  export HF_TOKEN=your_huggingface_token
 
161
  ```
162
 
163
  ### 2. 實驗執行腳本框架
164
+
165
  ```python
166
  # evaluation/run_evaluation.py
167
  def main():
168
  """主要評估執行函數"""
169
+
170
  # 加載測試用例
171
  test_cases = MEDICAL_TEST_CASES
172
+
173
  # 實驗 A: YanBo 系統評估
174
  print("🔬 開始實驗 A: YanBo 系統評估")
175
  results_med42_direct = run_complete_evaluation("Med42-70B_direct", test_cases)
176
+ results_general_rag = run_complete_evaluation("Med42-70B_general_RAG", test_cases)
177
  results_openbio = run_complete_evaluation("OpenBioLLM-70B", test_cases)
178
+
179
  # 分析和報告
180
  comparison_A = analyze_evaluation_results(results_med42_direct, results_general_rag, results_openbio)
181
  report_A = generate_evaluation_report(comparison_A)
182
+
183
  # 保存結果
184
  save_results("evaluation/results/yanbo_evaluation.json", {
185
  "comparison": comparison_A,
186
  "detailed_results": [results_med42_direct, results_general_rag, results_openbio]
187
  })
188
+
189
  print("✅ 實驗 A 完成,結果已保存")
190
+
191
  # 實驗 B: Jeff 系統評估
192
  print("🔬 開始實驗 B: Jeff 系統評估")
193
  results_med42_direct_b = run_complete_evaluation("Med42-70B_direct", test_cases)
194
  results_customized_rag = run_complete_evaluation("Med42-70B_customized_RAG", test_cases)
195
  results_openbio_b = run_complete_evaluation("OpenBioLLM-70B", test_cases)
196
+
197
  # 分析和報告
198
  comparison_B = analyze_evaluation_results(results_med42_direct_b, results_customized_rag, results_openbio_b)
199
  report_B = generate_evaluation_report(comparison_B)
200
+
201
  # 保存結果
202
  save_results("evaluation/results/jeff_evaluation.json", {
203
  "comparison": comparison_B,
204
  "detailed_results": [results_med42_direct_b, results_customized_rag, results_openbio_b]
205
  })
206
+
207
  print("✅ 實驗 B 完成,結果已保存")
208
 
209
  if __name__ == "__main__":
 
211
  ```
212
 
213
  ### 3. 預期評估時間
214
+
215
  ```
216
  總評估時間估算:
217
  ├── 每個查詢處理時間:~30秒(包含LLM評估)
 
225
  ## 📊 評估成功標準
226
 
227
  ### 系統性能目標
228
+
229
  ```
230
  ✅ 達標條件:
231
  1. 總處理時長 ≤ 30秒
232
+ 2. 條件抽取成功率 ≥ 80%
233
  3. 檢索相關性 ≥ 0.2
234
  4. 檢索覆蓋率 ≥ 60%
235
  5. 臨床可操作性 ≥ 7.0/10
 
241
  ```
242
 
243
  ### 比較分析重點
244
+
245
  ```
246
  重點分析維度:
247
  ├── RAG 對處理時間的影響(可能增加延遲)
 
255
  ## 🛠️ 實施建議
256
 
257
  ### 分階段實施
258
+
259
  ```
260
  階段1: 基礎指標實現(1-4項)
261
  ├── 利用現有 app.py 中的時間測量
 
277
  ```
278
 
279
  ### 實施注意事項
280
+
281
  ```
282
  ⚠️ 重要提醒:
283
  1. 所有評估代碼應獨立於現有系統,避免影響正常運行
 
290
  ---
291
 
292
  **評估指南完成。請根據此指南實施評估實驗。**
293
+
294
+ ## Phase 1: Initial Assessment
295
+
296
+ ### Step 1.1 - 分析您的說明
297
+
298
+ 我重新理解了您的意思!
299
+
300
+ ### Step 1.2 - 無不清楚的需求
301
+
302
+ ### Step 1.3 - 確認理解(中文)
303
+
304
+ 完全明白了!您的評估架構是:
305
+
306
+ ## 🎯 **評估指標的測試層級**
307
+
308
+ ### **單獨測試 Retrieval(指標 2, 3, 4)**
309
+
310
+ ```python
311
+ # 只測試 RAG 系統,因為只有 RAG 系統有這些組件:
312
+ retrieval_only_metrics = [
313
+ "條件抽取成功率", # 只有您的系統有 user_prompt.py
314
+ "檢索相關性", # 只有 RAG 系統有檢索結果
315
+ "檢索覆蓋率" # 只有 RAG 系統有檢索→生成的映射
316
+ ]
317
+
318
+ # 測試對象:
319
+ - Med42-70B_general_RAG(您的系統)✅
320
+ - Med42-70B_customized_RAG(Jeff系統)✅
321
+ - Med42-70B_direct(無RAG)❌ 沒有檢索組件
322
+ - OpenBioLLM-70B(無RAG)❌ 沒有檢索組件
323
+ ```
324
+
325
+ ### **三模型一起測試(指標 1, 5, 6)**
326
+
327
+ ```python
328
+ # 所有模型都能測試的指標:
329
+ universal_metrics = [
330
+ "總處理時長", # 所有模型都有響應時間
331
+ "臨床可操作性", # Llama3-70B 評估所有模型的輸出
332
+ "臨床證據評分" # Llama3-70B 評估所有模型的輸出
333
+ ]
334
+
335
+ # 測試對象:
336
+ - Med42-70B_direct ✅
337
+ - Med42-70B_general_RAG ✅
338
+ - OpenBioLLM-70B ✅
339
+ ```
340
+
341
+ ## 📊 **分層評估策略**
342
+
343
+ ### **階段 1:RAG 系統內部評估**
344
+
345
+ ```python
346
+ # 只測試有 RAG 的系統
347
+ rag_systems = ["Med42-70B_general_RAG", "Med42-70B_customized_RAG"]
348
+
349
+ # 測試 RAG 特有指標
350
+ for system in rag_systems:
351
+ evaluate_retrieval_metrics(system) # 指標 2, 3, 4
352
+ ```
353
+
354
+ ### **階段 2:全模型比較評估**
355
+
356
+ ```python
357
+ # 測試所有三個模型
358
+ all_systems = ["Med42-70B_direct", "Med42-70B_general_RAG", "OpenBioLLM-70B"]
359
+
360
+ # 測試通用指標
361
+ for system in all_systems:
362
+ evaluate_universal_metrics(system) # 指標 1, 5, 6
363
+ ```
364
+
365
+ ### **階段 3:綜合分析**
366
+
367
+ ```python
368
+ # 合併兩階段結果,生成完整報告
369
+ combine_evaluation_results()
370
+ ```
371
+
372
+ ### Step 1.4 - 無視覺修改確認 ✅
373
+
374
+ 此為評估策略討論,無涉及代碼修改。
375
+
376
+ **您的理解完全正確!RAG 特有的指標只能在 RAG 系統內部測試,而通用指標可以跨所有模型比較。這樣的分層評估策略非常合理!**
377
+
378
+ ---
379
+
380
+ ## 📊 第七個評估指標(YanBo系統特有)
381
+
382
+ ### 7. 多層級 Fallback 效率(早期攔截率)
383
+
384
+ **定義:** 系統通過多層級 Fallback 機制在早期層級成功處理查詢的效率
385
+
386
+ **測量位置:** `src/user_prompt.py` 的 `extract_condition_keywords` 多層級處理邏輯
387
+
388
+ **計算公式:**
389
+ ```
390
+ Early_Interception_Rate = (Level1_Success + Level2_Success) / Total_Queries
391
+
392
+ 其中:
393
+ - Level1_Success = 在預定義映射中直接找到條件的查詢數
394
+ - Level2_Success = 通過LLM抽取成功的查詢數
395
+ - Total_Queries = 測試查詢總數
396
+
397
+ 時間節省效果:
398
+ Time_Savings = (Late_Avg_Time - Early_Avg_Time) / Late_Avg_Time
399
+
400
+ 早期攔截效率:
401
+ Efficiency_Score = Early_Interception_Rate × (1 + Time_Savings)
402
+ ```
403
+
404
+ **ASCII 流程圖:**
405
+ ```
406
+ 多層級 Fallback 效率示意圖:
407
+ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
408
+ │ 用戶查詢 │───▶│ Level 1 │───▶│ 直接成功 │
409
+ │ "胸痛診斷" │ │ 預定義映射 │ │ 35% (快) │
410
+ └─────────────┘ └─────────────┘ └─────────────┘
411
+
412
+ ▼ (失敗)
413
+ ┌─────────────┐ ┌─────────────┐
414
+ │ Level 2 │───▶│ LLM抽取成功 │
415
+ │ LLM 條件抽取│ │ 40% (中等) │
416
+ └─────────────┘ └─────────────┘
417
+
418
+ ▼ (失敗)
419
+ ┌─────────────┐ ┌─────────────┐
420
+ │ Level 3-5 │───▶│ 後備成功 │
421
+ │ 後續層級 │ │ 20% (慢) │
422
+ └─────────────┘ └─────────────┘
423
+
424
+ ▼ (失敗)
425
+ ┌─────────────┐
426
+ │ 完全失敗 │
427
+ │ 5% (錯誤) │
428
+ └─────────────┘
429
+
430
+ 早期攔截率 = (35% + 40%) = 75% ✅ 目標 > 70%
431
+ ```
432
+
433
+ **實現框架:**
434
+ ```python
435
+ # 基於 user_prompt.py 的多層級處理邏輯
436
+ def evaluate_early_interception_efficiency(test_queries: List[str]) -> Dict[str, float]:
437
+ """評估早期攔截率 - YanBo系統核心優勢"""
438
+
439
+ level1_success = 0 # Level 1: 預定義映射成功
440
+ level2_success = 0 # Level 2: LLM 抽取成功
441
+ later_success = 0 # Level 3-5: 後續層級成功
442
+ total_failures = 0 # 完全失敗
443
+
444
+ early_times = [] # 早期成功的處理時間
445
+ late_times = [] # 後期成功的處理時間
446
+
447
+ for query in test_queries:
448
+ # 追蹤每個查詢的成功層級和時間
449
+ success_level, processing_time = track_query_success_level(query)
450
+
451
+ if success_level == 1:
452
+ level1_success += 1
453
+ early_times.append(processing_time)
454
+ elif success_level == 2:
455
+ level2_success += 1
456
+ early_times.append(processing_time)
457
+ elif success_level in [3, 4, 5]:
458
+ later_success += 1
459
+ late_times.append(processing_time)
460
+ else:
461
+ total_failures += 1
462
+
463
+ total_queries = len(test_queries)
464
+ early_success_count = level1_success + level2_success
465
+
466
+ # 計算時間節省效果
467
+ early_avg_time = sum(early_times) / len(early_times) if early_times else 0
468
+ late_avg_time = sum(late_times) / len(late_times) if late_times else 0
469
+ time_savings = (late_avg_time - early_avg_time) / late_avg_time if late_avg_time > 0 else 0
470
+
471
+ # 綜合效率分數
472
+ early_interception_rate = early_success_count / total_queries
473
+ efficiency_score = early_interception_rate * (1 + time_savings)
474
+
475
+ return {
476
+ # 核心指標
477
+ "early_interception_rate": early_interception_rate, # 早期攔截率
478
+ "level1_success_rate": level1_success / total_queries,
479
+ "level2_success_rate": level2_success / total_queries,
480
+
481
+ # 時間效率
482
+ "early_avg_time": early_avg_time,
483
+ "late_avg_time": late_avg_time,
484
+ "time_savings_rate": time_savings,
485
+
486
+ # 系統健康度
487
+ "total_success_rate": (total_queries - total_failures) / total_queries,
488
+ "miss_rate": total_failures / total_queries,
489
+
490
+ # 綜合效率
491
+ "overall_efficiency_score": efficiency_score,
492
+
493
+ # 詳細分布
494
+ "success_distribution": {
495
+ "level1": level1_success,
496
+ "level2": level2_success,
497
+ "later_levels": later_success,
498
+ "failures": total_failures
499
+ }
500
+ }
501
+
502
+ def track_query_success_level(query: str) -> Tuple[int, float]:
503
+ """
504
+ 追蹤查詢在哪個層級成功並記錄時間
505
+
506
+ Args:
507
+ query: 測試查詢
508
+
509
+ Returns:
510
+ Tuple of (success_level, processing_time)
511
+ """
512
+ start_time = time.time()
513
+
514
+ # 模擬 user_prompt.py 的層級處理邏輯
515
+ try:
516
+ # Level 1: 檢查預定義映射
517
+ if check_predefined_mapping(query):
518
+ processing_time = time.time() - start_time
519
+ return (1, processing_time)
520
+
521
+ # Level 2: LLM 條件抽取
522
+ llm_result = llm_client.analyze_medical_query(query)
523
+ if llm_result.get('extracted_condition'):
524
+ processing_time = time.time() - start_time
525
+ return (2, processing_time)
526
+
527
+ # Level 3: 語義搜索
528
+ semantic_result = semantic_search_fallback(query)
529
+ if semantic_result:
530
+ processing_time = time.time() - start_time
531
+ return (3, processing_time)
532
+
533
+ # Level 4: 醫學驗證
534
+ validation_result = validate_medical_query(query)
535
+ if not validation_result: # 驗證通過
536
+ processing_time = time.time() - start_time
537
+ return (4, processing_time)
538
+
539
+ # Level 5: 通用搜索
540
+ generic_result = generic_medical_search(query)
541
+ if generic_result:
542
+ processing_time = time.time() - start_time
543
+ return (5, processing_time)
544
+
545
+ # 完全失敗
546
+ processing_time = time.time() - start_time
547
+ return (0, processing_time)
548
+
549
+ except Exception as e:
550
+ processing_time = time.time() - start_time
551
+ return (0, processing_time)
552
+
553
+ def check_predefined_mapping(query: str) -> bool:
554
+ """檢查查詢是否在預定義映射中"""
555
+ # 基於 medical_conditions.py 的 CONDITION_KEYWORD_MAPPING
556
+ from medical_conditions import CONDITION_KEYWORD_MAPPING
557
+
558
+ query_lower = query.lower()
559
+ for condition, keywords in CONDITION_KEYWORD_MAPPING.items():
560
+ if any(keyword.lower() in query_lower for keyword in keywords):
561
+ return True
562
+ return False
563
+ ```
564
+
565
+ **目標閾值:**
566
+ - 早期攔截率 ≥ 70%(前兩層解決)
567
+ - 時間節省率 ≥ 60%(早期比後期快)
568
+ - 總成功率 ≥ 95%(漏接率 < 5%)
569
+
570
+ ---
571
+
572
+ ## 🧪 更新的完整評估流程
573
+
574
+ ### 測試用例設計
575
+ ```python
576
+ # 基於 readme.md 中的範例查詢設計測試集
577
+ MEDICAL_TEST_CASES = [
578
+ # Level 1 預期成功(預定義映射)
579
+ "患者胸痛怎麼處理?",
580
+ "心肌梗死的診斷方法?",
581
+
582
+ # Level 2 預期成功(LLM抽取)
583
+ "60歲男性,有高血壓病史,突發胸痛。可能的原因和評估方法?",
584
+ "30歲患者突發嚴重頭痛和頸部僵硬。鑑別診斷?",
585
+
586
+ # Level 3+ 預期成功(複雜查詢)
587
+ "患者急性呼吸困難和腿部水腫。應該考慮什麼?",
588
+ "20歲女性,無病史,突發癲癇。可能原因和完整處理流程?",
589
+
590
+ # 邊界測試
591
+ "疑似急性出血性中風。下一步處理?"
592
+ ]
593
+ ```
594
+
595
+ ### 更新的評估執行流程
596
+ ```python
597
+ def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str, Any]:
598
+ """執行完整的七項指標評估"""
599
+
600
+ results = {
601
+ "model": model_name,
602
+ "metrics": {},
603
+ "detailed_results": []
604
+ }
605
+
606
+ total_latencies = []
607
+ extraction_successes = []
608
+ relevance_scores = []
609
+ coverage_scores = []
610
+ actionability_scores = []
611
+ evidence_scores = []
612
+ fallback_efficiency_scores = [] # 新增
613
+
614
+ for query in test_cases:
615
+ # 運行模型並測量所有指標
616
+
617
+ # 1. 總處理時長
618
+ latency_result = measure_total_latency(query)
619
+ total_latencies.append(latency_result['total_latency'])
620
+
621
+ # 2. 條件抽取成功率
622
+ extraction_result = evaluate_condition_extraction([query])
623
+ extraction_successes.append(extraction_result['success_rate'])
624
+
625
+ # 3 & 4. 檢索相關性和覆蓋率
626
+ retrieval_results = get_retrieval_results(query)
627
+ relevance_result = evaluate_retrieval_relevance(retrieval_results)
628
+ relevance_scores.append(relevance_result['average_relevance'])
629
+
630
+ generated_advice = get_generated_advice(query, retrieval_results)
631
+ coverage_result = evaluate_retrieval_coverage(generated_advice, retrieval_results)
632
+ coverage_scores.append(coverage_result['coverage'])
633
+
634
+ # 5 & 6. LLM 評估
635
+ response_data = {
636
+ 'query': query,
637
+ 'advice': generated_advice,
638
+ 'retrieval_results': retrieval_results
639
+ }
640
+
641
+ actionability_result = evaluate_clinical_actionability([response_data])
642
+ actionability_scores.append(actionability_result[0]['overall_score'])
643
+
644
+ evidence_result = evaluate_clinical_evidence([response_data])
645
+ evidence_scores.append(evidence_result[0]['overall_score'])
646
+
647
+ # 7. 多層級 Fallback 效率(新增)
648
+ if model_name == "Med42-70B_general_RAG": # 只對YanBo系統測量
649
+ fallback_result = evaluate_early_interception_efficiency([query])
650
+ fallback_efficiency_scores.append(fallback_result['overall_efficiency_score'])
651
+
652
+ # 記錄詳細結果...
653
+
654
+ # 計算平均指標
655
+ results["metrics"] = {
656
+ "average_latency": sum(total_latencies) / len(total_latencies),
657
+ "extraction_success_rate": sum(extraction_successes) / len(extraction_successes),
658
+ "average_relevance": sum(relevance_scores) / len(relevance_scores),
659
+ "average_coverage": sum(coverage_scores) / len(coverage_scores),
660
+ "average_actionability": sum(actionability_scores) / len(actionability_scores),
661
+ "average_evidence_score": sum(evidence_scores) / len(evidence_scores),
662
+ # 新增指標(只對RAG系統有效)
663
+ "average_fallback_efficiency": sum(fallback_efficiency_scores) / len(fallback_efficiency_scores) if fallback_efficiency_scores else 0.0
664
+ }
665
+
666
+ return results
667
+ ```
668
+
669
+ ---
670
+
671
+ ## 📊 更新的系統成功標準
672
+
673
+ ### 系統性能目標(七個指標)
674
+ ```
675
+ ✅ 達標條件:
676
+ 1. 總處理時長 ≤ 30秒
677
+ 2. 條件抽取成功率 ≥ 80%
678
+ 3. 檢索相關性 ≥ 0.25(基於實際醫學數據)
679
+ 4. 檢索覆蓋率 ≥ 60%
680
+ 5. 臨床可操作性 ≥ 7.0/10
681
+ 6. 臨床證據評分 ≥ 7.5/10
682
+ 7. 早期攔截率 ≥ 70%(多層級 Fallback 效率)
683
+
684
+ 🎯 YanBo RAG 系統成功標準:
685
+ - RAG增強版在 5-7 項指標上優於基線 Med42-70B
686
+ - 早期攔截率體現多層級設計的優勢
687
+ - 整體提升幅度 ≥ 15%
688
+ ```
689
+
690
+ ### YanBo 系統特有優勢分析
691
+ ```
692
+ 多層級 Fallback 優勢:
693
+ ├── 漏接防護:通過多層級降低失敗率至 < 5%
694
+ ├── 時間優化:70%+ 查詢在前兩層快速解決
695
+ ├── 系統穩定:即使某層級失敗,後續層級提供保障
696
+ └── 智能分流:不同複雜度查詢自動分配到合適層級
697
+ ```
698
+
699
+ ---
700
+
701
+ **第七個指標已添加完成,專注測量您的多層級 Fallback 系統的早期攔截效率和時間節省效果。**
evaluation/{evaluation_instruction_customization.md → old/evaluation_instruction_customization.md} RENAMED
File without changes
evaluation/old/extraction_evaluator.py ADDED
@@ -0,0 +1,379 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Condition Extraction Evaluator (Metric 2)
4
+ ============================================================
5
+
6
+ Evaluates condition extraction success rate from user_prompt.py
7
+ Pure automatic evaluation based on extract_condition_keywords() results
8
+
9
+ Author: YanBo Chen
10
+ Date: 2025-08-04
11
+ """
12
+
13
+ import json
14
+ import os
15
+ import sys
16
+ from typing import Dict, List, Any
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ import re
20
+
21
+ # Add project path
22
+ current_dir = Path(__file__).parent
23
+ project_root = current_dir.parent
24
+ src_dir = project_root / "src"
25
+ sys.path.insert(0, str(src_dir))
26
+
27
+ # Import existing system components
28
+ try:
29
+ from user_prompt import UserPromptProcessor
30
+ from retrieval import BasicRetrievalSystem
31
+ from llm_clients import llm_Med42_70BClient
32
+ except ImportError as e:
33
+ print(f"❌ Import failed: {e}")
34
+ print("Please ensure running from project root directory")
35
+ sys.exit(1)
36
+
37
+
38
+ class ExtractionEvaluator:
39
+ """Condition extraction success rate evaluator - pure automatic evaluation"""
40
+
41
+ def __init__(self):
42
+ """Initialize system components for extraction testing"""
43
+ print("🔧 Initializing Extraction Evaluator...")
44
+
45
+ # Initialize required components for extraction
46
+ self.llm_client = llm_Med42_70BClient()
47
+ self.retrieval_system = BasicRetrievalSystem()
48
+ self.user_prompt_processor = UserPromptProcessor(
49
+ llm_client=self.llm_client,
50
+ retrieval_system=self.retrieval_system
51
+ )
52
+
53
+ # Results accumulation
54
+ self.extraction_results = []
55
+
56
+ print("✅ Extraction Evaluator initialization complete")
57
+
58
+ def evaluate_single_extraction(self, query: str, category: str = "unknown") -> Dict[str, Any]:
59
+ """
60
+ Evaluate condition extraction success for a single query
61
+
62
+ Tests user_prompt.py extract_condition_keywords() method
63
+
64
+ Args:
65
+ query: Medical query to test
66
+ category: Query category (diagnosis/treatment/mixed)
67
+ """
68
+ print(f"🔍 Testing extraction for: {query[:50]}...")
69
+ print(f"📋 Category: {category}")
70
+
71
+ try:
72
+ # Call the actual extraction method from user_prompt.py
73
+ extraction_start = datetime.now()
74
+ condition_result = self.user_prompt_processor.extract_condition_keywords(query)
75
+ extraction_time = (datetime.now() - extraction_start).total_seconds()
76
+
77
+ # Analyze extraction success
78
+ extracted_condition = condition_result.get('condition')
79
+ query_status = condition_result.get('query_status')
80
+ emergency_keywords = condition_result.get('emergency_keywords', [])
81
+ treatment_keywords = condition_result.get('treatment_keywords', [])
82
+ fallback_level = condition_result.get('fallback_level', 'unknown')
83
+
84
+ # Define success criteria
85
+ is_successful = (
86
+ extracted_condition and
87
+ extracted_condition.strip() and
88
+ extracted_condition != "unknown" and
89
+ query_status not in ['invalid_query', 'non_medical']
90
+ )
91
+
92
+ result = {
93
+ "query": query,
94
+ "category": category,
95
+ "extraction_success": is_successful,
96
+ "extraction_time": extraction_time,
97
+ "extracted_condition": extracted_condition,
98
+ "query_status": query_status,
99
+ "emergency_keywords": emergency_keywords,
100
+ "treatment_keywords": treatment_keywords,
101
+ "fallback_level": fallback_level,
102
+ "full_condition_result": condition_result,
103
+ "timestamp": datetime.now().isoformat()
104
+ }
105
+
106
+ # Store result
107
+ self.extraction_results.append(result)
108
+
109
+ print(f" ✅ Extraction: {'Success' if is_successful else 'Failed'}")
110
+ print(f" 📝 Condition: {extracted_condition}")
111
+ print(f" 🎯 Status: {query_status}")
112
+ print(f" ⏱️ Time: {extraction_time:.3f}s")
113
+ print(f" 🔄 Fallback Level: {fallback_level}")
114
+
115
+ return result
116
+
117
+ except Exception as e:
118
+ error_result = {
119
+ "query": query,
120
+ "category": category,
121
+ "extraction_success": False,
122
+ "extraction_time": 0.0,
123
+ "error": str(e),
124
+ "timestamp": datetime.now().isoformat()
125
+ }
126
+
127
+ self.extraction_results.append(error_result)
128
+ print(f" ❌ Extraction failed: {e}")
129
+
130
+ return error_result
131
+
132
+ def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
133
+ """Parse queries from file with category labels"""
134
+ print(f"📁 Reading queries from file: {filepath}")
135
+
136
+ try:
137
+ with open(filepath, 'r', encoding='utf-8') as f:
138
+ content = f.read()
139
+
140
+ # Parse queries with category labels
141
+ queries_by_category = {
142
+ "diagnosis": [],
143
+ "treatment": [],
144
+ "mixed": []
145
+ }
146
+
147
+ lines = content.strip().split('\n')
148
+
149
+ for line in lines:
150
+ line = line.strip()
151
+ if not line:
152
+ continue
153
+
154
+ # Parse format: "1.diagnosis: query text"
155
+ match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
156
+ if match:
157
+ category_raw = match.group(1).lower()
158
+ query_text = match.group(2).strip()
159
+
160
+ # Normalize category name
161
+ if category_raw in ['mixed/complicated', 'mixed']:
162
+ category = 'mixed'
163
+ else:
164
+ category = category_raw
165
+
166
+ if category in queries_by_category and len(query_text) > 15:
167
+ queries_by_category[category].append({
168
+ "text": query_text,
169
+ "category": category
170
+ })
171
+
172
+ print(f"📋 Parsed queries by category:")
173
+ for category, category_queries in queries_by_category.items():
174
+ print(f" {category.capitalize()}: {len(category_queries)} queries")
175
+
176
+ return queries_by_category
177
+
178
+ except Exception as e:
179
+ print(f"❌ Failed to read file: {e}")
180
+ return {"error": f"Failed to read file: {e}"}
181
+
182
+ def calculate_extraction_statistics(self) -> Dict[str, Any]:
183
+ """Calculate extraction success statistics by category"""
184
+ category_stats = {}
185
+ all_results = []
186
+
187
+ # Group results by category
188
+ results_by_category = {
189
+ "diagnosis": [],
190
+ "treatment": [],
191
+ "mixed": []
192
+ }
193
+
194
+ for result in self.extraction_results:
195
+ category = result.get('category', 'unknown')
196
+ if category in results_by_category:
197
+ results_by_category[category].append(result)
198
+ all_results.append(result)
199
+
200
+ # Calculate statistics for each category
201
+ for category, results in results_by_category.items():
202
+ if results:
203
+ successful = [r for r in results if r.get('extraction_success')]
204
+ success_rate = len(successful) / len(results)
205
+ avg_time = sum(r.get('extraction_time', 0) for r in results) / len(results)
206
+
207
+ category_stats[category] = {
208
+ "success_rate": success_rate,
209
+ "successful_count": len(successful),
210
+ "total_count": len(results),
211
+ "average_extraction_time": avg_time,
212
+ "fallback_levels": [r.get('fallback_level') for r in results]
213
+ }
214
+ else:
215
+ category_stats[category] = {
216
+ "success_rate": 0.0,
217
+ "successful_count": 0,
218
+ "total_count": 0,
219
+ "average_extraction_time": 0.0,
220
+ "fallback_levels": []
221
+ }
222
+
223
+ # Calculate overall statistics
224
+ if all_results:
225
+ overall_successful = [r for r in all_results if r.get('extraction_success')]
226
+ overall_stats = {
227
+ "success_rate": len(overall_successful) / len(all_results),
228
+ "successful_count": len(overall_successful),
229
+ "total_count": len(all_results),
230
+ "average_extraction_time": sum(r.get('extraction_time', 0) for r in all_results) / len(all_results),
231
+ "target_compliance": len(overall_successful) / len(all_results) >= 0.8
232
+ }
233
+ else:
234
+ overall_stats = {
235
+ "success_rate": 0.0,
236
+ "successful_count": 0,
237
+ "total_count": 0,
238
+ "average_extraction_time": 0.0,
239
+ "target_compliance": False
240
+ }
241
+
242
+ return {
243
+ "category_results": category_stats,
244
+ "overall_results": overall_stats,
245
+ "timestamp": datetime.now().isoformat()
246
+ }
247
+
248
+ def save_extraction_statistics(self, filename: str = None) -> str:
249
+ """Save extraction statistics for chart generation"""
250
+ stats = self.calculate_extraction_statistics()
251
+
252
+ if filename is None:
253
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
254
+ filename = f"extraction_statistics_{timestamp}.json"
255
+
256
+ # Ensure results directory exists
257
+ results_dir = Path(__file__).parent / "results"
258
+ results_dir.mkdir(exist_ok=True)
259
+
260
+ filepath = results_dir / filename
261
+
262
+ with open(filepath, 'w', encoding='utf-8') as f:
263
+ json.dump(stats, f, indent=2, ensure_ascii=False)
264
+
265
+ print(f"📊 Extraction statistics saved to: {filepath}")
266
+ return str(filepath)
267
+
268
+ def save_extraction_details(self, filename: str = None) -> str:
269
+ """Save detailed extraction results"""
270
+ if filename is None:
271
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
272
+ filename = f"extraction_details_{timestamp}.json"
273
+
274
+ # Ensure results directory exists
275
+ results_dir = Path(__file__).parent / "results"
276
+ results_dir.mkdir(exist_ok=True)
277
+
278
+ filepath = results_dir / filename
279
+
280
+ # Create comprehensive extraction data
281
+ extraction_data = {
282
+ "evaluation_metadata": {
283
+ "total_queries": len(self.extraction_results),
284
+ "timestamp": datetime.now().isoformat(),
285
+ "evaluator_type": "condition_extraction"
286
+ },
287
+ "extraction_results": self.extraction_results
288
+ }
289
+
290
+ with open(filepath, 'w', encoding='utf-8') as f:
291
+ json.dump(extraction_data, f, indent=2, ensure_ascii=False)
292
+
293
+ print(f"📝 Extraction details saved to: {filepath}")
294
+ return str(filepath)
295
+
296
+
297
+ # Independent execution interface
298
+ if __name__ == "__main__":
299
+ """Independent extraction evaluation interface"""
300
+
301
+ print("🔍 OnCall.ai Extraction Evaluator - Condition Extraction Success Rate")
302
+
303
+ if len(sys.argv) > 1:
304
+ query_file = sys.argv[1]
305
+ else:
306
+ # Default to evaluation/pre_user_query_evaluate.txt
307
+ query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"
308
+
309
+ if not os.path.exists(query_file):
310
+ print(f"❌ Query file not found: {query_file}")
311
+ print("Usage: python extraction_evaluator.py [query_file.txt]")
312
+ sys.exit(1)
313
+
314
+ # Initialize evaluator
315
+ evaluator = ExtractionEvaluator()
316
+
317
+ # Parse queries from file
318
+ queries_by_category = evaluator.parse_queries_from_file(str(query_file))
319
+
320
+ if "error" in queries_by_category:
321
+ print(f"❌ Failed to parse queries: {queries_by_category['error']}")
322
+ sys.exit(1)
323
+
324
+ # Test extraction for each query
325
+ print(f"\n🧪 Condition Extraction Testing")
326
+
327
+ for category, queries in queries_by_category.items():
328
+ if not queries:
329
+ continue
330
+
331
+ print(f"\n📂 Testing {category.upper()} extraction:")
332
+
333
+ for i, query_info in enumerate(queries):
334
+ query_text = query_info['text']
335
+
336
+ # Test extraction
337
+ result = evaluator.evaluate_single_extraction(query_text, category)
338
+
339
+ # Pause between queries to avoid rate limits (if needed)
340
+ if i < len(queries) - 1:
341
+ print(f" ⏳ Pausing 3s before next query...")
342
+ import time
343
+ time.sleep(3)
344
+
345
+ # Pause between categories
346
+ if category != list(queries_by_category.keys())[-1]:
347
+ print(f"\n⏳ Pausing 5s before next category...")
348
+ import time
349
+ time.sleep(5)
350
+
351
+ # Generate and save results
352
+ print(f"\n📊 Generating extraction analysis...")
353
+
354
+ # Save statistics and details
355
+ stats_path = evaluator.save_extraction_statistics()
356
+ details_path = evaluator.save_extraction_details()
357
+
358
+ # Print final summary
359
+ stats = evaluator.calculate_extraction_statistics()
360
+ category_results = stats['category_results']
361
+ overall_results = stats['overall_results']
362
+
363
+ print(f"\n📊 === EXTRACTION EVALUATION SUMMARY ===")
364
+ print(f"Overall Performance:")
365
+ print(f" Success Rate: {overall_results['success_rate']:.1%}")
366
+ print(f" Successful Extractions: {overall_results['successful_count']}/{overall_results['total_count']}")
367
+ print(f" Average Extraction Time: {overall_results['average_extraction_time']:.3f}s")
368
+ print(f" 80% Target Compliance: {'✅ Met' if overall_results['target_compliance'] else '❌ Not Met'}")
369
+
370
+ print(f"\nCategory Breakdown:")
371
+ for category, cat_stats in category_results.items():
372
+ if cat_stats['total_count'] > 0:
373
+ print(f" {category.capitalize()}: {cat_stats['success_rate']:.1%} "
374
+ f"({cat_stats['successful_count']}/{cat_stats['total_count']}) "
375
+ f"[{cat_stats['average_extraction_time']:.3f}s avg]")
376
+
377
+ print(f"\n✅ Extraction evaluation complete!")
378
+ print(f"📊 Statistics: {stats_path}")
379
+ print(f"📝 Details: {details_path}")
evaluation/old/relevance_evaluator.py ADDED
@@ -0,0 +1,447 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Retrieval Relevance Evaluator (Metric 3)
4
+ ===========================================================
5
+
6
+ Evaluates retrieval relevance using cosine similarity from retrieval.py
7
+ Automatic evaluation based on existing similarity scores with optional LLM sampling
8
+
9
+ Author: YanBo Chen
10
+ Date: 2025-08-04
11
+ """
12
+
13
+ import json
14
+ import os
15
+ import sys
16
+ from typing import Dict, List, Any
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ import re
20
+ import numpy as np
21
+
22
+ # Add project path
23
+ current_dir = Path(__file__).parent
24
+ project_root = current_dir.parent
25
+ src_dir = project_root / "src"
26
+ sys.path.insert(0, str(src_dir))
27
+
28
+ # Import existing system components
29
+ try:
30
+ from user_prompt import UserPromptProcessor
31
+ from retrieval import BasicRetrievalSystem
32
+ from llm_clients import llm_Med42_70BClient
33
+ except ImportError as e:
34
+ print(f"❌ Import failed: {e}")
35
+ print("Please ensure running from project root directory")
36
+ sys.exit(1)
37
+
38
+
39
+ class RelevanceEvaluator:
40
+ """Retrieval relevance evaluator using cosine similarity - automatic evaluation"""
41
+
42
+ def __init__(self):
43
+ """Initialize system components for relevance testing"""
44
+ print("🔧 Initializing Relevance Evaluator...")
45
+
46
+ # Initialize required components
47
+ self.llm_client = llm_Med42_70BClient()
48
+ self.retrieval_system = BasicRetrievalSystem()
49
+ self.user_prompt_processor = UserPromptProcessor(
50
+ llm_client=self.llm_client,
51
+ retrieval_system=self.retrieval_system
52
+ )
53
+
54
+ # Results accumulation
55
+ self.relevance_results = []
56
+
57
+ print("✅ Relevance Evaluator initialization complete")
58
+
59
+ def evaluate_single_relevance(self, query: str, category: str = "unknown") -> Dict[str, Any]:
60
+ """
61
+ Evaluate retrieval relevance for a single query
62
+
63
+ Uses existing cosine similarity scores from retrieval.py
64
+
65
+ Args:
66
+ query: Medical query to test
67
+ category: Query category (diagnosis/treatment/mixed)
68
+ """
69
+ print(f"🔍 Testing relevance for: {query[:50]}...")
70
+ print(f"📋 Category: {category}")
71
+
72
+ try:
73
+ # Step 1: Extract condition for search query construction
74
+ condition_result = self.user_prompt_processor.extract_condition_keywords(query)
75
+
76
+ # Step 2: Perform retrieval (same as latency_evaluator.py)
77
+ search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
78
+ if not search_query:
79
+ search_query = condition_result.get('condition', query)
80
+
81
+ retrieval_start = datetime.now()
82
+ retrieval_results = self.retrieval_system.search(search_query, top_k=5)
83
+ retrieval_time = (datetime.now() - retrieval_start).total_seconds()
84
+
85
+ # Step 3: Extract similarity scores from retrieval results
86
+ processed_results = retrieval_results.get('processed_results', [])
87
+
88
+ if not processed_results:
89
+ result = {
90
+ "query": query,
91
+ "category": category,
92
+ "search_query": search_query,
93
+ "retrieval_success": False,
94
+ "average_relevance": 0.0,
95
+ "relevance_scores": [],
96
+ "retrieved_count": 0,
97
+ "retrieval_time": retrieval_time,
98
+ "error": "No retrieval results",
99
+ "timestamp": datetime.now().isoformat()
100
+ }
101
+
102
+ self.relevance_results.append(result)
103
+ print(f" ❌ No retrieval results found")
104
+ return result
105
+
106
+ # Extract cosine similarity scores
107
+ similarity_scores = []
108
+ retrieval_details = []
109
+
110
+ for i, doc_result in enumerate(processed_results):
111
+ # Get similarity score (may be stored as 'distance', 'similarity_score', or 'score')
112
+ similarity = (
113
+ doc_result.get('distance', 0.0) or
114
+ doc_result.get('similarity_score', 0.0) or
115
+ doc_result.get('score', 0.0)
116
+ )
117
+
118
+ similarity_scores.append(similarity)
119
+
120
+ retrieval_details.append({
121
+ "doc_index": i,
122
+ "similarity_score": similarity,
123
+ "content_snippet": doc_result.get('content', '')[:100] + "...",
124
+ "doc_type": doc_result.get('type', 'unknown'),
125
+ "source": doc_result.get('source', 'unknown')
126
+ })
127
+
128
+ # Calculate relevance metrics
129
+ average_relevance = sum(similarity_scores) / len(similarity_scores)
130
+ max_relevance = max(similarity_scores)
131
+ min_relevance = min(similarity_scores)
132
+
133
+ # Count high-relevance results (threshold: 0.2 based on evaluation_instruction.md)
134
+ high_relevance_count = sum(1 for score in similarity_scores if score >= 0.2)
135
+ high_relevance_ratio = high_relevance_count / len(similarity_scores)
136
+
137
+ result = {
138
+ "query": query,
139
+ "category": category,
140
+ "search_query": search_query,
141
+ "retrieval_success": True,
142
+ "average_relevance": average_relevance,
143
+ "max_relevance": max_relevance,
144
+ "min_relevance": min_relevance,
145
+ "relevance_scores": similarity_scores,
146
+ "high_relevance_count": high_relevance_count,
147
+ "high_relevance_ratio": high_relevance_ratio,
148
+ "retrieved_count": len(processed_results),
149
+ "retrieval_time": retrieval_time,
150
+ "retrieval_details": retrieval_details,
151
+ "meets_threshold": average_relevance >= 0.2,
152
+ "timestamp": datetime.now().isoformat()
153
+ }
154
+
155
+ # Store result
156
+ self.relevance_results.append(result)
157
+
158
+ print(f" ✅ Retrieval: {len(processed_results)} documents")
159
+ print(f" 📊 Average Relevance: {average_relevance:.3f}")
160
+ print(f" 📈 High Relevance (≥0.2): {high_relevance_count}/{len(processed_results)} ({high_relevance_ratio:.1%})")
161
+ print(f" 🎯 Threshold: {'✅ Met' if result['meets_threshold'] else '❌ Not Met'}")
162
+ print(f" ⏱️ Retrieval Time: {retrieval_time:.3f}s")
163
+
164
+ return result
165
+
166
+ except Exception as e:
167
+ error_result = {
168
+ "query": query,
169
+ "category": category,
170
+ "retrieval_success": False,
171
+ "average_relevance": 0.0,
172
+ "error": str(e),
173
+ "timestamp": datetime.now().isoformat()
174
+ }
175
+
176
+ self.relevance_results.append(error_result)
177
+ print(f" ❌ Relevance evaluation failed: {e}")
178
+
179
+ return error_result
180
+
181
+ def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
182
+ """Parse queries from file with category labels"""
183
+ print(f"📁 Reading queries from file: {filepath}")
184
+
185
+ try:
186
+ with open(filepath, 'r', encoding='utf-8') as f:
187
+ content = f.read()
188
+
189
+ # Parse queries with category labels
190
+ queries_by_category = {
191
+ "diagnosis": [],
192
+ "treatment": [],
193
+ "mixed": []
194
+ }
195
+
196
+ lines = content.strip().split('\n')
197
+
198
+ for line in lines:
199
+ line = line.strip()
200
+ if not line:
201
+ continue
202
+
203
+ # Parse format: "1.diagnosis: query text"
204
+ match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
205
+ if match:
206
+ category_raw = match.group(1).lower()
207
+ query_text = match.group(2).strip()
208
+
209
+ # Normalize category name
210
+ if category_raw in ['mixed/complicated', 'mixed']:
211
+ category = 'mixed'
212
+ else:
213
+ category = category_raw
214
+
215
+ if category in queries_by_category and len(query_text) > 15:
216
+ queries_by_category[category].append({
217
+ "text": query_text,
218
+ "category": category
219
+ })
220
+
221
+ print(f"📋 Parsed queries by category:")
222
+ for category, category_queries in queries_by_category.items():
223
+ print(f" {category.capitalize()}: {len(category_queries)} queries")
224
+
225
+ return queries_by_category
226
+
227
+ except Exception as e:
228
+ print(f"❌ Failed to read file: {e}")
229
+ return {"error": f"Failed to read file: {e}"}
230
+
231
+ def calculate_relevance_statistics(self) -> Dict[str, Any]:
232
+ """Calculate relevance statistics by category"""
233
+ category_stats = {}
234
+ all_successful_results = []
235
+
236
+ # Group results by category
237
+ results_by_category = {
238
+ "diagnosis": [],
239
+ "treatment": [],
240
+ "mixed": []
241
+ }
242
+
243
+ for result in self.relevance_results:
244
+ category = result.get('category', 'unknown')
245
+ if category in results_by_category:
246
+ results_by_category[category].append(result)
247
+ if result.get('retrieval_success'):
248
+ all_successful_results.append(result)
249
+
250
+ # Calculate statistics for each category
251
+ for category, results in results_by_category.items():
252
+ successful_results = [r for r in results if r.get('retrieval_success')]
253
+
254
+ if successful_results:
255
+ avg_relevance = sum(r['average_relevance'] for r in successful_results) / len(successful_results)
256
+ relevance_scores = [r['average_relevance'] for r in successful_results]
257
+ avg_retrieval_time = sum(r.get('retrieval_time', 0) for r in successful_results) / len(successful_results)
258
+
259
+ category_stats[category] = {
260
+ "average_relevance": avg_relevance,
261
+ "max_relevance": max(relevance_scores),
262
+ "min_relevance": min(relevance_scores),
263
+ "successful_retrievals": len(successful_results),
264
+ "total_queries": len(results),
265
+ "success_rate": len(successful_results) / len(results),
266
+ "average_retrieval_time": avg_retrieval_time,
267
+ "meets_threshold": avg_relevance >= 0.2,
268
+ "individual_relevance_scores": relevance_scores
269
+ }
270
+ else:
271
+ category_stats[category] = {
272
+ "average_relevance": 0.0,
273
+ "max_relevance": 0.0,
274
+ "min_relevance": 0.0,
275
+ "successful_retrievals": 0,
276
+ "total_queries": len(results),
277
+ "success_rate": 0.0,
278
+ "average_retrieval_time": 0.0,
279
+ "meets_threshold": False,
280
+ "individual_relevance_scores": []
281
+ }
282
+
283
+ # Calculate overall statistics
284
+ if all_successful_results:
285
+ all_relevance_scores = [r['average_relevance'] for r in all_successful_results]
286
+ overall_stats = {
287
+ "average_relevance": sum(all_relevance_scores) / len(all_relevance_scores),
288
+ "max_relevance": max(all_relevance_scores),
289
+ "min_relevance": min(all_relevance_scores),
290
+ "successful_retrievals": len(all_successful_results),
291
+ "total_queries": len(self.relevance_results),
292
+ "success_rate": len(all_successful_results) / len(self.relevance_results),
293
+ "meets_threshold": (sum(all_relevance_scores) / len(all_relevance_scores)) >= 0.2,
294
+ "target_compliance": (sum(all_relevance_scores) / len(all_relevance_scores)) >= 0.25
295
+ }
296
+ else:
297
+ overall_stats = {
298
+ "average_relevance": 0.0,
299
+ "max_relevance": 0.0,
300
+ "min_relevance": 0.0,
301
+ "successful_retrievals": 0,
302
+ "total_queries": len(self.relevance_results),
303
+ "success_rate": 0.0,
304
+ "meets_threshold": False,
305
+ "target_compliance": False
306
+ }
307
+
308
+ return {
309
+ "category_results": category_stats,
310
+ "overall_results": overall_stats,
311
+ "timestamp": datetime.now().isoformat()
312
+ }
313
+
314
+ def save_relevance_statistics(self, filename: str = None) -> str:
315
+ """Save relevance statistics for chart generation"""
316
+ stats = self.calculate_relevance_statistics()
317
+
318
+ if filename is None:
319
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
320
+ filename = f"relevance_statistics_{timestamp}.json"
321
+
322
+ # Ensure results directory exists
323
+ results_dir = Path(__file__).parent / "results"
324
+ results_dir.mkdir(exist_ok=True)
325
+
326
+ filepath = results_dir / filename
327
+
328
+ with open(filepath, 'w', encoding='utf-8') as f:
329
+ json.dump(stats, f, indent=2, ensure_ascii=False)
330
+
331
+ print(f"📊 Relevance statistics saved to: {filepath}")
332
+ return str(filepath)
333
+
334
+ def save_relevance_details(self, filename: str = None) -> str:
335
+ """Save detailed relevance results"""
336
+ if filename is None:
337
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
338
+ filename = f"relevance_details_{timestamp}.json"
339
+
340
+ # Ensure results directory exists
341
+ results_dir = Path(__file__).parent / "results"
342
+ results_dir.mkdir(exist_ok=True)
343
+
344
+ filepath = results_dir / filename
345
+
346
+ # Create comprehensive relevance data
347
+ relevance_data = {
348
+ "evaluation_metadata": {
349
+ "total_queries": len(self.relevance_results),
350
+ "successful_retrievals": len([r for r in self.relevance_results if r.get('retrieval_success')]),
351
+ "timestamp": datetime.now().isoformat(),
352
+ "evaluator_type": "retrieval_relevance",
353
+ "threshold_used": 0.2
354
+ },
355
+ "relevance_results": self.relevance_results
356
+ }
357
+
358
+ with open(filepath, 'w', encoding='utf-8') as f:
359
+ json.dump(relevance_data, f, indent=2, ensure_ascii=False)
360
+
361
+ print(f"📝 Relevance details saved to: {filepath}")
362
+ return str(filepath)
363
+
364
+
365
+ # Independent execution interface
366
+ if __name__ == "__main__":
367
+ """Independent relevance evaluation interface"""
368
+
369
+ print("📊 OnCall.ai Relevance Evaluator - Retrieval Relevance Analysis")
370
+
371
+ if len(sys.argv) > 1:
372
+ query_file = sys.argv[1]
373
+ else:
374
+ # Default to evaluation/pre_user_query_evaluate.txt
375
+ query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"
376
+
377
+ if not os.path.exists(query_file):
378
+ print(f"❌ Query file not found: {query_file}")
379
+ print("Usage: python relevance_evaluator.py [query_file.txt]")
380
+ sys.exit(1)
381
+
382
+ # Initialize evaluator
383
+ evaluator = RelevanceEvaluator()
384
+
385
+ # Parse queries from file
386
+ queries_by_category = evaluator.parse_queries_from_file(str(query_file))
387
+
388
+ if "error" in queries_by_category:
389
+ print(f"❌ Failed to parse queries: {queries_by_category['error']}")
390
+ sys.exit(1)
391
+
392
+ # Test relevance for each query
393
+ print(f"\n🧪 Retrieval Relevance Testing")
394
+
395
+ for category, queries in queries_by_category.items():
396
+ if not queries:
397
+ continue
398
+
399
+ print(f"\n📂 Testing {category.upper()} relevance:")
400
+
401
+ for i, query_info in enumerate(queries):
402
+ query_text = query_info['text']
403
+
404
+ # Test relevance
405
+ result = evaluator.evaluate_single_relevance(query_text, category)
406
+
407
+ # Pause between queries to avoid rate limits
408
+ if i < len(queries) - 1:
409
+ print(f" ⏳ Pausing 3s before next query...")
410
+ import time
411
+ time.sleep(3)
412
+
413
+ # Pause between categories
414
+ if category != list(queries_by_category.keys())[-1]:
415
+ print(f"\n⏳ Pausing 5s before next category...")
416
+ import time
417
+ time.sleep(5)
418
+
419
+ # Generate and save results
420
+ print(f"\n📊 Generating relevance analysis...")
421
+
422
+ # Save statistics and details
423
+ stats_path = evaluator.save_relevance_statistics()
424
+ details_path = evaluator.save_relevance_details()
425
+
426
+ # Print final summary
427
+ stats = evaluator.calculate_relevance_statistics()
428
+ category_results = stats['category_results']
429
+ overall_results = stats['overall_results']
430
+
431
+ print(f"\n📊 === RELEVANCE EVALUATION SUMMARY ===")
432
+ print(f"Overall Performance:")
433
+ print(f" Average Relevance: {overall_results['average_relevance']:.3f}")
434
+ print(f" Retrieval Success Rate: {overall_results['success_rate']:.1%}")
435
+ print(f" 0.2 Threshold: {'✅ Met' if overall_results['meets_threshold'] else '❌ Not Met'}")
436
+ print(f" 0.25 Target: {'✅ Met' if overall_results['target_compliance'] else '❌ Not Met'}")
437
+
438
+ print(f"\nCategory Breakdown:")
439
+ for category, cat_stats in category_results.items():
440
+ if cat_stats['total_queries'] > 0:
441
+ print(f" {category.capitalize()}: {cat_stats['average_relevance']:.3f} "
442
+ f"({cat_stats['successful_retrievals']}/{cat_stats['total_queries']}) "
443
+ f"[{cat_stats['average_retrieval_time']:.3f}s avg]")
444
+
445
+ print(f"\n✅ Relevance evaluation complete!")
446
+ print(f"📊 Statistics: {stats_path}")
447
+ print(f"📝 Details: {details_path}")
evaluation/pre_user_query_evaluate.txt ADDED
@@ -0,0 +1,5 @@
1
+ 1.diagnosis: 60-year-old patient with hypertension history, sudden chest pain. What are possible causes and how to assess?
2
+
3
+ 2.treatment: Suspected acute ischemic stroke. Tell me the next steps to take
4
+
5
+ 3.mixed/complicated: 20 y/f , porphyria, sudden seizure. What are possible causes and complete management workflow?
evaluation/single_test_query.txt ADDED
@@ -0,0 +1 @@
1
+ 1.diagnosis: 60-year-old patient with hypertension history, sudden chest pain. What are possible causes and how to assess?
evaluation/user_query.txt CHANGED
@@ -17,18 +17,18 @@ Suspected acute ischemic stroke. Tell me the next steps to take
17
 
18
  ### 一、Diagnosis-Focused(診斷為主)
19
 
20
- 1. I have a 68-year-old man with atrial fibrillation presenting with sudden slurred speech and right-sided weakness—what are the possible diagnoses, and how would you evaluate them?
21
- 2. A 40-year-old woman reports fever, urinary frequency, and dysuria—what differential diagnoses should I consider, and which tests would you order?
22
- 3. A 50-year-old patient has progressive dyspnea on exertion and orthopnea over two weeks—what are the likely causes, and what diagnostic steps should I take?
23
 
24
  ### 二、Treatment-Focused(治療為主)
25
 
26
- 4. ECG shows a suspected acute STEMI—what immediate interventions should I initiate in the next five minutes?
27
- 5. I have a patient diagnosed with bacterial meningitis—what empiric antibiotic regimen and supportive measures should I implement?
28
  6. A patient is in septic shock with BP 80/50 mmHg and HR 120 bpm—what fluid resuscitation and vasopressor strategy would you recommend?
29
 
30
  ### 三、Mixed(診斷+治療綜合)
31
 
32
  7. A 75-year-old diabetic presents with a non-healing foot ulcer and fever—what differential for osteomyelitis, diagnostic workup, and management plan do you suggest?
33
- 8. A 60-year-old COPD patient has worsening dyspnea and hypercapnia on ABG—how would you confirm the diagnosis, and what is your stepwise treatment approach?
34
- 9. A 28-year-old woman is experiencing postpartum hemorrhage—what are the possible causes, what immediate resuscitation steps should I take, and how would you proceed with definitive management?
 
17
 
18
  ### 一、Diagnosis-Focused(診斷為主)
19
 
20
+ 1. I have a 68-year-old man with atrial fibrillation presenting with sudden slurred speech and right-sided weakness. what are the possible diagnoses, and how would you evaluate them?
21
+ 2. A 40-year-old woman reports fever, urinary frequency, and dysuria. what differential diagnoses should I consider, and which tests would you order?
22
+ 3. A 50-year-old patient has progressive dyspnea on exertion and orthopnea over two weeks. what are the likely causes, and what diagnostic steps should I take?
23
 
24
  ### 二、Treatment-Focused(治療為主)
25
 
26
+ 4. ECG shows a suspected acute STEMI. what immediate interventions should I initiate in the next five minutes?
27
+ 5. I have a patient diagnosed with bacterial meningitis. What empiric antibiotic regimen and supportive measures should I implement?
28
  6. A patient is in septic shock with BP 80/50 mmHg and HR 120 bpm—what fluid resuscitation and vasopressor strategy would you recommend?
29
 
30
  ### 三、Mixed(診斷+治療綜合)
31
 
32
  7. A 75-year-old diabetic presents with a non-healing foot ulcer and fever—what differential for osteomyelitis, diagnostic workup, and management plan do you suggest?
33
+ 8. A 60-year-old COPD patient has worsening dyspnea and hypercapnia on ABG. How would you confirm the diagnosis, and what is your stepwise treatment approach?
34
+ 9. A 28-year-old woman is experiencing postpartum hemorrhage. what are the possible causes, what immediate resuscitation steps should I take, and how would you proceed with definitive management?
src/generation.py CHANGED
@@ -30,7 +30,7 @@ logger = logging.getLogger(__name__)
30
 
31
  # Fallback Generation Configuration (Simplified Architecture)
32
  FALLBACK_TIMEOUTS = {
33
- "primary": 30.0, # Primary Med42-70B with full RAG context
34
  "fallback_1": 1.0, # RAG template generation (renamed from fallback_2)
35
  "fallback_2": 0.1 # Minimal template generation (instant)
36
  }
@@ -277,7 +277,7 @@ class MedicalAdviceGenerator:
277
 
278
  # Sort by relevance (distance) and take top 6
279
  all_chunks_sorted = sorted(all_chunks, key=lambda x: x.get("distance", 999))
280
- selected_chunks = all_chunks_sorted[:6]
281
 
282
  logger.info(f"Selected chunks by relevance (no intention): {len(selected_chunks)} total")
283
 
@@ -308,14 +308,14 @@ class MedicalAdviceGenerator:
308
  # Special formatting for hospital-specific guidelines
309
  source_label = "Hospital Protocol"
310
  context_part = f"""
311
- [Guideline {i}] (Source: {source_label}, Relevance: {1-distance:.3f})
312
- 📋 {chunk.get('matched', 'Hospital Document')}
313
- {chunk_text}
314
  """.strip()
315
  else:
316
  context_part = f"""
317
- [Guideline {i}] (Source: {chunk_type.title()}, Relevance: {1-distance:.3f})
318
- {chunk_text}
319
  """.strip()
320
 
321
  context_parts.append(context_part)
 
30
 
31
  # Fallback Generation Configuration (Simplified Architecture)
32
  FALLBACK_TIMEOUTS = {
33
+ "primary": 60.0, # Primary Med42-70B increased timeout for stable evaluation
34
  "fallback_1": 1.0, # RAG template generation (renamed from fallback_2)
35
  "fallback_2": 0.1 # Minimal template generation (instant)
36
  }
 
277
 
278
  # Sort by relevance (distance) and take top 6
279
  all_chunks_sorted = sorted(all_chunks, key=lambda x: x.get("distance", 999))
280
+ selected_chunks = all_chunks_sorted[:6] # Limit to top 6 most relevant
281
 
282
  logger.info(f"Selected chunks by relevance (no intention): {len(selected_chunks)} total")
283
 
 
308
  # Special formatting for hospital-specific guidelines
309
  source_label = "Hospital Protocol"
310
  context_part = f"""
311
+ [Guideline {i}] (Source: {source_label}, Relevance: {1-distance:.3f})
312
+ 📋 {chunk.get('matched', 'Hospital Document')}
313
+ {chunk_text}
314
  """.strip()
315
  else:
316
  context_part = f"""
317
+ [Guideline {i}] (Source: {chunk_type.title()}, Angular Distance: {distance:.3f})
318
+ {chunk_text}
319
  """.strip()
320
 
321
  context_parts.append(context_part)
src/llm_clients.py CHANGED
@@ -9,6 +9,8 @@ Date: 2025-07-29
9
 
10
  import logging
11
  import os
 
 
12
  from typing import Dict, Optional, Union, List
13
  from huggingface_hub import InferenceClient
14
  from dotenv import load_dotenv
@@ -68,6 +70,91 @@ class llm_Med42_70BClient:
68
  self.logger.error(f"Detailed Error: {repr(e)}")
69
  raise ValueError(f"Failed to initialize Medical LLM client: {str(e)}") from e
70
 
71
  def analyze_medical_query(
72
  self,
73
  query: str,
@@ -138,6 +225,13 @@ class llm_Med42_70BClient:
138
  self.logger.info(f"Raw LLM Response: {response_text}")
139
  self.logger.info(f"Query Latency: {latency:.4f} seconds")
140
 
 
 
 
 
 
 
 
141
  # Detect abnormal response
142
  if self._is_abnormal_response(response_text):
143
  self.logger.error(f"❌ Abnormal LLM response detected: {response_text[:50]}...")
@@ -149,15 +243,12 @@ class llm_Med42_70BClient:
149
  'latency': latency
150
  }
151
 
152
- # Extract condition from response
153
- extracted_condition = self._extract_condition(response_text)
154
-
155
  # Log the extracted condition
156
  self.logger.info(f"Extracted Condition: {extracted_condition}")
157
 
158
  return {
159
  'extracted_condition': extracted_condition,
160
- 'confidence': '0.8',
161
  'raw_response': response_text,
162
  'latency': latency # Add latency to the return dictionary
163
  }
@@ -264,7 +355,7 @@ Focus on: conditions, symptoms, procedures, body systems."""
264
 
265
  def _extract_condition(self, response: str) -> str:
266
  """
267
- Extract medical condition from model response.
268
 
269
  Args:
270
  response: Full model-generated text
@@ -272,18 +363,29 @@ Focus on: conditions, symptoms, procedures, body systems."""
272
  Returns:
273
  Extracted medical condition or empty string if non-medical
274
  """
 
 
275
  # Check if this is a rejection response first
276
  if self._is_rejection_response(response):
277
  return ""
278
 
279
- from medical_conditions import CONDITION_KEYWORD_MAPPING
 
 
 
 
 
 
 
 
 
280
 
281
- # Search in known medical conditions
282
  for condition in CONDITION_KEYWORD_MAPPING.keys():
283
  if condition.lower() in response.lower():
284
  return condition
285
 
286
- return response.split('\n')[0].strip() or ""
287
 
288
  def _is_abnormal_response(self, response: str) -> bool:
289
  """
@@ -439,5 +541,136 @@ def main():
439
  'total_execution_time': total_execution_time
440
  }
441
 
442
  if __name__ == "__main__":
443
  main()
 
9
 
10
  import logging
11
  import os
12
+ import json
13
+ import re
14
  from typing import Dict, Optional, Union, List
15
  from huggingface_hub import InferenceClient
16
  from dotenv import load_dotenv
 
70
  self.logger.error(f"Detailed Error: {repr(e)}")
71
  raise ValueError(f"Failed to initialize Medical LLM client: {str(e)}") from e
72
 
73
+ def fix_json_formatting(self, response_text: str) -> str:
74
+ """
75
+ Fix common JSON formatting errors
76
+
77
+ Args:
78
+ response_text: Raw response text that may contain JSON errors
79
+
80
+ Returns:
81
+ Fixed JSON string
82
+ """
83
+ # 1. Fix missing commas between key-value pairs
84
+ # Look for "value" "key" pattern and add comma
85
+ fixed = re.sub(r'"\s*\n\s*"', '",\n "', response_text)
86
+
87
+ # 2. Fix missing commas between values and keys
88
+ fixed = re.sub(r'"\s*(["\[])', r'",\1', fixed)  # raw string so \1 stays a backreference
89
+
90
+ # 3. Remove trailing commas
91
+ fixed = re.sub(r',\s*}', '}', fixed)
92
+ fixed = re.sub(r',\s*]', ']', fixed)
93
+
94
+ # 4. Ensure string values are properly quoted
95
+ fixed = re.sub(r':\s*([^",{}\[\]]+)\s*([,}])', r': "\1"\2', fixed)
96
+
97
+ return fixed
98
+
99
+ def parse_medical_response(self, response_text: str) -> Dict:
100
+ """
101
+ Enhanced JSON parsing logic with error recovery
102
+
103
+ Args:
104
+ response_text: Raw response text from Med42-70B
105
+
106
+ Returns:
107
+ Parsed response dictionary
108
+ """
109
+ try:
110
+ return json.loads(response_text)
111
+ except json.JSONDecodeError as e:
112
+ self.logger.warning(f"Initial JSON parsing failed: {e}")
113
+
114
+ # Attempt to fix common JSON errors
115
+ try:
116
+ fixed_response = self.fix_json_formatting(response_text)
117
+ self.logger.info("Attempting to parse fixed JSON")
118
+ return json.loads(fixed_response)
119
+ except json.JSONDecodeError as e2:
120
+ self.logger.error(f"Fixed JSON parsing also failed: {e2}")
121
+
122
+ # Try to extract partial information
123
+ try:
124
+ return self.extract_partial_medical_info(response_text)
125
+ except:
126
+ # Final fallback format
127
+ return {
128
+ "extracted_condition": "parsing_error",
129
+ "confidence": "0.0",
130
+ "is_medical": True,
131
+ "raw_response": response_text,
132
+ "error": str(e)
133
+ }
134
+
135
+ def extract_partial_medical_info(self, response_text: str) -> Dict:
136
+ """
137
+ Extract partial medical information from malformed response
138
+
139
+ Args:
140
+ response_text: Malformed response text
141
+
142
+ Returns:
143
+ Dictionary with extracted information
144
+ """
145
+ # Try to extract condition
146
+ condition_match = re.search(r'"extracted_condition":\s*"([^"]*)"', response_text)
147
+ confidence_match = re.search(r'"confidence":\s*"([^"]*)"', response_text)
148
+ medical_match = re.search(r'"is_medical":\s*(true|false)', response_text)
149
+
150
+ return {
151
+ "extracted_condition": condition_match.group(1) if condition_match else "unknown",
152
+ "confidence": confidence_match.group(1) if confidence_match else "0.0",
153
+ "is_medical": medical_match.group(1) == "true" if medical_match else True,
154
+ "raw_response": response_text,
155
+ "parsing_method": "partial_extraction"
156
+ }
157
+
158
  def analyze_medical_query(
159
  self,
160
  query: str,
 
225
  self.logger.info(f"Raw LLM Response: {response_text}")
226
  self.logger.info(f"Query Latency: {latency:.4f} seconds")
227
 
228
+ # Direct text extraction - system prompt expects plain text response
229
+ # Since the system prompt instructs LLM to "Return ONLY the primary condition name",
230
+ # we should directly extract from text instead of attempting JSON parsing
231
+ extracted_condition = self._extract_condition(response_text)
232
+ confidence = '0.8'
233
+ self.logger.info(f"Extracted condition from text: {extracted_condition}")
234
+
235
  # Detect abnormal response
236
  if self._is_abnormal_response(response_text):
237
  self.logger.error(f"❌ Abnormal LLM response detected: {response_text[:50]}...")
 
243
  'latency': latency
244
  }
245
 
 
 
 
246
  # Log the extracted condition
247
  self.logger.info(f"Extracted Condition: {extracted_condition}")
248
 
249
  return {
250
  'extracted_condition': extracted_condition,
251
+ 'confidence': confidence,
252
  'raw_response': response_text,
253
  'latency': latency # Add latency to the return dictionary
254
  }
 

     def _extract_condition(self, response: str) -> str:
         """
+        Extract medical condition from model response with support for multiple formats.

         Args:
             response: Full model-generated text

         Returns:
             Extracted medical condition or empty string if non-medical
         """
+        from medical_conditions import CONDITION_KEYWORD_MAPPING
+
         # Check if this is a rejection response first
         if self._is_rejection_response(response):
             return ""

+        # Try CONDITION: format first (primary format for structured responses)
+        match = re.search(r"CONDITION:\s*(.+)", response, re.IGNORECASE)
+        if not match:
+            # Try Primary condition: format as fallback
+            match = re.search(r"Primary condition:\s*(.+)", response, re.IGNORECASE)
+
+        if match:
+            value = match.group(1).strip()
+            if value.upper() not in ["NONE", "", "UNKNOWN"]:
+                return value

+        # Final fallback to keyword mapping for backward compatibility
         for condition in CONDITION_KEYWORD_MAPPING.keys():
             if condition.lower() in response.lower():
                 return condition

+        return ""

     def _is_abnormal_response(self, response: str) -> bool:
         """

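Reviewer note, not part of the diff: a trimmed, standalone sketch of the two response formats the updated _extract_condition accepts. It omits the rejection check and the keyword-mapping fallback, and the sample responses are hypothetical.

import re

def extract_condition_demo(response: str) -> str:
    """Trimmed-down version of the new logic (no rejection check, no keyword fallback)."""
    match = re.search(r"CONDITION:\s*(.+)", response, re.IGNORECASE)
    if not match:
        match = re.search(r"Primary condition:\s*(.+)", response, re.IGNORECASE)
    if match:
        value = match.group(1).strip()
        if value.upper() not in ["NONE", "", "UNKNOWN"]:
            return value
    return ""

print(extract_condition_demo("MEDICAL: YES\nCONDITION: acute ischemic stroke\nCONFIDENCE: 0.9"))
# -> acute ischemic stroke
print(extract_condition_demo("Primary condition: bacterial meningitis"))
# -> bacterial meningitis
print(extract_condition_demo("CONDITION: NONE"))
# -> "" (the real method would then fall through to the keyword mapping)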
             'total_execution_time': total_execution_time
         }

+
+class llm_Llama3_70B_JudgeClient:
+    """
+    Llama3-70B client specifically for LLM judge evaluation.
+    Used for metrics 5-6 evaluation: Clinical Actionability & Evidence Quality.
+    """
+
+    def __init__(
+        self,
+        model_name: str = "meta-llama/Meta-Llama-3-70B-Instruct",
+        timeout: float = 60.0
+    ):
+        """
+        Initialize Llama3-70B judge client for evaluation tasks.
+
+        Args:
+            model_name: Hugging Face model name for Llama3-70B
+            timeout: API call timeout duration (longer for judge evaluation)
+
+        Note: This client is specifically designed for third-party evaluation,
+        not for medical advice generation.
+        """
+        self.logger = logging.getLogger(__name__)
+        self.timeout = timeout
+        self.model_name = model_name
+
+        # Get Hugging Face token from environment
+        hf_token = os.getenv('HF_TOKEN')
+        if not hf_token:
+            self.logger.error("HF_TOKEN is missing from environment variables.")
+            raise ValueError(
+                "HF_TOKEN not found in environment variables. "
+                "Please set HF_TOKEN in your .env file or environment."
+            )
+
+        # Initialize Hugging Face Inference Client for judge evaluation
+        try:
+            self.client = InferenceClient(
+                provider="auto",
+                api_key=hf_token,
+            )
+            self.logger.info(f"Llama3-70B judge client initialized with model: {model_name}")
+            self.logger.info("Judge LLM: Evaluation tool only. Not for medical advice generation.")
+
+        except Exception as e:
+            self.logger.error(f"Failed to initialize Llama3-70B judge client: {e}")
+            raise
+
+    def generate_completion(self, prompt: str) -> Dict[str, Union[str, float]]:
+        """
+        Generate completion using Llama3-70B for judge evaluation.
+
+        Args:
+            prompt: Evaluation prompt for medical advice assessment
+
+        Returns:
+            Dict containing response content and timing information
+        """
+        import time
+
+        start_time = time.time()
+
+        try:
+            self.logger.info(f"Calling Llama3-70B Judge with evaluation prompt ({len(prompt)} chars)")
+
+            # Call Llama3-70B for judge evaluation
+            completion = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": prompt
+                    }
+                ],
+                max_tokens=2048,  # Sufficient for evaluation responses
+                temperature=0.1,  # Low temperature for consistent evaluation
+            )
+
+            # Extract response content
+            response_content = completion.choices[0].message.content
+
+            end_time = time.time()
+            latency = end_time - start_time
+
+            self.logger.info(f"Llama3-70B Judge Response: {response_content[:100]}...")
+            self.logger.info(f"Judge Evaluation Latency: {latency:.4f} seconds")
+
+            return {
+                'content': response_content,
+                'latency': latency,
+                'model': self.model_name,
+                'timestamp': time.time()
+            }
+
+        except Exception as e:
+            end_time = time.time()
+            error_latency = end_time - start_time
+
+            self.logger.error(f"Llama3-70B judge evaluation failed: {e}")
+            self.logger.error(f"Error occurred after {error_latency:.4f} seconds")
+
+            return {
+                'content': f"Judge evaluation error: {str(e)}",
+                'latency': error_latency,
+                'error': str(e),
+                'model': self.model_name,
+                'timestamp': time.time()
+            }
+
+    def batch_evaluate(self, evaluation_prompt: str) -> Dict[str, Union[str, float]]:
+        """
+        Specialized method for batch evaluation of medical advice.
+        Alias for generate_completion with judge-specific logging.
+
+        Args:
+            evaluation_prompt: Batch evaluation prompt containing multiple queries
+
+        Returns:
+            Dict containing batch evaluation results and timing
+        """
+        self.logger.info("Starting batch judge evaluation...")
+        result = self.generate_completion(evaluation_prompt)
+
+        if 'error' not in result:
+            self.logger.info(f"Batch evaluation completed successfully in {result['latency']:.2f}s")
+        else:
+            self.logger.error(f"Batch evaluation failed: {result.get('error', 'Unknown error')}")
+
+        return result
+
+
 if __name__ == "__main__":
     main()
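Reviewer note, not part of the diff: a minimal usage sketch of the new judge client. It assumes HF_TOKEN is set in the environment and that src/ is on sys.path, as in the evaluation scripts; the evaluation prompt is a placeholder rather than the project's actual metric 5-6 prompt.

from llm_clients import llm_Llama3_70B_JudgeClient

# Requires a valid HF_TOKEN in the environment (normally loaded from .env)
judge = llm_Llama3_70B_JudgeClient(timeout=60.0)

result = judge.batch_evaluate(
    "Rate the following medical advice on clinical actionability (1-10) and "
    "evidence quality (1-10), returning one line per item:\n\n1. <advice text>"
)

# Both success and error paths return 'content', 'latency', and 'model' keys
print(result["model"], f"{result['latency']:.2f}s")
print(result["content"][:200])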
src/medical_conditions.py CHANGED
@@ -63,6 +63,18 @@ CONDITION_KEYWORD_MAPPING: Dict[str, Dict[str, str]] = {
     "seizure disorder": {
         "emergency": "seizure|status epilepticus|postictal state",
         "treatment": "antiepileptic drugs|EEG monitoring|neurology consult"
+    },
+    "postpartum hemorrhage": {
+        "emergency": "postpartum hemorrhage|uterine atony|placental retention|vaginal laceration",
+        "treatment": "uterine massage|IV oxytocin infusion|blood transfusion|surgical intervention"
+    },
+    "bacterial meningitis": {
+        "emergency": "bacterial meningitis|fever|headache|neck stiffness|altered mental status|meningitis|meningeal signs",
+        "treatment": "empiric antibiotics|ceftriaxone|vancomycin|dexamethasone|lumbar puncture"
+    },
+    "anaphylaxis": {
+        "emergency": "anaphylaxis|allergic reaction|airway compromise|hypotension",
+        "treatment": "epinephrine|adrenaline|IV fluids|antihistamine|corticosteroids"
     }
 }

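Reviewer note, not part of the diff: the pipe-separated keyword strings read like regex alternations, so here is an illustrative check of the new anaphylaxis entry under that assumption; the query text is made up.

import re

ANAPHYLAXIS = {
    "emergency": "anaphylaxis|allergic reaction|airway compromise|hypotension",
    "treatment": "epinephrine|adrenaline|IV fluids|antihistamine|corticosteroids",
}

query = "Patient with sudden hypotension and airway compromise after a bee sting"

# Emergency keywords match the presentation; treatment keywords do not appear in the query
print(bool(re.search(ANAPHYLAXIS["emergency"], query, re.IGNORECASE)))  # True
print(bool(re.search(ANAPHYLAXIS["treatment"], query, re.IGNORECASE)))  # False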
src/user_prompt.py CHANGED
@@ -255,13 +255,15 @@ Return ONLY the specified format."""
                 timeout=12.0  # Single call timeout
             )

+            # Get both raw response and extracted condition
+            raw_response = llama_response.get('raw_response', '').strip()
             response_text = llama_response.get('extracted_condition', '').strip()
             logger.info(f"🤖 Combined L2+4 result: {response_text}")

-            # Parse structured response
-            medical_status = self._extract_field(response_text, 'MEDICAL')
-            condition_name = self._extract_field(response_text, 'CONDITION')
-            confidence = self._extract_field(response_text, 'CONFIDENCE')
+            # Parse structured response from raw LLM output (not extracted condition)
+            medical_status = self._extract_field(raw_response, 'MEDICAL')
+            condition_name = self._extract_field(raw_response, 'CONDITION')
+            confidence = self._extract_field(raw_response, 'CONFIDENCE')

             # Non-medical query detection
             if medical_status == 'NO':
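Reviewer note, not part of the diff: why the fields must now be parsed from raw_response rather than extracted_condition. _extract_field is not shown in this hunk, so field_from below is a hypothetical stand-in that reads "KEY: value" lines; the sample strings are illustrative only.

import re

def field_from(text: str, field: str) -> str:
    """Hypothetical equivalent of _extract_field: pull the value after 'FIELD:' on any line."""
    match = re.search(rf"{field}:\s*(.+)", text, re.IGNORECASE)
    return match.group(1).strip() if match else ""

raw_response = "MEDICAL: YES\nCONDITION: anaphylaxis\nCONFIDENCE: 0.9"  # full structured LLM output
extracted_condition = "anaphylaxis"                                     # already reduced to one value

print(field_from(raw_response, "MEDICAL"))         # YES
print(field_from(extracted_condition, "MEDICAL"))  # "" -> parsing the reduced string loses the field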
united_requirements.txt ADDED
@@ -0,0 +1,141 @@
+accelerate==1.9.0
+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.14
+aiosignal==1.4.0
+aiosqlite==0.21.0
+annotated-types==0.7.0
+annoy==1.17.3
+anyio==4.9.0
+attrs==25.3.0
+banks==2.2.0
+beautifulsoup4==4.13.4
+Brotli==1.1.0
+certifi==2025.7.14
+cffi==1.17.1
+charset-normalizer==3.4.2
+click==8.2.1
+colorama==0.4.6
+contourpy==1.3.3
+cryptography==45.0.5
+cycler==0.12.1
+dataclasses-json==0.6.7
+datasets==4.0.0
+Deprecated==1.2.18
+dill==0.3.8
+dirtyjson==1.0.8
+distro==1.9.0
+easyocr==1.7.2
+fastapi==0.116.1
+ffmpy==0.6.1
+filelock==3.18.0
+filetype==1.2.0
+fonttools==4.59.0
+frozenlist==1.7.0
+fsspec==2023.12.2
+gradio==5.38.0
+gradio_client==1.11.0
+greenlet==3.2.3
+griffe==1.9.0
+groovy==0.1.2
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.33.5
+idna==3.10
+imageio==2.37.0
+Jinja2==3.1.6
+jiter==0.10.0
+joblib==1.5.1
+kiwisolver==1.4.8
+lazy_loader==0.4
+llama-index-core==0.12.50
+llama-index-embeddings-huggingface==0.5.5
+llama-index-instrumentation==0.4.0
+llama-index-llms-huggingface==0.5.0
+llama-index-workflows==1.2.0
+lxml==6.0.0
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+marshmallow==3.26.1
+matplotlib==3.10.3
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.6.3
+multiprocess==0.70.16
+mypy_extensions==1.1.0
+nest-asyncio==1.6.0
+networkx==3.5
+ninja==1.11.1.4
+nltk==3.9.1
+numpy==2.2.6
+openai==1.97.0
+opencv-python-headless==4.12.0.88
+orjson==3.11.1
+packaging==25.0
+pandas==2.2.3
+pdfminer.six==20250506
+pdfplumber==0.11.7
+pillow==11.3.0
+platformdirs==4.3.8
+propcache==0.3.2
+psutil==7.0.0
+pyarrow==21.0.0
+pyclipper==1.3.0.post6
+pycparser==2.22
+pydantic==2.11.7
+pydantic_core==2.33.2
+pydub==0.25.1
+Pygments==2.19.2
+PyMuPDF==1.26.3
+pyparsing==3.2.3
+pypdf==5.8.0
+pypdfium2==4.30.0
+python-bidi==0.6.6
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.1
+python-multipart==0.0.20
+pytz==2025.2
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.4
+rich==14.0.0
+ruff==0.12.4
+safehttpx==0.1.6
+safetensors==0.5.3
+scikit-image==0.25.2
+scikit-learn==1.7.1
+scipy==1.16.0
+seaborn==0.13.2
+semantic-version==2.10.0
+sentence-transformers==3.0.1
+shapely==2.1.1
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.7
+SQLAlchemy==2.0.42
+starlette==0.47.2
+sympy==1.14.0
+tenacity==9.1.2
+threadpoolctl==3.6.0
+tifffile==2025.6.11
+tiktoken==0.10.0
+tokenizers==0.21.2
+tomlkit==0.13.3
+torch==2.7.1
+torchvision==0.22.1
+tqdm==4.67.1
+transformers==4.53.2
+typer==0.16.0
+typing-inspect==0.9.0
+typing-inspection==0.4.1
+typing_extensions==4.14.1
+tzdata==2025.2
+urllib3==2.5.0
+uvicorn==0.35.0
+websockets==15.0.1
+wrapt==1.17.2
+xxhash==3.5.0
+yarl==1.20.1