YanBoChen committed on
Commit
3e2ffcb
·
1 Parent(s): 9e4c1bc

Add latency and relevance evaluators for medical query analysis (evaluation)


- Implemented LatencyEvaluator to measure end-to-end processing time for individual medical queries, with per-step timing for each stage of the pipeline.
- Created a pre-defined query file (pre_user_query_evaluate.txt) with sample queries labeled as diagnosis, treatment, or mixed (line format illustrated below).
- Developed RelevanceEvaluator to assess retrieval relevance automatically from the cosine-similarity scores already produced by the retrieval system.
- Added functionality to parse labeled queries from a file and generate comprehensive statistics for both latency and relevance evaluations.
- Included methods to save results and statistics as JSON for further analysis and visualization.
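
All evaluators in this commit read the same pre-defined query file. Each line carries an index, a category label (diagnosis, treatment, or mixed/complicated), and the query text; lines shorter than about 15 characters are ignored by the parser. The queries below are illustrative placeholders, not the actual contents of pre_user_query_evaluate.txt:

1.diagnosis: 60-year-old patient with sudden chest pain radiating to the left arm, what are the possible causes?
2.treatment: How should acute ischemic stroke be managed within the first three hours of symptom onset?
3.mixed/complicated: Patient with fever, neck stiffness and confusion - what is the likely diagnosis and the immediate management?

Each evaluator can also be run on its own against this file, e.g. python evaluation/latency_evaluator.py evaluation/pre_user_query_evaluate.txt from the project root.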

evaluation/coverage_evaluator.py ADDED
@@ -0,0 +1,560 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Retrieval Coverage Evaluator (Metric 4)
4
+ ==========================================================
5
+
6
+ Evaluates how well generated medical advice utilizes retrieved content
7
+ Automatic evaluation using keyword overlap analysis with optional LLM sampling
8
+
9
+ Author: YanBo Chen
10
+ Date: 2025-08-04
11
+ """
12
+
13
+ import json
14
+ import os
15
+ import sys
16
+ from typing import Dict, List, Any, Set
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ import re
20
+
21
+ # Add project path
22
+ current_dir = Path(__file__).parent
23
+ project_root = current_dir.parent
24
+ src_dir = project_root / "src"
25
+ sys.path.insert(0, str(src_dir))
26
+
27
+ # Import existing system components
28
+ try:
29
+ from user_prompt import UserPromptProcessor
30
+ from retrieval import BasicRetrievalSystem
31
+ from llm_clients import llm_Med42_70BClient
32
+ from generation import MedicalAdviceGenerator
33
+ except ImportError as e:
34
+ print(f"❌ Import failed: {e}")
35
+ print("Please ensure running from project root directory")
36
+ sys.exit(1)
37
+
38
+
39
+ class CoverageEvaluator:
40
+ """Retrieval coverage evaluator using keyword overlap analysis"""
41
+
42
+ def __init__(self):
43
+ """Initialize system components for coverage testing"""
44
+ print("πŸ”§ Initializing Coverage Evaluator...")
45
+
46
+ # Initialize full pipeline components (needed for advice generation)
47
+ self.llm_client = llm_Med42_70BClient()
48
+ self.retrieval_system = BasicRetrievalSystem()
49
+ self.user_prompt_processor = UserPromptProcessor(
50
+ llm_client=self.llm_client,
51
+ retrieval_system=self.retrieval_system
52
+ )
53
+ self.medical_generator = MedicalAdviceGenerator(llm_client=self.llm_client)
54
+
55
+ # Results accumulation
56
+ self.coverage_results = []
57
+
58
+ print("βœ… Coverage Evaluator initialization complete")
59
+
60
+ def extract_medical_keywords(self, text: str) -> Set[str]:
61
+ """
62
+ Extract medical keywords from text for coverage analysis
63
+
64
+ Uses medical terminology patterns and common medical terms
65
+ """
66
+ if not text:
67
+ return set()
68
+
69
+ medical_keywords = set()
70
+ text_lower = text.lower()
71
+
72
+ # Medical terminology patterns
73
+ patterns = [
74
+ r'\b[a-z]+(?:osis|itis|pathy|emia|uria|gram|scopy)\b', # Medical suffixes
75
+ r'\b(?:cardio|neuro|pulmo|gastro|hepato|nephro)[a-z]+\b', # Medical prefixes
76
+ r'\b(?:diagnosis|treatment|therapy|intervention|management)\b', # Medical actions
77
+ r'\b(?:patient|symptom|condition|disease|disorder|syndrome)\b', # Medical entities
78
+ r'\b(?:acute|chronic|severe|mild|moderate|emergency)\b', # Medical descriptors
79
+ r'\b[a-z]+(?:al|ic|ous|ive)\s+(?:pain|failure|infection|injury)\b', # Compound terms
80
+ r'\b(?:ecg|ekg|ct|mri|x-ray|ultrasound|biopsy)\b', # Medical procedures
81
+ r'\b\d+\s*(?:mg|ml|units|hours|days|minutes)\b', # Dosages and timeframes
82
+ ]
83
+
84
+ for pattern in patterns:
85
+ matches = re.findall(pattern, text_lower)
86
+ medical_keywords.update(match.strip() for match in matches)
87
+
88
+ # Additional common medical terms
89
+ common_medical_terms = [
90
+ 'blood', 'pressure', 'heart', 'chest', 'pain', 'stroke', 'seizure',
91
+ 'emergency', 'hospital', 'monitor', 'assess', 'evaluate', 'immediate',
92
+ 'protocol', 'guideline', 'recommendation', 'risk', 'factor'
93
+ ]
94
+
95
+ for term in common_medical_terms:
96
+ if term in text_lower:
97
+ medical_keywords.add(term)
98
+
99
+ # Filter out very short terms and common words
100
+ filtered_keywords = {
101
+ kw for kw in medical_keywords
102
+ if len(kw) > 2 and kw not in ['the', 'and', 'for', 'with', 'are', 'can', 'may']
103
+ }
104
+
105
+ return filtered_keywords
106
+
107
+ def calculate_coverage_score(self, generated_advice: str, retrieval_results: List[Dict]) -> Dict[str, Any]:
108
+ """
109
+ Calculate coverage score based on keyword overlap between advice and retrieved docs
110
+
111
+ Args:
112
+ generated_advice: Generated medical advice text
113
+ retrieval_results: List of retrieved documents
114
+ """
115
+ if not generated_advice or not retrieval_results:
116
+ return {
117
+ "coverage_score": 0.0,
118
+ "matched_keywords": [],
119
+ "advice_keywords": [],
120
+ "source_keywords": [],
121
+ "coverage_details": []
122
+ }
123
+
124
+ # Extract keywords from generated advice
125
+ advice_keywords = self.extract_medical_keywords(generated_advice)
126
+
127
+ # Extract keywords from all retrieved documents
128
+ all_source_keywords = set()
129
+ coverage_details = []
130
+
131
+ for i, doc in enumerate(retrieval_results):
132
+ doc_content = doc.get('content', '') or doc.get('text', '')
133
+ doc_keywords = self.extract_medical_keywords(doc_content)
134
+ all_source_keywords.update(doc_keywords)
135
+
136
+ # Calculate overlap for this specific document
137
+ doc_overlap = advice_keywords.intersection(doc_keywords)
138
+ doc_coverage = len(doc_overlap) / len(doc_keywords) if doc_keywords else 0.0
139
+
140
+ coverage_details.append({
141
+ "doc_index": i,
142
+ "doc_snippet": doc_content[:100] + "...",
143
+ "doc_keywords_count": len(doc_keywords),
144
+ "matched_keywords_count": len(doc_overlap),
145
+ "doc_coverage_ratio": doc_coverage,
146
+ "matched_keywords": list(doc_overlap)[:10] # Limit for readability
147
+ })
148
+
149
+ # Calculate overall coverage
150
+ matched_keywords = advice_keywords.intersection(all_source_keywords)
151
+ coverage_score = len(matched_keywords) / len(all_source_keywords) if all_source_keywords else 0.0
152
+
153
+ return {
154
+ "coverage_score": coverage_score,
155
+ "matched_keywords": list(matched_keywords),
156
+ "advice_keywords": list(advice_keywords),
157
+ "source_keywords": list(all_source_keywords),
158
+ "advice_keywords_count": len(advice_keywords),
159
+ "source_keywords_count": len(all_source_keywords),
160
+ "matched_keywords_count": len(matched_keywords),
161
+ "coverage_percentage": coverage_score * 100,
162
+ "meets_threshold": coverage_score >= 0.6,
163
+ "coverage_details": coverage_details
164
+ }
165
+
166
+ def evaluate_single_coverage(self, query: str, category: str = "unknown") -> Dict[str, Any]:
167
+ """
168
+ Evaluate retrieval coverage for a single query
169
+
170
+ Requires full pipeline: extraction β†’ retrieval β†’ generation β†’ coverage analysis
171
+
172
+ Args:
173
+ query: Medical query to test
174
+ category: Query category (diagnosis/treatment/mixed)
175
+ """
176
+ print(f"πŸ” Testing coverage for: {query[:50]}...")
177
+ print(f"πŸ“‹ Category: {category}")
178
+
179
+ try:
180
+ # Step 1: Extract condition
181
+ condition_result = self.user_prompt_processor.extract_condition_keywords(query)
182
+
183
+ # Step 2: Perform retrieval
184
+ search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
185
+ if not search_query:
186
+ search_query = condition_result.get('condition', query)
187
+
188
+ retrieval_start = datetime.now()
189
+ retrieval_results = self.retrieval_system.search(search_query, top_k=5)
190
+ retrieval_time = (datetime.now() - retrieval_start).total_seconds()
191
+
192
+ processed_results = retrieval_results.get('processed_results', [])
193
+
194
+ if not processed_results:
195
+ result = {
196
+ "query": query,
197
+ "category": category,
198
+ "search_query": search_query,
199
+ "pipeline_success": False,
200
+ "coverage_score": 0.0,
201
+ "error": "No retrieval results",
202
+ "timestamp": datetime.now().isoformat()
203
+ }
204
+
205
+ self.coverage_results.append(result)
206
+ print(f" ❌ No retrieval results for coverage analysis")
207
+ return result
208
+
209
+ # Step 3: Generate medical advice
210
+ generation_start = datetime.now()
211
+ intention = self._detect_query_intention(query)
212
+ medical_advice_result = self.medical_generator.generate_medical_advice(
213
+ user_query=query,
214
+ retrieval_results=retrieval_results,
215
+ intention=intention
216
+ )
217
+ generation_time = (datetime.now() - generation_start).total_seconds()
218
+
219
+ generated_advice = medical_advice_result.get('medical_advice', '')
220
+
221
+ if not generated_advice:
222
+ result = {
223
+ "query": query,
224
+ "category": category,
225
+ "search_query": search_query,
226
+ "pipeline_success": False,
227
+ "coverage_score": 0.0,
228
+ "error": "No generated advice",
229
+ "timestamp": datetime.now().isoformat()
230
+ }
231
+
232
+ self.coverage_results.append(result)
233
+ print(f" ❌ No generated advice for coverage analysis")
234
+ return result
235
+
236
+ # Step 4: Calculate coverage
237
+ coverage_analysis = self.calculate_coverage_score(generated_advice, processed_results)
238
+
239
+ result = {
240
+ "query": query,
241
+ "category": category,
242
+ "search_query": search_query,
243
+ "pipeline_success": True,
244
+ "retrieval_time": retrieval_time,
245
+ "generation_time": generation_time,
246
+ "retrieved_docs_count": len(processed_results),
247
+ "generated_advice_length": len(generated_advice),
248
+ "coverage_analysis": coverage_analysis,
249
+ "coverage_score": coverage_analysis['coverage_score'],
250
+ "meets_threshold": coverage_analysis['meets_threshold'],
251
+ "timestamp": datetime.now().isoformat()
252
+ }
253
+
254
+ # Store result
255
+ self.coverage_results.append(result)
256
+
257
+ print(f" βœ… Pipeline: Complete")
258
+ print(f" πŸ“Š Coverage Score: {coverage_analysis['coverage_score']:.3f} ({coverage_analysis['coverage_percentage']:.1f}%)")
259
+ print(f" πŸ“ Keywords: {coverage_analysis['matched_keywords_count']}/{coverage_analysis['source_keywords_count']} matched")
260
+ print(f" 🎯 Threshold: {'βœ… Met' if result['meets_threshold'] else '❌ Not Met'}")
261
+ print(f" ⏱️ Times: Retrieval={retrieval_time:.2f}s, Generation={generation_time:.2f}s")
262
+
263
+ return result
264
+
265
+ except Exception as e:
266
+ error_result = {
267
+ "query": query,
268
+ "category": category,
269
+ "pipeline_success": False,
270
+ "coverage_score": 0.0,
271
+ "error": str(e),
272
+ "timestamp": datetime.now().isoformat()
273
+ }
274
+
275
+ self.coverage_results.append(error_result)
276
+ print(f" ❌ Coverage evaluation failed: {e}")
277
+
278
+ return error_result
279
+
280
+ def _detect_query_intention(self, query: str) -> str:
281
+ """Simplified query intention detection (from app.py)"""
282
+ query_lower = query.lower()
283
+
284
+ if any(word in query_lower for word in ['diagnos', 'differential', 'possible', 'causes']):
285
+ return 'diagnosis'
286
+ elif any(word in query_lower for word in ['treat', 'manage', 'therapy', 'intervention']):
287
+ return 'treatment'
288
+ else:
289
+ return 'mixed'
290
+
291
+ def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
292
+ """Parse queries from file with category labels"""
293
+ print(f"πŸ“ Reading queries from file: {filepath}")
294
+
295
+ try:
296
+ with open(filepath, 'r', encoding='utf-8') as f:
297
+ content = f.read()
298
+
299
+ # Parse queries with category labels
300
+ queries_by_category = {
301
+ "diagnosis": [],
302
+ "treatment": [],
303
+ "mixed": []
304
+ }
305
+
306
+ lines = content.strip().split('\n')
307
+
308
+ for line in lines:
309
+ line = line.strip()
310
+ if not line:
311
+ continue
312
+
313
+ # Parse format: "1.diagnosis: query text"
314
+ match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
315
+ if match:
316
+ category_raw = match.group(1).lower()
317
+ query_text = match.group(2).strip()
318
+
319
+ # Normalize category name
320
+ if category_raw in ['mixed/complicated', 'mixed']:
321
+ category = 'mixed'
322
+ else:
323
+ category = category_raw
324
+
325
+ if category in queries_by_category and len(query_text) > 15:
326
+ queries_by_category[category].append({
327
+ "text": query_text,
328
+ "category": category
329
+ })
330
+
331
+ print(f"πŸ“‹ Parsed queries by category:")
332
+ for category, category_queries in queries_by_category.items():
333
+ print(f" {category.capitalize()}: {len(category_queries)} queries")
334
+
335
+ return queries_by_category
336
+
337
+ except Exception as e:
338
+ print(f"❌ Failed to read file: {e}")
339
+ return {"error": f"Failed to read file: {e}"}
340
+
341
+ def calculate_coverage_statistics(self) -> Dict[str, Any]:
342
+ """Calculate coverage statistics by category"""
343
+ category_stats = {}
344
+ all_successful_results = []
345
+
346
+ # Group results by category
347
+ results_by_category = {
348
+ "diagnosis": [],
349
+ "treatment": [],
350
+ "mixed": []
351
+ }
352
+
353
+ for result in self.coverage_results:
354
+ category = result.get('category', 'unknown')
355
+ if category in results_by_category:
356
+ results_by_category[category].append(result)
357
+ if result.get('pipeline_success'):
358
+ all_successful_results.append(result)
359
+
360
+ # Calculate statistics for each category
361
+ for category, results in results_by_category.items():
362
+ successful_results = [r for r in results if r.get('pipeline_success')]
363
+
364
+ if successful_results:
365
+ coverage_scores = [r['coverage_score'] for r in successful_results]
366
+ avg_coverage = sum(coverage_scores) / len(coverage_scores)
367
+ avg_retrieval_time = sum(r.get('retrieval_time', 0) for r in successful_results) / len(successful_results)
368
+ avg_generation_time = sum(r.get('generation_time', 0) for r in successful_results) / len(successful_results)
369
+
370
+ category_stats[category] = {
371
+ "average_coverage": avg_coverage,
372
+ "max_coverage": max(coverage_scores),
373
+ "min_coverage": min(coverage_scores),
374
+ "successful_evaluations": len(successful_results),
375
+ "total_queries": len(results),
376
+ "success_rate": len(successful_results) / len(results),
377
+ "average_retrieval_time": avg_retrieval_time,
378
+ "average_generation_time": avg_generation_time,
379
+ "meets_threshold": avg_coverage >= 0.6,
380
+ "individual_coverage_scores": coverage_scores
381
+ }
382
+ else:
383
+ category_stats[category] = {
384
+ "average_coverage": 0.0,
385
+ "max_coverage": 0.0,
386
+ "min_coverage": 0.0,
387
+ "successful_evaluations": 0,
388
+ "total_queries": len(results),
389
+ "success_rate": 0.0,
390
+ "average_retrieval_time": 0.0,
391
+ "average_generation_time": 0.0,
392
+ "meets_threshold": False,
393
+ "individual_coverage_scores": []
394
+ }
395
+
396
+ # Calculate overall statistics
397
+ if all_successful_results:
398
+ all_coverage_scores = [r['coverage_score'] for r in all_successful_results]
399
+ overall_stats = {
400
+ "average_coverage": sum(all_coverage_scores) / len(all_coverage_scores),
401
+ "max_coverage": max(all_coverage_scores),
402
+ "min_coverage": min(all_coverage_scores),
403
+ "successful_evaluations": len(all_successful_results),
404
+ "total_queries": len(self.coverage_results),
405
+ "success_rate": len(all_successful_results) / len(self.coverage_results),
406
+ "meets_threshold": (sum(all_coverage_scores) / len(all_coverage_scores)) >= 0.6,
407
+ "target_compliance": (sum(all_coverage_scores) / len(all_coverage_scores)) >= 0.6
408
+ }
409
+ else:
410
+ overall_stats = {
411
+ "average_coverage": 0.0,
412
+ "max_coverage": 0.0,
413
+ "min_coverage": 0.0,
414
+ "successful_evaluations": 0,
415
+ "total_queries": len(self.coverage_results),
416
+ "success_rate": 0.0,
417
+ "meets_threshold": False,
418
+ "target_compliance": False
419
+ }
420
+
421
+ return {
422
+ "category_results": category_stats,
423
+ "overall_results": overall_stats,
424
+ "timestamp": datetime.now().isoformat()
425
+ }
426
+
427
+ def save_coverage_statistics(self, filename: str = None) -> str:
428
+ """Save coverage statistics for chart generation"""
429
+ stats = self.calculate_coverage_statistics()
430
+
431
+ if filename is None:
432
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
433
+ filename = f"coverage_statistics_{timestamp}.json"
434
+
435
+ # Ensure results directory exists
436
+ results_dir = Path(__file__).parent / "results"
437
+ results_dir.mkdir(exist_ok=True)
438
+
439
+ filepath = results_dir / filename
440
+
441
+ with open(filepath, 'w', encoding='utf-8') as f:
442
+ json.dump(stats, f, indent=2, ensure_ascii=False)
443
+
444
+ print(f"πŸ“Š Coverage statistics saved to: {filepath}")
445
+ return str(filepath)
446
+
447
+ def save_coverage_details(self, filename: str = None) -> str:
448
+ """Save detailed coverage results"""
449
+ if filename is None:
450
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
451
+ filename = f"coverage_details_{timestamp}.json"
452
+
453
+ # Ensure results directory exists
454
+ results_dir = Path(__file__).parent / "results"
455
+ results_dir.mkdir(exist_ok=True)
456
+
457
+ filepath = results_dir / filename
458
+
459
+ # Create comprehensive coverage data
460
+ coverage_data = {
461
+ "evaluation_metadata": {
462
+ "total_queries": len(self.coverage_results),
463
+ "successful_evaluations": len([r for r in self.coverage_results if r.get('pipeline_success')]),
464
+ "timestamp": datetime.now().isoformat(),
465
+ "evaluator_type": "retrieval_coverage",
466
+ "threshold_used": 0.6
467
+ },
468
+ "coverage_results": self.coverage_results
469
+ }
470
+
471
+ with open(filepath, 'w', encoding='utf-8') as f:
472
+ json.dump(coverage_data, f, indent=2, ensure_ascii=False)
473
+
474
+ print(f"πŸ“ Coverage details saved to: {filepath}")
475
+ return str(filepath)
476
+
477
+
478
+ # Independent execution interface
479
+ if __name__ == "__main__":
480
+ """Independent coverage evaluation interface"""
481
+
482
+ print("πŸ“ˆ OnCall.ai Coverage Evaluator - Retrieval Coverage Analysis")
483
+
484
+ if len(sys.argv) > 1:
485
+ query_file = sys.argv[1]
486
+ else:
487
+ # Default to evaluation/pre_user_query_evaluate.txt
488
+ query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"
489
+
490
+ if not os.path.exists(query_file):
491
+ print(f"❌ Query file not found: {query_file}")
492
+ print("Usage: python coverage_evaluator.py [query_file.txt]")
493
+ sys.exit(1)
494
+
495
+ # Initialize evaluator
496
+ evaluator = CoverageEvaluator()
497
+
498
+ # Parse queries from file
499
+ queries_by_category = evaluator.parse_queries_from_file(str(query_file))
500
+
501
+ if "error" in queries_by_category:
502
+ print(f"❌ Failed to parse queries: {queries_by_category['error']}")
503
+ sys.exit(1)
504
+
505
+ # Test coverage for each query (requires full pipeline)
506
+ print(f"\nπŸ§ͺ Retrieval Coverage Testing (Full Pipeline Required)")
507
+ print(f"⚠️ Note: This evaluator requires LLM calls for advice generation")
508
+
509
+ for category, queries in queries_by_category.items():
510
+ if not queries:
511
+ continue
512
+
513
+ print(f"\nπŸ“‚ Testing {category.upper()} coverage:")
514
+
515
+ for i, query_info in enumerate(queries):
516
+ query_text = query_info['text']
517
+
518
+ # Test coverage (requires full pipeline)
519
+ result = evaluator.evaluate_single_coverage(query_text, category)
520
+
521
+ # Pause between queries to avoid rate limits
522
+ if i < len(queries) - 1:
523
+ print(f" ⏳ Pausing 5s before next query...")
524
+ import time
525
+ time.sleep(5)
526
+
527
+ # Longer pause between categories
528
+ if category != list(queries_by_category.keys())[-1]:
529
+ print(f"\n⏳ Pausing 10s before next category...")
530
+ import time
531
+ time.sleep(10)
532
+
533
+ # Generate and save results
534
+ print(f"\nπŸ“Š Generating coverage analysis...")
535
+
536
+ # Save statistics and details
537
+ stats_path = evaluator.save_coverage_statistics()
538
+ details_path = evaluator.save_coverage_details()
539
+
540
+ # Print final summary
541
+ stats = evaluator.calculate_coverage_statistics()
542
+ category_results = stats['category_results']
543
+ overall_results = stats['overall_results']
544
+
545
+ print(f"\nπŸ“Š === COVERAGE EVALUATION SUMMARY ===")
546
+ print(f"Overall Performance:")
547
+ print(f" Average Coverage: {overall_results['average_coverage']:.3f} ({overall_results['average_coverage']*100:.1f}%)")
548
+ print(f" Pipeline Success Rate: {overall_results['success_rate']:.1%}")
549
+ print(f" 60% Threshold: {'βœ… Met' if overall_results['meets_threshold'] else '❌ Not Met'}")
550
+
551
+ print(f"\nCategory Breakdown:")
552
+ for category, cat_stats in category_results.items():
553
+ if cat_stats['total_queries'] > 0:
554
+ print(f" {category.capitalize()}: {cat_stats['average_coverage']:.3f} "
555
+ f"({cat_stats['successful_evaluations']}/{cat_stats['total_queries']}) "
556
+ f"[R:{cat_stats['average_retrieval_time']:.2f}s, G:{cat_stats['average_generation_time']:.2f}s]")
557
+
558
+ print(f"\nβœ… Coverage evaluation complete!")
559
+ print(f"πŸ“Š Statistics: {stats_path}")
560
+ print(f"πŸ“ Details: {details_path}")
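
For reference, the coverage metric added in coverage_evaluator.py reduces to a set-overlap ratio: medical keywords are extracted from the generated advice and from every retrieved document, and the score is the matched keywords divided by all source keywords, with 0.6 as the pass threshold. A minimal self-contained sketch of that calculation, using a deliberately simplified keyword extractor and made-up strings rather than real system output:

import re

def toy_keywords(text: str) -> set:
    # Toy stand-in for CoverageEvaluator.extract_medical_keywords()
    return set(re.findall(r'\b(?:ecg|aspirin|monitor|stroke|chest|pain)\b', text.lower()))

advice = "Obtain an ECG, give aspirin and monitor for stroke symptoms."
sources = [
    "ECG changes and chest pain suggest ischemia; aspirin is first-line.",
    "Monitor neurological status for an evolving stroke.",
]

advice_kw = toy_keywords(advice)
source_kw = set().union(*(toy_keywords(s) for s in sources))
matched = advice_kw & source_kw

coverage = len(matched) / len(source_kw) if source_kw else 0.0
print(f"coverage = {coverage:.2f}, meets 0.6 threshold: {coverage >= 0.6}")
# coverage = 0.67, meets 0.6 threshold: True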
evaluation/extraction_evaluator.py ADDED
@@ -0,0 +1,379 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Condition Extraction Evaluator (Metric 2)
4
+ ============================================================
5
+
6
+ Evaluates condition extraction success rate from user_prompt.py
7
+ Pure automatic evaluation based on extract_condition_keywords() results
8
+
9
+ Author: YanBo Chen
10
+ Date: 2025-08-04
11
+ """
12
+
13
+ import json
14
+ import os
15
+ import sys
16
+ from typing import Dict, List, Any
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ import re
20
+
21
+ # Add project path
22
+ current_dir = Path(__file__).parent
23
+ project_root = current_dir.parent
24
+ src_dir = project_root / "src"
25
+ sys.path.insert(0, str(src_dir))
26
+
27
+ # Import existing system components
28
+ try:
29
+ from user_prompt import UserPromptProcessor
30
+ from retrieval import BasicRetrievalSystem
31
+ from llm_clients import llm_Med42_70BClient
32
+ except ImportError as e:
33
+ print(f"❌ Import failed: {e}")
34
+ print("Please ensure running from project root directory")
35
+ sys.exit(1)
36
+
37
+
38
+ class ExtractionEvaluator:
39
+ """Condition extraction success rate evaluator - pure automatic evaluation"""
40
+
41
+ def __init__(self):
42
+ """Initialize system components for extraction testing"""
43
+ print("πŸ”§ Initializing Extraction Evaluator...")
44
+
45
+ # Initialize required components for extraction
46
+ self.llm_client = llm_Med42_70BClient()
47
+ self.retrieval_system = BasicRetrievalSystem()
48
+ self.user_prompt_processor = UserPromptProcessor(
49
+ llm_client=self.llm_client,
50
+ retrieval_system=self.retrieval_system
51
+ )
52
+
53
+ # Results accumulation
54
+ self.extraction_results = []
55
+
56
+ print("βœ… Extraction Evaluator initialization complete")
57
+
58
+ def evaluate_single_extraction(self, query: str, category: str = "unknown") -> Dict[str, Any]:
59
+ """
60
+ Evaluate condition extraction success for a single query
61
+
62
+ Tests user_prompt.py extract_condition_keywords() method
63
+
64
+ Args:
65
+ query: Medical query to test
66
+ category: Query category (diagnosis/treatment/mixed)
67
+ """
68
+ print(f"πŸ” Testing extraction for: {query[:50]}...")
69
+ print(f"πŸ“‹ Category: {category}")
70
+
71
+ try:
72
+ # Call the actual extraction method from user_prompt.py
73
+ extraction_start = datetime.now()
74
+ condition_result = self.user_prompt_processor.extract_condition_keywords(query)
75
+ extraction_time = (datetime.now() - extraction_start).total_seconds()
76
+
77
+ # Analyze extraction success
78
+ extracted_condition = condition_result.get('condition')
79
+ query_status = condition_result.get('query_status')
80
+ emergency_keywords = condition_result.get('emergency_keywords', [])
81
+ treatment_keywords = condition_result.get('treatment_keywords', [])
82
+ fallback_level = condition_result.get('fallback_level', 'unknown')
83
+
84
+ # Define success criteria
85
+ is_successful = (
86
+ extracted_condition and
87
+ extracted_condition.strip() and
88
+ extracted_condition != "unknown" and
89
+ query_status not in ['invalid_query', 'non_medical']
90
+ )
91
+
92
+ result = {
93
+ "query": query,
94
+ "category": category,
95
+ "extraction_success": is_successful,
96
+ "extraction_time": extraction_time,
97
+ "extracted_condition": extracted_condition,
98
+ "query_status": query_status,
99
+ "emergency_keywords": emergency_keywords,
100
+ "treatment_keywords": treatment_keywords,
101
+ "fallback_level": fallback_level,
102
+ "full_condition_result": condition_result,
103
+ "timestamp": datetime.now().isoformat()
104
+ }
105
+
106
+ # Store result
107
+ self.extraction_results.append(result)
108
+
109
+ print(f" βœ… Extraction: {'Success' if is_successful else 'Failed'}")
110
+ print(f" πŸ“ Condition: {extracted_condition}")
111
+ print(f" 🎯 Status: {query_status}")
112
+ print(f" ⏱️ Time: {extraction_time:.3f}s")
113
+ print(f" πŸ”„ Fallback Level: {fallback_level}")
114
+
115
+ return result
116
+
117
+ except Exception as e:
118
+ error_result = {
119
+ "query": query,
120
+ "category": category,
121
+ "extraction_success": False,
122
+ "extraction_time": 0.0,
123
+ "error": str(e),
124
+ "timestamp": datetime.now().isoformat()
125
+ }
126
+
127
+ self.extraction_results.append(error_result)
128
+ print(f" ❌ Extraction failed: {e}")
129
+
130
+ return error_result
131
+
132
+ def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
133
+ """Parse queries from file with category labels"""
134
+ print(f"πŸ“ Reading queries from file: {filepath}")
135
+
136
+ try:
137
+ with open(filepath, 'r', encoding='utf-8') as f:
138
+ content = f.read()
139
+
140
+ # Parse queries with category labels
141
+ queries_by_category = {
142
+ "diagnosis": [],
143
+ "treatment": [],
144
+ "mixed": []
145
+ }
146
+
147
+ lines = content.strip().split('\n')
148
+
149
+ for line in lines:
150
+ line = line.strip()
151
+ if not line:
152
+ continue
153
+
154
+ # Parse format: "1.diagnosis: query text"
155
+ match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
156
+ if match:
157
+ category_raw = match.group(1).lower()
158
+ query_text = match.group(2).strip()
159
+
160
+ # Normalize category name
161
+ if category_raw in ['mixed/complicated', 'mixed']:
162
+ category = 'mixed'
163
+ else:
164
+ category = category_raw
165
+
166
+ if category in queries_by_category and len(query_text) > 15:
167
+ queries_by_category[category].append({
168
+ "text": query_text,
169
+ "category": category
170
+ })
171
+
172
+ print(f"πŸ“‹ Parsed queries by category:")
173
+ for category, category_queries in queries_by_category.items():
174
+ print(f" {category.capitalize()}: {len(category_queries)} queries")
175
+
176
+ return queries_by_category
177
+
178
+ except Exception as e:
179
+ print(f"❌ Failed to read file: {e}")
180
+ return {"error": f"Failed to read file: {e}"}
181
+
182
+ def calculate_extraction_statistics(self) -> Dict[str, Any]:
183
+ """Calculate extraction success statistics by category"""
184
+ category_stats = {}
185
+ all_results = []
186
+
187
+ # Group results by category
188
+ results_by_category = {
189
+ "diagnosis": [],
190
+ "treatment": [],
191
+ "mixed": []
192
+ }
193
+
194
+ for result in self.extraction_results:
195
+ category = result.get('category', 'unknown')
196
+ if category in results_by_category:
197
+ results_by_category[category].append(result)
198
+ all_results.append(result)
199
+
200
+ # Calculate statistics for each category
201
+ for category, results in results_by_category.items():
202
+ if results:
203
+ successful = [r for r in results if r.get('extraction_success')]
204
+ success_rate = len(successful) / len(results)
205
+ avg_time = sum(r.get('extraction_time', 0) for r in results) / len(results)
206
+
207
+ category_stats[category] = {
208
+ "success_rate": success_rate,
209
+ "successful_count": len(successful),
210
+ "total_count": len(results),
211
+ "average_extraction_time": avg_time,
212
+ "fallback_levels": [r.get('fallback_level') for r in results]
213
+ }
214
+ else:
215
+ category_stats[category] = {
216
+ "success_rate": 0.0,
217
+ "successful_count": 0,
218
+ "total_count": 0,
219
+ "average_extraction_time": 0.0,
220
+ "fallback_levels": []
221
+ }
222
+
223
+ # Calculate overall statistics
224
+ if all_results:
225
+ overall_successful = [r for r in all_results if r.get('extraction_success')]
226
+ overall_stats = {
227
+ "success_rate": len(overall_successful) / len(all_results),
228
+ "successful_count": len(overall_successful),
229
+ "total_count": len(all_results),
230
+ "average_extraction_time": sum(r.get('extraction_time', 0) for r in all_results) / len(all_results),
231
+ "target_compliance": len(overall_successful) / len(all_results) >= 0.8
232
+ }
233
+ else:
234
+ overall_stats = {
235
+ "success_rate": 0.0,
236
+ "successful_count": 0,
237
+ "total_count": 0,
238
+ "average_extraction_time": 0.0,
239
+ "target_compliance": False
240
+ }
241
+
242
+ return {
243
+ "category_results": category_stats,
244
+ "overall_results": overall_stats,
245
+ "timestamp": datetime.now().isoformat()
246
+ }
247
+
248
+ def save_extraction_statistics(self, filename: str = None) -> str:
249
+ """Save extraction statistics for chart generation"""
250
+ stats = self.calculate_extraction_statistics()
251
+
252
+ if filename is None:
253
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
254
+ filename = f"extraction_statistics_{timestamp}.json"
255
+
256
+ # Ensure results directory exists
257
+ results_dir = Path(__file__).parent / "results"
258
+ results_dir.mkdir(exist_ok=True)
259
+
260
+ filepath = results_dir / filename
261
+
262
+ with open(filepath, 'w', encoding='utf-8') as f:
263
+ json.dump(stats, f, indent=2, ensure_ascii=False)
264
+
265
+ print(f"πŸ“Š Extraction statistics saved to: {filepath}")
266
+ return str(filepath)
267
+
268
+ def save_extraction_details(self, filename: str = None) -> str:
269
+ """Save detailed extraction results"""
270
+ if filename is None:
271
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
272
+ filename = f"extraction_details_{timestamp}.json"
273
+
274
+ # Ensure results directory exists
275
+ results_dir = Path(__file__).parent / "results"
276
+ results_dir.mkdir(exist_ok=True)
277
+
278
+ filepath = results_dir / filename
279
+
280
+ # Create comprehensive extraction data
281
+ extraction_data = {
282
+ "evaluation_metadata": {
283
+ "total_queries": len(self.extraction_results),
284
+ "timestamp": datetime.now().isoformat(),
285
+ "evaluator_type": "condition_extraction"
286
+ },
287
+ "extraction_results": self.extraction_results
288
+ }
289
+
290
+ with open(filepath, 'w', encoding='utf-8') as f:
291
+ json.dump(extraction_data, f, indent=2, ensure_ascii=False)
292
+
293
+ print(f"πŸ“ Extraction details saved to: {filepath}")
294
+ return str(filepath)
295
+
296
+
297
+ # Independent execution interface
298
+ if __name__ == "__main__":
299
+ """Independent extraction evaluation interface"""
300
+
301
+ print("πŸ” OnCall.ai Extraction Evaluator - Condition Extraction Success Rate")
302
+
303
+ if len(sys.argv) > 1:
304
+ query_file = sys.argv[1]
305
+ else:
306
+ # Default to evaluation/pre_user_query_evaluate.txt
307
+ query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"
308
+
309
+ if not os.path.exists(query_file):
310
+ print(f"❌ Query file not found: {query_file}")
311
+ print("Usage: python extraction_evaluator.py [query_file.txt]")
312
+ sys.exit(1)
313
+
314
+ # Initialize evaluator
315
+ evaluator = ExtractionEvaluator()
316
+
317
+ # Parse queries from file
318
+ queries_by_category = evaluator.parse_queries_from_file(str(query_file))
319
+
320
+ if "error" in queries_by_category:
321
+ print(f"❌ Failed to parse queries: {queries_by_category['error']}")
322
+ sys.exit(1)
323
+
324
+ # Test extraction for each query
325
+ print(f"\nπŸ§ͺ Condition Extraction Testing")
326
+
327
+ for category, queries in queries_by_category.items():
328
+ if not queries:
329
+ continue
330
+
331
+ print(f"\nπŸ“‚ Testing {category.upper()} extraction:")
332
+
333
+ for i, query_info in enumerate(queries):
334
+ query_text = query_info['text']
335
+
336
+ # Test extraction
337
+ result = evaluator.evaluate_single_extraction(query_text, category)
338
+
339
+ # Pause between queries to avoid rate limits (if needed)
340
+ if i < len(queries) - 1:
341
+ print(f" ⏳ Pausing 3s before next query...")
342
+ import time
343
+ time.sleep(3)
344
+
345
+ # Pause between categories
346
+ if category != list(queries_by_category.keys())[-1]:
347
+ print(f"\n⏳ Pausing 5s before next category...")
348
+ import time
349
+ time.sleep(5)
350
+
351
+ # Generate and save results
352
+ print(f"\nπŸ“Š Generating extraction analysis...")
353
+
354
+ # Save statistics and details
355
+ stats_path = evaluator.save_extraction_statistics()
356
+ details_path = evaluator.save_extraction_details()
357
+
358
+ # Print final summary
359
+ stats = evaluator.calculate_extraction_statistics()
360
+ category_results = stats['category_results']
361
+ overall_results = stats['overall_results']
362
+
363
+ print(f"\nπŸ“Š === EXTRACTION EVALUATION SUMMARY ===")
364
+ print(f"Overall Performance:")
365
+ print(f" Success Rate: {overall_results['success_rate']:.1%}")
366
+ print(f" Successful Extractions: {overall_results['successful_count']}/{overall_results['total_count']}")
367
+ print(f" Average Extraction Time: {overall_results['average_extraction_time']:.3f}s")
368
+ print(f" 80% Target Compliance: {'βœ… Met' if overall_results['target_compliance'] else '❌ Not Met'}")
369
+
370
+ print(f"\nCategory Breakdown:")
371
+ for category, cat_stats in category_results.items():
372
+ if cat_stats['total_count'] > 0:
373
+ print(f" {category.capitalize()}: {cat_stats['success_rate']:.1%} "
374
+ f"({cat_stats['successful_count']}/{cat_stats['total_count']}) "
375
+ f"[{cat_stats['average_extraction_time']:.3f}s avg]")
376
+
377
+ print(f"\nβœ… Extraction evaluation complete!")
378
+ print(f"πŸ“Š Statistics: {stats_path}")
379
+ print(f"πŸ“ Details: {details_path}")
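
The extraction evaluator's pass/fail decision is purely rule-based on the dictionary returned by extract_condition_keywords(); no LLM judging is involved. A condensed restatement of that rule (the sample dictionaries are illustrative, not real system output):

def extraction_succeeded(condition_result: dict) -> bool:
    # Same criterion as in ExtractionEvaluator.evaluate_single_extraction()
    condition = condition_result.get('condition')
    status = condition_result.get('query_status')
    return bool(
        condition and condition.strip()
        and condition != "unknown"
        and status not in ('invalid_query', 'non_medical')
    )

print(extraction_succeeded({"condition": "acute ischemic stroke", "query_status": "valid"}))  # True
print(extraction_succeeded({"condition": "", "query_status": "non_medical"}))                 # False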
evaluation/latency_chart_generator.py ADDED
@@ -0,0 +1,327 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Latency Chart Generator
4
+ ==========================================
5
+
6
+ Generates comprehensive latency analysis charts from saved statistics.
7
+ Reads JSON files produced by latency_evaluator.py and creates visualizations.
8
+
9
+ No LLM calls - pure data visualization.
10
+
11
+ Author: YanBo Chen
12
+ Date: 2025-08-04
13
+ """
14
+
15
+ import json
16
+ import os
17
+ import sys
18
+ from typing import Dict, List, Any
19
+ from datetime import datetime
20
+ from pathlib import Path
21
+ import glob
22
+
23
+ # Visualization imports
24
+ import matplotlib.pyplot as plt
25
+ import seaborn as sns
26
+ import pandas as pd
27
+ import numpy as np
28
+
29
+
30
+ class LatencyChartGenerator:
31
+ """Generate charts from latency evaluation statistics - no LLM dependency"""
32
+
33
+ def __init__(self):
34
+ """Initialize chart generator"""
35
+ print("πŸ“ˆ Initializing Latency Chart Generator...")
36
+
37
+ # Set up professional chart style
38
+ plt.style.use('default')
39
+ sns.set_palette("husl")
40
+
41
+ print("βœ… Chart Generator ready")
42
+
43
+ def load_latest_statistics(self, results_dir: str = None) -> Dict[str, Any]:
44
+ """
45
+ Load the most recent latency statistics file
46
+
47
+ Args:
48
+ results_dir: Directory containing statistics files
49
+ """
50
+ if results_dir is None:
51
+ results_dir = Path(__file__).parent / "results"
52
+
53
+ # Find latest statistics file
54
+ pattern = str(results_dir / "latency_statistics_*.json")
55
+ stat_files = glob.glob(pattern)
56
+
57
+ if not stat_files:
58
+ raise FileNotFoundError(f"No latency statistics files found in {results_dir}")
59
+
60
+ # Get the most recent file
61
+ latest_file = max(stat_files, key=os.path.getmtime)
62
+
63
+ print(f"πŸ“Š Loading statistics from: {latest_file}")
64
+
65
+ with open(latest_file, 'r', encoding='utf-8') as f:
66
+ stats = json.load(f)
67
+
68
+ return stats
69
+
70
+ def generate_comprehensive_charts(self, stats: Dict[str, Any]) -> str:
71
+ """
72
+ Generate comprehensive 4-category latency analysis charts
73
+
74
+ Creates professional charts showing:
75
+ 1. Category comparison bar chart
76
+ 2. Individual query scatter plot
77
+ 3. Statistical summary table
78
+ 4. Performance distribution box plot
79
+ """
80
+ try:
81
+ # Create figure with subplots
82
+ fig, axes = plt.subplots(2, 2, figsize=(16, 12))
83
+ fig.suptitle('OnCall.ai Latency Analysis - Category Comparison',
84
+ fontsize=16, fontweight='bold')
85
+
86
+ category_results = stats['category_results']
87
+ overall_results = stats['overall_results']
88
+
89
+ # Chart 1: Category Comparison Bar Chart
90
+ ax1 = axes[0, 0]
91
+ categories = []
92
+ avg_latencies = []
93
+ std_devs = []
94
+
95
+ # Collect category data
96
+ for category, cat_stats in category_results.items():
97
+ if cat_stats['query_count'] > 0:
98
+ categories.append(category.replace('_', ' ').title())
99
+ avg_latencies.append(cat_stats['average_latency'])
100
+ std_devs.append(cat_stats['std_deviation'])
101
+
102
+ # Add overall
103
+ categories.append('Overall')
104
+ avg_latencies.append(overall_results['average_latency'])
105
+ std_devs.append(overall_results['std_deviation'])
106
+
107
+ # Create bar chart with error bars
108
+ bars = ax1.bar(categories, avg_latencies, capsize=5, alpha=0.8,
109
+ color=['#1f77b4', '#ff7f0e', '#d62728', '#2ca02c'])
110
+ ax1.errorbar(categories, avg_latencies, yerr=std_devs, fmt='none',
111
+ color='black', capsize=3, capthick=1)
112
+
113
+ ax1.set_title('Average Latency by Category', fontweight='bold')
114
+ ax1.set_ylabel('Latency (seconds)')
115
+ ax1.set_xlabel('Query Category')
116
+ ax1.grid(True, alpha=0.3)
117
+
118
+ # Add value labels on bars
119
+ for bar, avg, std in zip(bars, avg_latencies, std_devs):
120
+ height = bar.get_height()
121
+ ax1.text(bar.get_x() + bar.get_width()/2., height + std*0.1,
122
+ f'{avg:.1f}s', ha='center', va='bottom', fontweight='bold')
123
+
124
+ # Add target line
125
+ ax1.axhline(y=30.0, color='red', linestyle='--', alpha=0.7, label='30s Target')
126
+ ax1.legend()
127
+
128
+ # Chart 2: Individual Query Performance
129
+ ax2 = axes[0, 1]
130
+
131
+ query_indices = []
132
+ latencies = []
133
+ colors = []
134
+
135
+ color_map = {'diagnosis': '#1f77b4', 'treatment': '#ff7f0e', 'mixed': '#d62728'}
136
+ query_idx = 0
137
+
138
+ for category, cat_stats in category_results.items():
139
+ for latency in cat_stats['individual_latencies']:
140
+ query_indices.append(query_idx)
141
+ latencies.append(latency)
142
+ colors.append(color_map.get(category, 'gray'))
143
+ query_idx += 1
144
+
145
+ if latencies:
146
+ ax2.scatter(query_indices, latencies, c=colors, alpha=0.7, s=100)
147
+ ax2.set_title('Individual Query Performance', fontweight='bold')
148
+ ax2.set_ylabel('Latency (seconds)')
149
+ ax2.set_xlabel('Query Index')
150
+ ax2.grid(True, alpha=0.3)
151
+
152
+ # Add target line
153
+ ax2.axhline(y=30.0, color='red', linestyle='--', alpha=0.7, label='30s Target')
154
+
155
+ # Add category legend
156
+ from matplotlib.patches import Patch
157
+ legend_elements = [Patch(facecolor=color_map[cat], label=cat.title())
158
+ for cat in color_map.keys() if cat in category_results.keys()]
159
+ ax2.legend(handles=legend_elements)
160
+ else:
161
+ ax2.text(0.5, 0.5, 'No latency data available',
162
+ ha='center', va='center', transform=ax2.transAxes)
163
+ ax2.set_title('Individual Query Performance', fontweight='bold')
164
+
165
+ # Chart 3: Statistical Summary Table
166
+ ax3 = axes[1, 0]
167
+ ax3.axis('tight')
168
+ ax3.axis('off')
169
+
170
+ # Create summary table
171
+ table_data = []
172
+ headers = ['Category', 'Avg (s)', 'Std (s)', 'Min (s)', 'Max (s)', 'Count']
173
+
174
+ for category, cat_stats in category_results.items():
175
+ if cat_stats['query_count'] > 0:
176
+ table_data.append([
177
+ category.replace('_', ' ').title(),
178
+ f"{cat_stats['average_latency']:.2f}",
179
+ f"{cat_stats['std_deviation']:.2f}",
180
+ f"{cat_stats['min_latency']:.2f}",
181
+ f"{cat_stats['max_latency']:.2f}",
182
+ str(cat_stats['query_count'])
183
+ ])
184
+
185
+ # Add overall row
186
+ table_data.append([
187
+ 'Overall',
188
+ f"{overall_results['average_latency']:.2f}",
189
+ f"{overall_results['std_deviation']:.2f}",
190
+ f"{overall_results['min_latency']:.2f}",
191
+ f"{overall_results['max_latency']:.2f}",
192
+ str(overall_results['successful_queries'])
193
+ ])
194
+
195
+ if table_data:
196
+ table = ax3.table(cellText=table_data, colLabels=headers,
197
+ cellLoc='center', loc='center',
198
+ colWidths=[0.2, 0.15, 0.15, 0.15, 0.15, 0.1])
199
+ table.auto_set_font_size(False)
200
+ table.set_fontsize(10)
201
+ table.scale(1, 2)
202
+
203
+ # Style the table header
204
+ for i in range(len(headers)):
205
+ table[(0, i)].set_text_props(weight='bold', color='white')
206
+ table[(0, i)].set_facecolor('#2E7D32')
207
+
208
+ ax3.set_title('Statistical Summary', fontweight='bold', pad=20)
209
+
210
+ # Chart 4: Performance Distribution
211
+ ax4 = axes[1, 1]
212
+
213
+ # Create box plot if we have multiple data points
214
+ box_data = []
215
+ box_labels = []
216
+
217
+ for category, cat_stats in category_results.items():
218
+ if cat_stats['individual_latencies'] and len(cat_stats['individual_latencies']) > 0:
219
+ box_data.append(cat_stats['individual_latencies'])
220
+ box_labels.append(category.replace('_', ' ').title())
221
+
222
+ if box_data and len(box_data) > 0:
223
+ box_plot = ax4.boxplot(box_data, labels=box_labels, patch_artist=True)
224
+
225
+ # Color the boxes
226
+ colors = ['#1f77b4', '#ff7f0e', '#d62728']
227
+ for patch, color in zip(box_plot['boxes'], colors[:len(box_plot['boxes'])]):
228
+ patch.set_facecolor(color)
229
+ patch.set_alpha(0.7)
230
+
231
+ ax4.set_title('Latency Distribution by Category', fontweight='bold')
232
+ ax4.set_ylabel('Latency (seconds)')
233
+ ax4.grid(True, alpha=0.3)
234
+
235
+ # Add target line
236
+ ax4.axhline(y=30.0, color='red', linestyle='--', alpha=0.7, label='30s Target')
237
+ ax4.legend()
238
+ else:
239
+ # For single data points, show a simple bar chart
240
+ single_categories = []
241
+ single_latencies = []
242
+
243
+ for category, cat_stats in category_results.items():
244
+ if cat_stats['query_count'] > 0:
245
+ single_categories.append(category.replace('_', ' ').title())
246
+ single_latencies.append(cat_stats['average_latency'])
247
+
248
+ if single_categories:
249
+ ax4.bar(single_categories, single_latencies, alpha=0.7,
250
+ color=['#1f77b4', '#ff7f0e', '#d62728'][:len(single_categories)])
251
+ ax4.set_title('Category Latency (Single Query Each)', fontweight='bold')
252
+ ax4.set_ylabel('Latency (seconds)')
253
+ ax4.grid(True, alpha=0.3)
254
+ ax4.axhline(y=30.0, color='red', linestyle='--', alpha=0.7, label='30s Target')
255
+ ax4.legend()
256
+ else:
257
+ ax4.text(0.5, 0.5, 'No data available for distribution plot',
258
+ ha='center', va='center', transform=ax4.transAxes)
259
+ ax4.set_title('Latency Distribution', fontweight='bold')
260
+
261
+ # Adjust layout and save
262
+ plt.tight_layout()
263
+
264
+ # Save chart
265
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
266
+ chart_filename = f"latency_analysis_charts_{timestamp}.png"
267
+
268
+ # Ensure results directory exists
269
+ results_dir = Path(__file__).parent / "results"
270
+ results_dir.mkdir(exist_ok=True)
271
+ chart_path = results_dir / chart_filename
272
+
273
+ plt.savefig(chart_path, dpi=300, bbox_inches='tight',
274
+ facecolor='white', edgecolor='none')
275
+ plt.close()
276
+
277
+ print(f"πŸ“ˆ Charts saved to: {chart_path}")
278
+ return str(chart_path)
279
+
280
+ except Exception as e:
281
+ print(f"❌ Chart generation failed: {e}")
282
+ return ""
283
+
284
+ def print_statistics_summary(self, stats: Dict[str, Any]):
285
+ """Print formatted statistics summary to console"""
286
+ category_results = stats['category_results']
287
+ overall_results = stats['overall_results']
288
+
289
+ print(f"\nπŸ“Š === LATENCY ANALYSIS CHART SUMMARY ===")
290
+ print(f"Overall Performance:")
291
+ print(f" Average Latency: {overall_results['average_latency']:.2f}s (Β±{overall_results['std_deviation']:.2f})")
292
+ print(f" Success Rate: {overall_results['successful_queries']}/{overall_results['total_queries']}")
293
+ print(f" 30s Target Compliance: {overall_results['target_compliance']:.1%}")
294
+
295
+ print(f"\nCategory Breakdown:")
296
+ for category, cat_stats in category_results.items():
297
+ if cat_stats['query_count'] > 0:
298
+ print(f" {category.capitalize()}: {cat_stats['average_latency']:.2f}s (Β±{cat_stats['std_deviation']:.2f}) [{cat_stats['query_count']} queries]")
299
+
300
+
301
+ # Independent execution interface
302
+ if __name__ == "__main__":
303
+ """Independent chart generation interface"""
304
+
305
+ print("πŸ“ˆ OnCall.ai Latency Chart Generator")
306
+
307
+ # Initialize chart generator
308
+ chart_gen = LatencyChartGenerator()
309
+
310
+ try:
311
+ # Load latest statistics
312
+ stats = chart_gen.load_latest_statistics()
313
+
314
+ # Generate charts
315
+ chart_path = chart_gen.generate_comprehensive_charts(stats)
316
+
317
+ # Print summary
318
+ chart_gen.print_statistics_summary(stats)
319
+
320
+ print(f"\nβœ… Chart generation complete!")
321
+ print(f"πŸ“ˆ Charts saved to: {chart_path}")
322
+
323
+ except FileNotFoundError as e:
324
+ print(f"❌ {e}")
325
+ print("πŸ’‘ Please run latency_evaluator.py first to generate statistics data")
326
+ except Exception as e:
327
+ print(f"❌ Chart generation failed: {e}")
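
latency_chart_generator.py only depends on a handful of fields inside the latest latency_statistics_*.json file. A minimal statistics dictionary with just those fields can be fed to the generator directly; the values below are placeholders, the real evaluator may write additional keys, and the snippet assumes it is run from the evaluation/ directory so the module import resolves:

from latency_chart_generator import LatencyChartGenerator

stats = {
    "category_results": {
        "diagnosis": {"query_count": 1, "average_latency": 24.3, "std_deviation": 0.0,
                      "min_latency": 24.3, "max_latency": 24.3, "individual_latencies": [24.3]},
        "treatment": {"query_count": 0, "average_latency": 0.0, "std_deviation": 0.0,
                      "min_latency": 0.0, "max_latency": 0.0, "individual_latencies": []},
        "mixed":     {"query_count": 0, "average_latency": 0.0, "std_deviation": 0.0,
                      "min_latency": 0.0, "max_latency": 0.0, "individual_latencies": []},
    },
    "overall_results": {"average_latency": 24.3, "std_deviation": 0.0, "min_latency": 24.3,
                        "max_latency": 24.3, "successful_queries": 1, "total_queries": 1,
                        "target_compliance": 1.0},
}

gen = LatencyChartGenerator()
gen.print_statistics_summary(stats)                     # console summary only, no files written
chart_path = gen.generate_comprehensive_charts(stats)   # saves a PNG under evaluation/results/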
evaluation/latency_evaluator.py ADDED
@@ -0,0 +1,496 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Latency Evaluator (Single Query Test Mode)
4
+ ============================================================
5
+
6
+ Test latency for individual queries to avoid rate limits.
7
+ Based on existing system flow: app.py -> user_prompt.py -> retrieval.py -> generation.py
8
+
9
+ Author: YanBo Chen
10
+ Date: 2025-08-04
11
+ """
12
+
13
+ import time
14
+ import json
15
+ import os
16
+ import sys
17
+ from typing import Dict, List, Any
18
+ from datetime import datetime
19
+ from pathlib import Path
20
+ import re
21
+
22
+ # Add project path
23
+ current_dir = Path(__file__).parent
24
+ project_root = current_dir.parent
25
+ src_dir = project_root / "src"
26
+ sys.path.insert(0, str(src_dir))
27
+
28
+ # Import existing system components
29
+ try:
30
+ from user_prompt import UserPromptProcessor
31
+ from retrieval import BasicRetrievalSystem
32
+ from llm_clients import llm_Med42_70BClient
33
+ from generation import MedicalAdviceGenerator
34
+ except ImportError as e:
35
+ print(f"❌ Import failed: {e}")
36
+ print("Please ensure running from project root directory")
37
+ sys.exit(1)
38
+
39
+
40
+ class LatencyEvaluator:
41
+ """Pure latency measurement and medical advice output recording - no visualization"""
42
+
43
+ def __init__(self):
44
+ """Initialize existing system components"""
45
+ print("πŸ”§ Initializing Latency Evaluator...")
46
+
47
+ # Initialize existing system components (same as app.py)
48
+ self.llm_client = llm_Med42_70BClient()
49
+ self.retrieval_system = BasicRetrievalSystem()
50
+ self.user_prompt_processor = UserPromptProcessor(
51
+ llm_client=self.llm_client,
52
+ retrieval_system=self.retrieval_system
53
+ )
54
+ self.medical_generator = MedicalAdviceGenerator(llm_client=self.llm_client)
55
+
56
+ # Results accumulation for summary statistics
57
+ self.accumulated_results = {
58
+ "diagnosis": [],
59
+ "treatment": [],
60
+ "mixed": []
61
+ }
62
+
63
+ # Medical advice outputs for model comparison
64
+ self.medical_outputs = []
65
+
66
+ print("βœ… Latency Evaluator initialization complete")
67
+
68
+ def measure_single_query_latency(self, query: str, category: str = "unknown") -> Dict[str, Any]:
69
+ """
70
+ Measure complete processing time for a single query
71
+
72
+ Replicates app.py's process_medical_query flow with timing focus
73
+
74
+ Args:
75
+ query: Medical query to test
76
+ category: Query category (diagnosis/treatment/mixed)
77
+ """
78
+ print(f"⏱️ Measuring query latency: {query[:50]}...")
79
+ print(f"πŸ“‹ Category: {category}")
80
+
81
+ overall_start = time.time()
82
+ timing_details = {}
83
+
84
+ try:
85
+ # STEP 1: Condition extraction (user_prompt.py)
86
+ step1_start = time.time()
87
+ condition_result = self.user_prompt_processor.extract_condition_keywords(query)
88
+ timing_details['step1_condition_extraction'] = time.time() - step1_start
89
+
90
+ print(f" Step 1 - Condition extraction: {timing_details['step1_condition_extraction']:.3f}s")
91
+ print(f" Extracted condition: {condition_result.get('condition', 'None')}")
92
+
93
+ # Check if valid medical query
94
+ if condition_result.get('query_status') in ['invalid_query', 'non_medical']:
95
+ total_time = time.time() - overall_start
96
+ print(f" ⚠️ Non-medical query detected")
97
+ return {
98
+ "query": query,
99
+ "category": category,
100
+ "total_latency": total_time,
101
+ "timing_details": timing_details,
102
+ "status": "non_medical",
103
+ "condition_result": condition_result,
104
+ "success": False,
105
+ "timestamp": datetime.now().isoformat()
106
+ }
107
+
108
+ # STEP 2: User confirmation (simulate auto-confirmation)
109
+ step2_start = time.time()
110
+ confirmation = self.user_prompt_processor.handle_user_confirmation(condition_result)
111
+ timing_details['step2_confirmation'] = time.time() - step2_start
112
+
113
+ print(f" Step 2 - User confirmation: {timing_details['step2_confirmation']:.3f}s")
114
+
115
+ # STEP 3: Retrieve relevant guidelines (retrieval.py)
116
+ step3_start = time.time()
117
+
118
+ search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
119
+ if not search_query:
120
+ search_query = condition_result.get('condition', query)
121
+
122
+ retrieval_results = self.retrieval_system.search(search_query, top_k=5)
123
+ timing_details['step3_retrieval'] = time.time() - step3_start
124
+
125
+ retrieved_count = len(retrieval_results.get('processed_results', []))
126
+ print(f" Step 3 - Retrieval: {timing_details['step3_retrieval']:.3f}s ({retrieved_count} results)")
127
+
128
+ # STEP 4: Generate medical advice (generation.py)
129
+ step4_start = time.time()
130
+
131
+ intention = self._detect_query_intention(query)
132
+ medical_advice_result = self.medical_generator.generate_medical_advice(
133
+ user_query=query,
134
+ retrieval_results=retrieval_results,
135
+ intention=intention
136
+ )
137
+ timing_details['step4_generation'] = time.time() - step4_start
138
+
139
+ print(f" Step 4 - Generation: {timing_details['step4_generation']:.3f}s")
140
+
141
+ total_time = time.time() - overall_start
142
+
143
+ # Extract medical advice output for future model comparison
144
+ medical_advice_text = medical_advice_result.get('medical_advice', '')
145
+ confidence_score = medical_advice_result.get('confidence_score', 0.0)
146
+
147
+ result = {
148
+ "query": query,
149
+ "category": category,
150
+ "total_latency": total_time,
151
+ "timing_details": timing_details,
152
+ "condition_result": condition_result,
153
+ "retrieval_results": retrieval_results,
154
+ "medical_advice_result": medical_advice_result,
155
+ "status": "success",
156
+ "success": True,
157
+ "timestamp": datetime.now().isoformat()
158
+ }
159
+
160
+ # Store medical output separately for model comparison
161
+ medical_output = {
162
+ "query": query,
163
+ "category": category,
164
+ "medical_advice": medical_advice_text,
165
+ "confidence_score": confidence_score,
166
+ "query_id": f"{category}_query",
167
+ "processing_time": total_time,
168
+ "timestamp": datetime.now().isoformat()
169
+ }
170
+
171
+ self.medical_outputs.append(medical_output)
172
+
173
+ print(f"βœ… Query completed successfully in {total_time:.2f}s")
174
+ print(f"πŸ“ Medical advice recorded ({len(medical_advice_text)} characters)")
175
+
176
+ return result
177
+
178
+ except Exception as e:
179
+ total_time = time.time() - overall_start
180
+ print(f"❌ Query failed after {total_time:.2f}s: {e}")
181
+
182
+ return {
183
+ "query": query,
184
+ "category": category,
185
+ "total_latency": total_time,
186
+ "timing_details": timing_details,
187
+ "error": str(e),
188
+ "status": "error",
189
+ "success": False,
190
+ "timestamp": datetime.now().isoformat()
191
+ }
192
+
193
+ def test_individual_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
194
+ """
195
+ Parse queries from file and return them for individual testing
196
+
197
+ Returns categorized queries for separate testing
198
+ """
199
+ print(f"πŸ“ Reading queries from file: {filepath}")
200
+
201
+ try:
202
+ with open(filepath, 'r', encoding='utf-8') as f:
203
+ content = f.read()
204
+
205
+ # Parse queries with category labels
206
+ queries_by_category = {
207
+ "diagnosis": [],
208
+ "treatment": [],
209
+ "mixed": []
210
+ }
211
+
212
+ lines = content.strip().split('\n')
213
+
214
+ for line in lines:
215
+ line = line.strip()
216
+ if not line:
217
+ continue
218
+
219
+ # Parse format: "1.diagnosis: query text"
220
+ match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
221
+ if match:
222
+ category_raw = match.group(1).lower()
223
+ query_text = match.group(2).strip()
224
+
225
+ # Normalize category name
226
+ if category_raw in ['mixed/complicated', 'mixed']:
227
+ category = 'mixed'
228
+ else:
229
+ category = category_raw
230
+
231
+ if category in queries_by_category and len(query_text) > 15:
232
+ queries_by_category[category].append({
233
+ "text": query_text,
234
+ "category": category
235
+ })
236
+
237
+ print(f"πŸ“‹ Parsed queries by category:")
238
+ for category, category_queries in queries_by_category.items():
239
+ print(f" {category.capitalize()}: {len(category_queries)} queries")
240
+ for i, query_info in enumerate(category_queries):
241
+ print(f" {i+1}. {query_info['text'][:60]}...")
242
+
243
+ return queries_by_category
244
+
245
+ except Exception as e:
246
+ print(f"❌ Failed to read file: {e}")
247
+ return {"error": f"Failed to read file: {e}"}
248
+
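The parser above expects one query per line in the form "N.category: query text". A small self-contained check of that format (the sample line is made up):

# Self-contained check of the "N.category: query text" line format expected above.
import re

pattern = re.compile(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', re.IGNORECASE)
sample = "3.mixed/complicated: 20-year-old female with porphyria and sudden seizure."
match = pattern.match(sample)
if match:
    category_raw = match.group(1).lower()   # "mixed/complicated"
    category = "mixed" if category_raw in ("mixed/complicated", "mixed") else category_raw
    print(category, "->", match.group(2))   # mixed -> 20-year-old female with ...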
249
+ def _detect_query_intention(self, query: str) -> str:
250
+ """Simplified query intention detection (from app.py)"""
251
+ query_lower = query.lower()
252
+
253
+ if any(word in query_lower for word in ['diagnos', 'differential', 'possible', 'causes']):
254
+ return 'diagnosis'
255
+ elif any(word in query_lower for word in ['treat', 'manage', 'therapy', 'intervention']):
256
+ return 'treatment'
257
+ else:
258
+ return 'mixed'
259
+
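The keyword rules above can be summarised with a standalone check that mirrors them (inputs are illustrative only):

# Mirror of the intention rules above, for quick verification outside the class.
def detect(query: str) -> str:
    q = query.lower()
    if any(w in q for w in ['diagnos', 'differential', 'possible', 'causes']):
        return 'diagnosis'
    if any(w in q for w in ['treat', 'manage', 'therapy', 'intervention']):
        return 'treatment'
    return 'mixed'

assert detect("What are possible causes of chest pain?") == 'diagnosis'
assert detect("How should this patient be managed?") == 'treatment'
assert detect("55-year-old with syncope and bradycardia") == 'mixed'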
260
+ def save_single_result(self, result: Dict[str, Any], filename: str = None) -> str:
261
+ """Save single query evaluation result"""
262
+ if filename is None:
263
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
264
+ category = result.get('category', 'unknown')
265
+ filename = f"latency_{category}_{timestamp}.json"
266
+
267
+ # Ensure results directory exists
268
+ results_dir = Path(__file__).parent / "results"
269
+ results_dir.mkdir(exist_ok=True)
270
+
271
+ filepath = results_dir / filename
272
+
273
+ with open(filepath, 'w', encoding='utf-8') as f:
274
+ json.dump(result, f, indent=2, ensure_ascii=False)
275
+
276
+ print(f"πŸ’Ύ Result saved to: {filepath}")
277
+ return str(filepath)
278
+
279
+
280
+ # Independent execution interface
281
+ if __name__ == "__main__":
282
+ """Independent test interface for single queries"""
283
+
284
+ print("πŸš€ OnCall.ai Latency Evaluator - Single Query Test Mode")
285
+
286
+ if len(sys.argv) > 1:
287
+ query_file = sys.argv[1]
288
+ else:
289
+ # Default to evaluation/pre_user_query_evaluate.txt
290
+ query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"
291
+
292
+ if not os.path.exists(query_file):
293
+ print(f"❌ Query file not found: {query_file}")
294
+ print("Usage: python latency_evaluator.py [query_file.txt]")
295
+ sys.exit(1)
296
+
297
+ # Initialize evaluator
298
+ evaluator = LatencyEvaluator()
299
+
300
+ # Parse queries from file
301
+ queries_by_category = evaluator.test_individual_queries_from_file(str(query_file))
302
+
303
+ if "error" in queries_by_category:
304
+ print(f"❌ Failed to parse queries: {queries_by_category['error']}")
305
+ sys.exit(1)
306
+
307
+ # Test each category individually
308
+ print(f"\nπŸ§ͺ Individual Query Testing Mode with Result Accumulation")
309
+ print(f"πŸ“ Test each query separately to avoid rate limits")
310
+
311
+ for category, queries in queries_by_category.items():
312
+ if not queries:
313
+ continue
314
+
315
+ print(f"\nπŸ“‚ Testing {category.upper()} queries:")
316
+
317
+ for i, query_info in enumerate(queries):
318
+ query_text = query_info['text']
319
+ print(f"\nπŸ” Query {i+1}/{len(queries)} in {category} category:")
320
+ print(f" Text: {query_text}")
321
+
322
+ # Test single query
323
+ result = evaluator.measure_single_query_latency(query_text, category)
324
+
325
+ # Add to accumulator for chart generation
326
+ evaluator.add_result_to_accumulator(result)
327
+
328
+ # Save individual result
329
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
330
+ filename = f"latency_{category}_query{i+1}_{timestamp}.json"
331
+ saved_path = evaluator.save_single_result(result, filename)
332
+
333
+ # Show summary
334
+ if result.get('success'):
335
+ print(f" βœ… Success: {result['total_latency']:.2f}s total")
336
+ print(f" Breakdown: Extract={result['timing_details']['step1_condition_extraction']:.2f}s, "
337
+ f"Retrieve={result['timing_details']['step3_retrieval']:.2f}s, "
338
+ f"Generate={result['timing_details']['step4_generation']:.2f}s")
339
+ else:
340
+ print(f" ❌ Failed: {result.get('status')} - {result.get('error', 'Unknown error')}")
341
+
342
+ # Pause between queries to avoid rate limits
343
+ if i < len(queries) - 1: # Not the last query in category
344
+ print(f" ⏳ Pausing 5s before next query...")
345
+ time.sleep(5)
346
+
347
+ # Longer pause between categories
348
+ if category != list(queries_by_category.keys())[-1]: # Not the last category
349
+ print(f"\n⏳ Pausing 10s before next category...")
350
+ time.sleep(10)
351
+
352
+ # Generate comprehensive analysis (no charts - pure data)
353
+ print(f"\nπŸ“Š Generating comprehensive statistical summary...")
354
+
355
+ # Calculate category statistics
356
+ final_stats = evaluator.calculate_category_statistics()
357
+
358
+ # Save statistics for chart generation
359
+ stats_path = evaluator.save_statistics_summary()
360
+
361
+ # Save medical outputs for model comparison
362
+ outputs_path = evaluator.save_medical_outputs()
363
+
364
+ # Print final summary
365
+ print(f"\nπŸ“Š === FINAL LATENCY ANALYSIS SUMMARY ===")
366
+ category_results = final_stats['category_results']
367
+ overall_results = final_stats['overall_results']
368
+
369
+ print(f"Overall Performance:")
370
+ print(f" Average Latency: {overall_results['average_latency']:.2f}s (Β±{overall_results['std_deviation']:.2f})")
371
+ print(f" Success Rate: {overall_results['successful_queries']}/{overall_results['total_queries']}")
372
+ print(f" 30s Target Compliance: {overall_results['target_compliance']:.1%}")
373
+
374
+ print(f"\nCategory Breakdown:")
375
+ for category, stats in category_results.items():
376
+ if stats['query_count'] > 0:
377
+ print(f" {category.capitalize()}: {stats['average_latency']:.2f}s (Β±{stats['std_deviation']:.2f}) [{stats['query_count']} queries]")
378
+
379
+ print(f"\nβœ… Data collection complete! Files saved:")
380
+ print(f" πŸ“Š Statistics: {stats_path}")
381
+ print(f" πŸ“ Medical Outputs: {outputs_path}")
382
+ print(f" πŸ“ Individual results: {Path(__file__).parent / 'results'}")
383
+ print(f"\nπŸ’‘ Next step: Run latency_chart_generator.py to create visualizations")
384
+
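latency_chart_generator.py itself is not part of this commit; below is a hypothetical sketch of how it could consume the statistics file saved above (matplotlib assumed available, run from the project root, and at least one latency_statistics_*.json present).

# Hypothetical chart-generation sketch; keys match latency_statistics_*.json above.
import json
from pathlib import Path
import matplotlib.pyplot as plt

results_dir = Path("evaluation/results")
latest = sorted(results_dir.glob("latency_statistics_*.json"))[-1]  # assumes at least one file exists

with open(latest, "r", encoding="utf-8") as f:
    stats = json.load(f)

categories = list(stats["category_results"].keys())
means = [stats["category_results"][c]["average_latency"] for c in categories]
stds = [stats["category_results"][c]["std_deviation"] for c in categories]

plt.bar(categories, means, yerr=stds, capsize=4)
plt.axhline(30.0, linestyle="--", label="30s target")
plt.ylabel("Latency (s)")
plt.title("Latency by query category")
plt.legend()
plt.savefig(results_dir / "latency_by_category.png", dpi=150)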
385
+ def add_result_to_accumulator(self, result: Dict[str, Any]):
386
+ """Add successful result to category accumulator"""
387
+ if result.get('success') and result.get('category') in self.accumulated_results:
388
+ category = result['category']
389
+ self.accumulated_results[category].append(result)
390
+ print(f"πŸ“Š Added result to {category} category. Total: {len(self.accumulated_results[category])}")
391
+
392
+ def save_statistics_summary(self, filename: str = None) -> str:
393
+ """Save statistical summary for chart generation"""
394
+ stats = self.calculate_category_statistics()
395
+
396
+ if filename is None:
397
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
398
+ filename = f"latency_statistics_{timestamp}.json"
399
+
400
+ # Ensure results directory exists
401
+ results_dir = Path(__file__).parent / "results"
402
+ results_dir.mkdir(exist_ok=True)
403
+
404
+ filepath = results_dir / filename
405
+
406
+ with open(filepath, 'w', encoding='utf-8') as f:
407
+ json.dump(stats, f, indent=2, ensure_ascii=False)
408
+
409
+ print(f"πŸ“Š Statistics saved to: {filepath}")
410
+ return str(filepath)
411
+
412
+ def save_medical_outputs(self, filename: str = None) -> str:
413
+ """Save medical advice outputs for model comparison"""
414
+ if filename is None:
415
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
416
+ filename = f"medical_outputs_{timestamp}.json"
417
+
418
+ # Ensure results directory exists
419
+ results_dir = Path(__file__).parent / "results"
420
+ results_dir.mkdir(exist_ok=True)
421
+
422
+ filepath = results_dir / filename
423
+
424
+ # Create comprehensive output data
425
+ output_data = {
426
+ "evaluation_metadata": {
427
+ "total_outputs": len(self.medical_outputs),
428
+ "categories": list(set(output['category'] for output in self.medical_outputs)),
429
+ "timestamp": datetime.now().isoformat(),
430
+ "model_type": "Med42-70B_RAG_enhanced" # For future comparison
431
+ },
432
+ "medical_outputs": self.medical_outputs
433
+ }
434
+
435
+ with open(filepath, 'w', encoding='utf-8') as f:
436
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
437
+
438
+ print(f"πŸ“ Medical outputs saved to: {filepath}")
439
+ print(f" Total outputs: {len(self.medical_outputs)}")
440
+ print(f" Categories: {', '.join(set(output['category'] for output in self.medical_outputs))}")
441
+
442
+ return str(filepath)
443
+
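The saved outputs are meant for later model comparison; one hypothetical way to consume two such files side by side (the baseline filename below is an assumption):

# Hypothetical comparison of two medical_outputs_*.json files by query text.
import json

def load_outputs(path: str) -> dict:
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return {o["query"]: o for o in data["medical_outputs"]}

rag = load_outputs("evaluation/results/medical_outputs_20250804_153000.json")  # hypothetical filename
baseline = load_outputs("evaluation/results/medical_outputs_baseline.json")    # hypothetical filename

for query, rag_rec in rag.items():
    base_rec = baseline.get(query)
    if base_rec:
        print(f"Q: {query[:60]}...")
        print(f"  RAG: {len(rag_rec['medical_advice'])} chars, confidence {rag_rec['confidence_score']:.2f}")
        print(f"  Baseline: {len(base_rec['medical_advice'])} chars")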
444
+ def calculate_category_statistics(self) -> Dict[str, Any]:
445
+ """Calculate statistics for each category and overall"""
446
+ category_stats = {}
447
+ all_successful_latencies = []
448
+
449
+ for category, results in self.accumulated_results.items():
450
+ latencies = [r['total_latency'] for r in results if r.get('success')]
451
+
452
+ if latencies:
453
+ category_stats[category] = {
454
+ "average_latency": sum(latencies) / len(latencies),
455
+ "std_deviation": self._calculate_std(latencies),
456
+ "min_latency": min(latencies),
457
+ "max_latency": max(latencies),
458
+ "query_count": len(latencies),
459
+ "individual_latencies": latencies
460
+ }
461
+ all_successful_latencies.extend(latencies)
462
+ else:
463
+ category_stats[category] = {
464
+ "average_latency": 0.0,
465
+ "std_deviation": 0.0,
466
+ "min_latency": 0.0,
467
+ "max_latency": 0.0,
468
+ "query_count": 0,
469
+ "individual_latencies": []
470
+ }
471
+
472
+ # Calculate overall statistics
473
+ overall_stats = {
474
+ "average_latency": sum(all_successful_latencies) / len(all_successful_latencies) if all_successful_latencies else 0.0,
475
+ "std_deviation": self._calculate_std(all_successful_latencies),
476
+ "min_latency": min(all_successful_latencies) if all_successful_latencies else 0.0,
477
+ "max_latency": max(all_successful_latencies) if all_successful_latencies else 0.0,
478
+ "total_queries": sum(len(results) for results in self.accumulated_results.values()),
479
+ "successful_queries": len(all_successful_latencies),
480
+ "target_compliance": sum(1 for lat in all_successful_latencies if lat <= 30.0) / len(all_successful_latencies) if all_successful_latencies else 0.0
481
+ }
482
+
483
+ return {
484
+ "category_results": category_stats,
485
+ "overall_results": overall_stats,
486
+ "timestamp": datetime.now().isoformat()
487
+ }
488
+
489
+ def _calculate_std(self, values: List[float]) -> float:
490
+ """Calculate standard deviation"""
491
+ if len(values) < 2:
492
+ return 0.0
493
+
494
+ mean = sum(values) / len(values)
495
+ variance = sum((x - mean) ** 2 for x in values) / len(values)
496
+ return variance ** 0.5
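Note that the helper divides by N rather than N-1, i.e. it computes the population standard deviation; a quick check against the standard library:

# Population standard deviation (divide by N) matches statistics.pstdev.
import statistics

values = [12.4, 15.1, 9.8, 13.3]
mean = sum(values) / len(values)
pop_std = (sum((x - mean) ** 2 for x in values) / len(values)) ** 0.5
assert abs(pop_std - statistics.pstdev(values)) < 1e-9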
evaluation/pre_user_query_evaluate.txt ADDED
@@ -0,0 +1,5 @@
1
+ 1.diagnosis: 60-year-old patient with a history of hypertension and sudden chest pain. What are the possible causes and how should I assess them?
2
+
3
+ 2.treatment: Suspected acute ischemic stroke. Tell me the next steps to take.
4
+
5
+ 3.mixed/complicated: 20 y/f, porphyria, sudden seizure. What are the possible causes and the complete management workflow?
evaluation/relevance_evaluator.py ADDED
@@ -0,0 +1,447 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OnCall.ai System - Retrieval Relevance Evaluator (Metric 3)
4
+ ===========================================================
5
+
6
+ Evaluates retrieval relevance using cosine similarity from retrieval.py
7
+ Automatic evaluation based on existing similarity scores with optional LLM sampling
8
+
9
+ Author: YanBo Chen
10
+ Date: 2025-08-04
11
+ """
12
+
13
+ import json
14
+ import os
15
+ import sys
16
+ from typing import Dict, List, Any
17
+ from datetime import datetime
18
+ from pathlib import Path
19
+ import re
20
+ import numpy as np
21
+
22
+ # Add project path
23
+ current_dir = Path(__file__).parent
24
+ project_root = current_dir.parent
25
+ src_dir = project_root / "src"
26
+ sys.path.insert(0, str(src_dir))
27
+
28
+ # Import existing system components
29
+ try:
30
+ from user_prompt import UserPromptProcessor
31
+ from retrieval import BasicRetrievalSystem
32
+ from llm_clients import llm_Med42_70BClient
33
+ except ImportError as e:
34
+ print(f"❌ Import failed: {e}")
35
+ print("Please ensure running from project root directory")
36
+ sys.exit(1)
37
+
38
+
39
+ class RelevanceEvaluator:
40
+ """Retrieval relevance evaluator using cosine similarity - automatic evaluation"""
41
+
42
+ def __init__(self):
43
+ """Initialize system components for relevance testing"""
44
+ print("πŸ”§ Initializing Relevance Evaluator...")
45
+
46
+ # Initialize required components
47
+ self.llm_client = llm_Med42_70BClient()
48
+ self.retrieval_system = BasicRetrievalSystem()
49
+ self.user_prompt_processor = UserPromptProcessor(
50
+ llm_client=self.llm_client,
51
+ retrieval_system=self.retrieval_system
52
+ )
53
+
54
+ # Results accumulation
55
+ self.relevance_results = []
56
+
57
+ print("βœ… Relevance Evaluator initialization complete")
58
+
59
+ def evaluate_single_relevance(self, query: str, category: str = "unknown") -> Dict[str, Any]:
60
+ """
61
+ Evaluate retrieval relevance for a single query
62
+
63
+ Uses existing cosine similarity scores from retrieval.py
64
+
65
+ Args:
66
+ query: Medical query to test
67
+ category: Query category (diagnosis/treatment/mixed)
68
+ """
69
+ print(f"πŸ” Testing relevance for: {query[:50]}...")
70
+ print(f"πŸ“‹ Category: {category}")
71
+
72
+ try:
73
+ # Step 1: Extract condition for search query construction
74
+ condition_result = self.user_prompt_processor.extract_condition_keywords(query)
75
+
76
+ # Step 2: Perform retrieval (same as latency_evaluator.py)
77
+ search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
78
+ if not search_query:
79
+ search_query = condition_result.get('condition', query)
80
+
81
+ retrieval_start = datetime.now()
82
+ retrieval_results = self.retrieval_system.search(search_query, top_k=5)
83
+ retrieval_time = (datetime.now() - retrieval_start).total_seconds()
84
+
85
+ # Step 3: Extract similarity scores from retrieval results
86
+ processed_results = retrieval_results.get('processed_results', [])
87
+
88
+ if not processed_results:
89
+ result = {
90
+ "query": query,
91
+ "category": category,
92
+ "search_query": search_query,
93
+ "retrieval_success": False,
94
+ "average_relevance": 0.0,
95
+ "relevance_scores": [],
96
+ "retrieved_count": 0,
97
+ "retrieval_time": retrieval_time,
98
+ "error": "No retrieval results",
99
+ "timestamp": datetime.now().isoformat()
100
+ }
101
+
102
+ self.relevance_results.append(result)
103
+ print(f" ❌ No retrieval results found")
104
+ return result
105
+
106
+ # Extract cosine similarity scores
107
+ similarity_scores = []
108
+ retrieval_details = []
109
+
110
+ for i, doc_result in enumerate(processed_results):
111
+ # Get similarity score (may be stored as 'distance', 'similarity_score', or 'score')
112
+ similarity = (
113
+ doc_result.get('distance', 0.0) or
114
+ doc_result.get('similarity_score', 0.0) or
115
+ doc_result.get('score', 0.0)
116
+ )
117
+
118
+ similarity_scores.append(similarity)
119
+
120
+ retrieval_details.append({
121
+ "doc_index": i,
122
+ "similarity_score": similarity,
123
+ "content_snippet": doc_result.get('content', '')[:100] + "...",
124
+ "doc_type": doc_result.get('type', 'unknown'),
125
+ "source": doc_result.get('source', 'unknown')
126
+ })
127
+
128
+ # Calculate relevance metrics
129
+ average_relevance = sum(similarity_scores) / len(similarity_scores)
130
+ max_relevance = max(similarity_scores)
131
+ min_relevance = min(similarity_scores)
132
+
133
+ # Count high-relevance results (threshold: 0.2 based on evaluation_instruction.md)
134
+ high_relevance_count = sum(1 for score in similarity_scores if score >= 0.2)
135
+ high_relevance_ratio = high_relevance_count / len(similarity_scores)
136
+
137
+ result = {
138
+ "query": query,
139
+ "category": category,
140
+ "search_query": search_query,
141
+ "retrieval_success": True,
142
+ "average_relevance": average_relevance,
143
+ "max_relevance": max_relevance,
144
+ "min_relevance": min_relevance,
145
+ "relevance_scores": similarity_scores,
146
+ "high_relevance_count": high_relevance_count,
147
+ "high_relevance_ratio": high_relevance_ratio,
148
+ "retrieved_count": len(processed_results),
149
+ "retrieval_time": retrieval_time,
150
+ "retrieval_details": retrieval_details,
151
+ "meets_threshold": average_relevance >= 0.2,
152
+ "timestamp": datetime.now().isoformat()
153
+ }
154
+
155
+ # Store result
156
+ self.relevance_results.append(result)
157
+
158
+ print(f" βœ… Retrieval: {len(processed_results)} documents")
159
+ print(f" πŸ“Š Average Relevance: {average_relevance:.3f}")
160
+ print(f" πŸ“ˆ High Relevance (β‰₯0.2): {high_relevance_count}/{len(processed_results)} ({high_relevance_ratio:.1%})")
161
+ print(f" 🎯 Threshold: {'βœ… Met' if result['meets_threshold'] else '❌ Not Met'}")
162
+ print(f" ⏱️ Retrieval Time: {retrieval_time:.3f}s")
163
+
164
+ return result
165
+
166
+ except Exception as e:
167
+ error_result = {
168
+ "query": query,
169
+ "category": category,
170
+ "retrieval_success": False,
171
+ "average_relevance": 0.0,
172
+ "error": str(e),
173
+ "timestamp": datetime.now().isoformat()
174
+ }
175
+
176
+ self.relevance_results.append(error_result)
177
+ print(f" ❌ Relevance evaluation failed: {e}")
178
+
179
+ return error_result
180
+
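For background, the relevance figures consumed above are cosine similarities produced by the retrieval stage. Independent of how retrieval.py stores them ('distance', 'similarity_score' or 'score'), cosine similarity between two embedding vectors is computed as below (the vectors are invented for illustration).

# Reference implementation of cosine similarity (illustrative vectors only).
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity in [-1, 1]; higher means more relevant."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

query_vec = np.array([0.12, 0.48, 0.31])
doc_vec = np.array([0.10, 0.52, 0.29])
print(f"cosine similarity: {cosine_similarity(query_vec, doc_vec):.3f}")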
181
+ def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
182
+ """Parse queries from file with category labels"""
183
+ print(f"πŸ“ Reading queries from file: {filepath}")
184
+
185
+ try:
186
+ with open(filepath, 'r', encoding='utf-8') as f:
187
+ content = f.read()
188
+
189
+ # Parse queries with category labels
190
+ queries_by_category = {
191
+ "diagnosis": [],
192
+ "treatment": [],
193
+ "mixed": []
194
+ }
195
+
196
+ lines = content.strip().split('\n')
197
+
198
+ for line in lines:
199
+ line = line.strip()
200
+ if not line:
201
+ continue
202
+
203
+ # Parse format: "1.diagnosis: query text"
204
+ match = re.match(r'^\d+\.(diagnosis|treatment|mixed/complicated|mixed):\s*(.+)', line, re.IGNORECASE)
205
+ if match:
206
+ category_raw = match.group(1).lower()
207
+ query_text = match.group(2).strip()
208
+
209
+ # Normalize category name
210
+ if category_raw in ['mixed/complicated', 'mixed']:
211
+ category = 'mixed'
212
+ else:
213
+ category = category_raw
214
+
215
+ if category in queries_by_category and len(query_text) > 15:
216
+ queries_by_category[category].append({
217
+ "text": query_text,
218
+ "category": category
219
+ })
220
+
221
+ print(f"πŸ“‹ Parsed queries by category:")
222
+ for category, category_queries in queries_by_category.items():
223
+ print(f" {category.capitalize()}: {len(category_queries)} queries")
224
+
225
+ return queries_by_category
226
+
227
+ except Exception as e:
228
+ print(f"❌ Failed to read file: {e}")
229
+ return {"error": f"Failed to read file: {e}"}
230
+
231
+ def calculate_relevance_statistics(self) -> Dict[str, Any]:
232
+ """Calculate relevance statistics by category"""
233
+ category_stats = {}
234
+ all_successful_results = []
235
+
236
+ # Group results by category
237
+ results_by_category = {
238
+ "diagnosis": [],
239
+ "treatment": [],
240
+ "mixed": []
241
+ }
242
+
243
+ for result in self.relevance_results:
244
+ category = result.get('category', 'unknown')
245
+ if category in results_by_category:
246
+ results_by_category[category].append(result)
247
+ if result.get('retrieval_success'):
248
+ all_successful_results.append(result)
249
+
250
+ # Calculate statistics for each category
251
+ for category, results in results_by_category.items():
252
+ successful_results = [r for r in results if r.get('retrieval_success')]
253
+
254
+ if successful_results:
255
+ avg_relevance = sum(r['average_relevance'] for r in successful_results) / len(successful_results)
256
+ relevance_scores = [r['average_relevance'] for r in successful_results]
257
+ avg_retrieval_time = sum(r.get('retrieval_time', 0) for r in successful_results) / len(successful_results)
258
+
259
+ category_stats[category] = {
260
+ "average_relevance": avg_relevance,
261
+ "max_relevance": max(relevance_scores),
262
+ "min_relevance": min(relevance_scores),
263
+ "successful_retrievals": len(successful_results),
264
+ "total_queries": len(results),
265
+ "success_rate": len(successful_results) / len(results),
266
+ "average_retrieval_time": avg_retrieval_time,
267
+ "meets_threshold": avg_relevance >= 0.2,
268
+ "individual_relevance_scores": relevance_scores
269
+ }
270
+ else:
271
+ category_stats[category] = {
272
+ "average_relevance": 0.0,
273
+ "max_relevance": 0.0,
274
+ "min_relevance": 0.0,
275
+ "successful_retrievals": 0,
276
+ "total_queries": len(results),
277
+ "success_rate": 0.0,
278
+ "average_retrieval_time": 0.0,
279
+ "meets_threshold": False,
280
+ "individual_relevance_scores": []
281
+ }
282
+
283
+ # Calculate overall statistics
284
+ if all_successful_results:
285
+ all_relevance_scores = [r['average_relevance'] for r in all_successful_results]
286
+ overall_stats = {
287
+ "average_relevance": sum(all_relevance_scores) / len(all_relevance_scores),
288
+ "max_relevance": max(all_relevance_scores),
289
+ "min_relevance": min(all_relevance_scores),
290
+ "successful_retrievals": len(all_successful_results),
291
+ "total_queries": len(self.relevance_results),
292
+ "success_rate": len(all_successful_results) / len(self.relevance_results),
293
+ "meets_threshold": (sum(all_relevance_scores) / len(all_relevance_scores)) >= 0.2,
294
+ "target_compliance": (sum(all_relevance_scores) / len(all_relevance_scores)) >= 0.25
295
+ }
296
+ else:
297
+ overall_stats = {
298
+ "average_relevance": 0.0,
299
+ "max_relevance": 0.0,
300
+ "min_relevance": 0.0,
301
+ "successful_retrievals": 0,
302
+ "total_queries": len(self.relevance_results),
303
+ "success_rate": 0.0,
304
+ "meets_threshold": False,
305
+ "target_compliance": False
306
+ }
307
+
308
+ return {
309
+ "category_results": category_stats,
310
+ "overall_results": overall_stats,
311
+ "timestamp": datetime.now().isoformat()
312
+ }
313
+
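A small numeric illustration of the two cut-offs applied above, 0.2 as the relevance threshold and 0.25 as the stricter target (scores invented):

# Illustration of the 0.2 threshold and 0.25 target used above (invented scores).
scores = [0.31, 0.18, 0.27, 0.22]
average = sum(scores) / len(scores)
print(f"average={average:.3f}, "
      f"meets_threshold={average >= 0.2}, "
      f"target_compliance={average >= 0.25}")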
314
+ def save_relevance_statistics(self, filename: str = None) -> str:
315
+ """Save relevance statistics for chart generation"""
316
+ stats = self.calculate_relevance_statistics()
317
+
318
+ if filename is None:
319
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
320
+ filename = f"relevance_statistics_{timestamp}.json"
321
+
322
+ # Ensure results directory exists
323
+ results_dir = Path(__file__).parent / "results"
324
+ results_dir.mkdir(exist_ok=True)
325
+
326
+ filepath = results_dir / filename
327
+
328
+ with open(filepath, 'w', encoding='utf-8') as f:
329
+ json.dump(stats, f, indent=2, ensure_ascii=False)
330
+
331
+ print(f"πŸ“Š Relevance statistics saved to: {filepath}")
332
+ return str(filepath)
333
+
334
+ def save_relevance_details(self, filename: str = None) -> str:
335
+ """Save detailed relevance results"""
336
+ if filename is None:
337
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
338
+ filename = f"relevance_details_{timestamp}.json"
339
+
340
+ # Ensure results directory exists
341
+ results_dir = Path(__file__).parent / "results"
342
+ results_dir.mkdir(exist_ok=True)
343
+
344
+ filepath = results_dir / filename
345
+
346
+ # Create comprehensive relevance data
347
+ relevance_data = {
348
+ "evaluation_metadata": {
349
+ "total_queries": len(self.relevance_results),
350
+ "successful_retrievals": len([r for r in self.relevance_results if r.get('retrieval_success')]),
351
+ "timestamp": datetime.now().isoformat(),
352
+ "evaluator_type": "retrieval_relevance",
353
+ "threshold_used": 0.2
354
+ },
355
+ "relevance_results": self.relevance_results
356
+ }
357
+
358
+ with open(filepath, 'w', encoding='utf-8') as f:
359
+ json.dump(relevance_data, f, indent=2, ensure_ascii=False)
360
+
361
+ print(f"πŸ“ Relevance details saved to: {filepath}")
362
+ return str(filepath)
363
+
364
+
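Besides the CLI entry point below, the evaluator can be driven programmatically; a minimal sketch, assuming the file is importable as relevance_evaluator from the evaluation/ directory:

# Programmatic usage sketch (alternative to the CLI entry point below).
from relevance_evaluator import RelevanceEvaluator  # assumes evaluation/ is on sys.path

evaluator = RelevanceEvaluator()
evaluator.evaluate_single_relevance(
    "Suspected acute ischemic stroke. What are the next steps to take?", "treatment"
)
stats_path = evaluator.save_relevance_statistics()
details_path = evaluator.save_relevance_details()
print(stats_path, details_path)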
365
+ # Independent execution interface
366
+ if __name__ == "__main__":
367
+ """Independent relevance evaluation interface"""
368
+
369
+ print("πŸ“Š OnCall.ai Relevance Evaluator - Retrieval Relevance Analysis")
370
+
371
+ if len(sys.argv) > 1:
372
+ query_file = sys.argv[1]
373
+ else:
374
+ # Default to evaluation/pre_user_query_evaluate.txt
375
+ query_file = Path(__file__).parent / "pre_user_query_evaluate.txt"
376
+
377
+ if not os.path.exists(query_file):
378
+ print(f"❌ Query file not found: {query_file}")
379
+ print("Usage: python relevance_evaluator.py [query_file.txt]")
380
+ sys.exit(1)
381
+
382
+ # Initialize evaluator
383
+ evaluator = RelevanceEvaluator()
384
+
385
+ # Parse queries from file
386
+ queries_by_category = evaluator.parse_queries_from_file(str(query_file))
387
+
388
+ if "error" in queries_by_category:
389
+ print(f"❌ Failed to parse queries: {queries_by_category['error']}")
390
+ sys.exit(1)
391
+
392
+ # Test relevance for each query
393
+ print(f"\nπŸ§ͺ Retrieval Relevance Testing")
394
+
395
+ for category, queries in queries_by_category.items():
396
+ if not queries:
397
+ continue
398
+
399
+ print(f"\nπŸ“‚ Testing {category.upper()} relevance:")
400
+
401
+ for i, query_info in enumerate(queries):
402
+ query_text = query_info['text']
403
+
404
+ # Test relevance
405
+ result = evaluator.evaluate_single_relevance(query_text, category)
406
+
407
+ # Pause between queries to avoid rate limits
408
+ if i < len(queries) - 1:
409
+ print(f" ⏳ Pausing 3s before next query...")
410
+ import time
411
+ time.sleep(3)
412
+
413
+ # Pause between categories
414
+ if category != list(queries_by_category.keys())[-1]:
415
+ print(f"\n⏳ Pausing 5s before next category...")
416
+ import time
417
+ time.sleep(5)
418
+
419
+ # Generate and save results
420
+ print(f"\nπŸ“Š Generating relevance analysis...")
421
+
422
+ # Save statistics and details
423
+ stats_path = evaluator.save_relevance_statistics()
424
+ details_path = evaluator.save_relevance_details()
425
+
426
+ # Print final summary
427
+ stats = evaluator.calculate_relevance_statistics()
428
+ category_results = stats['category_results']
429
+ overall_results = stats['overall_results']
430
+
431
+ print(f"\nπŸ“Š === RELEVANCE EVALUATION SUMMARY ===")
432
+ print(f"Overall Performance:")
433
+ print(f" Average Relevance: {overall_results['average_relevance']:.3f}")
434
+ print(f" Retrieval Success Rate: {overall_results['success_rate']:.1%}")
435
+ print(f" 0.2 Threshold: {'βœ… Met' if overall_results['meets_threshold'] else '❌ Not Met'}")
436
+ print(f" 0.25 Target: {'βœ… Met' if overall_results['target_compliance'] else '❌ Not Met'}")
437
+
438
+ print(f"\nCategory Breakdown:")
439
+ for category, cat_stats in category_results.items():
440
+ if cat_stats['total_queries'] > 0:
441
+ print(f" {category.capitalize()}: {cat_stats['average_relevance']:.3f} "
442
+ f"({cat_stats['successful_retrievals']}/{cat_stats['total_queries']}) "
443
+ f"[{cat_stats['average_retrieval_time']:.3f}s avg]")
444
+
445
+ print(f"\nβœ… Relevance evaluation complete!")
446
+ print(f"πŸ“Š Statistics: {stats_path}")
447
+ print(f"πŸ“ Details: {details_path}")