YanBoChen committed
Commit 3edd46d · 1 Parent(s): 24f6a16

Enhance direct LLM evaluation with retry mechanism for 504 timeouts and improved guidance format

Files changed (1):
  1. evaluation/direct_llm_evaluator.py +212 -111
evaluation/direct_llm_evaluator.py CHANGED
@@ -20,6 +20,7 @@ from typing import Dict, List, Any
 from datetime import datetime
 from pathlib import Path
 import re
+from huggingface_hub import InferenceClient
 
 # Add project path
 current_dir = Path(__file__).parent
@@ -54,7 +55,7 @@ class DirectLLMEvaluator:
 
     def evaluate_direct_llm_query(self, query: str, category: str = "unknown") -> Dict[str, Any]:
         """
-        Direct LLM evaluation for single query
+        Direct LLM evaluation for single query with retry mechanism for 504 timeouts
 
         Only tests direct LLM response without RAG pipeline
         Applicable metrics: 1 (Latency), 5-6 (via medical output)
@@ -68,123 +69,223 @@
 
         overall_start = time.time()
 
-        try:
-            # Direct LLM call without any RAG processing
-            llm_start = time.time()
-
-            # Create direct medical consultation prompt
-            direct_prompt = f"""
-            You are a medical expert providing clinical guidance.
-
-            Patient Query: {query}
-
-            Please provide comprehensive medical advice including:
-            1. Differential diagnosis (if applicable)
-            2. Immediate assessment steps
-            3. Treatment recommendations
-            4. Clinical considerations
-
-            Provide evidence-based, actionable medical guidance.
-            """
-
-            # Direct LLM generation (same parameters as RAG system for fair comparison)
-            response = self.llm_client.analyze_medical_query(
-                query=direct_prompt,
-                max_tokens=1600,  # Same as RAG system primary setting
-                timeout=60.0  # Increased timeout for stable evaluation
-            )
-            # Extract medical advice from response (Med42 client returns dict with 'raw_response')
-            if isinstance(response, dict):
-                medical_advice = response.get('raw_response', '') or response.get('content', '')
-            else:
-                medical_advice = str(response)
-
-            llm_time = time.time() - llm_start
-            total_time = time.time() - overall_start
-
-            # Check if response is valid (not empty) - focus on content, not timeout
-            if not medical_advice or len(medical_advice.strip()) == 0:
-                print(f"❌ Direct LLM returned empty response after {total_time:.2f}s")
-                raise ValueError("Empty response from LLM - no content generated")
-
-            # Create result
-            result = {
-                "query": query,
-                "category": category,
-
-                # Metric 1: Total Latency (direct LLM call time)
-                "latency_metrics": {
-                    "total_latency": total_time,
-                    "llm_generation_time": llm_time,
-                    "meets_target": total_time <= 60.0
-                },
-
-                # Metrics 2-4: Not applicable for direct LLM
-                "extraction_metrics": {
-                    "not_applicable": True,
-                    "reason": "No extraction pipeline in direct LLM"
-                },
-                "relevance_metrics": {
-                    "not_applicable": True,
-                    "reason": "No retrieval pipeline in direct LLM"
-                },
-                "coverage_metrics": {
-                    "not_applicable": True,
-                    "reason": "No retrieval content to cover"
-                },
-
-                # Medical advice for metrics 5-6 evaluation
-                "medical_advice": medical_advice,
-                "advice_length": len(medical_advice),
-
-                "overall_success": True,
-                "model_type": "Med42-70B_direct",
-                "timestamp": datetime.now().isoformat()
-            }
-
-            # Store result
-            self.direct_results.append(result)
-
-            # Store medical output for LLM judge evaluation
-            medical_output = {
-                "query": query,
-                "category": category,
-                "medical_advice": medical_advice,
-                "query_id": f"{category}_query_direct",
-                "model_type": "Med42-70B_direct",
-                "processing_time": total_time,
-                "timestamp": datetime.now().isoformat()
-            }
-            self.medical_outputs.append(medical_output)
-
-            print(f"✅ Direct LLM completed in {total_time:.2f}s")
-            print(f"📝 Generated advice: {len(medical_advice)} characters")
-
-            return result
-
-        except Exception as e:
-            total_time = time.time() - overall_start
-            print(f"❌ Direct LLM evaluation failed after {total_time:.2f}s: {e}")
-
-            error_result = {
-                "query": query,
-                "category": category,
-                "latency_metrics": {
-                    "total_latency": total_time,
-                    "meets_target": False
-                },
-                "overall_success": False,
-                "error": str(e),
-                "model_type": "Med42-70B_direct",
-                "timestamp": datetime.now().isoformat()
-            }
-
-            self.direct_results.append(error_result)
-
-            # Do NOT add failed queries to medical_outputs for judge evaluation
-            # Only successful queries with valid medical advice should be evaluated
-
-            return error_result
+        # Retry configuration
+        max_retries = 3
+        retry_delay = 30  # seconds
+        base_timeout = 120.0  # Increased base timeout for complex medical advice generation
+
+        for attempt in range(max_retries):
+            try:
+                print(f" 🔄 Attempt {attempt + 1}/{max_retries}")
+
+                # Direct LLM call without any RAG processing
+                llm_start = time.time()
+
+                # Create direct medical consultation prompt (matching generation.py format)
+                direct_prompt = f"""You are an experienced attending physician providing guidance to a junior clinician in an emergency setting. A colleague is asking for your expert medical opinion.
+
+Clinical Question:
+{query}
+
+Instructions:
+Provide comprehensive medical guidance covering both diagnostic and treatment aspects as appropriate.
+
+Provide guidance with:
+• Prioritize your medical knowledge and clinical experience
+• Use numbered points (1. 2. 3.) for key steps
+• Line breaks between major sections
+• Highlight medications with dosages and routes
+• Emphasize clinical judgment for individual patient factors
+
+IMPORTANT: Keep response under 1000 words. Use concise numbered points. For complex cases with multiple conditions, address the most urgent condition first, then relevant comorbidities. Prioritize actionable clinical steps over theoretical explanations.
+
+Your response should provide practical clinical guidance suitable for immediate bedside application with appropriate medical caution."""
+
+                # Direct LLM generation with extended timeout - bypass analyze_medical_query to avoid system prompt conflict
+                current_timeout = 120.0 + (attempt * 60)  # 120s, 180s, 240s
+                print(f" ⏱️ Using read timeout = {current_timeout}s")
+
+                # Create a new client with appropriate timeout for this attempt
+                hf_token = os.getenv('HF_TOKEN')
+                if not hf_token:
+                    raise ValueError("HF_TOKEN not found in environment variables")
+
+                timeout_client = InferenceClient(
+                    provider="featherless-ai",
+                    api_key=hf_token,
+                    timeout=current_timeout
+                )
+
+                # Call LLM directly to avoid system prompt conflicts
+                response = timeout_client.chat.completions.create(
+                    model="m42-health/Llama3-Med42-70B",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": direct_prompt  # Our complete prompt as user message
+                        }
+                    ],
+                    max_tokens=1600,
+                    temperature=0.1  # Low temperature for consistent medical advice
+                )
+                # Extract medical advice from direct API response (not Med42 client wrapper)
+                medical_advice = response.choices[0].message.content or ""
+
+                llm_time = time.time() - llm_start
+                total_time = time.time() - overall_start
+
+                # Check if response is valid (not empty)
+                if not medical_advice or len(medical_advice.strip()) == 0:
+                    raise ValueError("Empty response from LLM - no content generated")
+
+                # Success - create result and return
+                if attempt > 0:
+                    print(f" ✅ Succeeded on attempt {attempt + 1}")
+
+                result = {
+                    "query": query,
+                    "category": category,
+
+                    # Metric 1: Total Latency (direct LLM call time)
+                    "latency_metrics": {
+                        "total_latency": total_time,
+                        "llm_generation_time": llm_time,
+                        "meets_target": total_time <= 60.0,
+                        "attempts_needed": attempt + 1
+                    },
+
+                    # Metrics 2-4: Not applicable for direct LLM
+                    "extraction_metrics": {
+                        "not_applicable": True,
+                        "reason": "No extraction pipeline in direct LLM"
+                    },
+                    "relevance_metrics": {
+                        "not_applicable": True,
+                        "reason": "No retrieval pipeline in direct LLM"
+                    },
+                    "coverage_metrics": {
+                        "not_applicable": True,
+                        "reason": "No retrieval content to cover"
+                    },
+
+                    # Medical advice for metrics 5-6 evaluation
+                    "medical_advice": medical_advice,
+                    "advice_length": len(medical_advice),
+
+                    "overall_success": True,
+                    "model_type": "Med42-70B_direct",
+                    "timestamp": datetime.now().isoformat()
+                }
+
+                # Store result
+                self.direct_results.append(result)
+
+                # Store medical output for LLM judge evaluation
+                medical_output = {
+                    "query": query,
+                    "category": category,
+                    "medical_advice": medical_advice,
+                    "query_id": f"{category}_query_direct",
+                    "model_type": "Med42-70B_direct",
+                    "processing_time": total_time,
+                    "timestamp": datetime.now().isoformat()
+                }
+                self.medical_outputs.append(medical_output)
+
+                print(f"✅ Direct LLM completed in {total_time:.2f}s")
+                print(f"📝 Generated advice: {len(medical_advice)} characters")
+
+                return result
+
+            except Exception as e:
+                error_str = str(e)
+
+                # CRITICAL: Check for timeout/connectivity errors FIRST (before any response processing)
+                if any(keyword in error_str.lower() for keyword in ['504', 'timeout', 'gateway', 'connection', 'time-out', 'empty response']):
+                    if attempt < max_retries - 1:
+                        print(f" ⏳ Timeout/connectivity/empty response error detected, retrying in {retry_delay}s...")
+                        print(f" Error: {error_str}")
+                        time.sleep(retry_delay)
+                        continue  # Continue to next retry attempt
+                    else:
+                        print(f" ❌ All {max_retries} attempts failed with timeouts/empty responses")
+                        total_time = time.time() - overall_start
+                        return self._create_timeout_failure_result(query, category, error_str, total_time)
+                else:
+                    # Non-timeout error (e.g., ValueError for empty response), don't retry
+                    print(f" ❌ Non-retry error: {error_str}")
+                    total_time = time.time() - overall_start
+                    return self._create_general_failure_result(query, category, error_str, total_time)
+
+        # Should not reach here
+        total_time = time.time() - overall_start
+        return self._create_timeout_failure_result(query, category, "Max retries exceeded", total_time)
+
+    def _create_timeout_failure_result(self, query: str, category: str, error: str, total_time: float) -> Dict[str, Any]:
+        """Create standardized result for timeout failures after all retries"""
+        error_result = {
+            "query": query,
+            "category": category,
+            "latency_metrics": {
+                "total_latency": total_time,
+                "llm_generation_time": 0.0,
+                "meets_target": False,
+                "failure_type": "timeout_after_retries"
+            },
+            "extraction_metrics": {
+                "not_applicable": True,
+                "reason": "No extraction pipeline in direct LLM"
+            },
+            "relevance_metrics": {
+                "not_applicable": True,
+                "reason": "No retrieval pipeline in direct LLM"
+            },
+            "coverage_metrics": {
+                "not_applicable": True,
+                "reason": "No retrieval content to cover"
+            },
+            "overall_success": False,
+            "error": f"API timeout after retries: {error}",
+            "model_type": "Med42-70B_direct",
+            "timestamp": datetime.now().isoformat()
+        }
+
+        self.direct_results.append(error_result)
+        print(f"❌ Direct LLM failed after {total_time:.2f}s with retries: {error}")
+        return error_result
+
+    def _create_general_failure_result(self, query: str, category: str, error: str, total_time: float) -> Dict[str, Any]:
+        """Create standardized result for general failures (non-timeout)"""
+        error_result = {
+            "query": query,
+            "category": category,
+            "latency_metrics": {
+                "total_latency": total_time,
+                "llm_generation_time": 0.0,
+                "meets_target": False,
+                "failure_type": "general_error"
+            },
+            "extraction_metrics": {
+                "not_applicable": True,
+                "reason": "No extraction pipeline in direct LLM"
+            },
+            "relevance_metrics": {
+                "not_applicable": True,
+                "reason": "No retrieval pipeline in direct LLM"
+            },
+            "coverage_metrics": {
+                "not_applicable": True,
+                "reason": "No retrieval content to cover"
+            },
+            "overall_success": False,
+            "error": str(error),
+            "model_type": "Med42-70B_direct",
+            "timestamp": datetime.now().isoformat()
+        }
+
+        self.direct_results.append(error_result)
+        print(f"❌ Direct LLM failed after {total_time:.2f}s: {error}")
+        return error_result
 
     def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
         """Parse queries from file with category labels"""