Spaces:
Sleeping
Sleeping
YanBoChen
committed on
Commit
·
3edd46d
1
Parent(s):
24f6a16
Enhance direct LLM evaluation with retry mechanism for 504 timeouts and improved guidance format
Browse files- evaluation/direct_llm_evaluator.py +212 -111
evaluation/direct_llm_evaluator.py
CHANGED
@@ -20,6 +20,7 @@ from typing import Dict, List, Any
|
|
20 |
from datetime import datetime
|
21 |
from pathlib import Path
|
22 |
import re
|
|
|
23 |
|
24 |
# Add project path
|
25 |
current_dir = Path(__file__).parent
|
@@ -54,7 +55,7 @@ class DirectLLMEvaluator:
|
|
54 |
|
55 |
def evaluate_direct_llm_query(self, query: str, category: str = "unknown") -> Dict[str, Any]:
|
56 |
"""
|
57 |
-
Direct LLM evaluation for single query
|
58 |
|
59 |
Only tests direct LLM response without RAG pipeline
|
60 |
Applicable metrics: 1 (Latency), 5-6 (via medical output)
|
@@ -68,123 +69,223 @@ class DirectLLMEvaluator:
|
|
68 |
|
69 |
overall_start = time.time()
|
70 |
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
-
|
|
|
80 |
|
81 |
-
|
82 |
-
|
83 |
-
2. Immediate assessment steps
|
84 |
-
3. Treatment recommendations
|
85 |
-
4. Clinical considerations
|
86 |
|
87 |
-
Provide
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
if isinstance(response, dict):
|
98 |
-
medical_advice = response.get('raw_response', '') or response.get('content', '')
|
99 |
-
else:
|
100 |
-
medical_advice = str(response)
|
101 |
-
|
102 |
-
llm_time = time.time() - llm_start
|
103 |
-
total_time = time.time() - overall_start
|
104 |
-
|
105 |
-
# Check if response is valid (not empty) - focus on content, not timeout
|
106 |
-
if not medical_advice or len(medical_advice.strip()) == 0:
|
107 |
-
print(f"β Direct LLM returned empty response after {total_time:.2f}s")
|
108 |
-
raise ValueError("Empty response from LLM - no content generated")
|
109 |
-
|
110 |
-
# Create result
|
111 |
-
result = {
|
112 |
-
"query": query,
|
113 |
-
"category": category,
|
114 |
|
115 |
-
#
|
116 |
-
|
117 |
-
|
118 |
-
"llm_generation_time": llm_time,
|
119 |
-
"meets_target": total_time <= 60.0
|
120 |
-
},
|
121 |
|
122 |
-
#
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
},
|
127 |
-
"relevance_metrics": {
|
128 |
-
"not_applicable": True,
|
129 |
-
"reason": "No retrieval pipeline in direct LLM"
|
130 |
-
},
|
131 |
-
"coverage_metrics": {
|
132 |
-
"not_applicable": True,
|
133 |
-
"reason": "No retrieval content to cover"
|
134 |
-
},
|
135 |
|
136 |
-
|
137 |
-
|
138 |
-
|
|
|
|
|
139 |
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
|
189 |
def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
|
190 |
"""Parse queries from file with category labels"""
|
|
|
20 |
from datetime import datetime
|
21 |
from pathlib import Path
|
22 |
import re
|
23 |
+
from huggingface_hub import InferenceClient
|
24 |
|
25 |
# Add project path
|
26 |
current_dir = Path(__file__).parent
|
|
|
55 |
|
56 |
def evaluate_direct_llm_query(self, query: str, category: str = "unknown") -> Dict[str, Any]:
|
57 |
"""
|
58 |
+
Direct LLM evaluation for single query with retry mechanism for 504 timeouts
|
59 |
|
60 |
Only tests direct LLM response without RAG pipeline
|
61 |
Applicable metrics: 1 (Latency), 5-6 (via medical output)
|
|
|
69 |
|
70 |
overall_start = time.time()
|
71 |
|
72 |
+
# Retry configuration
|
73 |
+
max_retries = 3
|
74 |
+
retry_delay = 30 # seconds
|
75 |
+
base_timeout = 120.0 # Increased base timeout for complex medical advice generation
|
76 |
+
|
77 |
+
for attempt in range(max_retries):
|
78 |
+
try:
|
79 |
+
print(f" π Attempt {attempt + 1}/{max_retries}")
|
80 |
+
|
81 |
+
# Direct LLM call without any RAG processing
|
82 |
+
llm_start = time.time()
|
83 |
+
|
84 |
+
# Create direct medical consultation prompt (matching generation.py format)
|
85 |
+
direct_prompt = f"""You are an experienced attending physician providing guidance to a junior clinician in an emergency setting. A colleague is asking for your expert medical opinion.
|
86 |
|
87 |
+
Clinical Question:
|
88 |
+
{query}
|
89 |
|
90 |
+
Instructions:
|
91 |
+
Provide comprehensive medical guidance covering both diagnostic and treatment aspects as appropriate.
|
|
|
|
|
|
|
92 |
|
93 |
+
Provide guidance with:
|
94 |
+
• Prioritize your medical knowledge and clinical experience
|
95 |
+
• Use numbered points (1. 2. 3.) for key steps
|
96 |
+
• Line breaks between major sections
|
97 |
+
• Highlight medications with dosages and routes
|
98 |
+
• Emphasize clinical judgment for individual patient factors
|
99 |
+
|
100 |
+
IMPORTANT: Keep response under 1000 words. Use concise numbered points. For complex cases with multiple conditions, address the most urgent condition first, then relevant comorbidities. Prioritize actionable clinical steps over theoretical explanations.
|
101 |
+
|
102 |
+
Your response should provide practical clinical guidance suitable for immediate bedside application with appropriate medical caution."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
+
# Direct LLM generation with extended timeout - bypass analyze_medical_query to avoid system prompt conflict
|
105 |
+
current_timeout = 120.0 + (attempt * 60) # 120s, 180s, 240s
|
106 |
+
print(f" ⏱️ Using read timeout = {current_timeout}s")
|
|
|
|
|
|
|
107 |
|
108 |
+
# Create a new client with appropriate timeout for this attempt
|
109 |
+
hf_token = os.getenv('HF_TOKEN')
|
110 |
+
if not hf_token:
|
111 |
+
raise ValueError("HF_TOKEN not found in environment variables")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
|
113 |
+
timeout_client = InferenceClient(
|
114 |
+
provider="featherless-ai",
|
115 |
+
api_key=hf_token,
|
116 |
+
timeout=current_timeout
|
117 |
+
)
|
118 |
|
119 |
+
# Call LLM directly to avoid system prompt conflicts
|
120 |
+
response = timeout_client.chat.completions.create(
|
121 |
+
model="m42-health/Llama3-Med42-70B",
|
122 |
+
messages=[
|
123 |
+
{
|
124 |
+
"role": "user",
|
125 |
+
"content": direct_prompt # Our complete prompt as user message
|
126 |
+
}
|
127 |
+
],
|
128 |
+
max_tokens=1600,
|
129 |
+
temperature=0.1 # Low temperature for consistent medical advice
|
130 |
+
)
|
131 |
+
# Extract medical advice from direct API response (not Med42 client wrapper)
|
132 |
+
medical_advice = response.choices[0].message.content or ""
|
133 |
+
|
134 |
+
llm_time = time.time() - llm_start
|
135 |
+
total_time = time.time() - overall_start
|
136 |
+
|
137 |
+
# Check if response is valid (not empty)
|
138 |
+
if not medical_advice or len(medical_advice.strip()) == 0:
|
139 |
+
raise ValueError("Empty response from LLM - no content generated")
|
140 |
+
|
141 |
+
# Success - create result and return
|
142 |
+
if attempt > 0:
|
143 |
+
print(f" ✅ Succeeded on attempt {attempt + 1}")
|
144 |
+
|
145 |
+
result = {
|
146 |
+
"query": query,
|
147 |
+
"category": category,
|
148 |
+
|
149 |
+
# Metric 1: Total Latency (direct LLM call time)
|
150 |
+
"latency_metrics": {
|
151 |
+
"total_latency": total_time,
|
152 |
+
"llm_generation_time": llm_time,
|
153 |
+
"meets_target": total_time <= 60.0,
|
154 |
+
"attempts_needed": attempt + 1
|
155 |
+
},
|
156 |
+
|
157 |
+
# Metrics 2-4: Not applicable for direct LLM
|
158 |
+
"extraction_metrics": {
|
159 |
+
"not_applicable": True,
|
160 |
+
"reason": "No extraction pipeline in direct LLM"
|
161 |
+
},
|
162 |
+
"relevance_metrics": {
|
163 |
+
"not_applicable": True,
|
164 |
+
"reason": "No retrieval pipeline in direct LLM"
|
165 |
+
},
|
166 |
+
"coverage_metrics": {
|
167 |
+
"not_applicable": True,
|
168 |
+
"reason": "No retrieval content to cover"
|
169 |
+
},
|
170 |
+
|
171 |
+
# Medical advice for metrics 5-6 evaluation
|
172 |
+
"medical_advice": medical_advice,
|
173 |
+
"advice_length": len(medical_advice),
|
174 |
+
|
175 |
+
"overall_success": True,
|
176 |
+
"model_type": "Med42-70B_direct",
|
177 |
+
"timestamp": datetime.now().isoformat()
|
178 |
+
}
|
179 |
+
|
180 |
+
# Store result
|
181 |
+
self.direct_results.append(result)
|
182 |
+
|
183 |
+
# Store medical output for LLM judge evaluation
|
184 |
+
medical_output = {
|
185 |
+
"query": query,
|
186 |
+
"category": category,
|
187 |
+
"medical_advice": medical_advice,
|
188 |
+
"query_id": f"{category}_query_direct",
|
189 |
+
"model_type": "Med42-70B_direct",
|
190 |
+
"processing_time": total_time,
|
191 |
+
"timestamp": datetime.now().isoformat()
|
192 |
+
}
|
193 |
+
self.medical_outputs.append(medical_output)
|
194 |
+
|
195 |
+
print(f"✅ Direct LLM completed in {total_time:.2f}s")
|
196 |
+
print(f"π Generated advice: {len(medical_advice)} characters")
|
197 |
+
|
198 |
+
return result
|
199 |
+
|
200 |
+
except Exception as e:
|
201 |
+
error_str = str(e)
|
202 |
+
|
203 |
+
# CRITICAL: Check for timeout/connectivity errors FIRST (before any response processing)
|
204 |
+
if any(keyword in error_str.lower() for keyword in ['504', 'timeout', 'gateway', 'connection', 'time-out', 'empty response']):
|
205 |
+
if attempt < max_retries - 1:
|
206 |
+
print(f" ⏳ Timeout/connectivity/empty response error detected, retrying in {retry_delay}s...")
|
207 |
+
print(f" Error: {error_str}")
|
208 |
+
time.sleep(retry_delay)
|
209 |
+
continue # Continue to next retry attempt
|
210 |
+
else:
|
211 |
+
print(f" ❌ All {max_retries} attempts failed with timeouts/empty responses")
|
212 |
+
total_time = time.time() - overall_start
|
213 |
+
return self._create_timeout_failure_result(query, category, error_str, total_time)
|
214 |
+
else:
|
215 |
+
# Non-retryable error (e.g., missing HF_TOKEN), don't retry. Empty-response errors match the retry keywords checked above, so they never reach this branch.
|
216 |
+
print(f" ❌ Non-retry error: {error_str}")
|
217 |
+
total_time = time.time() - overall_start
|
218 |
+
return self._create_general_failure_result(query, category, error_str, total_time)
|
219 |
+
|
220 |
+
# Should not reach here
|
221 |
+
total_time = time.time() - overall_start
|
222 |
+
return self._create_timeout_failure_result(query, category, "Max retries exceeded", total_time)
|
223 |
+
|
224 |
+
def _create_timeout_failure_result(self, query: str, category: str, error: str, total_time: float) -> Dict[str, Any]:
    """Build, record, and return the failure result used once retries are exhausted.

    ``evaluate_direct_llm_query`` calls this when its last attempt fails with
    an error it treats as retryable (its keyword check covers timeout,
    gateway and connection errors as well as empty responses).

    Args:
        query: The query text that was being evaluated.
        category: Category label attached to the query.
        error: Text of the last error; embedded in the ``error`` field.
        total_time: Elapsed seconds, reported as ``total_latency``.

    Returns:
        The failure record. The same dict is appended to
        ``self.direct_results`` before it is returned.
    """
    error_result = {
        "query": query,
        "category": category,
        "latency_metrics": {
            "total_latency": total_time,
            # No generation completed, so no LLM time is attributed.
            "llm_generation_time": 0.0,
            "meets_target": False,
            "failure_type": "timeout_after_retries",
        },
        # Metrics 2-4 are reported as not applicable for the direct LLM path.
        "extraction_metrics": {
            "not_applicable": True,
            "reason": "No extraction pipeline in direct LLM",
        },
        "relevance_metrics": {
            "not_applicable": True,
            "reason": "No retrieval pipeline in direct LLM",
        },
        "coverage_metrics": {
            "not_applicable": True,
            "reason": "No retrieval content to cover",
        },
        "overall_success": False,
        "error": f"API timeout after retries: {error}",
        "model_type": "Med42-70B_direct",
        "timestamp": datetime.now().isoformat(),
    }

    self.direct_results.append(error_result)
    print(f"❌ Direct LLM failed after {total_time:.2f}s with retries: {error}")
    return error_result
|
256 |
+
|
257 |
+
def _create_general_failure_result(self, query: str, category: str, error: str, total_time: float) -> Dict[str, Any]:
    """Build, record, and return the failure result for a non-retried error.

    ``evaluate_direct_llm_query`` calls this when an attempt raises an error
    that does not match its retry keywords, so no further attempts are made.

    Args:
        query: The query text that was being evaluated.
        category: Category label attached to the query.
        error: The error description; stored as ``str(error)``.
        total_time: Elapsed seconds, reported as ``total_latency``.

    Returns:
        The failure record. The same dict is appended to
        ``self.direct_results`` before it is returned.
    """
    error_result = {
        "query": query,
        "category": category,
        "latency_metrics": {
            "total_latency": total_time,
            # No generation completed, so no LLM time is attributed.
            "llm_generation_time": 0.0,
            "meets_target": False,
            "failure_type": "general_error",
        },
        # Metrics 2-4 are reported as not applicable for the direct LLM path.
        "extraction_metrics": {
            "not_applicable": True,
            "reason": "No extraction pipeline in direct LLM",
        },
        "relevance_metrics": {
            "not_applicable": True,
            "reason": "No retrieval pipeline in direct LLM",
        },
        "coverage_metrics": {
            "not_applicable": True,
            "reason": "No retrieval content to cover",
        },
        "overall_success": False,
        "error": str(error),
        "model_type": "Med42-70B_direct",
        "timestamp": datetime.now().isoformat(),
    }

    self.direct_results.append(error_result)
    print(f"❌ Direct LLM failed after {total_time:.2f}s: {error}")
    return error_result
|
289 |
|
290 |
def parse_queries_from_file(self, filepath: str) -> Dict[str, List[Dict]]:
|
291 |
"""Parse queries from file with category labels"""
|