Update core/evaluation_engine.py
core/evaluation_engine.py (+92 -43)
CHANGED
@@ -2,53 +2,80 @@
 import random
 import traceback
 
-from prompts.
-from .safe_executor import execute_python_code_with_tests, ExecutionResult # CORRECTED: Relative import
+from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
+from prompts.system_prompts import get_system_prompt
+from prompts.prompt_templates import format_critique_user_prompt
+from core.safe_executor import execute_python_code_with_tests, ExecutionResult, TestResult # Import new classes
 
 print("DEBUG: core.evaluation_engine - Imports successful")
 
-# ... (rest of the EvaluationResultOutput class, _parse_llm_score, _placeholder_safe_python_execution,
-# and evaluate_solution_candidate function as previously provided and corrected) ...
-# Ensure all that logic is present here. For brevity, I am not pasting it all again.
-# The key change is the import line for safe_executor above.
 class EvaluationResultOutput:
     def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
-        self.combined_score
+        self.combined_score = combined_score
+        self.llm_critique_text = llm_critique_text
+        self.execution_details = execution_details
+        self.raw_llm_response = raw_llm_response
+
+    def get_display_critique(self) -> str:
+        """Formats a comprehensive critique including LLM feedback and execution results."""
+        critique_parts = []
+        critique_parts.append(self.llm_critique_text if self.llm_critique_text else "LLM critique was not performed or failed.")
+
         if self.execution_details:
+            exec_details = self.execution_details
+            critique_parts.append("\n\n**Automated Execution & Test Results (Simulated):**")
+            if exec_details.compilation_error:
+                critique_parts.append(f" Compilation Error: {exec_details.compilation_error}")
+            elif exec_details.timeout_error:
+                critique_parts.append(f" Execution Timed Out after {exec_details.execution_time:.2f}s.")
+            else:
+                if exec_details.total_tests > 0:
+                    critique_parts.append(f" Tests Attempted: {exec_details.total_tests}")
+                    critique_parts.append(f" Tests Passed: {exec_details.passed_tests}")
+                    if exec_details.passed_tests < exec_details.total_tests:
+                        critique_parts.append(" Failed Tests Details:")
+                        for test_res in exec_details.individual_test_results:
+                            if not test_res.passed:
+                                critique_parts.append(f" - Test: `{test_res.test_string[:70]}...`")
+                                if test_res.error_message:
+                                    critique_parts.append(f" Error: {test_res.error_message[:100]}...")
+                else: # Code ran, but no assert-based tests provided/found
+                    critique_parts.append(" Code executed (no assert-based tests found/run).")
+
+            if exec_details.stdout:
+                critique_parts.append(f" Execution Stdout (truncated):\n```\n{exec_details.stdout[:300].strip()}\n```")
+            if exec_details.stderr and not any(not tr.passed for tr in exec_details.individual_test_results if tr.error_message): # Show general stderr if not already part of a test fail
+                critique_parts.append(f" Execution Stderr (general):\n```\n{exec_details.stderr[:300].strip()}\n```")
+            critique_parts.append(f" Simulated Execution Time: {exec_details.execution_time:.4f}s")
+        return "\n".join(critique_parts)
 
 def _parse_llm_score(llm_text_output: str) -> int:
+    # ... (same as your last working version)
     score = 0; import re
     if not llm_text_output or not isinstance(llm_text_output, str): return score
     match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
     if match: score = max(1, min(int(match.group(1)), 10))
-    else:
+    else:
+        print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found. Output: {llm_text_output[:100]}...")
+        score = random.randint(3, 6)
     return score
 
-# _placeholder_safe_python_execution remains in safe_executor.py, it's imported.
 def evaluate_solution_candidate(
-    solution_text: str,
+    solution_text: str,
+    problem_description: str,
+    problem_type: str,
+    user_provided_tests_code: str,
+    llm_client_config: dict
 ) -> EvaluationResultOutput:
-    # ... (implementation as before, ensuring it calls the imported execute_python_code_with_tests) ...
     print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
-    llm_critique_text
+    llm_critique_text = "LLM critique generation failed or was skipped."
+    llm_score = 0
+    raw_llm_critique_resp = None
+    execution_result_obj = None # type: ExecutionResult
+
+    # 1. LLM-based Critique
     if solution_text and not solution_text.startswith("ERROR"):
-        # ... (LLM critique call logic) ...
+        # ... (LLM critique call logic - same as before) ...
         system_p_critique = get_system_prompt("critique_general")
         user_p_critique = format_critique_user_prompt(problem_description, solution_text)
         llm_response_obj = None
@@ -58,24 +85,46 @@ def evaluate_solution_candidate(
         raw_llm_critique_resp = llm_response_obj.raw_response
         if llm_response_obj.success: llm_critique_text, llm_score = llm_response_obj.text, _parse_llm_score(llm_response_obj.text)
         else: llm_critique_text, llm_score = f"Error during LLM critique: {llm_response_obj.error}", 0
-    elif solution_text and solution_text.startswith("ERROR"):
+    elif solution_text and solution_text.startswith("ERROR"):
+        llm_critique_text, llm_score = f"Solution was error from Genesis: {solution_text}", 0
 
-    combined_score = llm_score # Start with LLM score
-    if execution_result_obj and execution_result_obj.total_tests > 0: # Adjust based on tests
-        if not execution_result_obj.success or execution_result_obj.error: combined_score = max(1, llm_score - 5)
+    # 2. Code Execution
+    if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR"):
+        if user_provided_tests_code.strip():
+            print(f"INFO: evaluation_engine.py - Executing Python code candidate against user tests.")
+            execution_result_obj = execute_python_code_with_tests(solution_text, user_provided_tests_code, timeout_seconds=10)
         else:
+            print(f"INFO: evaluation_engine.py - Executing Python code candidate (no tests provided).")
+            execution_result_obj = execute_python_code_with_tests(solution_text, "", timeout_seconds=5) # Execute code even if no tests
+        print(f"INFO: evaluation_engine.py - Execution result: {execution_result_obj}")
+    elif "python" in problem_type.lower() and not user_provided_tests_code.strip() and solution_text and not solution_text.startswith("ERROR"):
+        # Case where it's python but no tests - still might want to run to catch basic runtime/compile errors
+        execution_result_obj = execute_python_code_with_tests(solution_text, "", timeout_seconds=5)
+
+
+    # 3. Combine Scores into a Final Score
+    combined_score = llm_score
+    if execution_result_obj:
+        if execution_result_obj.compilation_error or execution_result_obj.timeout_error or (not execution_result_obj.success and execution_result_obj.stderr and not execution_result_obj.individual_test_results) :
+            combined_score = 1 # Catastrophic failure
+        elif execution_result_obj.total_tests > 0:
             pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
-            if pass_ratio == 1.0: combined_score = min(10, llm_score +
-            elif pass_ratio >= 0.
-            elif pass_ratio < 0.
+            if pass_ratio == 1.0: combined_score = min(10, llm_score + 3) # Strong bonus for all tests passing
+            elif pass_ratio >= 0.8: combined_score = min(10, llm_score + 1)
+            elif pass_ratio < 0.2: combined_score = max(1, llm_score - 6) # Heavy penalty
+            elif pass_ratio < 0.5: combined_score = max(1, llm_score - 4)
+            else: combined_score = int(llm_score * (0.4 + 0.6 * pass_ratio)) # Weighted more by tests
+        elif not execution_result_obj.success and execution_result_obj.error : # General runtime error without tests
+            combined_score = max(1, llm_score - 4)
+
     combined_score = max(1, min(10, combined_score))
-    return EvaluationResultOutput(combined_score, llm_critique_text, execution_result_obj, raw_llm_critique_resp)
 
+    print(f"DEBUG: evaluation_engine.py - Evaluation complete. Combined Score: {combined_score}")
+    return EvaluationResultOutput(
+        combined_score=combined_score,
+        llm_critique_text=llm_critique_text,
+        execution_details=execution_result_obj,
+        raw_llm_response=raw_llm_critique_resp
+    )
 
 print("DEBUG: core.evaluation_engine - Module fully defined.")
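The new "# 3. Combine Scores into a Final Score" block blends the 1-10 LLM critique score with the automated test outcome. The standalone sketch below mirrors that heuristic with a few worked values, assuming a simplified stand-in for core.safe_executor.ExecutionResult (the real class is not shown in this diff); combine_scores and the stand-in fields are illustrative names, not code from the repository.

# Illustrative sketch only: mirrors the score-combination heuristic from the diff above.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class ExecutionResult:  # simplified stand-in; the real class lives in core.safe_executor
    success: bool = True
    passed_tests: int = 0
    total_tests: int = 0
    compilation_error: str = ""
    timeout_error: bool = False
    stderr: str = ""
    error: str = ""
    individual_test_results: List[object] = field(default_factory=list)

def combine_scores(llm_score: int, exec_result: Optional[ExecutionResult]) -> int:
    """Blend the 1-10 LLM critique score with test results, as in the diff."""
    combined = llm_score
    if exec_result:
        if (exec_result.compilation_error or exec_result.timeout_error
                or (not exec_result.success and exec_result.stderr
                    and not exec_result.individual_test_results)):
            combined = 1  # catastrophic failure: did not compile, timed out, or crashed outright
        elif exec_result.total_tests > 0:
            ratio = exec_result.passed_tests / exec_result.total_tests
            if ratio == 1.0:
                combined = min(10, llm_score + 3)   # strong bonus for a clean test run
            elif ratio >= 0.8:
                combined = min(10, llm_score + 1)
            elif ratio < 0.2:
                combined = max(1, llm_score - 6)    # heavy penalty
            elif ratio < 0.5:
                combined = max(1, llm_score - 4)
            else:
                combined = int(llm_score * (0.4 + 0.6 * ratio))  # weighted more by tests
        elif not exec_result.success and exec_result.error:
            combined = max(1, llm_score - 4)        # general runtime error, no tests run
    return max(1, min(10, combined))                # clamp to the 1-10 scale

# Worked examples, starting from an LLM critique score of 7/10:
print(combine_scores(7, ExecutionResult(passed_tests=5, total_tests=5)))  # 10: all tests pass
print(combine_scores(7, ExecutionResult(passed_tests=4, total_tests=5)))  # 8: pass ratio 0.8
print(combine_scores(7, ExecutionResult(passed_tests=1, total_tests=5)))  # 3: ratio 0.2 falls into the "< 0.5" branch
print(combine_scores(7, ExecutionResult(success=False, compilation_error="SyntaxError")))  # 1

The same clamp, max(1, min(10, combined_score)), also appears at the end of evaluate_solution_candidate, so the execution-based adjustments can never push a candidate outside the 1-10 range used by the LLM critique.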