Update core/evaluation_engine.py

core/evaluation_engine.py  CHANGED  (+86 -156)

@@ -1,188 +1,118 @@
Old version (removed lines marked with "-"; several removed lines did not survive extraction and are omitted):

  # algoforge_prime/core/evaluation_engine.py
  import random
      try:
-         # Look for "Score: X/10" or "Score: X"
-         # More robust parsing might be needed depending on LLM variability
          import re
          match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
          if match:
              parsed_score_val = int(match.group(1))
-             score = max(1, min(parsed_score_val, 10))
-         else: # Fallback if
-             score = random.randint(3, 6) # Assign a mediocre random score
-     except Exception as e:
-         print(f"WARNING: evaluation_engine.py - Error parsing score from LLM output '{llm_text_output[:100]}...': {e}")
-         score = random.randint(3, 5) # Fallback on parsing error
      return score

- def _placeholder_safe_python_execution(code_string: str, user_tests_string: str) -> tuple[int, int, str]:
-     """
-     PLACEHOLDER for safe Python code execution.
-     **WARNING: THIS IS NOT SAFE FOR PRODUCTION. IT ONLY SIMULATES.**
-     Replace with a robust sandboxing mechanism (Docker, nsjail, WASM, etc.).
-     """
-     print(f"DEV_INFO: evaluation_engine.py - Entering PLACEHOLDER for code execution.")
-     print(f"  Code (first 100 chars): {code_string[:100]}...")
-     print(f"  Tests (first 100 chars): {user_tests_string[:100]}...")
-
-     if not user_tests_string.strip() or not code_string.strip():
-         return 0, 0, "SIMULATED: No tests provided or no code to test."
-
-     # Naive parsing of assert statements
-     test_lines = [line.strip() for line in user_tests_string.splitlines() if line.strip().startswith("assert")]
-     total_tests_found = len(test_lines)
-
-     if total_tests_found == 0:
-         return 0, 0, "SIMULATED: No 'assert' statements found in user tests."
-
-     # Extremely simplistic simulation logic (NOT REAL EXECUTION)
-     passed_count = 0
-     execution_log = ["SIMULATED EXECUTION LOG:"]
-     try:
-         # This is where real sandboxed execution would happen.
-         # We'll simulate based on keywords for demonstration.
-         if "syntax error" in code_string.lower() or "indentationerror" in code_string.lower():
-             execution_log.append("  - Simulated: Potential syntax error in generated code.")
-             # passed_count remains 0
-         elif "runtime error" in code_string.lower() or "exception" in code_string.lower():
-             execution_log.append("  - Simulated: Code might raise a runtime error.")
-             passed_count = random.randint(0, total_tests_found // 3) # Few pass
-         elif "return" not in code_string and any("==" in t for t in test_lines): # If expecting a return value
-             execution_log.append("  - Simulated: Code might be missing a crucial 'return' statement.")
-             passed_count = random.randint(0, total_tests_found // 2)
-         else: # Simulate some passing, some failing
-             passed_count = random.randint(total_tests_found // 2, total_tests_found)
-             execution_log.append(f"  - Simulated: {passed_count} of {total_tests_found} tests likely passed.")
-
-         if passed_count < total_tests_found:
-             execution_log.append(f"  - Simulated: {total_tests_found - passed_count} test(s) likely failed.")
-
-         summary = f"Simulated: {passed_count}/{total_tests_found} tests passed."
-         if passed_count < total_tests_found: summary += " Some tests likely failed."
-
-     except Exception as e_sim: # Error in our simulation logic
-         summary = f"Error during test SIMULATION logic: {str(e_sim)}"
-         passed_count = 0
-         execution_log.append(f"  - ERROR in simulation: {e_sim}")
-
-     print(f"DEV_INFO: evaluation_engine.py - Placeholder execution result: {summary}")
-     return passed_count, total_tests_found, "\n".join(execution_log)

  def evaluate_solution_candidate(
      solution_text: str,
      problem_description: str,
      problem_type: str,
-     llm_client_config: dict
- ) ->
-     llm_critique_output_text = "LLM critique could not be performed due to an earlier error or API issue."
-     llm_based_score = 0
      raw_llm_critique_resp = None

-     # 1. LLM-based Critique (if solution_text is not an error
      if solution_text and not solution_text.startswith("ERROR"):
-         system_p_critique = get_system_prompt("critique_general")
          user_p_critique = format_critique_user_prompt(problem_description, solution_text)

-         llm_response_obj = None
          if llm_client_config["type"] == "hf":
-             llm_response_obj = call_huggingface_api(
-                 user_p_critique, llm_client_config["model_id"],
-                 temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
-                 system_prompt_text=system_p_critique
-             )
          elif llm_client_config["type"] == "google_gemini":
-             llm_response_obj = call_gemini_api(
-                 user_p_critique, llm_client_config["model_id"],
-                 temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
-                 system_prompt_text=system_p_critique
-             )

          if llm_response_obj:
              raw_llm_critique_resp = llm_response_obj.raw_response
              if llm_response_obj.success:
              else:
      elif solution_text and solution_text.startswith("ERROR"):

-     # 2. (Simulated) Code Execution if applicable
-     passed_tests_count = 0
-     total_tests_count = 0
-     exec_summary_msg = "Automated tests not applicable or not run for this problem type/solution."

-     if "python" in problem_type.lower() and
          )

-     return
-         execution_summary=exec_summary_msg,
-         raw_llm_critique_response=raw_llm_critique_resp
      )

New version (added lines marked with "+"):

  # algoforge_prime/core/evaluation_engine.py
  import random
+ from .llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
+ from ..prompts.system_prompts import get_system_prompt
+ from ..prompts.prompt_templates import format_critique_user_prompt
+ # Import our (simulated) safe executor
+ from .safe_executor import execute_python_code_with_tests, ExecutionResult # Assuming it's in the same 'core' package
+
+ class EvaluationResultOutput: # Renamed to avoid conflict with safe_executor.ExecutionResult
+     def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
+         self.combined_score = combined_score
+         self.llm_critique_text = llm_critique_text # LLM's qualitative assessment
+         self.execution_details = execution_details # Object from safe_executor
+         self.raw_llm_response = raw_llm_response
+
+     def get_display_critique(self):
+         full_critique = self.llm_critique_text if self.llm_critique_text else "LLM critique was not performed or failed."
+         if self.execution_details:
+             full_critique += f"\n\n**Automated Execution & Test Results (Simulated):**\n"
+             if self.execution_details.total_tests > 0:
+                 full_critique += f"  Tests Attempted: {self.execution_details.total_tests}\n"
+                 full_critique += f"  Tests Passed: {self.execution_details.passed_tests}\n"
+             if self.execution_details.error:
+                 full_critique += f"  Execution Error: {self.execution_details.error}\n"
+             elif self.execution_details.output:
+                 full_critique += f"  Execution Output (stdout):\n```\n{self.execution_details.output[:500]}\n```\n" # Limit output display
+             full_critique += f"  Execution Time: {self.execution_details.execution_time:.4f}s\n"
+         return full_critique
+
+
+ def _parse_llm_score(llm_text_output: str) -> int:
+     # ... (keep your existing _parse_score_from_llm_text, renamed for clarity) ...
+     score = 0
+     if not llm_text_output or not isinstance(llm_text_output, str): return score
      try:
          import re
          match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
          if match:
              parsed_score_val = int(match.group(1))
+             score = max(1, min(parsed_score_val, 10))
+         else: score = random.randint(3, 6) # Fallback if no score marker
+     except Exception: score = random.randint(3, 5) # Fallback on any parsing error
      return score


  def evaluate_solution_candidate(
      solution_text: str,
      problem_description: str,
      problem_type: str,
+     user_provided_tests_code: str,
+     llm_client_config: dict
+ ) -> EvaluationResultOutput:
+
+     llm_critique_text = "LLM critique generation failed or was skipped."
+     llm_score = 0
      raw_llm_critique_resp = None
+     execution_result_obj = None # type: ExecutionResult

+     # 1. LLM-based Critique (only if solution_text is not an error itself)
      if solution_text and not solution_text.startswith("ERROR"):
+         system_p_critique = get_system_prompt("critique_general")
          user_p_critique = format_critique_user_prompt(problem_description, solution_text)

+         llm_response_obj = None
          if llm_client_config["type"] == "hf":
+             llm_response_obj = call_huggingface_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
          elif llm_client_config["type"] == "google_gemini":
+             llm_response_obj = call_gemini_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)

          if llm_response_obj:
              raw_llm_critique_resp = llm_response_obj.raw_response
              if llm_response_obj.success:
+                 llm_critique_text = llm_response_obj.text
+                 llm_score = _parse_llm_score(llm_critique_text)
              else:
+                 llm_critique_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
+                 llm_score = 0 # Penalize
      elif solution_text and solution_text.startswith("ERROR"):
+         llm_critique_text = f"Solution was an error from Genesis: {solution_text}"
+         llm_score = 0


+     # 2. Code Execution (if Python problem, code exists, and tests are provided)
+     if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
+         print(f"INFO: evaluation_engine.py - Preparing to execute Python code candidate against user tests.")
+         # Use the (simulated) safe executor
+         execution_result_obj = execute_python_code_with_tests(
+             solution_text, user_provided_tests_code, timeout_seconds=10 # Example timeout
          )
+         print(f"INFO: evaluation_engine.py - Execution result: {execution_result_obj}")
+     elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
+         execution_result_obj = ExecutionResult(success=True, output="No user tests provided to run against the Python code.", total_tests=0)
+
+
+     # 3. Combine Scores into a Final Score (more sophisticated heuristic)
+     combined_score = llm_score
+     if execution_result_obj and execution_result_obj.total_tests > 0:
+         if not execution_result_obj.success or execution_result_obj.error: # Major execution failure
+             combined_score = max(1, llm_score - 5) # Penalize heavily
+         else:
+             pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
+             if pass_ratio == 1.0: # All tests passed
+                 combined_score = min(10, llm_score + 2) # Significant bonus
+             elif pass_ratio >= 0.75: # Most tests passed
+                 combined_score = min(10, llm_score + 1) # Small bonus
+             elif pass_ratio < 0.25: # Very few tests passed
+                 combined_score = max(1, llm_score - 4)
+             else: # Some tests passed
+                 combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio)) # Weighted average
+
+     combined_score = max(1, min(10, combined_score)) # Clamp 1-10

+     return EvaluationResultOutput(
+         combined_score=combined_score,
+         llm_critique_text=llm_critique_text,
+         execution_details=execution_result_obj,
+         raw_llm_response=raw_llm_critique_resp
      )
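
For reference, a minimal, hypothetical caller of the new evaluate_solution_candidate API is sketched below. The llm_client_config keys ("type", "model_id", "temp", "max_tokens") and the EvaluationResultOutput fields are taken from the diff above; the import path, the model id, and the example problem and tests are illustrative assumptions and are not part of this commit.

# Hypothetical usage sketch -- import path, model id, and example inputs are assumptions.
from algoforge_prime.core.evaluation_engine import evaluate_solution_candidate

llm_client_config = {
    "type": "google_gemini",         # or "hf" to take the Hugging Face client path
    "model_id": "gemini-1.5-flash",  # assumed model id, for illustration only
    "temp": 0.3,
    "max_tokens": 1024,
}

candidate_code = "def add(a, b):\n    return a + b"
user_tests = "assert add(1, 2) == 3\nassert add(-1, 1) == 0"

result = evaluate_solution_candidate(
    solution_text=candidate_code,
    problem_description="Write add(a, b) returning the sum of two numbers.",
    problem_type="python",
    user_provided_tests_code=user_tests,
    llm_client_config=llm_client_config,
)

print(f"Combined score (1-10): {result.combined_score}")
print(result.get_display_critique())  # LLM critique plus (simulated) execution/test summary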