Update core/evaluation_engine.py

core/evaluation_engine.py  (+39, -39)  CHANGED
Before (lines removed by this commit are prefixed with -):

@@ -1,16 +1,21 @@
 # algoforge_prime/core/evaluation_engine.py
 import random
-
-from ..prompts.system_prompts import get_system_prompt
-from ..prompts.prompt_templates import format_critique_user_prompt
-# Import our (simulated) safe executor
-from .safe_executor import execute_python_code_with_tests, ExecutionResult  # Assuming it's in the same 'core' package
 
-
     def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
         self.combined_score = combined_score
-        self.llm_critique_text = llm_critique_text
-        self.execution_details = execution_details
         self.raw_llm_response = raw_llm_response
 
     def get_display_critique(self):

@@ -23,13 +28,12 @@ class EvaluationResultOutput: # Renamed to avoid conflict with safe_executor.Exe
         if self.execution_details.error:
             full_critique += f" Execution Error: {self.execution_details.error}\n"
         elif self.execution_details.output:
-            full_critique += f" Execution Output (stdout):\n```\n{self.execution_details.output[:500]}\n```\n"
         full_critique += f" Execution Time: {self.execution_details.execution_time:.4f}s\n"
         return full_critique
 
 
 def _parse_llm_score(llm_text_output: str) -> int:
-    # ... (keep your existing _parse_score_from_llm_text, renamed for clarity) ...
     score = 0
     if not llm_text_output or not isinstance(llm_text_output, str): return score
     try:

@@ -38,8 +42,12 @@ def _parse_llm_score(llm_text_output: str) -> int:
         if match:
             parsed_score_val = int(match.group(1))
             score = max(1, min(parsed_score_val, 10))
-        else:
-
     return score
 
 

@@ -50,13 +58,12 @@ def evaluate_solution_candidate(
     user_provided_tests_code: str,
     llm_client_config: dict
 ) -> EvaluationResultOutput:
-
     llm_critique_text = "LLM critique generation failed or was skipped."
     llm_score = 0
     raw_llm_critique_resp = None
     execution_result_obj = None  # type: ExecutionResult
 
-    # 1. LLM-based Critique (only if solution_text is not an error itself)
     if solution_text and not solution_text.startswith("ERROR"):
         system_p_critique = get_system_prompt("critique_general")
         user_p_critique = format_critique_user_prompt(problem_description, solution_text)

@@ -74,45 +81,38 @@
             llm_score = _parse_llm_score(llm_critique_text)
         else:
             llm_critique_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
-            llm_score = 0
     elif solution_text and solution_text.startswith("ERROR"):
         llm_critique_text = f"Solution was an error from Genesis: {solution_text}"
         llm_score = 0
 
-
-    # 2. Code Execution (if Python problem, code exists, and tests are provided)
     if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
-        print(f"INFO: evaluation_engine.py - Preparing to execute Python code candidate
-        # Use the (simulated) safe executor
         execution_result_obj = execute_python_code_with_tests(
-            solution_text, user_provided_tests_code, timeout_seconds=10
         )
-        print(f"INFO: evaluation_engine.py - Execution result: {execution_result_obj}")
     elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
-        execution_result_obj = ExecutionResult(success=True, output="No user tests provided
-
 
-    # 3. Combine Scores into a Final Score (More sophisticated heuristic)
     combined_score = llm_score
     if execution_result_obj and execution_result_obj.total_tests > 0:
-        if not execution_result_obj.success or execution_result_obj.error:
-            combined_score = max(1, llm_score - 5)
         else:
             pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
-            if pass_ratio == 1.0:
-
-            elif pass_ratio
-
-
-                combined_score = max(1, llm_score - 4)
-            else:  # Some tests passed
-                combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio))  # Weighted average
-
-    combined_score = max(1, min(10, combined_score))  # Clamp 1-10
 
     return EvaluationResultOutput(
         combined_score=combined_score,
-        llm_critique_text=llm_critique_text,
-        execution_details=execution_result_obj,
         raw_llm_response=raw_llm_critique_resp
-    )
After (lines added by this commit are prefixed with +):

@@ -1,16 +1,21 @@
 # algoforge_prime/core/evaluation_engine.py
 import random
+import traceback  # Keep this if used in your placeholder
 
+# --- Corrected Absolute Imports ---
+from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse  # Absolute from project root
+from prompts.system_prompts import get_system_prompt  # Absolute from project root
+from prompts.prompt_templates import format_critique_user_prompt  # Absolute from project root
+from core.safe_executor import execute_python_code_with_tests, ExecutionResult  # Absolute from project root
+
+print("DEBUG: core.evaluation_engine - Imports successful")
+
+
+class EvaluationResultOutput:
     def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
         self.combined_score = combined_score
+        self.llm_critique_text = llm_critique_text
+        self.execution_details = execution_details
         self.raw_llm_response = raw_llm_response
 
     def get_display_critique(self):
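The ExecutionResult type imported above is defined in core/safe_executor.py, which is not part of this diff. Purely as a point of reference, a minimal sketch of its likely shape, inferred from the attributes this module reads (success, output, error, execution_time, total_tests, passed_tests), could look like the following; the actual definition may differ:

from dataclasses import dataclass
from typing import Optional

@dataclass
class ExecutionResult:
    success: bool = False          # whether the (simulated) run completed without a fatal failure
    output: str = ""               # captured stdout from running the candidate plus user tests
    error: Optional[str] = None    # error message, if the run failed
    execution_time: float = 0.0    # wall-clock seconds (formatted with :.4f in get_display_critique)
    total_tests: int = 0           # number of user-provided tests executed
    passed_tests: int = 0          # number of those tests that passed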
@@ -23,13 +28,12 @@ class EvaluationResultOutput: # Renamed to avoid conflict with safe_executor.Exe
         if self.execution_details.error:
             full_critique += f" Execution Error: {self.execution_details.error}\n"
         elif self.execution_details.output:
+            full_critique += f" Execution Output (stdout):\n```\n{self.execution_details.output[:500]}\n```\n"
         full_critique += f" Execution Time: {self.execution_details.execution_time:.4f}s\n"
         return full_critique
 
 
 def _parse_llm_score(llm_text_output: str) -> int:
     score = 0
     if not llm_text_output or not isinstance(llm_text_output, str): return score
     try:

@@ -38,8 +42,12 @@ def _parse_llm_score(llm_text_output: str) -> int:
         if match:
             parsed_score_val = int(match.group(1))
             score = max(1, min(parsed_score_val, 10))
+        else:
+            print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found in LLM output. Output: {llm_text_output[:100]}...")
+            score = random.randint(3, 6)
+    except Exception as e:
+        print(f"WARNING: evaluation_engine.py - Error parsing score from LLM output '{llm_text_output[:100]}...': {e}")
+        score = random.randint(3, 5)
     return score
 
 
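The regular expression that produces match in _parse_llm_score sits in lines 40-41, which this hunk does not show. Purely as an illustration of the surrounding logic, a self-contained version of the parser with an assumed pattern for the 'Score: X/10' marker (the marker named in the INFO message above) might read as follows; the pattern actually committed may differ:

import random
import re

def parse_llm_score_sketch(llm_text_output: str) -> int:
    # Illustrative sketch: mirrors the structure of _parse_llm_score above,
    # with an assumed regex, since the real one is outside this hunk.
    score = 0
    if not llm_text_output or not isinstance(llm_text_output, str):
        return score
    try:
        match = re.search(r"Score:\s*(\d{1,2})\s*/\s*10", llm_text_output, re.IGNORECASE)
        if match:
            parsed_score_val = int(match.group(1))
            score = max(1, min(parsed_score_val, 10))
        else:
            score = random.randint(3, 6)   # same fallback range as the diff above
    except Exception:
        score = random.randint(3, 5)       # same fallback range as the diff above
    return score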
@@ -50,13 +58,12 @@ def evaluate_solution_candidate(
     user_provided_tests_code: str,
     llm_client_config: dict
 ) -> EvaluationResultOutput:
+    print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
     llm_critique_text = "LLM critique generation failed or was skipped."
     llm_score = 0
     raw_llm_critique_resp = None
     execution_result_obj = None  # type: ExecutionResult
 
     if solution_text and not solution_text.startswith("ERROR"):
         system_p_critique = get_system_prompt("critique_general")
         user_p_critique = format_critique_user_prompt(problem_description, solution_text)

@@ -74,45 +81,38 @@
             llm_score = _parse_llm_score(llm_critique_text)
         else:
             llm_critique_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
+            llm_score = 0
     elif solution_text and solution_text.startswith("ERROR"):
         llm_critique_text = f"Solution was an error from Genesis: {solution_text}"
         llm_score = 0
 
     if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
+        print(f"INFO: evaluation_engine.py - Preparing to (simulated) execute Python code candidate.")
         execution_result_obj = execute_python_code_with_tests(
+            solution_text, user_provided_tests_code, timeout_seconds=10
         )
+        print(f"INFO: evaluation_engine.py - (Simulated) Execution result: {execution_result_obj}")
     elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
+        execution_result_obj = ExecutionResult(success=True, output="No user tests provided for this Python problem.", total_tests=0)
 
     combined_score = llm_score
     if execution_result_obj and execution_result_obj.total_tests > 0:
+        if not execution_result_obj.success or execution_result_obj.error:
+            combined_score = max(1, llm_score - 5)
         else:
             pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
+            if pass_ratio == 1.0: combined_score = min(10, llm_score + 2)
+            elif pass_ratio >= 0.75: combined_score = min(10, llm_score + 1)
+            elif pass_ratio < 0.25: combined_score = max(1, llm_score - 4)
+            else: combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio))
+        combined_score = max(1, min(10, combined_score))
 
+    print(f"DEBUG: evaluation_engine.py - Evaluation complete. Combined Score: {combined_score}")
     return EvaluationResultOutput(
         combined_score=combined_score,
+        llm_critique_text=llm_critique_text,  # This is just the LLM's part
+        execution_details=execution_result_obj,  # This contains test pass/fail and errors
         raw_llm_response=raw_llm_critique_resp
+    )
+
+print("DEBUG: core.evaluation_engine - Module fully defined.")