mgbam committed on
Commit a00b966 · verified · 1 Parent(s): 3d7ae13

Update core/evaluation_engine.py

Files changed (1)
  1. core/evaluation_engine.py +86 -156
core/evaluation_engine.py CHANGED
@@ -1,188 +1,118 @@
  # algoforge_prime/core/evaluation_engine.py
  import random
- import time
- import traceback
- # IMPORTANT: The following import is for a HYPOTHETICAL safe executor.
- # You would need to implement or find a robust sandboxing solution.
- # from .restricted_env_executor import execute_python_code_safely # Example
-
- from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse # Absolute
- from prompts.system_prompts import get_system_prompt # Absolute
- from prompts.prompt_templates import format_critique_user_prompt # Absolute
-
- class EvaluationResult:
-     def __init__(self, score=0, critique_text="", passed_tests=0, total_tests=0, execution_summary=None, raw_llm_critique_response=None):
-         self.score = score # Final combined score
-         self.critique_text = critique_text # LLM based critique + execution summary
-         self.passed_tests = passed_tests
-         self.total_tests = total_tests
-         self.execution_summary = execution_summary # Error or success message from code execution
-         self.raw_llm_critique_response = raw_llm_critique_response
-
-     def __str__(self): # For simple string representation if needed
-         return f"Score: {self.score}/10. Tests: {self.passed_tests}/{self.total_tests}. Summary: {self.execution_summary}. Critique: {self.critique_text[:100]}..."
-
- def _parse_score_from_llm_text(llm_text_output: str) -> int:
-     """Helper to parse 'Score: X/10' from LLM's textual output."""
-     score = 0 # Default if not found or unparsable
-     if not llm_text_output or not isinstance(llm_text_output, str):
-         return score
-
      try:
-         # Look for "Score: X/10" or "Score: X"
-         # More robust parsing might be needed depending on LLM variability
          import re
          match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
          if match:
              parsed_score_val = int(match.group(1))
-             score = max(1, min(parsed_score_val, 10)) # Clamp score to 1-10
-         else: # Fallback if specific format not found
-             print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found in LLM output. Assigning fallback score. Output: {llm_text_output[:100]}...")
-             score = random.randint(3, 6) # Assign a mediocre random score
-     except Exception as e:
-         print(f"WARNING: evaluation_engine.py - Error parsing score from LLM output '{llm_text_output[:100]}...': {e}")
-         score = random.randint(3, 5) # Fallback on parsing error
      return score

- def _placeholder_safe_python_execution(code_string: str, user_tests_string: str) -> tuple[int, int, str]:
-     """
-     PLACEHOLDER for safe Python code execution.
-     **WARNING: THIS IS NOT SAFE FOR PRODUCTION. IT ONLY SIMULATES.**
-     Replace with a robust sandboxing mechanism (Docker, nsjail, WASM, etc.).
-     """
-     print(f"DEV_INFO: evaluation_engine.py - Entering PLACEHOLDER for code execution.")
-     print(f" Code (first 100 chars): {code_string[:100]}...")
-     print(f" Tests (first 100 chars): {user_tests_string[:100]}...")
-
-     if not user_tests_string.strip() or not code_string.strip():
-         return 0, 0, "SIMULATED: No tests provided or no code to test."
-
-     # Naive parsing of assert statements
-     test_lines = [line.strip() for line in user_tests_string.splitlines() if line.strip().startswith("assert")]
-     total_tests_found = len(test_lines)
-
-     if total_tests_found == 0:
-         return 0, 0, "SIMULATED: No 'assert' statements found in user tests."
-
-     # Extremely simplistic simulation logic (NOT REAL EXECUTION)
-     passed_count = 0
-     execution_log = ["SIMULATED EXECUTION LOG:"]
-     try:
-         # This is where real sandboxed execution would happen.
-         # We'll simulate based on keywords for demonstration.
-         if "syntax error" in code_string.lower() or "indentationerror" in code_string.lower():
-             execution_log.append(" - Simulated: Potential syntax error in generated code.")
-             # passed_count remains 0
-         elif "runtime error" in code_string.lower() or "exception" in code_string.lower():
-             execution_log.append(" - Simulated: Code might raise a runtime error.")
-             passed_count = random.randint(0, total_tests_found // 3) # Few pass
-         elif "return" not in code_string and any("==" in t for t in test_lines): # If expecting a return value
-             execution_log.append(" - Simulated: Code might be missing a crucial 'return' statement.")
-             passed_count = random.randint(0, total_tests_found // 2)
-         else: # Simulate some passing, some failing
-             passed_count = random.randint(total_tests_found // 2, total_tests_found)
-             execution_log.append(f" - Simulated: {passed_count} of {total_tests_found} tests likely passed.")
-
-         if passed_count < total_tests_found:
-             execution_log.append(f" - Simulated: {total_tests_found - passed_count} test(s) likely failed.")
-
-         summary = f"Simulated: {passed_count}/{total_tests_found} tests passed."
-         if passed_count < total_tests_found : summary += " Some tests likely failed."
-
-     except Exception as e_sim: # Error in our simulation logic
-         summary = f"Error during test SIMULATION logic: {str(e_sim)}"
-         passed_count = 0
-         execution_log.append(f" - ERROR in simulation: {e_sim}")
-
-     print(f"DEV_INFO: evaluation_engine.py - Placeholder execution result: {summary}")
-     return passed_count, total_tests_found, "\n".join(execution_log)
-

  def evaluate_solution_candidate(
      solution_text: str,
      problem_description: str,
      problem_type: str,
-     user_provided_tests: str, # String of Python assert statements
-     llm_client_config: dict # {"type": ..., "model_id": ..., "temp": ..., "max_tokens": ...}
- ) -> EvaluationResult:
-     """
-     Evaluates a single solution candidate.
-     """
-     llm_critique_output_text = "LLM critique could not be performed due to an earlier error or API issue."
-     llm_based_score = 0
      raw_llm_critique_resp = None

-     # 1. LLM-based Critique (if solution_text is not an error message itself)
      if solution_text and not solution_text.startswith("ERROR"):
-         system_p_critique = get_system_prompt("critique_general") # problem_type can be used here too
          user_p_critique = format_critique_user_prompt(problem_description, solution_text)

-         llm_response_obj = None # type: LLMResponse
          if llm_client_config["type"] == "hf":
-             llm_response_obj = call_huggingface_api(
-                 user_p_critique, llm_client_config["model_id"],
-                 temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
-                 system_prompt_text=system_p_critique
-             )
          elif llm_client_config["type"] == "google_gemini":
-             llm_response_obj = call_gemini_api(
-                 user_p_critique, llm_client_config["model_id"],
-                 temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
-                 system_prompt_text=system_p_critique
-             )

          if llm_response_obj:
              raw_llm_critique_resp = llm_response_obj.raw_response
              if llm_response_obj.success:
-                 llm_critique_output_text = llm_response_obj.text
-                 llm_based_score = _parse_score_from_llm_text(llm_critique_output_text)
              else:
-                 llm_critique_output_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
-                 llm_based_score = 0 # Penalize for critique failure
      elif solution_text and solution_text.startswith("ERROR"):
-         llm_critique_output_text = f"Solution was an error from Genesis: {solution_text}"
-         llm_based_score = 0
-

-     # 2. (Simulated) Code Execution if applicable
-     passed_tests_count = 0
-     total_tests_count = 0
-     exec_summary_msg = "Automated tests not applicable or not run for this problem type/solution."

-     # Only run tests if it's a Python problem, tests are provided, and solution isn't an error
-     if "python" in problem_type.lower() and user_provided_tests.strip() and solution_text and not solution_text.startswith("ERROR"):
-         # **IMPORTANT**: Replace with a REAL sandboxed executor for safety.
-         passed_tests_count, total_tests_count, exec_summary_msg = _placeholder_safe_python_execution(
-             solution_text, user_provided_tests
          )
-     elif "python" in problem_type.lower() and not user_provided_tests.strip():
-         exec_summary_msg = "No user tests provided for this Python problem."
-
-
-     # 3. Combine Scores into a Final Score (Example Heuristic)
-     final_score_calculated = llm_based_score
-     if total_tests_count > 0: # If tests were run
-         test_pass_ratio = passed_tests_count / total_tests_count
-         if test_pass_ratio < 0.5 : # Penalize heavily if less than half tests pass
-             final_score_calculated = max(1, int(llm_based_score * 0.5) - 1)
-         elif test_pass_ratio == 1.0 and passed_tests_count > 0: # All tests passed
-             final_score_calculated = min(10, llm_based_score + 1 if llm_based_score < 10 else 10) # Small bonus
-         else: # Some tests passed or ratio between 0.5 and 1.0
-             final_score_calculated = int(llm_based_score * (0.6 + 0.4 * test_pass_ratio))
-         final_score_calculated = max(1, min(10, final_score_calculated)) # Ensure score is 1-10
-
-     # Construct comprehensive critique text for display
-     comprehensive_critique = f"{llm_critique_output_text}"
-     if total_tests_count > 0 or ("python" in problem_type.lower() and user_provided_tests.strip()): # Add test summary if applicable
-         comprehensive_critique += f"\n\n**Automated Test Summary (Simulated):**\n{exec_summary_msg}\n"
-         comprehensive_critique += f"Passed: {passed_tests_count}/{total_tests_count}"
-

-     return EvaluationResult(
-         score=final_score_calculated,
-         critique_text=comprehensive_critique,
-         passed_tests=passed_tests_count,
-         total_tests=total_tests_count,
-         execution_summary=exec_summary_msg,
-         raw_llm_critique_response=raw_llm_critique_resp
      )

  # algoforge_prime/core/evaluation_engine.py
  import random
+ from .llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
+ from ..prompts.system_prompts import get_system_prompt
+ from ..prompts.prompt_templates import format_critique_user_prompt
+ # Import our (simulated) safe executor
+ from .safe_executor import execute_python_code_with_tests, ExecutionResult # Assuming it's in the same 'core' package
+
+ class EvaluationResultOutput: # Renamed to avoid conflict with safe_executor.ExecutionResult
+     def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
+         self.combined_score = combined_score
+         self.llm_critique_text = llm_critique_text # LLM's qualitative assessment
+         self.execution_details = execution_details # Object from safe_executor
+         self.raw_llm_response = raw_llm_response
+
+     def get_display_critique(self):
+         full_critique = self.llm_critique_text if self.llm_critique_text else "LLM critique was not performed or failed."
+         if self.execution_details:
+             full_critique += f"\n\n**Automated Execution & Test Results (Simulated):**\n"
+             if self.execution_details.total_tests > 0:
+                 full_critique += f" Tests Attempted: {self.execution_details.total_tests}\n"
+                 full_critique += f" Tests Passed: {self.execution_details.passed_tests}\n"
+             if self.execution_details.error:
+                 full_critique += f" Execution Error: {self.execution_details.error}\n"
+             elif self.execution_details.output:
+                 full_critique += f" Execution Output (stdout):\n```\n{self.execution_details.output[:500]}\n```\n" # Limit output display
+             full_critique += f" Execution Time: {self.execution_details.execution_time:.4f}s\n"
+         return full_critique
+
+
+ def _parse_llm_score(llm_text_output: str) -> int:
+     # ... (keep your existing _parse_score_from_llm_text, renamed for clarity) ...
+     score = 0
+     if not llm_text_output or not isinstance(llm_text_output, str): return score
      try:
          import re
          match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
          if match:
              parsed_score_val = int(match.group(1))
+             score = max(1, min(parsed_score_val, 10))
+         else: score = random.randint(3, 6) # Fallback if no score marker
+     except Exception: score = random.randint(3, 5) # Fallback on any parsing error
      return score

  def evaluate_solution_candidate(
      solution_text: str,
      problem_description: str,
      problem_type: str,
+     user_provided_tests_code: str,
+     llm_client_config: dict
+ ) -> EvaluationResultOutput:
+
+     llm_critique_text = "LLM critique generation failed or was skipped."
+     llm_score = 0
      raw_llm_critique_resp = None
+     execution_result_obj = None # type: ExecutionResult

+     # 1. LLM-based Critique (only if solution_text is not an error itself)
      if solution_text and not solution_text.startswith("ERROR"):
+         system_p_critique = get_system_prompt("critique_general")
          user_p_critique = format_critique_user_prompt(problem_description, solution_text)

+         llm_response_obj = None
          if llm_client_config["type"] == "hf":
+             llm_response_obj = call_huggingface_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
          elif llm_client_config["type"] == "google_gemini":
+             llm_response_obj = call_gemini_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)

          if llm_response_obj:
              raw_llm_critique_resp = llm_response_obj.raw_response
              if llm_response_obj.success:
+                 llm_critique_text = llm_response_obj.text
+                 llm_score = _parse_llm_score(llm_critique_text)
              else:
+                 llm_critique_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
+                 llm_score = 0 # Penalize
      elif solution_text and solution_text.startswith("ERROR"):
+         llm_critique_text = f"Solution was an error from Genesis: {solution_text}"
+         llm_score = 0


+     # 2. Code Execution (if Python problem, code exists, and tests are provided)
+     if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
+         print(f"INFO: evaluation_engine.py - Preparing to execute Python code candidate against user tests.")
+         # Use the (simulated) safe executor
+         execution_result_obj = execute_python_code_with_tests(
+             solution_text, user_provided_tests_code, timeout_seconds=10 # Example timeout
          )
+         print(f"INFO: evaluation_engine.py - Execution result: {execution_result_obj}")
+     elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
+         execution_result_obj = ExecutionResult(success=True, output="No user tests provided to run against the Python code.", total_tests=0)
+
+
+     # 3. Combine Scores into a Final Score (More sophisticated heuristic)
+     combined_score = llm_score
+     if execution_result_obj and execution_result_obj.total_tests > 0:
+         if not execution_result_obj.success or execution_result_obj.error: # Major execution failure
+             combined_score = max(1, llm_score - 5) # Penalize heavily
+         else:
+             pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
+             if pass_ratio == 1.0: # All tests passed
+                 combined_score = min(10, llm_score + 2) # Significant bonus
+             elif pass_ratio >= 0.75: # Most tests passed
+                 combined_score = min(10, llm_score + 1) # Small bonus
+             elif pass_ratio < 0.25: # Very few tests passed
+                 combined_score = max(1, llm_score - 4)
+             else: # Some tests passed
+                 combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio)) # Weighted average
+
+     combined_score = max(1, min(10, combined_score)) # Clamp 1-10

+     return EvaluationResultOutput(
+         combined_score=combined_score,
+         llm_critique_text=llm_critique_text,
+         execution_details=execution_result_obj,
+         raw_llm_response=raw_llm_critique_resp
      )
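
This commit touches only core/evaluation_engine.py; the core/safe_executor.py module it now imports is not included in the diff. As a rough illustration, here is a minimal, hypothetical sketch of the interface the updated engine assumes, with the ExecutionResult fields (success, output, error, passed_tests, total_tests, execution_time) and the execute_python_code_with_tests(code, tests, timeout_seconds) signature inferred from how they are used above. Like the placeholder it replaces, it only simulates "safe" execution by exec()-ing the code in-process and is not a real sandbox.

# Hypothetical sketch of core/safe_executor.py (not part of this commit); names inferred from usage above.
# WARNING: exec() on untrusted code is NOT safe. A real implementation needs sandboxing
# (Docker, nsjail, WASM, etc.), as the removed placeholder's docstring already warns.
import time
from dataclasses import dataclass

@dataclass
class ExecutionResult:
    success: bool = False
    output: str = ""            # captured stdout or an informational message
    error: str = ""             # first non-assertion error encountered, empty if none
    passed_tests: int = 0
    total_tests: int = 0
    execution_time: float = 0.0

def execute_python_code_with_tests(code_string: str, tests_string: str,
                                    timeout_seconds: int = 10) -> ExecutionResult:
    # timeout_seconds is accepted for interface parity but not enforced in this sketch.
    start = time.time()
    result = ExecutionResult()
    test_lines = [ln.strip() for ln in tests_string.splitlines() if ln.strip().startswith("assert")]
    result.total_tests = len(test_lines)
    namespace: dict = {}
    try:
        exec(code_string, namespace)          # define the candidate's functions
        result.success = True
        for ln in test_lines:
            try:
                exec(ln, namespace)           # each assert counts as one test
                result.passed_tests += 1
            except AssertionError:
                pass                          # a failed test is not an execution error
            except Exception as e:
                if not result.error:
                    result.error = f"{type(e).__name__}: {e}"
    except Exception as e:                    # candidate code failed to even load
        result.error = f"{type(e).__name__}: {e}"
    result.execution_time = time.time() - start
    return result

Under the same assumptions, a caller would invoke the updated engine roughly like this (candidate_code, problem_text, and user_tests are placeholder variables; the model id and sampling values are placeholders, and the config keys mirror the ones read in evaluate_solution_candidate):

llm_config = {"type": "hf", "model_id": "<your-model-id>", "temp": 0.3, "max_tokens": 512}
result = evaluate_solution_candidate(candidate_code, problem_text, "python", user_tests, llm_config)
print(result.combined_score)
print(result.get_display_critique())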