# algoforge_prime/core/evaluation_engine.py
import random
import time
import traceback
# IMPORTANT: The following import is for a HYPOTHETICAL safe executor.
# You would need to implement or find a robust sandboxing solution.
# from .restricted_env_executor import execute_python_code_safely # Example
from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse # Absolute
from prompts.system_prompts import get_system_prompt # Absolute
from prompts.prompt_templates import format_critique_user_prompt # Absolute


class EvaluationResult:
    def __init__(self, score=0, critique_text="", passed_tests=0, total_tests=0, execution_summary=None, raw_llm_critique_response=None):
        self.score = score  # Final combined score
        self.critique_text = critique_text  # LLM based critique + execution summary
        self.passed_tests = passed_tests
        self.total_tests = total_tests
        self.execution_summary = execution_summary  # Error or success message from code execution
        self.raw_llm_critique_response = raw_llm_critique_response

    def __str__(self):  # For simple string representation if needed
        return f"Score: {self.score}/10. Tests: {self.passed_tests}/{self.total_tests}. Summary: {self.execution_summary}. Critique: {self.critique_text[:100]}..."


def _parse_score_from_llm_text(llm_text_output: str) -> int:
    """Helper to parse 'Score: X/10' from the LLM's textual output."""
    score = 0  # Default if not found or unparsable
    if not llm_text_output or not isinstance(llm_text_output, str):
        return score
    try:
        # Look for "Score: X/10" or "Score: X"
        # More robust parsing might be needed depending on LLM variability
        import re
        match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
        if match:
            parsed_score_val = int(match.group(1))
            score = max(1, min(parsed_score_val, 10))  # Clamp score to 1-10
        else:  # Fallback if the specific format is not found
            print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found in LLM output. Assigning fallback score. Output: {llm_text_output[:100]}...")
            score = random.randint(3, 6)  # Assign a mediocre random score
    except Exception as e:
        print(f"WARNING: evaluation_engine.py - Error parsing score from LLM output '{llm_text_output[:100]}...': {e}")
        score = random.randint(3, 5)  # Fallback on parsing error
    return score
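
# Illustrative parses for _parse_score_from_llm_text (assumed example inputs, not real LLM output):
# an answer containing "Overall assessment ... Score: 8/10" yields 8, "Score: 15" is clamped
# to 10 by the max/min guard, and text with no "Score:" marker falls back to random.randint(3, 6).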


def _placeholder_safe_python_execution(code_string: str, user_tests_string: str) -> tuple[int, int, str]:
    """
    PLACEHOLDER for safe Python code execution.
    **WARNING: THIS IS NOT SAFE FOR PRODUCTION. IT ONLY SIMULATES.**
    Replace with a robust sandboxing mechanism (Docker, nsjail, WASM, etc.).
    """
    print(f"DEV_INFO: evaluation_engine.py - Entering PLACEHOLDER for code execution.")
    print(f" Code (first 100 chars): {code_string[:100]}...")
    print(f" Tests (first 100 chars): {user_tests_string[:100]}...")

    if not user_tests_string.strip() or not code_string.strip():
        return 0, 0, "SIMULATED: No tests provided or no code to test."

    # Naive parsing of assert statements
    test_lines = [line.strip() for line in user_tests_string.splitlines() if line.strip().startswith("assert")]
    total_tests_found = len(test_lines)
    if total_tests_found == 0:
        return 0, 0, "SIMULATED: No 'assert' statements found in user tests."

    # Extremely simplistic simulation logic (NOT REAL EXECUTION)
    passed_count = 0
    execution_log = ["SIMULATED EXECUTION LOG:"]
    try:
        # This is where real sandboxed execution would happen.
        # We'll simulate based on keywords for demonstration.
        if "syntax error" in code_string.lower() or "indentationerror" in code_string.lower():
            execution_log.append(" - Simulated: Potential syntax error in generated code.")
            # passed_count remains 0
        elif "runtime error" in code_string.lower() or "exception" in code_string.lower():
            execution_log.append(" - Simulated: Code might raise a runtime error.")
            passed_count = random.randint(0, total_tests_found // 3)  # Few pass
        elif "return" not in code_string and any("==" in t for t in test_lines):  # If expecting a return value
            execution_log.append(" - Simulated: Code might be missing a crucial 'return' statement.")
            passed_count = random.randint(0, total_tests_found // 2)
        else:  # Simulate some passing, some failing
            passed_count = random.randint(total_tests_found // 2, total_tests_found)
        execution_log.append(f" - Simulated: {passed_count} of {total_tests_found} tests likely passed.")
        if passed_count < total_tests_found:
            execution_log.append(f" - Simulated: {total_tests_found - passed_count} test(s) likely failed.")
        summary = f"Simulated: {passed_count}/{total_tests_found} tests passed."
        if passed_count < total_tests_found: summary += " Some tests likely failed."
    except Exception as e_sim:  # Error in our simulation logic
        summary = f"Error during test SIMULATION logic: {str(e_sim)}"
        passed_count = 0
        execution_log.append(f" - ERROR in simulation: {e_sim}")

    print(f"DEV_INFO: evaluation_engine.py - Placeholder execution result: {summary}")
    return passed_count, total_tests_found, "\n".join(execution_log)
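

# --- Illustrative sketch only: a subprocess-based runner with a timeout ---
# This is NOT the robust sandbox the placeholder above asks for, and the name
# _subprocess_python_execution_sketch is hypothetical (it is not referenced anywhere
# else in this project). A child process with a timeout stops runaway loops, but it
# gives NO filesystem or network isolation; a real deployment still needs Docker,
# nsjail, WASM, or a similar boundary, as noted in the placeholder's docstring.
def _subprocess_python_execution_sketch(
    code_string: str, user_tests_string: str, timeout_s: int = 5
) -> tuple[int, int, str]:
    import os
    import subprocess
    import sys
    import tempfile

    assert_lines = [ln.strip() for ln in user_tests_string.splitlines() if ln.strip().startswith("assert")]
    if not assert_lines or not code_string.strip():
        return 0, 0, "Sketch: no code or no 'assert' tests to run."

    # Build a harness: the candidate code, then each assert wrapped in try/except
    # so one failing test does not stop the remaining ones.
    harness_parts = [code_string, "", "_passed, _total = 0, 0"]
    for ln in assert_lines:
        harness_parts += ["_total += 1", "try:", f"    {ln}", "    _passed += 1", "except Exception:", "    pass"]
    harness_parts.append("print('RESULT', _passed, _total)")

    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as tmp:
        tmp.write("\n".join(harness_parts))
        tmp_path = tmp.name
    try:
        proc = subprocess.run(
            [sys.executable, tmp_path], capture_output=True, text=True, timeout=timeout_s
        )
        for out_line in proc.stdout.splitlines():
            if out_line.startswith("RESULT"):
                _, passed, total = out_line.split()
                return int(passed), int(total), f"Sketch run: {passed}/{total} asserts passed."
        return 0, len(assert_lines), f"Sketch run produced no result line. stderr: {proc.stderr[:200]}"
    except subprocess.TimeoutExpired:
        return 0, len(assert_lines), f"Sketch run timed out after {timeout_s}s."
    finally:
        os.unlink(tmp_path)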


def evaluate_solution_candidate(
    solution_text: str,
    problem_description: str,
    problem_type: str,
    user_provided_tests: str,  # String of Python assert statements
    llm_client_config: dict  # {"type": ..., "model_id": ..., "temp": ..., "max_tokens": ...}
) -> EvaluationResult:
    """
    Evaluates a single solution candidate.
    """
    llm_critique_output_text = "LLM critique could not be performed due to an earlier error or API issue."
    llm_based_score = 0
    raw_llm_critique_resp = None

    # 1. LLM-based Critique (if solution_text is not an error message itself)
    if solution_text and not solution_text.startswith("ERROR"):
        system_p_critique = get_system_prompt("critique_general")  # problem_type can be used here too
        user_p_critique = format_critique_user_prompt(problem_description, solution_text)

        llm_response_obj = None  # type: LLMResponse
        if llm_client_config["type"] == "hf":
            llm_response_obj = call_huggingface_api(
                user_p_critique, llm_client_config["model_id"],
                temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
                system_prompt_text=system_p_critique
            )
        elif llm_client_config["type"] == "google_gemini":
            llm_response_obj = call_gemini_api(
                user_p_critique, llm_client_config["model_id"],
                temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
                system_prompt_text=system_p_critique
            )

        if llm_response_obj:
            raw_llm_critique_resp = llm_response_obj.raw_response
            if llm_response_obj.success:
                llm_critique_output_text = llm_response_obj.text
                llm_based_score = _parse_score_from_llm_text(llm_critique_output_text)
            else:
                llm_critique_output_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
                llm_based_score = 0  # Penalize for critique failure
    elif solution_text and solution_text.startswith("ERROR"):
        llm_critique_output_text = f"Solution was an error from Genesis: {solution_text}"
        llm_based_score = 0

    # 2. (Simulated) Code Execution if applicable
    passed_tests_count = 0
    total_tests_count = 0
    exec_summary_msg = "Automated tests not applicable or not run for this problem type/solution."

    # Only run tests if it's a Python problem, tests are provided, and the solution isn't an error
    if "python" in problem_type.lower() and user_provided_tests.strip() and solution_text and not solution_text.startswith("ERROR"):
        # **IMPORTANT**: Replace with a REAL sandboxed executor for safety.
        passed_tests_count, total_tests_count, exec_summary_msg = _placeholder_safe_python_execution(
            solution_text, user_provided_tests
        )
    elif "python" in problem_type.lower() and not user_provided_tests.strip():
        exec_summary_msg = "No user tests provided for this Python problem."

    # 3. Combine Scores into a Final Score (Example Heuristic)
    final_score_calculated = llm_based_score
    if total_tests_count > 0:  # If tests were run
        test_pass_ratio = passed_tests_count / total_tests_count
        if test_pass_ratio < 0.5:  # Penalize heavily if fewer than half the tests pass
            final_score_calculated = max(1, int(llm_based_score * 0.5) - 1)
        elif test_pass_ratio == 1.0 and passed_tests_count > 0:  # All tests passed
            final_score_calculated = min(10, llm_based_score + 1 if llm_based_score < 10 else 10)  # Small bonus
        else:  # Some tests passed (ratio between 0.5 and 1.0)
            final_score_calculated = int(llm_based_score * (0.6 + 0.4 * test_pass_ratio))
    final_score_calculated = max(1, min(10, final_score_calculated))  # Ensure score is 1-10
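
    # Worked example of the heuristic above (illustrative numbers, not from a real run):
    # llm_based_score = 7 with 3 of 4 tests passing gives test_pass_ratio = 0.75, so
    # final_score_calculated = int(7 * (0.6 + 0.4 * 0.75)) = 6; with 4 of 4 passing it
    # becomes min(10, 7 + 1) = 8, and with 1 of 4 passing it drops to max(1, int(7 * 0.5) - 1) = 2.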

    # Construct comprehensive critique text for display
    comprehensive_critique = f"{llm_critique_output_text}"
    if total_tests_count > 0 or ("python" in problem_type.lower() and user_provided_tests.strip()):  # Add test summary if applicable
        comprehensive_critique += f"\n\n**Automated Test Summary (Simulated):**\n{exec_summary_msg}\n"
        comprehensive_critique += f"Passed: {passed_tests_count}/{total_tests_count}"

    return EvaluationResult(
        score=final_score_calculated,
        critique_text=comprehensive_critique,
        passed_tests=passed_tests_count,
        total_tests=total_tests_count,
        execution_summary=exec_summary_msg,
        raw_llm_critique_response=raw_llm_critique_resp
    )
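

# Illustrative usage (hypothetical values; the calling app normally builds llm_client_config).
# The dict keys below mirror what evaluate_solution_candidate reads ("type", "model_id",
# "temp", "max_tokens"); the model id and problem_type strings are placeholders, not
# project defaults.
#
# example_config = {"type": "hf", "model_id": "some-org/some-model", "temp": 0.3, "max_tokens": 1024}
# result = evaluate_solution_candidate(
#     solution_text="def add(a, b):\n    return a + b",
#     problem_description="Write add(a, b) returning the sum of two numbers.",
#     problem_type="python_function",
#     user_provided_tests="assert add(1, 2) == 3\nassert add(-1, 1) == 0",
#     llm_client_config=example_config,
# )
# print(result)  # e.g. "Score: 7/10. Tests: 2/2. ..."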