# algoforge_prime/core/evaluation_engine.py
import random
import re
import time
import traceback

# IMPORTANT: The following import is for a HYPOTHETICAL safe executor.
# You would need to implement or find a robust sandboxing solution.
# from .restricted_env_executor import execute_python_code_safely  # Example

from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse  # Absolute
from prompts.system_prompts import get_system_prompt  # Absolute
from prompts.prompt_templates import format_critique_user_prompt  # Absolute


class EvaluationResult:
    def __init__(self, score=0, critique_text="", passed_tests=0, total_tests=0, execution_summary=None, raw_llm_critique_response=None):
        self.score = score  # Final combined score
        self.critique_text = critique_text  # LLM-based critique + execution summary
        self.passed_tests = passed_tests
        self.total_tests = total_tests
        self.execution_summary = execution_summary  # Error or success message from code execution
        self.raw_llm_critique_response = raw_llm_critique_response

    def __str__(self):  # For a simple string representation if needed
        return f"Score: {self.score}/10. Tests: {self.passed_tests}/{self.total_tests}. Summary: {self.execution_summary}. Critique: {self.critique_text[:100]}..."


def _parse_score_from_llm_text(llm_text_output: str) -> int:
    """Helper to parse 'Score: X/10' from the LLM's textual output."""
    score = 0  # Default if not found or unparsable
    if not llm_text_output or not isinstance(llm_text_output, str):
        return score
    try:
        # Look for "Score: X/10" or "Score: X".
        # More robust parsing might be needed depending on LLM variability.
        match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
        if match:
            parsed_score_val = int(match.group(1))
            score = max(1, min(parsed_score_val, 10))  # Clamp score to 1-10
        else:  # Fallback if the specific format is not found
            print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found in LLM output. Assigning fallback score. Output: {llm_text_output[:100]}...")
            score = random.randint(3, 6)  # Assign a mediocre random score
    except Exception as e:
        print(f"WARNING: evaluation_engine.py - Error parsing score from LLM output '{llm_text_output[:100]}...': {e}")
        score = random.randint(3, 5)  # Fallback on parsing error
    return score
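
# Illustrative parsing behaviour (assumed typical LLM phrasings, not exhaustive):
#   "Overall a solid solution. Score: 8/10"  -> 8
#   "score: 12"                              -> clamped to 10
#   "I would rate this highly."              -> no marker, random fallback in 3-6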


def _placeholder_safe_python_execution(code_string: str, user_tests_string: str) -> tuple[int, int, str]:
    """
    PLACEHOLDER for safe Python code execution.
    **WARNING: THIS IS NOT SAFE FOR PRODUCTION. IT ONLY SIMULATES.**
    Replace with a robust sandboxing mechanism (Docker, nsjail, WASM, etc.).
    """
    print("DEV_INFO: evaluation_engine.py - Entering PLACEHOLDER for code execution.")
    print(f"  Code (first 100 chars): {code_string[:100]}...")
    print(f"  Tests (first 100 chars): {user_tests_string[:100]}...")

    if not user_tests_string.strip() or not code_string.strip():
        return 0, 0, "SIMULATED: No tests provided or no code to test."

    # Naive parsing of assert statements
    test_lines = [line.strip() for line in user_tests_string.splitlines() if line.strip().startswith("assert")]
    total_tests_found = len(test_lines)
    if total_tests_found == 0:
        return 0, 0, "SIMULATED: No 'assert' statements found in user tests."

    # Extremely simplistic simulation logic (NOT REAL EXECUTION)
    passed_count = 0
    execution_log = ["SIMULATED EXECUTION LOG:"]
    try:
        # This is where real sandboxed execution would happen.
        # We simulate based on keywords for demonstration.
        if "syntax error" in code_string.lower() or "indentationerror" in code_string.lower():
            execution_log.append("  - Simulated: Potential syntax error in generated code.")
            # passed_count remains 0
        elif "runtime error" in code_string.lower() or "exception" in code_string.lower():
            execution_log.append("  - Simulated: Code might raise a runtime error.")
            passed_count = random.randint(0, total_tests_found // 3)  # Few pass
        elif "return" not in code_string and any("==" in t for t in test_lines):  # If a return value is expected
            execution_log.append("  - Simulated: Code might be missing a crucial 'return' statement.")
            passed_count = random.randint(0, total_tests_found // 2)
        else:  # Simulate some passing, some failing
            passed_count = random.randint(total_tests_found // 2, total_tests_found)

        execution_log.append(f"  - Simulated: {passed_count} of {total_tests_found} tests likely passed.")
        if passed_count < total_tests_found:
            execution_log.append(f"  - Simulated: {total_tests_found - passed_count} test(s) likely failed.")
        summary = f"Simulated: {passed_count}/{total_tests_found} tests passed."
        if passed_count < total_tests_found:
            summary += " Some tests likely failed."
    except Exception as e_sim:  # Error in our simulation logic
        summary = f"Error during test SIMULATION logic: {str(e_sim)}"
        passed_count = 0
        execution_log.append(f"  - ERROR in simulation: {e_sim}")

    print(f"DEV_INFO: evaluation_engine.py - Placeholder execution result: {summary}")
    return passed_count, total_tests_found, "\n".join(execution_log)
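

# --- Hedged sketch of a less-unsafe executor (NOT wired into the engine) -----
# Illustrative only. It assumes that running the candidate code in a separate
# Python subprocess with a wall-clock timeout is acceptable: that gives process
# isolation and a time limit, but it is still NOT a real sandbox (no filesystem,
# network, or memory restrictions). The helper name and the 10-second timeout
# are assumptions, not part of the existing module.
def _subprocess_python_execution(code_string: str, user_tests_string: str,
                                 timeout_seconds: int = 10) -> tuple[int, int, str]:
    import os
    import subprocess
    import sys
    import tempfile

    test_lines = [line.strip() for line in user_tests_string.splitlines() if line.strip().startswith("assert")]
    if not test_lines or not code_string.strip():
        return 0, 0, "No code or no 'assert' tests provided."

    passed = 0
    log = ["SUBPROCESS EXECUTION LOG:"]
    # Run each assert in its own process so one failure does not mask the others.
    for idx, assert_line in enumerate(test_lines, start=1):
        script = f"{code_string}\n\n{assert_line}\n"
        with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as tmp_file:
            tmp_file.write(script)
            tmp_path = tmp_file.name
        try:
            proc = subprocess.run(
                [sys.executable, tmp_path],
                capture_output=True, text=True, timeout=timeout_seconds
            )
            if proc.returncode == 0:
                passed += 1
                log.append(f"  - Test {idx}: passed")
            else:
                last_err = proc.stderr.strip().splitlines()[-1] if proc.stderr.strip() else "non-zero exit"
                log.append(f"  - Test {idx}: failed ({last_err})")
        except subprocess.TimeoutExpired:
            log.append(f"  - Test {idx}: timed out after {timeout_seconds}s")
        finally:
            os.unlink(tmp_path)
    return passed, len(test_lines), "\n".join(log)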


def evaluate_solution_candidate(
    solution_text: str,
    problem_description: str,
    problem_type: str,
    user_provided_tests: str,  # String of Python assert statements
    llm_client_config: dict  # {"type": ..., "model_id": ..., "temp": ..., "max_tokens": ...}
) -> EvaluationResult:
    """
    Evaluates a single solution candidate.
    """
    llm_critique_output_text = "LLM critique could not be performed due to an earlier error or API issue."
    llm_based_score = 0
    raw_llm_critique_resp = None

    # 1. LLM-based critique (if solution_text is not itself an error message)
    if solution_text and not solution_text.startswith("ERROR"):
        system_p_critique = get_system_prompt("critique_general")  # problem_type can be used here too
        user_p_critique = format_critique_user_prompt(problem_description, solution_text)

        llm_response_obj = None  # type: LLMResponse
        if llm_client_config["type"] == "hf":
            llm_response_obj = call_huggingface_api(
                user_p_critique, llm_client_config["model_id"],
                temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
                system_prompt_text=system_p_critique
            )
        elif llm_client_config["type"] == "google_gemini":
            llm_response_obj = call_gemini_api(
                user_p_critique, llm_client_config["model_id"],
                temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
                system_prompt_text=system_p_critique
            )

        if llm_response_obj:
            raw_llm_critique_resp = llm_response_obj.raw_response
            if llm_response_obj.success:
                llm_critique_output_text = llm_response_obj.text
                llm_based_score = _parse_score_from_llm_text(llm_critique_output_text)
            else:
                llm_critique_output_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
                llm_based_score = 0  # Penalize for critique failure
    elif solution_text and solution_text.startswith("ERROR"):
        llm_critique_output_text = f"Solution was an error from Genesis: {solution_text}"
        llm_based_score = 0

    # 2. (Simulated) code execution, if applicable
    passed_tests_count = 0
    total_tests_count = 0
    exec_summary_msg = "Automated tests not applicable or not run for this problem type/solution."

    # Only run tests if it is a Python problem, tests are provided, and the solution is not an error
    if "python" in problem_type.lower() and user_provided_tests.strip() and solution_text and not solution_text.startswith("ERROR"):
        # **IMPORTANT**: Replace with a REAL sandboxed executor for safety.
        passed_tests_count, total_tests_count, exec_summary_msg = _placeholder_safe_python_execution(
            solution_text, user_provided_tests
        )
    elif "python" in problem_type.lower() and not user_provided_tests.strip():
        exec_summary_msg = "No user tests provided for this Python problem."

    # 3. Combine scores into a final score (example heuristic)
    final_score_calculated = llm_based_score
    if total_tests_count > 0:  # If tests were run
        test_pass_ratio = passed_tests_count / total_tests_count
        if test_pass_ratio < 0.5:  # Penalize heavily if fewer than half the tests pass
            final_score_calculated = max(1, int(llm_based_score * 0.5) - 1)
        elif test_pass_ratio == 1.0 and passed_tests_count > 0:  # All tests passed
            final_score_calculated = min(10, llm_based_score + 1)  # Small bonus
        else:  # Some tests passed; ratio between 0.5 and 1.0
            final_score_calculated = int(llm_based_score * (0.6 + 0.4 * test_pass_ratio))
    final_score_calculated = max(1, min(10, final_score_calculated))  # Ensure the score stays within 1-10
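
    # Worked example of the heuristic above (illustrative values): with llm_based_score = 7
    # and 3 of 4 tests passing, test_pass_ratio = 0.75, so the blended score is
    # int(7 * (0.6 + 0.4 * 0.75)) = int(6.3) = 6, which already lies within 1-10.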

    # Construct comprehensive critique text for display
    comprehensive_critique = llm_critique_output_text
    if total_tests_count > 0 or ("python" in problem_type.lower() and user_provided_tests.strip()):  # Add a test summary if applicable
        comprehensive_critique += f"\n\n**Automated Test Summary (Simulated):**\n{exec_summary_msg}\n"
        comprehensive_critique += f"Passed: {passed_tests_count}/{total_tests_count}"

    return EvaluationResult(
        score=final_score_calculated,
        critique_text=comprehensive_critique,
        passed_tests=passed_tests_count,
        total_tests=total_tests_count,
        execution_summary=exec_summary_msg,
        raw_llm_critique_response=raw_llm_critique_resp
    )
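

# --- Hedged usage sketch (illustrative only) ----------------------------------
# Shows how a caller might invoke the evaluator end to end. The backend type,
# model id, and config values below are assumptions for demonstration, not
# defaults of this module, and a real run requires valid API credentials for
# the selected backend.
if __name__ == "__main__":
    example_config = {
        "type": "hf",
        "model_id": "mistralai/Mistral-7B-Instruct-v0.2",  # hypothetical model choice
        "temp": 0.3,
        "max_tokens": 512,
    }
    example_result = evaluate_solution_candidate(
        solution_text="def add(a, b):\n    return a + b",
        problem_description="Write a function add(a, b) that returns the sum of two numbers.",
        problem_type="python_function",
        user_provided_tests="assert add(2, 3) == 5\nassert add(-1, 1) == 0",
        llm_client_config=example_config,
    )
    print(example_result)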