# algoforge_prime/core/evaluation_engine.py
import random
import re
import traceback

from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
from prompts.system_prompts import get_system_prompt
from prompts.prompt_templates import format_critique_user_prompt
from core.safe_executor import execute_python_code_with_tests, ExecutionResult, TestResult

print("DEBUG: core.evaluation_engine - Imports successful")


class EvaluationResultOutput:
    def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
        self.combined_score = combined_score
        self.llm_critique_text = llm_critique_text
        self.execution_details = execution_details
        self.raw_llm_response = raw_llm_response

    def get_display_critique(self) -> str:
        """Formats a comprehensive critique including LLM feedback and execution results."""
        critique_parts = []
        critique_parts.append(self.llm_critique_text if self.llm_critique_text else "LLM critique was not performed or failed.")
        if self.execution_details:
            exec_details = self.execution_details
            critique_parts.append("\n\n**Automated Execution & Test Results (Simulated):**")
            if exec_details.compilation_error:
                critique_parts.append(f" Compilation Error: {exec_details.compilation_error}")
            elif exec_details.timeout_error:
                critique_parts.append(f" Execution Timed Out after {exec_details.execution_time:.2f}s.")
            else:
                if exec_details.total_tests > 0:
                    critique_parts.append(f" Tests Attempted: {exec_details.total_tests}")
                    critique_parts.append(f" Tests Passed: {exec_details.passed_tests}")
                    if exec_details.passed_tests < exec_details.total_tests:
                        critique_parts.append(" Failed Tests Details:")
                        for test_res in exec_details.individual_test_results:
                            if not test_res.passed:
                                critique_parts.append(f" - Test: `{test_res.test_string[:70]}...`")
                                if test_res.error_message:
                                    critique_parts.append(f" Error: {test_res.error_message[:100]}...")
                else:  # Code ran, but no assert-based tests provided/found
                    critique_parts.append(" Code executed (no assert-based tests found/run).")
                if exec_details.stdout:
                    critique_parts.append(f" Execution Stdout (truncated):\n```\n{exec_details.stdout[:300].strip()}\n```")
                # Show general stderr if not already part of a test fail
                if exec_details.stderr and not any(not tr.passed for tr in exec_details.individual_test_results if tr.error_message):
                    critique_parts.append(f" Execution Stderr (general):\n```\n{exec_details.stderr[:300].strip()}\n```")
            critique_parts.append(f" Simulated Execution Time: {exec_details.execution_time:.4f}s")
        return "\n".join(critique_parts)


def _parse_llm_score(llm_text_output: str) -> int:
    """Extracts an integer score (clamped to 1-10) from a 'Score: X/10' marker in the LLM critique."""
    score = 0
    if not llm_text_output or not isinstance(llm_text_output, str):
        return score
    match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
    if match:
        score = max(1, min(int(match.group(1)), 10))
    else:
        print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found. Output: {llm_text_output[:100]}...")
        score = random.randint(3, 6)  # Fallback heuristic when the LLM omits the score marker
    return score
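
# Example of the marker _parse_llm_score looks for (illustrative; the surrounding
# critique text is hypothetical and comes from the LLM):
#   "The recursion is correct but exponential. Score: 7/10"  -> 7
#   "No score marker present."                               -> random fallback in [3, 6]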


def evaluate_solution_candidate(
    solution_text: str,
    problem_description: str,
    problem_type: str,
    user_provided_tests_code: str,
    llm_client_config: dict
) -> EvaluationResultOutput:
    print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
    llm_critique_text = "LLM critique generation failed or was skipped."
    llm_score = 0
    raw_llm_critique_resp = None
    execution_result_obj = None  # populated with an ExecutionResult if code execution runs
    # 1. LLM-based Critique
    if solution_text and not solution_text.startswith("ERROR"):
        system_p_critique = get_system_prompt("critique_general")
        user_p_critique = format_critique_user_prompt(problem_description, solution_text)
        llm_response_obj = None
        if llm_client_config["type"] == "hf":
            llm_response_obj = call_huggingface_api(
                user_p_critique, llm_client_config["model_id"], llm_client_config["temp"],
                llm_client_config["max_tokens"], system_p_critique)
        elif llm_client_config["type"] == "google_gemini":
            llm_response_obj = call_gemini_api(
                user_p_critique, llm_client_config["model_id"], llm_client_config["temp"],
                llm_client_config["max_tokens"], system_p_critique)
        if llm_response_obj:
            raw_llm_critique_resp = llm_response_obj.raw_response
            if llm_response_obj.success:
                llm_critique_text = llm_response_obj.text
                llm_score = _parse_llm_score(llm_response_obj.text)
            else:
                llm_critique_text = f"Error during LLM critique: {llm_response_obj.error}"
                llm_score = 0
    elif solution_text and solution_text.startswith("ERROR"):
        llm_critique_text = f"Solution was error from Genesis: {solution_text}"
        llm_score = 0
    # 2. Code Execution (only for Python-type problems with a usable candidate)
    if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR"):
        if user_provided_tests_code.strip():
            print("INFO: evaluation_engine.py - Executing Python code candidate against user tests.")
            execution_result_obj = execute_python_code_with_tests(solution_text, user_provided_tests_code, timeout_seconds=10)
        else:
            # No tests provided: still run the code to catch basic runtime/compile errors.
            print("INFO: evaluation_engine.py - Executing Python code candidate (no tests provided).")
            execution_result_obj = execute_python_code_with_tests(solution_text, "", timeout_seconds=5)
        print(f"INFO: evaluation_engine.py - Execution result: {execution_result_obj}")
    # 3. Combine Scores into a Final Score
    combined_score = llm_score
    if execution_result_obj:
        if (execution_result_obj.compilation_error or execution_result_obj.timeout_error
                or (not execution_result_obj.success and execution_result_obj.stderr
                    and not execution_result_obj.individual_test_results)):
            combined_score = 1  # Catastrophic failure
        elif execution_result_obj.total_tests > 0:
            pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
            if pass_ratio == 1.0:
                combined_score = min(10, llm_score + 3)  # Strong bonus for all tests passing
            elif pass_ratio >= 0.8:
                combined_score = min(10, llm_score + 1)
            elif pass_ratio < 0.2:
                combined_score = max(1, llm_score - 6)  # Heavy penalty
            elif pass_ratio < 0.5:
                combined_score = max(1, llm_score - 4)
            else:
                combined_score = int(llm_score * (0.4 + 0.6 * pass_ratio))  # Weighted more by tests
        elif not execution_result_obj.success and execution_result_obj.error:  # General runtime error without tests
            combined_score = max(1, llm_score - 4)
    combined_score = max(1, min(10, combined_score))
    print(f"DEBUG: evaluation_engine.py - Evaluation complete. Combined Score: {combined_score}")
    return EvaluationResultOutput(
        combined_score=combined_score,
        llm_critique_text=llm_critique_text,
        execution_details=execution_result_obj,
        raw_llm_response=raw_llm_critique_resp
    )


print("DEBUG: core.evaluation_engine - Module fully defined.")