# algoforge_prime/core/evaluation_engine.py
import random
import re
import traceback

from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
from prompts.system_prompts import get_system_prompt
from prompts.prompt_templates import format_critique_user_prompt
from .safe_executor import execute_python_code_with_tests, ExecutionResult  # relative import within the core package

print("DEBUG: core.evaluation_engine - Imports successful")
class EvaluationResultOutput:
    def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
        self.combined_score = combined_score
        self.llm_critique_text = llm_critique_text
        self.execution_details = execution_details
        self.raw_llm_response = raw_llm_response

    def get_display_critique(self):
        full_critique = self.llm_critique_text if self.llm_critique_text else "LLM critique failed/skipped."
        if self.execution_details:
            full_critique += "\n\n**Automated Execution & Test Results (Simulated):**\n"
            if self.execution_details.total_tests > 0:
                full_critique += f" Tests: {self.execution_details.passed_tests}/{self.execution_details.total_tests} passed.\n"
            if self.execution_details.error:
                full_critique += f" Error: {self.execution_details.error}\n"
            elif self.execution_details.output:
                full_critique += f" Output:\n```\n{self.execution_details.output[:500]}\n```\n"
            full_critique += f" Time: {self.execution_details.execution_time:.4f}s\n"
        return full_critique

def _parse_llm_score(llm_text_output: str) -> int:
    """Extract a 1-10 score from text like "Score: 8/10", clamped to range."""
    if not llm_text_output or not isinstance(llm_text_output, str):
        return 0
    match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
    if match:
        return max(1, min(int(match.group(1)), 10))
    # No explicit "Score:" found; fall back to a rough mid-range guess.
    return random.randint(3, 6)
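
# Illustrative behaviour of the parser above (not exhaustive):
#   _parse_llm_score("Solid solution. Score: 8/10")  -> 8
#   _parse_llm_score("score: 12")                    -> 10   (clamped to the 1-10 range)
#   _parse_llm_score("no numeric verdict given")     -> random value in 3..6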
# _placeholder_safe_python_execution remains in safe_executor.py; this module only uses the imported execute_python_code_with_tests.

def evaluate_solution_candidate(
    solution_text: str, problem_description: str, problem_type: str,
    user_provided_tests_code: str, llm_client_config: dict
) -> EvaluationResultOutput:
    print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
    llm_critique_text, llm_score = "LLM critique failed/skipped.", 0
    raw_llm_critique_resp, execution_result_obj = None, None

    # 1) LLM critique: only attempted for non-error candidate solutions.
    if solution_text and not solution_text.startswith("ERROR"):
        system_p_critique = get_system_prompt("critique_general")
        user_p_critique = format_critique_user_prompt(problem_description, solution_text)
        llm_response_obj = None
        if llm_client_config["type"] == "hf":
            llm_response_obj = call_huggingface_api(
                user_p_critique, llm_client_config["model_id"],
                llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
        elif llm_client_config["type"] == "google_gemini":
            llm_response_obj = call_gemini_api(
                user_p_critique, llm_client_config["model_id"],
                llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
        if llm_response_obj:
            raw_llm_critique_resp = llm_response_obj.raw_response
            if llm_response_obj.success:
                llm_critique_text = llm_response_obj.text
                llm_score = _parse_llm_score(llm_response_obj.text)
            else:
                llm_critique_text = f"Error during LLM critique: {llm_response_obj.error}"
                llm_score = 0
    elif solution_text and solution_text.startswith("ERROR"):
        llm_critique_text, llm_score = f"Solution was error: {solution_text}", 0

    # 2) Automated execution: run user-provided tests for Python problems.
    if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
        execution_result_obj = execute_python_code_with_tests(solution_text, user_provided_tests_code, timeout_seconds=10)
    elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
        execution_result_obj = ExecutionResult(success=True, output="No user tests provided.", total_tests=0)

    # 3) Combine: start from the LLM score, then adjust it by test outcomes.
    combined_score = llm_score
    if execution_result_obj and execution_result_obj.total_tests > 0:
        if not execution_result_obj.success or execution_result_obj.error:
            combined_score = max(1, llm_score - 5)
        else:
            pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
            if pass_ratio == 1.0:
                combined_score = min(10, llm_score + 2)
            elif pass_ratio >= 0.75:
                combined_score = min(10, llm_score + 1)
            elif pass_ratio < 0.25:
                combined_score = max(1, llm_score - 4)
            else:
                combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio))
    combined_score = max(1, min(10, combined_score))

    return EvaluationResultOutput(combined_score, llm_critique_text, execution_result_obj, raw_llm_critique_resp)
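
# Worked example of the blending above (illustrative arithmetic): with llm_score = 6 and
# 2 of 4 user tests passing, pass_ratio = 0.5, so combined_score = int(6 * (0.5 + 0.5 * 0.5))
# = int(4.5) = 4. A full pass would instead lift the score to min(10, 6 + 2) = 8, and a
# failed or errored run would drop it to max(1, 6 - 5) = 1.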
print("DEBUG: core.evaluation_engine - Module fully defined.") |