# algoforge_prime/core/evaluation_engine.py
import random
import re
import traceback
from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
from prompts.system_prompts import get_system_prompt
from prompts.prompt_templates import format_critique_user_prompt
from .safe_executor import execute_python_code_with_tests, ExecutionResult  # relative import within the core package
print("DEBUG: core.evaluation_engine - Imports successful")
class EvaluationResultOutput:
    """Bundles the LLM critique, the (simulated) execution results, and the combined score."""
    def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
        self.combined_score = combined_score
        self.llm_critique_text = llm_critique_text
        self.execution_details = execution_details
        self.raw_llm_response = raw_llm_response

    def get_display_critique(self):
        """Return the LLM critique with the execution/test summary appended for display."""
        full_critique = self.llm_critique_text if self.llm_critique_text else "LLM critique failed/skipped."
        if self.execution_details:
            full_critique += "\n\n**Automated Execution & Test Results (Simulated):**\n"
            if self.execution_details.total_tests > 0:
                full_critique += f"  Tests: {self.execution_details.passed_tests}/{self.execution_details.total_tests} passed.\n"
            if self.execution_details.error:
                full_critique += f"  Error: {self.execution_details.error}\n"
            elif self.execution_details.output:
                full_critique += f"  Output:\n```\n{self.execution_details.output[:500]}\n```\n"
            full_critique += f"  Time: {self.execution_details.execution_time:.4f}s\n"
        return full_critique
def _parse_llm_score(llm_text_output: str) -> int:
    """Extract a 1-10 score from critique text formatted as 'Score: N' or 'Score: N/10'."""
    score = 0
    if not llm_text_output or not isinstance(llm_text_output, str):
        return score
    match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
    if match:
        score = max(1, min(int(match.group(1)), 10))
    else:
        # No explicit score found; fall back to a mid-range placeholder value.
        score = random.randint(3, 6)
    return score
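# Illustrative examples of the parsing above (the critique wordings are made up):
#   _parse_llm_score("Solid approach overall. Score: 8/10")  -> 8
#   _parse_llm_score("No explicit rating was given.")        -> random placeholder in 3..6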
# _placeholder_safe_python_execution remains in safe_executor.py; execute_python_code_with_tests is imported from there.
def evaluate_solution_candidate(
    solution_text: str, problem_description: str, problem_type: str,
    user_provided_tests_code: str, llm_client_config: dict
) -> EvaluationResultOutput:
    """Critique a candidate solution with an LLM and, for Python problems, run any user-provided tests."""
    print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
    llm_critique_text = "LLM critique failed/skipped."
    llm_score = 0
    raw_llm_critique_resp = None
    execution_result_obj = None

    # 1. LLM critique (skipped if the candidate itself is an error string).
    if solution_text and not solution_text.startswith("ERROR"):
        system_p_critique = get_system_prompt("critique_general")
        user_p_critique = format_critique_user_prompt(problem_description, solution_text)
        llm_response_obj = None
        if llm_client_config["type"] == "hf":
            llm_response_obj = call_huggingface_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
        elif llm_client_config["type"] == "google_gemini":
            llm_response_obj = call_gemini_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
        if llm_response_obj:
            raw_llm_critique_resp = llm_response_obj.raw_response
            if llm_response_obj.success:
                llm_critique_text = llm_response_obj.text
                llm_score = _parse_llm_score(llm_response_obj.text)
            else:
                llm_critique_text = f"Error during LLM critique: {llm_response_obj.error}"
                llm_score = 0
    elif solution_text and solution_text.startswith("ERROR"):
        llm_critique_text = f"Solution was error: {solution_text}"
        llm_score = 0

    # 2. Automated test execution for Python problems.
    if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
        execution_result_obj = execute_python_code_with_tests(solution_text, user_provided_tests_code, timeout_seconds=10)
    elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
        execution_result_obj = ExecutionResult(success=True, output="No user tests provided.", total_tests=0)

    # 3. Combine the LLM score with the test outcome.
    combined_score = llm_score  # start with the LLM score
    if execution_result_obj and execution_result_obj.total_tests > 0:  # adjust based on tests
        if not execution_result_obj.success or execution_result_obj.error:
            combined_score = max(1, llm_score - 5)
        else:
            pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
            if pass_ratio == 1.0:
                combined_score = min(10, llm_score + 2)
            elif pass_ratio >= 0.75:
                combined_score = min(10, llm_score + 1)
            elif pass_ratio < 0.25:
                combined_score = max(1, llm_score - 4)
            else:
                combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio))
    combined_score = max(1, min(10, combined_score))

    return EvaluationResultOutput(combined_score, llm_critique_text, execution_result_obj, raw_llm_critique_resp)
print("DEBUG: core.evaluation_engine - Module fully defined.") |