# algoforge_prime/core/evaluation_engine.py
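"""Evaluation engine for the algoforge_prime app.

Scores candidate solutions by combining an LLM critique (via the Hugging Face
or Google Gemini clients in core.llm_clients) with optional execution of
user-provided Python tests (via core.safe_executor), folding both signals
into a single 1-10 combined score in evaluate_solution_candidate().
"""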
import random
import re
import traceback

from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
from prompts.system_prompts import get_system_prompt
from prompts.prompt_templates import format_critique_user_prompt
from .safe_executor import execute_python_code_with_tests, ExecutionResult  # relative import within the core package

print("DEBUG: core.evaluation_engine - Imports successful")


class EvaluationResultOutput:
    def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
        self.combined_score = combined_score
        self.llm_critique_text = llm_critique_text
        self.execution_details = execution_details
        self.raw_llm_response = raw_llm_response

    def get_display_critique(self):
        """Render the LLM critique plus any execution/test results as display text."""
        full_critique = self.llm_critique_text if self.llm_critique_text else "LLM critique failed/skipped."
        if self.execution_details:
            full_critique += "\n\n**Automated Execution & Test Results (Simulated):**\n"
            if self.execution_details.total_tests > 0:
                full_critique += f"  Tests: {self.execution_details.passed_tests}/{self.execution_details.total_tests} passed.\n"
            if self.execution_details.error:
                full_critique += f"  Error: {self.execution_details.error}\n"
            elif self.execution_details.output:
                full_critique += f"  Output:\n```\n{self.execution_details.output[:500]}\n```\n"
            full_critique += f"  Time: {self.execution_details.execution_time:.4f}s\n"
        return full_critique

def _parse_llm_score(llm_text_output: str) -> int:
    """Extract a 1-10 score from critique text of the form 'Score: N/10'."""
    if not llm_text_output or not isinstance(llm_text_output, str):
        return 0
    match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
    if match:
        return max(1, min(int(match.group(1)), 10))
    # No explicit score found; fall back to a mid-range random placeholder score.
    return random.randint(3, 6)
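# Example: _parse_llm_score("Clear and efficient. Score: 8/10") returns 8;
# text with no "Score:" marker falls back to a random 3-6 placeholder.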

# The (placeholder) code-execution logic lives in core.safe_executor and is imported above as execute_python_code_with_tests.

def evaluate_solution_candidate(
    solution_text: str, problem_description: str, problem_type: str,
    user_provided_tests_code: str, llm_client_config: dict
) -> EvaluationResultOutput:
    """Critique a candidate solution with an LLM and, for Python problems with
    user-provided tests, execute those tests; return an EvaluationResultOutput
    whose combined_score (1-10) blends both signals."""
    print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
    llm_critique_text = "LLM critique failed/skipped."
    llm_score = 0
    raw_llm_critique_resp = None
    execution_result_obj = None

    # 1) LLM critique (skipped when the candidate text is an error marker).
    if solution_text and not solution_text.startswith("ERROR"):
        system_p_critique = get_system_prompt("critique_general")
        user_p_critique = format_critique_user_prompt(problem_description, solution_text)
        llm_response_obj = None
        if llm_client_config["type"] == "hf":
            llm_response_obj = call_huggingface_api(
                user_p_critique, llm_client_config["model_id"],
                llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
        elif llm_client_config["type"] == "google_gemini":
            llm_response_obj = call_gemini_api(
                user_p_critique, llm_client_config["model_id"],
                llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
        if llm_response_obj:
            raw_llm_critique_resp = llm_response_obj.raw_response
            if llm_response_obj.success:
                llm_critique_text = llm_response_obj.text
                llm_score = _parse_llm_score(llm_response_obj.text)
            else:
                llm_critique_text = f"Error during LLM critique: {llm_response_obj.error}"
                llm_score = 0
    elif solution_text and solution_text.startswith("ERROR"):
        llm_critique_text = f"Solution was error: {solution_text}"
        llm_score = 0

    # 2) Optional execution of user-provided tests (Python problems only).
    if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
        execution_result_obj = execute_python_code_with_tests(solution_text, user_provided_tests_code, timeout_seconds=10)
    elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
        execution_result_obj = ExecutionResult(success=True, output="No user tests provided.", total_tests=0)

    # 3) Combine: start from the LLM score, then adjust by test outcomes.
    #    E.g. llm_score=6 with 2/4 tests passing -> int(6 * (0.5 + 0.5*0.5)) = 4.
    combined_score = llm_score
    if execution_result_obj and execution_result_obj.total_tests > 0:
        if not execution_result_obj.success or execution_result_obj.error:
            combined_score = max(1, llm_score - 5)
        else:
            pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
            if pass_ratio == 1.0:
                combined_score = min(10, llm_score + 2)
            elif pass_ratio >= 0.75:
                combined_score = min(10, llm_score + 1)
            elif pass_ratio < 0.25:
                combined_score = max(1, llm_score - 4)
            else:
                combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio))
    combined_score = max(1, min(10, combined_score))
    return EvaluationResultOutput(combined_score, llm_critique_text, execution_result_obj, raw_llm_critique_resp)


print("DEBUG: core.evaluation_engine - Module fully defined.")