# algoforge_prime/core/evaluation_engine.py
import random
import re

from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
from prompts.system_prompts import get_system_prompt
from prompts.prompt_templates import format_critique_user_prompt
from core.safe_executor import execute_python_code_with_tests, ExecutionResult, TestResult

print("DEBUG: core.evaluation_engine - Imports successful")

class EvaluationResultOutput:
    def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
        self.combined_score = combined_score
        self.llm_critique_text = llm_critique_text 
        self.execution_details = execution_details 
        self.raw_llm_response = raw_llm_response

    def get_display_critique(self) -> str:
        """Formats a comprehensive critique including LLM feedback and execution results."""
        critique_parts = []
        critique_parts.append(self.llm_critique_text if self.llm_critique_text else "LLM critique was not performed or failed.")

        if self.execution_details:
            exec_details = self.execution_details
            critique_parts.append("\n\n**Automated Execution & Test Results (Simulated):**")
            if exec_details.compilation_error:
                critique_parts.append(f"  Compilation Error: {exec_details.compilation_error}")
            elif exec_details.timeout_error:
                critique_parts.append(f"  Execution Timed Out after {exec_details.execution_time:.2f}s.")
            else:
                if exec_details.total_tests > 0:
                    critique_parts.append(f"  Tests Attempted: {exec_details.total_tests}")
                    critique_parts.append(f"  Tests Passed:    {exec_details.passed_tests}")
                    if exec_details.passed_tests < exec_details.total_tests:
                        critique_parts.append("  Failed Tests Details:")
                        for test_res in exec_details.individual_test_results:
                            if not test_res.passed:
                                critique_parts.append(f"    - Test: `{test_res.test_string[:70]}...`")
                                if test_res.error_message:
                                    critique_parts.append(f"      Error: {test_res.error_message[:100]}...")
                else: # Code ran, but no assert-based tests provided/found
                    critique_parts.append("  Code executed (no assert-based tests found/run).")

                if exec_details.stdout:
                    critique_parts.append(f"  Execution Stdout (truncated):\n```\n{exec_details.stdout[:300].strip()}\n```")
                if exec_details.stderr and not any(not tr.passed for tr in exec_details.individual_test_results if tr.error_message): # Show general stderr if not already part of a test fail
                    critique_parts.append(f"  Execution Stderr (general):\n```\n{exec_details.stderr[:300].strip()}\n```")
            critique_parts.append(f"  Simulated Execution Time: {exec_details.execution_time:.4f}s")
        return "\n".join(critique_parts)

def _parse_llm_score(llm_text_output: str) -> int:
    """Extracts an integer 1-10 score from an LLM critique containing a 'Score: X/10' marker."""
    if not llm_text_output or not isinstance(llm_text_output, str):
        return 0
    match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
    if match:
        return max(1, min(int(match.group(1)), 10))
    print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found. Output: {llm_text_output[:100]}...")
    return random.randint(3, 6)  # Fallback: assign a middling score when no explicit marker is present
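
# Illustrative parsing behavior (hypothetical critique strings; the regex above is the source of truth):
#   _parse_llm_score("Clean solution overall. Score: 8/10")  -> 8
#   _parse_llm_score("score: 12")                            -> 10  (clamped to the 1-10 range)
#   _parse_llm_score("No explicit rating given")             -> random fallback in [3, 6]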

def evaluate_solution_candidate(
    solution_text: str,
    problem_description: str,
    problem_type: str,
    user_provided_tests_code: str,
    llm_client_config: dict
) -> EvaluationResultOutput:
    print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
    llm_critique_text = "LLM critique generation failed or was skipped."
    llm_score = 0
    raw_llm_critique_resp = None
    execution_result_obj = None # type: ExecutionResult

    # 1. LLM-based critique of the candidate solution
    if solution_text and not solution_text.startswith("ERROR"):
        system_p_critique = get_system_prompt("critique_general")
        user_p_critique = format_critique_user_prompt(problem_description, solution_text)
        llm_response_obj = None
        if llm_client_config["type"] == "hf":
            llm_response_obj = call_huggingface_api(
                user_p_critique, llm_client_config["model_id"], llm_client_config["temp"],
                llm_client_config["max_tokens"], system_p_critique)
        elif llm_client_config["type"] == "google_gemini":
            llm_response_obj = call_gemini_api(
                user_p_critique, llm_client_config["model_id"], llm_client_config["temp"],
                llm_client_config["max_tokens"], system_p_critique)
        if llm_response_obj:
            raw_llm_critique_resp = llm_response_obj.raw_response
            if llm_response_obj.success:
                llm_critique_text = llm_response_obj.text
                llm_score = _parse_llm_score(llm_response_obj.text)
            else:
                llm_critique_text = f"Error during LLM critique: {llm_response_obj.error}"
                llm_score = 0
    elif solution_text and solution_text.startswith("ERROR"):
        llm_critique_text = f"Solution was an error from Genesis: {solution_text}"
        llm_score = 0

    # 2. Code execution (simulated) for Python candidates
    if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR"):
        if user_provided_tests_code.strip():
            print("INFO: evaluation_engine.py - Executing Python code candidate against user tests.")
            execution_result_obj = execute_python_code_with_tests(
                solution_text, user_provided_tests_code, timeout_seconds=10)
        else:
            # No tests provided: still run the code to surface basic compile/runtime errors.
            print("INFO: evaluation_engine.py - Executing Python code candidate (no tests provided).")
            execution_result_obj = execute_python_code_with_tests(solution_text, "", timeout_seconds=5)
        print(f"INFO: evaluation_engine.py - Execution result: {execution_result_obj}")
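
    # Note: the ExecutionResult consumed below is expected (judging by the attributes this module
    # reads) to expose compilation_error, timeout_error, success, stdout, stderr, total_tests,
    # passed_tests, individual_test_results and execution_time; core.safe_executor is authoritative.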

    # 3. Combine the LLM score and execution results into a final 1-10 score
    combined_score = llm_score
    if execution_result_obj:
        catastrophic_failure = (
            execution_result_obj.compilation_error
            or execution_result_obj.timeout_error
            or (not execution_result_obj.success
                and execution_result_obj.stderr
                and not execution_result_obj.individual_test_results)
        )
        if catastrophic_failure:
            combined_score = 1
        elif execution_result_obj.total_tests > 0:
            pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
            if pass_ratio == 1.0:
                combined_score = min(10, llm_score + 3)  # Strong bonus for all tests passing
            elif pass_ratio >= 0.8:
                combined_score = min(10, llm_score + 1)
            elif pass_ratio < 0.2:
                combined_score = max(1, llm_score - 6)  # Heavy penalty
            elif pass_ratio < 0.5:
                combined_score = max(1, llm_score - 4)
            else:
                combined_score = int(llm_score * (0.4 + 0.6 * pass_ratio))  # Weighted more by tests
        elif not execution_result_obj.success and execution_result_obj.error:
            # General runtime error without tests
            combined_score = max(1, llm_score - 4)

    combined_score = max(1, min(10, combined_score))
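    # Worked example of the blending above (illustrative numbers): with llm_score = 6 and 3 of 5
    # tests passing (pass_ratio = 0.6), the weighted branch gives int(6 * (0.4 + 0.6 * 0.6))
    # = int(4.56) = 4 before the final clamp to the 1-10 range.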

    print(f"DEBUG: evaluation_engine.py - Evaluation complete. Combined Score: {combined_score}")
    return EvaluationResultOutput(
        combined_score=combined_score,
        llm_critique_text=llm_critique_text,
        execution_details=execution_result_obj,
        raw_llm_response=raw_llm_critique_resp
    )

print("DEBUG: core.evaluation_engine - Module fully defined.")