# algoforge_prime/core/evaluation_engine.py
import random
import re
# IMPORTANT: The following import is for a HYPOTHETICAL safe executor.
# You would need to implement or find a robust sandboxing solution.
# from .restricted_env_executor import execute_python_code_safely # Example

from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse # Absolute
from prompts.system_prompts import get_system_prompt # Absolute
from prompts.prompt_templates import format_critique_user_prompt # Absolute

class EvaluationResult:
    def __init__(self, score=0, critique_text="", passed_tests=0, total_tests=0, execution_summary=None, raw_llm_critique_response=None):
        self.score = score  # Final combined score
        self.critique_text = critique_text # LLM based critique + execution summary
        self.passed_tests = passed_tests
        self.total_tests = total_tests
        self.execution_summary = execution_summary # Error or success message from code execution
        self.raw_llm_critique_response = raw_llm_critique_response

    def __str__(self): # For simple string representation if needed
        return f"Score: {self.score}/10. Tests: {self.passed_tests}/{self.total_tests}. Summary: {self.execution_summary}. Critique: {self.critique_text[:100]}..."

def _parse_score_from_llm_text(llm_text_output: str) -> int:
    """Helper to parse 'Score: X/10' from LLM's textual output."""
    score = 0 # Default if not found or unparsable
    if not llm_text_output or not isinstance(llm_text_output, str):
        return score

    try:
        # Look for "Score: X/10" or "Score: X"
        # More robust parsing might be needed depending on LLM variability
        match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
        if match:
            parsed_score_val = int(match.group(1))
            score = max(1, min(parsed_score_val, 10)) # Clamp score to 1-10
        else: # Fallback if specific format not found
            print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found in LLM output. Assigning fallback score. Output: {llm_text_output[:100]}...")
            score = random.randint(3, 6) # Assign a mediocre random score
    except Exception as e:
        print(f"WARNING: evaluation_engine.py - Error parsing score from LLM output '{llm_text_output[:100]}...': {e}")
        score = random.randint(3, 5) # Fallback on parsing error
    return score
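
# Illustrative behaviour of _parse_score_from_llm_text (hypothetical inputs, not captured outputs):
#   "Looks correct overall. Score: 8/10"  -> 8
#   "Score: 15"                           -> 10 (clamped to the 1-10 range)
#   "The code looks fine to me."          -> random fallback score between 3 and 6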

def _placeholder_safe_python_execution(code_string: str, user_tests_string: str) -> tuple[int, int, str]:
    """
    PLACEHOLDER for safe Python code execution.
    **WARNING: THIS IS NOT SAFE FOR PRODUCTION. IT ONLY SIMULATES.**
    Replace with a robust sandboxing mechanism (Docker, nsjail, WASM, etc.).
    """
    print("DEV_INFO: evaluation_engine.py - Entering PLACEHOLDER for code execution.")
    print(f"  Code (first 100 chars): {code_string[:100]}...")
    print(f"  Tests (first 100 chars): {user_tests_string[:100]}...")

    if not user_tests_string.strip() or not code_string.strip():
        return 0, 0, "SIMULATED: No tests provided or no code to test."

    # Naive parsing of assert statements
    test_lines = [line.strip() for line in user_tests_string.splitlines() if line.strip().startswith("assert")]
    total_tests_found = len(test_lines)

    if total_tests_found == 0:
        return 0, 0, "SIMULATED: No 'assert' statements found in user tests."

    # Extremely simplistic simulation logic (NOT REAL EXECUTION)
    passed_count = 0
    execution_log = ["SIMULATED EXECUTION LOG:"]
    try:
        # This is where real sandboxed execution would happen.
        # We'll simulate based on keywords for demonstration.
        if "syntax error" in code_string.lower() or "indentationerror" in code_string.lower():
            execution_log.append("  - Simulated: Potential syntax error in generated code.")
            # passed_count remains 0
        elif "runtime error" in code_string.lower() or "exception" in code_string.lower():
            execution_log.append("  - Simulated: Code might raise a runtime error.")
            passed_count = random.randint(0, total_tests_found // 3) # Few pass
        elif "return" not in code_string and any("==" in t for t in test_lines): # If expecting a return value
            execution_log.append("  - Simulated: Code might be missing a crucial 'return' statement.")
            passed_count = random.randint(0, total_tests_found // 2)
        else: # Simulate some passing, some failing
            passed_count = random.randint(total_tests_found // 2, total_tests_found)
            execution_log.append(f"  - Simulated: {passed_count} of {total_tests_found} tests likely passed.")
        
        if passed_count < total_tests_found:
            execution_log.append(f"  - Simulated: {total_tests_found - passed_count} test(s) likely failed.")
        
        summary = f"Simulated: {passed_count}/{total_tests_found} tests passed."
        if passed_count < total_tests_found:
            summary += " Some tests likely failed."

    except Exception as e_sim: # Error in our simulation logic
        summary = f"Error during test SIMULATION logic: {str(e_sim)}"
        passed_count = 0
        execution_log.append(f"  - ERROR in simulation: {e_sim}")
    
    print(f"DEV_INFO: evaluation_engine.py - Placeholder execution result: {summary}")
    return passed_count, total_tests_found, "\n".join(execution_log)
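

# --- Illustrative sketch of a subprocess-based test runner ------------------
# This is NOT part of the original engine and is NOT a real sandbox: a plain
# subprocess gives untrusted code full filesystem/network access. It only
# sketches the interface a real executor (Docker, nsjail, WASM, etc.) could
# expose in place of _placeholder_safe_python_execution. The function name,
# default timeout, and PASS/FAIL marker protocol are assumptions for this sketch.
def _subprocess_python_execution_sketch(code_string: str, user_tests_string: str,
                                        timeout_seconds: int = 5) -> tuple[int, int, str]:
    # Local imports keep this optional sketch self-contained.
    import os
    import subprocess
    import sys
    import tempfile

    test_lines = [line.strip() for line in user_tests_string.splitlines()
                  if line.strip().startswith("assert")]
    total = len(test_lines)
    if total == 0 or not code_string.strip():
        return 0, 0, "No code or no 'assert' statements to run."

    # Harness: run the candidate code, then each assert, printing one
    # PASS/FAIL marker per test (naive protocol; candidate output could spoof it).
    harness_tests = "\n".join(
        f"try:\n    {t}\n    print('PASS {i}')\nexcept Exception as e:\n    print('FAIL {i}:', e)"
        for i, t in enumerate(test_lines)
    )
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as tmp:
        tmp.write(code_string + "\n\n" + harness_tests)
        tmp_path = tmp.name
    try:
        proc = subprocess.run([sys.executable, "-I", tmp_path],
                              capture_output=True, text=True, timeout=timeout_seconds)
        passed = sum(1 for line in proc.stdout.splitlines() if line.startswith("PASS"))
        summary = f"{passed}/{total} tests passed (exit code {proc.returncode})."
        if proc.stderr.strip():
            summary += f" Stderr: {proc.stderr.strip()[:200]}"
        return passed, total, summary
    except subprocess.TimeoutExpired:
        return 0, total, f"Execution timed out after {timeout_seconds}s."
    finally:
        os.unlink(tmp_path)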


def evaluate_solution_candidate(
    solution_text: str,
    problem_description: str,
    problem_type: str,
    user_provided_tests: str, # String of Python assert statements
    llm_client_config: dict # {"type": ..., "model_id": ..., "temp": ..., "max_tokens": ...}
) -> EvaluationResult:
    """
    Evaluates a single solution candidate.
    """
    llm_critique_output_text = "LLM critique could not be performed due to an earlier error or API issue."
    llm_based_score = 0
    raw_llm_critique_resp = None

    # 1. LLM-based Critique (if solution_text is not an error message itself)
    if solution_text and not solution_text.startswith("ERROR"):
        system_p_critique = get_system_prompt("critique_general") # problem_type can be used here too
        user_p_critique = format_critique_user_prompt(problem_description, solution_text)
        
        llm_response_obj = None  # will hold an LLMResponse if a supported client type matches
        if llm_client_config["type"] == "hf":
            llm_response_obj = call_huggingface_api(
                user_p_critique, llm_client_config["model_id"],
                temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
                system_prompt_text=system_p_critique
            )
        elif llm_client_config["type"] == "google_gemini":
            llm_response_obj = call_gemini_api(
                user_p_critique, llm_client_config["model_id"],
                temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
                system_prompt_text=system_p_critique
            )
        else:
            llm_critique_output_text = f"LLM critique skipped: unsupported client type '{llm_client_config.get('type')}'."
        
        if llm_response_obj:
            raw_llm_critique_resp = llm_response_obj.raw_response
            if llm_response_obj.success:
                llm_critique_output_text = llm_response_obj.text
                llm_based_score = _parse_score_from_llm_text(llm_critique_output_text)
            else:
                llm_critique_output_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
                llm_based_score = 0 # Penalize for critique failure
    elif solution_text and solution_text.startswith("ERROR"):
        llm_critique_output_text = f"Solution was an error from Genesis: {solution_text}"
        llm_based_score = 0


    # 2. (Simulated) Code Execution if applicable
    passed_tests_count = 0
    total_tests_count = 0
    exec_summary_msg = "Automated tests not applicable or not run for this problem type/solution."

    # Only run tests if it's a Python problem, tests are provided, and solution isn't an error
    if "python" in problem_type.lower() and user_provided_tests.strip() and solution_text and not solution_text.startswith("ERROR"):
        # **IMPORTANT**: Replace with a REAL sandboxed executor for safety.
        passed_tests_count, total_tests_count, exec_summary_msg = _placeholder_safe_python_execution(
            solution_text, user_provided_tests
        )
    elif "python" in problem_type.lower() and not user_provided_tests.strip():
        exec_summary_msg = "No user tests provided for this Python problem."


    # 3. Combine Scores into a Final Score (Example Heuristic)
    final_score_calculated = llm_based_score
    if total_tests_count > 0: # If tests were run
        test_pass_ratio = passed_tests_count / total_tests_count
        if test_pass_ratio < 0.5: # Penalize heavily if less than half of the tests pass
            final_score_calculated = max(1, int(llm_based_score * 0.5) - 1)
        elif test_pass_ratio == 1.0 and passed_tests_count > 0: # All tests passed
            final_score_calculated = min(10, llm_based_score + 1) # Small bonus
        else: # Some tests passed or ratio between 0.5 and 1.0
            final_score_calculated = int(llm_based_score * (0.6 + 0.4 * test_pass_ratio))
    final_score_calculated = max(1, min(10, final_score_calculated)) # Ensure score is 1-10
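    # Worked example of the heuristic above (illustrative): llm_based_score=7 with 3/4
    # tests passing gives int(7 * (0.6 + 0.4 * 0.75)) = 6; with 4/4 passing, 7 becomes 8.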

    # Construct comprehensive critique text for display
    comprehensive_critique = llm_critique_output_text
    if total_tests_count > 0 or ("python" in problem_type.lower() and user_provided_tests.strip()): # Add test summary if applicable
        comprehensive_critique += f"\n\n**Automated Test Summary (Simulated):**\n{exec_summary_msg}\n"
        comprehensive_critique += f"Passed: {passed_tests_count}/{total_tests_count}"


    return EvaluationResult(
        score=final_score_calculated,
        critique_text=comprehensive_critique,
        passed_tests=passed_tests_count,
        total_tests=total_tests_count,
        execution_summary=exec_summary_msg,
        raw_llm_critique_response=raw_llm_critique_resp
    )
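

# Illustrative call (hypothetical values; the config keys mirror the llm_client_config
# contract described in evaluate_solution_candidate's signature):
#
#   result = evaluate_solution_candidate(
#       solution_text="def add(a, b):\n    return a + b",
#       problem_description="Write add(a, b) that returns the sum of two numbers.",
#       problem_type="python",
#       user_provided_tests="assert add(1, 2) == 3\nassert add(-1, 1) == 0",
#       llm_client_config={"type": "hf", "model_id": "some-hf-model-id",  # placeholder id
#                          "temp": 0.4, "max_tokens": 512},
#   )
#   print(result)  # uses EvaluationResult.__str__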