# algoforge_prime/core/evaluation_engine.py
import random
import re
from typing import Optional
from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
from prompts.system_prompts import get_system_prompt
from prompts.prompt_templates import format_critique_user_prompt
from core.safe_executor import execute_python_code_with_tests, ExecutionResult, TestResult # Import new classes
print("DEBUG: core.evaluation_engine - Imports successful")
class EvaluationResultOutput:
    def __init__(self, combined_score=0, llm_critique_text="",
                 execution_details: Optional[ExecutionResult] = None, raw_llm_response=None):
        self.combined_score = combined_score
        self.llm_critique_text = llm_critique_text
        self.execution_details = execution_details
        self.raw_llm_response = raw_llm_response

    def get_display_critique(self) -> str:
        """Formats a comprehensive critique including LLM feedback and execution results."""
        critique_parts = []
        critique_parts.append(self.llm_critique_text if self.llm_critique_text else "LLM critique was not performed or failed.")
        if self.execution_details:
            exec_details = self.execution_details
            critique_parts.append("\n\n**Automated Execution & Test Results (Simulated):**")
            if exec_details.compilation_error:
                critique_parts.append(f"  Compilation Error: {exec_details.compilation_error}")
            elif exec_details.timeout_error:
                critique_parts.append(f"  Execution Timed Out after {exec_details.execution_time:.2f}s.")
            else:
                if exec_details.total_tests > 0:
                    critique_parts.append(f"  Tests Attempted: {exec_details.total_tests}")
                    critique_parts.append(f"  Tests Passed: {exec_details.passed_tests}")
                    if exec_details.passed_tests < exec_details.total_tests:
                        critique_parts.append("  Failed Tests Details:")
                        for test_res in exec_details.individual_test_results:
                            if not test_res.passed:
                                critique_parts.append(f"    - Test: `{test_res.test_string[:70]}...`")
                                if test_res.error_message:
                                    critique_parts.append(f"      Error: {test_res.error_message[:100]}...")
                else:  # Code ran, but no assert-based tests were provided/found.
                    critique_parts.append("  Code executed (no assert-based tests found/run).")
                if exec_details.stdout:
                    critique_parts.append(f"  Execution Stdout (truncated):\n```\n{exec_details.stdout[:300].strip()}\n```")
                # Show general stderr only if it is not already reported as part of a failed test.
                if exec_details.stderr and not any(not tr.passed for tr in exec_details.individual_test_results if tr.error_message):
                    critique_parts.append(f"  Execution Stderr (general):\n```\n{exec_details.stderr[:300].strip()}\n```")
            critique_parts.append(f"  Simulated Execution Time: {exec_details.execution_time:.4f}s")
        return "\n".join(critique_parts)
def _parse_llm_score(llm_text_output: str) -> int:
    """Extract an integer 1-10 score from an LLM critique.

    Looks for a "Score: X/10" marker (case-insensitive) and clamps the value
    to the 1-10 range. If no marker is found, falls back to a random
    mid-range score so the pipeline can still proceed.
    """
    score = 0
    if not llm_text_output or not isinstance(llm_text_output, str):
        return score
    match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
    if match:
        score = max(1, min(int(match.group(1)), 10))
    else:
        print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found. Output: {llm_text_output[:100]}...")
        score = random.randint(3, 6)
    return score
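
# Illustrative parses for _parse_llm_score (assumed critique snippets, not real model output):
#   _parse_llm_score("Clean and correct overall. Score: 8/10")  -> 8
#   _parse_llm_score("Excellent. Score: 23")                    -> 10 (clamped to 1-10)
#   _parse_llm_score("No score marker at all")                  -> random fallback in 3..6
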
def evaluate_solution_candidate(
    solution_text: str,
    problem_description: str,
    problem_type: str,
    user_provided_tests_code: str,
    llm_client_config: dict
) -> EvaluationResultOutput:
    """Critique a candidate solution with an LLM and, for Python problems,
    run it against any user-provided tests (simulated execution), then
    combine both signals into a single 1-10 score."""
    print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
    llm_critique_text = "LLM critique generation failed or was skipped."
    llm_score = 0
    raw_llm_critique_resp = None
    execution_result_obj: Optional[ExecutionResult] = None
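    # Assumed shape of llm_client_config (inferred from the lookups below; values illustrative):
    #   {"type": "hf" or "google_gemini", "model_id": "...", "temp": 0.7, "max_tokens": 1024}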
    # 1. LLM-based Critique
    if solution_text and not solution_text.startswith("ERROR"):
        system_p_critique = get_system_prompt("critique_general")
        user_p_critique = format_critique_user_prompt(problem_description, solution_text)
        llm_response_obj = None
        if llm_client_config["type"] == "hf":
            llm_response_obj = call_huggingface_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
        elif llm_client_config["type"] == "google_gemini":
            llm_response_obj = call_gemini_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
        if llm_response_obj:
            raw_llm_critique_resp = llm_response_obj.raw_response
            if llm_response_obj.success:
                llm_critique_text = llm_response_obj.text
                llm_score = _parse_llm_score(llm_response_obj.text)
            else:
                llm_critique_text = f"Error during LLM critique: {llm_response_obj.error}"
                llm_score = 0
    elif solution_text and solution_text.startswith("ERROR"):
        # The Genesis stage already returned an error string; record it instead of critiquing.
        llm_critique_text = f"Solution was an error from Genesis: {solution_text}"
        llm_score = 0
    # 2. Code Execution (simulated, via core.safe_executor)
    if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR"):
        if user_provided_tests_code.strip():
            print("INFO: evaluation_engine.py - Executing Python code candidate against user tests.")
            execution_result_obj = execute_python_code_with_tests(solution_text, user_provided_tests_code, timeout_seconds=10)
        else:
            # No tests provided: still run the code to catch basic compile/runtime errors.
            print("INFO: evaluation_engine.py - Executing Python code candidate (no tests provided).")
            execution_result_obj = execute_python_code_with_tests(solution_text, "", timeout_seconds=5)
        print(f"INFO: evaluation_engine.py - Execution result: {execution_result_obj}")
    # 3. Combine Scores into a Final Score
    combined_score = llm_score
    if execution_result_obj:
        if (execution_result_obj.compilation_error
                or execution_result_obj.timeout_error
                or (not execution_result_obj.success and execution_result_obj.stderr and not execution_result_obj.individual_test_results)):
            combined_score = 1  # Catastrophic failure: failed to compile, timed out, or crashed with no tests run.
        elif execution_result_obj.total_tests > 0:
            pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
            if pass_ratio == 1.0:
                combined_score = min(10, llm_score + 3)  # Strong bonus for all tests passing
            elif pass_ratio >= 0.8:
                combined_score = min(10, llm_score + 1)
            elif pass_ratio < 0.2:
                combined_score = max(1, llm_score - 6)  # Heavy penalty
            elif pass_ratio < 0.5:
                combined_score = max(1, llm_score - 4)
            else:
                combined_score = int(llm_score * (0.4 + 0.6 * pass_ratio))  # Weighted more by tests
        elif not execution_result_obj.success and execution_result_obj.error:
            # General runtime error reported without any tests having run.
            combined_score = max(1, llm_score - 4)
    combined_score = max(1, min(10, combined_score))
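    # Worked examples of the combination above (illustrative numbers, not real runs):
    #   llm_score = 6, 5/5 tests pass -> min(10, 6 + 3) = 9
    #   llm_score = 6, 4/5 tests pass -> pass_ratio 0.8 -> min(10, 6 + 1) = 7
    #   llm_score = 6, 3/5 tests pass -> pass_ratio 0.6 -> int(6 * (0.4 + 0.36)) = 4
    #   compilation error or timeout  -> combined_score forced to 1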
print(f"DEBUG: evaluation_engine.py - Evaluation complete. Combined Score: {combined_score}")
return EvaluationResultOutput(
combined_score=combined_score,
llm_critique_text=llm_critique_text,
execution_details=execution_result_obj,
raw_llm_response=raw_llm_critique_resp
)
print("DEBUG: core.evaluation_engine - Module fully defined.") |