mgbam committed
Commit a98e0b1 · verified · 1 Parent(s): 6aa264c

Update core/evaluation_engine.py

Files changed (1):
  1. core/evaluation_engine.py (+39 -39)
core/evaluation_engine.py CHANGED
@@ -1,16 +1,21 @@
  # algoforge_prime/core/evaluation_engine.py
  import random
- from .llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
- from ..prompts.system_prompts import get_system_prompt
- from ..prompts.prompt_templates import format_critique_user_prompt
- # Import our (simulated) safe executor
- from .safe_executor import execute_python_code_with_tests, ExecutionResult # Assuming it's in the same 'core' package
+ import traceback # Keep this if used in your placeholder
 
- class EvaluationResultOutput: # Renamed to avoid conflict with safe_executor.ExecutionResult
+ # --- Corrected Absolute Imports ---
+ from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse # Absolute from project root
+ from prompts.system_prompts import get_system_prompt # Absolute from project root
+ from prompts.prompt_templates import format_critique_user_prompt # Absolute from project root
+ from core.safe_executor import execute_python_code_with_tests, ExecutionResult # Absolute from project root
+
+ print("DEBUG: core.evaluation_engine - Imports successful")
+
+
+ class EvaluationResultOutput:
      def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
          self.combined_score = combined_score
-         self.llm_critique_text = llm_critique_text # LLM's qualitative assessment
-         self.execution_details = execution_details # Object from safe_executor
+         self.llm_critique_text = llm_critique_text
+         self.execution_details = execution_details
          self.raw_llm_response = raw_llm_response
 
      def get_display_critique(self):

@@ -23,13 +28,12 @@ class EvaluationResultOutput: # Renamed to avoid conflict with safe_executor.Exe
          if self.execution_details.error:
              full_critique += f" Execution Error: {self.execution_details.error}\n"
          elif self.execution_details.output:
-             full_critique += f" Execution Output (stdout):\n```\n{self.execution_details.output[:500]}\n```\n" # Limit output display
+             full_critique += f" Execution Output (stdout):\n```\n{self.execution_details.output[:500]}\n```\n"
          full_critique += f" Execution Time: {self.execution_details.execution_time:.4f}s\n"
          return full_critique
 
 
  def _parse_llm_score(llm_text_output: str) -> int:
-     # ... (keep your existing _parse_score_from_llm_text, renamed for clarity) ...
      score = 0
      if not llm_text_output or not isinstance(llm_text_output, str): return score
      try:

@@ -38,8 +42,12 @@ def _parse_llm_score(llm_text_output: str) -> int:
          if match:
              parsed_score_val = int(match.group(1))
              score = max(1, min(parsed_score_val, 10))
-         else: score = random.randint(3, 6) # Fallback if no score marker
-     except Exception: score = random.randint(3, 5) # Fallback on any parsing error
+         else:
+             print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found in LLM output. Output: {llm_text_output[:100]}...")
+             score = random.randint(3, 6)
+     except Exception as e:
+         print(f"WARNING: evaluation_engine.py - Error parsing score from LLM output '{llm_text_output[:100]}...': {e}")
+         score = random.randint(3, 5)
      return score
 
 
@@ -50,13 +58,12 @@ def evaluate_solution_candidate(
      user_provided_tests_code: str,
      llm_client_config: dict
  ) -> EvaluationResultOutput:
-
+     print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
      llm_critique_text = "LLM critique generation failed or was skipped."
      llm_score = 0
      raw_llm_critique_resp = None
      execution_result_obj = None # type: ExecutionResult
 
-     # 1. LLM-based Critique (only if solution_text is not an error itself)
      if solution_text and not solution_text.startswith("ERROR"):
          system_p_critique = get_system_prompt("critique_general")
          user_p_critique = format_critique_user_prompt(problem_description, solution_text)

@@ -74,45 +81,38 @@ def evaluate_solution_candidate(
              llm_score = _parse_llm_score(llm_critique_text)
          else:
              llm_critique_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
-             llm_score = 0 # Penalize
+             llm_score = 0
      elif solution_text and solution_text.startswith("ERROR"):
          llm_critique_text = f"Solution was an error from Genesis: {solution_text}"
          llm_score = 0
 
-
-     # 2. Code Execution (if Python problem, code exists, and tests are provided)
      if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
-         print(f"INFO: evaluation_engine.py - Preparing to execute Python code candidate against user tests.")
-         # Use the (simulated) safe executor
+         print(f"INFO: evaluation_engine.py - Preparing to (simulated) execute Python code candidate.")
          execution_result_obj = execute_python_code_with_tests(
-             solution_text, user_provided_tests_code, timeout_seconds=10 # Example timeout
+             solution_text, user_provided_tests_code, timeout_seconds=10
          )
-         print(f"INFO: evaluation_engine.py - Execution result: {execution_result_obj}")
+         print(f"INFO: evaluation_engine.py - (Simulated) Execution result: {execution_result_obj}")
      elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
-         execution_result_obj = ExecutionResult(success=True, output="No user tests provided to run against the Python code.", total_tests=0)
-
+         execution_result_obj = ExecutionResult(success=True, output="No user tests provided for this Python problem.", total_tests=0)
 
-     # 3. Combine Scores into a Final Score (More sophisticated heuristic)
      combined_score = llm_score
      if execution_result_obj and execution_result_obj.total_tests > 0:
-         if not execution_result_obj.success or execution_result_obj.error: # Major execution failure
-             combined_score = max(1, llm_score - 5) # Penalize heavily
+         if not execution_result_obj.success or execution_result_obj.error:
+             combined_score = max(1, llm_score - 5)
          else:
              pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
-             if pass_ratio == 1.0: # All tests passed
-                 combined_score = min(10, llm_score + 2) # Significant bonus
-             elif pass_ratio >= 0.75: # Most tests passed
-                 combined_score = min(10, llm_score + 1) # Small bonus
-             elif pass_ratio < 0.25: # Very few tests passed
-                 combined_score = max(1, llm_score - 4)
-             else: # Some tests passed
-                 combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio)) # Weighted average
-
-         combined_score = max(1, min(10, combined_score)) # Clamp 1-10
+             if pass_ratio == 1.0: combined_score = min(10, llm_score + 2)
+             elif pass_ratio >= 0.75: combined_score = min(10, llm_score + 1)
+             elif pass_ratio < 0.25: combined_score = max(1, llm_score - 4)
+             else: combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio))
+         combined_score = max(1, min(10, combined_score))
 
+     print(f"DEBUG: evaluation_engine.py - Evaluation complete. Combined Score: {combined_score}")
      return EvaluationResultOutput(
          combined_score=combined_score,
-         llm_critique_text=llm_critique_text,
-         execution_details=execution_result_obj,
+         llm_critique_text=llm_critique_text, # This is just the LLM's part
+         execution_details=execution_result_obj, # This contains test pass/fail and errors
          raw_llm_response=raw_llm_critique_resp
-     )
+     )
+
+ print("DEBUG: core.evaluation_engine - Module fully defined.")