mgbam committed on
Commit ebbb706 · verified · 1 Parent(s): e3bf7f8

Update core/evaluation_engine.py

Files changed (1)
  1. core/evaluation_engine.py +92 -43
core/evaluation_engine.py CHANGED
@@ -2,53 +2,80 @@
 import random
 import traceback
 
-# --- Corrected Imports ---
-from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
-from prompts.system_prompts import get_system_prompt
-from prompts.prompt_templates import format_critique_user_prompt
-from .safe_executor import execute_python_code_with_tests, ExecutionResult  # CORRECTED: Relative import
+from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
+from prompts.system_prompts import get_system_prompt
+from prompts.prompt_templates import format_critique_user_prompt
+from core.safe_executor import execute_python_code_with_tests, ExecutionResult, TestResult  # Import new classes
 
 print("DEBUG: core.evaluation_engine - Imports successful")
 
-# ... (rest of the EvaluationResultOutput class, _parse_llm_score, _placeholder_safe_python_execution,
-# and evaluate_solution_candidate function as previously provided and corrected) ...
-# Ensure all that logic is present here. For brevity, I am not pasting it all again.
-# The key change is the import line for safe_executor above.
-
 class EvaluationResultOutput:
     def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
-        self.combined_score, self.llm_critique_text, self.execution_details, self.raw_llm_response = combined_score, llm_critique_text, execution_details, raw_llm_response
-    def get_display_critique(self):
-        # ... (implementation as before)
-        full_critique = self.llm_critique_text if self.llm_critique_text else "LLM critique failed/skipped."
+        self.combined_score = combined_score
+        self.llm_critique_text = llm_critique_text
+        self.execution_details = execution_details
+        self.raw_llm_response = raw_llm_response
+
+    def get_display_critique(self) -> str:
+        """Formats a comprehensive critique including LLM feedback and execution results."""
+        critique_parts = []
+        critique_parts.append(self.llm_critique_text if self.llm_critique_text else "LLM critique was not performed or failed.")
+
         if self.execution_details:
-            full_critique += f"\n\n**Automated Execution & Test Results (Simulated):**\n"
-            if self.execution_details.total_tests > 0: full_critique += f"  Tests: {self.execution_details.passed_tests}/{self.execution_details.total_tests} passed.\n"
-            if self.execution_details.error: full_critique += f"  Error: {self.execution_details.error}\n"
-            elif self.execution_details.output: full_critique += f"  Output:\n```\n{self.execution_details.output[:500]}\n```\n"
-            full_critique += f"  Time: {self.execution_details.execution_time:.4f}s\n"
-        return full_critique
+            exec_details = self.execution_details
+            critique_parts.append("\n\n**Automated Execution & Test Results (Simulated):**")
+            if exec_details.compilation_error:
+                critique_parts.append(f"  Compilation Error: {exec_details.compilation_error}")
+            elif exec_details.timeout_error:
+                critique_parts.append(f"  Execution Timed Out after {exec_details.execution_time:.2f}s.")
+            else:
+                if exec_details.total_tests > 0:
+                    critique_parts.append(f"  Tests Attempted: {exec_details.total_tests}")
+                    critique_parts.append(f"  Tests Passed: {exec_details.passed_tests}")
+                    if exec_details.passed_tests < exec_details.total_tests:
+                        critique_parts.append("  Failed Tests Details:")
+                        for test_res in exec_details.individual_test_results:
+                            if not test_res.passed:
+                                critique_parts.append(f"    - Test: `{test_res.test_string[:70]}...`")
+                                if test_res.error_message:
+                                    critique_parts.append(f"      Error: {test_res.error_message[:100]}...")
+                else:  # Code ran, but no assert-based tests provided/found
+                    critique_parts.append("  Code executed (no assert-based tests found/run).")
+
+            if exec_details.stdout:
+                critique_parts.append(f"  Execution Stdout (truncated):\n```\n{exec_details.stdout[:300].strip()}\n```")
+            if exec_details.stderr and not any(not tr.passed for tr in exec_details.individual_test_results if tr.error_message):  # Show general stderr if not already part of a test failure
+                critique_parts.append(f"  Execution Stderr (general):\n```\n{exec_details.stderr[:300].strip()}\n```")
+            critique_parts.append(f"  Simulated Execution Time: {exec_details.execution_time:.4f}s")
+        return "\n".join(critique_parts)
 
 def _parse_llm_score(llm_text_output: str) -> int:
-    # ... (implementation as before)
+    # ... (same as your last working version)
     score = 0; import re
     if not llm_text_output or not isinstance(llm_text_output, str): return score
     match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
     if match: score = max(1, min(int(match.group(1)), 10))
-    else: score = random.randint(3, 6)
+    else:
+        print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found. Output: {llm_text_output[:100]}...")
+        score = random.randint(3, 6)
     return score
 
-# _placeholder_safe_python_execution remains in safe_executor.py, it's imported.
-
 def evaluate_solution_candidate(
-    solution_text: str, problem_description: str, problem_type: str,
-    user_provided_tests_code: str, llm_client_config: dict
+    solution_text: str,
+    problem_description: str,
+    problem_type: str,
+    user_provided_tests_code: str,
+    llm_client_config: dict
 ) -> EvaluationResultOutput:
-    # ... (implementation as before, ensuring it calls the imported execute_python_code_with_tests) ...
     print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
-    llm_critique_text, llm_score, raw_llm_critique_resp, execution_result_obj = "LLM critique failed/skipped.", 0, None, None
+    llm_critique_text = "LLM critique generation failed or was skipped."
+    llm_score = 0
+    raw_llm_critique_resp = None
+    execution_result_obj = None  # type: ExecutionResult
+
+    # 1. LLM-based Critique
    if solution_text and not solution_text.startswith("ERROR"):
-        # ... (LLM critique call logic) ...
+        # ... (LLM critique call logic - same as before) ...
         system_p_critique = get_system_prompt("critique_general")
         user_p_critique = format_critique_user_prompt(problem_description, solution_text)
         llm_response_obj = None
@@ -58,24 +85,46 @@ def evaluate_solution_candidate(
             raw_llm_critique_resp = llm_response_obj.raw_response
             if llm_response_obj.success: llm_critique_text, llm_score = llm_response_obj.text, _parse_llm_score(llm_response_obj.text)
             else: llm_critique_text, llm_score = f"Error during LLM critique: {llm_response_obj.error}", 0
-    elif solution_text and solution_text.startswith("ERROR"): llm_critique_text, llm_score = f"Solution was error: {solution_text}", 0
+    elif solution_text and solution_text.startswith("ERROR"):
+        llm_critique_text, llm_score = f"Solution was error from Genesis: {solution_text}", 0
 
-    if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
-        execution_result_obj = execute_python_code_with_tests(solution_text, user_provided_tests_code, timeout_seconds=10)
-    elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
-        execution_result_obj = ExecutionResult(success=True, output="No user tests provided.", total_tests=0)
-
-    combined_score = llm_score  # Start with LLM score
-    if execution_result_obj and execution_result_obj.total_tests > 0:  # Adjust based on tests
-        if not execution_result_obj.success or execution_result_obj.error: combined_score = max(1, llm_score - 5)
+    # 2. Code Execution
+    if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR"):
+        if user_provided_tests_code.strip():
+            print(f"INFO: evaluation_engine.py - Executing Python code candidate against user tests.")
+            execution_result_obj = execute_python_code_with_tests(solution_text, user_provided_tests_code, timeout_seconds=10)
         else:
+            print(f"INFO: evaluation_engine.py - Executing Python code candidate (no tests provided).")
+            execution_result_obj = execute_python_code_with_tests(solution_text, "", timeout_seconds=5)  # Execute code even if no tests
+        print(f"INFO: evaluation_engine.py - Execution result: {execution_result_obj}")
+    elif "python" in problem_type.lower() and not user_provided_tests_code.strip() and solution_text and not solution_text.startswith("ERROR"):
+        # Case where it's python but no tests - still might want to run to catch basic runtime/compile errors
+        execution_result_obj = execute_python_code_with_tests(solution_text, "", timeout_seconds=5)
+
+
+    # 3. Combine Scores into a Final Score
+    combined_score = llm_score
+    if execution_result_obj:
+        if execution_result_obj.compilation_error or execution_result_obj.timeout_error or (not execution_result_obj.success and execution_result_obj.stderr and not execution_result_obj.individual_test_results):
+            combined_score = 1  # Catastrophic failure
+        elif execution_result_obj.total_tests > 0:
             pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
-            if pass_ratio == 1.0: combined_score = min(10, llm_score + 2)
-            elif pass_ratio >= 0.75: combined_score = min(10, llm_score + 1)
-            elif pass_ratio < 0.25: combined_score = max(1, llm_score - 4)
-            else: combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio))
+            if pass_ratio == 1.0: combined_score = min(10, llm_score + 3)  # Strong bonus for all tests passing
+            elif pass_ratio >= 0.8: combined_score = min(10, llm_score + 1)
+            elif pass_ratio < 0.2: combined_score = max(1, llm_score - 6)  # Heavy penalty
+            elif pass_ratio < 0.5: combined_score = max(1, llm_score - 4)
+            else: combined_score = int(llm_score * (0.4 + 0.6 * pass_ratio))  # Weighted more by tests
+        elif not execution_result_obj.success and execution_result_obj.error:  # General runtime error without tests
+            combined_score = max(1, llm_score - 4)
+
     combined_score = max(1, min(10, combined_score))
-    return EvaluationResultOutput(combined_score, llm_critique_text, execution_result_obj, raw_llm_critique_resp)
 
+    print(f"DEBUG: evaluation_engine.py - Evaluation complete. Combined Score: {combined_score}")
+    return EvaluationResultOutput(
+        combined_score=combined_score,
+        llm_critique_text=llm_critique_text,
+        execution_details=execution_result_obj,
+        raw_llm_response=raw_llm_critique_resp
+    )
 
 print("DEBUG: core.evaluation_engine - Module fully defined.")