mgbam committed
Commit cbf1fef · verified · 1 Parent(s): a0a78d2

Update core/evaluation_engine.py

Files changed (1)
  1. core/evaluation_engine.py +40 -77
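The functional change in this commit is the `safe_executor` import, which moves from an absolute to a relative form. A minimal sketch of the two variants, assuming the package layout implied by the paths in the file (`algoforge_prime/core/`, `algoforge_prime/prompts/`) and that the module is imported as `core.evaluation_engine`:

```python
# core/evaluation_engine.py (import sketch only, not the full module)

# Absolute form (removed below): only resolves when the project root,
# i.e. the directory containing core/ and prompts/, is on sys.path.
# from core.safe_executor import execute_python_code_with_tests, ExecutionResult

# Relative form (added below): resolved against this module's own package (core),
# so it does not depend on how sys.path is set up at launch time.
from .safe_executor import execute_python_code_with_tests, ExecutionResult
```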
core/evaluation_engine.py CHANGED
@@ -1,104 +1,73 @@
 # algoforge_prime/core/evaluation_engine.py
 import random
-import traceback # Keep this if used in your placeholder
+import traceback
 
-# --- Corrected Absolute Imports ---
-from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse # Absolute from project root
-from prompts.system_prompts import get_system_prompt # Absolute from project root
-from prompts.prompt_templates import format_critique_user_prompt # Absolute from project root
-from core.safe_executor import execute_python_code_with_tests, ExecutionResult # Absolute from project root
+# --- Corrected Imports ---
+from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
+from prompts.system_prompts import get_system_prompt
+from prompts.prompt_templates import format_critique_user_prompt
+from .safe_executor import execute_python_code_with_tests, ExecutionResult # CORRECTED: Relative import
 
 print("DEBUG: core.evaluation_engine - Imports successful")
 
+# ... (rest of the EvaluationResultOutput class, _parse_llm_score, _placeholder_safe_python_execution,
+# and evaluate_solution_candidate function as previously provided and corrected) ...
+# Ensure all that logic is present here. For brevity, I am not pasting it all again.
+# The key change is the import line for safe_executor above.
 
 class EvaluationResultOutput:
     def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
-        self.combined_score = combined_score
-        self.llm_critique_text = llm_critique_text
-        self.execution_details = execution_details
-        self.raw_llm_response = raw_llm_response
-
+        self.combined_score, self.llm_critique_text, self.execution_details, self.raw_llm_response = combined_score, llm_critique_text, execution_details, raw_llm_response
     def get_display_critique(self):
-        full_critique = self.llm_critique_text if self.llm_critique_text else "LLM critique was not performed or failed."
+        # ... (implementation as before)
+        full_critique = self.llm_critique_text if self.llm_critique_text else "LLM critique failed/skipped."
         if self.execution_details:
             full_critique += f"\n\n**Automated Execution & Test Results (Simulated):**\n"
-            if self.execution_details.total_tests > 0:
-                full_critique += f" Tests Attempted: {self.execution_details.total_tests}\n"
-                full_critique += f" Tests Passed: {self.execution_details.passed_tests}\n"
-            if self.execution_details.error:
-                full_critique += f" Execution Error: {self.execution_details.error}\n"
-            elif self.execution_details.output:
-                full_critique += f" Execution Output (stdout):\n```\n{self.execution_details.output[:500]}\n```\n"
-            full_critique += f" Execution Time: {self.execution_details.execution_time:.4f}s\n"
+            if self.execution_details.total_tests > 0: full_critique += f" Tests: {self.execution_details.passed_tests}/{self.execution_details.total_tests} passed.\n"
+            if self.execution_details.error: full_critique += f" Error: {self.execution_details.error}\n"
+            elif self.execution_details.output: full_critique += f" Output:\n```\n{self.execution_details.output[:500]}\n```\n"
+            full_critique += f" Time: {self.execution_details.execution_time:.4f}s\n"
         return full_critique
 
-
 def _parse_llm_score(llm_text_output: str) -> int:
-    score = 0
+    # ... (implementation as before)
+    score = 0; import re
     if not llm_text_output or not isinstance(llm_text_output, str): return score
-    try:
-        import re
-        match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
-        if match:
-            parsed_score_val = int(match.group(1))
-            score = max(1, min(parsed_score_val, 10))
-        else:
-            print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found in LLM output. Output: {llm_text_output[:100]}...")
-            score = random.randint(3, 6)
-    except Exception as e:
-        print(f"WARNING: evaluation_engine.py - Error parsing score from LLM output '{llm_text_output[:100]}...': {e}")
-        score = random.randint(3, 5)
+    match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
+    if match: score = max(1, min(int(match.group(1)), 10))
+    else: score = random.randint(3, 6)
     return score
 
+# _placeholder_safe_python_execution remains in safe_executor.py, it's imported.
 
 def evaluate_solution_candidate(
-    solution_text: str,
-    problem_description: str,
-    problem_type: str,
-    user_provided_tests_code: str,
-    llm_client_config: dict
+    solution_text: str, problem_description: str, problem_type: str,
+    user_provided_tests_code: str, llm_client_config: dict
 ) -> EvaluationResultOutput:
+    # ... (implementation as before, ensuring it calls the imported execute_python_code_with_tests) ...
     print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
-    llm_critique_text = "LLM critique generation failed or was skipped."
-    llm_score = 0
-    raw_llm_critique_resp = None
-    execution_result_obj = None # type: ExecutionResult
-
+    llm_critique_text, llm_score, raw_llm_critique_resp, execution_result_obj = "LLM critique failed/skipped.", 0, None, None
    if solution_text and not solution_text.startswith("ERROR"):
+        # ... (LLM critique call logic) ...
         system_p_critique = get_system_prompt("critique_general")
         user_p_critique = format_critique_user_prompt(problem_description, solution_text)
-
         llm_response_obj = None
-        if llm_client_config["type"] == "hf":
-            llm_response_obj = call_huggingface_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
-        elif llm_client_config["type"] == "google_gemini":
-            llm_response_obj = call_gemini_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
-
+        if llm_client_config["type"] == "hf": llm_response_obj = call_huggingface_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
+        elif llm_client_config["type"] == "google_gemini": llm_response_obj = call_gemini_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
         if llm_response_obj:
             raw_llm_critique_resp = llm_response_obj.raw_response
-            if llm_response_obj.success:
-                llm_critique_text = llm_response_obj.text
-                llm_score = _parse_llm_score(llm_critique_text)
-            else:
-                llm_critique_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
-                llm_score = 0
-    elif solution_text and solution_text.startswith("ERROR"):
-        llm_critique_text = f"Solution was an error from Genesis: {solution_text}"
-        llm_score = 0
+            if llm_response_obj.success: llm_critique_text, llm_score = llm_response_obj.text, _parse_llm_score(llm_response_obj.text)
+            else: llm_critique_text, llm_score = f"Error during LLM critique: {llm_response_obj.error}", 0
+    elif solution_text and solution_text.startswith("ERROR"): llm_critique_text, llm_score = f"Solution was error: {solution_text}", 0
 
     if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
-        print(f"INFO: evaluation_engine.py - Preparing to (simulated) execute Python code candidate.")
-        execution_result_obj = execute_python_code_with_tests(
-            solution_text, user_provided_tests_code, timeout_seconds=10
-        )
-        print(f"INFO: evaluation_engine.py - (Simulated) Execution result: {execution_result_obj}")
+        execution_result_obj = execute_python_code_with_tests(solution_text, user_provided_tests_code, timeout_seconds=10)
     elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
-        execution_result_obj = ExecutionResult(success=True, output="No user tests provided for this Python problem.", total_tests=0)
-
-    combined_score = llm_score
-    if execution_result_obj and execution_result_obj.total_tests > 0:
-        if not execution_result_obj.success or execution_result_obj.error:
-            combined_score = max(1, llm_score - 5)
+        execution_result_obj = ExecutionResult(success=True, output="No user tests provided.", total_tests=0)
+
+    combined_score = llm_score # Start with LLM score
+    if execution_result_obj and execution_result_obj.total_tests > 0: # Adjust based on tests
+        if not execution_result_obj.success or execution_result_obj.error: combined_score = max(1, llm_score - 5)
         else:
             pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
             if pass_ratio == 1.0: combined_score = min(10, llm_score + 2)
@@ -106,13 +75,7 @@ def evaluate_solution_candidate(
             elif pass_ratio < 0.25: combined_score = max(1, llm_score - 4)
             else: combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio))
     combined_score = max(1, min(10, combined_score))
+    return EvaluationResultOutput(combined_score, llm_critique_text, execution_result_obj, raw_llm_critique_resp)
 
-    print(f"DEBUG: evaluation_engine.py - Evaluation complete. Combined Score: {combined_score}")
-    return EvaluationResultOutput(
-        combined_score=combined_score,
-        llm_critique_text=llm_critique_text, # This is just the LLM's part
-        execution_details=execution_result_obj, # This contains test pass/fail and errors
-        raw_llm_response=raw_llm_critique_resp
-    )
 
  print("DEBUG: core.evaluation_engine - Module fully defined.")