mgbam committed · Commit 1cc90f6 · verified · 1 Parent(s): a00b966

Update core/evolution_engine.py

Files changed (1)
  1. core/evolution_engine.py +113 -44
core/evolution_engine.py CHANGED
@@ -1,49 +1,118 @@
- # algoforge_prime/core/evolution_engine.py
- from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse # Absolute
- from prompts.system_prompts import get_system_prompt # Absolute
- # from ..prompts.prompt_templates import format_evolution_user_prompt # If you create one
-
- def evolve_solution(
-     original_solution_text: str,
-     comprehensive_critique_text: str, # This includes LLM critique + test summary
-     original_combined_score: int,
      problem_description: str,
      problem_type: str,
-     llm_client_config: dict # {"type": ..., "model_id": ..., "temp": ..., "max_tokens": ...}
- ) -> str: # Returns evolved solution text or an error string
-     """
-     Attempts to evolve a solution based on its critique and score.
-     """
-     system_p_evolve = get_system_prompt("evolution_general") # problem_type can be used for specialization

-     user_p_evolve = (
-         f"Original Problem Context: \"{problem_description}\"\n\n"
-         f"The solution to be evolved achieved a score of {original_combined_score}/10.\n"
-         f"Here is the solution text:\n```python\n{original_solution_text}\n```\n\n"
-         f"Here is the comprehensive evaluation and critique it received (including any automated test feedback):\n'''\n{comprehensive_critique_text}\n'''\n\n"
-         f"Your Task: Based on the above, evolve the provided solution to make it demonstrably superior. "
-         f"Address any flaws, incompleteness, or inefficiencies mentioned in the critique or highlighted by test failures. "
-         f"If the solution was good, make it even better (e.g., more robust, more efficient, clearer). "
-         f"Clearly explain the key improvements you've made as an integral part of your evolved response (e.g., in comments or a concluding summary)."
-     )
-
-     llm_response_obj = None # type: LLMResponse
-     if llm_client_config["type"] == "hf":
-         llm_response_obj = call_huggingface_api(
-             user_p_evolve, llm_client_config["model_id"],
-             temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
-             system_prompt_text=system_p_evolve
-         )
-     elif llm_client_config["type"] == "google_gemini":
-         llm_response_obj = call_gemini_api(
-             user_p_evolve, llm_client_config["model_id"],
-             temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
-             system_prompt_text=system_p_evolve
          )
-     else:
-         return f"ERROR (Evolution): Unknown LLM client type '{llm_client_config['type']}'"

-     if llm_response_obj.success:
-         return llm_response_obj.text
-     else:
-         return f"ERROR (Evolution with {llm_response_obj.model_id_used}): {llm_response_obj.error}"
+ # algoforge_prime/core/evaluation_engine.py
+ import random
+ from .llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
+ from ..prompts.system_prompts import get_system_prompt
+ from ..prompts.prompt_templates import format_critique_user_prompt
+ # Import our (simulated) safe executor
+ from .safe_executor import execute_python_code_with_tests, ExecutionResult # Assuming it's in the same 'core' package
+
+ class EvaluationResultOutput: # Renamed to avoid conflict with safe_executor.ExecutionResult
+     def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
+         self.combined_score = combined_score
+         self.llm_critique_text = llm_critique_text # LLM's qualitative assessment
+         self.execution_details = execution_details # Object from safe_executor
+         self.raw_llm_response = raw_llm_response
+
+     def get_display_critique(self):
+         full_critique = self.llm_critique_text if self.llm_critique_text else "LLM critique was not performed or failed."
+         if self.execution_details:
+             full_critique += f"\n\n**Automated Execution & Test Results (Simulated):**\n"
+             if self.execution_details.total_tests > 0:
+                 full_critique += f" Tests Attempted: {self.execution_details.total_tests}\n"
+                 full_critique += f" Tests Passed: {self.execution_details.passed_tests}\n"
+             if self.execution_details.error:
+                 full_critique += f" Execution Error: {self.execution_details.error}\n"
+             elif self.execution_details.output:
+                 full_critique += f" Execution Output (stdout):\n```\n{self.execution_details.output[:500]}\n```\n" # Limit output display
+             full_critique += f" Execution Time: {self.execution_details.execution_time:.4f}s\n"
+         return full_critique
+
+
+ def _parse_llm_score(llm_text_output: str) -> int:
+     # ... (keep your existing _parse_score_from_llm_text, renamed for clarity) ...
+     score = 0
+     if not llm_text_output or not isinstance(llm_text_output, str): return score
+     try:
+         import re
+         match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
+         if match:
+             parsed_score_val = int(match.group(1))
+             score = max(1, min(parsed_score_val, 10))
+         else: score = random.randint(3, 6) # Fallback if no score marker
+     except Exception: score = random.randint(3, 5) # Fallback on any parsing error
+     return score
+
+
+ def evaluate_solution_candidate(
+     solution_text: str,
      problem_description: str,
      problem_type: str,
+     user_provided_tests_code: str,
+     llm_client_config: dict
+ ) -> EvaluationResultOutput:

+     llm_critique_text = "LLM critique generation failed or was skipped."
+     llm_score = 0
+     raw_llm_critique_resp = None
+     execution_result_obj = None # type: ExecutionResult
+
+     # 1. LLM-based Critique (only if solution_text is not an error itself)
+     if solution_text and not solution_text.startswith("ERROR"):
+         system_p_critique = get_system_prompt("critique_general")
+         user_p_critique = format_critique_user_prompt(problem_description, solution_text)
+
+         llm_response_obj = None
+         if llm_client_config["type"] == "hf":
+             llm_response_obj = call_huggingface_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
+         elif llm_client_config["type"] == "google_gemini":
+             llm_response_obj = call_gemini_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
+
+         if llm_response_obj:
+             raw_llm_critique_resp = llm_response_obj.raw_response
+             if llm_response_obj.success:
+                 llm_critique_text = llm_response_obj.text
+                 llm_score = _parse_llm_score(llm_critique_text)
+             else:
+                 llm_critique_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
+                 llm_score = 0 # Penalize
+     elif solution_text and solution_text.startswith("ERROR"):
+         llm_critique_text = f"Solution was an error from Genesis: {solution_text}"
+         llm_score = 0
+
+
+     # 2. Code Execution (if Python problem, code exists, and tests are provided)
+     if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
+         print(f"INFO: evaluation_engine.py - Preparing to execute Python code candidate against user tests.")
+         # Use the (simulated) safe executor
+         execution_result_obj = execute_python_code_with_tests(
+             solution_text, user_provided_tests_code, timeout_seconds=10 # Example timeout
          )
+         print(f"INFO: evaluation_engine.py - Execution result: {execution_result_obj}")
+     elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
+         execution_result_obj = ExecutionResult(success=True, output="No user tests provided to run against the Python code.", total_tests=0)
+
+
+     # 3. Combine Scores into a Final Score (More sophisticated heuristic)
+     combined_score = llm_score
+     if execution_result_obj and execution_result_obj.total_tests > 0:
+         if not execution_result_obj.success or execution_result_obj.error: # Major execution failure
+             combined_score = max(1, llm_score - 5) # Penalize heavily
+         else:
+             pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
+             if pass_ratio == 1.0: # All tests passed
+                 combined_score = min(10, llm_score + 2) # Significant bonus
+             elif pass_ratio >= 0.75: # Most tests passed
+                 combined_score = min(10, llm_score + 1) # Small bonus
+             elif pass_ratio < 0.25: # Very few tests passed
+                 combined_score = max(1, llm_score - 4)
+             else: # Some tests passed
+                 combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio)) # Weighted average
+
+     combined_score = max(1, min(10, combined_score)) # Clamp 1-10

+     return EvaluationResultOutput(
+         combined_score=combined_score,
+         llm_critique_text=llm_critique_text,
+         execution_details=execution_result_obj,
+         raw_llm_response=raw_llm_critique_resp
+     )
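Note: the new module imports `execute_python_code_with_tests` and `ExecutionResult` from `core/safe_executor.py`, which is not part of this commit. The snippet below is a minimal stand-in inferred from how those names are used in the diff; the field names (`passed_tests`, `total_tests`, `error`, `output`, `execution_time`) and the function signature are assumptions, not the module's confirmed API.

```python
# Hypothetical stand-in for core/safe_executor.py, inferred from the calls in this diff.
# Nothing here sandboxes or runs code; it only mirrors the interface the diff appears to expect.
from dataclasses import dataclass
from typing import Optional


@dataclass
class ExecutionResult:
    success: bool = False
    output: str = ""
    error: Optional[str] = None
    passed_tests: int = 0
    total_tests: int = 0
    execution_time: float = 0.0


def execute_python_code_with_tests(code: str, tests: str, timeout_seconds: int = 10) -> ExecutionResult:
    """Placeholder: a real implementation would run `code` against `tests` in a sandbox."""
    # Returns a simulated result so evaluate_solution_candidate can be exercised without a sandbox.
    return ExecutionResult(success=True, output="(simulated run)", passed_tests=0, total_tests=0,
                           execution_time=0.0)
```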
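For orientation, a hedged usage sketch of the new entry point. The import path matches the file touched by this commit (the header comment suggests the code may be destined for `core/evaluation_engine.py`), and the model id, problem description, and test string are placeholders.

```python
# Illustrative only: the config keys mirror the dict accessed in evaluate_solution_candidate;
# the model id and problem inputs are made-up placeholders.
from core.evolution_engine import evaluate_solution_candidate  # path per this commit

llm_client_config = {
    "type": "hf",                 # or "google_gemini"
    "model_id": "example/model",  # placeholder model id
    "temp": 0.3,
    "max_tokens": 1024,
}

result = evaluate_solution_candidate(
    solution_text="def add(a, b):\n    return a + b",
    problem_description="Write a function add(a, b) that returns the sum of two integers.",
    problem_type="python_algorithm",
    user_provided_tests_code="assert add(2, 3) == 5\nassert add(-1, 1) == 0",
    llm_client_config=llm_client_config,
)

# combined_score starts from the parsed LLM score and is adjusted by the test pass ratio,
# e.g. an LLM score of 6 with all tests passing becomes min(10, 6 + 2) = 8.
print(result.combined_score)
print(result.get_display_critique())
```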