Spaces:

mgbam
/

StoryVerseWeaver

Sleeping

App Files Community

mgbam committed on May 17

Commit

1cc90f6

verified ·

1 Parent(s): a00b966

Update core/evolution_engine.py

Browse files

Files changed (1) hide show

core/evolution_engine.py +113 -44

core/evolution_engine.py CHANGED Viewed

@@ -1,49 +1,118 @@
-# algoforge_prime/core/evolution_engine.py
-from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse # Absolute
-from prompts.system_prompts import get_system_prompt # Absolute
-# from ..prompts.prompt_templates import format_evolution_user_prompt # If you create one
-def evolve_solution(
-    original_solution_text: str,
-    comprehensive_critique_text: str, # This includes LLM critique + test summary
-    original_combined_score: int,
     problem_description: str,
     problem_type: str,
-    llm_client_config: dict # {"type": ..., "model_id": ..., "temp": ..., "max_tokens": ...}
-) -> str: # Returns evolved solution text or an error string
-    """
-    Attempts to evolve a solution based on its critique and score.
-    """
-    system_p_evolve = get_system_prompt("evolution_general") # problem_type can be used for specialization
-    user_p_evolve = (
-        f"Original Problem Context: \"{problem_description}\"\n\n"
-        f"The solution to be evolved achieved a score of {original_combined_score}/10.\n"
-        f"Here is the solution text:\n```python\n{original_solution_text}\n```\n\n"
-        f"Here is the comprehensive evaluation and critique it received (including any automated test feedback):\n'''\n{comprehensive_critique_text}\n'''\n\n"
-        f"Your Task: Based on the above, evolve the provided solution to make it demonstrably superior. "
-        f"Address any flaws, incompleteness, or inefficiencies mentioned in the critique or highlighted by test failures. "
-        f"If the solution was good, make it even better (e.g., more robust, more efficient, clearer). "
-        f"Clearly explain the key improvements you've made as an integral part of your evolved response (e.g., in comments or a concluding summary)."
-    )
-    llm_response_obj = None # type: LLMResponse
-    if llm_client_config["type"] == "hf":
-        llm_response_obj = call_huggingface_api(
-            user_p_evolve, llm_client_config["model_id"],
-            temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
-            system_prompt_text=system_p_evolve
-        )
-    elif llm_client_config["type"] == "google_gemini":
-        llm_response_obj = call_gemini_api(
-            user_p_evolve, llm_client_config["model_id"],
-            temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
-            system_prompt_text=system_p_evolve
         )
-    else:
-        return f"ERROR (Evolution): Unknown LLM client type '{llm_client_config['type']}'"
-    if llm_response_obj.success:
-        return llm_response_obj.text
-    else:
-        return f"ERROR (Evolution with {llm_response_obj.model_id_used}): {llm_response_obj.error}"

+# algoforge_prime/core/evaluation_engine.py
+import random
+from .llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
+from ..prompts.system_prompts import get_system_prompt
+from ..prompts.prompt_templates import format_critique_user_prompt
+# Import our (simulated) safe executor
+from .safe_executor import execute_python_code_with_tests, ExecutionResult # Assuming it's in the same 'core' package
+class EvaluationResultOutput: # Renamed to avoid conflict with safe_executor.ExecutionResult
+    def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
+        self.combined_score = combined_score
+        self.llm_critique_text = llm_critique_text # LLM's qualitative assessment
+        self.execution_details = execution_details # Object from safe_executor
+        self.raw_llm_response = raw_llm_response
+    def get_display_critique(self):
+        full_critique = self.llm_critique_text if self.llm_critique_text else "LLM critique was not performed or failed."
+        if self.execution_details:
+            full_critique += f"\n\n**Automated Execution & Test Results (Simulated):**\n"
+            if self.execution_details.total_tests > 0:
+                full_critique += f"  Tests Attempted: {self.execution_details.total_tests}\n"
+                full_critique += f"  Tests Passed:    {self.execution_details.passed_tests}\n"
+            if self.execution_details.error:
+                full_critique += f"  Execution Error: {self.execution_details.error}\n"
+            elif self.execution_details.output:
+                 full_critique += f"  Execution Output (stdout):\n```\n{self.execution_details.output[:500]}\n```\n" # Limit output display
+            full_critique += f"  Execution Time: {self.execution_details.execution_time:.4f}s\n"
+        return full_critique
+def _parse_llm_score(llm_text_output: str) -> int:
+    # ... (keep your existing _parse_score_from_llm_text, renamed for clarity) ...
+    score = 0
+    if not llm_text_output or not isinstance(llm_text_output, str): return score
+    try:
+        import re
+        match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
+        if match:
+            parsed_score_val = int(match.group(1))
+            score = max(1, min(parsed_score_val, 10))
+        else: score = random.randint(3, 6) # Fallback if no score marker
+    except Exception: score = random.randint(3, 5) # Fallback on any parsing error
+    return score
+def evaluate_solution_candidate(
+    solution_text: str,
     problem_description: str,
     problem_type: str,
+    user_provided_tests_code: str,
+    llm_client_config: dict
+) -> EvaluationResultOutput:
+    llm_critique_text = "LLM critique generation failed or was skipped."
+    llm_score = 0
+    raw_llm_critique_resp = None
+    execution_result_obj = None # type: ExecutionResult
+    # 1. LLM-based Critique (only if solution_text is not an error itself)
+    if solution_text and not solution_text.startswith("ERROR"):
+        system_p_critique = get_system_prompt("critique_general")
+        user_p_critique = format_critique_user_prompt(problem_description, solution_text)
+        llm_response_obj = None
+        if llm_client_config["type"] == "hf":
+            llm_response_obj = call_huggingface_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
+        elif llm_client_config["type"] == "google_gemini":
+            llm_response_obj = call_gemini_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
+        if llm_response_obj:
+            raw_llm_critique_resp = llm_response_obj.raw_response
+            if llm_response_obj.success:
+                llm_critique_text = llm_response_obj.text
+                llm_score = _parse_llm_score(llm_critique_text)
+            else:
+                llm_critique_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
+                llm_score = 0 # Penalize
+    elif solution_text and solution_text.startswith("ERROR"):
+        llm_critique_text = f"Solution was an error from Genesis: {solution_text}"
+        llm_score = 0
+    # 2. Code Execution (if Python problem, code exists, and tests are provided)
+    if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
+        print(f"INFO: evaluation_engine.py - Preparing to execute Python code candidate against user tests.")
+        # Use the (simulated) safe executor
+        execution_result_obj = execute_python_code_with_tests(
+            solution_text, user_provided_tests_code, timeout_seconds=10 # Example timeout
         )
+        print(f"INFO: evaluation_engine.py - Execution result: {execution_result_obj}")
+    elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
+         execution_result_obj = ExecutionResult(success=True, output="No user tests provided to run against the Python code.", total_tests=0)
+    # 3. Combine Scores into a Final Score (More sophisticated heuristic)
+    combined_score = llm_score
+    if execution_result_obj and execution_result_obj.total_tests > 0:
+        if not execution_result_obj.success or execution_result_obj.error: # Major execution failure
+            combined_score = max(1, llm_score - 5) # Penalize heavily
+        else:
+            pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
+            if pass_ratio == 1.0: # All tests passed
+                combined_score = min(10, llm_score + 2) # Significant bonus
+            elif pass_ratio >= 0.75: # Most tests passed
+                combined_score = min(10, llm_score + 1) # Small bonus
+            elif pass_ratio < 0.25: # Very few tests passed
+                combined_score = max(1, llm_score - 4)
+            else: # Some tests passed
+                combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio)) # Weighted average
+    combined_score = max(1, min(10, combined_score)) # Clamp 1-10
+    return EvaluationResultOutput(
+        combined_score=combined_score,
+        llm_critique_text=llm_critique_text,
+        execution_details=execution_result_obj,
+        raw_llm_response=raw_llm_critique_resp
+    )