Spaces:

mgbam
/

StoryVerseWeaver

Running

App Files Files Community

mgbam committed 23 days ago

Commit

cbf1fef

verified ·

1 Parent(s): a0a78d2

Update core/evaluation_engine.py

Browse files

Files changed (1) hide show

core/evaluation_engine.py +40 -77

core/evaluation_engine.py CHANGED Viewed

@@ -1,104 +1,73 @@
 # algoforge_prime/core/evaluation_engine.py
 import random
-import traceback # Keep this if used in your placeholder
-# --- Corrected Absolute Imports ---
-from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse # Absolute from project root
-from prompts.system_prompts import get_system_prompt # Absolute from project root
-from prompts.prompt_templates import format_critique_user_prompt # Absolute from project root
-from core.safe_executor import execute_python_code_with_tests, ExecutionResult # Absolute from project root
 print("DEBUG: core.evaluation_engine - Imports successful")
 class EvaluationResultOutput:
     """Hold the outcome of evaluating one solution candidate.
     The attributes mirror the constructor arguments: ``combined_score``,
     ``llm_critique_text``, ``execution_details`` and ``raw_llm_response``.
     """
     def __init__(self, combined_score=0, llm_critique_text="", execution_details: "ExecutionResult | None" = None, raw_llm_response=None):
         # The annotation is written as a string so it is not evaluated when the
         # class is defined, and it states explicitly that None is accepted.
         self.combined_score = combined_score
         self.llm_critique_text = llm_critique_text
         self.execution_details = execution_details
         self.raw_llm_response = raw_llm_response
     def get_display_critique(self):
         """Return the LLM critique followed by a summary of the execution details.
         When no critique text is stored a fixed fallback sentence is used. The
         execution section is appended only when ``execution_details`` is truthy.
         """
         full_critique = self.llm_critique_text if self.llm_critique_text else "LLM critique was not performed or failed."
         details = self.execution_details
         if details:
             full_critique += "\n\n**Automated Execution & Test Results (Simulated):**\n"
             if details.total_tests > 0:
                 full_critique += f"  Tests Attempted: {details.total_tests}\n"
                 full_critique += f"  Tests Passed:    {details.passed_tests}\n"
             if details.error:
                 full_critique += f"  Execution Error: {details.error}\n"
             elif details.output:
                 # Only the first 500 characters of captured output are shown.
                 full_critique += f"  Execution Output (stdout):\n```\n{details.output[:500]}\n```\n"
             # NOTE(review): results built without a timing may carry no usable
             # execution_time (confirm ExecutionResult's default); skipping the
             # line avoids a TypeError from applying ':.4f' to None.
             elapsed = getattr(details, "execution_time", None)
             if elapsed is not None:
                 full_critique += f"  Execution Time: {elapsed:.4f}s\n"
         return full_critique
 def _parse_llm_score(llm_text_output: str) -> int:
-    score = 0
     if not llm_text_output or not isinstance(llm_text_output, str): return score
-    try:
-        import re
-        match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
-        if match:
-            parsed_score_val = int(match.group(1))
-            score = max(1, min(parsed_score_val, 10))
-        else:
-            print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found in LLM output. Output: {llm_text_output[:100]}...")
-            score = random.randint(3, 6)
-    except Exception as e:
-        print(f"WARNING: evaluation_engine.py - Error parsing score from LLM output '{llm_text_output[:100]}...': {e}")
-        score = random.randint(3, 5)
     return score
 def evaluate_solution_candidate(
-    solution_text: str,
-    problem_description: str,
-    problem_type: str,
-    user_provided_tests_code: str,
-    llm_client_config: dict
 ) -> EvaluationResultOutput:
     # Evaluate one candidate solution and return an EvaluationResultOutput.
     # Steps visible in this block:
     #   1. When solution_text is non-empty and does not start with "ERROR", request an
     #      LLM critique via call_huggingface_api (type "hf") or call_gemini_api (type
     #      "google_gemini"); any other type leaves the default critique text and score 0.
     #   2. For problem types containing "python", run the user's tests against the
     #      solution (timeout_seconds=10), or record a zero-test placeholder result
     #      when the tests string is blank.
     #   3. Start from the LLM score, adjust it by the test outcome, clamp to 1..10.
     # llm_client_config is read for the keys "type", "model_id", "temp" and "max_tokens".
     print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
-    llm_critique_text = "LLM critique generation failed or was skipped."
-    llm_score = 0
-    raw_llm_critique_resp = None
-    execution_result_obj = None # type: ExecutionResult
     if solution_text and not solution_text.startswith("ERROR"):
         system_p_critique = get_system_prompt("critique_general")
         user_p_critique = format_critique_user_prompt(problem_description, solution_text)
         llm_response_obj = None
-        if llm_client_config["type"] == "hf":
-            llm_response_obj = call_huggingface_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
-        elif llm_client_config["type"] == "google_gemini":
-            llm_response_obj = call_gemini_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
         if llm_response_obj:
             raw_llm_critique_resp = llm_response_obj.raw_response
-            if llm_response_obj.success:
-                llm_critique_text = llm_response_obj.text
-                llm_score = _parse_llm_score(llm_critique_text)
-            else:
-                llm_critique_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
-                llm_score = 0
-    elif solution_text and solution_text.startswith("ERROR"):
-        llm_critique_text = f"Solution was an error from Genesis: {solution_text}"
-        llm_score = 0
     # NOTE(review): both branches below call .lower() / .strip() directly, so a None
     # problem_type or user_provided_tests_code would raise AttributeError here.
     if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
-        print(f"INFO: evaluation_engine.py - Preparing to (simulated) execute Python code candidate.")
-        execution_result_obj = execute_python_code_with_tests(
-            solution_text, user_provided_tests_code, timeout_seconds=10
-        )
-        print(f"INFO: evaluation_engine.py - (Simulated) Execution result: {execution_result_obj}")
     elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
-         execution_result_obj = ExecutionResult(success=True, output="No user tests provided for this Python problem.", total_tests=0)
-    combined_score = llm_score
-    if execution_result_obj and execution_result_obj.total_tests > 0:
-        if not execution_result_obj.success or execution_result_obj.error:
-            combined_score = max(1, llm_score - 5)
         else:
             pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
             if pass_ratio == 1.0: combined_score = min(10, llm_score + 2)
             # NOTE(review): the hunk header below starts at old line 106 while the previous
             # hunk ended at 104, so one unchanged line of this chain is collapsed here.
@@ -106,13 +75,7 @@ def evaluate_solution_candidate(
             elif pass_ratio < 0.25: combined_score = max(1, llm_score - 4)
             else: combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio))
     # The final clamp runs on every path, so a critique score of 0 is reported as 1.
     combined_score = max(1, min(10, combined_score))
-    print(f"DEBUG: evaluation_engine.py - Evaluation complete. Combined Score: {combined_score}")
-    return EvaluationResultOutput(
-        combined_score=combined_score,
-        llm_critique_text=llm_critique_text, # This is just the LLM's part
-        execution_details=execution_result_obj, # This contains test pass/fail and errors
-        raw_llm_response=raw_llm_critique_resp
-    )
 print("DEBUG: core.evaluation_engine - Module fully defined.")

 # algoforge_prime/core/evaluation_engine.py
 import random
+import traceback
+# --- Corrected Imports ---
+from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
+from prompts.system_prompts import get_system_prompt
+from prompts.prompt_templates import format_critique_user_prompt
+from .safe_executor import execute_python_code_with_tests, ExecutionResult # CORRECTED: Relative import
 print("DEBUG: core.evaluation_engine - Imports successful")
+# EvaluationResultOutput, _parse_llm_score and evaluate_solution_candidate are defined in full below;
+# _placeholder_safe_python_execution is not defined in this module.
+# The key change in this revision is the import line for safe_executor above.
 class EvaluationResultOutput:
     """Hold the outcome of evaluating one solution candidate.
     The attributes mirror the constructor arguments: ``combined_score``,
     ``llm_critique_text``, ``execution_details`` and ``raw_llm_response``.
     """
     def __init__(self, combined_score=0, llm_critique_text="", execution_details: "ExecutionResult | None" = None, raw_llm_response=None):
         # The annotation is written as a string so it is not evaluated when the
         # class is defined, and it states explicitly that None is accepted.
         self.combined_score = combined_score
         self.llm_critique_text = llm_critique_text
         self.execution_details = execution_details
         self.raw_llm_response = raw_llm_response
     def get_display_critique(self):
         """Return the LLM critique followed by a summary of the execution details.
         When no critique text is stored a fixed fallback sentence is used. The
         execution section is appended only when ``execution_details`` is truthy.
         """
         full_critique = self.llm_critique_text if self.llm_critique_text else "LLM critique failed/skipped."
         details = self.execution_details
         if details:
             full_critique += "\n\n**Automated Execution & Test Results (Simulated):**\n"
             if details.total_tests > 0:
                 full_critique += f"  Tests: {details.passed_tests}/{details.total_tests} passed.\n"
             if details.error:
                 full_critique += f"  Error: {details.error}\n"
             elif details.output:
                 # Only the first 500 characters of captured output are shown.
                 full_critique += f"  Output:\n```\n{details.output[:500]}\n```\n"
             # NOTE(review): results built without a timing may carry no usable
             # execution_time (confirm ExecutionResult's default); skipping the
             # line avoids a TypeError from applying ':.4f' to None.
             elapsed = getattr(details, "execution_time", None)
             if elapsed is not None:
                 full_critique += f"  Time: {elapsed:.4f}s\n"
         return full_critique
 def _parse_llm_score(llm_text_output: str) -> int:
+    # ... (implementation as before)
+    score = 0; import re
     if not llm_text_output or not isinstance(llm_text_output, str): return score
+    match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
+    if match: score = max(1, min(int(match.group(1)), 10))
+    else: score = random.randint(3, 6)
     return score
+# _placeholder_safe_python_execution remains in safe_executor.py; this module imports only execute_python_code_with_tests and ExecutionResult from it.
 def evaluate_solution_candidate(
+    solution_text: str, problem_description: str, problem_type: str,
+    user_provided_tests_code: str, llm_client_config: dict
 ) -> EvaluationResultOutput:
     # Evaluate one candidate solution and return an EvaluationResultOutput.
     # Steps visible in this block:
     #   1. When solution_text is non-empty and does not start with "ERROR", request an
     #      LLM critique via call_huggingface_api (type "hf") or call_gemini_api (type
     #      "google_gemini"); any other type leaves the default critique text and score 0.
     #   2. For problem types containing "python", run the user's tests against the
     #      solution (timeout_seconds=10), or record a zero-test placeholder result
     #      when the tests string is blank.
     #   3. Start from the LLM score, adjust it by the test outcome, clamp to 1..10.
     # llm_client_config is read for the keys "type", "model_id", "temp" and "max_tokens".
+    # ... (implementation as before, ensuring it calls the imported execute_python_code_with_tests) ...
     print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
+    llm_critique_text, llm_score, raw_llm_critique_resp, execution_result_obj = "LLM critique failed/skipped.", 0, None, None
     if solution_text and not solution_text.startswith("ERROR"):
+        # ... (LLM critique call logic) ...
         system_p_critique = get_system_prompt("critique_general")
         user_p_critique = format_critique_user_prompt(problem_description, solution_text)
         llm_response_obj = None
+        if llm_client_config["type"] == "hf": llm_response_obj = call_huggingface_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
+        elif llm_client_config["type"] == "google_gemini": llm_response_obj = call_gemini_api(user_p_critique, llm_client_config["model_id"], llm_client_config["temp"], llm_client_config["max_tokens"], system_p_critique)
         if llm_response_obj:
             raw_llm_critique_resp = llm_response_obj.raw_response
+            if llm_response_obj.success: llm_critique_text, llm_score = llm_response_obj.text, _parse_llm_score(llm_response_obj.text)
+            else: llm_critique_text, llm_score = f"Error during LLM critique: {llm_response_obj.error}", 0
+    elif solution_text and solution_text.startswith("ERROR"): llm_critique_text, llm_score = f"Solution was error: {solution_text}", 0
     # NOTE(review): both branches below call .lower() / .strip() directly, so a None
     # problem_type or user_provided_tests_code would raise AttributeError here.
     if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
+        execution_result_obj = execute_python_code_with_tests(solution_text, user_provided_tests_code, timeout_seconds=10)
     elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
+        execution_result_obj = ExecutionResult(success=True, output="No user tests provided.", total_tests=0)
+    combined_score = llm_score # Start with LLM score
+    if execution_result_obj and execution_result_obj.total_tests > 0: # Adjust based on tests
+        if not execution_result_obj.success or execution_result_obj.error: combined_score = max(1, llm_score - 5)
         else:
             pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
             if pass_ratio == 1.0: combined_score = min(10, llm_score + 2)
             # NOTE(review): the commit's hunk ranges (+1,73 then +75,7) indicate that one
             # unchanged line of this chain is collapsed at this point - confirm in the full file.
             elif pass_ratio < 0.25: combined_score = max(1, llm_score - 4)
             else: combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio))
     # The final clamp runs on every path, so a critique score of 0 is reported as 1.
     combined_score = max(1, min(10, combined_score))
+    return EvaluationResultOutput(combined_score, llm_critique_text, execution_result_obj, raw_llm_critique_resp)
 print("DEBUG: core.evaluation_engine - Module fully defined.")