Update core/evaluation_engine.py

core/evaluation_engine.py  (+39, -39)  CHANGED
Before (lines removed by this commit are prefixed with -):

@@ -1,16 +1,21 @@
 # algoforge_prime/core/evaluation_engine.py
 import random
-
-from ..prompts.system_prompts import get_system_prompt
-from ..prompts.prompt_templates import format_critique_user_prompt
-# Import our (simulated) safe executor
-from .safe_executor import execute_python_code_with_tests, ExecutionResult  # Assuming it's in the same 'core' package
 
-
     def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
         self.combined_score = combined_score
-        self.llm_critique_text = llm_critique_text
-        self.execution_details = execution_details
         self.raw_llm_response = raw_llm_response
 
     def get_display_critique(self):

@@ -23,13 +28,12 @@ class EvaluationResultOutput: # Renamed to avoid conflict with safe_executor.Exe
         if self.execution_details.error:
             full_critique += f" Execution Error: {self.execution_details.error}\n"
         elif self.execution_details.output:
-            full_critique += f" Execution Output (stdout):\n```\n{self.execution_details.output[:500]}\n```\n"
         full_critique += f" Execution Time: {self.execution_details.execution_time:.4f}s\n"
         return full_critique
 
 
 def _parse_llm_score(llm_text_output: str) -> int:
-    # ... (keep your existing _parse_score_from_llm_text, renamed for clarity) ...
     score = 0
     if not llm_text_output or not isinstance(llm_text_output, str): return score
     try:

@@ -38,8 +42,12 @@ def _parse_llm_score(llm_text_output: str) -> int:
         if match:
             parsed_score_val = int(match.group(1))
             score = max(1, min(parsed_score_val, 10))
-        else:
-
     return score
 
 

@@ -50,13 +58,12 @@ def evaluate_solution_candidate(
     user_provided_tests_code: str,
     llm_client_config: dict
 ) -> EvaluationResultOutput:
-
     llm_critique_text = "LLM critique generation failed or was skipped."
     llm_score = 0
     raw_llm_critique_resp = None
     execution_result_obj = None  # type: ExecutionResult
 
-    # 1. LLM-based Critique (only if solution_text is not an error itself)
     if solution_text and not solution_text.startswith("ERROR"):
         system_p_critique = get_system_prompt("critique_general")
         user_p_critique = format_critique_user_prompt(problem_description, solution_text)

@@ -74,45 +81,38 @@
             llm_score = _parse_llm_score(llm_critique_text)
         else:
             llm_critique_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
-            llm_score = 0
     elif solution_text and solution_text.startswith("ERROR"):
         llm_critique_text = f"Solution was an error from Genesis: {solution_text}"
         llm_score = 0
 
-
-    # 2. Code Execution (if Python problem, code exists, and tests are provided)
     if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
-        print(f"INFO: evaluation_engine.py - Preparing to execute Python code candidate
-        # Use the (simulated) safe executor
         execution_result_obj = execute_python_code_with_tests(
-            solution_text, user_provided_tests_code, timeout_seconds=10
         )
-        print(f"INFO: evaluation_engine.py - Execution result: {execution_result_obj}")
     elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
-        execution_result_obj = ExecutionResult(success=True, output="No user tests provided
-
 
-    # 3. Combine Scores into a Final Score (More sophisticated heuristic)
     combined_score = llm_score
     if execution_result_obj and execution_result_obj.total_tests > 0:
-        if not execution_result_obj.success or execution_result_obj.error:
-            combined_score = max(1, llm_score - 5)
         else:
             pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
-            if pass_ratio == 1.0:
-
-            elif pass_ratio
-
-
-                combined_score = max(1, llm_score - 4)
-            else:  # Some tests passed
-                combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio))  # Weighted average
-
-    combined_score = max(1, min(10, combined_score))  # Clamp 1-10
 
     return EvaluationResultOutput(
         combined_score=combined_score,
-        llm_critique_text=llm_critique_text,
-        execution_details=execution_result_obj,
         raw_llm_response=raw_llm_critique_resp
-    )
After (lines added by this commit are prefixed with +):

@@ -1,16 +1,21 @@
 # algoforge_prime/core/evaluation_engine.py
 import random
+import traceback  # Keep this if used in your placeholder
 
+# --- Corrected Absolute Imports ---
+from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse  # Absolute from project root
+from prompts.system_prompts import get_system_prompt  # Absolute from project root
+from prompts.prompt_templates import format_critique_user_prompt  # Absolute from project root
+from core.safe_executor import execute_python_code_with_tests, ExecutionResult  # Absolute from project root
+
+print("DEBUG: core.evaluation_engine - Imports successful")
+
+
+class EvaluationResultOutput:
     def __init__(self, combined_score=0, llm_critique_text="", execution_details: ExecutionResult = None, raw_llm_response=None):
         self.combined_score = combined_score
+        self.llm_critique_text = llm_critique_text
+        self.execution_details = execution_details
         self.raw_llm_response = raw_llm_response
 
     def get_display_critique(self):
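The ExecutionResult type imported above is defined in core/safe_executor.py, which is not part of this diff. Purely as a point of reference, a minimal sketch of its likely shape, inferred from the attributes this module reads (success, output, error, execution_time, total_tests, passed_tests), could look like the following; the actual definition may differ:

from dataclasses import dataclass
from typing import Optional

@dataclass
class ExecutionResult:
    success: bool = False          # whether the (simulated) run completed without a fatal failure
    output: str = ""               # captured stdout from running the candidate plus user tests
    error: Optional[str] = None    # error message, if the run failed
    execution_time: float = 0.0    # wall-clock seconds (formatted with :.4f in get_display_critique)
    total_tests: int = 0           # number of user-provided tests executed
    passed_tests: int = 0          # number of those tests that passed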
@@ -23,13 +28,12 @@ class EvaluationResultOutput: # Renamed to avoid conflict with safe_executor.Exe
         if self.execution_details.error:
             full_critique += f" Execution Error: {self.execution_details.error}\n"
         elif self.execution_details.output:
+            full_critique += f" Execution Output (stdout):\n```\n{self.execution_details.output[:500]}\n```\n"
         full_critique += f" Execution Time: {self.execution_details.execution_time:.4f}s\n"
         return full_critique
 
 
 def _parse_llm_score(llm_text_output: str) -> int:
     score = 0
     if not llm_text_output or not isinstance(llm_text_output, str): return score
     try:

@@ -38,8 +42,12 @@ def _parse_llm_score(llm_text_output: str) -> int:
         if match:
             parsed_score_val = int(match.group(1))
             score = max(1, min(parsed_score_val, 10))
+        else:
+            print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found in LLM output. Output: {llm_text_output[:100]}...")
+            score = random.randint(3, 6)
+    except Exception as e:
+        print(f"WARNING: evaluation_engine.py - Error parsing score from LLM output '{llm_text_output[:100]}...': {e}")
+        score = random.randint(3, 5)
     return score
 
 
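The regular expression that produces match in _parse_llm_score sits in lines 40-41, which this hunk does not show. Purely as an illustration of the surrounding logic, a self-contained version of the parser with an assumed pattern for the 'Score: X/10' marker (the marker named in the INFO message above) might read as follows; the pattern actually committed may differ:

import random
import re

def parse_llm_score_sketch(llm_text_output: str) -> int:
    # Illustrative sketch: mirrors the structure of _parse_llm_score above,
    # with an assumed regex, since the real one is outside this hunk.
    score = 0
    if not llm_text_output or not isinstance(llm_text_output, str):
        return score
    try:
        match = re.search(r"Score:\s*(\d{1,2})\s*/\s*10", llm_text_output, re.IGNORECASE)
        if match:
            parsed_score_val = int(match.group(1))
            score = max(1, min(parsed_score_val, 10))
        else:
            score = random.randint(3, 6)   # same fallback range as the diff above
    except Exception:
        score = random.randint(3, 5)       # same fallback range as the diff above
    return score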
@@ -50,13 +58,12 @@ def evaluate_solution_candidate(
     user_provided_tests_code: str,
     llm_client_config: dict
 ) -> EvaluationResultOutput:
+    print(f"DEBUG: evaluation_engine.py - Evaluating candidate. Problem type: {problem_type}")
     llm_critique_text = "LLM critique generation failed or was skipped."
     llm_score = 0
     raw_llm_critique_resp = None
     execution_result_obj = None  # type: ExecutionResult
 
     if solution_text and not solution_text.startswith("ERROR"):
         system_p_critique = get_system_prompt("critique_general")
         user_p_critique = format_critique_user_prompt(problem_description, solution_text)

@@ -74,45 +81,38 @@
             llm_score = _parse_llm_score(llm_critique_text)
         else:
             llm_critique_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
+            llm_score = 0
     elif solution_text and solution_text.startswith("ERROR"):
         llm_critique_text = f"Solution was an error from Genesis: {solution_text}"
         llm_score = 0
 
     if "python" in problem_type.lower() and solution_text and not solution_text.startswith("ERROR") and user_provided_tests_code.strip():
+        print(f"INFO: evaluation_engine.py - Preparing to (simulated) execute Python code candidate.")
         execution_result_obj = execute_python_code_with_tests(
+            solution_text, user_provided_tests_code, timeout_seconds=10
         )
+        print(f"INFO: evaluation_engine.py - (Simulated) Execution result: {execution_result_obj}")
     elif "python" in problem_type.lower() and not user_provided_tests_code.strip():
+        execution_result_obj = ExecutionResult(success=True, output="No user tests provided for this Python problem.", total_tests=0)
 
     combined_score = llm_score
     if execution_result_obj and execution_result_obj.total_tests > 0:
+        if not execution_result_obj.success or execution_result_obj.error:
+            combined_score = max(1, llm_score - 5)
         else:
             pass_ratio = execution_result_obj.passed_tests / execution_result_obj.total_tests
+            if pass_ratio == 1.0: combined_score = min(10, llm_score + 2)
+            elif pass_ratio >= 0.75: combined_score = min(10, llm_score + 1)
+            elif pass_ratio < 0.25: combined_score = max(1, llm_score - 4)
+            else: combined_score = int(llm_score * (0.5 + 0.5 * pass_ratio))
+        combined_score = max(1, min(10, combined_score))
 
+    print(f"DEBUG: evaluation_engine.py - Evaluation complete. Combined Score: {combined_score}")
     return EvaluationResultOutput(
         combined_score=combined_score,
+        llm_critique_text=llm_critique_text,  # This is just the LLM's part
+        execution_details=execution_result_obj,  # This contains test pass/fail and errors
         raw_llm_response=raw_llm_critique_resp
+    )
+
+print("DEBUG: core.evaluation_engine - Module fully defined.")