Spaces:

mgbam
/

StoryVerseWeaver

Sleeping

App Files Files Community

mgbam committed on May 17

Commit

48e5e22

verified ·

1 Parent(s): 3fd2bb1

Update core/evaluation_engine.py

Browse files

Files changed (1) hide show

core/evaluation_engine.py +188 -0

core/evaluation_engine.py CHANGED Viewed

	@@ -0,0 +1,188 @@

+# algoforge_prime/core/evaluation_engine.py
+import random
+import time
+import traceback
+# IMPORTANT: The following import is for a HYPOTHETICAL safe executor.
+# You would need to implement or find a robust sandboxing solution.
+# from .restricted_env_executor import execute_python_code_safely # Example
+from .llm_clients import call_huggingface_api, call_gemini_api, LLMResponse
+from ..prompts.system_prompts import get_system_prompt
+from ..prompts.prompt_templates import format_critique_user_prompt
class EvaluationResult:
    """Container for the outcome of evaluating one solution candidate.

    All constructor arguments are stored unchanged as public attributes of
    the same name.
    """

    def __init__(self, score=0, critique_text="", passed_tests=0, total_tests=0, execution_summary=None, raw_llm_critique_response=None):
        self.score = score  # Final combined score
        self.critique_text = critique_text  # LLM based critique + execution summary
        self.passed_tests = passed_tests
        self.total_tests = total_tests
        self.execution_summary = execution_summary  # Error or success message from code execution
        self.raw_llm_critique_response = raw_llm_critique_response

    def __repr__(self):
        """Return an unambiguous, debugging-oriented representation."""
        return (
            f"{type(self).__name__}(score={self.score!r}, "
            f"passed_tests={self.passed_tests!r}, total_tests={self.total_tests!r}, "
            f"execution_summary={self.execution_summary!r})"
        )

    def __str__(self):
        """Return a one-line summary with the critique cut to 100 characters."""
        # critique_text may have been passed explicitly as None; slicing None
        # would raise TypeError, so fall back to an empty preview instead.
        critique_preview = str(self.critique_text or "")[:100]
        return (
            f"Score: {self.score}/10. Tests: {self.passed_tests}/{self.total_tests}. "
            f"Summary: {self.execution_summary}. Critique: {critique_preview}..."
        )
def _parse_score_from_llm_text(llm_text_output: str) -> int:
    """Extract a 1-10 score from an LLM critique containing 'Score: X' or 'Score: X/10'.

    The marker is matched case-insensitively and may be wrapped in Markdown
    emphasis, e.g. ``**Score:** 8/10``, ``**Score**: 8/10`` or
    ``Score: **8/10**``. The first match in the text wins.

    Returns:
        0 when the input is empty or not a string; the parsed value clamped
        to 1-10 when a marker is found; a random fallback in 3-6 when no
        marker is found; a random fallback in 3-5 when the digits cannot be
        converted to an int.
    """
    import re  # local import: `re` is not imported at module level

    if not llm_text_output or not isinstance(llm_text_output, str):
        return 0

    # The lookbehind keeps words that merely end in "score" (e.g. "Underscore: 5")
    # from being mistaken for the marker. The character classes on both sides
    # of the colon absorb whitespace and Markdown emphasis characters.
    match = re.search(
        r"(?<![A-Za-z])score[\s*_`]*:[\s*_`]*(\d+)",
        llm_text_output,
        re.IGNORECASE,
    )
    if match is None:
        print(f"INFO: evaluation_engine.py - 'Score: X/10' marker not found in LLM output. Assigning fallback score. Output: {llm_text_output[:100]}...")
        return random.randint(3, 6)  # Assign a mediocre random score

    try:
        parsed_score_val = int(match.group(1))
    except ValueError as e:
        # int() can reject extremely long digit runs (integer string
        # conversion length limit), so keep a fallback for that case.
        print(f"WARNING: evaluation_engine.py - Error parsing score from LLM output '{llm_text_output[:100]}...': {e}")
        return random.randint(3, 5)  # Fallback on parsing error

    return max(1, min(parsed_score_val, 10))  # Clamp score to 1-10
def _placeholder_safe_python_execution(code_string: str, user_tests_string: str, rng=None) -> tuple[int, int, str]:
    """
    PLACEHOLDER for safe Python code execution.
    **WARNING: THIS IS NOT SAFE FOR PRODUCTION. IT ONLY SIMULATES.**
    Replace with a robust sandboxing mechanism (Docker, nsjail, WASM, etc.).

    No code is executed. Lines of ``user_tests_string`` that start with
    ``assert`` are counted as tests, and a pass count is made up from keyword
    heuristics on ``code_string`` plus random numbers.

    Args:
        code_string: Candidate solution source. ``None`` is treated as empty.
        user_tests_string: Text containing ``assert`` lines. ``None`` is
            treated as empty.
        rng: Optional object with a ``randint(a, b)`` method (for example a
            seeded ``random.Random``) so the simulation can be reproduced.
            Defaults to the module-level ``random`` functions.

    Returns:
        ``(passed_count, total_tests_found, message)``. ``message`` is a short
        explanation when nothing could be simulated, otherwise the
        newline-joined simulated execution log.
    """
    print("DEV_INFO: evaluation_engine.py - Entering PLACEHOLDER for code execution.")
    # Treat None like an empty string instead of crashing on .strip()/slicing.
    code_string = code_string or ""
    user_tests_string = user_tests_string or ""
    print(f"  Code (first 100 chars): {code_string[:100]}...")
    print(f"  Tests (first 100 chars): {user_tests_string[:100]}...")

    if not user_tests_string.strip() or not code_string.strip():
        return 0, 0, "SIMULATED: No tests provided or no code to test."

    # Naive parsing of assert statements
    stripped_lines = (line.strip() for line in user_tests_string.splitlines())
    test_lines = [line for line in stripped_lines if line.startswith("assert")]
    total_tests_found = len(test_lines)
    if total_tests_found == 0:
        return 0, 0, "SIMULATED: No 'assert' statements found in user tests."

    if rng is None:
        rng = random

    # Extremely simplistic simulation logic (NOT REAL EXECUTION).
    # This is where real sandboxed execution would happen.
    passed_count = 0
    execution_log = ["SIMULATED EXECUTION LOG:"]
    code_lower = code_string.lower()

    if "syntax error" in code_lower or "indentationerror" in code_lower:
        execution_log.append("  - Simulated: Potential syntax error in generated code.")
        # passed_count remains 0
    elif "runtime error" in code_lower or "exception" in code_lower:
        execution_log.append("  - Simulated: Code might raise a runtime error.")
        passed_count = rng.randint(0, total_tests_found // 3)  # Few pass
    elif "return" not in code_string and any("==" in t for t in test_lines):
        # Tests compare against a value, yet the code never returns one.
        execution_log.append("  - Simulated: Code might be missing a crucial 'return' statement.")
        passed_count = rng.randint(0, total_tests_found // 2)
    else:  # Simulate some passing, some failing
        passed_count = rng.randint(total_tests_found // 2, total_tests_found)
        execution_log.append(f"  - Simulated: {passed_count} of {total_tests_found} tests likely passed.")

    summary = f"Simulated: {passed_count}/{total_tests_found} tests passed."
    if passed_count < total_tests_found:
        execution_log.append(f"  - Simulated: {total_tests_found - passed_count} test(s) likely failed.")
        summary += " Some tests likely failed."

    print(f"DEV_INFO: evaluation_engine.py - Placeholder execution result: {summary}")
    return passed_count, total_tests_found, "\n".join(execution_log)
def _combine_llm_and_test_scores(llm_based_score: int, passed_tests_count: int, total_tests_count: int) -> int:
    """Blend the LLM score with the test pass ratio and clamp the result to 1-10."""
    combined = llm_based_score
    if total_tests_count > 0:  # If tests were run
        test_pass_ratio = passed_tests_count / total_tests_count
        if test_pass_ratio < 0.5:  # Penalize heavily if less than half tests pass
            combined = max(1, int(llm_based_score * 0.5) - 1)
        elif passed_tests_count == total_tests_count:  # All tests passed: small bonus
            combined = min(10, llm_based_score + 1)
        else:  # Ratio between 0.5 and 1.0
            combined = int(llm_based_score * (0.6 + 0.4 * test_pass_ratio))
    return max(1, min(10, combined))  # Ensure score is 1-10


def evaluate_solution_candidate(
    solution_text: str,
    problem_description: str,
    problem_type: str,
    user_provided_tests: str, # String of Python assert statements
    llm_client_config: dict # {"type": ..., "model_id": ..., "temp": ..., "max_tokens": ...}
) -> EvaluationResult:
    """
    Evaluates a single solution candidate.

    Steps:
      1. Ask the configured LLM for a critique and parse a score from it,
         unless the solution is empty or starts with "ERROR".
      2. For problem types containing "python" with non-blank tests, run the
         (simulated) test execution on the solution.
      3. Combine the LLM score and the test pass ratio into a 1-10 score.

    Args:
        solution_text: Candidate solution; text starting with "ERROR" is
            treated as an upstream failure and is neither critiqued nor tested.
        problem_description: Problem statement passed to the critique prompt.
        problem_type: Free-form type label; ``None`` is treated as empty.
        user_provided_tests: Text with assert statements; ``None`` is treated
            as empty.
        llm_client_config: Mapping with keys "type" ("hf" or "google_gemini"),
            "model_id", "temp" and "max_tokens".

    Returns:
        An EvaluationResult with the final score, the combined critique text,
        the test counts, the execution summary and the raw LLM response.
    """
    llm_critique_output_text = "LLM critique could not be performed due to an earlier error or API issue."
    llm_based_score = 0
    raw_llm_critique_resp = None

    solution_is_error = bool(solution_text) and solution_text.startswith("ERROR")
    solution_is_usable = bool(solution_text) and not solution_is_error

    # 1. LLM-based Critique (if solution_text is not an error message itself)
    if solution_is_usable:
        system_p_critique = get_system_prompt("critique_general")  # problem_type can be used here too
        user_p_critique = format_critique_user_prompt(problem_description, solution_text)

        # Both clients are called with the same arguments, so dispatch by type.
        api_callers = {"hf": call_huggingface_api, "google_gemini": call_gemini_api}
        client_type = llm_client_config["type"]
        call_api = api_callers.get(client_type)

        if call_api is None:
            # A misconfigured client type is not an API failure; say so explicitly.
            llm_critique_output_text = f"LLM critique skipped: unsupported LLM client type '{client_type}'."
        else:
            llm_response_obj = call_api(
                user_p_critique, llm_client_config["model_id"],
                temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
                system_prompt_text=system_p_critique
            )
            if llm_response_obj:
                raw_llm_critique_resp = llm_response_obj.raw_response
                if llm_response_obj.success:
                    llm_critique_output_text = llm_response_obj.text
                    llm_based_score = _parse_score_from_llm_text(llm_critique_output_text)
                else:
                    llm_critique_output_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
                    llm_based_score = 0  # Penalize for critique failure
    elif solution_is_error:
        llm_critique_output_text = f"Solution was an error from Genesis: {solution_text}"

    # 2. (Simulated) Code Execution if applicable
    passed_tests_count = 0
    total_tests_count = 0
    exec_summary_msg = "Automated tests not applicable or not run for this problem type/solution."
    # Treat None like "" so missing tests or type cannot crash on .strip()/.lower().
    is_python_problem = "python" in (problem_type or "").lower()
    has_tests = bool((user_provided_tests or "").strip())

    # Only run tests if it's a Python problem, tests are provided, and solution isn't an error
    if is_python_problem and has_tests and solution_is_usable:
        # **IMPORTANT**: Replace with a REAL sandboxed executor for safety.
        passed_tests_count, total_tests_count, exec_summary_msg = _placeholder_safe_python_execution(
            solution_text, user_provided_tests
        )
    elif is_python_problem and not has_tests:
        exec_summary_msg = "No user tests provided for this Python problem."

    # 3. Combine Scores into a Final Score (Example Heuristic)
    final_score_calculated = _combine_llm_and_test_scores(
        llm_based_score, passed_tests_count, total_tests_count
    )

    # Construct comprehensive critique text for display
    comprehensive_critique = f"{llm_critique_output_text}"
    if total_tests_count > 0 or (is_python_problem and has_tests):  # Add test summary if applicable
        comprehensive_critique += f"\n\n**Automated Test Summary (Simulated):**\n{exec_summary_msg}\n"
        comprehensive_critique += f"Passed: {passed_tests_count}/{total_tests_count}"

    return EvaluationResult(
        score=final_score_calculated,
        critique_text=comprehensive_critique,
        passed_tests=passed_tests_count,
        total_tests=total_tests_count,
        execution_summary=exec_summary_msg,
        raw_llm_critique_response=raw_llm_critique_resp
    )