import os
import json
import shutil
import time
import argparse
import subprocess
import traceback
import threading
import concurrent.futures
from datetime import datetime
from threading import Timer

from e2b_desktop import Sandbox
from smolagents import CodeAgent, OpenAIServerModel
from smolagents.monitoring import LogLevel
from dotenv import load_dotenv

from e2bqwen import QwenVLAPIModel, E2BVisionAgent

load_dotenv()

# Environment variables and constants
E2B_API_KEY = os.getenv("E2B_API_KEY")

# Resolve the Hugging Face token: prefer the locally stored login token,
# fall back to the HUGGINGFACE_API_KEY environment variable.
# NOTE: get_token is imported inside the try so that an old huggingface_hub
# without get_token() actually raises ImportError here and triggers the
# fallback (a top-level import would fail before this block ever ran).
try:
    from huggingface_hub import get_token

    HUGGINGFACE_API_KEY = get_token()
    if not HUGGINGFACE_API_KEY:
        HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
    if not HUGGINGFACE_API_KEY:
        raise ValueError(
            "No Hugging Face token found. Please login with `huggingface-cli login` "
            "or set HUGGINGFACE_API_KEY environment variable"
        )
except ImportError:
    # Fall back if huggingface_hub is old version without get_token
    HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")

WIDTH = 1024
HEIGHT = 768
SANDBOX_TIMEOUT = 600  # 10 minutes

# Thread lock for print statements to avoid garbled output
print_lock = threading.Lock()


def thread_safe_print(*args, **kwargs):
    """Thread-safe print function: serializes output from concurrent runs."""
    with print_lock:
        print(*args, **kwargs)


def get_git_hash():
    """Return the short git hash of HEAD for folder naming, or "nogit" when unavailable."""
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        if result.returncode == 0:
            return result.stdout.strip()
        return "nogit"
    except Exception:
        # git binary missing, not executable, etc. — treat as "no git available"
        return "nogit"


def create_agent(data_dir, desktop, max_steps: int):
    """Create an agent with the E2B desktop sandbox.

    Args:
        data_dir: Directory where the agent stores run artifacts.
        desktop: E2B desktop ``Sandbox`` instance the agent controls.
        max_steps: Maximum number of agent steps before the run is aborted.

    Returns:
        A configured ``E2BVisionAgent`` backed by the Qwen VL API model.
    """
    model = QwenVLAPIModel(
        model_id="Qwen/Qwen2.5-VL-72B-Instruct",
        hf_token=HUGGINGFACE_API_KEY,
    )
    # model = OpenAIServerModel(
    #     model_id="gpt-4o",
    #     api_key=os.getenv("OPENAI_API_KEY")
    # )
    return E2BVisionAgent(
        model=model,
        data_dir=data_dir,
        desktop=desktop,
        max_steps=max_steps,
        verbosity_level=2,
        planning_interval=10,
    )
def get_agent_summary_erase_images(agent):
    """Get agent summary and erase observation images to save space.

    Mutates the agent's memory steps in place (drops images), then returns
    the succinct step summary.
    """
    for memory_step in agent.memory.steps:
        if getattr(memory_step, "observations_images", None):
            memory_step.observations_images = None
    return agent.memory.get_succinct_steps()


def chat_message_to_json(obj):
    """Custom JSON serializer for ChatMessage and related objects.

    Recursively converts objects with a ``__dict__`` into plain dicts,
    dropping the 'raw' field (which may contain non-serializable data),
    and converts lists/tuples element-wise. Other values pass through.
    """
    if hasattr(obj, '__dict__'):
        # Copy to avoid modifying the original object's attributes
        result = obj.__dict__.copy()
        # Remove the 'raw' field which may contain non-serializable data
        if 'raw' in result:
            del result['raw']
        # Recurse into content and tool_calls when present
        if 'content' in result and result['content'] is not None:
            if hasattr(result['content'], '__dict__'):
                result['content'] = chat_message_to_json(result['content'])
        if 'tool_calls' in result and result['tool_calls'] is not None:
            result['tool_calls'] = [chat_message_to_json(tc) for tc in result['tool_calls']]
        return result
    elif isinstance(obj, (list, tuple)):
        return [chat_message_to_json(item) for item in obj]
    else:
        return obj


def save_final_status(folder, status: str, summary, error_message=None) -> None:
    """Write metadata.json for a run: status, step summary and error message."""
    metadata_path = os.path.join(folder, "metadata.json")
    with open(metadata_path, "w") as output_file:
        output_file.write(json.dumps({
            "status": status,
            "summary": summary,
            "error_message": error_message
        }, default=chat_message_to_json))


def run_example_once(example_name, example_text, run_index, example_dir, max_steps):
    """Run a single example once in a fresh sandbox and return the result dict.

    Returns {"status": "completed"|"failed", "run_dir": ..., ["error": ...]}.
    Never raises for agent/sandbox failures; they are recorded in metadata.json.
    """
    run_dir = os.path.join(example_dir, f"run_{run_index}")
    os.makedirs(run_dir, exist_ok=True)

    # Save the example text so each run dir is self-describing
    with open(os.path.join(run_dir, "task.txt"), "w") as f:
        f.write(example_text)

    thread_safe_print(f" Starting run {run_index} for example '{example_name}'")

    # Create a new sandbox for this run
    desktop = None
    try:
        desktop = Sandbox(
            api_key=E2B_API_KEY,
            resolution=(WIDTH, HEIGHT),
            dpi=96,
            timeout=SANDBOX_TIMEOUT,
        )

        # Initialize the desktop environment: suppress Firefox first-run pages
        setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
        desktop.commands.run(setup_cmd)

        # Create and run the agent
        agent = create_agent(data_dir=run_dir, desktop=desktop, max_steps=max_steps)
        try:
            agent.run(task=example_text)
            summary = get_agent_summary_erase_images(agent)
            save_final_status(run_dir, "completed", summary=summary)
            thread_safe_print(f" ✓ Example '{example_name}' run {run_index} completed successfully")
            result = {"status": "completed", "run_dir": run_dir}
        except Exception as e:
            error_message = f"Error in agent execution: {str(e)}"
            thread_safe_print(f" ✗ Example '{example_name}' run {run_index} failed: {error_message}")
            # Agent may have partial memory worth saving even on failure
            summary = get_agent_summary_erase_images(agent) if hasattr(agent, 'memory') else None
            save_final_status(run_dir, "failed", summary=summary, error_message=error_message)
            result = {"status": "failed", "run_dir": run_dir, "error": error_message}
    except Exception as e:
        error_message = f"Error setting up sandbox: {str(e)}"
        thread_safe_print(f" ✗ Example '{example_name}' run {run_index} failed: {error_message}")
        save_final_status(run_dir, "failed", summary=None, error_message=error_message)
        result = {"status": "failed", "run_dir": run_dir, "error": error_message}
    finally:
        # Always clean up the sandbox; a kill failure must not mask the result
        if desktop:
            try:
                desktop.kill()
            except Exception:
                pass

    return result


def run_example(example_name, example_text, num_runs, example_dir, max_steps):
    """Run a single example num_runs times, each run on its own thread.

    Returns the list of per-run result dicts (order of completion).
    """
    thread_safe_print(f"\nRunning example '{example_name}': '{example_text[:50]}...'")

    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_runs) as executor:
        # Submit all runs to the executor
        future_to_run = {
            executor.submit(
                run_example_once, example_name, example_text, j, example_dir, max_steps
            ): j
            for j in range(num_runs)
        }

        # Collect results as they complete
        for future in concurrent.futures.as_completed(future_to_run):
            run_index = future_to_run[future]
            try:
                result = future.result()
                results.append(result)
            except Exception as exc:
                thread_safe_print(f" ✗ Run {run_index} for '{example_name}' generated an exception: {exc}")
                results.append({
                    "status": "error",
                    "run_index": run_index,
                    "error": str(exc)
                })

    return results


def run_evaluation(examples, num_runs, output_dir, max_parallel, max_steps):
    """Run each example num_runs times, save per-run artifacts and a summary.

    Args:
        examples: Mapping of example name -> task text.
        num_runs: Number of repetitions per example.
        output_dir: Root directory for evaluation outputs.
        max_parallel: Maximum number of examples processed concurrently.
        max_steps: Per-run agent step limit.

    Returns:
        Path of the timestamped evaluation directory that was created.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    git_hash = get_git_hash()
    eval_dir = os.path.join(output_dir, f"eval_{timestamp}_{git_hash}")
    os.makedirs(eval_dir, exist_ok=True)

    thread_safe_print(f"Starting evaluation. Results will be saved to: {eval_dir}")
    thread_safe_print(f"Will run {len(examples)} examples, {num_runs} times each, with {max_parallel} parallel examples")

    # Save examples to the evaluation directory for reproducibility
    with open(os.path.join(eval_dir, "examples.json"), "w") as f:
        json.dump(examples, f, indent=2)

    all_results = {}

    # Run examples in parallel, but limit the number of parallel examples
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_parallel) as executor:
        # Prepare the example directories first
        example_dirs = {}
        for example_name in examples:
            example_dir = os.path.join(eval_dir, f"example_{example_name}")
            os.makedirs(example_dir, exist_ok=True)
            example_dirs[example_name] = example_dir

        # Submit all examples to the executor
        future_to_example = {
            executor.submit(
                run_example, example_name, example_text, num_runs,
                example_dirs[example_name], max_steps
            ): example_name
            for example_name, example_text in examples.items()
        }

        # Collect results as they complete
        for future in concurrent.futures.as_completed(future_to_example):
            example_name = future_to_example[future]
            try:
                results = future.result()
                all_results[example_name] = results

                # Calculate success rate for this example
                success_count = sum(1 for r in results if r["status"] == "completed")
                thread_safe_print(f"Example '{example_name}' complete: {success_count}/{num_runs} successful runs ({success_count/num_runs*100:.1f}%)")
            except Exception as exc:
                thread_safe_print(f"Example '{example_name}' generated an exception: {exc}")
                all_results[example_name] = [{"status": "error", "error": str(exc)}]

    # Calculate overall results and success rates
    success_counts = {
        example_name: sum(1 for r in results if r["status"] == "completed")
        for example_name, results in all_results.items()
    }
    total_runs = sum(len(results) for results in all_results.values())
    total_successes = sum(success_counts.values())

    # Save summary to evaluation directory
    summary = {
        "total_runs": total_runs,
        "total_successes": total_successes,
        "success_rate": total_successes / total_runs if total_runs > 0 else 0,
        "example_success_rates": {
            example_name: success_counts[example_name] / len(all_results[example_name])
            for example_name in examples
        }
    }
    with open(os.path.join(eval_dir, "summary.json"), "w") as f:
        json.dump(summary, f, indent=2)

    thread_safe_print(f"\nEvaluation complete. Results saved to: {eval_dir}")
    thread_safe_print(f"Overall success rate: {summary['success_rate']*100:.1f}% ({total_successes}/{total_runs})")
    for example_name in examples:
        success_rate = summary["example_success_rates"][example_name] * 100
        thread_safe_print(f"Example '{example_name}': {success_rate:.1f}% success")

    return eval_dir


def main():
    """Parse CLI arguments and launch the evaluation over the built-in examples."""
    parser = argparse.ArgumentParser(description="Evaluate computer agent on examples")
    parser.add_argument("--num-runs", type=int, default=3, help="Number of runs per example")
    parser.add_argument("--output-dir", type=str, default="./eval_results", help="Output directory for evaluation results")
    parser.add_argument("--max-parallel", type=int, default=2, help="Maximum number of examples to run in parallel")
    parser.add_argument("--max-steps", type=int, default=200, help="Maximum number of steps in each run")
    args = parser.parse_args()

    # Examples from the original code
    examples = {
        "puppies": "Find me pictures of cute puppies",
        "commute": "Check the commuting time between Bern and Zurich on Google maps",
        "hello": "Write 'Hello World' in a text editor",
        "wiki": "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
        "flight": "Search a flight Rome - Berlin for tomorrow",
        "pond": "What's the name of the pond just south of Château de Fontainebleau in Google maps?",
        "flux": "Go generate a picture of the Golden Gate bridge on a FLUX1.dev space",
        "hf": "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
    }

    # Create output directory if it doesn't exist
    os.makedirs(args.output_dir, exist_ok=True)

    # Run the evaluation
    eval_dir = run_evaluation(examples, args.num_runs, args.output_dir, args.max_parallel, args.max_steps)


if __name__ == "__main__":
    main()