dongsheng committed on
Commit 41e79e2 · verified · 1 Parent(s): 19139e3

Upload 48 files

Dockerfile ADDED
@@ -0,0 +1,14 @@
1
+ FROM ghcr.io/nuprl/multipl-e-evaluation:v3.1
2
+
3
+ # Install GNAT for Ada language support
4
+ RUN apt-get update && apt-get install -y gnat && apt-get clean
5
+
6
+ # Override the default entrypoint of the base image
7
+ ENTRYPOINT []
8
+ WORKDIR /app
9
+ COPY requirements.txt .
10
+ RUN pip install -r requirements.txt
11
+ COPY . .
12
+ EXPOSE 7860
13
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
14
+ CMD ["python3", "app.py"]
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: MultiPLE Evaluator
3
  emoji: 🔥
4
- colorFrom: indigo
5
- colorTo: pink
6
  sdk: docker
7
  pinned: false
8
  ---
 
1
  ---
2
+ title: Docker Test
3
  emoji: 🔥
4
+ colorFrom: blue
5
+ colorTo: gray
6
  sdk: docker
7
  pinned: false
8
  ---
app.py ADDED
@@ -0,0 +1,677 @@
1
+ import gradio as gr
2
+ import json
3
+ import importlib
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+ import concurrent.futures
8
+ import multiprocessing
9
+ import time
10
+ import threading
11
+ import queue
12
+ import uuid
13
+ import numpy as np
14
+ from datetime import datetime
15
+ from tqdm.auto import tqdm
16
+ from src.containerized_eval import eval_string_script
17
+
18
+ # Add current directory and src directory to module search path
19
+ current_dir = os.path.dirname(os.path.abspath(__file__))
20
+ src_dir = os.path.join(current_dir, "src")
21
+ if current_dir not in sys.path:
22
+ sys.path.append(current_dir)
23
+ if src_dir not in sys.path:
24
+ sys.path.append(src_dir)
25
+
26
+ # Create message queue
27
+ task_queue = queue.Queue()
28
+ # Dictionary to store task status
29
+ task_status = {}
30
+ # List to store task history, max 200 tasks
31
+ task_history = []
32
+ # Lock for shared resources
33
+ lock = threading.Lock()
34
+ # Number of worker threads
35
+ worker_threads = max(1, multiprocessing.cpu_count() // 2) # Using half the available cores for better stability
36
+ # Flag for running background threads
37
+ running = True
38
+ # Mapping from task type to processing time
39
+ task_type_times = {}
40
+
41
+ def queue_processor():
42
+ """Process tasks in the queue"""
43
+ while running:
44
+ try:
45
+ task_id, input_data, request_time = task_queue.get(timeout=0.1)
46
+ with lock:
47
+ task_status[task_id]['status'] = 'processing'
48
+ task_status[task_id]['start_time'] = time.time()
49
+
50
+ if isinstance(input_data, list) and len(input_data) > 0:
51
+ sample_task = input_data[0]
52
+ language = sample_task.get('language', 'unknown') if isinstance(sample_task, dict) else 'unknown'
53
+ task_size = len(input_data)
54
+ task_complexity = _estimate_task_complexity(input_data)
55
+
56
+ with lock:
57
+ task_status[task_id]['estimated_factors'] = {
58
+ 'language': language,
59
+ 'size': task_size,
60
+ 'complexity': task_complexity
61
+ }
62
+
63
+ result = evaluate(input_data)
64
+
65
+ end_time = time.time()
66
+ process_time = end_time - task_status[task_id]['start_time']
67
+
68
+ with lock:
69
+ task_status[task_id]['status'] = 'completed'
70
+ task_status[task_id]['result'] = result
71
+ task_status[task_id]['end_time'] = end_time
72
+ task_status[task_id]['process_time'] = process_time
73
+
74
+ if 'estimated_factors' in task_status[task_id]:
75
+ factors = task_status[task_id]['estimated_factors']
76
+ key = f"{factors['language']}_{factors['complexity']}"
77
+
78
+ if key not in task_type_times:
79
+ task_type_times[key] = []
80
+
81
+ task_type_times[key].append(process_time / factors['size'])
82
+ if len(task_type_times[key]) > 10:
83
+ task_type_times[key] = task_type_times[key][-10:]
84
+
85
+ task_history.append({
86
+ 'task_id': task_id,
87
+ 'request_time': request_time,
88
+ 'process_time': process_time,
89
+ 'status': 'completed',
90
+ 'factors': task_status[task_id].get('estimated_factors', {})
91
+ })
92
+ while len(task_history) > 200:
93
+ task_history.pop(0)
94
+
95
+ task_queue.task_done()
96
+
97
+ except queue.Empty:
98
+ continue
99
+ except Exception as e:
100
+ if 'task_id' in locals():
101
+ with lock:
102
+ task_status[task_id]['status'] = 'error'
103
+ task_status[task_id]['error'] = str(e)
104
+ task_status[task_id]['end_time'] = time.time()
105
+ task_queue.task_done()
106
+
107
+ def _estimate_task_complexity(tasks):
108
+ """Estimate task complexity
109
+
110
+ Returns: 'simple', 'medium', or 'complex'
111
+ """
112
+ total_code_length = 0
113
+ count = 0
114
+
115
+ for task in tasks:
116
+ if isinstance(task, dict):
117
+ prompt = task.get('prompt', '')
118
+ tests = task.get('tests', '')
119
+ completions = task.get('processed_completions', [])
120
+
121
+ code_length = len(prompt) + len(tests)
122
+ if completions:
123
+ code_length += sum(len(comp) for comp in completions)
124
+
125
+ total_code_length += code_length
126
+ count += 1
127
+
128
+ if count == 0:
129
+ return 'medium'
130
+
131
+ avg_length = total_code_length / count
132
+
133
+ if avg_length < 1000:
134
+ return 'simple'
135
+ elif avg_length < 5000:
136
+ return 'medium'
137
+ else:
138
+ return 'complex'
139
+
140
+ def evaluate(input_data):
141
+ """Main function for code evaluation"""
142
+ try:
143
+ if not isinstance(input_data, list):
144
+ return {"status": "Exception", "error": "Input must be a list"}
145
+
146
+ results = []
147
+
148
+ # Use a moderate number of workers for all language tests to ensure stability
149
+ # This prevents resource contention regardless of language
150
+ max_workers = max(1, min(multiprocessing.cpu_count() // 2, 4))
151
+
152
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
153
+ future_to_item = {executor.submit(evaluate_single_case, item): item for item in input_data}
154
+ for future in concurrent.futures.as_completed(future_to_item):
155
+ item = future_to_item[future]
156
+ try:
157
+ result = future.result()
158
+ item.update(result)
159
+ results.append(item)
160
+ except Exception as e:
161
+ item.update({"status": "Exception", "error": str(e)})
162
+ results.append(item)
163
+ return results
164
+
165
+ except Exception as e:
166
+ return {"status": "Exception", "error": str(e)}
167
+
168
+ def evaluate_single_case(input_data):
169
+ """Evaluate a single code case"""
170
+ try:
171
+ if not isinstance(input_data, dict):
172
+ return {"status": "Exception", "error": "Input item must be a dictionary"}
173
+
174
+ language = input_data.get('language')
175
+ completions = input_data.get('processed_completions', [])
176
+
177
+ if not completions:
178
+ return {"status": "Exception", "error": "No code provided"}
179
+
180
+ # Use a retry mechanism for all languages for better reliability
181
+ max_retries = 2 # One retry for all languages
182
+
183
+ results = []
184
+ for comp in completions:
185
+ code = input_data.get('prompt') + comp + '\n' + input_data.get('tests')
186
+
187
+ # Try up to max_retries + 1 times for all test cases
188
+ for attempt in range(max_retries + 1):
189
+ result = evaluate_code(code, language)
190
+
191
+ # If success or last attempt, return/record the result
192
+ if result["status"] == "OK" or attempt == max_retries:
193
+ if result["status"] == "OK":
194
+ return result
195
+ results.append(result)
196
+ break
197
+
198
+ # For retries, briefly wait to allow resources to stabilize
199
+ time.sleep(0.3)
200
+
201
+ return results[0]
202
+
203
+ except Exception as e:
204
+ return {"status": "Exception", "error": str(e)}
205
+
206
+ def evaluate_code(code, language):
207
+ """Evaluate code in a specific language"""
208
+ try:
209
+ result = eval_string_script(language, code)
210
+ return result
211
+
212
+ except Exception as e:
213
+ return {"status": "Exception", "error": str(e)}
214
+
215
+ def synchronous_evaluate(input_data):
216
+ """Synchronously evaluate code, compatible with original interface"""
217
+ if isinstance(input_data, list) and len(input_data) > 0:
218
+ sample_task = input_data[0]
219
+ language = sample_task.get('language', 'unknown') if isinstance(sample_task, dict) else 'unknown'
220
+ task_size = len(input_data)
221
+ task_complexity = _estimate_task_complexity(input_data)
222
+ else:
223
+ language = 'unknown'
224
+ task_size = 1
225
+ task_complexity = 'medium'
226
+
227
+ estimated_time_per_task = _get_estimated_time_for_task(language, task_complexity)
228
+ estimated_total_time = estimated_time_per_task * task_size
229
+
230
+ queue_info = get_queue_status()
231
+ waiting_tasks = queue_info['waiting_tasks']
232
+
233
+ task_id = str(uuid.uuid4())
234
+ request_time = time.time()
235
+
236
+ with lock:
237
+ task_status[task_id] = {
238
+ 'status': 'queued',
239
+ 'queued_time': request_time,
240
+ 'queue_position': task_queue.qsize() + 1,
241
+ 'synchronous': True,
242
+ 'estimated_factors': {
243
+ 'language': language,
244
+ 'size': task_size,
245
+ 'complexity': task_complexity
246
+ },
247
+ 'estimated_time': estimated_total_time
248
+ }
249
+
250
+ task_queue.put((task_id, input_data, request_time))
251
+
252
+ while True:
253
+ with lock:
254
+ if task_id in task_status:
255
+ status = task_status[task_id]['status']
256
+ if status == 'completed':
257
+ result = task_status[task_id]['result']
258
+ task_status.pop(task_id, None)
259
+ return result
260
+ elif status == 'error':
261
+ error = task_status[task_id].get('error', 'Unknown error')
262
+ task_status.pop(task_id, None)
263
+ return {"status": "Exception", "error": error}
264
+
265
+ time.sleep(0.1)
266
+
267
+ def _get_estimated_time_for_task(language, complexity):
268
+ """Get estimated processing time for a specific task type"""
269
+ key = f"{language}_{complexity}"
270
+
271
+ if key in task_type_times and len(task_type_times[key]) > 0:
272
+ return np.median(task_type_times[key])
273
+
274
+ if complexity == 'simple':
275
+ return 1.0
276
+ elif complexity == 'medium':
277
+ return 3.0
278
+ else: # complex
279
+ return 8.0
280
+
281
+ def enqueue_task(input_data):
282
+ """Add task to queue"""
283
+ if isinstance(input_data, list) and len(input_data) > 0:
284
+ sample_task = input_data[0]
285
+ language = sample_task.get('language', 'unknown') if isinstance(sample_task, dict) else 'unknown'
286
+ task_size = len(input_data)
287
+ task_complexity = _estimate_task_complexity(input_data)
288
+ else:
289
+ language = 'unknown'
290
+ task_size = 1
291
+ task_complexity = 'medium'
292
+
293
+ estimated_time_per_task = _get_estimated_time_for_task(language, task_complexity)
294
+ estimated_total_time = estimated_time_per_task * task_size
295
+
296
+ task_id = str(uuid.uuid4())
297
+ request_time = time.time()
298
+
299
+ with lock:
300
+ task_status[task_id] = {
301
+ 'status': 'queued',
302
+ 'queued_time': request_time,
303
+ 'queue_position': task_queue.qsize() + 1,
304
+ 'estimated_factors': {
305
+ 'language': language,
306
+ 'size': task_size,
307
+ 'complexity': task_complexity
308
+ },
309
+ 'estimated_time': estimated_total_time
310
+ }
311
+
312
+ queue_info = get_queue_status()
313
+ est_wait = queue_info['estimated_wait']
314
+
315
+ task_queue.put((task_id, input_data, request_time))
316
+
317
+ return {
318
+ 'task_id': task_id,
319
+ 'status': 'queued',
320
+ 'queue_position': task_status[task_id]['queue_position'],
321
+ 'estimated_wait': est_wait,
322
+ 'estimated_processing': estimated_total_time
323
+ }
324
+
325
+ def check_status(task_id):
326
+ """Check task status"""
327
+ with lock:
328
+ if task_id not in task_status:
329
+ return {'status': 'not_found'}
330
+
331
+ status_info = task_status[task_id].copy()
332
+
333
+ if status_info['status'] in ['completed', 'error'] and time.time() - status_info.get('end_time', 0) > 3600:
334
+ task_status.pop(task_id, None)
335
+
336
+ return status_info
337
+
338
+ def get_queue_status():
339
+ """Get queue status"""
340
+ with lock:
341
+ queued_tasks = [t for t in task_status.values() if t['status'] == 'queued']
342
+ processing_tasks = [t for t in task_status.values() if t['status'] == 'processing']
343
+
344
+ queue_size = task_queue.qsize()
345
+ active_tasks = len(processing_tasks)
346
+ waiting_tasks = len(queued_tasks)
347
+
348
+ remaining_processing_time = 0
349
+ for task in processing_tasks:
350
+ if 'start_time' in task and 'estimated_time' in task:
351
+ elapsed = time.time() - task['start_time']
352
+ remaining = max(0, task['estimated_time'] - elapsed)
353
+ remaining_processing_time += remaining
354
+ else:
355
+ remaining_processing_time += 2
356
+
357
+ if active_tasks > 0:
358
+ remaining_processing_time = remaining_processing_time / min(active_tasks, worker_threads)
359
+
360
+ queued_processing_time = 0
361
+ for task in queued_tasks:
362
+ if 'estimated_time' in task:
363
+ queued_processing_time += task['estimated_time']
364
+ else:
365
+ queued_processing_time += 5
366
+
367
+ if worker_threads > 0 and queued_processing_time > 0:
368
+ queued_processing_time = queued_processing_time / worker_threads
369
+
370
+ estimated_wait = remaining_processing_time + queued_processing_time
371
+
372
+ if task_history:
373
+ prediction_ratios = []
374
+ for task in task_history:
375
+ if 'factors' in task and 'estimated_time' in task:
376
+ prediction_ratios.append(task['process_time'] / task['estimated_time'])
377
+
378
+ if prediction_ratios:
379
+ correction_factor = np.median(prediction_ratios)
380
+ correction_factor = max(0.5, min(2.0, correction_factor))
381
+ estimated_wait *= correction_factor
382
+
383
+ estimated_wait = max(0.1, estimated_wait)
384
+ if waiting_tasks == 0 and active_tasks == 0:
385
+ estimated_wait = 0
386
+
387
+ recent_tasks = task_history[-5:] if task_history else []
388
+
389
+ return {
390
+ 'queue_size': queue_size,
391
+ 'active_tasks': active_tasks,
392
+ 'waiting_tasks': waiting_tasks,
393
+ 'worker_threads': worker_threads,
394
+ 'estimated_wait': estimated_wait,
395
+ 'recent_tasks': recent_tasks
396
+ }
397
+
398
+ def format_time(seconds):
399
+ """Format time into readable format"""
400
+ if seconds < 60:
401
+ return f"{seconds:.1f} seconds"
402
+ elif seconds < 3600:
403
+ minutes = int(seconds / 60)
404
+ seconds = seconds % 60
405
+ return f"{minutes}m {seconds:.1f}s"
406
+ else:
407
+ hours = int(seconds / 3600)
408
+ minutes = int((seconds % 3600) / 60)
409
+ return f"{hours}h {minutes}m"
410
+
411
+ def ui_get_queue_info():
412
+ """Get queue info for UI"""
413
+ queue_info = get_queue_status()
414
+
415
+ tasks_html = ""
416
+ for task in reversed(queue_info['recent_tasks']):
417
+ tasks_html += f"""
418
+ <tr>
419
+ <td>{task['task_id'][:8]}...</td>
420
+ <td>{datetime.fromtimestamp(task['request_time']).strftime('%H:%M:%S')}</td>
421
+ <td>{format_time(task['process_time'])}</td>
422
+ </tr>
423
+ """
424
+
425
+ if not tasks_html:
426
+ tasks_html = """
427
+ <tr>
428
+ <td colspan="3" style="text-align: center; padding: 20px;">No historical tasks</td>
429
+ </tr>
430
+ """
431
+
432
+ return f"""
433
+ <div class="dashboard">
434
+ <div class="queue-info-card main-card">
435
+ <h3 class="card-title">Queue Status Monitor</h3>
436
+ <div class="queue-stats">
437
+ <div class="stat-item">
438
+ <div class="stat-value">{queue_info['waiting_tasks']}</div>
439
+ <div class="stat-label">Waiting</div>
440
+ </div>
441
+ <div class="stat-item">
442
+ <div class="stat-value">{queue_info['active_tasks']}</div>
443
+ <div class="stat-label">Processing</div>
444
+ </div>
445
+ <div class="stat-item">
446
+ <div class="stat-value">{queue_info['worker_threads']}</div>
447
+ <div class="stat-label">Worker Threads</div>
448
+ </div>
449
+ </div>
450
+
451
+ <div class="wait-time">
452
+ <p><b>Current Estimated Wait Time:</b> {format_time(queue_info['estimated_wait'])}</p>
453
+ <p class="last-update"><small>Last update: {datetime.now().strftime('%H:%M:%S')}</small></p>
454
+ </div>
455
+ </div>
456
+
457
+ <div class="queue-info-card history-card">
458
+ <h3 class="card-title">Recently Processed Tasks</h3>
459
+ <table class="recent-tasks">
460
+ <thead>
461
+ <tr>
462
+ <th>Task ID</th>
463
+ <th>Request Time</th>
464
+ <th>Processing Time</th>
465
+ </tr>
466
+ </thead>
467
+ <tbody>
468
+ {tasks_html}
469
+ </tbody>
470
+ </table>
471
+ </div>
472
+ </div>
473
+ """
474
+
475
+ def launch_workers():
476
+ """Launch worker threads"""
477
+ global running
478
+ running = True
479
+
480
+ for _ in range(worker_threads):
481
+ worker = threading.Thread(target=queue_processor)
482
+ worker.daemon = True
483
+ worker.start()
484
+
485
+ # Custom CSS
486
+ custom_css = """
487
+ .container {
488
+ max-width: 1200px;
489
+ margin: 0 auto;
490
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
491
+ }
492
+
493
+ .dashboard {
494
+ display: flex;
495
+ flex-direction: column;
496
+ gap: 20px;
497
+ }
498
+
499
+ .card-title {
500
+ color: #333;
501
+ border-bottom: 2px solid #ddd;
502
+ padding-bottom: 10px;
503
+ margin-top: 0;
504
+ }
505
+
506
+ .status-card, .queue-info-card {
507
+ background: #fff;
508
+ border-radius: 12px;
509
+ padding: 20px;
510
+ margin: 10px 0;
511
+ box-shadow: 0 4px 15px rgba(0,0,0,0.08);
512
+ }
513
+
514
+ .main-card {
515
+ border-top: 5px solid #4285f4;
516
+ }
517
+
518
+ .history-card {
519
+ border-top: 5px solid #34a853;
520
+ }
521
+
522
+ .status-card.success {
523
+ background: #e7f5e7;
524
+ border-left: 5px solid #28a745;
525
+ }
526
+
527
+ .status-card.error {
528
+ background: #f8d7da;
529
+ border-left: 5px solid #dc3545;
530
+ }
531
+
532
+ .error-message {
533
+ color: #dc3545;
534
+ font-weight: bold;
535
+ padding: 10px;
536
+ background: #f8d7da;
537
+ border-radius: 5px;
538
+ }
539
+
540
+ .notice {
541
+ color: #0c5460;
542
+ background-color: #d1ecf1;
543
+ padding: 10px;
544
+ border-radius: 5px;
545
+ }
546
+
547
+ .queue-stats {
548
+ display: flex;
549
+ justify-content: space-around;
550
+ margin: 20px 0;
551
+ }
552
+
553
+ .stat-item {
554
+ text-align: center;
555
+ padding: 15px;
556
+ background: #f8f9fa;
557
+ border-radius: 10px;
558
+ min-width: 120px;
559
+ transition: transform 0.3s ease;
560
+ }
561
+
562
+ .stat-item:hover {
563
+ transform: translateY(-5px);
564
+ box-shadow: 0 5px 15px rgba(0,0,0,0.1);
565
+ }
566
+
567
+ .stat-value {
568
+ font-size: 32px;
569
+ font-weight: bold;
570
+ color: #4285f4;
571
+ margin-bottom: 5px;
572
+ }
573
+
574
+ .stat-label {
575
+ color: #5f6368;
576
+ font-size: 16px;
577
+ }
578
+
579
+ .wait-time {
580
+ text-align: center;
581
+ margin: 20px 0;
582
+ padding: 15px;
583
+ background: #f1f3f4;
584
+ border-radius: 8px;
585
+ font-size: 18px;
586
+ }
587
+
588
+ .last-update {
589
+ color: #80868b;
590
+ margin-top: 10px;
591
+ margin-bottom: 0;
592
+ }
593
+
594
+ .recent-tasks {
595
+ width: 100%;
596
+ border-collapse: collapse;
597
+ margin-top: 15px;
598
+ background: white;
599
+ box-shadow: 0 1px 3px rgba(0,0,0,0.05);
600
+ }
601
+
602
+ .recent-tasks th, .recent-tasks td {
603
+ border: 1px solid #e0e0e0;
604
+ padding: 12px 15px;
605
+ text-align: center;
606
+ }
607
+
608
+ .recent-tasks th {
609
+ background-color: #f1f3f4;
610
+ color: #202124;
611
+ font-weight: 500;
612
+ }
613
+
614
+ .recent-tasks tbody tr:hover {
615
+ background-color: #f8f9fa;
616
+ }
617
+
618
+ .tabs {
619
+ margin-top: 20px;
620
+ }
621
+
622
+ button.primary {
623
+ background-color: #4285f4;
624
+ color: white;
625
+ padding: 10px 20px;
626
+ border: none;
627
+ border-radius: 4px;
628
+ cursor: pointer;
629
+ font-size: 16px;
630
+ font-weight: 500;
631
+ transition: background-color 0.3s;
632
+ }
633
+
634
+ button.primary:hover {
635
+ background-color: #3367d6;
636
+ }
637
+ """
638
+
639
+ # Initialize and launch worker threads
640
+ launch_workers()
641
+
642
+ # Create Gradio interface
643
+ with gr.Blocks(css=custom_css) as demo:
644
+ gr.Markdown("# Code Evaluation Service")
645
+ gr.Markdown("Code evaluation service supporting multiple programming languages, using a queue mechanism to process requests")
646
+
647
+ with gr.Row():
648
+ with gr.Column(scale=3):
649
+ # Queue status info card
650
+ queue_info_html = gr.HTML()
651
+ refresh_queue_btn = gr.Button("Refresh Queue Status", variant="primary")
652
+
653
+ # Hidden API interface components
654
+ with gr.Row(visible=False):
655
+ api_input = gr.JSON()
656
+ api_output = gr.JSON()
657
+
658
+ # Define update function
659
+ def update_queue_info():
660
+ return ui_get_queue_info()
661
+
662
+ # Update queue info periodically
663
+ demo.load(update_queue_info, None, queue_info_html, every=3)
664
+
665
+ # Refresh button event
666
+ refresh_queue_btn.click(update_queue_info, None, queue_info_html)
667
+
668
+ # Add evaluation endpoint compatible with original interface
669
+ demo.queue()
670
+ evaluate_endpoint = demo.load(fn=synchronous_evaluate, inputs=api_input, outputs=api_output, api_name="evaluate")
671
+
672
+ if __name__ == "__main__":
673
+ try:
674
+ demo.launch()
675
+ finally:
676
+ # Stop worker threads
677
+ running = False
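Note (not part of the commit): a client can exercise the evaluate endpoint defined above with the gradio_client package. This is a minimal sketch only — the Space URL and the sample task are hypothetical, and it assumes the endpoint registered with api_name="evaluate" is exposed to clients as /evaluate.

from gradio_client import Client

client = Client("https://your-space.hf.space")  # hypothetical URL; point at the deployed Space

# One dict per task; the server concatenates prompt + completion + "\n" + tests before running it.
payload = [{
    "language": "python",
    "prompt": "def add(a, b):\n",
    "processed_completions": ["    return a + b\n"],
    "tests": "assert add(1, 2) == 3\n",
}]

result = client.predict(payload, api_name="/evaluate")
print(result)  # each task comes back updated with status, exit_code, stdout and stderr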
requirements.txt ADDED
@@ -0,0 +1 @@
1
+ gradio==4.44.1
src/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # src package
src/containerized_eval.py ADDED
@@ -0,0 +1,98 @@
1
+ from pathlib import Path
2
+ from . import eval_adb
3
+ from . import eval_ruby
4
+ from . import eval_lua
5
+ from . import eval_python
6
+ from . import eval_rust
7
+ from . import eval_julia
8
+ from . import eval_java
10
+ from . import eval_racket
11
+ from . import eval_javascript
12
+ from . import eval_swift
13
+ from . import eval_cpp
14
+ from . import eval_php
15
+ from . import eval_dlang
17
+ from . import eval_r
18
+ from . import eval_fs
19
+ from . import eval_ocaml
20
+ from . import eval_matlab
21
+ from . import eval_hs
22
+ from . import eval_elixir
23
+ from . import eval_clj
24
+ from . import eval_v
25
+ from . import eval_lean
26
+ from . import eval_dart
27
+ from . import eval_go
28
+ import tempfile
29
+
30
+
31
+ EVALUATORS = {
32
+ "ada": (eval_adb.eval_script, ".adb"),
33
+ "rb": (eval_ruby.eval_script, ".rb"),
34
+ "lua": (eval_lua.eval_script, ".lua"),
35
+ "python": (eval_python.eval_script, ".py"),
36
+ "py": (eval_python.eval_script, ".py"),
37
+ "notypes.py": (eval_python.eval_script, ".py"),
38
+ "julia": (eval_julia.eval_script, ".jl"),
39
+ "java" : (eval_java.eval_script, ".java"),
40
+ "rust" : (eval_rust.eval_script, ".rs"),
41
+ "rs" : (eval_rust.eval_script, ".rs"),
42
+ "swift": (eval_swift.eval_script, ".swift"),
44
+ "racket": (eval_racket.eval_script, ".rkt"),
45
+ "rkt": (eval_racket.eval_script, ".rkt"),
46
+ "javascript": (eval_javascript.eval_script, ".js"),
47
+ "js": (eval_javascript.eval_script, ".js"),
48
+ "cpp": (eval_cpp.eval_script, ".cpp"),
49
+ "php": (eval_php.eval_script, ".php"),
50
+ "humaneval_to_dlang.py": (eval_dlang.eval_script, ".d"),
51
+ "d": (eval_dlang.eval_script, ".d"),
52
+ "r": (eval_r.eval_script, ".r"),
53
+ "humaneval_to_r.py": (eval_r.eval_script, ".r"),
54
+ "jl": (eval_julia.eval_script, ".jl"),
55
+ "fs": (eval_fs.eval_script, ".fsx"),
56
+ "ml": (eval_ocaml.eval_script, ".ml"),
57
+ "m": (eval_matlab.eval_script, ".m"),
58
+ "hs": (eval_hs.eval_script, ".hs"),
59
+ "elixir": (eval_elixir.eval_script, ".exs"),
60
+ "clj": (eval_clj.eval_script, ".clj"),
61
+ "coq": (eval_v.eval_script, ".v"),
62
+ "lean": (eval_lean.eval_script, ".lean"),
63
+ "dart": (eval_dart.eval_script, ".dart"),
64
+ "go": (eval_go.eval_script, ".go"),
65
+ "go_test.go": (eval_go.eval_script, "_test.go"),
66
+ }
67
+
68
+ def eval_string_script(language, program):
69
+ if language in EVALUATORS:
70
+ (eval_script, file_ext) = EVALUATORS[language]
71
+ else:
72
+ eval_module = __import__(f"eval_{language}" if language != "go_test.go" else "eval_go")
73
+ eval_script = eval_module.eval_script
74
+ file_ext = f".{language}" if language != "go_test.go" else "_test.go"
75
+ with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f:
76
+ f.write(program.encode("utf-8"))
77
+ f.flush()
78
+ result = eval_script(Path(f.name))
79
+ # Only save the first 2K of output from the running program. Any further
80
+ # output is very likely an exceptionally long stack trace or a long
81
+ # series of prints.
82
+ if type(result["stdout"]) == bytes:
83
+ result["stdout"] = result["stdout"].decode("utf-8", errors="ignore")
84
+ if result["stdout"] is None:
85
+ result["stdout"] = ""
86
+ if result["stderr"] is None:
87
+ result["stderr"] = ""
88
+ if type(result["stderr"]) == bytes:
89
+ result["stderr"] = result["stderr"].decode("utf-8", errors="ignore")
90
+ assert type(result["stdout"]) == str
91
+ assert type(result["stderr"]) == str
92
+ return {
93
+ "program": program,
94
+ "stdout": result['stdout'].replace("!!int", "")[:2048],
95
+ "stderr": result['stderr'][:2048],
96
+ "exit_code": result['exit_code'],
97
+ "status": result['status']
98
+ }
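Note (not part of the commit): eval_string_script can also be exercised directly. A minimal sketch, assuming the Python evaluator's interpreter is available inside the container (the sample program is hypothetical):

from src.containerized_eval import eval_string_script

program = "def add(a, b):\n    return a + b\n\nassert add(1, 2) == 3\n"
result = eval_string_script("python", program)
print(result["status"], result["exit_code"])  # expected: OK 0
# The result also carries "program", plus "stdout" and "stderr" truncated to 2048 characters.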
src/eval_adb.py ADDED
@@ -0,0 +1,64 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+ from src.generic_eval import main
4
+
5
+
6
+ LANG_NAME = "Ada"
7
+ LANG_EXT = ".adb"
8
+
9
+
10
+ def eval_script(path: Path):
11
+ working_dir: Path = path.parent / (path.stem + "_tmp")
12
+ working_dir.mkdir()
13
+ chop_result = run(["gnatchop", "-w", path, working_dir])
14
+ if chop_result.exit_code != 0:
15
+ return {
16
+ "status": "SyntaxError (gnatchop)",
17
+ "exit_code": chop_result.exit_code,
18
+ "stdout": chop_result.stdout,
19
+ "stderr": chop_result.stderr,
20
+ }
21
+
22
+ build_result = run(
23
+ [
24
+ "gnatmake",
25
+ "-gnatW8",
26
+ "main.adb",
27
+ "-o",
28
+ "main",
29
+ "-g",
30
+ "-j0",
31
+ "-gnata",
32
+ "-gnat2022",
33
+ "-gnateE",
34
+ "-bargs",
35
+ "-Es",
36
+ ],
37
+ cwd=str(working_dir),
38
+ )
39
+ if build_result.exit_code != 0:
40
+ return {
41
+ "status": "SyntaxError (gnatmake)",
42
+ "exit_code": build_result.exit_code,
43
+ "stdout": build_result.stdout,
44
+ "stderr": build_result.stderr,
45
+ }
46
+
47
+ status = "OK"
48
+ run_result = run(["./main"], cwd=str(working_dir))
49
+
50
+ if run_result.timeout:
51
+ status = "Timeout"
52
+ elif run_result.exit_code != 0:
53
+ status = "Exception"
54
+
55
+ return {
56
+ "status": status,
57
+ "exit_code": run_result.exit_code,
58
+ "stdout": run_result.stdout,
59
+ "stderr": run_result.stderr,
60
+ }
61
+
62
+
63
+ if __name__ == "__main__":
64
+ main(eval_script, LANG_NAME, LANG_EXT)
src/eval_clj.py ADDED
@@ -0,0 +1,42 @@
1
+ """
2
+ Evaluates a generated Clojure program (.clj).
3
+ """
4
+ import os
5
+ import tempfile
6
+ from pathlib import Path
7
+ from src.safe_subprocess import run
8
+ from src.libeval import run_without_exn
9
+
10
+
11
+ def eval_script(path: Path):
12
+ # Create environment with a writable temporary directory for Clojure cache
13
+ temp_dir = tempfile.mkdtemp(prefix="clojure_home_")
14
+ env = os.environ.copy()
15
+ env["XDG_CONFIG_HOME"] = temp_dir # Set XDG_CONFIG_HOME for Clojure cache
16
+ env["XDG_DATA_HOME"] = temp_dir # Set XDG_DATA_HOME for Clojure data
17
+ env["XDG_CACHE_HOME"] = temp_dir # Set XDG_CACHE_HOME for caches
18
+
19
+ # Run Clojure with the custom environment
20
+ result = run(
21
+ ["clojure", "-J-Dclojure.main.report=stderr", "-M", str(path)],
22
+ env=env
23
+ )
24
+
25
+ if result.timeout:
26
+ status = "Timeout"
27
+ elif result.exit_code != 0:
28
+ status = "Exception"
29
+ elif "\n0 failures, 0 errors.\n" in result.stdout:
30
+ status = "OK"
31
+ else: # test failure
32
+ status = "Exception"
33
+
34
+ return {
35
+ "status": status,
36
+ "exit_code": result.exit_code,
37
+ "stdout": result.stdout,
38
+ "stderr": result.stderr,
39
+ }
40
+
41
+ if __name__ == "__main__":
42
+ print("This module is not meant to be executed directly.")
src/eval_cpp.py ADDED
@@ -0,0 +1,40 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+ from src.generic_eval import main
4
+
5
+ LANG_NAME = "C++"
6
+ LANG_EXT = ".cpp"
7
+
8
+
9
+ def eval_script(path: Path):
10
+ basename = ".".join(str(path).split(".")[:-1])
11
+ build_result = run(["g++", path, "-o", basename, "-std=c++17"])
12
+ if build_result.exit_code != 0:
13
+ return {
14
+ "status": "SyntaxError",
15
+ "exit_code": build_result.exit_code,
16
+ "stdout": build_result.stdout,
17
+ "stderr": build_result.stderr,
18
+ }
19
+
20
+ run_result = run([basename])
21
+ if "In file included from /shared/centos7/gcc/9.2.0-skylake/" in run_result.stderr:
22
+ raise Exception("Skylake bug encountered")
23
+ if "/4.8.2" in run_result.stderr:
24
+ raise Exception("Ancient compiler encountered")
25
+ if run_result.timeout:
26
+ status = "Timeout"
27
+ elif run_result.exit_code != 0:
28
+ status = "Exception"
29
+ else:
30
+ status = "OK"
31
+ return {
32
+ "status": status,
33
+ "exit_code": run_result.exit_code,
34
+ "stdout": run_result.stdout,
35
+ "stderr": run_result.stderr,
36
+ }
37
+
38
+
39
+ if __name__ == "__main__":
40
+ main(eval_script, LANG_NAME, LANG_EXT)
src/eval_cs.py ADDED
@@ -0,0 +1,65 @@
1
+ import os
2
+ import subprocess
3
+ import tempfile
4
+ from pathlib import Path
5
+
6
+ from src.generic_eval import main
7
+
8
+ LANG_NAME = "CSharp"
9
+ LANG_EXT = ".cs"
10
+
11
+ #Following files have problems:
12
+ #137,
13
+ #22: Any
14
+ #148: Elipsis
15
+
16
+ def eval_script(path: str):
17
+ if ".cs" not in path.name:
18
+ return
19
+ basename = ".".join(str(path).split(".")[:-1])
20
+ binaryname = basename + ".exe"
21
+ build = subprocess.run(["csc", "/d:DEBUG", "-r:System.Numerics.dll", path, f"/out:{binaryname}"], capture_output=True)
22
+ status = None
23
+ returncode = -1
24
+ output = None
25
+ if build.returncode != 0:
26
+ # Well, it's a compile error. May be a type error or
27
+ # something. But, why break the set convention
28
+ status = "SyntaxError"
29
+ returncode = build.returncode
30
+ output = build
31
+ else:
32
+ try:
33
+ output = subprocess.run(["mono", binaryname], env={"PATH": os.getenv("PATH"), "MONO_TRACE_LISTENER":"Console.Error"}, capture_output=True, timeout=5)
34
+ returncode = output.returncode
35
+ output.stderr = str(output.stderr, "utf-8")
36
+ #mono return 0 even when failing
37
+ fail = "System.Diagnostics.DefaultTraceListener.Fail" in output.stderr or "Unhandled Exception" in output.stderr
38
+ output.returncode = 1 if fail else 0
39
+ if output.returncode == 0:
40
+ status = "OK"
41
+ else:
42
+ # Well, it's a panic
43
+ status = "Exception"
44
+ except subprocess.TimeoutExpired as exc:
45
+ status = "Timeout"
46
+ output = exc
47
+ os.remove(binaryname)
48
+
49
+ if output.stdout is not None:
50
+ output.stdout = output.stdout.decode("utf-8")
51
+ else:
52
+ output.stdout = "None"
53
+
54
+ if output.stderr == "":
55
+ output.stderr = "None"
56
+
57
+ return {
58
+ "status": status,
59
+ "exit_code": returncode,
60
+ "stdout": output.stdout,
61
+ "stderr": output.stderr,
62
+ }
63
+
64
+ if __name__ == "__main__":
65
+ main(eval_script, LANG_NAME, LANG_EXT)
src/eval_dart.py ADDED
@@ -0,0 +1,27 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+
5
+ def eval_script(path: Path):
6
+ r = run(["dart", "analyze", "--no-fatal-warnings", str(path)], timeout_seconds=15)
7
+ if r.exit_code != 0:
8
+ return {
9
+ "status": "SyntaxError",
10
+ "exit_code": r.exit_code,
11
+ "stdout": r.stdout,
12
+ "stderr": r.stderr,
13
+ }
14
+
15
+ r = run(["dart", str(path)], timeout_seconds=15)
16
+ if r.timeout:
17
+ status = "Timeout"
18
+ elif r.exit_code == 0:
19
+ status = "OK"
20
+ else:
21
+ status = "Exception"
22
+ return {
23
+ "status": status,
24
+ "exit_code": r.exit_code,
25
+ "stdout": r.stdout,
26
+ "stderr": r.stderr,
27
+ }
src/eval_dfy.py ADDED
@@ -0,0 +1,29 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ # 0 – success
5
+ # 1 – invalid command-line arguments
6
+ # 2 – syntax, parse, or name or type resolution errors
7
+ # 3 – compilation errors
8
+ # 4 – verification errors
9
+
10
+ def eval_script(path: Path):
11
+ r = run(["dafny", "run", str(path)])
12
+ if r.timeout:
13
+ status = "Timeout"
14
+ elif r.exit_code == 0:
15
+ status = "OK"
16
+ elif r.exit_code == 2:
17
+ status = "SyntaxError"
18
+ elif r.exit_code == 3:
19
+ status = "CompilationError"
20
+ elif r.exit_code == 4:
21
+ status = "VerificationError"
22
+ else:
23
+ status = "Exception"
24
+ return {
25
+ "status": status,
26
+ "exit_code": r.exit_code,
27
+ "stdout": r.stdout,
28
+ "stderr": r.stderr,
29
+ }
src/eval_dlang.py ADDED
@@ -0,0 +1,63 @@
1
+ import os
2
+ import subprocess
3
+ from pathlib import Path
4
+ from src.safe_subprocess import run
5
+ import sys
6
+ import re
7
+
8
+ ENABLE_SYNTAX_CHECK = False
9
+
10
+ def eval_script(path: Path):
11
+ result = run(["rdmd", "-unittest", str(path)], timeout_seconds=15)
12
+ if "might not be correctly installed" in result.stderr:
13
+ raise Exception("D is not correctly installed")
14
+
15
+ if result.timeout:
16
+ status = "Timeout"
17
+ elif result.exit_code == 0:
18
+ status = "OK"
19
+ elif "Error:" in result.stderr:
20
+ status = "SyntaxError"
21
+ else:
22
+ status = "Exception"
23
+
24
+ return {
25
+ "status": status,
26
+ "exit_code": result.exit_code,
27
+ "stdout": result.stdout,
28
+ "stderr": result.stderr,
29
+ }
30
+
31
+ DIR = "d-keep-code_davinci_001_temp_0.2"
32
+ def main():
33
+ directory = Path(Path(__file__).parent, "..", "datasets", DIR).resolve()
34
+
35
+ count = {"OK": 0, "Timeout": 0, "Exception": 0, "SyntaxError": 0}
36
+ for filename in os.listdir(directory):
37
+ path = Path.joinpath(directory, filename)
38
+ r = eval_script(path)
39
+ status = r["status"]
40
+ count[status] += 1
41
+
42
+ if ENABLE_SYNTAX_CHECK and status == "SyntaxError":
43
+ error_msgs = r["stderr"].split("\n")
44
+ with open(path) as source_file:
45
+ lines = source_file.readlines()
46
+ unittest_line_start = lines.index("unittest\n")
47
+ unittest_line_end = len(lines)
48
+ for err_msg_line in error_msgs:
49
+ matched_parts = re.match(r"(\/?.*?\.[\w:]+\/.*.d)\(([0-9]+)\): Error: (.*)", err_msg_line[2:-1])
50
+ _file, line_num = matched_parts[1], int(matched_parts[2])
51
+ if unittest_line_start <= line_num and line_num <= unittest_line_end:
52
+ print("===============")
53
+ print(path, "contains error in unit test part")
54
+ print(error_msgs)
55
+ print("===============")
56
+
57
+ filename = filename.split(".")[0]
58
+ print(f"Dlang,{filename},{status}")
59
+
60
+ print(DIR + ":" + str(count))
61
+
62
+ if __name__ == "__main__":
63
+ main()
src/eval_elixir.py ADDED
@@ -0,0 +1,37 @@
1
+ import argparse
2
+ from sys import exit
3
+ import subprocess
4
+ from pathlib import Path
5
+ from src.generic_eval import main as gmain
6
+
7
+
8
+ def eval_script(path: Path):
9
+ try:
10
+ # Assumes exit-code 0 is all okay
11
+ output = subprocess.run(["elixir", str(path)], capture_output=True, timeout=5)
12
+
13
+ if output.returncode == 0:
14
+ status = "OK"
15
+ else:
16
+ outmessage = str(output)
17
+ if "Assertion with == failed" in outmessage:
18
+ status = "AssertionError"
19
+ elif "SyntaxError" in outmessage:
20
+ status = "SyntaxError"
21
+ else:
22
+ status = "Exception"
23
+ returncode = output.returncode
24
+ except subprocess.TimeoutExpired as exc:
25
+ status = "Timeout"
26
+ output = exc
27
+ returncode = -1
28
+ return {
29
+ "status": status,
30
+ "exit_code": returncode,
31
+ "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
32
+ "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
33
+ }
34
+
35
+
36
+ if __name__ == "__main__":
37
+ gmain(eval_script, "Elixir", ".exs")
src/eval_fs.py ADDED
@@ -0,0 +1,17 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path: Path):
5
+ r = run(["dotnet", "fsi", "-d:DEBUG", str(path)])
6
+ if r.timeout:
7
+ status = "Timeout"
8
+ elif r.exit_code == 0:
9
+ status = "OK"
10
+ else:
11
+ status = "Exception"
12
+ return {
13
+ "status" : status,
14
+ "exit_code": r.exit_code,
15
+ "stdout": r.stdout,
16
+ "stderr": r.stderr,
17
+ }
src/eval_go.py ADDED
@@ -0,0 +1,52 @@
1
+ import argparse
2
+ from sys import exit
3
+ import subprocess
4
+ from pathlib import Path
5
+ import os
6
+ import tempfile
7
+ from src.generic_eval import main as gmain
8
+
9
+
10
+ def eval_script(path: Path):
11
+ status = None
12
+ stdout = None
13
+ stderr = None
14
+ exit_code = None
15
+ try:
16
+ # Create a temporary directory for the Go build cache
17
+ with tempfile.TemporaryDirectory() as temp_dir:
18
+ # Set Go environment variables so the build cache stays inside the temp directory
19
+ env = os.environ.copy()
20
+ env["GOCACHE"] = os.path.join(temp_dir, "go-build")
21
+ env["GOPATH"] = os.path.join(temp_dir, "gopath")
22
+
23
+ build = subprocess.run(["go", "test", path],
24
+ env=env,
25
+ timeout=30,
26
+ stdout=subprocess.PIPE,
27
+ stderr=subprocess.PIPE)
28
+
29
+ stdout = build.stdout.decode("utf-8", errors="ignore")
30
+ stderr = build.stderr.decode("utf-8", errors="ignore")
31
+ exit_code = build.returncode
32
+ # write to stderr just so that we can redirect stdout to a csv
33
+
34
+ if "[setup failed]" in stdout or "[build failed]" in stdout:
35
+ status = "SyntaxError"
36
+ elif "FAIL" in stdout:
37
+ status = "Exception"
38
+ else:
39
+ status = "OK"
40
+ except subprocess.TimeoutExpired:
41
+ status = "Timeout"
42
+
43
+ return {
44
+ "status": status,
45
+ "exit_code": exit_code,
46
+ "stdout": stdout,
47
+ "stderr": stderr,
48
+ }
49
+
50
+
51
+ if __name__ == "__main__":
52
+ gmain(eval_script, 'Go', '.go')
src/eval_hs.py ADDED
@@ -0,0 +1,19 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path: Path):
5
+ r = run(["runghc", str(path)])
6
+ if r.timeout:
7
+ status = "Timeout"
8
+ elif r.exit_code == 0:
9
+ status = "OK"
10
+ elif "Syntax error" in r.stderr:
11
+ status = "SyntaxError"
12
+ else:
13
+ status = "Exception"
14
+ return {
15
+ "status": status,
16
+ "exit_code": r.exit_code,
17
+ "stdout": r.stdout,
18
+ "stderr": r.stderr,
19
+ }
src/eval_java.py ADDED
@@ -0,0 +1,50 @@
1
+ import os
2
+ import tempfile
3
+ from src.safe_subprocess import run
4
+ from pathlib import Path
5
+ from src.generic_eval import main
6
+
7
+ LANG_NAME = "Java"
8
+ LANG_EXT = ".java"
9
+
10
+ #Following files have problems:
11
+ #137,
12
+ #22: Any
13
+ #148: Ellipsis
14
+
15
+ def eval_script(path: Path):
16
+
17
+ sys_env = os.environ.copy()
18
+ javatuples_path = Path("/usr/multiple/javatuples-1.2.jar")
19
+
20
+ sys_env["CLASSPATH"] = f"{javatuples_path}"
21
+
22
+ with tempfile.TemporaryDirectory() as outdir:
23
+ # Each Java file contains a class with the same name `JAVA_CLASS_NAME`
24
+ # Hence, javac will write the same JAVA_CLASS_NAME.class file for each problem
25
+ #Write class for each problem to a different temp dir
26
+ #Use UTF8 encoding with javac
27
+ result = run(["javac", "-encoding", "UTF8", "-d", outdir, path], env=sys_env)
28
+
29
+ if result.exit_code != 0:
30
+ # Well, it's a compile error. May be a type error or
31
+ # something. But, why break the set convention
32
+ status = "SyntaxError"
33
+ else:
34
+ result = run(["java", "-ea", "-cp", f"{outdir}:{javatuples_path}", "Problem"], env = sys_env)
35
+ if result.timeout:
36
+ status = "Timeout"
37
+ elif result.exit_code == 0:
38
+ status = "OK"
39
+ else:
40
+ status = "Exception"
41
+
42
+ return {
43
+ "status": status,
44
+ "exit_code": result.exit_code,
45
+ "stdout": result.stdout,
46
+ "stderr": result.stderr,
47
+ }
48
+
49
+ if __name__ == "__main__":
50
+ main(eval_script, LANG_NAME, LANG_EXT)
src/eval_javascript.py ADDED
@@ -0,0 +1,49 @@
1
+ import os
2
+ import subprocess
3
+ from pathlib import Path
4
+
5
+ def eval_script(path: Path):
6
+ try:
7
+ # Assumes exit-code 0 is all okay
8
+ output = subprocess.run(["node", str(path)], capture_output=True, timeout=5)
9
+
10
+ if output.returncode == 0:
11
+ status = "OK"
12
+ else:
13
+ outmessage = str(output)
14
+ if 'ERR_ASSERTION' in outmessage:
15
+ status = "AssertionError"
16
+ elif 'SyntaxError' in outmessage:
17
+ status = "SyntaxError"
18
+ elif 'ReferenceError' in outmessage:
19
+ status = "ReferenceError"
20
+ else:
21
+ status = "Exception"
22
+ returncode = output.returncode
23
+ except subprocess.TimeoutExpired as exc:
24
+ status = "Timeout"
25
+ output = exc
26
+ returncode = -1
27
+ except subprocess.CalledProcessError as exc:
28
+ status = "Exception"
29
+ returncode = exc.returncode
30
+ output = exc
31
+ return {
32
+ "status": status,
33
+ "exit_code": returncode,
34
+ "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
35
+ "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
36
+ }
37
+
38
+
39
+
40
+ def main():
41
+ directory = Path(Path(__file__).parent, "..", "datasets", "js-keep-code_davinci_001_temp_0.2").resolve()
42
+
43
+ for filename in os.listdir(directory):
44
+ r = eval_script(Path.joinpath(directory,filename))
45
+ filename = filename.split(".")[0]
46
+ print(f"JavaScript,{filename},{r['status']}")
47
+
48
+ if __name__ == "__main__":
49
+ main()
src/eval_julia.py ADDED
@@ -0,0 +1,21 @@
1
+ from src.safe_subprocess import run
2
+ from pathlib import Path
3
+
4
+ def eval_script(path: Path):
5
+ result = run(["julia", str(path)], timeout_seconds=5)
6
+ if result.timeout:
7
+ status = "Timeout"
8
+ elif result.exit_code == 0:
9
+ status = "OK"
10
+ # TODO(arjun): I would like this to be reviewed more carefully by John.
11
+ elif len(result.stderr) < 1:
12
+ status = "Exception"
13
+ else:
14
+ status = "SyntaxError"
15
+
16
+ return {
17
+ "status": status,
18
+ "exit_code": result.exit_code,
19
+ "stdout": result.stdout,
20
+ "stderr": result.stderr,
21
+ }
src/eval_lean.py ADDED
@@ -0,0 +1,29 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+ import subprocess
4
+
5
+ def eval_script(path: Path):
6
+ # since lean is a theorem prover first and not a programming environment,
7
+ # the return code is always 1. idk.
8
+ try:
9
+ output = subprocess.run(["lean", str(path)], capture_output=True, timeout=5)
10
+ outmessage = str(output)
11
+
12
+ if "error: tactic 'rfl' failed" in outmessage: # :skull:
13
+ status = "AssertionError"
14
+ elif outmessage == "":
15
+ status = "OK"
16
+ else:
17
+ status = "SyntaxError"
18
+ returncode = output.returncode
19
+
20
+ except subprocess.TimeoutExpired as exc:
21
+ status = "Timeout"
22
+ output = exc
23
+ returncode = -1
24
+ return {
25
+ "status": status,
26
+ "exit_code": returncode,
27
+ "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
28
+ "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
29
+ }
src/eval_lua.py ADDED
@@ -0,0 +1,17 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path: Path):
5
+ r = run(["lua", str(path)])
6
+ if r.timeout:
7
+ status = "Timeout"
8
+ elif r.exit_code == 0:
9
+ status = "OK"
10
+ else:
11
+ status = "Exception"
12
+ return {
13
+ "status": status,
14
+ "exit_code": r.exit_code,
15
+ "stdout": r.stdout,
16
+ "stderr": r.stderr,
17
+ }
src/eval_luau.py ADDED
@@ -0,0 +1,26 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+
5
+ def eval_script(path: Path):
6
+ r = run(["luau-analyze", str(path)])
7
+ if r.timeout:
8
+ status = "Timeout"
9
+ elif r.exit_code == 0:
10
+ r = run(["luau", str(path)])
11
+ if r.timeout:
12
+ status = "Timeout"
13
+ elif r.exit_code == 0:
14
+ status = "OK"
15
+ else:
16
+ status = "Exception"
17
+ elif "SyntaxError" in r.stderr:
18
+ status = "SyntaxError"
19
+ else:
20
+ status = "TypeError"
21
+ return {
22
+ "status": status,
23
+ "exit_code": r.exit_code,
24
+ "stdout": r.stdout,
25
+ "stderr": r.stderr,
26
+ }
src/eval_matlab.py ADDED
@@ -0,0 +1,53 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path):
5
+ # Matlab has the requirement that all functions must appear at the end
6
+ # of the file. So we first have to write the call to the test-function at the
7
+ # beginning of the file.
8
+ with open(path, 'r') as f:
9
+ content = f.read()
10
+ content = f"test();\n{content}"
11
+ with open(path, 'w') as f:
12
+ f.write(content)
13
+ filename = path.stem
14
+ parent_dir = path.parent.absolute()
15
+
16
+ # We use the matlab.engine to run the script; however, the way that the
17
+ # matlab engine works requires that we call the script as if it were a
18
+ # member of the matlab.engine object. So we have to write a python script
19
+ # that calls the matlab script. This also ensures that the script is called
20
+ # in a safe-subprocess. Who needs runtime reflection when you have IPC?
21
+ program= f"""
22
+ import matlab.engine
23
+ import io
24
+ import sys
25
+ out = io.StringIO()
26
+ err = io.StringIO()
27
+ eng = matlab.engine.start_matlab()
28
+ eng.addpath(r'{parent_dir}',nargout=0)
29
+ try:
30
+ r = eng.{filename}(nargout=0, stdout=out,stderr=err)
31
+ print(out.getvalue())
32
+ except matlab.engine.MatlabExecutionError as e:
33
+ print(err.getvalue(), file=sys.stderr)
34
+ """
35
+ r = run(["python3", "-c", program], timeout_seconds=30)
36
+
37
+ # This is still somewhat brittle.
38
+ if r.timeout:
39
+ status = "Timeout"
40
+ exit_code = -1
41
+ elif r.stderr == "":
42
+ status = "OK"
43
+ exit_code = 0
44
+ else:
45
+ status = "Exception"
46
+ exit_code = 1
47
+
48
+ return {
49
+ "status": status,
50
+ "exit_code": exit_code,
51
+ "stdout": r.stdout,
52
+ "stderr": r.stderr,
53
+ }
src/eval_ocaml.py ADDED
@@ -0,0 +1,21 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path: Path):
5
+ r = run(["ocaml", str(path)])
6
+ if r.timeout:
7
+ status = "Timeout"
8
+ elif r.exit_code == 0:
9
+ status = "OK"
10
+ elif "Assert_failure" in r.stderr:
11
+ status = "AssertionError"
12
+ elif "Syntax error" in r.stderr:
13
+ status = "SyntaxError"
14
+ else:
15
+ status = "Exception"
16
+ return {
17
+ "status": status,
18
+ "exit_code": r.exit_code,
19
+ "stdout": r.stdout,
20
+ "stderr": r.stderr,
21
+ }
src/eval_php.py ADDED
@@ -0,0 +1,20 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ LANG_NAME = "PHP"
5
+ LANG_EXT = ".php"
6
+
7
+ def eval_script(path: Path):
8
+ r = run(["php", path])
9
+ if "PHP Parse error" in r.stdout:
10
+ status = "SyntaxError"
11
+ elif r.exit_code != 0:
12
+ status = "Exception"
13
+ else:
14
+ status = "OK"
15
+ return {
16
+ "status": status,
17
+ "exit_code": r.exit_code,
18
+ "stdout": r.stdout,
19
+ "stderr": r.stderr,
20
+ }
src/eval_pl.py ADDED
@@ -0,0 +1,20 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path: Path):
5
+ r = run(["perl", path])
6
+
7
+ if r.timeout:
8
+ status = "Timeout"
9
+ elif r.exit_code != 0:
10
+ status = "Exception"
11
+ elif "ERROR" in r.stdout or "ERROR" in r.stderr:
12
+ status = "Exception"
13
+ else:
14
+ status = "OK"
15
+ return {
16
+ "status": status,
17
+ "exit_code": r.exit_code,
18
+ "stdout": r.stdout,
19
+ "stderr": r.stderr,
20
+ }
src/eval_python.py ADDED
@@ -0,0 +1,19 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path: Path):
5
+ r = run(["python3", str(path)])
6
+ if r.timeout:
7
+ status = "Timeout"
8
+ elif r.exit_code == 0:
9
+ status = "OK"
10
+ elif "SyntaxError" in r.stderr:
11
+ status = "SyntaxError"
12
+ else:
13
+ status = "Exception"
14
+ return {
15
+ "status" : status,
16
+ "exit_code": r.exit_code,
17
+ "stdout": r.stdout,
18
+ "stderr": r.stderr,
19
+ }
src/eval_r.py ADDED
@@ -0,0 +1,47 @@
1
+ import os
2
+ import subprocess
3
+ from pathlib import Path
4
+
5
+ def eval_script(path: Path):
6
+ try:
7
+ # Assumes exit-code 0 is all okay
8
+ # Run R on the file, capturing stderr
9
+ output = subprocess.run(["Rscript", str(path)], capture_output=True, timeout=5)
10
+ if output.returncode == 0:
11
+ status = "OK"
12
+ else:
13
+ outmessage = str(output)
14
+ if 'unexpected' in outmessage:
15
+ status = "SyntaxError"
16
+ elif "err=b''" in outmessage:
17
+ status = "AssertionError"
18
+ else:
19
+ status = "Exception"
20
+ returncode = output.returncode
21
+ except subprocess.TimeoutExpired as exc:
22
+ status = "Timeout"
23
+ output = exc
24
+ returncode = -1
25
+ except subprocess.CalledProcessError as exc:
26
+ status = "Exception"
27
+ returncode = exc.returncode
28
+ output = exc
29
+ return {
30
+ "status": status,
31
+ "exit_code": returncode,
32
+ "stdout": output.stdout,
33
+ "stderr": output.stderr
34
+ }
35
+
36
+
37
+
38
+ def main():
39
+ directory = Path(Path(__file__).parent, "..", "datasets", "R-keep-code_davinci_001_temp_0.2").resolve()
40
+
41
+ for filename in os.listdir(directory):
42
+ r = eval_script(Path.joinpath(directory,filename))
43
+ filename = filename.split(".")[0]
44
+ print(f"R,{filename},{r['status']}")
45
+
46
+ if __name__ == "__main__":
47
+ main()
src/eval_racket.py ADDED
@@ -0,0 +1,49 @@
1
+ """
2
+ Evaluates a generated Racket program (.rkt).
3
+ """
4
+ import os
5
+ from pathlib import Path
6
+ from src.safe_subprocess import run
7
+ from src.libeval import run_without_exn
8
+
9
+
10
+ def eval_script(path: Path):
11
+ result = run(["racket", str(path)])
12
+
13
+ if (
14
+ "standard-module-name-resolver: collection not found\n for module path: rackunit"
15
+ in result.stderr
16
+ ):
17
+ print(f"Failed to run evaluation for {path}: rackunit is not installed")
18
+ return None
19
+
20
+ # rackunit produces exit code 0 even if tests fail.
21
+ if len(result.stderr) > 0 or result.exit_code != 0:
22
+ if "read-syntax" in result.stderr:
23
+ status = "SyntaxError"
24
+ else:
25
+ status = "Exception"
26
+ else:
27
+ status = "OK"
28
+
29
+ return {
30
+ "status": status,
31
+ "exit_code": result.exit_code,
32
+ "stdout": result.stdout,
33
+ "stderr": result.stderr,
34
+ }
35
+
36
+
37
+ def main():
38
+ directory = Path(
39
+ Path(__file__).parent, "..", "datasets", "racket-keep-code_davinci_001_temp_0.2"
40
+ ).resolve()
41
+
42
+ for filename in os.listdir(directory):
43
+ r = eval_script(Path.joinpath(directory, filename))
44
+ filename = filename.split(".")[0]
45
+ print(f"Racket,{filename},{r['status']}")
46
+
47
+
48
+ if __name__ == "__main__":
49
+ main()
src/eval_ruby.py ADDED
@@ -0,0 +1,43 @@
1
+ import argparse
2
+ from sys import exit
3
+ import subprocess
4
+ from pathlib import Path
5
+ from src.generic_eval import main as gmain
6
+
7
+ def eval_script(path: Path):
8
+ try:
9
+ # Assumes exit-code 0 is all okay
10
+ # Need check=True for Ruby to pass errors to CalledProcessError
11
+ output = subprocess.run(
12
+ ["ruby", path], check=True, capture_output=True, timeout=5
13
+ )
14
+ if output.returncode == 0:
15
+ status = "OK"
16
+ out = output.stdout
17
+ error = output.stderr
18
+ returncode = 0
19
+ else:
20
+ raise Exception("there's an issue with check = True for Ruby, INVESTIGATE!")
21
+ except subprocess.TimeoutExpired as exc:
22
+ status = "Timeout"
23
+ out = exc.stdout
24
+ error = exc.stderr
25
+ returncode = -1
26
+ except subprocess.CalledProcessError as exc:
27
+ returncode = exc.returncode
28
+ out = exc.stdout
29
+ error = exc.stderr
30
+ #failure with code 1 but no error message is an Exception from Failed tests
31
+ if len(error) < 1:
32
+ status = "Exception"
33
+ else: #everything that prints out an error message is a SyntaxError
34
+ status = "SyntaxError"
35
+ return {
36
+ "status": status,
37
+ "exit_code": returncode,
38
+ "stdout": out,
39
+ "stderr": error,
40
+ }
41
+
42
+ if __name__ == "__main__":
43
+ gmain(eval_script, 'Ruby', '.rb')
src/eval_rust.py ADDED
@@ -0,0 +1,53 @@
1
+ import os
2
+ import subprocess
3
+ import tempfile
4
+ from pathlib import Path
5
+ from src.generic_eval import main
6
+
7
+ LANG_NAME = "Rust"
8
+ LANG_EXT = ".rs"
9
+
10
+ def eval_script(path: Path):
11
+ basename = ".".join(str(path).split(".")[:-1])
12
+ try:
13
+ build = subprocess.run(["rustc", path, "-o", basename], capture_output=True, timeout=15)
14
+ except subprocess.TimeoutExpired as exc:
15
+ return {
16
+ "status": "Timeout",
17
+ "exit_code": -1,
18
+ "stdout": "Compiler timeout",
19
+ "stderr": "Compiler timeout",
20
+ }
21
+ status = None
22
+ returncode = -1
23
+ output = None
24
+ if build.returncode != 0:
25
+ # Well, it's a compile error. May be a type error or
26
+ # something. But, why break the set convention
27
+ status = "SyntaxError"
28
+ returncode = build.returncode
29
+ output = build
30
+ else:
31
+ try:
32
+ # Assumes exit-code 0 is all okay
33
+ output = subprocess.run([basename], capture_output=True, timeout=5)
34
+ returncode = output.returncode
35
+ if output.returncode == 0:
36
+ status = "OK"
37
+ else:
38
+ # Well, it's a panic
39
+ status = "Exception"
40
+ except subprocess.TimeoutExpired as exc:
41
+ status = "Timeout"
42
+ output = exc
43
+ os.remove(basename)
44
+ return {
45
+ "status": status,
46
+ "exit_code": returncode,
47
+ "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
48
+ "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
49
+ }
50
+
51
+ if __name__ == "__main__":
52
+ main(eval_script, LANG_NAME, LANG_EXT)
53
+
src/eval_scala.py ADDED
@@ -0,0 +1,37 @@
1
+ from pathlib import Path
2
+ import tempfile
3
+ from src.safe_subprocess import run
4
+
5
+ LANG_NAME = "Scala"
6
+ LANG_EXT = ".scala"
7
+
8
+ def eval_script(path: Path):
9
+ with tempfile.TemporaryDirectory() as outdir:
10
+ # Each Scala file contains a class with the same name (`JAVA_CLASS_NAME`).
11
+ # Hence, scalac emits the same JAVA_CLASS_NAME.class file for each problem,
12
+ # so we write the class for each problem to a different temp dir.
13
+ build = run(["scalac", "-d", outdir, path], timeout_seconds=45)
14
+ if build.exit_code != 0:
15
+ # Well, it's a compile error. May be a type error or
16
+ # something. But, why break the set convention
17
+ return {
18
+ "status": "SyntaxError",
19
+ "exit_code": build.exit_code,
20
+ "stdout": build.stdout,
21
+ "stderr": build.stderr,
22
+ }
23
+ # "Problem" is the name of the class we emit.
24
+ r = run(["scala", "-cp", f"{outdir}", "Problem"])
25
+ if r.timeout:
26
+ status = "Timeout"
27
+ elif r.exit_code == 0 and r.stderr == "":
28
+ status = "OK"
29
+ else:
30
+ # Well, it's a panic
31
+ status = "Exception"
32
+ return {
33
+ "status": status,
34
+ "exit_code": r.exit_code,
35
+ "stdout": r.stdout,
36
+ "stderr": r.stderr,
37
+ }
src/eval_sh.py ADDED
@@ -0,0 +1,24 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ LANG_NAME = "bash"
5
+ LANG_EXT = ".sh"
6
+
7
+ def eval_script(path: Path):
8
+ # Capture output - will be generated regardless of success, fail, or syntax error
9
+ p = run(["bash", path])
10
+ if p.timeout:
11
+ status = "Timeout"
12
+ elif p.exit_code == 0:
13
+ status = "OK"
14
+ elif "syntax error" in p.stderr:
15
+ status = "SyntaxError"
16
+ else:
17
+ status = "Exception"
18
+
19
+ return {
20
+ "status": status,
21
+ "exit_code": p.exit_code,
22
+ "stdout": p.stdout,
23
+ "stderr": p.stderr,
24
+ }
src/eval_swift.py ADDED
@@ -0,0 +1,30 @@
1
+ import subprocess
2
+ from pathlib import Path
3
+ import os
4
+ from src.safe_subprocess import run
5
+
6
+ def eval_script(path: Path):
7
+ basename = ".".join(str(path).split(".")[:-1])
8
+ r = run(["swiftc", path, "-o", basename], timeout_seconds=45)
9
+ if r.timeout:
10
+ status = "Timeout"
11
+ elif r.exit_code != 0:
12
+ # Well, it's a compile error. May be a type error or
13
+ # something. But, why break the set convention
14
+ status = "SyntaxError"
15
+ else:
16
+ r = run([basename], timeout_seconds=5)
17
+ if r.timeout:
18
+ status = "Timeout"
19
+ elif r.exit_code != 0:
20
+ # Well, it's a panic
21
+ status = "Exception"
22
+ else:
23
+ status = "OK"
24
+ os.remove(basename)
25
+ return {
26
+ "status": status,
27
+ "exit_code": r.exit_code,
28
+ "stdout": r.stdout,
29
+ "stderr": r.stderr,
30
+ }
src/eval_ts.py ADDED
@@ -0,0 +1,33 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+
5
+ def eval_script(path: Path):
6
+ r = run(["tsc", "--target", "esnext", str(path)], timeout_seconds=15)
7
+ if r.exit_code != 0:
8
+ return {
9
+ "status": "SyntaxError",
10
+ "exit_code": r.exit_code,
11
+ "stdout": r.stdout,
12
+ "stderr": r.stderr,
13
+ }
14
+
15
+ r = run(["node", str(path).replace(".ts", ".js")], timeout_seconds=15)
16
+ if r.timeout:
17
+ status = "Timeout"
18
+ elif r.exit_code == 0:
19
+ status = "OK"
20
+ elif "ERR_ASSERTION" in r.stderr:
21
+ status = "AssertionError"
22
+ elif "SyntaxError" in r.stderr:
23
+ status = "SyntaxError"
24
+ elif "ReferenceError" in r.stderr:
25
+ status = "ReferenceError"
26
+ else:
27
+ status = "Exception"
28
+ return {
29
+ "status": status,
30
+ "exit_code": r.exit_code,
31
+ "stdout": r.stdout,
32
+ "stderr": r.stderr,
33
+ }
src/eval_v.py ADDED
@@ -0,0 +1,40 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+ import subprocess
4
+
5
+ # return codes for coqc:
6
+ # 0: compilation goes through
7
+ # 1: some sort of error (nondescript)
8
+
9
+ def eval_script(path: Path):
10
+ cleanup_extensions = ['.vo', '.vok', '.vos']
11
+
12
+ try:
13
+ # sadly there seems to be no way to verify proofs in a coq file without compiling
14
+ output = subprocess.run(["coqc", "-noglob", str(path)], capture_output=True, timeout=5)
15
+ outmessage = str(output)
16
+
17
+ if output.returncode == 0:
18
+ status = "OK"
19
+ # cleanup: remove files generated by coqc
20
+ for ext in cleanup_extensions:
21
+ file_to_remove = path.with_suffix(ext)
22
+ if file_to_remove.exists():
23
+ file_to_remove.unlink()
24
+
25
+ elif "Unable to unify" in outmessage:
26
+ status = "AssertionError"
27
+ else:
28
+ status = "SyntaxError"
29
+ returncode = output.returncode
30
+
31
+ except subprocess.TimeoutExpired as exc:
32
+ status = "Timeout"
33
+ output = exc
34
+ returncode = -1
35
+ return {
36
+ "status": status,
37
+ "exit_code": returncode,
38
+ "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
39
+ "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
40
+ }
src/generic_eval.py ADDED
@@ -0,0 +1,149 @@
1
+ # This is a helper script for evaluating benchmarks that have been translated to
2
+ # different languages.
3
+ #
4
+ # To use this script, call one of the per-language wrappers (e.g. eval_php.py, eval_ruby.py).
5
+ # The --directory argument is required, and tells the script where the benchmarks are located.
6
+ # The --files argument is optional, and takes a list of numbers corresponding to the files to be evaluated.
7
+ #
8
+ # The script will print the results on each benchmark, and also write to results/lang.csv.
9
+ # When the script completes, it will print a summary.
10
+ #
11
+ # Examples
12
+ #
13
+ # To run the entire benchmark suite:
14
+ # python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/
15
+ #
16
+ # To run benchmarks 1, 2, and 3:
17
+ # python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ --files 1 2 3
18
+
19
+ import argparse
20
+ from sys import exit as sysexit
21
+ from pathlib import Path
22
+ import sys
23
+
24
+ def list_files(directory, ext):
25
+ files_unsorted = directory.glob(f"HumanEval_*{ext}")
26
+ # assumption: base filenames are in the format of HumanEval_X_*
27
+ # Where X is a valid number
28
+ def key(s):
29
+ return int(str(s.name).split("_")[1])
30
+ files_sorted = sorted(files_unsorted, key=(lambda s: key(s)))
31
+
32
+ # assumption: there may be missing files, but no extra files
33
+ # so we build files_array where the index corresponds to the file's number,
34
+ # and a missing file is represented by None
35
+ size = key(files_sorted[-1]) + 1
36
+ files_array = [None] * size
37
+ for f in files_sorted:
38
+ k = key(f)
39
+ files_array[k] = f
40
+
41
+ return files_array
42
+
43
+ def main(eval_script, language, extension):
44
+ args = argparse.ArgumentParser()
45
+
46
+ args.add_argument(
47
+ "--directory", type=str, required=True, help="Directory to read benchmarks from"
48
+ )
49
+ args.add_argument(
50
+ "--files",
51
+ type=int,
52
+ nargs="*",
53
+ default=[],
54
+ help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
55
+ )
56
+ args = args.parse_args()
57
+
58
+ directory = Path(args.directory).resolve()
59
+
60
+ files_sorted = list_files(directory, extension)
61
+
62
+ # the directory you specified does not contain the right language
63
+ if len(files_sorted) == 0:
64
+ print(f'The specified directory does not contain files of type {extension}')
65
+ sysexit(1)
66
+
67
+ files_index = []
68
+ if len(args.files) > 0:
69
+ files_index = args.files
70
+ else:
71
+ files_index = range(len(files_sorted))
72
+
73
+ total = 0
74
+ passed = 0
75
+ syntax_error = 0
76
+
77
+ results_file = Path(Path(__file__).parent, "..", "results", language.lower() + ".csv").resolve()
78
+
79
+ with open(results_file, "w") as f:
80
+ for i in files_index:
81
+ filepath = files_sorted[i]
82
+ if filepath is None:
83
+ print("File {} does not exist!".format(i))
84
+ continue
85
+ res = eval_script(filepath)
86
+ output = f"{language},{filepath.stem},{res['status']}\n"
87
+ f.write(output)
88
+ print(output, end="")
89
+ total += 1
90
+ if res['status'] == "OK":
91
+ passed += 1
92
+ elif res['status'] == "SyntaxError":
93
+ syntax_error += 1
94
+ print (f"Total {total}, Syntax Error {syntax_error}, Passed {passed}")
95
+
96
+
97
+
98
+ def main_check_stubs(check_script, language, extension):
99
+ args = argparse.ArgumentParser()
100
+
101
+ args.add_argument(
102
+ "--directory", type=str, required=True, help="Directory to read benchmarks from"
103
+ )
104
+ args.add_argument(
105
+ "--files",
106
+ type=int,
107
+ nargs="*",
108
+ default=[],
109
+ help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
110
+ )
111
+ args = args.parse_args()
112
+
113
+ directory = Path(args.directory).resolve()
114
+
115
+ files_sorted = list_files(directory, extension)
116
+
117
+ # the directory you specified does not contain the right language
118
+ if len(files_sorted) == 0:
119
+ print(f'The specified directory does not contain files of type {extension}')
120
+ sysexit(1)
121
+
122
+ files_index = []
123
+ if len(args.files) > 0:
124
+ files_index = args.files
125
+ else:
126
+ files_index = range(len(files_sorted))
127
+
128
+ total = 0
129
+ passed = 0
130
+
131
+ results_file = Path(Path(__file__).parent, "..", "check_results", language.lower() + ".csv").resolve()
132
+
133
+ with open(results_file, "w") as f:
134
+ for i in files_index:
135
+ filepath = files_sorted[i]
136
+ if filepath is None:
137
+ print("File {} does not exist!".format(i))
138
+ continue
139
+ res = check_script(filepath)
140
+ output = f"{language},{filepath.stem},{res['status']}\n"
141
+ f.write(output)
142
+ print(output, end="")
143
+ total += 1
144
+ if res['status'] == "OK":
145
+ passed += 1
146
+ print (f"Total {total}, Passed {passed}")
147
+
148
+ if total != passed:
149
+ sys.exit(1)
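generic_eval.main is the shared driver: a per-language wrapper only has to supply an eval_script function, a display name, and a file extension. A minimal sketch of a hypothetical wrapper for a new language (eval_lua.py and the lua binary are illustrative, not part of this commit):

    # Hypothetical eval_lua.py showing how a new language plugs into generic_eval.main.
    from pathlib import Path
    from src.generic_eval import main
    from src.safe_subprocess import run

    def eval_script(path: Path):
        r = run(["lua", str(path)], timeout_seconds=5)  # assumes a `lua` interpreter on PATH
        if r.timeout:
            status = "Timeout"
        elif r.exit_code == 0:
            status = "OK"
        else:
            status = "Exception"
        return {
            "status": status,
            "exit_code": r.exit_code,
            "stdout": r.stdout,
            "stderr": r.stderr,
        }

    if __name__ == "__main__":
        main(eval_script, "Lua", ".lua")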
src/libeval.py ADDED
@@ -0,0 +1,40 @@
1
+ import os
2
+ import signal
3
+ import subprocess
4
+ from typing import List
5
+ from . import generic_eval
6
+
7
+ def testing_mail(x, y, z):
8
+ generic_eval.main(x, y, z)
9
+
10
+ def run_without_exn(args: List[str]):
11
+ """
12
+ Runs the given program with a five second timeout. Does not throw an exception
13
+ no matter what happens. The output is a dictionary of the format that we expect
14
+ for our evaluation scripts. The "status" field is "OK" when the exit code is
15
+ zero. If that isn't enough, you may want to tweak the status based on the
16
+ captured stderr and stdout.
17
+ """
18
+ p = subprocess.Popen(
19
+ args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, start_new_session=True
20
+ )
21
+ try:
22
+ stdout, stderr = p.communicate(timeout=5)
23
+ exit_code = p.returncode
24
+ status = "OK" if exit_code == 0 else "Exception"
25
+ except subprocess.TimeoutExpired as exc:
26
+ stdout, stderr = p.stdout.read(), p.stderr.read()
27
+ os.killpg(os.getpgid(p.pid), signal.SIGTERM)
28
+ exit_code = -1
29
+ status = "Timeout"
30
+
31
+ if stdout is None:
32
+ stdout = b""
33
+ if stderr is None:
34
+ stderr = b""
35
+ return {
36
+ "status": status,
37
+ "exit_code": exit_code,
38
+ "stdout": stdout.decode("utf-8", errors="ignore"),
39
+ "stderr": stderr.decode("utf-8", errors="ignore"),
40
+ }
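run_without_exn is a convenience wrapper with a fixed five-second timeout that never raises, returning the same result dictionary as the eval modules. A minimal usage sketch (the command is illustrative):

    # Illustrative usage of libeval.run_without_exn.
    from src.libeval import run_without_exn

    res = run_without_exn(["python3", "-c", "print(6 * 7)"])
    print(res["status"], res["exit_code"], res["stdout"])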
src/safe_subprocess/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ /__pycache__
2
+ /.pytest_cache
src/safe_subprocess/__init__.py ADDED
@@ -0,0 +1,91 @@
1
+ import os
2
+ import signal
3
+ import fcntl
4
+ import time
5
+ import subprocess
6
+ from typing import List
7
+
8
+ MAX_BYTES_PER_READ = 1024
9
+ SLEEP_BETWEEN_READS = 0.1
10
+
11
+
12
+ class Result:
13
+ timeout: int
14
+ exit_code: int
15
+ stdout: str
16
+ stderr: str
17
+
18
+ def __init__(self, timeout, exit_code, stdout, stderr):
19
+ self.timeout = timeout
20
+ self.exit_code = exit_code
21
+ self.stdout = stdout
22
+ self.stderr = stderr
23
+
24
+
25
+ def set_nonblocking(reader):
26
+ fd = reader.fileno()
27
+ fl = fcntl.fcntl(fd, fcntl.F_GETFL)
28
+ fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
29
+
30
+
31
+ def run(
32
+ args: List[str],
33
+ timeout_seconds: int = 15,
34
+ max_output_size: int = 2048,
35
+ env = None,
36
+ cwd: str | None = None
37
+ ) -> Result:
38
+ """
39
+ Runs the given program with arguments. After the timeout elapses, kills the process
40
+ and all other processes in the process group. Captures at most max_output_size bytes
41
+ of stdout and stderr each, and discards any output beyond that.
42
+ """
43
+ p = subprocess.Popen(
44
+ args,
45
+ env=env,
46
+ stdin=subprocess.DEVNULL,
47
+ stdout=subprocess.PIPE,
48
+ stderr=subprocess.PIPE,
49
+ start_new_session=True,
50
+ bufsize=MAX_BYTES_PER_READ,
51
+ cwd=cwd
52
+ )
53
+ set_nonblocking(p.stdout)
54
+ set_nonblocking(p.stderr)
55
+
56
+ process_group_id = os.getpgid(p.pid)
57
+
58
+ # We sleep for 0.1 seconds in each iteration.
59
+ max_iterations = timeout_seconds * 10
60
+ stdout_saved_bytes = []
61
+ stderr_saved_bytes = []
62
+ stdout_bytes_read = 0
63
+ stderr_bytes_read = 0
64
+
65
+ for _ in range(max_iterations):
66
+ this_stdout_read = p.stdout.read(MAX_BYTES_PER_READ)
67
+ this_stderr_read = p.stderr.read(MAX_BYTES_PER_READ)
68
+ # this_stdout_read and this_stderr_read may be None if stdout or stderr
69
+ # are closed. Without these checks, test_close_output fails.
70
+ if this_stdout_read is not None and stdout_bytes_read < max_output_size:
71
+ stdout_saved_bytes.append(this_stdout_read)
72
+ stdout_bytes_read += len(this_stdout_read)
73
+ if this_stderr_read is not None and stderr_bytes_read < max_output_size:
74
+ stderr_saved_bytes.append(this_stderr_read)
75
+ stderr_bytes_read += len(this_stderr_read)
76
+ exit_code = p.poll()
77
+ if exit_code is not None:
78
+ break
79
+ time.sleep(SLEEP_BETWEEN_READS)
80
+
81
+ try:
82
+ # Kills the process group. Without this line, test_fork_once fails.
83
+ os.killpg(process_group_id, signal.SIGKILL)
84
+ except ProcessLookupError:
85
+ pass
86
+
87
+ timeout = exit_code is None
88
+ exit_code = exit_code if exit_code is not None else -1
89
+ stdout = b"".join(stdout_saved_bytes).decode("utf-8", errors="ignore")
90
+ stderr = b"".join(stderr_saved_bytes).decode("utf-8", errors="ignore")
91
+ return Result(timeout=timeout, exit_code=exit_code, stdout=stdout, stderr=stderr)
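The language eval modules call this run() helper instead of subprocess directly so that runaway programs are killed as a process group and their output is capped. A minimal usage sketch (the command and limits are illustrative):

    # Illustrative usage of src.safe_subprocess.run.
    from src.safe_subprocess import run

    result = run(["python3", "-c", "print('hello')"], timeout_seconds=5, max_output_size=1024)
    if result.timeout:
        print("timed out")
    else:
        print(result.exit_code, result.stdout.strip(), result.stderr)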
src/safe_subprocess/evil_programs/block_on_inputs.py ADDED
@@ -0,0 +1,2 @@
1
+ while True:
2
+ input()
src/safe_subprocess/evil_programs/close_outputs.py ADDED
@@ -0,0 +1,7 @@
1
+ import sys
2
+
3
+ print("This is the end")
4
+ sys.stdout.close()
5
+ sys.stderr.close()
6
+ while True:
7
+ pass
src/safe_subprocess/evil_programs/fork_bomb.py ADDED
@@ -0,0 +1,4 @@
1
+ import os
2
+
3
+ while True:
4
+ os.fork()
src/safe_subprocess/evil_programs/fork_once.py ADDED
@@ -0,0 +1,6 @@
1
+ import os
2
+ import time
3
+
4
+ if os.fork() == 0:
5
+ while True:
6
+ time.sleep(60)
src/safe_subprocess/evil_programs/sleep_forever.py ADDED
@@ -0,0 +1,4 @@
1
+ import time
2
+
3
+ while True:
4
+ time.sleep(60)
src/safe_subprocess/evil_programs/unbounded_output.py ADDED
@@ -0,0 +1,4 @@
1
+ b = True
2
+ while True:
3
+ print(b)
4
+ b = not b
src/safe_subprocess/module_test.py ADDED
@@ -0,0 +1,103 @@
1
+ from . import run
2
+ import time
3
+ from pathlib import Path
4
+
5
+ ROOT = Path(__file__).resolve().parent / "evil_programs"
6
+
7
+
8
+ def assert_no_running_evil():
9
+ result = run(
10
+ ["pgrep", "-f", ROOT], timeout_seconds=1, max_output_size=1024
11
+ )
12
+ assert (
13
+ result.exit_code == 1
14
+ ), f"There are still evil processes running: {result.stdout}"
15
+ assert len(result.stderr) == 0
16
+ assert len(result.stdout) == 0
17
+
18
+
19
+ def test_fork_once():
20
+ # The program exits cleanly and immediately. But, it forks a child that runs
21
+ # forever.
22
+ result = run(
23
+ ["python3", ROOT / "fork_once.py"],
24
+ timeout_seconds=2,
25
+ max_output_size=1024,
26
+ )
27
+ assert result.exit_code == 0
28
+ assert result.timeout == False
29
+ assert len(result.stderr) == 0
30
+ assert len(result.stdout) == 0
31
+ assert_no_running_evil()
32
+
33
+
34
+ def test_close_outputs():
35
+ # The program prints to stdout, closes its output, and then runs forever.
36
+ result = run(
37
+ ["python3", ROOT / "close_outputs.py"],
38
+ timeout_seconds=2,
39
+ max_output_size=1024,
40
+ )
41
+ assert result.exit_code == -1
42
+ assert result.timeout == True
43
+ assert len(result.stderr) == 0
44
+ assert result.stdout == "This is the end\n"
45
+ assert_no_running_evil()
46
+
47
+
48
+ def test_unbounded_output():
49
+ result = run(
50
+ ["python3", ROOT / "unbounded_output.py"],
51
+ timeout_seconds=3,
52
+ max_output_size=1024,
53
+ )
54
+ assert result.exit_code == -1
55
+ assert result.timeout == True
56
+ assert len(result.stderr) == 0
57
+ assert len(result.stdout) == 1024
58
+ assert_no_running_evil()
59
+
60
+
61
+ def test_sleep_forever():
62
+ result = run(
63
+ ["python3", ROOT / "sleep_forever.py"],
64
+ timeout_seconds=2,
65
+ max_output_size=1024,
66
+ )
67
+ assert result.exit_code == -1
68
+ assert result.timeout == True
69
+ assert len(result.stderr) == 0
70
+ assert len(result.stdout) == 0
71
+ assert_no_running_evil()
72
+
73
+
74
+ def test_fork_bomb():
75
+ result = run(
76
+ ["python3", ROOT / "fork_bomb.py"],
77
+ timeout_seconds=2,
78
+ max_output_size=1024,
79
+ )
80
+ assert result.exit_code == -1
81
+ assert result.timeout == True
82
+ assert len(result.stderr) == 0
83
+ assert len(result.stdout) == 0
84
+ # Unfortunately, this sleep seems to be necessary. My theories:
85
+ # 1. os.killpg doesn't block until the whole process group is dead.
86
+ # 2. pgrep can produce stale output
87
+ time.sleep(2)
88
+ assert_no_running_evil()
89
+
90
+
91
+ def test_block_on_inputs():
92
+ # We run the subprocess with /dev/null as input. So, any program that tries
93
+ # to read input will error.
94
+ result = run(
95
+ ["python3", ROOT / "block_on_inputs.py"],
96
+ timeout_seconds=2,
97
+ max_output_size=1024,
98
+ )
99
+ assert result.exit_code == 1
100
+ assert result.timeout == False
101
+ assert len(result.stdout) == 0
102
+ assert "EOF when reading a line" in result.stderr
103
+ assert_no_running_evil()
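These tests exercise run() against the evil programs above; they should be runnable with pytest from the repository root (the exact invocation may depend on how the src package is resolved):

    # Assumes pytest is installed and the working directory is the repository root.
    python3 -m pytest src/safe_subprocess/module_test.py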