Upload 48 files
- Dockerfile +14 -0
- README.md +3 -3
- app.py +677 -0
- requirements.txt +1 -0
- src/__init__.py +1 -0
- src/containerized_eval.py +98 -0
- src/eval_adb.py +64 -0
- src/eval_clj.py +42 -0
- src/eval_cpp.py +40 -0
- src/eval_cs.py +65 -0
- src/eval_dart.py +27 -0
- src/eval_dfy.py +29 -0
- src/eval_dlang.py +63 -0
- src/eval_elixir.py +37 -0
- src/eval_fs.py +17 -0
- src/eval_go.py +52 -0
- src/eval_hs.py +19 -0
- src/eval_java.py +50 -0
- src/eval_javascript.py +49 -0
- src/eval_julia.py +21 -0
- src/eval_lean.py +29 -0
- src/eval_lua.py +17 -0
- src/eval_luau.py +26 -0
- src/eval_matlab.py +53 -0
- src/eval_ocaml.py +21 -0
- src/eval_php.py +20 -0
- src/eval_pl.py +20 -0
- src/eval_python.py +19 -0
- src/eval_r.py +47 -0
- src/eval_racket.py +49 -0
- src/eval_ruby.py +43 -0
- src/eval_rust.py +53 -0
- src/eval_scala.py +37 -0
- src/eval_sh.py +24 -0
- src/eval_swift.py +30 -0
- src/eval_ts.py +33 -0
- src/eval_v.py +40 -0
- src/generic_eval.py +149 -0
- src/libeval.py +40 -0
- src/safe_subprocess/.gitignore +2 -0
- src/safe_subprocess/__init__.py +91 -0
- src/safe_subprocess/evil_programs/block_on_inputs.py +2 -0
- src/safe_subprocess/evil_programs/close_outputs.py +7 -0
- src/safe_subprocess/evil_programs/fork_bomb.py +4 -0
- src/safe_subprocess/evil_programs/fork_once.py +6 -0
- src/safe_subprocess/evil_programs/sleep_forever.py +4 -0
- src/safe_subprocess/evil_programs/unbounded_output.py +4 -0
- src/safe_subprocess/module_test.py +103 -0
Dockerfile
ADDED
@@ -0,0 +1,14 @@
FROM ghcr.io/nuprl/multipl-e-evaluation:v3.1

# Install GNAT for Ada language support
RUN apt-get update && apt-get install -y gnat && apt-get clean

# Override the default entrypoint of the base image
ENTRYPOINT []
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
EXPOSE 7860
ENV GRADIO_SERVER_NAME="0.0.0.0"
CMD ["python3", "app.py"]
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
-title:
+title: Docker Test
 emoji: 🔥
-colorFrom:
-colorTo:
+colorFrom: blue
+colorTo: gray
 sdk: docker
 pinned: false
 ---
app.py
ADDED
@@ -0,0 +1,677 @@
import gradio as gr
import json
import importlib
import os
import sys
from pathlib import Path
import concurrent.futures
import multiprocessing
import time
import threading
import queue
import uuid
import numpy as np
from datetime import datetime
from tqdm.auto import tqdm
from src.containerized_eval import eval_string_script

# Add current directory and src directory to module search path
current_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(current_dir, "src")
if current_dir not in sys.path:
    sys.path.append(current_dir)
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Create message queue
task_queue = queue.Queue()
# Dictionary to store task status
task_status = {}
# List to store task history, max 200 tasks
task_history = []
# Lock for shared resources
lock = threading.Lock()
# Number of worker threads
worker_threads = max(1, multiprocessing.cpu_count() // 2)  # Using half the available cores for better stability
# Flag for running background threads
running = True
# Mapping from task type to processing time
task_type_times = {}

def queue_processor():
    """Process tasks in the queue"""
    while running:
        try:
            task_id, input_data, request_time = task_queue.get(timeout=0.1)
            with lock:
                task_status[task_id]['status'] = 'processing'
                task_status[task_id]['start_time'] = time.time()

            if isinstance(input_data, list) and len(input_data) > 0:
                sample_task = input_data[0]
                language = sample_task.get('language', 'unknown') if isinstance(sample_task, dict) else 'unknown'
                task_size = len(input_data)
                task_complexity = _estimate_task_complexity(input_data)

                with lock:
                    task_status[task_id]['estimated_factors'] = {
                        'language': language,
                        'size': task_size,
                        'complexity': task_complexity
                    }

            result = evaluate(input_data)

            end_time = time.time()
            process_time = end_time - task_status[task_id]['start_time']

            with lock:
                task_status[task_id]['status'] = 'completed'
                task_status[task_id]['result'] = result
                task_status[task_id]['end_time'] = end_time
                task_status[task_id]['process_time'] = process_time

                if 'estimated_factors' in task_status[task_id]:
                    factors = task_status[task_id]['estimated_factors']
                    key = f"{factors['language']}_{factors['complexity']}"

                    if key not in task_type_times:
                        task_type_times[key] = []

                    task_type_times[key].append(process_time / factors['size'])
                    if len(task_type_times[key]) > 10:
                        task_type_times[key] = task_type_times[key][-10:]

                task_history.append({
                    'task_id': task_id,
                    'request_time': request_time,
                    'process_time': process_time,
                    'status': 'completed',
                    'factors': task_status[task_id].get('estimated_factors', {})
                })
                while len(task_history) > 200:
                    task_history.pop(0)

            task_queue.task_done()

        except queue.Empty:
            continue
        except Exception as e:
            if 'task_id' in locals():
                with lock:
                    task_status[task_id]['status'] = 'error'
                    task_status[task_id]['error'] = str(e)
                    task_status[task_id]['end_time'] = time.time()
                task_queue.task_done()

def _estimate_task_complexity(tasks):
    """Estimate task complexity

    Returns: 'simple', 'medium', or 'complex'
    """
    total_code_length = 0
    count = 0

    for task in tasks:
        if isinstance(task, dict):
            prompt = task.get('prompt', '')
            tests = task.get('tests', '')
            completions = task.get('processed_completions', [])

            code_length = len(prompt) + len(tests)
            if completions:
                code_length += sum(len(comp) for comp in completions)

            total_code_length += code_length
            count += 1

    if count == 0:
        return 'medium'

    avg_length = total_code_length / count

    if avg_length < 1000:
        return 'simple'
    elif avg_length < 5000:
        return 'medium'
    else:
        return 'complex'

def evaluate(input_data):
    """Main function for code evaluation"""
    try:
        if not isinstance(input_data, list):
            return {"status": "Exception", "error": "Input must be a list"}

        results = []

        # Use a moderate number of workers for all language tests to ensure stability
        # This prevents resource contention regardless of language
        max_workers = max(1, min(multiprocessing.cpu_count() // 2, 4))

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_item = {executor.submit(evaluate_single_case, item): item for item in input_data}
            for future in concurrent.futures.as_completed(future_to_item):
                item = future_to_item[future]
                try:
                    result = future.result()
                    item.update(result)
                    results.append(item)
                except Exception as e:
                    item.update({"status": "Exception", "error": str(e)})
                    results.append(item)
        return results

    except Exception as e:
        return {"status": "Exception", "error": str(e)}

def evaluate_single_case(input_data):
    """Evaluate a single code case"""
    try:
        if not isinstance(input_data, dict):
            return {"status": "Exception", "error": "Input item must be a dictionary"}

        language = input_data.get('language')
        completions = input_data.get('processed_completions', [])

        if not completions:
            return {"status": "Exception", "error": "No code provided"}

        # Use a retry mechanism for all languages for better reliability
        max_retries = 2  # Two retries for all languages

        results = []
        for comp in completions:
            code = input_data.get('prompt') + comp + '\n' + input_data.get('tests')

            # Try up to max_retries + 1 times for all test cases
            for attempt in range(max_retries + 1):
                result = evaluate_code(code, language)

                # If success or last attempt, return/record the result
                if result["status"] == "OK" or attempt == max_retries:
                    if result["status"] == "OK":
                        return result
                    results.append(result)
                    break

                # For retries, briefly wait to allow resources to stabilize
                time.sleep(0.3)

        return results[0]

    except Exception as e:
        return {"status": "Exception", "error": str(e)}

def evaluate_code(code, language):
    """Evaluate code in a specific language"""
    try:
        result = eval_string_script(language, code)
        return result

    except Exception as e:
        return {"status": "Exception", "error": str(e)}

def synchronous_evaluate(input_data):
    """Synchronously evaluate code, compatible with original interface"""
    if isinstance(input_data, list) and len(input_data) > 0:
        sample_task = input_data[0]
        language = sample_task.get('language', 'unknown') if isinstance(sample_task, dict) else 'unknown'
        task_size = len(input_data)
        task_complexity = _estimate_task_complexity(input_data)
    else:
        language = 'unknown'
        task_size = 1
        task_complexity = 'medium'

    estimated_time_per_task = _get_estimated_time_for_task(language, task_complexity)
    estimated_total_time = estimated_time_per_task * task_size

    queue_info = get_queue_status()
    waiting_tasks = queue_info['waiting_tasks']

    task_id = str(uuid.uuid4())
    request_time = time.time()

    with lock:
        task_status[task_id] = {
            'status': 'queued',
            'queued_time': request_time,
            'queue_position': task_queue.qsize() + 1,
            'synchronous': True,
            'estimated_factors': {
                'language': language,
                'size': task_size,
                'complexity': task_complexity
            },
            'estimated_time': estimated_total_time
        }

    task_queue.put((task_id, input_data, request_time))

    while True:
        with lock:
            if task_id in task_status:
                status = task_status[task_id]['status']
                if status == 'completed':
                    result = task_status[task_id]['result']
                    task_status.pop(task_id, None)
                    return result
                elif status == 'error':
                    error = task_status[task_id].get('error', 'Unknown error')
                    task_status.pop(task_id, None)
                    return {"status": "Exception", "error": error}

        time.sleep(0.1)

def _get_estimated_time_for_task(language, complexity):
    """Get estimated processing time for a specific task type"""
    key = f"{language}_{complexity}"

    if key in task_type_times and len(task_type_times[key]) > 0:
        return np.median(task_type_times[key])

    if complexity == 'simple':
        return 1.0
    elif complexity == 'medium':
        return 3.0
    else:  # complex
        return 8.0

def enqueue_task(input_data):
    """Add task to queue"""
    if isinstance(input_data, list) and len(input_data) > 0:
        sample_task = input_data[0]
        language = sample_task.get('language', 'unknown') if isinstance(sample_task, dict) else 'unknown'
        task_size = len(input_data)
        task_complexity = _estimate_task_complexity(input_data)
    else:
        language = 'unknown'
        task_size = 1
        task_complexity = 'medium'

    estimated_time_per_task = _get_estimated_time_for_task(language, task_complexity)
    estimated_total_time = estimated_time_per_task * task_size

    task_id = str(uuid.uuid4())
    request_time = time.time()

    with lock:
        task_status[task_id] = {
            'status': 'queued',
            'queued_time': request_time,
            'queue_position': task_queue.qsize() + 1,
            'estimated_factors': {
                'language': language,
                'size': task_size,
                'complexity': task_complexity
            },
            'estimated_time': estimated_total_time
        }

    queue_info = get_queue_status()
    est_wait = queue_info['estimated_wait']

    task_queue.put((task_id, input_data, request_time))

    return {
        'task_id': task_id,
        'status': 'queued',
        'queue_position': task_status[task_id]['queue_position'],
        'estimated_wait': est_wait,
        'estimated_processing': estimated_total_time
    }

def check_status(task_id):
    """Check task status"""
    with lock:
        if task_id not in task_status:
            return {'status': 'not_found'}

        status_info = task_status[task_id].copy()

        if status_info['status'] in ['completed', 'error'] and time.time() - status_info.get('end_time', 0) > 3600:
            task_status.pop(task_id, None)

    return status_info

def get_queue_status():
    """Get queue status"""
    with lock:
        queued_tasks = [t for t in task_status.values() if t['status'] == 'queued']
        processing_tasks = [t for t in task_status.values() if t['status'] == 'processing']

        queue_size = task_queue.qsize()
        active_tasks = len(processing_tasks)
        waiting_tasks = len(queued_tasks)

        remaining_processing_time = 0
        for task in processing_tasks:
            if 'start_time' in task and 'estimated_time' in task:
                elapsed = time.time() - task['start_time']
                remaining = max(0, task['estimated_time'] - elapsed)
                remaining_processing_time += remaining
            else:
                remaining_processing_time += 2

        if active_tasks > 0:
            remaining_processing_time = remaining_processing_time / min(active_tasks, worker_threads)

        queued_processing_time = 0
        for task in queued_tasks:
            if 'estimated_time' in task:
                queued_processing_time += task['estimated_time']
            else:
                queued_processing_time += 5

        if worker_threads > 0 and queued_processing_time > 0:
            queued_processing_time = queued_processing_time / worker_threads

        estimated_wait = remaining_processing_time + queued_processing_time

        if task_history:
            prediction_ratios = []
            for task in task_history:
                if 'factors' in task and 'estimated_time' in task:
                    prediction_ratios.append(task['process_time'] / task['estimated_time'])

            if prediction_ratios:
                correction_factor = np.median(prediction_ratios)
                correction_factor = max(0.5, min(2.0, correction_factor))
                estimated_wait *= correction_factor

        estimated_wait = max(0.1, estimated_wait)
        if waiting_tasks == 0 and active_tasks == 0:
            estimated_wait = 0

        recent_tasks = task_history[-5:] if task_history else []

        return {
            'queue_size': queue_size,
            'active_tasks': active_tasks,
            'waiting_tasks': waiting_tasks,
            'worker_threads': worker_threads,
            'estimated_wait': estimated_wait,
            'recent_tasks': recent_tasks
        }

def format_time(seconds):
    """Format time into readable format"""
    if seconds < 60:
        return f"{seconds:.1f} seconds"
    elif seconds < 3600:
        minutes = int(seconds / 60)
        seconds = seconds % 60
        return f"{minutes}m {seconds:.1f}s"
    else:
        hours = int(seconds / 3600)
        minutes = int((seconds % 3600) / 60)
        return f"{hours}h {minutes}m"

def ui_get_queue_info():
    """Get queue info for UI"""
    queue_info = get_queue_status()

    tasks_html = ""
    for task in reversed(queue_info['recent_tasks']):
        tasks_html += f"""
        <tr>
            <td>{task['task_id'][:8]}...</td>
            <td>{datetime.fromtimestamp(task['request_time']).strftime('%H:%M:%S')}</td>
            <td>{format_time(task['process_time'])}</td>
        </tr>
        """

    if not tasks_html:
        tasks_html = """
        <tr>
            <td colspan="3" style="text-align: center; padding: 20px;">No historical tasks</td>
        </tr>
        """

    return f"""
    <div class="dashboard">
        <div class="queue-info-card main-card">
            <h3 class="card-title">Queue Status Monitor</h3>
            <div class="queue-stats">
                <div class="stat-item">
                    <div class="stat-value">{queue_info['waiting_tasks']}</div>
                    <div class="stat-label">Waiting</div>
                </div>
                <div class="stat-item">
                    <div class="stat-value">{queue_info['active_tasks']}</div>
                    <div class="stat-label">Processing</div>
                </div>
                <div class="stat-item">
                    <div class="stat-value">{queue_info['worker_threads']}</div>
                    <div class="stat-label">Worker Threads</div>
                </div>
            </div>

            <div class="wait-time">
                <p><b>Current Estimated Wait Time:</b> {format_time(queue_info['estimated_wait'])}</p>
                <p class="last-update"><small>Last update: {datetime.now().strftime('%H:%M:%S')}</small></p>
            </div>
        </div>

        <div class="queue-info-card history-card">
            <h3 class="card-title">Recently Processed Tasks</h3>
            <table class="recent-tasks">
                <thead>
                    <tr>
                        <th>Task ID</th>
                        <th>Request Time</th>
                        <th>Processing Time</th>
                    </tr>
                </thead>
                <tbody>
                    {tasks_html}
                </tbody>
            </table>
        </div>
    </div>
    """

def launch_workers():
    """Launch worker threads"""
    global running
    running = True

    for _ in range(worker_threads):
        worker = threading.Thread(target=queue_processor)
        worker.daemon = True
        worker.start()

# Custom CSS
custom_css = """
.container {
    max-width: 1200px;
    margin: 0 auto;
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}

.dashboard {
    display: flex;
    flex-direction: column;
    gap: 20px;
}

.card-title {
    color: #333;
    border-bottom: 2px solid #ddd;
    padding-bottom: 10px;
    margin-top: 0;
}

.status-card, .queue-info-card {
    background: #fff;
    border-radius: 12px;
    padding: 20px;
    margin: 10px 0;
    box-shadow: 0 4px 15px rgba(0,0,0,0.08);
}

.main-card {
    border-top: 5px solid #4285f4;
}

.history-card {
    border-top: 5px solid #34a853;
}

.status-card.success {
    background: #e7f5e7;
    border-left: 5px solid #28a745;
}

.status-card.error {
    background: #f8d7da;
    border-left: 5px solid #dc3545;
}

.error-message {
    color: #dc3545;
    font-weight: bold;
    padding: 10px;
    background: #f8d7da;
    border-radius: 5px;
}

.notice {
    color: #0c5460;
    background-color: #d1ecf1;
    padding: 10px;
    border-radius: 5px;
}

.queue-stats {
    display: flex;
    justify-content: space-around;
    margin: 20px 0;
}

.stat-item {
    text-align: center;
    padding: 15px;
    background: #f8f9fa;
    border-radius: 10px;
    min-width: 120px;
    transition: transform 0.3s ease;
}

.stat-item:hover {
    transform: translateY(-5px);
    box-shadow: 0 5px 15px rgba(0,0,0,0.1);
}

.stat-value {
    font-size: 32px;
    font-weight: bold;
    color: #4285f4;
    margin-bottom: 5px;
}

.stat-label {
    color: #5f6368;
    font-size: 16px;
}

.wait-time {
    text-align: center;
    margin: 20px 0;
    padding: 15px;
    background: #f1f3f4;
    border-radius: 8px;
    font-size: 18px;
}

.last-update {
    color: #80868b;
    margin-top: 10px;
    margin-bottom: 0;
}

.recent-tasks {
    width: 100%;
    border-collapse: collapse;
    margin-top: 15px;
    background: white;
    box-shadow: 0 1px 3px rgba(0,0,0,0.05);
}

.recent-tasks th, .recent-tasks td {
    border: 1px solid #e0e0e0;
    padding: 12px 15px;
    text-align: center;
}

.recent-tasks th {
    background-color: #f1f3f4;
    color: #202124;
    font-weight: 500;
}

.recent-tasks tbody tr:hover {
    background-color: #f8f9fa;
}

.tabs {
    margin-top: 20px;
}

button.primary {
    background-color: #4285f4;
    color: white;
    padding: 10px 20px;
    border: none;
    border-radius: 4px;
    cursor: pointer;
    font-size: 16px;
    font-weight: 500;
    transition: background-color 0.3s;
}

button.primary:hover {
    background-color: #3367d6;
}
"""

# Initialize and launch worker threads
launch_workers()

# Create Gradio interface
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Code Evaluation Service")
    gr.Markdown("Code evaluation service supporting multiple programming languages, using a queue mechanism to process requests")

    with gr.Row():
        with gr.Column(scale=3):
            # Queue status info card
            queue_info_html = gr.HTML()
            refresh_queue_btn = gr.Button("Refresh Queue Status", variant="primary")

    # Hidden API interface components
    with gr.Row(visible=False):
        api_input = gr.JSON()
        api_output = gr.JSON()

    # Define update function
    def update_queue_info():
        return ui_get_queue_info()

    # Update queue info periodically
    demo.load(update_queue_info, None, queue_info_html, every=3)

    # Refresh button event
    refresh_queue_btn.click(update_queue_info, None, queue_info_html)

    # Add evaluation endpoint compatible with original interface
    demo.queue()
    evaluate_endpoint = demo.load(fn=synchronous_evaluate, inputs=api_input, outputs=api_output, api_name="evaluate")

if __name__ == "__main__":
    try:
        demo.launch()
    finally:
        # Stop worker threads
        running = False
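For orientation, here is a hypothetical client-side sketch of calling the "evaluate" endpoint above with gradio_client. It is not part of the upload: the URL is a placeholder, and the payload keys (language, prompt, tests, processed_completions) mirror what evaluate_single_case reads from each task dict.

from gradio_client import Client

# Placeholder URL; point this at the running Space.
client = Client("http://localhost:7860")

payload = [{
    "language": "python",
    "prompt": "def add(a, b):\n",
    "processed_completions": ["    return a + b\n"],
    "tests": "assert add(2, 3) == 5\n",
}]

# Blocks until the queued task completes, mirroring synchronous_evaluate.
result = client.predict(payload, api_name="/evaluate")
print(result)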
requirements.txt
ADDED
@@ -0,0 +1 @@
gradio==4.44.1
src/__init__.py
ADDED
@@ -0,0 +1 @@
# src package
src/containerized_eval.py
ADDED
@@ -0,0 +1,98 @@
from pathlib import Path
import tempfile

from . import eval_adb
from . import eval_ruby
from . import eval_lua
from . import eval_python
from . import eval_rust
from . import eval_julia
from . import eval_java
from . import eval_racket
from . import eval_javascript
from . import eval_swift
from . import eval_cpp
from . import eval_php
from . import eval_dlang
from . import eval_r
from . import eval_fs
from . import eval_ocaml
from . import eval_matlab
from . import eval_hs
from . import eval_elixir
from . import eval_clj
from . import eval_v
from . import eval_lean
from . import eval_dart
from . import eval_go


EVALUATORS = {
    "ada": (eval_adb.eval_script, ".adb"),
    "rb": (eval_ruby.eval_script, ".rb"),
    "lua": (eval_lua.eval_script, ".lua"),
    "python": (eval_python.eval_script, ".py"),
    "py": (eval_python.eval_script, ".py"),
    "notypes.py": (eval_python.eval_script, ".py"),
    "julia": (eval_julia.eval_script, ".jl"),
    "java": (eval_java.eval_script, ".java"),
    "rust": (eval_rust.eval_script, ".rs"),
    "rs": (eval_rust.eval_script, ".rs"),
    "swift": (eval_swift.eval_script, ".swift"),
    "racket": (eval_racket.eval_script, ".rkt"),
    "rkt": (eval_racket.eval_script, ".rkt"),
    "javascript": (eval_javascript.eval_script, ".js"),
    "js": (eval_javascript.eval_script, ".js"),
    "cpp": (eval_cpp.eval_script, ".cpp"),
    "php": (eval_php.eval_script, ".php"),
    "humaneval_to_dlang.py": (eval_dlang.eval_script, ".d"),
    "d": (eval_dlang.eval_script, ".d"),
    "r": (eval_r.eval_script, ".r"),
    "humaneval_to_r.py": (eval_r.eval_script, ".r"),
    "jl": (eval_julia.eval_script, ".jl"),
    "fs": (eval_fs.eval_script, ".fsx"),
    "ml": (eval_ocaml.eval_script, ".ml"),
    "m": (eval_matlab.eval_script, ".m"),
    "hs": (eval_hs.eval_script, ".hs"),
    "elixir": (eval_elixir.eval_script, ".exs"),
    "clj": (eval_clj.eval_script, ".clj"),
    "coq": (eval_v.eval_script, ".v"),
    "lean": (eval_lean.eval_script, ".lean"),
    "dart": (eval_dart.eval_script, ".dart"),
    "go": (eval_go.eval_script, ".go"),
    "go_test.go": (eval_go.eval_script, "_test.go"),
}

def eval_string_script(language, program):
    if language in EVALUATORS:
        (eval_script, file_ext) = EVALUATORS[language]
    else:
        eval_module = __import__(f"eval_{language}" if language != "go_test.go" else "eval_go")
        eval_script = eval_module.eval_script
        file_ext = f".{language}" if language != "go_test.go" else "_test.go"
    with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f:
        f.write(program.encode("utf-8"))
        f.flush()
        result = eval_script(Path(f.name))
        # Only save the first 2K of output from the running program. Any further
        # output is very likely an exceptionally long stack trace or a long
        # series of prints.
        if type(result["stdout"]) == bytes:
            result["stdout"] = result["stdout"].decode("utf-8", errors="ignore")
        if result["stdout"] is None:
            result["stdout"] = ""
        if result["stderr"] is None:
            result["stderr"] = ""
        if type(result["stderr"]) == bytes:
            result["stderr"] = result["stderr"].decode("utf-8", errors="ignore")
        assert type(result["stdout"]) == str
        assert type(result["stderr"]) == str
        return {
            "program": program,
            "stdout": result['stdout'].replace("!!int", "")[:2048],
            "stderr": result['stderr'][:2048],
            "exit_code": result['exit_code'],
            "status": result['status']
        }
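As a quick orientation, a minimal usage sketch of eval_string_script (not part of the upload, assuming the src package is importable and a Python interpreter is on PATH): the program is written to a temporary file with the registered extension and the truncated status dict is returned.

from src.containerized_eval import eval_string_script

# A toy program plus its tests, concatenated the same way app.py does
# (prompt + completion + "\n" + tests).
program = "def add(a, b):\n    return a + b\n\nassert add(2, 3) == 5\n"

result = eval_string_script("python", program)
print(result["status"])     # "OK" when the asserts pass
print(result["exit_code"])  # 0 on success; stdout/stderr are capped at 2048 chars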
src/eval_adb.py
ADDED
@@ -0,0 +1,64 @@
from pathlib import Path
from src.safe_subprocess import run
from src.generic_eval import main


LANG_NAME = "Ada"
LANG_EXT = ".adb"


def eval_script(path: Path):
    working_dir: Path = path.parent / (path.stem + "_tmp")
    working_dir.mkdir()
    chop_result = run(["gnatchop", "-w", path, working_dir])
    if chop_result.exit_code != 0:
        return {
            "status": "SyntaxError (gnatchop)",
            "exit_code": chop_result.exit_code,
            "stdout": chop_result.stdout,
            "stderr": chop_result.stderr,
        }

    build_result = run(
        [
            "gnatmake",
            "-gnatW8",
            "main.adb",
            "-o",
            "main",
            "-g",
            "-j0",
            "-gnata",
            "-gnat2022",
            "-gnateE",
            "-bargs",
            "-Es",
        ],
        cwd=str(working_dir),
    )
    if build_result.exit_code != 0:
        return {
            "status": "SyntaxError (gnatmake)",
            "exit_code": build_result.exit_code,
            "stdout": build_result.stdout,
            "stderr": build_result.stderr,
        }

    status = "OK"
    run_result = run(["./main"], cwd=str(working_dir))

    if run_result.timeout:
        status = "Timeout"
    elif run_result.exit_code != 0:
        status = "Exception"

    return {
        "status": status,
        "exit_code": run_result.exit_code,
        "stdout": run_result.stdout,
        "stderr": run_result.stderr,
    }


if __name__ == "__main__":
    main(eval_script, LANG_NAME, LANG_EXT)
src/eval_clj.py
ADDED
@@ -0,0 +1,42 @@
"""
Evaluates a generated Clojure program (.clj).
"""
import os
import tempfile
from pathlib import Path
from src.safe_subprocess import run
from src.libeval import run_without_exn


def eval_script(path: Path):
    # Create environment with a writable temporary directory for Clojure cache
    temp_dir = tempfile.mkdtemp(prefix="clojure_home_")
    env = os.environ.copy()
    env["XDG_CONFIG_HOME"] = temp_dir  # Set XDG_CONFIG_HOME for Clojure cache
    env["XDG_DATA_HOME"] = temp_dir    # Set XDG_DATA_HOME for Clojure data
    env["XDG_CACHE_HOME"] = temp_dir   # Set XDG_CACHE_HOME for caches

    # Run Clojure with the custom environment
    result = run(
        ["clojure", "-J-Dclojure.main.report=stderr", "-M", str(path)],
        env=env
    )

    if result.timeout:
        status = "Timeout"
    elif result.exit_code != 0:
        status = "Exception"
    elif "\n0 failures, 0 errors.\n" in result.stdout:
        status = "OK"
    else:  # test failure
        status = "Exception"

    return {
        "status": status,
        "exit_code": result.exit_code,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }

if __name__ == "__main__":
    print("This module is not meant to be executed directly.")
src/eval_cpp.py
ADDED
@@ -0,0 +1,40 @@
from pathlib import Path
from src.safe_subprocess import run
from src.generic_eval import main

LANG_NAME = "C++"
LANG_EXT = ".cpp"


def eval_script(path: Path):
    basename = ".".join(str(path).split(".")[:-1])
    build_result = run(["g++", path, "-o", basename, "-std=c++17"])
    if build_result.exit_code != 0:
        return {
            "status": "SyntaxError",
            "exit_code": build_result.exit_code,
            "stdout": build_result.stdout,
            "stderr": build_result.stderr,
        }

    run_result = run([basename])
    if "In file included from /shared/centos7/gcc/9.2.0-skylake/" in run_result.stderr:
        raise Exception("Skylake bug encountered")
    if "/4.8.2" in run_result.stderr:
        raise Exception("Ancient compiler encountered")
    if run_result.timeout:
        status = "Timeout"
    elif run_result.exit_code != 0:
        status = "Exception"
    else:
        status = "OK"
    return {
        "status": status,
        "exit_code": run_result.exit_code,
        "stdout": run_result.stdout,
        "stderr": run_result.stderr,
    }


if __name__ == "__main__":
    main(eval_script, LANG_NAME, LANG_EXT)
src/eval_cs.py
ADDED
@@ -0,0 +1,65 @@
import os
import subprocess
import tempfile
from pathlib import Path

from src.generic_eval import main

LANG_NAME = "CSharp"
LANG_EXT = ".cs"

# The following files have problems:
# 137,
# 22: Any
# 148: Ellipsis

def eval_script(path: Path):
    if ".cs" not in path.name:
        return
    basename = ".".join(str(path).split(".")[:-1])
    binaryname = basename + ".exe"
    build = subprocess.run(["csc", "/d:DEBUG", "-r:System.Numerics.dll", path, f"/out:{binaryname}"], capture_output=True)
    status = None
    returncode = -1
    output = None
    if build.returncode != 0:
        # Well, it's a compile error. May be a type error or
        # something. But, why break the set convention
        status = "SyntaxError"
        returncode = build.returncode
        output = build
    else:
        try:
            output = subprocess.run(["mono", binaryname], env={"PATH": os.getenv("PATH"), "MONO_TRACE_LISTENER": "Console.Error"}, capture_output=True, timeout=5)
            returncode = output.returncode
            output.stderr = str(output.stderr, "utf-8")
            # mono returns 0 even when failing
            fail = "System.Diagnostics.DefaultTraceListener.Fail" in output.stderr or "Unhandled Exception" in output.stderr
            output.returncode = 1 if fail else 0
            if output.returncode == 0:
                status = "OK"
            else:
                # Well, it's a panic
                status = "Exception"
        except subprocess.TimeoutExpired as exc:
            status = "Timeout"
            output = exc
        os.remove(binaryname)

    if output.stdout is not None:
        output.stdout = output.stdout.decode("utf-8")
    else:
        output.stdout = "None"

    if output.stderr == "":
        output.stderr = "None"

    return {
        "status": status,
        "exit_code": returncode,
        "stdout": output.stdout,
        "stderr": output.stderr,
    }

if __name__ == "__main__":
    main(eval_script, LANG_NAME, LANG_EXT)
src/eval_dart.py
ADDED
@@ -0,0 +1,27 @@
from pathlib import Path
from src.safe_subprocess import run


def eval_script(path: Path):
    r = run(["dart", "analyze", "--no-fatal-warnings", str(path)], timeout_seconds=15)
    if r.exit_code != 0:
        return {
            "status": "SyntaxError",
            "exit_code": r.exit_code,
            "stdout": r.stdout,
            "stderr": r.stderr,
        }

    r = run(["dart", str(path)], timeout_seconds=15)
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0:
        status = "OK"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }
src/eval_dfy.py
ADDED
@@ -0,0 +1,29 @@
from pathlib import Path
from src.safe_subprocess import run

# Dafny exit codes:
# 0 - success
# 1 - invalid command-line arguments
# 2 - syntax, parse, or name or type resolution errors
# 3 - compilation errors
# 4 - verification errors

def eval_script(path: Path):
    r = run(["dafny", "run", str(path)])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0:
        status = "OK"
    elif r.exit_code == 2:
        status = "SyntaxError"
    elif r.exit_code == 3:
        status = "CompilationError"
    elif r.exit_code == 4:
        status = "VerificationError"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }
src/eval_dlang.py
ADDED
@@ -0,0 +1,63 @@
import os
import subprocess
from pathlib import Path
from src.safe_subprocess import run
import sys
import re

ENABLE_SYNTAX_CHECK = False

def eval_script(path: Path):
    result = run(["rdmd", "-unittest", str(path)], timeout_seconds=15)
    if "might not be correctly installed" in result.stderr:
        raise Exception("D is not correctly installed")

    if result.timeout:
        status = "Timeout"
    elif result.exit_code == 0:
        status = "OK"
    elif "Error:" in result.stderr:
        status = "SyntaxError"
    else:
        status = "Exception"

    return {
        "status": status,
        "exit_code": result.exit_code,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }

DIR = "d-keep-code_davinci_001_temp_0.2"
def main():
    directory = Path(Path(__file__).parent, "..", "datasets", DIR).resolve()

    count = {"OK": 0, "Timeout": 0, "Exception": 0, "SyntaxError": 0}
    for filename in os.listdir(directory):
        path = Path.joinpath(directory, filename)
        r = eval_script(path)
        status = r["status"]
        count[status] += 1

        if ENABLE_SYNTAX_CHECK and status == "SyntaxError":
            error_msgs = r["stderr"].split("\n")
            with open(path) as source_file:
                lines = source_file.readlines()
            unittest_line_start = lines.index("unittest\n")
            unittest_line_end = len(lines)
            for err_msg_line in error_msgs:
                matched_parts = re.match(r"(\/?.*?\.[\w:]+\/.*.d)\(([0-9]+)\): Error: (.*)", err_msg_line[2:-1])
                _file, line_num = matched_parts[1], int(matched_parts[2])
                if unittest_line_start <= line_num and line_num <= unittest_line_end:
                    print("===============")
                    print(path, "contains error in unit test part")
                    print(error_msgs)
                    print("===============")

        filename = filename.split(".")[0]
        print(f"Dlang,{filename},{status}")

    print(DIR + ":" + str(count))

if __name__ == "__main__":
    main()
src/eval_elixir.py
ADDED
@@ -0,0 +1,37 @@
import argparse
from sys import exit
import subprocess
from pathlib import Path
from src.generic_eval import main as gmain


def eval_script(path: Path):
    try:
        # Assumes exit-code 0 is all okay
        output = subprocess.run(["elixir", str(path)], capture_output=True, timeout=5)

        if output.returncode == 0:
            status = "OK"
        else:
            outmessage = str(output)
            if "Assertion with == failed" in outmessage:
                status = "AssertionError"
            elif "SyntaxError" in outmessage:
                status = "SyntaxError"
            else:
                status = "Exception"
        returncode = output.returncode
    except subprocess.TimeoutExpired as exc:
        status = "Timeout"
        output = exc
        returncode = -1
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
        "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
    }


if __name__ == "__main__":
    gmain(eval_script, "Elixir", ".exs")
src/eval_fs.py
ADDED
@@ -0,0 +1,17 @@
from pathlib import Path
from src.safe_subprocess import run

def eval_script(path: Path):
    r = run(["dotnet", "fsi", "-d:DEBUG", str(path)])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0:
        status = "OK"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }
src/eval_go.py
ADDED
@@ -0,0 +1,52 @@
import argparse
from sys import exit
import subprocess
from pathlib import Path
import os
import tempfile
from src.generic_eval import main as gmain


def eval_script(path: Path):
    status = None
    stdout = None
    stderr = None
    exit_code = None
    try:
        # Create a temporary directory for the Go build cache
        with tempfile.TemporaryDirectory() as temp_dir:
            # Set the Go environment variables
            env = os.environ.copy()
            env["GOCACHE"] = os.path.join(temp_dir, "go-build")
            env["GOPATH"] = os.path.join(temp_dir, "gopath")

            build = subprocess.run(["go", "test", path],
                                   env=env,
                                   timeout=30,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)

            stdout = build.stdout.decode("utf-8", errors="ignore")
            stderr = build.stderr.decode("utf-8", errors="ignore")
            exit_code = build.returncode
            # write to stderr just so that we can redirect stdout to a csv

            if "[setup failed]" in stdout or "[build failed]" in stdout:
                status = "SyntaxError"
            elif "FAIL" in stdout:
                status = "Exception"
            else:
                status = "OK"
    except subprocess.TimeoutExpired:
        status = "Timeout"

    return {
        "status": status,
        "exit_code": exit_code,
        "stdout": stdout,
        "stderr": stderr,
    }


if __name__ == "__main__":
    gmain(eval_script, 'Go', '.go')
src/eval_hs.py
ADDED
@@ -0,0 +1,19 @@
from pathlib import Path
from src.safe_subprocess import run

def eval_script(path: Path):
    r = run(["runghc", str(path)])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0:
        status = "OK"
    elif "Syntax error" in r.stderr:
        status = "SyntaxError"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }
src/eval_java.py
ADDED
@@ -0,0 +1,50 @@
import os
import tempfile
from src.safe_subprocess import run
from pathlib import Path
from src.generic_eval import main

LANG_NAME = "Java"
LANG_EXT = ".java"

# The following files have problems:
# 137,
# 22: Any
# 148: Ellipsis

def eval_script(path: Path):

    sys_env = os.environ.copy()
    javatuples_path = Path("/usr/multiple/javatuples-1.2.jar")

    sys_env["CLASSPATH"] = f"{javatuples_path}"

    with tempfile.TemporaryDirectory() as outdir:
        # Each Java file contains a class with the same name, JAVA_CLASS_NAME.
        # Hence, javac will write the same JAVA_CLASS_NAME.class file for each problem.
        # Write the class for each problem to a different temp dir.
        # Use UTF8 encoding with javac.
        result = run(["javac", "-encoding", "UTF8", "-d", outdir, path], env=sys_env)

        if result.exit_code != 0:
            # Well, it's a compile error. May be a type error or
            # something. But, why break the set convention
            status = "SyntaxError"
        else:
            result = run(["java", "-ea", "-cp", f"{outdir}:{javatuples_path}", "Problem"], env=sys_env)
            if result.timeout:
                status = "Timeout"
            elif result.exit_code == 0:
                status = "OK"
            else:
                status = "Exception"

    return {
        "status": status,
        "exit_code": result.exit_code,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }

if __name__ == "__main__":
    main(eval_script, LANG_NAME, LANG_EXT)
src/eval_javascript.py
ADDED
@@ -0,0 +1,49 @@
import os
import subprocess
from pathlib import Path

def eval_script(path: Path):
    try:
        # Assumes exit-code 0 is all okay
        output = subprocess.run(["node", str(path)], capture_output=True, timeout=5)

        if output.returncode == 0:
            status = "OK"
        else:
            outmessage = str(output)
            if 'ERR_ASSERTION' in outmessage:
                status = "AssertionError"
            elif 'SyntaxError' in outmessage:
                status = "SyntaxError"
            elif 'ReferenceError' in outmessage:
                status = "ReferenceError"
            else:
                status = "Exception"
        returncode = output.returncode
    except subprocess.TimeoutExpired as exc:
        status = "Timeout"
        output = exc
        returncode = -1
    except subprocess.CalledProcessError as exc:
        status = "Exception"
        returncode = exc.returncode
        output = exc
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
        "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
    }


def main():
    directory = Path(Path(__file__).parent, "..", "datasets", "js-keep-code_davinci_001_temp_0.2").resolve()

    for filename in os.listdir(directory):
        r = eval_script(Path.joinpath(directory, filename))
        filename = filename.split(".")[0]
        print(f"JavaScript,{filename},{r['status']}")

if __name__ == "__main__":
    main()
src/eval_julia.py
ADDED
@@ -0,0 +1,21 @@
from src.safe_subprocess import run
from pathlib import Path

def eval_script(path: Path):
    result = run(["julia", str(path)], timeout_seconds=5)
    if result.timeout:
        status = "Timeout"
    elif result.exit_code == 0:
        status = "OK"
    # TODO(arjun): I would like this to be reviewed more carefully by John.
    elif len(result.stderr) < 1:
        status = "Exception"
    else:
        status = "SyntaxError"

    return {
        "status": status,
        "exit_code": result.exit_code,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }
src/eval_lean.py
ADDED
@@ -0,0 +1,29 @@
from pathlib import Path
from src.safe_subprocess import run
import subprocess

def eval_script(path: Path):
    # since lean is a theorem prover first and not a programming environment,
    # the return code is always 1. idk.
    try:
        output = subprocess.run(["lean", str(path)], capture_output=True, timeout=5)
        outmessage = str(output)

        if "error: tactic 'rfl' failed" in outmessage:  # :skull:
            status = "AssertionError"
        elif outmessage == "":
            status = "OK"
        else:
            status = "SyntaxError"
        returncode = output.returncode

    except subprocess.TimeoutExpired as exc:
        status = "Timeout"
        output = exc
        returncode = -1
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
        "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
    }
src/eval_lua.py
ADDED
@@ -0,0 +1,17 @@
from pathlib import Path
from src.safe_subprocess import run

def eval_script(path: Path):
    r = run(["lua", str(path)])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0:
        status = "OK"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }
src/eval_luau.py
ADDED
@@ -0,0 +1,26 @@
from pathlib import Path
from src.safe_subprocess import run


def eval_script(path: Path):
    r = run(["luau-analyze", str(path)])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0:
        r = run(["luau", str(path)])
        if r.timeout:
            status = "Timeout"
        elif r.exit_code == 0:
            status = "OK"
        else:
            status = "Exception"
    elif "SyntaxError" in r.stderr:
        status = "SyntaxError"
    else:
        status = "TypeError"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }
|
src/eval_matlab.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from src.safe_subprocess import run
|
3 |
+
|
4 |
+
def eval_script(path):
|
5 |
+
# Matlab has the requirement that all functions must appear at the end
|
6 |
+
# of the file. So we first have to write the call to the test-function at the
|
7 |
+
# beginning of the file.
|
8 |
+
with open(path, 'r') as f:
|
9 |
+
content = f.read()
|
10 |
+
content = f"test();\n{content}"
|
11 |
+
with open(path, 'w') as f:
|
12 |
+
f.write(content)
|
13 |
+
filename = path.stem
|
14 |
+
parent_dir = path.parent.absolute()
|
15 |
+
|
16 |
+
# We use the matlab.engine to run the script; however, the way that the
|
17 |
+
# matlab engine works requires that we call the script as if it were a
|
18 |
+
# member of the matlab.engine object. So we have to write a python script
|
19 |
+
# that calls the matlab script. This also ensures that the script is called
|
20 |
+
# in a safe-subprocess. Who needs runtime reflection when you have IPC?
|
21 |
+
program= f"""
|
22 |
+
import matlab.engine
|
23 |
+
import io
|
24 |
+
import sys
|
25 |
+
out = io.StringIO()
|
26 |
+
err = io.StringIO()
|
27 |
+
eng = matlab.engine.start_matlab()
|
28 |
+
eng.addpath(r'{parent_dir}',nargout=0)
|
29 |
+
try:
|
30 |
+
r = eng.{filename}(nargout=0, stdout=out,stderr=err)
|
31 |
+
print(out.getvalue())
|
32 |
+
except matlab.engine.MatlabExecutionError as e:
|
33 |
+
print(err.getvalue(), file=sys.stderr)
|
34 |
+
"""
|
35 |
+
r = run(["python3", "-c", program], timeout_seconds=30)
|
36 |
+
|
37 |
+
# This is still somewhat brittle.
|
38 |
+
if r.timeout:
|
39 |
+
status = "Timeout"
|
40 |
+
exit_code = -1
|
41 |
+
elif r.stderr == "":
|
42 |
+
status = "OK"
|
43 |
+
exit_code = 0
|
44 |
+
else:
|
45 |
+
status = "Exception"
|
46 |
+
exit_code = 1
|
47 |
+
|
48 |
+
return {
|
49 |
+
"status": status,
|
50 |
+
"exit_code": exit_code,
|
51 |
+
"stdout": r.stdout,
|
52 |
+
"stderr": r.stderr,
|
53 |
+
}
|
src/eval_ocaml.py
ADDED
@@ -0,0 +1,21 @@
from pathlib import Path
from src.safe_subprocess import run


def eval_script(path: Path):
    r = run(["ocaml", str(path)])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0:
        status = "OK"
    elif "Assert_failure" in r.stderr:
        status = "AssertionError"
    elif "Syntax error" in r.stderr:
        status = "SyntaxError"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }
src/eval_php.py
ADDED
@@ -0,0 +1,20 @@
from pathlib import Path
from src.safe_subprocess import run

LANG_NAME = "PHP"
LANG_EXT = ".php"


def eval_script(path: Path):
    r = run(["php", str(path)])
    if r.timeout:
        status = "Timeout"
    elif "PHP Parse error" in r.stdout:
        status = "SyntaxError"
    elif r.exit_code != 0:
        status = "Exception"
    else:
        status = "OK"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }
src/eval_pl.py
ADDED
@@ -0,0 +1,20 @@
from pathlib import Path
from src.safe_subprocess import run


def eval_script(path: Path):
    r = run(["perl", str(path)])

    if r.timeout:
        status = "Timeout"
    elif r.exit_code != 0:
        status = "Exception"
    elif "ERROR" in r.stdout or "ERROR" in r.stderr:
        status = "Exception"
    else:
        status = "OK"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }
src/eval_python.py
ADDED
@@ -0,0 +1,19 @@
from pathlib import Path
from src.safe_subprocess import run


def eval_script(path: Path):
    r = run(["python3", str(path)])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0:
        status = "OK"
    elif "SyntaxError" in r.stderr:
        status = "SyntaxError"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }
src/eval_r.py
ADDED
@@ -0,0 +1,47 @@
import os
import subprocess
from pathlib import Path


def eval_script(path: Path):
    try:
        # Assumes exit code 0 means everything is okay.
        # Run R on the file, capturing stdout and stderr.
        output = subprocess.run(["Rscript", str(path)], capture_output=True, timeout=5)
        if output.returncode == 0:
            status = "OK"
        else:
            # Classify the failure from the repr of the CompletedProcess,
            # which includes both captured streams.
            outmessage = str(output)
            if 'unexpected' in outmessage:
                status = "SyntaxError"
            elif "err=b''" in outmessage:
                # Nonzero exit with empty stderr: a failed test assertion.
                status = "AssertionError"
            else:
                status = "Exception"
        returncode = output.returncode
    except subprocess.TimeoutExpired as exc:
        status = "Timeout"
        output = exc
        returncode = -1
    except subprocess.CalledProcessError as exc:
        status = "Exception"
        returncode = exc.returncode
        output = exc
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
        "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
    }


def main():
    directory = Path(Path(__file__).parent, "..", "datasets", "R-keep-code_davinci_001_temp_0.2").resolve()

    for filename in os.listdir(directory):
        r = eval_script(Path.joinpath(directory, filename))
        filename = filename.split(".")[0]
        print(f"R,{filename},{r['status']}")


if __name__ == "__main__":
    main()
src/eval_racket.py
ADDED
@@ -0,0 +1,49 @@
"""
Evaluates a generated Racket program (.rkt).
"""
import os
from pathlib import Path
from src.safe_subprocess import run


def eval_script(path: Path):
    result = run(["racket", str(path)])

    if (
        "standard-module-name-resolver: collection not found\n for module path: rackunit"
        in result.stderr
    ):
        print(f"Failed to run evaluation for {path}: rackunit is not installed")
        return None

    # rackunit produces exit code 0 even if tests fail.
    if len(result.stderr) > 0 or result.exit_code != 0:
        if "read-syntax" in result.stderr:
            status = "SyntaxError"
        else:
            status = "Exception"
    else:
        status = "OK"

    return {
        "status": status,
        "exit_code": result.exit_code,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }


def main():
    directory = Path(
        Path(__file__).parent, "..", "datasets", "racket-keep-code_davinci_001_temp_0.2"
    ).resolve()

    for filename in os.listdir(directory):
        r = eval_script(Path.joinpath(directory, filename))
        if r is None:
            continue
        filename = filename.split(".")[0]
        print(f"Racket,{filename},{r['status']}")


if __name__ == "__main__":
    main()
src/eval_ruby.py
ADDED
@@ -0,0 +1,43 @@
import subprocess
from pathlib import Path
from src.generic_eval import main as gmain


def eval_script(path: Path):
    try:
        # Assumes exit code 0 means everything is okay.
        # check=True is needed so Ruby errors surface as CalledProcessError.
        output = subprocess.run(
            ["ruby", path], check=True, capture_output=True, timeout=5
        )
        if output.returncode == 0:
            status = "OK"
            out = output.stdout
            error = output.stderr
            returncode = 0
        else:
            raise Exception("there's an issue with check=True for Ruby, INVESTIGATE!")
    except subprocess.TimeoutExpired as exc:
        status = "Timeout"
        out = exc.stdout
        error = exc.stderr
        returncode = -1
    except subprocess.CalledProcessError as exc:
        returncode = exc.returncode
        out = exc.stdout
        error = exc.stderr
        # A failure with exit code 1 but no error message is an Exception from
        # failed tests.
        if len(error) < 1:
            status = "Exception"
        else:
            # Everything that prints an error message is a SyntaxError.
            status = "SyntaxError"
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": out,
        "stderr": error,
    }


if __name__ == "__main__":
    gmain(eval_script, 'Ruby', '.rb')
src/eval_rust.py
ADDED
@@ -0,0 +1,53 @@
import os
import subprocess
from pathlib import Path
from src.generic_eval import main

LANG_NAME = "Rust"
LANG_EXT = ".rs"


def eval_script(path: Path):
    basename = ".".join(str(path).split(".")[:-1])
    try:
        build = subprocess.run(["rustc", path, "-o", basename], capture_output=True, timeout=15)
    except subprocess.TimeoutExpired:
        return {
            "status": "Timeout",
            "exit_code": -1,
            "stdout": "Compiler timeout",
            "stderr": "Compiler timeout",
        }
    status = None
    returncode = -1
    output = None
    if build.returncode != 0:
        # It's a compile error. It may be a type error rather than a syntax
        # error, but we keep the established convention.
        status = "SyntaxError"
        returncode = build.returncode
        output = build
    else:
        try:
            # Assumes exit code 0 means everything is okay.
            output = subprocess.run([basename], capture_output=True, timeout=5)
            returncode = output.returncode
            if output.returncode == 0:
                status = "OK"
            else:
                # The program compiled but panicked at runtime.
                status = "Exception"
        except subprocess.TimeoutExpired as exc:
            status = "Timeout"
            output = exc
        os.remove(basename)
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
        "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
    }


if __name__ == "__main__":
    main(eval_script, LANG_NAME, LANG_EXT)
src/eval_scala.py
ADDED
@@ -0,0 +1,37 @@
from pathlib import Path
import tempfile
from src.safe_subprocess import run

LANG_NAME = "Scala"
LANG_EXT = ".scala"


def eval_script(path: Path):
    with tempfile.TemporaryDirectory() as outdir:
        # Each Scala file contains a class with the same name, JAVA_CLASS_NAME,
        # so scalac emits an identically named JAVA_CLASS_NAME.class file for
        # every problem. Hence we write each problem's classes to a different
        # temp dir.
        build = run(["scalac", "-d", outdir, path], timeout_seconds=45)
        if build.exit_code != 0:
            # It's a compile error. It may be a type error rather than a
            # syntax error, but we keep the established convention.
            return {
                "status": "SyntaxError",
                "exit_code": build.exit_code,
                "stdout": build.stdout,
                "stderr": build.stderr,
            }
        # "Problem" is the name of the class we emit.
        r = run(["scala", "-cp", f"{outdir}", "Problem"])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0 and r.stderr == "":
        status = "OK"
    else:
        # The program compiled but raised at runtime.
        status = "Exception"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }
src/eval_sh.py
ADDED
@@ -0,0 +1,24 @@
from pathlib import Path
from src.safe_subprocess import run

LANG_NAME = "bash"
LANG_EXT = ".sh"


def eval_script(path: Path):
    # Capture output - it will be generated regardless of success, failure, or
    # syntax error.
    p = run(["bash", path])
    if p.timeout:
        status = "Timeout"
    elif p.exit_code == 0:
        status = "OK"
    elif "syntax error" in p.stderr:
        status = "SyntaxError"
    else:
        status = "Exception"

    return {
        "status": status,
        "exit_code": p.exit_code,
        "stdout": p.stdout,
        "stderr": p.stderr,
    }
src/eval_swift.py
ADDED
@@ -0,0 +1,30 @@
from pathlib import Path
import os
from src.safe_subprocess import run


def eval_script(path: Path):
    basename = ".".join(str(path).split(".")[:-1])
    r = run(["swiftc", path, "-o", basename], timeout_seconds=45)
    if r.timeout:
        status = "Timeout"
    elif r.exit_code != 0:
        # It's a compile error. It may be a type error rather than a syntax
        # error, but we keep the established convention.
        status = "SyntaxError"
    else:
        r = run([basename], timeout_seconds=5)
        if r.timeout:
            status = "Timeout"
        elif r.exit_code != 0:
            # The program compiled but crashed at runtime.
            status = "Exception"
        else:
            status = "OK"
        os.remove(basename)
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }
src/eval_ts.py
ADDED
@@ -0,0 +1,33 @@
from pathlib import Path
from src.safe_subprocess import run


def eval_script(path: Path):
    r = run(["tsc", "--target", "esnext", str(path)], timeout_seconds=15)
    if r.exit_code != 0:
        return {
            "status": "SyntaxError",
            "exit_code": r.exit_code,
            "stdout": r.stdout,
            "stderr": r.stderr,
        }

    r = run(["node", str(path).replace(".ts", ".js")], timeout_seconds=15)
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0:
        status = "OK"
    elif "ERR_ASSERTION" in r.stderr:
        status = "AssertionError"
    elif "SyntaxError" in r.stderr:
        status = "SyntaxError"
    elif "ReferenceError" in r.stderr:
        status = "ReferenceError"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }
src/eval_v.py
ADDED
@@ -0,0 +1,40 @@
from pathlib import Path
import subprocess

# return codes for coqc:
# 0: compilation goes through
# 1: some sort of error (nondescript)


def eval_script(path: Path):
    cleanup_extensions = ['.vo', '.vok', '.vos']

    try:
        # Sadly there seems to be no way to verify proofs in a Coq file
        # without compiling it.
        output = subprocess.run(["coqc", "-noglob", str(path)], capture_output=True, timeout=5)
        outmessage = str(output)

        if output.returncode == 0:
            status = "OK"
            # Cleanup: remove files generated by coqc.
            for ext in cleanup_extensions:
                file_to_remove = path.with_suffix(ext)
                if file_to_remove.exists():
                    file_to_remove.unlink()

        elif "Unable to unify" in outmessage:
            status = "AssertionError"
        else:
            status = "SyntaxError"
        returncode = output.returncode

    except subprocess.TimeoutExpired as exc:
        status = "Timeout"
        output = exc
        returncode = -1
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
        "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
    }
src/generic_eval.py
ADDED
@@ -0,0 +1,149 @@
# This is a helper script for evaluating benchmarks that have been translated to
# different languages.
#
# To use this script, call eval_lang.py.
# The --directory argument is required, and tells the script where the benchmarks are located.
# The --files argument is optional, and takes a list of numbers corresponding to the files to be evaluated.
#
# The script will print the results for each benchmark, and also write to results/lang.csv.
# When the script completes, it will print a summary.
#
# Examples
#
# To run the entire benchmark suite:
# python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/
#
# To run benchmarks 1, 2, and 3:
# python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ --files 1 2 3

import argparse
from sys import exit as sysexit
from pathlib import Path
import sys


def list_files(directory, ext):
    files_unsorted = directory.glob(f"HumanEval_*{ext}")
    # Assumption: base filenames are in the format HumanEval_X_*,
    # where X is a valid number.
    def key(s):
        return int(str(s.name).split("_")[1])
    files_sorted = sorted(files_unsorted, key=(lambda s: key(s)))

    # Assumption: there may be missing files, but no extra files.
    # So we build files_array where the index corresponds to the file's number,
    # and a missing file is represented by None.
    size = key(files_sorted[-1]) + 1
    files_array = [None] * size
    for f in files_sorted:
        k = key(f)
        files_array[k] = f

    return files_array


def main(eval_script, language, extension):
    args = argparse.ArgumentParser()

    args.add_argument(
        "--directory", type=str, required=True, help="Directory to read benchmarks from"
    )
    args.add_argument(
        "--files",
        type=int,
        nargs="*",
        default=[],
        help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
    )
    args = args.parse_args()

    directory = Path(args.directory).resolve()

    files_sorted = list_files(directory, extension)

    # The specified directory does not contain the right language.
    if len(files_sorted) == 0:
        print(f'The specified directory does not contain files of type {extension}')
        sysexit(1)

    files_index = []
    if len(args.files) > 0:
        files_index = args.files
    else:
        files_index = range(len(files_sorted))

    total = 0
    passed = 0
    syntax_error = 0

    results_file = Path(Path(__file__).parent, "..", "results", language.lower() + ".csv").resolve()

    with open(results_file, "w") as f:
        for i in files_index:
            filepath = files_sorted[i]
            if filepath is None:
                print("File {} does not exist!".format(i))
                continue
            res = eval_script(filepath)
            output = f"{language},{filepath.stem},{res['status']}\n"
            f.write(output)
            print(output, end="")
            total += 1
            if res['status'] == "OK":
                passed += 1
            elif res['status'] == "SyntaxError":
                syntax_error += 1
    print(f"Total {total}, Syntax Error {syntax_error}, Passed {passed}")


def main_check_stubs(check_script, language, extension):
    args = argparse.ArgumentParser()

    args.add_argument(
        "--directory", type=str, required=True, help="Directory to read benchmarks from"
    )
    args.add_argument(
        "--files",
        type=int,
        nargs="*",
        default=[],
        help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
    )
    args = args.parse_args()

    directory = Path(args.directory).resolve()

    files_sorted = list_files(directory, extension)

    # The specified directory does not contain the right language.
    if len(files_sorted) == 0:
        print(f'The specified directory does not contain files of type {extension}')
        sysexit(1)

    files_index = []
    if len(args.files) > 0:
        files_index = args.files
    else:
        files_index = range(len(files_sorted))

    total = 0
    passed = 0

    results_file = Path(Path(__file__).parent, "..", "check_results", language.lower() + ".csv").resolve()

    with open(results_file, "w") as f:
        for i in files_index:
            filepath = files_sorted[i]
            if filepath is None:
                print("File {} does not exist!".format(i))
                continue
            res = check_script(filepath)
            output = f"{language},{filepath.stem},{res['status']}\n"
            f.write(output)
            print(output, end="")
            total += 1
            if res['status'] == "OK":
                passed += 1
    print(f"Total {total}, Passed {passed}")

    if total != passed:
        sys.exit(1)
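
A language-specific entry point only needs to pair its eval_script with generic_eval.main. A minimal sketch of that wiring, using the Lua evaluator above as an example (this entry-point file is hypothetical; eval_lua.py itself does not define a __main__ block):

# hypothetical src/eval_lua_cli.py
from src.eval_lua import eval_script
from src.generic_eval import main

if __name__ == "__main__":
    # Parses --directory/--files, writes results/lua.csv, prints a summary.
    main(eval_script, "Lua", ".lua")
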
src/libeval.py
ADDED
@@ -0,0 +1,40 @@
import os
import signal
import subprocess
from typing import List
from . import generic_eval


def testing_mail(x, y, z):
    generic_eval.main(x, y, z)


def run_without_exn(args: List[str]):
    """
    Runs the given program with a five second timeout. Does not throw an exception
    no matter what happens. The output is a dictionary of the format that we expect
    for our evaluation scripts. The "status" field is "OK" when the exit code is
    zero. If that isn't enough, you may want to tweak the status based on the
    captured stderr and stdout.
    """
    p = subprocess.Popen(
        args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, start_new_session=True
    )
    try:
        stdout, stderr = p.communicate(timeout=5)
        exit_code = p.returncode
        status = "OK" if exit_code == 0 else "Exception"
    except subprocess.TimeoutExpired:
        stdout, stderr = p.stdout.read(), p.stderr.read()
        os.killpg(os.getpgid(p.pid), signal.SIGTERM)
        exit_code = -1
        status = "Timeout"

    if stdout is None:
        stdout = b""
    if stderr is None:
        stderr = b""
    return {
        "status": status,
        "exit_code": exit_code,
        "stdout": stdout.decode("utf-8", errors="ignore"),
        "stderr": stderr.decode("utf-8", errors="ignore"),
    }
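
run_without_exn returns the same dict shape as the eval_*.py scripts, so it can serve as a baseline classifier when a language needs nothing beyond exit-code inspection. A small sketch; the command here is arbitrary:

from src.libeval import run_without_exn

# "true" exits 0, so the status is "OK"; a nonzero exit would be "Exception".
res = run_without_exn(["true"])
print(res["status"], res["exit_code"])
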
src/safe_subprocess/.gitignore
ADDED
@@ -0,0 +1,2 @@
/__pycache__
/.pytest_cache
src/safe_subprocess/__init__.py
ADDED
@@ -0,0 +1,91 @@
import os
import signal
import fcntl
import time
import subprocess
from typing import List

MAX_BYTES_PER_READ = 1024
SLEEP_BETWEEN_READS = 0.1


class Result:
    timeout: bool
    exit_code: int
    stdout: str
    stderr: str

    def __init__(self, timeout, exit_code, stdout, stderr):
        self.timeout = timeout
        self.exit_code = exit_code
        self.stdout = stdout
        self.stderr = stderr


def set_nonblocking(reader):
    fd = reader.fileno()
    fl = fcntl.fcntl(fd, fcntl.F_GETFL)
    fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)


def run(
    args: List[str],
    timeout_seconds: int = 15,
    max_output_size: int = 2048,
    env = None,
    cwd: str | None = None
) -> Result:
    """
    Runs the given program with arguments. After the timeout elapses, kills the process
    and all other processes in the process group. Captures at most max_output_size bytes
    of stdout and stderr each, and discards any output beyond that.
    """
    p = subprocess.Popen(
        args,
        env=env,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        start_new_session=True,
        bufsize=MAX_BYTES_PER_READ,
        cwd=cwd
    )
    set_nonblocking(p.stdout)
    set_nonblocking(p.stderr)

    process_group_id = os.getpgid(p.pid)

    # We sleep for 0.1 seconds in each iteration.
    max_iterations = timeout_seconds * 10
    stdout_saved_bytes = []
    stderr_saved_bytes = []
    stdout_bytes_read = 0
    stderr_bytes_read = 0

    for _ in range(max_iterations):
        this_stdout_read = p.stdout.read(MAX_BYTES_PER_READ)
        this_stderr_read = p.stderr.read(MAX_BYTES_PER_READ)
        # this_stdout_read and this_stderr_read may be None if stdout or stderr
        # are closed. Without these checks, test_close_outputs fails.
        if this_stdout_read is not None and stdout_bytes_read < max_output_size:
            stdout_saved_bytes.append(this_stdout_read)
            stdout_bytes_read += len(this_stdout_read)
        if this_stderr_read is not None and stderr_bytes_read < max_output_size:
            stderr_saved_bytes.append(this_stderr_read)
            stderr_bytes_read += len(this_stderr_read)
        exit_code = p.poll()
        if exit_code is not None:
            break
        time.sleep(SLEEP_BETWEEN_READS)

    try:
        # Kills the process group. Without this line, test_fork_once fails.
        os.killpg(process_group_id, signal.SIGKILL)
    except ProcessLookupError:
        pass

    timeout = exit_code is None
    exit_code = exit_code if exit_code is not None else -1
    stdout = b"".join(stdout_saved_bytes).decode("utf-8", errors="ignore")
    stderr = b"".join(stderr_saved_bytes).decode("utf-8", errors="ignore")
    return Result(timeout=timeout, exit_code=exit_code, stdout=stdout, stderr=stderr)
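
The read-poll-kill loop above is what lets run survive the adversarial programs under evil_programs/: output is truncated at max_output_size bytes, and the whole process group is killed once the loop exits. A minimal direct call; the echoed string is arbitrary:

from src.safe_subprocess import run

r = run(["echo", "hello"], timeout_seconds=2, max_output_size=1024)
print(r.timeout, r.exit_code, r.stdout.strip())   # False 0 hello
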
src/safe_subprocess/evil_programs/block_on_inputs.py
ADDED
@@ -0,0 +1,2 @@
while True:
    input()
src/safe_subprocess/evil_programs/close_outputs.py
ADDED
@@ -0,0 +1,7 @@
import sys

print("This is the end")
sys.stdout.close()
sys.stderr.close()
while True:
    pass
src/safe_subprocess/evil_programs/fork_bomb.py
ADDED
@@ -0,0 +1,4 @@
import os

while True:
    os.fork()
src/safe_subprocess/evil_programs/fork_once.py
ADDED
@@ -0,0 +1,6 @@
import os
import time

if os.fork() == 0:
    while True:
        time.sleep(60)
src/safe_subprocess/evil_programs/sleep_forever.py
ADDED
@@ -0,0 +1,4 @@
import time

while True:
    time.sleep(60)
src/safe_subprocess/evil_programs/unbounded_output.py
ADDED
@@ -0,0 +1,4 @@
b = True
while True:
    print(b)
    b = not b
src/safe_subprocess/module_test.py
ADDED
@@ -0,0 +1,103 @@
from . import run
import time
from pathlib import Path

ROOT = Path(__file__).resolve().parent / "evil_programs"


def assert_no_running_evil():
    result = run(
        ["pgrep", "-f", ROOT], timeout_seconds=1, max_output_size=1024
    )
    assert (
        result.exit_code == 1
    ), f"There are still evil processes running: {result.stdout}"
    assert len(result.stderr) == 0
    assert len(result.stdout) == 0


def test_fork_once():
    # The program exits cleanly and immediately. But, it forks a child that
    # runs forever.
    result = run(
        ["python3", ROOT / "fork_once.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == 0
    assert result.timeout == False
    assert len(result.stderr) == 0
    assert len(result.stdout) == 0
    assert_no_running_evil()


def test_close_outputs():
    # The program prints to stdout, closes its output, and then runs forever.
    result = run(
        ["python3", ROOT / "close_outputs.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == -1
    assert result.timeout == True
    assert len(result.stderr) == 0
    assert result.stdout == "This is the end\n"
    assert_no_running_evil()


def test_unbounded_output():
    result = run(
        ["python3", ROOT / "unbounded_output.py"],
        timeout_seconds=3,
        max_output_size=1024,
    )
    assert result.exit_code == -1
    assert result.timeout == True
    assert len(result.stderr) == 0
    assert len(result.stdout) == 1024
    assert_no_running_evil()


def test_sleep_forever():
    result = run(
        ["python3", ROOT / "sleep_forever.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == -1
    assert result.timeout == True
    assert len(result.stderr) == 0
    assert len(result.stdout) == 0
    assert_no_running_evil()


def test_fork_bomb():
    result = run(
        ["python3", ROOT / "fork_bomb.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == -1
    assert result.timeout == True
    assert len(result.stderr) == 0
    assert len(result.stdout) == 0
    # Unfortunately, this sleep seems to be necessary. My theories:
    # 1. os.killpg doesn't block until the whole process group is dead.
    # 2. pgrep can produce stale output.
    time.sleep(2)
    assert_no_running_evil()


def test_block_on_inputs():
    # We run the subprocess with /dev/null as input. So, any program that
    # tries to read input will error.
    result = run(
        ["python3", ROOT / "block_on_inputs.py"],
        timeout_seconds=2,
        max_output_size=1024,
    )
    assert result.exit_code == 1
    assert result.timeout == False
    assert len(result.stdout) == 0
    assert "EOF when reading a line" in result.stderr
    assert_no_running_evil()
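
The tests locate their fixtures relative to the module itself, so they can presumably be run from the repository root with pytest (an assumption: pytest is not pinned in requirements.txt, though the .gitignore above anticipates a .pytest_cache directory):

# From the repository root:
# python3 -m pytest src/safe_subprocess/module_test.py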