dongsheng committed on
Commit 41e79e2 · verified · 1 Parent(s): 19139e3

Upload 48 files

Dockerfile ADDED
@@ -0,0 +1,14 @@
1
+ FROM ghcr.io/nuprl/multipl-e-evaluation:v3.1
2
+
3
+ # Install GNAT for Ada language support
4
+ RUN apt-get update && apt-get install -y gnat && apt-get clean
5
+
6
+ # Override the default entrypoint of the base image
7
+ ENTRYPOINT []
8
+ WORKDIR /app
9
+ COPY requirements.txt .
10
+ RUN pip install -r requirements.txt
11
+ COPY . .
12
+ EXPOSE 7860
13
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
14
+ CMD ["python3", "app.py"]
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: MultiPLE Evaluator
3
  emoji: 🔥
4
- colorFrom: indigo
5
- colorTo: pink
6
  sdk: docker
7
  pinned: false
8
  ---
 
1
  ---
2
+ title: Docker Test
3
  emoji: 🔥
4
+ colorFrom: blue
5
+ colorTo: gray
6
  sdk: docker
7
  pinned: false
8
  ---
app.py ADDED
@@ -0,0 +1,677 @@
1
+ import gradio as gr
2
+ import json
3
+ import importlib
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+ import concurrent.futures
8
+ import multiprocessing
9
+ import time
10
+ import threading
11
+ import queue
12
+ import uuid
13
+ import numpy as np
14
+ from datetime import datetime
15
+ from tqdm.auto import tqdm
16
+ from src.containerized_eval import eval_string_script
17
+
18
+ # Add current directory and src directory to module search path
19
+ current_dir = os.path.dirname(os.path.abspath(__file__))
20
+ src_dir = os.path.join(current_dir, "src")
21
+ if current_dir not in sys.path:
22
+ sys.path.append(current_dir)
23
+ if src_dir not in sys.path:
24
+ sys.path.append(src_dir)
25
+
26
+ # Create message queue
27
+ task_queue = queue.Queue()
28
+ # Dictionary to store task status
29
+ task_status = {}
30
+ # List to store task history, max 200 tasks
31
+ task_history = []
32
+ # Lock for shared resources
33
+ lock = threading.Lock()
34
+ # Number of worker threads
35
+ worker_threads = max(1, multiprocessing.cpu_count() // 2) # Using half the available cores for better stability
36
+ # Flag for running background threads
37
+ running = True
38
+ # Mapping from task type to processing time
39
+ task_type_times = {}
40
+
41
+ def queue_processor():
42
+ """Process tasks in the queue"""
43
+ while running:
44
+ try:
45
+ task_id, input_data, request_time = task_queue.get(timeout=0.1)
46
+ with lock:
47
+ task_status[task_id]['status'] = 'processing'
48
+ task_status[task_id]['start_time'] = time.time()
49
+
50
+ if isinstance(input_data, list) and len(input_data) > 0:
51
+ sample_task = input_data[0]
52
+ language = sample_task.get('language', 'unknown') if isinstance(sample_task, dict) else 'unknown'
53
+ task_size = len(input_data)
54
+ task_complexity = _estimate_task_complexity(input_data)
55
+
56
+ with lock:
57
+ task_status[task_id]['estimated_factors'] = {
58
+ 'language': language,
59
+ 'size': task_size,
60
+ 'complexity': task_complexity
61
+ }
62
+
63
+ result = evaluate(input_data)
64
+
65
+ end_time = time.time()
66
+ process_time = end_time - task_status[task_id]['start_time']
67
+
68
+ with lock:
69
+ task_status[task_id]['status'] = 'completed'
70
+ task_status[task_id]['result'] = result
71
+ task_status[task_id]['end_time'] = end_time
72
+ task_status[task_id]['process_time'] = process_time
73
+
74
+ if 'estimated_factors' in task_status[task_id]:
75
+ factors = task_status[task_id]['estimated_factors']
76
+ key = f"{factors['language']}_{factors['complexity']}"
77
+
78
+ if key not in task_type_times:
79
+ task_type_times[key] = []
80
+
81
+ task_type_times[key].append(process_time / factors['size'])
82
+ if len(task_type_times[key]) > 10:
83
+ task_type_times[key] = task_type_times[key][-10:]
84
+
85
+ task_history.append({
86
+ 'task_id': task_id,
87
+ 'request_time': request_time,
88
+ 'process_time': process_time,
89
+ 'status': 'completed',
90
+ 'factors': task_status[task_id].get('estimated_factors', {})
91
+ })
92
+ while len(task_history) > 200:
93
+ task_history.pop(0)
94
+
95
+ task_queue.task_done()
96
+
97
+ except queue.Empty:
98
+ continue
99
+ except Exception as e:
100
+ if 'task_id' in locals():
101
+ with lock:
102
+ task_status[task_id]['status'] = 'error'
103
+ task_status[task_id]['error'] = str(e)
104
+ task_status[task_id]['end_time'] = time.time()
105
+ task_queue.task_done()
106
+
107
+ def _estimate_task_complexity(tasks):
108
+ """Estimate task complexity
109
+
110
+ Returns: 'simple', 'medium', or 'complex'
111
+ """
112
+ total_code_length = 0
113
+ count = 0
114
+
115
+ for task in tasks:
116
+ if isinstance(task, dict):
117
+ prompt = task.get('prompt', '')
118
+ tests = task.get('tests', '')
119
+ completions = task.get('processed_completions', [])
120
+
121
+ code_length = len(prompt) + len(tests)
122
+ if completions:
123
+ code_length += sum(len(comp) for comp in completions)
124
+
125
+ total_code_length += code_length
126
+ count += 1
127
+
128
+ if count == 0:
129
+ return 'medium'
130
+
131
+ avg_length = total_code_length / count
132
+
133
+ if avg_length < 1000:
134
+ return 'simple'
135
+ elif avg_length < 5000:
136
+ return 'medium'
137
+ else:
138
+ return 'complex'
139
+
140
+ def evaluate(input_data):
141
+ """Main function for code evaluation"""
142
+ try:
143
+ if not isinstance(input_data, list):
144
+ return {"status": "Exception", "error": "Input must be a list"}
145
+
146
+ results = []
147
+
148
+ # Use a moderate number of workers for all language tests to ensure stability
149
+ # This prevents resource contention regardless of language
150
+ max_workers = max(1, min(multiprocessing.cpu_count() // 2, 4))
151
+
152
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
153
+ future_to_item = {executor.submit(evaluate_single_case, item): item for item in input_data}
154
+ for future in concurrent.futures.as_completed(future_to_item):
155
+ item = future_to_item[future]
156
+ try:
157
+ result = future.result()
158
+ item.update(result)
159
+ results.append(item)
160
+ except Exception as e:
161
+ item.update({"status": "Exception", "error": str(e)})
162
+ results.append(item)
163
+ return results
164
+
165
+ except Exception as e:
166
+ return {"status": "Exception", "error": str(e)}
167
+
168
+ def evaluate_single_case(input_data):
169
+ """Evaluate a single code case"""
170
+ try:
171
+ if not isinstance(input_data, dict):
172
+ return {"status": "Exception", "error": "Input item must be a dictionary"}
173
+
174
+ language = input_data.get('language')
175
+ completions = input_data.get('processed_completions', [])
176
+
177
+ if not completions:
178
+ return {"status": "Exception", "error": "No code provided"}
179
+
180
+ # Use a retry mechanism for all languages for better reliability
181
+ max_retries = 2 # One retry for all languages
182
+
183
+ results = []
184
+ for comp in completions:
185
+ code = input_data.get('prompt') + comp + '\n' + input_data.get('tests')
186
+
187
+ # Try up to max_retries + 1 times for all test cases
188
+ for attempt in range(max_retries + 1):
189
+ result = evaluate_code(code, language)
190
+
191
+ # If success or last attempt, return/record the result
192
+ if result["status"] == "OK" or attempt == max_retries:
193
+ if result["status"] == "OK":
194
+ return result
195
+ results.append(result)
196
+ break
197
+
198
+ # For retries, briefly wait to allow resources to stabilize
199
+ time.sleep(0.3)
200
+
201
+ return results[0]
202
+
203
+ except Exception as e:
204
+ return {"status": "Exception", "error": str(e)}
205
+
206
+ def evaluate_code(code, language):
207
+ """Evaluate code in a specific language"""
208
+ try:
209
+ result = eval_string_script(language, code)
210
+ return result
211
+
212
+ except Exception as e:
213
+ return {"status": "Exception", "error": str(e)}
214
+
215
+ def synchronous_evaluate(input_data):
216
+ """Synchronously evaluate code, compatible with original interface"""
217
+ if isinstance(input_data, list) and len(input_data) > 0:
218
+ sample_task = input_data[0]
219
+ language = sample_task.get('language', 'unknown') if isinstance(sample_task, dict) else 'unknown'
220
+ task_size = len(input_data)
221
+ task_complexity = _estimate_task_complexity(input_data)
222
+ else:
223
+ language = 'unknown'
224
+ task_size = 1
225
+ task_complexity = 'medium'
226
+
227
+ estimated_time_per_task = _get_estimated_time_for_task(language, task_complexity)
228
+ estimated_total_time = estimated_time_per_task * task_size
229
+
230
+ queue_info = get_queue_status()
231
+ waiting_tasks = queue_info['waiting_tasks']
232
+
233
+ task_id = str(uuid.uuid4())
234
+ request_time = time.time()
235
+
236
+ with lock:
237
+ task_status[task_id] = {
238
+ 'status': 'queued',
239
+ 'queued_time': request_time,
240
+ 'queue_position': task_queue.qsize() + 1,
241
+ 'synchronous': True,
242
+ 'estimated_factors': {
243
+ 'language': language,
244
+ 'size': task_size,
245
+ 'complexity': task_complexity
246
+ },
247
+ 'estimated_time': estimated_total_time
248
+ }
249
+
250
+ task_queue.put((task_id, input_data, request_time))
251
+
252
+ while True:
253
+ with lock:
254
+ if task_id in task_status:
255
+ status = task_status[task_id]['status']
256
+ if status == 'completed':
257
+ result = task_status[task_id]['result']
258
+ task_status.pop(task_id, None)
259
+ return result
260
+ elif status == 'error':
261
+ error = task_status[task_id].get('error', 'Unknown error')
262
+ task_status.pop(task_id, None)
263
+ return {"status": "Exception", "error": error}
264
+
265
+ time.sleep(0.1)
266
+
267
+ def _get_estimated_time_for_task(language, complexity):
268
+ """Get estimated processing time for a specific task type"""
269
+ key = f"{language}_{complexity}"
270
+
271
+ if key in task_type_times and len(task_type_times[key]) > 0:
272
+ return np.median(task_type_times[key])
273
+
274
+ if complexity == 'simple':
275
+ return 1.0
276
+ elif complexity == 'medium':
277
+ return 3.0
278
+ else: # complex
279
+ return 8.0
280
+
281
+ def enqueue_task(input_data):
282
+ """Add task to queue"""
283
+ if isinstance(input_data, list) and len(input_data) > 0:
284
+ sample_task = input_data[0]
285
+ language = sample_task.get('language', 'unknown') if isinstance(sample_task, dict) else 'unknown'
286
+ task_size = len(input_data)
287
+ task_complexity = _estimate_task_complexity(input_data)
288
+ else:
289
+ language = 'unknown'
290
+ task_size = 1
291
+ task_complexity = 'medium'
292
+
293
+ estimated_time_per_task = _get_estimated_time_for_task(language, task_complexity)
294
+ estimated_total_time = estimated_time_per_task * task_size
295
+
296
+ task_id = str(uuid.uuid4())
297
+ request_time = time.time()
298
+
299
+ with lock:
300
+ task_status[task_id] = {
301
+ 'status': 'queued',
302
+ 'queued_time': request_time,
303
+ 'queue_position': task_queue.qsize() + 1,
304
+ 'estimated_factors': {
305
+ 'language': language,
306
+ 'size': task_size,
307
+ 'complexity': task_complexity
308
+ },
309
+ 'estimated_time': estimated_total_time
310
+ }
311
+
312
+ queue_info = get_queue_status()
313
+ est_wait = queue_info['estimated_wait']
314
+
315
+ task_queue.put((task_id, input_data, request_time))
316
+
317
+ return {
318
+ 'task_id': task_id,
319
+ 'status': 'queued',
320
+ 'queue_position': task_status[task_id]['queue_position'],
321
+ 'estimated_wait': est_wait,
322
+ 'estimated_processing': estimated_total_time
323
+ }
324
+
325
+ def check_status(task_id):
326
+ """Check task status"""
327
+ with lock:
328
+ if task_id not in task_status:
329
+ return {'status': 'not_found'}
330
+
331
+ status_info = task_status[task_id].copy()
332
+
333
+ if status_info['status'] in ['completed', 'error'] and time.time() - status_info.get('end_time', 0) > 3600:
334
+ task_status.pop(task_id, None)
335
+
336
+ return status_info
337
+
338
+ def get_queue_status():
339
+ """Get queue status"""
340
+ with lock:
341
+ queued_tasks = [t for t in task_status.values() if t['status'] == 'queued']
342
+ processing_tasks = [t for t in task_status.values() if t['status'] == 'processing']
343
+
344
+ queue_size = task_queue.qsize()
345
+ active_tasks = len(processing_tasks)
346
+ waiting_tasks = len(queued_tasks)
347
+
348
+ remaining_processing_time = 0
349
+ for task in processing_tasks:
350
+ if 'start_time' in task and 'estimated_time' in task:
351
+ elapsed = time.time() - task['start_time']
352
+ remaining = max(0, task['estimated_time'] - elapsed)
353
+ remaining_processing_time += remaining
354
+ else:
355
+ remaining_processing_time += 2
356
+
357
+ if active_tasks > 0:
358
+ remaining_processing_time = remaining_processing_time / min(active_tasks, worker_threads)
359
+
360
+ queued_processing_time = 0
361
+ for task in queued_tasks:
362
+ if 'estimated_time' in task:
363
+ queued_processing_time += task['estimated_time']
364
+ else:
365
+ queued_processing_time += 5
366
+
367
+ if worker_threads > 0 and queued_processing_time > 0:
368
+ queued_processing_time = queued_processing_time / worker_threads
369
+
370
+ estimated_wait = remaining_processing_time + queued_processing_time
371
+
372
+ if task_history:
373
+ prediction_ratios = []
374
+ for task in task_history:
375
+ if 'factors' in task and 'estimated_time' in task:
376
+ prediction_ratios.append(task['process_time'] / task['estimated_time'])
377
+
378
+ if prediction_ratios:
379
+ correction_factor = np.median(prediction_ratios)
380
+ correction_factor = max(0.5, min(2.0, correction_factor))
381
+ estimated_wait *= correction_factor
382
+
383
+ estimated_wait = max(0.1, estimated_wait)
384
+ if waiting_tasks == 0 and active_tasks == 0:
385
+ estimated_wait = 0
386
+
387
+ recent_tasks = task_history[-5:] if task_history else []
388
+
389
+ return {
390
+ 'queue_size': queue_size,
391
+ 'active_tasks': active_tasks,
392
+ 'waiting_tasks': waiting_tasks,
393
+ 'worker_threads': worker_threads,
394
+ 'estimated_wait': estimated_wait,
395
+ 'recent_tasks': recent_tasks
396
+ }
397
+
398
+ def format_time(seconds):
399
+ """Format time into readable format"""
400
+ if seconds < 60:
401
+ return f"{seconds:.1f} seconds"
402
+ elif seconds < 3600:
403
+ minutes = int(seconds / 60)
404
+ seconds = seconds % 60
405
+ return f"{minutes}m {seconds:.1f}s"
406
+ else:
407
+ hours = int(seconds / 3600)
408
+ minutes = int((seconds % 3600) / 60)
409
+ return f"{hours}h {minutes}m"
410
+
411
+ def ui_get_queue_info():
412
+ """Get queue info for UI"""
413
+ queue_info = get_queue_status()
414
+
415
+ tasks_html = ""
416
+ for task in reversed(queue_info['recent_tasks']):
417
+ tasks_html += f"""
418
+ <tr>
419
+ <td>{task['task_id'][:8]}...</td>
420
+ <td>{datetime.fromtimestamp(task['request_time']).strftime('%H:%M:%S')}</td>
421
+ <td>{format_time(task['process_time'])}</td>
422
+ </tr>
423
+ """
424
+
425
+ if not tasks_html:
426
+ tasks_html = """
427
+ <tr>
428
+ <td colspan="3" style="text-align: center; padding: 20px;">No historical tasks</td>
429
+ </tr>
430
+ """
431
+
432
+ return f"""
433
+ <div class="dashboard">
434
+ <div class="queue-info-card main-card">
435
+ <h3 class="card-title">Queue Status Monitor</h3>
436
+ <div class="queue-stats">
437
+ <div class="stat-item">
438
+ <div class="stat-value">{queue_info['waiting_tasks']}</div>
439
+ <div class="stat-label">Waiting</div>
440
+ </div>
441
+ <div class="stat-item">
442
+ <div class="stat-value">{queue_info['active_tasks']}</div>
443
+ <div class="stat-label">Processing</div>
444
+ </div>
445
+ <div class="stat-item">
446
+ <div class="stat-value">{queue_info['worker_threads']}</div>
447
+ <div class="stat-label">Worker Threads</div>
448
+ </div>
449
+ </div>
450
+
451
+ <div class="wait-time">
452
+ <p><b>Current Estimated Wait Time:</b> {format_time(queue_info['estimated_wait'])}</p>
453
+ <p class="last-update"><small>Last update: {datetime.now().strftime('%H:%M:%S')}</small></p>
454
+ </div>
455
+ </div>
456
+
457
+ <div class="queue-info-card history-card">
458
+ <h3 class="card-title">Recently Processed Tasks</h3>
459
+ <table class="recent-tasks">
460
+ <thead>
461
+ <tr>
462
+ <th>Task ID</th>
463
+ <th>Request Time</th>
464
+ <th>Processing Time</th>
465
+ </tr>
466
+ </thead>
467
+ <tbody>
468
+ {tasks_html}
469
+ </tbody>
470
+ </table>
471
+ </div>
472
+ </div>
473
+ """
474
+
475
+ def launch_workers():
476
+ """Launch worker threads"""
477
+ global running
478
+ running = True
479
+
480
+ for _ in range(worker_threads):
481
+ worker = threading.Thread(target=queue_processor)
482
+ worker.daemon = True
483
+ worker.start()
484
+
485
+ # Custom CSS
486
+ custom_css = """
487
+ .container {
488
+ max-width: 1200px;
489
+ margin: 0 auto;
490
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
491
+ }
492
+
493
+ .dashboard {
494
+ display: flex;
495
+ flex-direction: column;
496
+ gap: 20px;
497
+ }
498
+
499
+ .card-title {
500
+ color: #333;
501
+ border-bottom: 2px solid #ddd;
502
+ padding-bottom: 10px;
503
+ margin-top: 0;
504
+ }
505
+
506
+ .status-card, .queue-info-card {
507
+ background: #fff;
508
+ border-radius: 12px;
509
+ padding: 20px;
510
+ margin: 10px 0;
511
+ box-shadow: 0 4px 15px rgba(0,0,0,0.08);
512
+ }
513
+
514
+ .main-card {
515
+ border-top: 5px solid #4285f4;
516
+ }
517
+
518
+ .history-card {
519
+ border-top: 5px solid #34a853;
520
+ }
521
+
522
+ .status-card.success {
523
+ background: #e7f5e7;
524
+ border-left: 5px solid #28a745;
525
+ }
526
+
527
+ .status-card.error {
528
+ background: #f8d7da;
529
+ border-left: 5px solid #dc3545;
530
+ }
531
+
532
+ .error-message {
533
+ color: #dc3545;
534
+ font-weight: bold;
535
+ padding: 10px;
536
+ background: #f8d7da;
537
+ border-radius: 5px;
538
+ }
539
+
540
+ .notice {
541
+ color: #0c5460;
542
+ background-color: #d1ecf1;
543
+ padding: 10px;
544
+ border-radius: 5px;
545
+ }
546
+
547
+ .queue-stats {
548
+ display: flex;
549
+ justify-content: space-around;
550
+ margin: 20px 0;
551
+ }
552
+
553
+ .stat-item {
554
+ text-align: center;
555
+ padding: 15px;
556
+ background: #f8f9fa;
557
+ border-radius: 10px;
558
+ min-width: 120px;
559
+ transition: transform 0.3s ease;
560
+ }
561
+
562
+ .stat-item:hover {
563
+ transform: translateY(-5px);
564
+ box-shadow: 0 5px 15px rgba(0,0,0,0.1);
565
+ }
566
+
567
+ .stat-value {
568
+ font-size: 32px;
569
+ font-weight: bold;
570
+ color: #4285f4;
571
+ margin-bottom: 5px;
572
+ }
573
+
574
+ .stat-label {
575
+ color: #5f6368;
576
+ font-size: 16px;
577
+ }
578
+
579
+ .wait-time {
580
+ text-align: center;
581
+ margin: 20px 0;
582
+ padding: 15px;
583
+ background: #f1f3f4;
584
+ border-radius: 8px;
585
+ font-size: 18px;
586
+ }
587
+
588
+ .last-update {
589
+ color: #80868b;
590
+ margin-top: 10px;
591
+ margin-bottom: 0;
592
+ }
593
+
594
+ .recent-tasks {
595
+ width: 100%;
596
+ border-collapse: collapse;
597
+ margin-top: 15px;
598
+ background: white;
599
+ box-shadow: 0 1px 3px rgba(0,0,0,0.05);
600
+ }
601
+
602
+ .recent-tasks th, .recent-tasks td {
603
+ border: 1px solid #e0e0e0;
604
+ padding: 12px 15px;
605
+ text-align: center;
606
+ }
607
+
608
+ .recent-tasks th {
609
+ background-color: #f1f3f4;
610
+ color: #202124;
611
+ font-weight: 500;
612
+ }
613
+
614
+ .recent-tasks tbody tr:hover {
615
+ background-color: #f8f9fa;
616
+ }
617
+
618
+ .tabs {
619
+ margin-top: 20px;
620
+ }
621
+
622
+ button.primary {
623
+ background-color: #4285f4;
624
+ color: white;
625
+ padding: 10px 20px;
626
+ border: none;
627
+ border-radius: 4px;
628
+ cursor: pointer;
629
+ font-size: 16px;
630
+ font-weight: 500;
631
+ transition: background-color 0.3s;
632
+ }
633
+
634
+ button.primary:hover {
635
+ background-color: #3367d6;
636
+ }
637
+ """
638
+
639
+ # Initialize and launch worker threads
640
+ launch_workers()
641
+
642
+ # Create Gradio interface
643
+ with gr.Blocks(css=custom_css) as demo:
644
+ gr.Markdown("# Code Evaluation Service")
645
+ gr.Markdown("Code evaluation service supporting multiple programming languages, using a queue mechanism to process requests")
646
+
647
+ with gr.Row():
648
+ with gr.Column(scale=3):
649
+ # Queue status info card
650
+ queue_info_html = gr.HTML()
651
+ refresh_queue_btn = gr.Button("Refresh Queue Status", variant="primary")
652
+
653
+ # Hidden API interface components
654
+ with gr.Row(visible=False):
655
+ api_input = gr.JSON()
656
+ api_output = gr.JSON()
657
+
658
+ # Define update function
659
+ def update_queue_info():
660
+ return ui_get_queue_info()
661
+
662
+ # Update queue info periodically
663
+ demo.load(update_queue_info, None, queue_info_html, every=3)
664
+
665
+ # Refresh button event
666
+ refresh_queue_btn.click(update_queue_info, None, queue_info_html)
667
+
668
+ # Add evaluation endpoint compatible with original interface
669
+ demo.queue()
670
+ evaluate_endpoint = demo.load(fn=synchronous_evaluate, inputs=api_input, outputs=api_output, api_name="evaluate")
671
+
672
+ if __name__ == "__main__":
673
+ try:
674
+ demo.launch()
675
+ finally:
676
+ # Stop worker threads
677
+ running = False
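Note (not part of the commit): a client can exercise the evaluate endpoint defined above with the gradio_client package. This is a minimal sketch only — the Space URL and the sample task are hypothetical, and it assumes the endpoint registered with api_name="evaluate" is exposed to clients as /evaluate.

from gradio_client import Client

client = Client("https://your-space.hf.space")  # hypothetical URL; point at the deployed Space

# One dict per task; the server concatenates prompt + completion + "\n" + tests before running it.
payload = [{
    "language": "python",
    "prompt": "def add(a, b):\n",
    "processed_completions": ["    return a + b\n"],
    "tests": "assert add(1, 2) == 3\n",
}]

result = client.predict(payload, api_name="/evaluate")
print(result)  # each task comes back updated with status, exit_code, stdout and stderr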
requirements.txt ADDED
@@ -0,0 +1 @@
1
+ gradio==4.44.1
src/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # src package
src/containerized_eval.py ADDED
@@ -0,0 +1,98 @@
1
+ from pathlib import Path
2
+ from . import eval_adb
3
+ from . import eval_ruby
4
+ from . import eval_lua
5
+ from . import eval_python
6
+ from . import eval_rust
7
+ from . import eval_julia
8
+ from . import eval_java
10
+ from . import eval_racket
11
+ from . import eval_javascript
12
+ from . import eval_swift
13
+ from . import eval_cpp
14
+ from . import eval_php
15
+ from . import eval_dlang
17
+ from . import eval_r
18
+ from . import eval_fs
19
+ from . import eval_ocaml
20
+ from . import eval_matlab
21
+ from . import eval_hs
22
+ from . import eval_elixir
23
+ from . import eval_clj
24
+ from . import eval_v
25
+ from . import eval_lean
26
+ from . import eval_dart
27
+ from . import eval_go
28
+ import tempfile
29
+
30
+
31
+ EVALUATORS = {
32
+ "ada": (eval_adb.eval_script, ".adb"),
33
+ "rb": (eval_ruby.eval_script, ".rb"),
34
+ "lua": (eval_lua.eval_script, ".lua"),
35
+ "python": (eval_python.eval_script, ".py"),
36
+ "py": (eval_python.eval_script, ".py"),
37
+ "notypes.py": (eval_python.eval_script, ".py"),
38
+ "julia": (eval_julia.eval_script, ".jl"),
39
+ "java" : (eval_java.eval_script, ".java"),
40
+ "rust" : (eval_rust.eval_script, ".rs"),
41
+ "rs" : (eval_rust.eval_script, ".rs"),
42
+ "swift": (eval_swift.eval_script, ".swift"),
44
+ "racket": (eval_racket.eval_script, ".rkt"),
45
+ "rkt": (eval_racket.eval_script, ".rkt"),
46
+ "javascript": (eval_javascript.eval_script, ".js"),
47
+ "js": (eval_javascript.eval_script, ".js"),
48
+ "cpp": (eval_cpp.eval_script, ".cpp"),
49
+ "php": (eval_php.eval_script, ".php"),
50
+ "humaneval_to_dlang.py": (eval_dlang.eval_script, ".d"),
51
+ "d": (eval_dlang.eval_script, ".d"),
52
+ "r": (eval_r.eval_script, ".r"),
53
+ "humaneval_to_r.py": (eval_r.eval_script, ".r"),
54
+ "jl": (eval_julia.eval_script, ".jl"),
55
+ "fs": (eval_fs.eval_script, ".fsx"),
56
+ "ml": (eval_ocaml.eval_script, ".ml"),
57
+ "m": (eval_matlab.eval_script, ".m"),
58
+ "hs": (eval_hs.eval_script, ".hs"),
59
+ "elixir": (eval_elixir.eval_script, ".exs"),
60
+ "clj": (eval_clj.eval_script, ".clj"),
61
+ "coq": (eval_v.eval_script, ".v"),
62
+ "lean": (eval_lean.eval_script, ".lean"),
63
+ "dart": (eval_dart.eval_script, ".dart"),
64
+ "go": (eval_go.eval_script, ".go"),
65
+ "go_test.go": (eval_go.eval_script, "_test.go"),
66
+ }
67
+
68
+ def eval_string_script(language, program):
69
+ if language in EVALUATORS:
70
+ (eval_script, file_ext) = EVALUATORS[language]
71
+ else:
72
+ eval_module = __import__(f"eval_{language}" if language != "go_test.go" else "eval_go")
73
+ eval_script = eval_module.eval_script
74
+ file_ext = f".{language}" if language != "go_test.go" else "_test.go"
75
+ with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f:
76
+ f.write(program.encode("utf-8"))
77
+ f.flush()
78
+ result = eval_script(Path(f.name))
79
+ # Only save the first 2K of output from the running program. Any further
80
+ # output is very likely an exceptionally long stack trace or a long
81
+ # series of prints.
82
+ if type(result["stdout"]) == bytes:
83
+ result["stdout"] = result["stdout"].decode("utf-8", errors="ignore")
84
+ if result["stdout"] is None:
85
+ result["stdout"] = ""
86
+ if result["stderr"] is None:
87
+ result["stderr"] = ""
88
+ if type(result["stderr"]) == bytes:
89
+ result["stderr"] = result["stderr"].decode("utf-8", errors="ignore")
90
+ assert type(result["stdout"]) == str
91
+ assert type(result["stderr"]) == str
92
+ return {
93
+ "program": program,
94
+ "stdout": result['stdout'].replace("!!int", "")[:2048],
95
+ "stderr": result['stderr'][:2048],
96
+ "exit_code": result['exit_code'],
97
+ "status": result['status']
98
+ }
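Note (not part of the commit): eval_string_script can also be exercised directly. A minimal sketch, assuming the Python evaluator's interpreter is available inside the container (the sample program is hypothetical):

from src.containerized_eval import eval_string_script

program = "def add(a, b):\n    return a + b\n\nassert add(1, 2) == 3\n"
result = eval_string_script("python", program)
print(result["status"], result["exit_code"])  # expected: OK 0
# The result also carries "program", plus "stdout" and "stderr" truncated to 2048 characters.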
src/eval_adb.py ADDED
@@ -0,0 +1,64 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+ from src.generic_eval import main
4
+
5
+
6
+ LANG_NAME = "Ada"
7
+ LANG_EXT = ".adb"
8
+
9
+
10
+ def eval_script(path: Path):
11
+ working_dir: Path = path.parent / (path.stem + "_tmp")
12
+ working_dir.mkdir()
13
+ chop_result = run(["gnatchop", "-w", path, working_dir])
14
+ if chop_result.exit_code != 0:
15
+ return {
16
+ "status": "SyntaxError (gnatchop)",
17
+ "exit_code": chop_result.exit_code,
18
+ "stdout": chop_result.stdout,
19
+ "stderr": chop_result.stderr,
20
+ }
21
+
22
+ build_result = run(
23
+ [
24
+ "gnatmake",
25
+ "-gnatW8",
26
+ "main.adb",
27
+ "-o",
28
+ "main",
29
+ "-g",
30
+ "-j0",
31
+ "-gnata",
32
+ "-gnat2022",
33
+ "-gnateE",
34
+ "-bargs",
35
+ "-Es",
36
+ ],
37
+ cwd=str(working_dir),
38
+ )
39
+ if build_result.exit_code != 0:
40
+ return {
41
+ "status": "SyntaxError (gnatmake)",
42
+ "exit_code": build_result.exit_code,
43
+ "stdout": build_result.stdout,
44
+ "stderr": build_result.stderr,
45
+ }
46
+
47
+ status = "OK"
48
+ run_result = run(["./main"], cwd=str(working_dir))
49
+
50
+ if run_result.timeout:
51
+ status = "Timeout"
52
+ elif run_result.exit_code != 0:
53
+ status = "Exception"
54
+
55
+ return {
56
+ "status": status,
57
+ "exit_code": run_result.exit_code,
58
+ "stdout": run_result.stdout,
59
+ "stderr": run_result.stderr,
60
+ }
61
+
62
+
63
+ if __name__ == "__main__":
64
+ main(eval_script, LANG_NAME, LANG_EXT)
src/eval_clj.py ADDED
@@ -0,0 +1,42 @@
1
+ """
2
+ Evaluates a generated Clojure program (.clj).
3
+ """
4
+ import os
5
+ import tempfile
6
+ from pathlib import Path
7
+ from src.safe_subprocess import run
8
+ from src.libeval import run_without_exn
9
+
10
+
11
+ def eval_script(path: Path):
12
+ # Create environment with a writable temporary directory for Clojure cache
13
+ temp_dir = tempfile.mkdtemp(prefix="clojure_home_")
14
+ env = os.environ.copy()
15
+ env["XDG_CONFIG_HOME"] = temp_dir # Set XDG_CONFIG_HOME for Clojure cache
16
+ env["XDG_DATA_HOME"] = temp_dir # Set XDG_DATA_HOME for Clojure data
17
+ env["XDG_CACHE_HOME"] = temp_dir # Set XDG_CACHE_HOME for caches
18
+
19
+ # Run Clojure with the custom environment
20
+ result = run(
21
+ ["clojure", "-J-Dclojure.main.report=stderr", "-M", str(path)],
22
+ env=env
23
+ )
24
+
25
+ if result.timeout:
26
+ status = "Timeout"
27
+ elif result.exit_code != 0:
28
+ status = "Exception"
29
+ elif "\n0 failures, 0 errors.\n" in result.stdout:
30
+ status = "OK"
31
+ else: # test failure
32
+ status = "Exception"
33
+
34
+ return {
35
+ "status": status,
36
+ "exit_code": result.exit_code,
37
+ "stdout": result.stdout,
38
+ "stderr": result.stderr,
39
+ }
40
+
41
+ if __name__ == "__main__":
42
+ print("This module is not meant to be executed directly.")
src/eval_cpp.py ADDED
@@ -0,0 +1,40 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+ from src.generic_eval import main
4
+
5
+ LANG_NAME = "C++"
6
+ LANG_EXT = ".cpp"
7
+
8
+
9
+ def eval_script(path: Path):
10
+ basename = ".".join(str(path).split(".")[:-1])
11
+ build_result = run(["g++", path, "-o", basename, "-std=c++17"])
12
+ if build_result.exit_code != 0:
13
+ return {
14
+ "status": "SyntaxError",
15
+ "exit_code": build_result.exit_code,
16
+ "stdout": build_result.stdout,
17
+ "stderr": build_result.stderr,
18
+ }
19
+
20
+ run_result = run([basename])
21
+ if "In file included from /shared/centos7/gcc/9.2.0-skylake/" in run_result.stderr:
22
+ raise Exception("Skylake bug encountered")
23
+ if "/4.8.2" in run_result.stderr:
24
+ raise Exception("Ancient compiler encountered")
25
+ if run_result.timeout:
26
+ status = "Timeout"
27
+ elif run_result.exit_code != 0:
28
+ status = "Exception"
29
+ else:
30
+ status = "OK"
31
+ return {
32
+ "status": status,
33
+ "exit_code": run_result.exit_code,
34
+ "stdout": run_result.stdout,
35
+ "stderr": run_result.stderr,
36
+ }
37
+
38
+
39
+ if __name__ == "__main__":
40
+ main(eval_script, LANG_NAME, LANG_EXT)
src/eval_cs.py ADDED
@@ -0,0 +1,65 @@
1
+ import os
2
+ import subprocess
3
+ import tempfile
4
+ from pathlib import Path
5
+
6
+ from src.generic_eval import main
7
+
8
+ LANG_NAME = "CSharp"
9
+ LANG_EXT = ".cs"
10
+
11
+ #Following files have problems:
12
+ #137,
13
+ #22: Any
14
+ #148: Elipsis
15
+
16
+ def eval_script(path: str):
17
+ if ".cs" not in path.name:
18
+ return
19
+ basename = ".".join(str(path).split(".")[:-1])
20
+ binaryname = basename + ".exe"
21
+ build = subprocess.run(["csc", "/d:DEBUG", "-r:System.Numerics.dll", path, f"/out:{binaryname}"], capture_output=True)
22
+ status = None
23
+ returncode = -1
24
+ output = None
25
+ if build.returncode != 0:
26
+ # Well, it's a compile error. May be a type error or
27
+ # something. But, why break the set convention
28
+ status = "SyntaxError"
29
+ returncode = build.returncode
30
+ output = build
31
+ else:
32
+ try:
33
+ output = subprocess.run(["mono", binaryname], env={"PATH": os.getenv("PATH"), "MONO_TRACE_LISTENER":"Console.Error"}, capture_output=True, timeout=5)
34
+ returncode = output.returncode
35
+ output.stderr = str(output.stderr, "utf-8")
36
+ #mono return 0 even when failing
37
+ fail = "System.Diagnostics.DefaultTraceListener.Fail" in output.stderr or "Unhandled Exception" in output.stderr
38
+ output.returncode = 1 if fail else 0
39
+ if output.returncode == 0:
40
+ status = "OK"
41
+ else:
42
+ # Well, it's a panic
43
+ status = "Exception"
44
+ except subprocess.TimeoutExpired as exc:
45
+ status = "Timeout"
46
+ output = exc
47
+ os.remove(binaryname)
48
+
49
+ if output.stdout is not None:
50
+ output.stdout = output.stdout.decode("utf-8")
51
+ else:
52
+ output.stdout = "None"
53
+
54
+ if output.stderr == "":
55
+ output.stderr = "None"
56
+
57
+ return {
58
+ "status": status,
59
+ "exit_code": returncode,
60
+ "stdout": output.stdout,
61
+ "stderr": output.stderr,
62
+ }
63
+
64
+ if __name__ == "__main__":
65
+ main(eval_script, LANG_NAME, LANG_EXT)
src/eval_dart.py ADDED
@@ -0,0 +1,27 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+
5
+ def eval_script(path: Path):
6
+ r = run(["dart", "analyze", "--no-fatal-warnings", str(path)], timeout_seconds=15)
7
+ if r.exit_code != 0:
8
+ return {
9
+ "status": "SyntaxError",
10
+ "exit_code": r.exit_code,
11
+ "stdout": r.stdout,
12
+ "stderr": r.stderr,
13
+ }
14
+
15
+ r = run(["dart", str(path)], timeout_seconds=15)
16
+ if r.timeout:
17
+ status = "Timeout"
18
+ elif r.exit_code == 0:
19
+ status = "OK"
20
+ else:
21
+ status = "Exception"
22
+ return {
23
+ "status": status,
24
+ "exit_code": r.exit_code,
25
+ "stdout": r.stdout,
26
+ "stderr": r.stderr,
27
+ }
src/eval_dfy.py ADDED
@@ -0,0 +1,29 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ # 0 – success
5
+ # 1 – invalid command-line arguments
6
+ # 2 – syntax, parse, or name or type resolution errors
7
+ # 3 – compilation errors
8
+ # 4 – verification errors
9
+
10
+ def eval_script(path: Path):
11
+ r = run(["dafny", "run", str(path)])
12
+ if r.timeout:
13
+ status = "Timeout"
14
+ elif r.exit_code == 0:
15
+ status = "OK"
16
+ elif r.exit_code == 2:
17
+ status = "SyntaxError"
18
+ elif r.exit_code == 3:
19
+ status = "CompilationError"
20
+ elif r.exit_code == 4:
21
+ status = "VerificationError"
22
+ else:
23
+ status = "Exception"
24
+ return {
25
+ "status": status,
26
+ "exit_code": r.exit_code,
27
+ "stdout": r.stdout,
28
+ "stderr": r.stderr,
29
+ }
src/eval_dlang.py ADDED
@@ -0,0 +1,63 @@
1
+ import os
2
+ import subprocess
3
+ from pathlib import Path
4
+ from src.safe_subprocess import run
5
+ import sys
6
+ import re
7
+
8
+ ENABLE_SYNTAX_CHECK = False
9
+
10
+ def eval_script(path: Path):
11
+ result = run(["rdmd", "-unittest", str(path)], timeout_seconds=15)
12
+ if "might not be correctly installed" in result.stderr:
13
+ raise Exception("D is not correctly installed")
14
+
15
+ if result.timeout:
16
+ status = "Timeout"
17
+ elif result.exit_code == 0:
18
+ status = "OK"
19
+ elif "Error:" in result.stderr:
20
+ status = "SyntaxError"
21
+ else:
22
+ status = "Exception"
23
+
24
+ return {
25
+ "status": status,
26
+ "exit_code": result.exit_code,
27
+ "stdout": result.stdout,
28
+ "stderr": result.stderr,
29
+ }
30
+
31
+ DIR = "d-keep-code_davinci_001_temp_0.2"
32
+ def main():
33
+ directory = Path(Path(__file__).parent, "..", "datasets", DIR).resolve()
34
+
35
+ count = {"OK": 0, "Timeout": 0, "Exception": 0, "SyntaxError": 0}
36
+ for filename in os.listdir(directory):
37
+ path = Path.joinpath(directory, filename)
38
+ r = eval_script(path)
39
+ status = r["status"]
40
+ count[status] += 1
41
+
42
+ if ENABLE_SYNTAX_CHECK and status == "SyntaxError":
43
+ error_msgs = r["stderr"].split("\n")
44
+ with open(path) as source_file:
45
+ lines = source_file.readlines()
46
+ unittest_line_start = lines.index("unittest\n")
47
+ unittest_line_end = len(lines)
48
+ for err_msg_line in error_msgs:
49
+ matched_parts = re.match(r"(\/?.*?\.[\w:]+\/.*.d)\(([0-9]+)\): Error: (.*)", err_msg_line[2:-1])
50
+ _file, line_num = matched_parts[1], int(matched_parts[2])
51
+ if unittest_line_start <= line_num and line_num <= unittest_line_end:
52
+ print("===============")
53
+ print(path, "contains error in unit test part")
54
+ print(error_msgs)
55
+ print("===============")
56
+
57
+ filename = filename.split(".")[0]
58
+ print(f"Dlang,{filename},{status}")
59
+
60
+ print(DIR + ":" + str(count))
61
+
62
+ if __name__ == "__main__":
63
+ main()
src/eval_elixir.py ADDED
@@ -0,0 +1,37 @@
1
+ import argparse
2
+ from sys import exit
3
+ import subprocess
4
+ from pathlib import Path
5
+ from src.generic_eval import main as gmain
6
+
7
+
8
+ def eval_script(path: Path):
9
+ try:
10
+ # Assumes exit-code 0 is all okay
11
+ output = subprocess.run(["elixir", str(path)], capture_output=True, timeout=5)
12
+
13
+ if output.returncode == 0:
14
+ status = "OK"
15
+ else:
16
+ outmessage = str(output)
17
+ if "Assertion with == failed" in outmessage:
18
+ status = "AssertionError"
19
+ elif "SyntaxError" in outmessage:
20
+ status = "SyntaxError"
21
+ else:
22
+ status = "Exception"
23
+ returncode = output.returncode
24
+ except subprocess.TimeoutExpired as exc:
25
+ status = "Timeout"
26
+ output = exc
27
+ returncode = -1
28
+ return {
29
+ "status": status,
30
+ "exit_code": returncode,
31
+ "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
32
+ "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
33
+ }
34
+
35
+
36
+ if __name__ == "__main__":
37
+ gmain(eval_script, "Elixir", ".exs")
src/eval_fs.py ADDED
@@ -0,0 +1,17 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path: Path):
5
+ r = run(["dotnet", "fsi", "-d:DEBUG", str(path)])
6
+ if r.timeout:
7
+ status = "Timeout"
8
+ elif r.exit_code == 0:
9
+ status = "OK"
10
+ else:
11
+ status = "Exception"
12
+ return {
13
+ "status" : status,
14
+ "exit_code": r.exit_code,
15
+ "stdout": r.stdout,
16
+ "stderr": r.stderr,
17
+ }
src/eval_go.py ADDED
@@ -0,0 +1,52 @@
1
+ import argparse
2
+ from sys import exit
3
+ import subprocess
4
+ from pathlib import Path
5
+ import os
6
+ import tempfile
7
+ from src.generic_eval import main as gmain
8
+
9
+
10
+ def eval_script(path: Path):
11
+ status = None
12
+ stdout = None
13
+ stderr = None
14
+ exit_code = None
15
+ try:
16
+ # Create a temporary directory for the Go build cache
17
+ with tempfile.TemporaryDirectory() as temp_dir:
18
+ # Set Go environment variables so the build cache stays inside the temp directory
19
+ env = os.environ.copy()
20
+ env["GOCACHE"] = os.path.join(temp_dir, "go-build")
21
+ env["GOPATH"] = os.path.join(temp_dir, "gopath")
22
+
23
+ build = subprocess.run(["go", "test", path],
24
+ env=env,
25
+ timeout=30,
26
+ stdout=subprocess.PIPE,
27
+ stderr=subprocess.PIPE)
28
+
29
+ stdout = build.stdout.decode("utf-8", errors="ignore")
30
+ stderr = build.stderr.decode("utf-8", errors="ignore")
31
+ exit_code = build.returncode
32
+ # write to stderr just so that we can redirect stdout to a csv
33
+
34
+ if "[setup failed]" in stdout or "[build failed]" in stdout:
35
+ status = "SyntaxError"
36
+ elif "FAIL" in stdout:
37
+ status = "Exception"
38
+ else:
39
+ status = "OK"
40
+ except subprocess.TimeoutExpired:
41
+ status = "Timeout"
42
+
43
+ return {
44
+ "status": status,
45
+ "exit_code": exit_code,
46
+ "stdout": stdout,
47
+ "stderr": stderr,
48
+ }
49
+
50
+
51
+ if __name__ == "__main__":
52
+ gmain(eval_script, 'Go', '.go')
src/eval_hs.py ADDED
@@ -0,0 +1,19 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path: Path):
5
+ r = run(["runghc", str(path)])
6
+ if r.timeout:
7
+ status = "Timeout"
8
+ elif r.exit_code == 0:
9
+ status = "OK"
10
+ elif "Syntax error" in r.stderr:
11
+ status = "SyntaxError"
12
+ else:
13
+ status = "Exception"
14
+ return {
15
+ "status": status,
16
+ "exit_code": r.exit_code,
17
+ "stdout": r.stdout,
18
+ "stderr": r.stderr,
19
+ }
src/eval_java.py ADDED
@@ -0,0 +1,50 @@
1
+ import os
2
+ import tempfile
3
+ from src.safe_subprocess import run
4
+ from pathlib import Path
5
+ from src.generic_eval import main
6
+
7
+ LANG_NAME = "Java"
8
+ LANG_EXT = ".java"
9
+
10
+ #Following files have problems:
11
+ #137,
12
+ #22: Any
13
+ #148: Ellipsis
14
+
15
+ def eval_script(path: Path):
16
+
17
+ sys_env = os.environ.copy()
18
+ javatuples_path = Path("/usr/multiple/javatuples-1.2.jar")
19
+
20
+ sys_env["CLASSPATH"] = f"{javatuples_path}"
21
+
22
+ with tempfile.TemporaryDirectory() as outdir:
23
+ # Each Java file contains a class with the same name `JAVA_CLASS_NAME`
24
+ # Hence, javac will write the same JAVA_CLASS_NAME.class file for each problem
25
+ #Write class for each problem to a different temp dir
26
+ #Use UTF8 encoding with javac
27
+ result = run(["javac", "-encoding", "UTF8", "-d", outdir, path], env=sys_env)
28
+
29
+ if result.exit_code != 0:
30
+ # Well, it's a compile error. May be a type error or
31
+ # something. But, why break the set convention
32
+ status = "SyntaxError"
33
+ else:
34
+ result = run(["java", "-ea", "-cp", f"{outdir}:{javatuples_path}", "Problem"], env = sys_env)
35
+ if result.timeout:
36
+ status = "Timeout"
37
+ elif result.exit_code == 0:
38
+ status = "OK"
39
+ else:
40
+ status = "Exception"
41
+
42
+ return {
43
+ "status": status,
44
+ "exit_code": result.exit_code,
45
+ "stdout": result.stdout,
46
+ "stderr": result.stderr,
47
+ }
48
+
49
+ if __name__ == "__main__":
50
+ main(eval_script, LANG_NAME, LANG_EXT)
src/eval_javascript.py ADDED
@@ -0,0 +1,49 @@
1
+ import os
2
+ import subprocess
3
+ from pathlib import Path
4
+
5
+ def eval_script(path: Path):
6
+ try:
7
+ # Assumes exit-code 0 is all okay
8
+ output = subprocess.run(["node", str(path)], capture_output=True, timeout=5)
9
+
10
+ if output.returncode == 0:
11
+ status = "OK"
12
+ else:
13
+ outmessage = str(output)
14
+ if 'ERR_ASSERTION' in outmessage:
15
+ status = "AssertionError"
16
+ elif 'SyntaxError' in outmessage:
17
+ status = "SyntaxError"
18
+ elif 'ReferenceError' in outmessage:
19
+ status = "ReferenceError"
20
+ else:
21
+ status = "Exception"
22
+ returncode = output.returncode
23
+ except subprocess.TimeoutExpired as exc:
24
+ status = "Timeout"
25
+ output = exc
26
+ returncode = -1
27
+ except subprocess.CalledProcessError as exc:
28
+ status = "Exception"
29
+ returncode = exc.returncode
30
+ output = exc
31
+ return {
32
+ "status": status,
33
+ "exit_code": returncode,
34
+ "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
35
+ "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
36
+ }
37
+
38
+
39
+
40
+ def main():
41
+ directory = Path(Path(__file__).parent, "..", "datasets", "js-keep-code_davinci_001_temp_0.2").resolve()
42
+
43
+ for filename in os.listdir(directory):
44
+ r = eval_script(Path.joinpath(directory,filename))
45
+ filename = filename.split(".")[0]
46
+ print(f"JavaScript,{filename},{r['status']}")
47
+
48
+ if __name__ == "__main__":
49
+ main()
src/eval_julia.py ADDED
@@ -0,0 +1,21 @@
1
+ from src.safe_subprocess import run
2
+ from pathlib import Path
3
+
4
+ def eval_script(path: Path):
5
+ result = run(["julia", str(path)], timeout_seconds=5)
6
+ if result.timeout:
7
+ status = "Timeout"
8
+ elif result.exit_code == 0:
9
+ status = "OK"
10
+ # TODO(arjun): I would like this to be reviewed more carefully by John.
11
+ elif len(result.stderr) < 1:
12
+ status = "Exception"
13
+ else:
14
+ status = "SyntaxError"
15
+
16
+ return {
17
+ "status": status,
18
+ "exit_code": result.exit_code,
19
+ "stdout": result.stdout,
20
+ "stderr": result.stderr,
21
+ }
src/eval_lean.py ADDED
@@ -0,0 +1,29 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+ import subprocess
4
+
5
+ def eval_script(path: Path):
6
+ # since lean is a theorem prover first and not a programming environment,
7
+ # the return code is always 1. idk.
8
+ try:
9
+ output = subprocess.run(["lean", str(path)], capture_output=True, timeout=5)
10
+ outmessage = str(output)
11
+
12
+ if "error: tactic 'rfl' failed" in outmessage: # :skull:
13
+ status = "AssertionError"
14
+ elif outmessage == "":
15
+ status = "OK"
16
+ else:
17
+ status = "SyntaxError"
18
+ returncode = output.returncode
19
+
20
+ except subprocess.TimeoutExpired as exc:
21
+ status = "Timeout"
22
+ output = exc
23
+ returncode = -1
24
+ return {
25
+ "status": status,
26
+ "exit_code": returncode,
27
+ "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
28
+ "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
29
+ }
src/eval_lua.py ADDED
@@ -0,0 +1,17 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path: Path):
5
+ r = run(["lua", str(path)])
6
+ if r.timeout:
7
+ status = "Timeout"
8
+ elif r.exit_code == 0:
9
+ status = "OK"
10
+ else:
11
+ status = "Exception"
12
+ return {
13
+ "status": status,
14
+ "exit_code": r.exit_code,
15
+ "stdout": r.stdout,
16
+ "stderr": r.stderr,
17
+ }
src/eval_luau.py ADDED
@@ -0,0 +1,26 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+
5
+ def eval_script(path: Path):
6
+ r = run(["luau-analyze", str(path)])
7
+ if r.timeout:
8
+ status = "Timeout"
9
+ elif r.exit_code == 0:
10
+ r = run(["luau", str(path)])
11
+ if r.timeout:
12
+ status = "Timeout"
13
+ elif r.exit_code == 0:
14
+ status = "OK"
15
+ else:
16
+ status = "Exception"
17
+ elif "SyntaxError" in r.stderr:
18
+ status = "SyntaxError"
19
+ else:
20
+ status = "TypeError"
21
+ return {
22
+ "status": status,
23
+ "exit_code": r.exit_code,
24
+ "stdout": r.stdout,
25
+ "stderr": r.stderr,
26
+ }
src/eval_matlab.py ADDED
@@ -0,0 +1,53 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path):
5
+ # Matlab has the requirement that all functions must appear at the end
6
+ # of the file. So we first have to write the call to the test-function at the
7
+ # beginning of the file.
8
+ with open(path, 'r') as f:
9
+ content = f.read()
10
+ content = f"test();\n{content}"
11
+ with open(path, 'w') as f:
12
+ f.write(content)
13
+ filename = path.stem
14
+ parent_dir = path.parent.absolute()
15
+
16
+ # We use the matlab.engine to run the script; however, the way that the
17
+ # matlab engine works requires that we call the script as if it were a
18
+ # member of the matlab.engine object. So we have to write a python script
19
+ # that calls the matlab script. This also ensures that the script is called
20
+ # in a safe-subprocess. Who needs runtime reflection when you have IPC?
21
+ program= f"""
22
+ import matlab.engine
23
+ import io
24
+ import sys
25
+ out = io.StringIO()
26
+ err = io.StringIO()
27
+ eng = matlab.engine.start_matlab()
28
+ eng.addpath(r'{parent_dir}',nargout=0)
29
+ try:
30
+ r = eng.{filename}(nargout=0, stdout=out,stderr=err)
31
+ print(out.getvalue())
32
+ except matlab.engine.MatlabExecutionError as e:
33
+ print(err.getvalue(), file=sys.stderr)
34
+ """
35
+ r = run(["python3", "-c", program], timeout_seconds=30)
36
+
37
+ # This is still somewhat brittle.
38
+ if r.timeout:
39
+ status = "Timeout"
40
+ exit_code = -1
41
+ elif r.stderr == "":
42
+ status = "OK"
43
+ exit_code = 0
44
+ else:
45
+ status = "Exception"
46
+ exit_code = 1
47
+
48
+ return {
49
+ "status": status,
50
+ "exit_code": exit_code,
51
+ "stdout": r.stdout,
52
+ "stderr": r.stderr,
53
+ }
src/eval_ocaml.py ADDED
@@ -0,0 +1,21 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path: Path):
5
+ r = run(["ocaml", str(path)])
6
+ if r.timeout:
7
+ status = "Timeout"
8
+ elif r.exit_code == 0:
9
+ status = "OK"
10
+ elif "Assert_failure" in r.stderr:
11
+ status = "AssertionError"
12
+ elif "Syntax error" in r.stderr:
13
+ status = "SyntaxError"
14
+ else:
15
+ status = "Exception"
16
+ return {
17
+ "status": status,
18
+ "exit_code": r.exit_code,
19
+ "stdout": r.stdout,
20
+ "stderr": r.stderr,
21
+ }
src/eval_php.py ADDED
@@ -0,0 +1,20 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ LANG_NAME = "PHP"
5
+ LANG_EXT = ".php"
6
+
7
+ def eval_script(path: Path):
8
+ r = run(["php", path])
9
+ if "PHP Parse error" in r.stdout:
10
+ status = "SyntaxError"
11
+ elif r.exit_code != 0:
12
+ status = "Exception"
13
+ else:
14
+ status = "OK"
15
+ return {
16
+ "status": status,
17
+ "exit_code": r.exit_code,
18
+ "stdout": r.stdout,
19
+ "stderr": r.stderr,
20
+ }
src/eval_pl.py ADDED
@@ -0,0 +1,20 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path: Path):
5
+ r = run(["perl", path])
6
+
7
+ if r.timeout:
8
+ status = "Timeout"
9
+ elif r.exit_code != 0:
10
+ status = "Exception"
11
+ elif "ERROR" in r.stdout or "ERROR" in r.stderr:
12
+ status = "Exception"
13
+ else:
14
+ status = "OK"
15
+ return {
16
+ "status": status,
17
+ "exit_code": r.exit_code,
18
+ "stdout": r.stdout,
19
+ "stderr": r.stderr,
20
+ }
src/eval_python.py ADDED
@@ -0,0 +1,19 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ def eval_script(path: Path):
5
+ r = run(["python3", str(path)])
6
+ if r.timeout:
7
+ status = "Timeout"
8
+ elif r.exit_code == 0:
9
+ status = "OK"
10
+ elif "SyntaxError" in r.stderr:
11
+ status = "SyntaxError"
12
+ else:
13
+ status = "Exception"
14
+ return {
15
+ "status" : status,
16
+ "exit_code": r.exit_code,
17
+ "stdout": r.stdout,
18
+ "stderr": r.stderr,
19
+ }
src/eval_r.py ADDED
@@ -0,0 +1,47 @@
1
+ import os
2
+ import subprocess
3
+ from pathlib import Path
4
+
5
+ def eval_script(path: Path):
6
+ try:
7
+ # Assumes exit-code 0 is all okay
8
+ # Run R on the file, capturing stderr
9
+ output = subprocess.run(["Rscript", str(path)], capture_output=True, timeout=5)
10
+ if output.returncode == 0:
11
+ status = "OK"
12
+ else:
13
+ outmessage = str(output)
14
+ if 'unexpected' in outmessage:
15
+ status = "SyntaxError"
16
+ elif "err=b''" in outmessage:
17
+ status = "AssertionError"
18
+ else:
19
+ status = "Exception"
20
+ returncode = output.returncode
21
+ except subprocess.TimeoutExpired as exc:
22
+ status = "Timeout"
23
+ output = exc
24
+ returncode = -1
25
+ except subprocess.CalledProcessError as exc:
26
+ status = "Exception"
27
+ returncode = exc.returncode
28
+ output = exc
29
+ return {
30
+ "status": status,
31
+ "exit_code": returncode,
32
+ "stdout": output.stdout,
33
+ "stderr": output.stderr
34
+ }
35
+
36
+
37
+
38
+ def main():
39
+ directory = Path(Path(__file__).parent, "..", "datasets", "R-keep-code_davinci_001_temp_0.2").resolve()
40
+
41
+ for filename in os.listdir(directory):
42
+ r = eval_script(Path.joinpath(directory,filename))
43
+ filename = filename.split(".")[0]
44
+ print(f"R,{filename},{r['status']}")
45
+
46
+ if __name__ == "__main__":
47
+ main()
src/eval_racket.py ADDED
@@ -0,0 +1,49 @@
1
+ """
2
+ Evaluates a generated Racket program (.rkt).
3
+ """
4
+ import os
5
+ from pathlib import Path
6
+ from src.safe_subprocess import run
7
+ from src.libeval import run_without_exn
8
+
9
+
10
+ def eval_script(path: Path):
11
+ result = run(["racket", str(path)])
12
+
13
+ if (
14
+ "standard-module-name-resolver: collection not found\n for module path: rackunit"
15
+ in result.stderr
16
+ ):
17
+ print(f"Failed to run evaluation for {path}: rackunit is not installed")
18
+ return None
19
+
20
+ # rackunit produces exit code 0 even if tests fail.
21
+ if len(result.stderr) > 0 or result.exit_code != 0:
22
+ if "read-syntax" in result.stderr:
23
+ status = "SyntaxError"
24
+ else:
25
+ status = "Exception"
26
+ else:
27
+ status = "OK"
28
+
29
+ return {
30
+ "status": status,
31
+ "exit_code": result.exit_code,
32
+ "stdout": result.stdout,
33
+ "stderr": result.stderr,
34
+ }
35
+
36
+
37
+ def main():
38
+ directory = Path(
39
+ Path(__file__).parent, "..", "datasets", "racket-keep-code_davinci_001_temp_0.2"
40
+ ).resolve()
41
+
42
+ for filename in os.listdir(directory):
43
+ r = eval_script(Path.joinpath(directory, filename))
44
+ filename = filename.split(".")[0]
45
+ print(f"Racket,{filename},{r['status']}")
46
+
47
+
48
+ if __name__ == "__main__":
49
+ main()
src/eval_ruby.py ADDED
@@ -0,0 +1,43 @@
1
+ import argparse
2
+ from sys import exit
3
+ import subprocess
4
+ from pathlib import Path
5
+ from src.generic_eval import main as gmain
6
+
7
+ def eval_script(path: Path):
8
+ try:
9
+ # Assumes exit-code 0 is all okay
10
+ # Need check=True for Ruby to pass errors to CalledProcessError
11
+ output = subprocess.run(
12
+ ["ruby", path], check=True, capture_output=True, timeout=5
13
+ )
14
+ if output.returncode == 0:
15
+ status = "OK"
16
+ out = output.stdout
17
+ error = output.stderr
18
+ returncode = 0
19
+ else:
20
+ raise Exception("there's an issue with check = True for Ruby, INVESTIGATE!")
21
+ except subprocess.TimeoutExpired as exc:
22
+ status = "Timeout"
23
+ out = exc.stdout
24
+ error = exc.stderr
25
+ returncode = -1
26
+ except subprocess.CalledProcessError as exc:
27
+ returncode = exc.returncode
28
+ out = exc.stdout
29
+ error = exc.stderr
30
+ #failure with code 1 but no error message is an Exception from Failed tests
31
+ if len(error) < 1:
32
+ status = "Exception"
33
+ else: #everything that prints out an error message is a SyntaxError
34
+ status = "SyntaxError"
35
+ return {
36
+ "status": status,
37
+ "exit_code": returncode,
38
+ "stdout": out,
39
+ "stderr": error,
40
+ }
41
+
42
+ if __name__ == "__main__":
43
+ gmain(eval_script, 'Ruby', '.rb')
src/eval_rust.py ADDED
@@ -0,0 +1,53 @@
1
+ import os
2
+ import subprocess
3
+ import tempfile
4
+ from pathlib import Path
5
+ from src.generic_eval import main
6
+
7
+ LANG_NAME = "Rust"
8
+ LANG_EXT = ".rs"
9
+
10
+ def eval_script(path: Path):
11
+ basename = ".".join(str(path).split(".")[:-1])
12
+ try:
13
+ build = subprocess.run(["rustc", path, "-o", basename], capture_output=True, timeout=15)
14
+ except subprocess.TimeoutExpired as exc:
15
+ return {
16
+ "status": "Timeout",
17
+ "exit_code": -1,
18
+ "stdout": "Compiler timeout",
19
+ "stderr": "Compiler timeout",
20
+ }
21
+ status = None
22
+ returncode = -1
23
+ output = None
24
+ if build.returncode != 0:
25
+ # Well, it's a compile error. May be a type error or
26
+ # something. But, why break the set convention
27
+ status = "SyntaxError"
28
+ returncode = build.returncode
29
+ output = build
30
+ else:
31
+ try:
32
+ # Assumes exit-code 0 is all okay
33
+ output = subprocess.run([basename], capture_output=True, timeout=5)
34
+ returncode = output.returncode
35
+ if output.returncode == 0:
36
+ status = "OK"
37
+ else:
38
+ # Well, it's a panic
39
+ status = "Exception"
40
+ except subprocess.TimeoutExpired as exc:
41
+ status = "Timeout"
42
+ output = exc
43
+ os.remove(basename)
44
+ return {
45
+ "status": status,
46
+ "exit_code": returncode,
47
+ "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
48
+ "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
49
+ }
50
+
51
+ if __name__ == "__main__":
52
+ main(eval_script, LANG_NAME, LANG_EXT)
53
+
src/eval_scala.py ADDED
@@ -0,0 +1,37 @@
1
+ from pathlib import Path
2
+ import tempfile
3
+ from src.safe_subprocess import run
4
+
5
+ LANG_NAME = "Scala"
6
+ LANG_EXT = ".scala"
7
+
8
+ def eval_script(path: Path):
9
+ with tempfile.TemporaryDirectory() as outdir:
10
+ # Each Scala file contains a class with the same name (`JAVA_CLASS_NAME`).
11
+ # Hence, scalac emits the same JAVA_CLASS_NAME.class file for each problem,
12
+ # so we write the class for each problem to a different temp dir.
13
+ build = run(["scalac", "-d", outdir, path], timeout_seconds=45)
14
+ if build.exit_code != 0:
15
+ # Well, it's a compile error. May be a type error or
16
+ # something. But, why break the set convention
17
+ return {
18
+ "status": "SyntaxError",
19
+ "exit_code": build.exit_code,
20
+ "stdout": build.stdout,
21
+ "stderr": build.stderr,
22
+ }
23
+ # "Problem" is the name of the class we emit.
24
+ r = run(["scala", "-cp", f"{outdir}", "Problem"])
25
+ if r.timeout:
26
+ status = "Timeout"
27
+ elif r.exit_code == 0 and r.stderr == "":
28
+ status = "OK"
29
+ else:
30
+ # Well, it's a panic
31
+ status = "Exception"
32
+ return {
33
+ "status": status,
34
+ "exit_code": r.exit_code,
35
+ "stdout": r.stdout,
36
+ "stderr": r.stderr,
37
+ }
src/eval_sh.py ADDED
@@ -0,0 +1,24 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+ LANG_NAME = "bash"
5
+ LANG_EXT = ".sh"
6
+
7
+ def eval_script(path: Path):
8
+ # Capture output - will be generated regardless of success, fail, or syntax error
9
+ p = run(["bash", path])
10
+ if p.timeout:
11
+ status = "Timeout"
12
+ elif p.exit_code == 0:
13
+ status = "OK"
14
+ elif "syntax error" in p.stderr:
15
+ status = "SyntaxError"
16
+ else:
17
+ status = "Exception"
18
+
19
+ return {
20
+ "status": status,
21
+ "exit_code": p.exit_code,
22
+ "stdout": p.stdout,
23
+ "stderr": p.stderr,
24
+ }
src/eval_swift.py ADDED
@@ -0,0 +1,30 @@
1
+ import subprocess
2
+ from pathlib import Path
3
+ import os
4
+ from src.safe_subprocess import run
5
+
6
+ def eval_script(path: Path):
7
+ basename = ".".join(str(path).split(".")[:-1])
8
+ r = run(["swiftc", path, "-o", basename], timeout_seconds=45)
9
+ if r.timeout:
10
+ status = "Timeout"
11
+ elif r.exit_code != 0:
12
+ # Well, it's a compile error. May be a type error or
13
+ # something. But, why break the set convention
14
+ status = "SyntaxError"
15
+ else:
16
+ r = run([basename], timeout_seconds=5)
17
+ if r.timeout:
18
+ status = "Timeout"
19
+ elif r.exit_code != 0:
20
+ # Well, it's a panic
21
+ status = "Exception"
22
+ else:
23
+ status = "OK"
24
+ os.remove(basename)
25
+ return {
26
+ "status": status,
27
+ "exit_code": r.exit_code,
28
+ "stdout": r.stdout,
29
+ "stderr": r.stderr,
30
+ }
src/eval_ts.py ADDED
@@ -0,0 +1,33 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+
4
+
5
+ def eval_script(path: Path):
6
+ r = run(["tsc", "--target", "esnext", str(path)], timeout_seconds=15)
7
+ if r.exit_code != 0:
8
+ return {
9
+ "status": "SyntaxError",
10
+ "exit_code": r.exit_code,
11
+ "stdout": r.stdout,
12
+ "stderr": r.stderr,
13
+ }
14
+
15
+ r = run(["node", str(path).replace(".ts", ".js")], timeout_seconds=15)
16
+ if r.timeout:
17
+ status = "Timeout"
18
+ elif r.exit_code == 0:
19
+ status = "OK"
20
+ elif "ERR_ASSERTION" in r.stderr:
21
+ status = "AssertionError"
22
+ elif "SyntaxError" in r.stderr:
23
+ status = "SyntaxError"
24
+ elif "ReferenceError" in r.stderr:
25
+ status = "ReferenceError"
26
+ else:
27
+ status = "Exception"
28
+ return {
29
+ "status": status,
30
+ "exit_code": r.exit_code,
31
+ "stdout": r.stdout,
32
+ "stderr": r.stderr,
33
+ }
src/eval_v.py ADDED
@@ -0,0 +1,40 @@
1
+ from pathlib import Path
2
+ from src.safe_subprocess import run
3
+ import subprocess
4
+
5
+ # return codes for coqc:
6
+ # 0: compilation goes through
7
+ # 1: some sort of error (nondescript)
8
+
9
+ def eval_script(path: Path):
10
+ cleanup_extensions = ['.vo', '.vok', '.vos']
11
+
12
+ try:
13
+ # sadly there seems to be no way to verify proofs in a coq file without compiling
14
+ output = subprocess.run(["coqc", "-noglob", str(path)], capture_output=True, timeout=5)
15
+ outmessage = str(output)
16
+
17
+ if output.returncode == 0:
18
+ status = "OK"
19
+ # cleanup: remove files generated by coqc
20
+ for ext in cleanup_extensions:
21
+ file_to_remove = path.with_suffix(ext)
22
+ if file_to_remove.exists():
23
+ file_to_remove.unlink()
24
+
25
+ elif "Unable to unify" in outmessage:
26
+ status = "AssertionError"
27
+ else:
28
+ status = "SyntaxError"
29
+ returncode = output.returncode
30
+
31
+ except subprocess.TimeoutExpired as exc:
32
+ status = "Timeout"
33
+ output = exc
34
+ returncode = -1
35
+ return {
36
+ "status": status,
37
+ "exit_code": returncode,
38
+ "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
39
+ "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
40
+ }
src/generic_eval.py ADDED
@@ -0,0 +1,149 @@
1
+ # This is a helper script for evaluating benchmarks that have been translated to
2
+ # different languages.
3
+ #
4
+ # To use this script, call one of the per-language wrappers (e.g. eval_php.py, eval_ruby.py).
5
+ # The --directory argument is required, and tells the script where the benchmarks are located.
6
+ # The --files argument is optional, and takes a list of numbers corresponding to the files to be evaluated.
7
+ #
8
+ # The script will print the results on each benchmark, and also write to results/lang.csv.
9
+ # When the script completes, it will print a summary.
10
+ #
11
+ # Examples
12
+ #
13
+ # To run the entire benchmark suite:
14
+ # python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/
15
+ #
16
+ # To run benchmarks 1, 2, and 3:
17
+ # python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ --files 1 2 3
18
+
19
+ import argparse
20
+ from sys import exit as sysexit
21
+ from pathlib import Path
22
+ import sys
23
+
24
+ def list_files(directory, ext):
25
+ files_unsorted = directory.glob(f"HumanEval_*{ext}")
26
+ # assumption: base filenames are in the format of HumanEval_X_*
27
+ # Where X is a valid number
28
+ def key(s):
29
+ return int(str(s.name).split("_")[1])
30
+ files_sorted = sorted(files_unsorted, key=(lambda s: key(s)))
31
+
32
+ # assumption: there may be missing files, but no extra files
33
+ # so we build files_array where the index corresponds to the file's number,
34
+ # and a missing file is represented by None
35
+ size = key(files_sorted[-1]) + 1
36
+ files_array = [None] * size
37
+ for f in files_sorted:
38
+ k = key(f)
39
+ files_array[k] = f
40
+
41
+ return files_array
42
+
43
+ def main(eval_script, language, extension):
44
+ args = argparse.ArgumentParser()
45
+
46
+ args.add_argument(
47
+ "--directory", type=str, required=True, help="Directory to read benchmarks from"
48
+ )
49
+ args.add_argument(
50
+ "--files",
51
+ type=int,
52
+ nargs="*",
53
+ default=[],
54
+ help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
55
+ )
56
+ args = args.parse_args()
57
+
58
+ directory = Path(args.directory).resolve()
59
+
60
+ files_sorted = list_files(directory, extension)
61
+
62
+ # the directory you specified does not contain the right language
63
+ if len(files_sorted) == 0:
64
+ print(f'The specified directory does not contain files of type {extension}')
65
+ sysexit(1)
66
+
67
+ files_index = []
68
+ if len(args.files) > 0:
69
+ files_index = args.files
70
+ else:
71
+ files_index = range(len(files_sorted))
72
+
73
+ total = 0
74
+ passed = 0
75
+ syntax_error = 0
76
+
77
+ results_file = Path(Path(__file__).parent, "..", "results", language.lower() + ".csv").resolve()
78
+
79
+ with open(results_file, "w") as f:
80
+ for i in files_index:
81
+ filepath = files_sorted[i]
82
+ if filepath is None:
83
+ print("File {} does not exist!".format(i))
84
+ continue
85
+ res = eval_script(filepath)
86
+ output = f"{language},{filepath.stem},{res['status']}\n"
87
+ f.write(output)
88
+ print(output, end="")
89
+ total += 1
90
+ if res['status'] == "OK":
91
+ passed += 1
92
+ elif res['status'] == "SyntaxError":
93
+ syntax_error += 1
94
+ print (f"Total {total}, Syntax Error {syntax_error}, Passed {passed}")
95
+
96
+
97
+
98
+ def main_check_stubs(check_script, language, extension):
99
+ args = argparse.ArgumentParser()
100
+
101
+ args.add_argument(
102
+ "--directory", type=str, required=True, help="Directory to read benchmarks from"
103
+ )
104
+ args.add_argument(
105
+ "--files",
106
+ type=int,
107
+ nargs="*",
108
+ default=[],
109
+ help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2"
110
+ )
111
+ args = args.parse_args()
112
+
113
+ directory = Path(args.directory).resolve()
114
+
115
+ files_sorted = list_files(directory, extension)
116
+
117
+ # the directory you specified does not contain the right language
118
+ if len(files_sorted) == 0:
119
+ print(f'The specified directory does not contain files of type {extension}')
120
+ sysexit(1)
121
+
122
+ files_index = []
123
+ if len(args.files) > 0:
124
+ files_index = args.files
125
+ else:
126
+ files_index = range(len(files_sorted))
127
+
128
+ total = 0
129
+ passed = 0
130
+
131
+ results_file = Path(Path(__file__).parent, "..", "check_results", language.lower() + ".csv").resolve()
132
+
133
+ with open(results_file, "w") as f:
134
+ for i in files_index:
135
+ filepath = files_sorted[i]
136
+ if filepath is None:
137
+ print("File {} does not exist!".format(i))
138
+ continue
139
+ res = check_script(filepath)
140
+ output = f"{language},{filepath.stem},{res['status']}\n"
141
+ f.write(output)
142
+ print(output, end="")
143
+ total += 1
144
+ if res['status'] == "OK":
145
+ passed += 1
146
+ print (f"Total {total}, Passed {passed}")
147
+
148
+ if total != passed:
149
+ sys.exit(1)
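generic_eval.main is the shared driver: a per-language wrapper only has to supply an eval_script function, a display name, and a file extension. A minimal sketch of a hypothetical wrapper for a new language (eval_lua.py and the lua binary are illustrative, not part of this commit):

    # Hypothetical eval_lua.py showing how a new language plugs into generic_eval.main.
    from pathlib import Path
    from src.generic_eval import main
    from src.safe_subprocess import run

    def eval_script(path: Path):
        r = run(["lua", str(path)], timeout_seconds=5)  # assumes a `lua` interpreter on PATH
        if r.timeout:
            status = "Timeout"
        elif r.exit_code == 0:
            status = "OK"
        else:
            status = "Exception"
        return {
            "status": status,
            "exit_code": r.exit_code,
            "stdout": r.stdout,
            "stderr": r.stderr,
        }

    if __name__ == "__main__":
        main(eval_script, "Lua", ".lua")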
src/libeval.py ADDED
@@ -0,0 +1,40 @@
1
+ import os
2
+ import signal
3
+ import subprocess
4
+ from typing import List
5
+ from . import generic_eval
6
+
7
+ def testing_mail(x, y, z):
8
+ generic_eval.main(x, y, z)
9
+
10
+ def run_without_exn(args: List[str]):
11
+ """
12
+ Runs the given program with a five second timeout. Does not throw an exception
13
+ no matter what happens. The output is a dictionary of the format that we expect
14
+ for our evaluation scripts. The "status" field is "OK" when the exit code is
15
+ zero. If that isn't enough, you may want to tweak the status based on the
16
+ captured stderr and stdout.
17
+ """
18
+ p = subprocess.Popen(
19
+ args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, start_new_session=True
20
+ )
21
+ try:
22
+ stdout, stderr = p.communicate(timeout=5)
23
+ exit_code = p.returncode
24
+ status = "OK" if exit_code == 0 else "Exception"
25
+ except subprocess.TimeoutExpired as exc:
26
+ stdout, stderr = p.stdout.read(), p.stderr.read()
27
+ os.killpg(os.getpgid(p.pid), signal.SIGTERM)
28
+ exit_code = -1
29
+ status = "Timeout"
30
+
31
+ if stdout is None:
32
+ stdout = b""
33
+ if stderr is None:
34
+ stderr = b""
35
+ return {
36
+ "status": status,
37
+ "exit_code": exit_code,
38
+ "stdout": stdout.decode("utf-8", errors="ignore"),
39
+ "stderr": stderr.decode("utf-8", errors="ignore"),
40
+ }
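run_without_exn is a convenience wrapper with a fixed five-second timeout that never raises, returning the same result dictionary as the eval modules. A minimal usage sketch (the command is illustrative):

    # Illustrative usage of libeval.run_without_exn.
    from src.libeval import run_without_exn

    res = run_without_exn(["python3", "-c", "print(6 * 7)"])
    print(res["status"], res["exit_code"], res["stdout"])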
src/safe_subprocess/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ /__pycache__
2
+ /.pytest_cache
src/safe_subprocess/__init__.py ADDED
@@ -0,0 +1,91 @@
1
+ import os
2
+ import signal
3
+ import fcntl
4
+ import time
5
+ import subprocess
6
+ from typing import List
7
+
8
+ MAX_BYTES_PER_READ = 1024
9
+ SLEEP_BETWEEN_READS = 0.1
10
+
11
+
12
+ class Result:
13
+ timeout: int
14
+ exit_code: int
15
+ stdout: str
16
+ stderr: str
17
+
18
+ def __init__(self, timeout, exit_code, stdout, stderr):
19
+ self.timeout = timeout
20
+ self.exit_code = exit_code
21
+ self.stdout = stdout
22
+ self.stderr = stderr
23
+
24
+
25
+ def set_nonblocking(reader):
26
+ fd = reader.fileno()
27
+ fl = fcntl.fcntl(fd, fcntl.F_GETFL)
28
+ fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
29
+
30
+
31
+ def run(
32
+ args: List[str],
33
+ timeout_seconds: int = 15,
34
+ max_output_size: int = 2048,
35
+ env = None,
36
+ cwd: str | None = None
37
+ ) -> Result:
38
+ """
39
+ Runs the given program with arguments. After the timeout elapses, kills the process
40
+ and all other processes in the process group. Captures at most max_output_size bytes
41
+ of stdout and stderr each, and discards any output beyond that.
42
+ """
43
+ p = subprocess.Popen(
44
+ args,
45
+ env=env,
46
+ stdin=subprocess.DEVNULL,
47
+ stdout=subprocess.PIPE,
48
+ stderr=subprocess.PIPE,
49
+ start_new_session=True,
50
+ bufsize=MAX_BYTES_PER_READ,
51
+ cwd=cwd
52
+ )
53
+ set_nonblocking(p.stdout)
54
+ set_nonblocking(p.stderr)
55
+
56
+ process_group_id = os.getpgid(p.pid)
57
+
58
+ # We sleep for 0.1 seconds in each iteration.
59
+ max_iterations = timeout_seconds * 10
60
+ stdout_saved_bytes = []
61
+ stderr_saved_bytes = []
62
+ stdout_bytes_read = 0
63
+ stderr_bytes_read = 0
64
+
65
+ for _ in range(max_iterations):
66
+ this_stdout_read = p.stdout.read(MAX_BYTES_PER_READ)
67
+ this_stderr_read = p.stderr.read(MAX_BYTES_PER_READ)
68
+ # this_stdout_read and this_stderr_read may be None if stdout or stderr
69
+ # are closed. Without these checks, test_close_output fails.
70
+ if this_stdout_read is not None and stdout_bytes_read < max_output_size:
71
+ stdout_saved_bytes.append(this_stdout_read)
72
+ stdout_bytes_read += len(this_stdout_read)
73
+ if this_stderr_read is not None and stderr_bytes_read < max_output_size:
74
+ stderr_saved_bytes.append(this_stderr_read)
75
+ stderr_bytes_read += len(this_stderr_read)
76
+ exit_code = p.poll()
77
+ if exit_code is not None:
78
+ break
79
+ time.sleep(SLEEP_BETWEEN_READS)
80
+
81
+ try:
82
+ # Kills the process group. Without this line, test_fork_once fails.
83
+ os.killpg(process_group_id, signal.SIGKILL)
84
+ except ProcessLookupError:
85
+ pass
86
+
87
+ timeout = exit_code is None
88
+ exit_code = exit_code if exit_code is not None else -1
89
+ stdout = b"".join(stdout_saved_bytes).decode("utf-8", errors="ignore")
90
+ stderr = b"".join(stderr_saved_bytes).decode("utf-8", errors="ignore")
91
+ return Result(timeout=timeout, exit_code=exit_code, stdout=stdout, stderr=stderr)
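The language eval modules call this run() helper instead of subprocess directly so that runaway programs are killed as a process group and their output is capped. A minimal usage sketch (the command and limits are illustrative):

    # Illustrative usage of src.safe_subprocess.run.
    from src.safe_subprocess import run

    result = run(["python3", "-c", "print('hello')"], timeout_seconds=5, max_output_size=1024)
    if result.timeout:
        print("timed out")
    else:
        print(result.exit_code, result.stdout.strip(), result.stderr)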
src/safe_subprocess/evil_programs/block_on_inputs.py ADDED
@@ -0,0 +1,2 @@
1
+ while True:
2
+ input()
src/safe_subprocess/evil_programs/close_outputs.py ADDED
@@ -0,0 +1,7 @@
1
+ import sys
2
+
3
+ print("This is the end")
4
+ sys.stdout.close()
5
+ sys.stderr.close()
6
+ while True:
7
+ pass
src/safe_subprocess/evil_programs/fork_bomb.py ADDED
@@ -0,0 +1,4 @@
1
+ import os
2
+
3
+ while True:
4
+ os.fork()
src/safe_subprocess/evil_programs/fork_once.py ADDED
@@ -0,0 +1,6 @@
1
+ import os
2
+ import time
3
+
4
+ if os.fork() == 0:
5
+ while True:
6
+ time.sleep(60)
src/safe_subprocess/evil_programs/sleep_forever.py ADDED
@@ -0,0 +1,4 @@
1
+ import time
2
+
3
+ while True:
4
+ time.sleep(60)
src/safe_subprocess/evil_programs/unbounded_output.py ADDED
@@ -0,0 +1,4 @@
1
+ b = True
2
+ while True:
3
+ print(b)
4
+ b = not b
src/safe_subprocess/module_test.py ADDED
@@ -0,0 +1,103 @@
1
+ from . import run
2
+ import time
3
+ from pathlib import Path
4
+
5
+ ROOT = Path(__file__).resolve().parent / "evil_programs"
6
+
7
+
8
+ def assert_no_running_evil():
9
+ result = run(
10
+ ["pgrep", "-f", ROOT], timeout_seconds=1, max_output_size=1024
11
+ )
12
+ assert (
13
+ result.exit_code == 1
14
+ ), f"There are still evil processes running: {result.stdout}"
15
+ assert len(result.stderr) == 0
16
+ assert len(result.stdout) == 0
17
+
18
+
19
+ def test_fork_once():
20
+ # The program exits cleanly and immediately. But, it forks a child that runs
21
+ # forever.
22
+ result = run(
23
+ ["python3", ROOT / "fork_once.py"],
24
+ timeout_seconds=2,
25
+ max_output_size=1024,
26
+ )
27
+ assert result.exit_code == 0
28
+ assert result.timeout == False
29
+ assert len(result.stderr) == 0
30
+ assert len(result.stdout) == 0
31
+ assert_no_running_evil()
32
+
33
+
34
+ def test_close_outputs():
35
+ # The program prints to stdout, closes its output, and then runs forever.
36
+ result = run(
37
+ ["python3", ROOT / "close_outputs.py"],
38
+ timeout_seconds=2,
39
+ max_output_size=1024,
40
+ )
41
+ assert result.exit_code == -1
42
+ assert result.timeout == True
43
+ assert len(result.stderr) == 0
44
+ assert result.stdout == "This is the end\n"
45
+ assert_no_running_evil()
46
+
47
+
48
+ def test_unbounded_output():
49
+ result = run(
50
+ ["python3", ROOT / "unbounded_output.py"],
51
+ timeout_seconds=3,
52
+ max_output_size=1024,
53
+ )
54
+ assert result.exit_code == -1
55
+ assert result.timeout == True
56
+ assert len(result.stderr) == 0
57
+ assert len(result.stdout) == 1024
58
+ assert_no_running_evil()
59
+
60
+
61
+ def test_sleep_forever():
62
+ result = run(
63
+ ["python3", ROOT / "sleep_forever.py"],
64
+ timeout_seconds=2,
65
+ max_output_size=1024,
66
+ )
67
+ assert result.exit_code == -1
68
+ assert result.timeout == True
69
+ assert len(result.stderr) == 0
70
+ assert len(result.stdout) == 0
71
+ assert_no_running_evil()
72
+
73
+
74
+ def test_fork_bomb():
75
+ result = run(
76
+ ["python3", ROOT / "fork_bomb.py"],
77
+ timeout_seconds=2,
78
+ max_output_size=1024,
79
+ )
80
+ assert result.exit_code == -1
81
+ assert result.timeout == True
82
+ assert len(result.stderr) == 0
83
+ assert len(result.stdout) == 0
84
+ # Unfortunately, this sleep seems to be necessary. My theories:
85
+ # 1. os.killpg doesn't block until the whole process group is dead.
86
+ # 2. pgrep can produce stale output
87
+ time.sleep(2)
88
+ assert_no_running_evil()
89
+
90
+
91
+ def test_block_on_inputs():
92
+ # We run the subprocess with /dev/null as input. So, any program that tries
93
+ # to read input will error.
94
+ result = run(
95
+ ["python3", ROOT / "block_on_inputs.py"],
96
+ timeout_seconds=2,
97
+ max_output_size=1024,
98
+ )
99
+ assert result.exit_code == 1
100
+ assert result.timeout == False
101
+ assert len(result.stdout) == 0
102
+ assert "EOF when reading a line" in result.stderr
103
+ assert_no_running_evil()
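These tests exercise run() against the evil programs above; they should be runnable with pytest from the repository root (the exact invocation may depend on how the src package is resolved):

    # Assumes pytest is installed and the working directory is the repository root.
    python3 -m pytest src/safe_subprocess/module_test.py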