import copy as cp
import json
import sys
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import DEFAULT_TASK
def listinstr(lst, s):
    """Return True if any element of `lst` occurs as a substring of `s`."""
    assert isinstance(lst, list)
    for item in lst:
        if item in s:
            return True
    return False
def load_results():
    """Load the leaderboard results from the local results.json file."""
    # data = json.loads(urlopen(URL).read())
    with open('results.json', 'r') as file:
        data = json.load(file)
    return data
def nth_large(val, vals):
    """Return the 1-based rank of `val` among `vals` (1 = largest)."""
    return sum([1 for v in vals if v > val]) + 1
def format_timestamp(timestamp):
    """Convert a YYYYMMDD timestamp string into YYYY.MM.DD."""
    date = timestamp[:-4] + '.' + timestamp[-4:-2] + '.' + timestamp[-2:]
    return date
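
# Usage sketch for the helpers above (illustrative inputs, not taken from results.json):
#   listinstr(['foo', 'bar'], 'foobar')   # True  ('foo' is a substring)
#   nth_large(5, [1, 9, 5, 7])            # 3     (two values are strictly larger)
#   format_timestamp('20240315')          # '2024.03.15'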
# def BUILD_L1_DF(results, fields):
#     check_box = {}
#     check_box['essential'] = ['Model']
#     # revise here to set default dataset
#     check_box['required'] = DEFAULT_TASK
#     check_box['all'] = DEFAULT_TASK
#     type_map = defaultdict(lambda: 'number')
#     check_box['type_map'] = type_map
#     df = generate_table(results, fields)
#     return df, check_box
def BUILD_L2_DF(results, benchmark):
    """Build the per-benchmark leaderboard DataFrame and its check-box config."""
    results = results[benchmark]
    model_list = []
    all_fields = list(results.keys())
    for task in results:
        model_list += list(results[task].keys())
    model_list = list(set(model_list))

    res = defaultdict(list)
    # Benchmarks differ in how rows are keyed: plain model names, agent names,
    # or (model, method/agent) pairs.
    if benchmark not in ["RedCode", "NYU CTF Bench", "PrimeVul", "SWE-bench-verified"]:
        res['Model'] = model_list
    elif benchmark == "SWE-bench-verified":
        res['Agent'] = model_list
    elif benchmark == "PrimeVul":
        used = []
        for task in all_fields:
            for model in results[task]:
                for extra in results[task][model]:
                    if [model, extra] not in used:
                        res['Model'].append(model)
                        res['Method'].append(extra)
                        used.append([model, extra])
    else:
        used = []
        for task in all_fields:
            for model in results[task]:
                for extra in results[task][model]:
                    if [model, extra] not in used:
                        res['Model'].append(model)
                        res['Agent'].append(extra)
                        used.append([model, extra])

    # Fill one score column per task, aligned with the row keys built above.
    if benchmark not in ["RedCode", "NYU CTF Bench", 'PrimeVul']:
        for task in all_fields:
            for model in model_list:
                if model in results[task]:
                    res[task].append(results[task][model])
                else:
                    res[task].append(None)
    else:
        for task in all_fields:
            for model, extra in used:
                if model in results[task] and extra in results[task][model]:
                    res[task].append(results[task][model][extra])
                else:
                    res[task].append(None)

    df = pd.DataFrame(res)
    # Rank rows by the first task column: valid scores first, in descending order.
    rank_criteria = all_fields[0]
    valid, missing = df[~pd.isna(df[rank_criteria])], df[pd.isna(df[rank_criteria])]
    valid = valid.sort_values(rank_criteria)
    valid = valid.iloc[::-1]
    if len(all_fields):
        missing = missing.iloc[::-1]
    df = pd.concat([valid, missing])

    required_fields = all_fields
    check_box = {}
    if benchmark == "SWE-bench-verified":
        check_box['essential'] = ['Agent']
    elif benchmark == 'PrimeVul':
        check_box['essential'] = ['Model', 'Method']
    elif benchmark in ["RedCode", "NYU CTF Bench"]:
        check_box['essential'] = ['Model', 'Agent']
    else:
        check_box['essential'] = ['Model']
    check_box['required'] = required_fields
    check_box['all'] = all_fields
    type_map = defaultdict(lambda: 'number')
    check_box['type_map'] = type_map
    return df, check_box
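
# Usage sketch for BUILD_L2_DF. The benchmark name and scores below are
# hypothetical; the real data comes from results.json with the structure
# {benchmark: {task: {model: score}}} (or {task: {model: {agent/method: score}}}
# for the paired benchmarks):
#
#   demo = {'MyBench': {'Accuracy': {'GPT-4': 91.0, 'Llama-3': 84.5}}}
#   df, check_box = BUILD_L2_DF(demo, 'MyBench')
#   # df has columns ['Model', 'Accuracy'], sorted by Accuracy descending,
#   # and check_box['essential'] == ['Model'].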
def generate_table(results, fields):
    """Aggregate per-benchmark scores into one row per model for the summary table."""
    model_list = []
    task_list = fields
    benchmark_list = []
    for task in results:
        for benchmark in results[task]:
            if benchmark != 'category':
                benchmark_list += [benchmark]
                model_list += list(results[task][benchmark].keys())
    model_list = list(set(model_list))

    res = defaultdict(list)
    res['Model'] = model_list
    average_score = {}
    cnt = {}
    for task in task_list:
        task_score = []
        for model in model_list:
            score = []
            for benchmark in results[task]:
                if benchmark != 'category':
                    if model not in results[task][benchmark]:
                        score.append(None)
                    elif not isinstance(results[task][benchmark][model], (int, float)):
                        # Dict-valued entries hold autonomous/assisted sub-scores; average them.
                        score.append((results[task][benchmark][model]["autonomous"] + results[task][benchmark][model]["assisted"]) / 2)
                    else:
                        score.append(results[task][benchmark][model])
            # Average over the benchmarks the model was evaluated on; None if it has none.
            if not any(item is not None for item in score):
                score = None
            else:
                score = np.mean([s for s in score if s is not None])
            # Maintain a running per-model average across tasks, skipping missing
            # scores so the running average stays numeric.
            if score is not None:
                if model not in average_score:
                    average_score[model] = score
                    cnt[model] = 1
                else:
                    average_score[model] = ((average_score[model] * cnt[model]) + score) / (cnt[model] + 1)
                    cnt[model] += 1
            task_score.append(score)
        res[task] = task_score
    # res['Avg Score'] = [average_score[model] for model in model_list]
    # res['Avg Rank'] = [sorted(res['Avg Score'], reverse=True).index(score) + 1 for score in res['Avg Score']]
    df = pd.DataFrame(res)
    # valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
    # valid = valid.sort_values('Avg Score')
    # valid = valid.iloc[::-1]
    # if len(fields):
    #     missing = missing.sort_values('MMBench_V11' if 'MMBench_V11' in fields else fields[0])
    #     missing = missing.iloc[::-1]
    # df = pd.concat([valid, missing])
    return df
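

if __name__ == '__main__':
    # Minimal wiring sketch (illustrative only, not the Space's actual UI):
    # build the summary table from results.json and render it with Gradio.
    # DEFAULT_TASK is assumed to list the task names expected by generate_table.
    results = load_results()
    table = generate_table(results, DEFAULT_TASK)
    with gr.Blocks() as demo:
        gr.Dataframe(value=table)
    demo.launch()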