Spaces:
Runtime error
Runtime error
import os | |
import json | |
import datetime | |
from email.utils import parseaddr | |
import gradio as gr | |
import pandas as pd | |
import numpy as np | |
from datasets import load_dataset, DatasetDict | |
from apscheduler.schedulers.background import BackgroundScheduler | |
from huggingface_hub import HfApi | |
# InfoStrings | |
from scorer import question_scorer | |
from content import ( | |
format_error, | |
format_warning, | |
format_log, | |
TITLE, | |
INTRODUCTION_TEXT, | |
CITATION_BUTTON_LABEL, | |
CITATION_BUTTON_TEXT, | |
model_hyperlink, | |
) | |
# Access token read from the environment; None when TOKEN is unset.
TOKEN = os.getenv("TOKEN")

OWNER = "stemdataset"

# Repository ids, all living under the OWNER namespace.
INTERNAL_DATA_DATASET = OWNER + "/STEM-Labels-Private"
SUBMISSION_DATASET = OWNER + "/submissions_internal"
CONTACT_DATASET = OWNER + "/contact_info"
RESULTS_DATASET = OWNER + "/results"
LEADERBOARD_PATH = OWNER + "/stem-leaderboard"

api = HfApi()

# Local directory that receives the per-submission score files.
os.makedirs("scored", exist_ok=True)

# Display the results
# Both datasets are always re-downloaded and loaded without verification checks.
eval_results = load_dataset(
    RESULTS_DATASET,
    verification_mode="no_checks",
    download_mode="force_redownload",
    token=TOKEN,
)
contact_infos = load_dataset(
    CONTACT_DATASET,
    verification_mode="no_checks",
    download_mode="force_redownload",
    token=TOKEN,
)
def get_dataframe_from_results(eval_results: DatasetDict, split):
    """Build the leaderboard table for one split of the results dataset.

    The ``model`` value of every row is replaced by the output of
    ``model_hyperlink(url, model)``, the ``url`` column is dropped, and the
    remaining columns are renamed to their display headers.

    Args:
        eval_results: Results dataset indexed by split name.
        split: Name of the split to render.

    Returns:
        A pandas DataFrame sorted by "Average" (descending) whose score
        columns are formatted as strings with one decimal place.
    """
    display_names = {
        "model": "Model Name",
        "model_family": "Model Family",
        "science": "Science",
        "technology": "Technology",
        "engineering": "Engineering",
        "math": "Math",
        "average": "Average",
        "organisation": "Organisation",
        "submit_date": "Submit Date",
    }
    column_order = [
        "Model Name",
        "Model Family",
        "Science",
        "Technology",
        "Engineering",
        "Math",
        "Average",
        "Organisation",
        "Submit Date",
    ]
    numeric_cols = ["Science", "Technology", "Engineering", "Math", "Average"]

    records = eval_results[split].map(
        lambda row: {"model": model_hyperlink(row["url"], row["model"])}
    )
    records = records.remove_columns(["url"])
    for raw_name, shown_name in display_names.items():
        records = records.rename_column(raw_name, shown_name)

    table = pd.DataFrame(records)
    table = table[column_order]

    # Sort while the scores are still numeric; they become strings below.
    table = table.sort_values(by=["Average"], ascending=False)
    table[numeric_cols] = table[numeric_cols].round(decimals=1)
    for score_col in numeric_cols:
        table[score_col] = table[score_col].apply(lambda score: f"{score:.1f}")
    return table
# Leaderboard table shown at start-up, built from the "basic" split.
eval_dataframe_test = get_dataframe_from_results(eval_results, "basic")

# Gold answers
gold_dataset = load_dataset(path=INTERNAL_DATA_DATASET, token=TOKEN)["labels"]
def restart_space():
    """Ask the Hub API client to restart the repo named by ``LEADERBOARD_PATH``."""
    api.restart_space(token=TOKEN, repo_id=LEADERBOARD_PATH)
# One datatype per leaderboard column, in the column order produced by
# get_dataframe_from_results: Model Name, Model Family, Science, Technology,
# Engineering, Math, Average, Organisation, Submit Date.
TYPES = [
    "markdown",  # Model Name (rendered hyperlink)
    "str",  # Model Family
    "number",  # Science
    "number",  # Technology
    "number",  # Engineering
    "number",  # Math
    "number",  # Average
    "str",  # Organisation
    "str",  # Submit Date
]
def calc_test_acc(preds: list[int], labels=None) -> dict[str, float]:
    """Compute per-subject and average accuracy, as percentages.

    Predictions are matched to labels by position. Every label is counted:
    a label with no prediction at its position counts as wrong, and
    predictions beyond the last label are ignored. A subject with no labels
    scores 0.0 instead of raising ``ZeroDivisionError``.

    Args:
        preds: Predicted answer indices, one per label, in label order.
        labels: Iterable of mappings with "subject" and "answer_idx" keys.
            Defaults to the module-level ``gold_dataset``.

    Returns:
        Mapping with keys "science", "technology", "engineer", "math" and
        "average", each a percentage rounded to one decimal place.

    Raises:
        KeyError: If a label's "subject" is not one of the four known keys.
    """
    if labels is None:
        labels = gold_dataset

    # subject -> [number correct, number seen]
    tallies = {
        "science": [0, 0],
        "technology": [0, 0],
        "engineer": [0, 0],
        "math": [0, 0],
    }
    n_preds = len(preds)
    for position, label in enumerate(labels):
        tally = tallies[label["subject"]]
        tally[1] += 1
        if position < n_preds and preds[position] == label["answer_idx"]:
            tally[0] += 1

    accs = {
        subject: (correct / total if total else 0.0)
        for subject, (correct, total) in tallies.items()
    }
    # Unweighted mean over the four subjects.
    accs["average"] = float(np.mean(list(accs.values())))
    return {key: round(value * 100, 1) for key, value in accs.items()}
def _safe_filename(text: str) -> str:
    """Return *text* with every character except alphanumerics and ``._-`` replaced by ``_``."""
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in text)


def add_new_eval(
    val_or_test: str,
    model: str,
    model_family: str,
    url: str,
    path_to_file: gr.File,
    organisation: str,
    mail: str,
):
    """Validate, score and record a new leaderboard submission.

    Steps, in order: validate the form fields, reject a model/organisation
    pair that is already in ``eval_results["basic"]``, upload the raw file
    to the submission dataset, parse one integer prediction per line, score
    the predictions with ``calc_test_acc``, upload the scores, and append
    the entry to the results and contact datasets (both pushed to the Hub).

    Args:
        val_or_test: Split label, used only in the uploaded file names.
        model: Model name.
        model_family: Model family.
        url: Link to model information.
        path_to_file: Uploaded predictions file, or None if none was attached.
        organisation: Submitting organisation.
        mail: Contact e-mail address, stored in the contact dataset only.

    Returns:
        The message produced by ``format_warning``, ``format_error`` or
        ``format_log`` describing the outcome.
    """
    curr_timestamp = datetime.datetime.today()

    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email adress.")
    if model == "":
        return format_warning("Please provide a model name.")
    if model_family == "":
        return format_warning("Please provide a model family.")

    # NOTE(review): the upload is presumably either an object exposing
    # ``.name`` or a plain path string, depending on the Gradio version and
    # the File component's ``type`` setting - confirm. Both are accepted here.
    file_path = getattr(path_to_file, "name", path_to_file)

    # Log the path rather than the upload object, which json cannot serialise.
    print(
        json.dumps(
            {
                "val_or_test": val_or_test,
                "model": model,
                "model_family": model_family,
                "url": url,
                "path_to_file": file_path,
                "organisation": organisation,
                "mail": mail,
            },
            indent=2,
        )
    )
    print("Adding new eval")

    # Reject only an exact (model, organisation) pair. Testing the two columns
    # independently would also block a new pair made of a known model name
    # and an unrelated known organisation.
    existing = eval_results["basic"]
    submitted_pairs = {
        (known_model.lower(), known_org.lower())
        for known_model, known_org in zip(
            existing["model"], existing["organisation"]
        )
    }
    if (model.lower(), organisation.lower()) in submitted_pairs:
        return format_warning("This model has been already submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Save submitted file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=file_path,
        path_in_repo=f"{organisation}/{model}/{val_or_test}_raw_{curr_timestamp}.txt",
        repo_type="dataset",
        token=TOKEN,
    )

    # Parse predictions: one integer per line.
    preds = []
    with open(file_path, "r") as f:
        for ix, line in enumerate(f):
            try:
                pred_idx = int(line.strip())
            except ValueError:
                return format_error(
                    f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file."
                )
            preds.append(pred_idx)

    # Predictions are matched to gold labels by position, so the counts must
    # agree for the score to be meaningful.
    expected_count = len(gold_dataset)
    if len(preds) != expected_count:
        return format_error(
            f"Expected {expected_count} predictions but found {len(preds)}. Please fix it and resubmit your file."
        )

    # Compute score
    stem_scores = calc_test_acc(preds)

    # Names may contain "/" (e.g. "org/model"), which would point the local
    # path into a directory that does not exist, so flatten them first.
    scored_path = (
        f"scored/{_safe_filename(organisation)}_{_safe_filename(model)}.json"
    )
    with open(scored_path, "w") as scored_file:
        scored_file.write(json.dumps(stem_scores, indent=2))

    # Save scored file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=scored_path,
        path_in_repo=f"{organisation}/{model}/{val_or_test}_scored_{curr_timestamp}.json",
        repo_type="dataset",
        token=TOKEN,
    )

    # Date and time are stored on separate lines.
    submit_date = "\n".join(str(curr_timestamp).split(" "))

    # Actual submission
    eval_entry = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "submit_date": submit_date,
        "science": stem_scores["science"],
        "technology": stem_scores["technology"],
        # The scorer's key is "engineer"; the results column is "engineering".
        "engineering": stem_scores["engineer"],
        "math": stem_scores["math"],
        "average": stem_scores["average"],
    }
    eval_results["basic"] = eval_results["basic"].add_item(eval_entry)
    print(eval_results)
    eval_results.push_to_hub(RESULTS_DATASET, token=TOKEN)

    contact_info = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "mail": mail,
        "submit_date": submit_date,
    }
    contact_infos["basic"] = contact_infos["basic"].add_item(contact_info)
    contact_infos.push_to_hub(CONTACT_DATASET, token=TOKEN)

    return format_log(
        f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed"
    )
def refresh():
    """Re-download the results dataset and rebuild the leaderboard table.

    Returns:
        The DataFrame produced by ``get_dataframe_from_results`` for the
        "basic" split of the freshly loaded results.
    """
    fresh_results = load_dataset(
        RESULTS_DATASET,
        verification_mode="no_checks",
        download_mode="force_redownload",
        token=TOKEN,
    )
    return get_dataframe_from_results(eval_results=fresh_results, split="basic")
def upload_file(files):
    """Return the ``name`` attribute of each item in *files*, in order."""
    collected = []
    for item in files:
        collected.append(item.name)
    return collected
# NOTE(review): the nesting below was reconstructed because the original
# indentation is lost; components are created in the original order.
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # Read-only leaderboard table.
    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            wrap=True,
        )

    # Clicking the button replaces the table with the output of refresh().
    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        fn=refresh,
        inputs=[],
        outputs=[leaderboard_table_test],
    )

    # Submission form.
    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["test"], value="test", label="Split")
                model_name_textbox = gr.Textbox(label="Model name")
                model_family_textbox = gr.Textbox(label="Model family")
                url_textbox = gr.Textbox(label="Url to model information")
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(
                    label="Contact email (will be stored privately, & used if there is an issue with your submission)"
                )
                file_output = gr.File()

        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()

        # Listed in the same order as the parameters of add_new_eval.
        submission_inputs = [
            level_of_test,
            model_name_textbox,
            model_family_textbox,
            url_textbox,
            file_output,
            organisation,
            mail,
        ]
        submit_button.click(
            fn=add_new_eval,
            inputs=submission_inputs,
            outputs=submission_result,
        )

# Call restart_space every 3600 seconds from a background scheduler.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=3600)
scheduler.start()

demo.launch(debug=True, server_name="0.0.0.0")