|
import logging |
|
import os |
|
|
|
import pandas as pd |
|
from datasets import get_dataset_config_names, load_dataset |
|
|
|
from .leaderboard_formatting import COLUMNS_PRETTY, get_columns_per_task |
|
|
|
# Task configs present in the results dataset; used to decide whether real
# results exist for a task or only a stub should be shown.
# NOTE(review): runs at import time — presumably performs a Hugging Face Hub
# request, and raises KeyError if DATASET_ID is unset; confirm acceptable.
AVAILABLE_TASKS = get_dataset_config_names(os.environ["DATASET_ID"])
|
|
|
|
|
def _get_results_stub() -> pd.DataFrame: |
|
stub_df = pd.DataFrame( |
|
[ |
|
{ |
|
"Model Name": "GPT-4", |
|
"Availability": "Proprietary", |
|
"Context Size": "16k", |
|
"BLEU": "X", |
|
"ROUGE": "X", |
|
"ChrF": "X", |
|
"BERTScore": "X", |
|
"BERTScore (Normalized)": "X", |
|
"Submitted By": "π Long Code Arena Team", |
|
}, |
|
{ |
|
"Model Name": "CodeLlama-7b (instruct)", |
|
"Availability": "Llama 2 license", |
|
"Context Size": "16k", |
|
"BLEU": "X", |
|
"ROUGE": "X", |
|
"ChrF": "X", |
|
"BERTScore": "X", |
|
"BERTScore (Normalized)": "X", |
|
"Submitted By": "π Long Code Arena Team", |
|
}, |
|
] |
|
) |
|
return stub_df |
|
|
|
|
|
def _get_results_dataset(task_id: str) -> pd.DataFrame:
    """Load the published results for *task_id* as a display-ready DataFrame.

    Fetches the task's split from the dataset pointed to by the DATASET_ID
    environment variable, renames raw columns to their pretty names, formats
    the context size, and keeps only the columns relevant for the task.
    """
    dataset = load_dataset(os.environ["DATASET_ID"], task_id, split="test")
    results_df = dataset.to_pandas()
    results_df = results_df.rename(columns=COLUMNS_PRETTY, errors="ignore")

    def _pretty_context_size(raw):
        # e.g. 16000 -> "16k"; values below 1000 pass through unchanged.
        return f"{int(raw) // 1000}k" if int(raw) >= 1000 else raw

    results_df["Context Size"] = results_df["Context Size"].map(_pretty_context_size)

    # Restrict to the task-specific column set, in its canonical order.
    return results_df[get_columns_per_task(task_id)]
|
|
|
|
|
def get_results_for_task(task_id: str) -> pd.DataFrame:
    """Return the leaderboard table for *task_id*.

    Tasks with a config in the results dataset get real results; any other
    task gets the placeholder stub table.
    """
    if task_id not in AVAILABLE_TASKS:
        logging.info(f"Generating leaderboard stub for {task_id}...")
        return _get_results_stub()

    logging.info(f"Retrieving results for {task_id}...")
    return _get_results_dataset(task_id)
|
|