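"""Helpers for building 🏟 Long Code Arena leaderboard tables from the results dataset."""
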
import logging
import os

import pandas as pd  # type: ignore[import]
from datasets import get_dataset_config_names, load_dataset  # type: ignore[import]

from .formatting import model_hyperlink
from .leaderboard_formatting import (
    COLUMNS_PRETTY,
    METRICS_PER_TASK,
    SORT_COLUMN_PER_TASK,
    get_columns_per_task,
)
from .tasks_content import TASKS_PRETTY_REVERSE

# Discover which task configurations exist in the results dataset;
# if it cannot be reached, fall back to an empty list so stub tables are shown instead.
try:
    AVAILABLE_TASKS = get_dataset_config_names(os.environ["DATASET_ID"])
except FileNotFoundError:
    AVAILABLE_TASKS = []


def _get_results_stub() -> pd.DataFrame:
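    """Return a placeholder leaderboard with "X" metric values, used when real results are unavailable."""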
    stub_df = pd.DataFrame(
        [
            {
                "Model Name": "GPT-4",
                "Availability": "Proprietary",
                "Context Size": "16k",
                "BLEU": "X",
                "ROUGE": "X",
                "ChrF": "X",
                "BERTScore": "X",
                "BERTScore (Normalized)": "X",
                "Submitted By": "🏟 Long Code Arena Team",
            },
            {
                "Model Name": "CodeLlama-7b (instruct)",
                "Availability": "Llama 2 license",
                "Context Size": "16k",
                "BLEU": "X",
                "ROUGE": "X",
                "ChrF": "X",
                "BERTScore": "X",
                "BERTScore (Normalized)": "X",
                "Submitted By": "🏟 Long Code Arena Team",
            },
        ]
    )
    return stub_df


def _get_results_dataset(task_id: str) -> pd.DataFrame:
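    """Load the results for ``task_id`` from the results dataset and format them for display."""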
    results_df = load_dataset(os.environ["DATASET_ID"], task_id, split="test").to_pandas()
    results_df = results_df.rename(columns=COLUMNS_PRETTY, errors="ignore")
    results_df["Context Size"] = results_df["Context Size"].map(lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x)

    results_df = results_df.sort_values(by=SORT_COLUMN_PER_TASK[task_id], ascending=False)

    # BERTScore columns keep more decimal places than the other metrics.
    for metric_column in METRICS_PER_TASK[task_id]:
        if "BERTScore" in metric_column:
            results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.5f}")
        else:
            results_df[metric_column] = results_df[metric_column].map(lambda x: f"{x:.2f}")

    results_df["Model Name"] = [
        model_hyperlink(link=link, model_name=model_name) if link else model_name
        for link, model_name in zip(results_df["model_url"], results_df["Model Name"])
    ]
    results_df = results_df[get_columns_per_task(task_id)]
    return results_df


def get_results_for_task(task_pretty: str) -> pd.DataFrame:
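    """Return the leaderboard table for a task given its pretty (human-readable) name, falling back to a stub if its results are unavailable."""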
    task_id = TASKS_PRETTY_REVERSE[task_pretty]
    if task_id in AVAILABLE_TASKS:
        logging.info(f"Retrieving results for {task_pretty}...")
        return _get_results_dataset(task_id)
    logging.info(f"Generating leaderboard stub for {task_pretty}...")
    return _get_results_stub()