Spaces:
Sleeping
Sleeping
File size: 7,539 Bytes
bcf4698 aa2789b 624eca2 bcf4698 624eca2 f8935ce 624eca2 bcf4698 aa2789b bcf4698 09351b9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 |
from typing import Literal
from functools import partial
from content import *
import gradio as gr
import numpy as np
import pandas as pd
import pandas as pd
# Raw evaluation scores. The CSV must provide the following columns:
#   model      - name of the model
#   language   - language of the (language, dataset) pair being scored
#   dataset    - dataset used to evaluate the model
#   score      - score of the model on the (language, dataset) pair
#   model_type - type of the model (e.g. "Chat Model", "Base Model")
df = pd.read_csv("data/raw_scores.csv")

# Distinct values of each filterable column; they are used both as the
# available choices and as the default selection of the UI checkbox groups.
choices_language, choices_dataset, choices_model_type = (
    list(df[column].unique()) for column in ("language", "dataset", "model_type")
)
# Utility functions for data processing


def _min_max_normalize(scores):
    """Rescale *scores* linearly so the minimum maps to 0 and the maximum to 1.

    When all scores are identical the range is zero and the plain formula
    would evaluate 0 / 0, turning every entry into NaN (this happens, for
    instance, when a single model remains in a group). In that case every
    entry is mapped to 0 instead.
    """
    lowest = np.min(scores)
    spread = np.max(scores) - lowest
    if spread == 0:
        return scores - lowest
    return (scores - lowest) / spread


# Aggregations that collapse several scores into one value.
# Keys are the labels offered in the "Reduce Function" dropdown.
reduce_functions = {
    "Mean": lambda x: np.mean(x),
    "Median": lambda x: np.median(x),
    "Max": lambda x: np.max(x),
    "Min": lambda x: np.min(x),
}

# Transformations applied to the scores of one (language, dataset) group
# before any aggregation. Keys are the labels of the "Map Function" dropdown.
map_functions = {
    # Leave the scores untouched.
    "Raw": lambda x: x,
    # Dense ranking where the highest score receives rank 1.
    "Rank": partial(pd.Series.rank, ascending=False, method="dense"),
    # Min-max scaling to [0, 1], safe for zero-range groups.
    "Normalize": _min_max_normalize,
}

# Sort direction of the final table for each map function: ranks are better
# when smaller (ascending), raw and normalized scores when larger.
score_ascending = {
    "Raw": False,
    "Rank": True,
    "Normalize": False,
}
def prepare_dataframe(
    df: pd.DataFrame,
    filters: dict[str, list[str]],
    group_by: Literal["language", "dataset"],
    map_function: str,
    reduce_function: str,
) -> pd.DataFrame:
    """Filter, transform and aggregate the raw scores into the displayed table.

    Args:
        df: Raw scores with at least the columns ``model``, ``language``,
            ``dataset`` and ``score``; any further column is treated as
            per-model metadata and carried through to the result.
        filters: Maps a column name to the values to keep in that column.
            Must contain the keys ``"language"`` and ``"dataset"``.
        group_by: Column whose selected values become the score columns of
            the result; scores are reduced over the other of the two.
        map_function: Key of ``map_functions``; applied to the scores of each
            (language, dataset) group before reducing.
        reduce_function: Key of ``reduce_functions``; used both for the
            per-``group_by`` scores and for the "Overall Score" column.

    Returns:
        A pandas ``Styler`` wrapping the sorted table (model, metadata
        columns, "Overall Score", one column per selected ``group_by`` value)
        with the best value of every score column highlighted. When no row
        survives filtering, a plain empty ``DataFrame`` holding only the
        model/metadata columns is returned instead.

    Raises:
        KeyError: If ``filters`` lacks ``"language"`` or ``"dataset"``, names
            a column missing from ``df``, or if ``map_function`` /
            ``reduce_function`` is not a known key.
    """
    # Filters contains a value subset for each column
    language = filters["language"]
    dataset = filters["dataset"]

    # Everything that is neither a grouping key nor the score is metadata
    # (including `model`). Iterate over df.columns instead of using a set
    # difference so the column order is the same on every run.
    score_keys = {"language", "dataset", "score"}
    other_columns = [column for column in df.columns if column not in score_keys]
    group_by_columns = list(filters[group_by])

    # Step 1: Filter the dataframe based on the selected values
    for column, allowed_values in filters.items():
        df = df[df[column].isin(allowed_values)]

    # If dataframe is empty, return an empty dataframe
    if df.empty:
        gr.Warning(
            "No scores remain after the filter application. Please verify the checkboxes."
        )
        return pd.DataFrame(columns=other_columns)

    # Sanity check: a model is only comparable if it has a score for every
    # selected (language, dataset) pair; models lacking some are hidden.
    expected_score_count = len(language) * len(dataset)
    score_count = (
        df.drop_duplicates(subset=["model", "language", "dataset"])
        .groupby(["model"])["score"]
        .count()
    )
    invalid_models = score_count[score_count < expected_score_count].index.tolist()
    df = df[~df["model"].isin(invalid_models)]

    # Send a warning message if there are any invalid models
    for model in invalid_models:
        gr.Warning(
            f"<strong>{model}</strong> is lacking some scores thus hidden. Please report to the maintainers."
        )

    # Hiding the invalid models may have removed every remaining row. The
    # pivot below would then produce none of the `group_by` columns and the
    # final column selection would fail, so stop here with an empty table.
    if df.empty:
        return pd.DataFrame(columns=other_columns)

    # Step 2: Process Scores
    # Step 2.0: Map the scores along each (language, dataset) pair.
    # `assign` builds a new frame rather than writing into the filtered
    # slice, which avoids SettingWithCopyWarning and never touches the
    # caller's dataframe.
    df = df.assign(
        score=df.groupby(["language", "dataset"])["score"].transform(
            map_functions[map_function]
        )
    )

    reducer = reduce_functions[reduce_function]

    # Step 2.1: Reduce the scores along the column other than `group_by`
    df = (
        df.groupby(other_columns + [group_by])
        .agg({"score": reducer})
        .reset_index()
    )

    # Step 2.2: Reduce the scores along `group_by` to get the overall score of each model
    reduced_col = df.groupby(other_columns).agg({"score": reducer})["score"]

    # Step 2.3: Pivot the dataframe, then concat the overall score.
    # `values=["score"]` yields two-level column labels; dropping the outer
    # level leaves just the `group_by` values as column names.
    df = df.pivot(index=other_columns, columns=group_by, values=["score"]).droplevel(
        0, 1
    )
    df["Overall Score"] = reduced_col

    # Step 3: Styling for display
    # - Sort the dataframe by the reduced score
    # - Sort the columns for better readability
    # - Highlight the best value in each score column
    # - Format the score to 2 decimal places if it is a float
    metadata_columns = [column for column in other_columns if column != "model"]
    score_columns = ["Overall Score"] + group_by_columns
    ascending = score_ascending[map_function]

    styled = (
        df.reset_index()[["model"] + metadata_columns + score_columns]
        .sort_values(by="Overall Score", ascending=ascending)
        .style.format(precision=2)
    )

    # When sorting ascending (ranks) the smallest value is the best one,
    # otherwise the largest.
    highlight = styled.highlight_min if ascending else styled.highlight_max
    return highlight(axis=0, color="#18864B", subset=score_columns)
# Build the Gradio app: filter/aggregation controls, the score table and a
# citation box, plus one listener that recomputes the table on any change.
# NOTE(review): indentation was lost in this copy of the file, so which of the
# components below are nested inside the Row/Column cannot be determined from
# here — confirm the layout against the original source before reformatting.
with gr.Blocks(theme=gr.themes.Base()) as demo:
# UI definition
with gr.Row():
with gr.Column():
gr.Markdown(
MARKDOWN_HEADER
)
# Filter controls: every available value is selected by default.
checkbox_language = gr.CheckboxGroup(
choices=choices_language,
value=choices_language,
label="Language(s)",
interactive=True,
)
checkbox_dataset = gr.CheckboxGroup(
choices=choices_dataset,
value=choices_dataset,
label="Dataset(s)",
interactive=True,
)
checkbox_model_type = gr.CheckboxGroup(
choices=choices_model_type,
value=choices_model_type,
label="Model Type(s)",
interactive=True,
)
# Score transformation; choices are the keys of `map_functions`.
dropdown_map_function = gr.Dropdown(
choices=map_functions.keys(),
value="Raw",
label="Map Function",
interactive=True,
info=MARKDOWN_MAP_FUNCTION
)
# Score aggregation; choices are the keys of `reduce_functions`.
dropdown_reduce_function = gr.Dropdown(
choices=reduce_functions.keys(),
value="Mean",
label="Reduce Function",
interactive=True,
info=MARKDOWN_REDUCE_FUNCTION
)
# Column passed to prepare_dataframe as `group_by`.
ratio_group_by = gr.Radio(
choices=["language", "dataset"],
value="language",
label="Group by",
interactive=True,
)
# Initial table, matching the default control values above: all languages
# and datasets, no model_type filter, raw scores, mean, grouped by language.
dataframe = gr.DataFrame(
prepare_dataframe(
df=df,
filters={
"language": choices_language,
"dataset": choices_dataset,
},
group_by="language",
map_function="Raw",
reduce_function="Mean",
),
interactive=False,
)
gr.Code(
language="markdown",
label="Citation",
value=CITATION,
)
# Event listeners
# A change of any control rebuilds the table. The lambda parameters are listed
# in the same order as `inputs` below (model_type, language, dataset,
# group_by, map_function, reduce_function); `triggers` is ordered differently.
gr.on(
triggers=[
checkbox_model_type.change,
checkbox_language.change,
checkbox_dataset.change,
ratio_group_by.change,
dropdown_reduce_function.change,
dropdown_map_function.change,
],
fn=lambda model_type, language, dataset, group_by, map_function, reduce_function: prepare_dataframe(
df=df,
filters={
"language": language,
"dataset": dataset,
"model_type": model_type,
},
group_by=group_by,
map_function=map_function,
reduce_function=reduce_function,
),
inputs=[
checkbox_model_type,
checkbox_language,
checkbox_dataset,
ratio_group_by,
dropdown_map_function,
dropdown_reduce_function,
],
outputs=[dataframe],
)
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
demo.launch()
|