# Page header captured with the file (not part of the program):
# ToluClassics's picture | mockup eval | c25e6bb | raw | history blame | 5.13 kB
import os
import json
import glob
from collections import defaultdict
import pandas as pd
import gradio as gr
from content import *
from css import *
import glob
# Benchmark identifiers. They are used as the task part of the result keys
# read by collect_results() and as keys of the per-model score dicts.
AFRIMMLU_DIRECT = "afrimmlu_direct"
AFRIMMLU_TRANSLATE = "afrimmlu_translate"
AFRIXNLI_DIRECT = "afrixnli_direct"
AFRIXNLI_TRANSLATE = "afrixnli_translate"

BENCHMARKS = [
    AFRIMMLU_DIRECT,
    AFRIMMLU_TRANSLATE,
    AFRIXNLI_DIRECT,
    AFRIXNLI_TRANSLATE,
]

# METRICS[i] is the key looked up in the results of BENCHMARKS[i].
# NOTE(review): "mc2" for afrixnli_translate differs from the other three
# entries — confirm this is the metric actually present in the eval files.
METRICS = ["acc_norm"] * 3 + ["mc2"]

# Language code -> display name shown in the leaderboard.
LANG_NAME = {
    "amh": "Amharic",
    "eng": "English",
    "ewe": "Ewe",
    "fra": "French",
    "hau": "Hausa",
    "ibo": "Igbo",
    "kin": "Kinyarwanda",
    "lin": "Lingala",
    "lug": "Luganda",
    "orm": "Oromo",
    "sna": "Shona",
    "sot": "Sotho",
    "swa": "Swahili",
    "twi": "Twi",
    "wol": "Wolof",
    "xho": "Xhosa",
    "yor": "Yoruba",
    "zul": "Zulu",
}

# Supported language codes, in the same order as the mapping above.
LANGS = list(LANG_NAME)
def collect_results():
    """Gather benchmark scores from every ``evals/*/*.json`` file.

    A file is used only if its top-level object has a ``results`` entry and a
    ``config`` entry whose ``model_args`` string contains exactly one
    ``pretrained=...`` item; any other file is skipped.  The model name is
    the last ``/``-separated component of that ``pretrained`` value.

    Each key of ``results`` is read as ``<task>_<lang>``.  Keys whose task
    part is not one of ``BENCHMARKS`` (or that have no language part) are
    skipped.

    Returns:
        tuple: ``(performance_dict, pretrained_models)`` where
        ``performance_dict`` maps ``(model_name, lang)`` to
        ``{task: score}`` (the metric value times 100, rounded to one
        decimal) and ``pretrained_models`` is the set of model names seen.

    Raises:
        KeyError: if a recognised task entry lacks its expected metric.
    """
    performance_dict = defaultdict(dict)
    pretrained_models = set()
    for file in glob.glob('evals/*/*.json'):
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if 'results' not in data or 'config' not in data:
            continue
        results = data['results']
        config = data['config']
        if 'model_args' not in config:
            continue

        model_args = config['model_args'].split(',')
        pretrained = [x for x in model_args if x.startswith('pretrained=')]
        if len(pretrained) != 1:
            continue
        # Split once so a value that itself contains '=' is kept whole, then
        # keep only the final path component as the display name.
        pretrained = pretrained[0].split('=', 1)[1].split('/')[-1]
        pretrained_models.add(pretrained)

        for lang_task, perfs in results.items():
            # Benchmark ids contain underscores themselves (for example
            # "afrimmlu_direct"), so only the LAST underscore separates the
            # task from the language code.
            task, sep, lang = lang_task.rpartition('_')
            if not sep or not lang or task not in BENCHMARKS:
                continue
            metric = METRICS[BENCHMARKS.index(task)]
            performance_dict[(pretrained, lang)][task] = round(perfs[metric] * 100, 1)
    return performance_dict, pretrained_models
def get_leaderboard_df(performance_dict, pretrained_models):
    """Turn the collected scores into the leaderboard DataFrame.

    Args:
        performance_dict: mapping of ``(model_name, lang_code)`` to a
            ``{task: score}`` dict.
        pretrained_models: accepted for call compatibility; not used here.

    Returns:
        A DataFrame with the columns in ``COLS``, sorted by language and then
        average, both descending.  An entry is left out when any of the four
        benchmark scores is missing or zero.
    """
    task_order = (
        AFRIMMLU_DIRECT,
        AFRIMMLU_TRANSLATE,
        AFRIXNLI_DIRECT,
        AFRIXNLI_TRANSLATE,
    )
    records = []
    for (model_name, lang_code), task_scores in performance_dict.items():
        language = LANG_NAME[lang_code]
        scores = [task_scores.get(task, 0.0) for task in task_order]

        # A missing score defaults to 0.0, so a zero product marks an
        # incomplete entry.
        product = 1
        for score in scores:
            product *= score
        if product == 0:
            continue

        mean_score = round(sum(scores) / 4, 1)
        search_text = ' '.join([model_name, language])
        records.append([model_name, language, lang_code, mean_score, *scores, search_text])

    table = pd.DataFrame.from_records(records, columns=COLS)
    table = table.sort_values(by=[LANG_COL, AVERAGE_COL], ascending=False)
    return table[COLS]
def search_table(df, query):
    """Return the rows of ``df`` whose notes column matches ``query``.

    Matching is case-insensitive.  ``query`` is interpreted as a regular
    expression; if it is not a valid pattern (for example a lone ``(`` typed
    into the search box) it is matched as a literal substring instead of
    raising.

    Args:
        df: DataFrame containing the ``NOTES_COL`` column.
        query: text typed by the user.

    Returns:
        The filtered DataFrame.
    """
    import re  # local import: only needed to recognise invalid patterns

    notes = df[NOTES_COL]
    try:
        mask = notes.str.contains(query, case=False)
    except re.error:
        mask = notes.str.contains(query, case=False, regex=False)
    return df[mask]
# Column headers of the leaderboard table.
MODEL_COL = "Model"
LANG_COL = "Language"
CODE_COL = "Code"
AVERAGE_COL = "Average"
AFRIMMLU_DIRECT_COL = "AfriMMLU Direct (0-Shot)"
AFRIMMLU_TRANSLATE_COL = "AfriMMLU Translate (0-Shot)"
AFRIXNLI_DIRECT_COL = "AfriXNLI Direct (0-Shot)"
AFRIXNLI_TRANSLATE_COL = "AfriXNLI Translate (0-Shot)"
NOTES_COL = "Notes"  # For search only

# Column order of the table; each row built for it must follow this order.
COLS = [
    MODEL_COL,
    LANG_COL,
    CODE_COL,
    AVERAGE_COL,
    AFRIMMLU_DIRECT_COL,
    AFRIMMLU_TRANSLATE_COL,
    AFRIXNLI_DIRECT_COL,
    AFRIXNLI_TRANSLATE_COL,
    NOTES_COL,
]

# Datatype passed to the Dataframe component for each entry of COLS.
TYPES = [
    "str",     # Model
    "str",     # Language
    "str",     # Code
    "number",  # Average
    "number",  # AfriMMLU Direct
    "number",  # AfriMMLU Translate
    "number",  # AfriXNLI Direct
    "number",  # AfriXNLI Translate
    "str",     # Notes
]
# Load the evaluation results once and build the table shown in the UI.
args = collect_results()
original_df = get_leaderboard_df(*args)

# Both tables start from the same data, headers and datatypes.
table_options = {
    "value": original_df,
    "headers": COLS,
    "datatype": TYPES,
}

with gr.Blocks(css=CUSTOM_CSS) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    gr.Markdown(HOW_TO, elem_classes="markdown-text")

    with gr.Group():
        search_bar = gr.Textbox(
            placeholder="Search models and languages...",
            show_label=False,
            elem_id="search-bar",
        )

        # Visible table; it receives the output of every search.
        leaderboard_table = gr.components.Dataframe(
            elem_id="leaderboard-table",
            **table_options,
        )

        # Hidden copy used as the search input, so each search (including
        # after the user presses backspace) filters the full table rather
        # than the already-filtered visible one.
        hidden_leaderboard_table_for_search = gr.components.Dataframe(
            visible=False,
            **table_options,
        )

        search_bar.change(
            search_table,
            [hidden_leaderboard_table_for_search, search_bar],
            leaderboard_table,
        )

    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

demo.launch()