# coding=utf-8
# Copyright 2024 The Mexa Authors.
# Lint as: python3
# This space is built based on uonlp/open_multilingual_llm_leaderboard.
# Mexa Space

import glob
import json
import os
import re
from collections import defaultdict

import gradio as gr
import pandas as pd


# cache_data
def language_names(json_path):
    """Load the language lookup table from a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content. `get_name` indexes it by the language code
        taken from a label, so it is expected to be a dict.
    """
    # Explicit encoding: language names may contain non-ASCII characters and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(json_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


label2name = language_names("assets/language_names.json")


def get_name(label):
    """Get the name of language from label.

    Args:
        label: Language label; only the part before the first underscore is
            used as the lookup key.

    Returns:
        The entry of `label2name` stored under that key.

    Raises:
        KeyError: If the key is not present in `label2name`.
    """
    iso_3 = label.split('_')[0]
    return label2name[iso_3]


FLORES_ARC = {'name': 'FLORES - ARC style', 'csv': 'evals/flores-mean-arc.csv'}
Bible_ARC = {'name': 'Bible - ARC style', 'csv': 'evals/bible-mean-arc.csv'}
FLORES_Belebele = {'name': 'FLORES - Belebele style', 'csv': 'evals/flores-max-belebele.csv'}
Bible_Belebele = {'name': 'Bible - Belebele style', 'csv': 'evals/bible-max-belebele.csv'}

BENCHMARKS = [FLORES_ARC, Bible_ARC, FLORES_Belebele, Bible_Belebele]

# Column names of the leaderboard table.
MODEL_COL = "Model"
LANG_COL = "Language"
CODE_COL = "Code"
ARC_COL = 'ARC Style'
BELEBELE_COL = 'Belebele Style'
AVERAGE_COL = "Average"
NOTES_COL = "Notes"  # For search only

COLS = [MODEL_COL, LANG_COL, CODE_COL, AVERAGE_COL, ARC_COL, BELEBELE_COL, NOTES_COL]
# One datatype per entry of COLS, in the same order.
TYPES = ["str", "str", "str", "number", "number", "number", "str"]


def collect_results():
    """Read every benchmark CSV and gather the scores in a nested mapping.

    Each CSV listed in `BENCHMARKS` is read with its first column as index and
    converted with `DataFrame.to_dict()`, i.e. one outer entry per CSV column
    (used here as the model) and one inner entry per index value (used here as
    the language label). A column named 'avg' is skipped.

    Returns:
        A nested defaultdict of the form
        `{parallel_data: {(model, lang): {task_name: value}}}` where
        `parallel_data` is the first space-separated word of the benchmark
        name (e.g. 'FLORES' or 'Bible').
    """
    performance_dict = defaultdict(lambda: defaultdict(dict))
    for bench in BENCHMARKS:
        task = bench['name']
        parallel_data = task.split(' ')[0]
        results = pd.read_csv(bench['csv'], index_col=0).to_dict()
        # add performances
        for model, d_lang_value in results.items():
            if model == 'avg':
                continue
            for lang, value in d_lang_value.items():
                performance_dict[parallel_data][(model, lang)][task] = value
    return performance_dict


def get_leaderboard_df(results_dict, parallel_data='FLORES'):
    """Build the leaderboard table for one parallel dataset.

    Args:
        results_dict: Mapping as returned by `collect_results`.
        parallel_data: Key of `results_dict` to display ('FLORES' by default).

    Returns:
        A DataFrame with the columns listed in `COLS`. Languages are ordered
        by their mean 'Average' over all models (descending); rows of the same
        language are ordered by 'Average' (descending). An unknown
        `parallel_data` yields an empty DataFrame with the same columns.
    """
    arc_task = f'{parallel_data} - ARC style'
    belebele_task = f'{parallel_data} - Belebele style'
    performance_dict = results_dict.get(parallel_data, {})

    rows = []
    for (pretrained, lang), perfs in performance_dict.items():
        lang_name = get_name(lang)
        if arc_task in perfs:
            # A missing Belebele score counts as 0, like the all-missing
            # default below, instead of aborting the whole table.
            perfs_num = [perfs[arc_task], perfs.get(belebele_task, 0)]
        else:
            perfs_num = [0, 0]
        avg = round(sum(perfs_num) / len(perfs_num), 4)
        notes = ' '.join([pretrained, lang_name])
        rows.append([pretrained, lang_name, lang, avg] + perfs_num + [notes])

    df = pd.DataFrame.from_records(rows, columns=COLS)
    if df.empty:
        # Nothing to sort; skip the group-by on an empty table.
        return df

    # Sort
    language_aggregate = (
        df.groupby(LANG_COL)[AVERAGE_COL]
        .mean()
        .reset_index()
        .sort_values(by=AVERAGE_COL, ascending=False)
        .reset_index()
    )
    df = df.sort_values(by=AVERAGE_COL, ascending=False)
    # Selecting by the ranked language list pulls each language's rows in
    # rank order while keeping their per-language order from the sort above.
    df = df.set_index(LANG_COL).loc[language_aggregate[LANG_COL]].reset_index()
    return df[COLS]


performance = collect_results()


def search_table(query, selection):
    """Filter the leaderboard of `selection` by a search query.

    The query is matched case-insensitively against the 'Notes' column
    (model name followed by language name).

    Args:
        query: Search text. Interpreted as a regular expression when it is a
            valid one, otherwise matched literally. An empty or None query
            returns the full table.
        selection: Parallel dataset name passed to `get_leaderboard_df`.

    Returns:
        The rows of the leaderboard whose notes match the query.
    """
    df = get_leaderboard_df(performance, selection)
    if not query:
        return df
    notes = df[NOTES_COL]
    try:
        mask = notes.str.contains(query, case=False)
    except re.error:
        # The text comes from a search box; input such as "(" or "c++" is not
        # a valid pattern, so fall back to a plain substring match.
        mask = notes.str.contains(query, case=False, regex=False)
    return df[mask]


def update_dataframe(selection):
    """Return the leaderboard for a known selection.

    Args:
        selection: "FLORES" or "Bible".

    Returns:
        The leaderboard DataFrame for that selection, or None for any other
        value.
    """
    if selection in ("FLORES", "Bible"):
        return get_leaderboard_df(performance, selection)
    return None


# NOTE(review): the lines starting with "#" inside the string are not CSS
# comments (CSS only has /* ... */); the text is kept as found.
CUSTOM_CSS = """
/* Hides the final column */
table td:last-child,
table th:last-child {
    display: none;
}
# table td:first-child,
# table th:first-child {
#     max-width: 400px;
#     overflow: auto;
#     white-space: nowrap;
# }
"""

# NOTE(review): the visible source ends mid-statement at this point with an
# unterminated `TITLE = '` assignment. Its remainder lies outside this view,
# so it is not reconstructed here and must be re-joined with what follows.