|
|
|
import glob
import html
import json
import os
from datetime import datetime

import streamlit as st
|
|
|
st.set_page_config(layout="wide")
st.title('Meta Open LLM leaderboard')
st.write("Combine data from various open LLM leaderboards into one useful visualization page")
st.write("<nbsp/>", unsafe_allow_html=True)

# Every sub-directory of ./data is offered in the date selectors and its name
# is parsed by format_dir_date() with the "%Y%m%d_%H%M" pattern, so a plain
# lexicographic sort of those names is also chronological.
# os.listdir() returns entries in arbitrary order while the selectors
# pre-select the LAST entry; sorting is what makes the newest run the default.
# Non-directory entries are skipped because they hold no charts.
directories = sorted(
    entry
    for entry in os.listdir("./data")
    if os.path.isdir(os.path.join("./data", entry))
)
|
|
|
def format_dir_date(data_dir: str) -> str:
    """Turn a data directory name into a human-readable timestamp.

    Args:
        data_dir: Directory name in ``%Y%m%d_%H%M`` form,
            e.g. ``"20231105_1430"``.

    Returns:
        The same moment formatted as ``"%b %d, %Y %H:%M"``,
        e.g. ``"Nov 05, 2023 14:30"``.

    Raises:
        ValueError: If ``data_dir`` does not match the expected pattern.
    """
    return datetime.strptime(data_dir, "%Y%m%d_%H%M").strftime("%b %d, %Y %H:%M")
|
|
|
def _model_list_items(model_ids):
    """Render model IDs as ``<li>`` links to their HuggingFace pages."""
    items = []
    for model_id in model_ids:
        # IDs longer than 35 characters show only their last 35, prefixed by '...'.
        label = model_id if len(model_id) <= 35 else '...' + model_id[-35:]
        # The IDs come from a data file and are written out as raw HTML, so
        # escape them to keep markup characters from breaking the page.
        items.append(
            f'<li><a href="https://huggingface.co/{html.escape(model_id)}">'
            f'{html.escape(label)}</a></li>'
        )
    return "".join(items)


def print_model_list(file_name, st, split_into_two=False):
    """Write the model list that accompanies a chart as an HTML bullet list.

    The list is read from the JSON file that has the same path as
    ``file_name`` but with a ``.json`` extension.

    Args:
        file_name: Path of the chart image.
        st: Object to render into. It must provide ``write(body,
            unsafe_allow_html=...)`` and, when ``split_into_two`` is true,
            ``columns(n)``. Callers pass either the streamlit module or a
            column; the parameter intentionally shadows the module name.
        split_into_two: When true, the list is split across two columns,
            the first column receiving the extra item for odd lengths.
            Otherwise a single list is written.
    """
    # splitext handles any extension length, unlike slicing off 4 characters.
    file_path = os.path.splitext(file_name)[0] + '.json'
    with open(file_path, 'r', encoding='utf-8') as file:
        model_ids = json.load(file)

    if not split_into_two:
        st.write(f"<ul>{_model_list_items(model_ids)}</ul>", unsafe_allow_html=True)
        return

    # Ceiling division: the left column gets the extra item.
    midpoint = (len(model_ids) + 1) // 2
    cols = st.columns(2)
    cols[0].write(
        f"<ul>{_model_list_items(model_ids[:midpoint])}</ul>",
        unsafe_allow_html=True,
    )
    cols[1].write(
        f"<ul>{_model_list_items(model_ids[midpoint:])}</ul>",
        unsafe_allow_html=True,
    )
|
|
|
col1, col2 = st.columns(2)

# Both date pickers start on the last entry of `directories`.
latest_index = len(directories) - 1

with col1:
    # Primary date whose charts are displayed.
    data_dir = st.selectbox(
        'Select different data generation date',
        directories,
        format_func=format_dir_date,
        index=latest_index,
    )

with col2:
    compare_mode = st.checkbox('Enable compare to different date')
    if compare_mode:
        # Second date, only asked for (and only defined) in compare mode.
        compare_data_dir = st.selectbox(
            'Select date for comparison',
            directories,
            format_func=format_dir_date,
            index=latest_index,
        )

# Caption shown under each cross-leaderboard comparison chart, keyed by the
# chart's file name.
captions_map = {
    "hg_average_to_agentbench_compare.png": "HF to AgentBench compare",
    "hg_average_to_opencompass_compare.png": "HF to OpenCompass compare",
    "hg_average_to_mt_bench_compare.png": "HF to MT-Bench compare",
    "hg_average_to_mosaic_compare.png": "HF to MosaicML compare",
    "hg_average_to_alpacaeval_compare.png": "HF to AlpacaEval compare",
}

with col1:
    generated_on = format_dir_date(data_dir)
    st.write(
        f'<div style="text-align: center" >Generated on: <b>{generated_on}</b></div>',
        unsafe_allow_html=True,
    )
|
|
|
|
|
data_path = './data/' + data_dir


def extract_images(keyword, img_list):
    """Return the paths from ``img_list`` whose file name contains ``keyword``."""
    return [img for img in img_list if keyword in os.path.basename(img)]


def _list_pngs(directory):
    """Return the PNG paths directly inside ``directory`` in sorted order.

    glob.glob() gives no ordering guarantee, while the code below indexes
    into these lists and pairs them up positionally, so a stable order is
    required.
    """
    return sorted(glob.glob(os.path.join(directory, '*.png')))


def _split_by_leaderboard(img_list):
    """Split image paths into per-leaderboard lists plus the leftovers.

    Returns:
        A 6-tuple ``(hf_llm, bigcode, mt_bench, arena, opencompass,
        remaining)``. ``remaining`` keeps the order of ``img_list`` and
        holds every path that matched none of the five keywords.
    """
    hf_llm = extract_images('hf_llm_diagram', img_list)
    bigcode = extract_images('bigcode', img_list)
    mt_bench = extract_images('lmsys_leaderboard_mt_bench', img_list)
    arena = extract_images('lmsys_leaderboard_arena', img_list)
    opencompass = extract_images('opencompass_leaderboard', img_list)
    # Arena charts are excluded here as well, so they are not shown a second
    # time in the cross-leaderboard section.
    claimed = set(hf_llm + bigcode + mt_bench + arena + opencompass)
    remaining = [img for img in img_list if img not in claimed]
    return hf_llm, bigcode, mt_bench, arena, opencompass, remaining


def display_side_by_side(diagrams1, diagrams2, title):
    """Render one section with charts of two dates next to each other.

    Charts are paired positionally; pairing stops at the shorter list.
    """
    st.subheader(title, divider=True)
    for d1, d2 in zip(diagrams1, diagrams2):
        cols = st.columns(2)
        cols[0].image(d1, use_column_width="auto")
        cols[1].image(d2, use_column_width="auto")


if compare_mode:
    compare_data_path = './data/' + compare_data_dir

    imgs = _list_pngs(data_path)
    compare_imgs = _list_pngs(compare_data_path)

    (hf_llm_diagrams, bigcode_diagrams, mt_bench_diagrams, arena_diagrams,
     opencompass_diagrams, remaining_imgs) = _split_by_leaderboard(imgs)
    (compare_hf_llm_diagrams, compare_bigcode_diagrams,
     compare_mt_bench_diagrams, compare_arena_diagrams,
     compare_opencompass_diagrams,
     compare_remaining_imgs) = _split_by_leaderboard(compare_imgs)

    display_side_by_side(hf_llm_diagrams, compare_hf_llm_diagrams, "HuggingFace Open LLM leaderboard by Model Size")
    display_side_by_side(bigcode_diagrams, compare_bigcode_diagrams, "Big Code Models Leaderboard")
    display_side_by_side(mt_bench_diagrams, compare_mt_bench_diagrams, "MT-Bench Models Leaderboard")
    display_side_by_side(arena_diagrams, compare_arena_diagrams, "LMSYS Arena Elo Models Leaderboard")
    display_side_by_side(opencompass_diagrams, compare_opencompass_diagrams, "OpenCompass Models Leaderboard")

    st.subheader("HuggingFace and Other Leaderboards: A Comparative Model Evaluation", divider=True)
    st.caption("Only models evaluated on both leaderboards are included.")

    # Both leftover lists are sorted by path, so equal file names from the
    # two dates line up when both directories hold the same set of charts.
    for img, compare_img in zip(remaining_imgs, compare_remaining_imgs):
        cols = st.columns(2)

        filename = os.path.basename(img)
        caption = captions_map.get(filename, "")

        compare_filename = os.path.basename(compare_img)
        compare_caption = captions_map.get(compare_filename, "")

        cols[0].image(img, caption=caption, width=None)
        cols[1].image(compare_img, caption=compare_caption, width=None)

else:
    imgs = _list_pngs(data_path)

    (hf_llm_diagrams, bigcode_diagrams, mt_bench_diagrams, arena_diagrams,
     opencompass_diagrams, remaining_imgs) = _split_by_leaderboard(imgs)

    # NOTE(review): the captions below are tied to list positions, i.e. to the
    # sorted file names of the five hf_llm_diagram charts. Confirm the chart
    # generator names its files so that this order matches the captions.
    st.subheader("HuggingFace Open LLM leaderboard by Model Size", divider=True)
    cols = st.columns(2)
    cols[0].image(hf_llm_diagrams[0], caption="Main chart using all the models", use_column_width="auto")
    print_model_list(hf_llm_diagrams[0], st, True)
    st.write("<nbsp/>", unsafe_allow_html=True)

    cols = st.columns(2)
    cols[0].image(hf_llm_diagrams[1], caption="Other or commercially permissive licenses only", use_column_width="auto")
    print_model_list(hf_llm_diagrams[1], cols[0])
    cols[1].image(hf_llm_diagrams[2], caption="Commercially permissive license only", use_column_width="auto")
    print_model_list(hf_llm_diagrams[2], cols[1])
    st.write("<nbsp/>", unsafe_allow_html=True)

    cols = st.columns(2)
    cols[0].image(hf_llm_diagrams[3], caption="TruthfulQA at 10% for HuggingFace Open LLM leaderboard by Model Size", use_column_width="auto")
    print_model_list(hf_llm_diagrams[3], cols[0], False)
    cols[1].image(hf_llm_diagrams[4], caption="ARC at 50% and MMLU at 50% for HuggingFace Open LLM leaderboard by Model Size", use_column_width="auto")
    print_model_list(hf_llm_diagrams[4], cols[1], False)

    # The remaining leaderboards each show their first chart in the left
    # column, followed by a two-column model list.
    for title, diagrams in (
        ("Big Code Models Leaderboard", bigcode_diagrams),
        ("MT-Bench Models Leaderboard", mt_bench_diagrams),
        ("LMSYS Arena Elo Models Leaderboard", arena_diagrams),
        ("OpenCompass Models Leaderboard", opencompass_diagrams),
    ):
        st.subheader(title, divider=True)
        cols = st.columns(2)
        cols[0].image(diagrams[0], use_column_width="auto")
        print_model_list(diagrams[0], st, True)

    st.subheader("HuggingFace and Other Leaderboards: A Comparative Model Evaluation", divider=True)
    st.caption("Only models evaluated on both leaderboards are included.")

    # Leftover charts alternate between the two columns.
    cols = st.columns(2)
    for i, img in enumerate(remaining_imgs):
        filename = os.path.basename(img)
        caption = captions_map.get(filename, "")
        cols[i % 2].image(img, caption=caption, width=None)
|
|
|
# Static footer: the list of tracked leaderboards, then the About section.
# The HTML is kept unindented so the markdown renderer treats it as HTML
# rather than as an indented code block.
st.write(
    """
<p>Leaderboards tracked:</p>
<ul>
<li><a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">Hugging Face Open LLM</a></li>
<li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">MT-Bench and Arena Elo</a> MT-Bench is GPT4 judged evaluation of models, Arena Elo is users ranking outputs between models.</li>
<li><a href="https://tatsu-lab.github.io/alpaca_eval/">AlpacaEval</a> GPT4 judged evaluation of models</li>
<li><a href="https://www.mosaicml.com/llm-evaluation">MosaicML</a> Balanced set of static benchmarks</li>
<li><a href="https://opencompass.org.cn/leaderboard-llm">OpenCompass</a> Balanced set of static benchmarks</li>
<li><a href="https://llmbench.ai/data">AgentBench</a> Benchmark evaluating Agent abilities</li>
<li><a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">BigCode</a> Compare performance of base multilingual code generation models</li>
</ul>
<sub>HuggingFace models that have been flagged as contaminated or do not provide any model card information are excluded.</sub>
""",
    unsafe_allow_html=True,
)

st.subheader('About', divider=True)
st.write(
    'This meta leaderboard is built and maintained by Felix Zaslavskiy. For feedback, correction, suggestions please reach out on X at <a href="https://twitter.com/FZaslavskiy" >@FZaslavskiy</a> or here via community discussions.',
    unsafe_allow_html=True,
)