Spaces:

felixz
/

meta_open_llm_leaderboard

Sleeping

File size: 7,219 Bytes

7411847
6d7a85e
 
7411847
 
 
 
 
d1b3326
7411847
8d4c97a
 
7411847
9a34cda
7411847
9a34cda
 
 
 
 
 
 
e254e41
 
 
 
 
 
 
 
 
7411847
84f536a
 
602bc64
84f536a
 
 
 
e254e41
 
84f536a
 
9a34cda
 
7411847
 
84f536a
 
b9fbf95
0110cd9
 
84f536a
 
f44293a
84f536a
8fcd96c
6d7a85e
 
 
 
8fcd96c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d7a85e
5669002
8d4c97a
d1b3326
 
e254e41
d1b3326
 
e254e41
6d7a85e
dedfb73
6d7a85e
e254e41
6d7a85e
 
dedfb73
6d7a85e
84f536a
e254e41
 
 
 
 
 
 
 
 
 
b9fbf95
8d4c97a
d1b3326
 
e254e41
b9fbf95
 
 
0110cd9
 
 
 
 
e254e41
0110cd9
 
 
 
e254e41
8d4c97a
84f536a
e254e41
7411847
 
84f536a
 
 
 
 
 
 
 
 
b0337e1
9a34cda
 
0e46dc8
 
f765ec9
0e46dc8
 
 
 
 
b9fbf95
9a34cda
 
 
5669002
 
8d4c97a
b0337e1

# app.py
import glob
import html
import json
import os
from datetime import datetime

import streamlit as st

# Page chrome: wide layout, title and a short description.
st.set_page_config(layout="wide")
st.title('Meta Open LLM leaderboard')
st.write("Combine data from various open LLM leaderboards into one useful visualization page")
st.write("<nbsp/>", unsafe_allow_html=True)

# Each sub-directory of ./data is one data snapshot named "%Y%m%d_%H%M".
# os.listdir() returns entries in arbitrary order, so sort them: for this
# naming scheme lexicographic order is chronological, which makes the last
# entry (the selectbox default further down) the most recent snapshot.
# Plain files are skipped because they are not snapshots.
directories = sorted(
    entry
    for entry in os.listdir("./data")
    if os.path.isdir(os.path.join("./data", entry))
)

def format_dir_date(data_dir):
    """Convert a snapshot directory name into a human-readable timestamp.

    Args:
        data_dir: Directory name in the form ``YYYYMMDD_HHMM``,
            e.g. ``"20230915_1430"``.

    Returns:
        The timestamp formatted like ``"Sep 15, 2023 14:30"``.

    Raises:
        ValueError: If ``data_dir`` does not match ``%Y%m%d_%H%M``.
    """
    timestamp = datetime.strptime(data_dir, "%Y%m%d_%H%M")
    # A format spec on a datetime is handed to strftime().
    return format(timestamp, "%b %d, %Y %H:%M")

col1, col2 = st.columns(2)

with col1:
    # Let the user pick a data snapshot; the last directory in the list is
    # preselected.
    data_dir = st.selectbox(
        'Select different data generation date',
        directories,
        format_func=format_dir_date,
        index=len(directories) - 1,
    )

# Captions for the cross-leaderboard comparison charts, keyed by file name.
captions_map = {
    "hg_average_to_agentbench_compare.png": "HF to AgentBench compare",
    "hg_average_to_opencompass_compare.png": "HF to OpenCompass compare",
    "hg_average_to_mt_bench_compare.png": "HF to MT-Bench compare",
    "hg_average_to_mosaic_compare.png": "HF to MosaicML compare",
    "hg_average_to_alpacaeval_compare.png": "HF to AlpacaEval compare",
}

with col2:
    st.write(
        f'<div style="text-align: center" >Generated on: <b>{format_dir_date(data_dir)}</b></div>',
        unsafe_allow_html=True,
    )


data_path = os.path.join('./data', data_dir)

imgs = glob.glob(os.path.join(data_path, '*.png'))


def _images_containing(marker):
    """Return the snapshot images whose file name contains ``marker``."""
    return [img for img in imgs if marker in os.path.basename(img)]


# Group the images by leaderboard, based on a substring of the file name.
# NOTE(review): these lists keep glob's order, which is filesystem-dependent;
# code that indexes into them assumes a particular order - confirm.
hf_llm_diagrams = _images_containing('hf_llm_diagram')
bigcode_diagrams = _images_containing('bigcode')
mt_bench_diagrams = _images_containing('mt_bench_leaderboard')
opencompass_diagrams = _images_containing('opencompass_leaderboard')

# Everything not claimed by a group above is a comparison chart. Sort the
# result so the charts are shown in the same order on every run; going
# through a plain set made the order depend on string hashing, which changes
# between interpreter processes.
_categorized = set(hf_llm_diagrams) | set(bigcode_diagrams) | set(mt_bench_diagrams) | set(opencompass_diagrams)
remaining_imgs = sorted(img for img in imgs if img not in _categorized)

# Model ids longer than this are shortened to "..." plus their last characters.
_MAX_LABEL_LEN = 35


def _model_links_html(model_ids):
    """Return one ``<li>`` link to huggingface.co per model id, concatenated."""
    items = []
    for model_id in model_ids:
        if len(model_id) <= _MAX_LABEL_LEN:
            label = model_id
        else:
            label = '...' + model_id[-_MAX_LABEL_LEN:]
        # The ids come from a data file, so escape them before embedding them
        # in markup that is rendered with unsafe_allow_html=True.
        items.append(
            f'<li><a href="https://huggingface.co/{html.escape(model_id)}">'
            f'{html.escape(label)}</a></li>'
        )
    return "".join(items)


def print_model_list(file_name, st, split_into_two=False):
    """Render the model list that accompanies a chart image.

    The list is read from the JSON file that sits next to the image and has
    the same name with a ``.json`` extension. It is expected to be a JSON
    array of model id strings.

    Args:
        file_name: Path of the chart image, e.g. ``.../chart.png``.
        st: Streamlit container to render into (the ``streamlit`` module
            itself or a column); it must provide ``write`` and ``columns``.
        split_into_two: When true, the list is shown as two side-by-side
            columns, with the first column receiving the extra item for
            odd-length lists. Otherwise a single list is written.

    Raises:
        FileNotFoundError: If the companion JSON file does not exist.
    """
    # splitext copes with any extension length, unlike slicing off 4 chars.
    file_path = os.path.splitext(file_name)[0] + '.json'
    with open(file_path, 'r', encoding='utf-8') as file:
        model_ids = json.load(file)

    if split_into_two:
        midpoint = (len(model_ids) + 1) // 2  # left half gets the odd item
        left_col, right_col = st.columns(2)
        left_col.write(
            "<ul>" + _model_links_html(model_ids[:midpoint]) + "</ul>",
            unsafe_allow_html=True,
        )
        right_col.write(
            "<ul>" + _model_links_html(model_ids[midpoint:]) + "</ul>",
            unsafe_allow_html=True,
        )
    else:
        st.write(
            "<ul>" + _model_links_html(model_ids) + "</ul>",
            unsafe_allow_html=True,
        )


def _show_chart(image_col, list_target, diagrams, index, caption=None, split_into_two=False):
    """Show ``diagrams[index]`` in ``image_col`` followed by its model list.

    Args:
        image_col: Streamlit container that receives the image.
        list_target: Container passed to ``print_model_list`` for the list.
        diagrams: List of image paths for one leaderboard.
        index: Position of the wanted chart in ``diagrams``.
        caption: Optional caption shown under the image.
        split_into_two: Forwarded to ``print_model_list``.

    A snapshot may not contain every chart. Instead of letting an IndexError
    abort the whole page, a warning is shown in place of the missing chart.
    """
    if index >= len(diagrams):
        image_col.warning("This chart is not available for the selected data generation date.")
        return
    chart = diagrams[index]
    image_col.image(chart, caption=caption, use_column_width="auto")
    print_model_list(chart, list_target, split_into_two)


# --- HuggingFace Open LLM leaderboard -------------------------------------
# NOTE(review): the captions below are tied to positions in hf_llm_diagrams,
# whose order comes from glob and is filesystem-dependent - confirm that the
# file names yield the intended order.
st.subheader("HuggingFace Open LLM leaderboard by Model Size", divider=True)
cols = st.columns(2)
_show_chart(cols[0], st, hf_llm_diagrams, 0,
            caption="Main chart using all the models", split_into_two=True)
st.write("<nbsp/>", unsafe_allow_html=True)

cols = st.columns(2)
_show_chart(cols[0], cols[0], hf_llm_diagrams, 1,
            caption="Other or commercially permissive licenses only")
_show_chart(cols[1], cols[1], hf_llm_diagrams, 2,
            caption="Commercially permissive license only")

st.write("<nbsp/>", unsafe_allow_html=True)

cols = st.columns(2)
_show_chart(cols[0], cols[0], hf_llm_diagrams, 3,
            caption="TruthfulQA at 10% for HuggingFace Open LLM leaderboard by Model Size")
_show_chart(cols[1], cols[1], hf_llm_diagrams, 4,
            caption="ARC at 50% and MMLU at 50% for HuggingFace Open LLM leaderboard by Model Size")

# --- Single-chart leaderboards ---------------------------------------------
# Each section has the same layout: heading, chart in the left half of the
# page, then the model list split into two columns across the full width.
for section_title, section_diagrams in (
    ("Big Code Models Leaderboard", bigcode_diagrams),
    ("MT-Bench Models Leaderboard", mt_bench_diagrams),
    ("OpenCompass Models Leaderboard", opencompass_diagrams),
):
    st.subheader(section_title, divider=True)
    cols = st.columns(2)
    _show_chart(cols[0], st, section_diagrams, 0, split_into_two=True)

# --- Cross-leaderboard comparison charts -----------------------------------
st.subheader("HuggingFace and Other Leaderboards: A Comparative Model Evaluation", divider=True)
st.caption("Only models evaluated on both leaderboards are included.")

cols = st.columns(2)

for i, img in enumerate(remaining_imgs):
    # Look the caption up by file name; unknown files get an empty caption.
    caption = captions_map.get(os.path.basename(img), "")

    # Alternate between the left and right column.
    cols[i % 2].image(img, caption=caption, width=None)

st.write(
    """
    <p>Leaderboards tracked:</p>
     <ul>
        <li><a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">Hugging Face Open LLM</a></li>
        <li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">MT-Bench</a> GPT4 judged evaluation of models</li>
        <li><a href="https://tatsu-lab.github.io/alpaca_eval/">AlpacaEval</a> GPT4 judged evaluation of models</li>
        <li><a href="https://www.mosaicml.com/llm-evaluation">MosaicML</a> Balanced set of static benchmarks</li>
        <li><a href="https://opencompass.org.cn/leaderboard-llm">OpenCompass</a> Balanced set of static benchmarks</li>
        <li><a href="https://llmbench.ai/data">AgentBench</a> Benchmark evaluating Agent abilities</li>
        <li><a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">BigCode</a>  Compare performance of base multilingual code generation models</li>
        </ul>
    """, unsafe_allow_html=True
)


st.subheader('About', divider=True)
st.write('This meta leaderboard is built and maintained by Felix Zaslavskiy. For feedback, correction, suggestions please reach out on X at <a href="https://twitter.com/FZaslavskiy" >@FZaslavskiy</a> or here via community discussions.', unsafe_allow_html=True)