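"""Streamlit app for YALL - Yet Another LLM Leaderboard.

Builds a filterable, searchable leaderboard from LLM AutoEval results
(Nous benchmark suite), with model comparison, CSV export, merge-config
fetching, and score charts.
"""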
import streamlit as st
import pandas as pd
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
from itertools import combinations
import re
from functools import cache
from io import StringIO
from yall import create_yall
import plotly.graph_objs as go
from huggingface_hub import ModelCard
def calculate_pages(df, items_per_page):
    return -(-len(df) // items_per_page)  # Equivalent to math.ceil(len(df) / items_per_page)
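# Cache per-model metadata lookups so Streamlit reruns don't re-query the Hub for every model.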
@st.cache_data
def cached_model_info(api, model):
    try:
        return api.model_info(repo_id=str(model))
    except (RepositoryNotFoundError, RevisionNotFoundError):
        return None
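# Enrich the leaderboard dataframe with likes and tags from the Hub;
# models that can't be resolved get Likes = -1 and an empty tag string.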
@st.cache_data
def get_model_info(df):
    api = HfApi()
    for index, row in df.iterrows():
        model_info = cached_model_info(api, row['Model'].strip())
        if model_info:
            df.loc[index, 'Likes'] = model_info.likes
            df.loc[index, 'Tags'] = ', '.join(model_info.tags)
        else:
            df.loc[index, 'Likes'] = -1
            df.loc[index, 'Tags'] = ''
    return df
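# Turn the markdown leaderboard table into a dataframe: strip the outer pipes,
# drop the separator row, and split each "[name](url)" link in the Model column
# into clean Model and URL fields.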
def convert_markdown_table_to_dataframe(md_content):
    # Remove leading and trailing pipes so pandas can split on the remaining ones
    cleaned_content = re.sub(r'\|\s*$', '', re.sub(r'^\|\s*', '', md_content, flags=re.MULTILINE), flags=re.MULTILINE)
    df = pd.read_csv(StringIO(cleaned_content), sep=r'\|', engine='python')
    # Drop the markdown header separator row (|---|---|...)
    df = df.drop(0, axis=0)
    df.columns = df.columns.str.strip()
    # Extract the URL from the markdown link and keep only the display name in 'Model'
    model_link_pattern = r'\[(.*?)\]\((.*?)\)\s*\[.*?\]\(.*?\)'
    df['URL'] = df['Model'].apply(lambda x: re.search(model_link_pattern, x).group(2) if re.search(model_link_pattern, x) else None)
    df['Model'] = df['Model'].apply(lambda x: re.sub(model_link_pattern, r'\1', x))
    return df
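# Render a horizontal bar chart for one score column, sorted so the best model appears at the top.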
def create_bar_chart(df, category):
    st.write(f"### {category} Scores")
    sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
    fig = go.Figure(go.Bar(
        x=sorted_df[category],
        y=sorted_df['Model'],
        orientation='h',
        marker=dict(color=sorted_df[category], colorscale='Agsunset')
    ))
    # Scale the figure height with the number of models; the height is set on the
    # figure layout rather than passed to st.plotly_chart.
    fig.update_layout(
        height=len(df) * 35,
        margin=dict(l=20, r=20, t=20, b=20)
    )
    st.plotly_chart(fig, use_container_width=True)
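# Collect merge configurations for the top-20 models: dump their scores and model
# cards to /tmp, pull out any yaml code blocks from the cards, and return the
# structured results so the UI can display and export them.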
def fetch_merge_configs(df):
    df_sorted = df.sort_values(by='Average', ascending=False)
    configurations = []
    with open('/tmp/configurations.txt', 'a') as file:
        for index, row in df_sorted.head(20).iterrows():
            model_name = row['Model'].rstrip()
            card = ModelCard.load(model_name)
            file.write(f'Model Name: {model_name}\n')
            file.write(f'Scores: {row["Average"]}\n')
            file.write(f'AGIEval: {row["AGIEval"]}\n')
            file.write(f'GPT4All: {row["GPT4All"]}\n')
            file.write(f'TruthfulQA: {row["TruthfulQA"]}\n')
            file.write(f'Bigbench: {row["Bigbench"]}\n')
            file.write(f'Model Card: {card}\n')
            configurations.append({
                'Model Name': model_name,
                'Scores': row['Average'],
                'AGIEval': row['AGIEval'],
                'GPT4All': row['GPT4All'],
                'TruthfulQA': row['TruthfulQA'],
                'Bigbench': row['Bigbench'],
                'Model Card': str(card),
            })
    with open('/tmp/configurations.txt', 'r') as file:
        content = file.read()
    matches = re.findall(r'yaml(.*?)```', content, re.DOTALL)
    csv_data = df_sorted.head(20).to_csv(index=False)
    with open('/tmp/configurations2.txt', 'w') as file:
        for row, match in zip(df_sorted[['Model', 'Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']].head(20).values, matches):
            file.write(f'Model Name: {row[0]}\n')
            file.write(f'Scores: {row[1]}\n')
            file.write(f'AGIEval: {row[2]}\n')
            file.write(f'GPT4All: {row[3]}\n')
            file.write(f'TruthfulQA: {row[4]}\n')
            file.write(f'Bigbench: {row[5]}\n')
            file.write('yaml' + match + '```\n')
    # main() unpacks three values: the structured configs, the extracted yaml blocks,
    # and a CSV of the top-20 rows.
    return configurations, matches, csv_data
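# Streamlit entry point: renders the leaderboard tab (family filters, search,
# pagination, model comparison, filtered averages, CSV export, merge-config
# fetching, and score charts) and an About tab describing the benchmark suite.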
def main():
    st.set_page_config(page_title="YALL - Yet Another LLM Leaderboard", layout="wide")
    st.title("🏆 YALL - Yet Another LLM Leaderboard")
    st.markdown("Leaderboard made with 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval) using [Nous](https://huggingface.co/NousResearch) benchmark suite.")

    content = create_yall()
    tab1, tab2 = st.tabs(["🏆 Leaderboard", "📝 About"])

    with tab1:
        if content:
            try:
                score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']

                full_df = convert_markdown_table_to_dataframe(content)
                for col in score_columns:
                    full_df[col] = pd.to_numeric(full_df[col].str.strip(), errors='coerce')
                full_df = get_model_info(full_df)
                full_df['Tags'] = full_df['Tags'].fillna('')
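                # Start from an empty dataframe and let the user choose which
                # model families (by tag) to include in the leaderboard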
                df = pd.DataFrame(columns=full_df.columns)
                show_phi = st.checkbox("Phi (2.8B)", value=True)
                show_mistral = st.checkbox("Mistral (7B)", value=True)
                show_other = st.checkbox("Other", value=True)

                dfs_to_concat = []
                if show_phi:
                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('phi,|phi-msft,')])
                if show_mistral:
                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('mistral,')])
                if show_other:
                    other_df = full_df[~full_df['Tags'].str.lower().str.contains('phi,|phi-msft,|mistral,')]
                    dfs_to_concat.append(other_df)

                if dfs_to_concat:
                    df = pd.concat(dfs_to_concat, ignore_index=True)
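                # Search box and pagination over the filtered leaderboard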
                search_query = st.text_input("Search models", "")
                if search_query:
                    df = df[df['Model'].str.contains(search_query, case=False)]

                items_per_page = 50
                pages = calculate_pages(df, items_per_page)
                page = st.selectbox("Page", list(range(1, pages + 1)))

                df = df.sort_values(by='Average', ascending=False)
                start = (page - 1) * items_per_page
                end = start + items_per_page
                df = df[start:end]
                st.dataframe(
                    df[['Model'] + score_columns + ['Likes', 'URL']],
                    use_container_width=True,
                    column_config={
                        "Likes": st.column_config.NumberColumn(
                            "Likes",
                            help="Number of likes on Hugging Face",
                            format="%d ❤️",
                        ),
                        "URL": st.column_config.LinkColumn("URL"),
                    },
                    hide_index=True,
                    height=len(df) * 37,
                )
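                # Optional side-by-side comparison of user-selected models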
                selected_models = st.multiselect('Select models to compare', df['Model'].unique())
                comparison_df = df[df['Model'].isin(selected_models)]
                st.dataframe(comparison_df)
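                # Recompute the average over a user-selected subset of benchmarks
                # and show the re-ranked table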
                selected_benchmarks = st.multiselect('Select benchmarks to include in the average', score_columns, default=score_columns)
                if selected_benchmarks:
                    df['Filtered Average'] = df[selected_benchmarks].mean(axis=1)
                    df = df.sort_values(by='Filtered Average', ascending=False)
                    st.dataframe(
                        df[['Model'] + selected_benchmarks + ['Filtered Average', 'Likes', 'URL']],
                        use_container_width=True,
                        column_config={
                            "Likes": st.column_config.NumberColumn(
                                "Likes",
                                help="Number of likes on Hugging Face",
                                format="%d ❤️",
                            ),
                            "URL": st.column_config.LinkColumn("URL"),
                        },
                        hide_index=True,
                        height=len(df) * 37,
                    )
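                    # Export buttons: leaderboard CSV and merge configurations
                    # pulled from the top models' cards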
if st.button("Export to CSV"):
csv_data = df.to_csv(index=False)
st.download_button(
label="Download CSV",
data=csv_data,
file_name="leaderboard.csv",
key="download-csv",
help="Click to download the CSV file",
)
if st.button("Fetch Merge-Configs"):
configurations, matches, csv_data = fetch_merge_configs(full_df)
for config in configurations:
st.text(f"Model Name: {config['Model Name']}\nScores: {config['Scores']}\nAGIEval: {config['AGIEval']}\nGPT4All: {config['GPT4All']}\nTruthfulQA: {config['TruthfulQA']}\nBigbench: {config['Bigbench']}\nModel Card: {config['Model Card']}\n\n")
configurations_df = pd.DataFrame(configurations)
configurations_csv = configurations_df.to_csv(index=False)
st.download_button(
label="Download Configurations",
data=configurations_csv,
file_name="configurations.csv",
key="download-csv",
help="Click to download the CSV file",
)
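                    # Bar charts: filtered average full-width, then the four
                    # individual benchmarks in two rows of two columns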
                    create_bar_chart(df, 'Filtered Average')

                    col1, col2 = st.columns(2)
                    with col1:
                        create_bar_chart(df, score_columns[1])
                    with col2:
                        create_bar_chart(df, score_columns[2])

                    col3, col4 = st.columns(2)
                    with col3:
                        create_bar_chart(df, score_columns[3])
                    with col4:
                        create_bar_chart(df, score_columns[4])

            except Exception as e:
                st.error("An error occurred while processing the markdown table.")
                st.error(str(e))
        else:
            st.error("Failed to download the content from the URL provided.")
    with tab2:
        st.markdown('''
### Nous benchmark suite
Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks:
* [**AGIEval**](https://arxiv.org/abs/2304.06364) (0-shot): `agieval_aqua_rat,agieval_logiqa_en,agieval_lsat_ar,agieval_lsat_lr,agieval_lsat_rc,agieval_sat_en,agieval_sat_en_without_passage,agieval_sat_math`
* **GPT4ALL** (0-shot): `hellaswag,openbookqa,winogrande,arc_easy,arc_challenge,boolq,piqa`
* [**TruthfulQA**](https://arxiv.org/abs/2109.07958) (0-shot): `truthfulqa_mc`
* [**Bigbench**](https://arxiv.org/abs/2206.04615) (0-shot): `bigbench_causal_judgement,bigbench_date_understanding,bigbench_disambiguation_qa,bigbench_geometric_shapes,bigbench_logical_deduction_five_objects,bigbench_logical_deduction_seven_objects,bigbench_logical_deduction_three_objects,bigbench_movie_recommendation,bigbench_navigate,bigbench_reasoning_about_colored_objects,bigbench_ruin_names,bigbench_salient_translation_error_detection,bigbench_snarks,bigbench_sports_understanding,bigbench_temporal_sequences,bigbench_tracking_shuffled_objects_five_objects,bigbench_tracking_shuffled_objects_seven_objects,bigbench_tracking_shuffled_objects_three_objects`
### Reproducibility
You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a Colab notebook that automates the evaluation process (benchmark: `nous`). This will upload the results to GitHub as gists. You can find the entire table with the links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).
### Clone this space
You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:
* Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
* Create "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens))
A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations.
''')
if __name__ == "__main__":
    main()