booydar
committed on
Commit
·
4821c71
1
Parent(s):
275a638
add 10M results & prettify naming & do not display models with few evals on the avg tab
Browse files- app.py +5 -2
- draw_utils.py +4 -4
app.py
CHANGED
@@ -11,7 +11,7 @@ def draw_leaderboard():
|
|
11 |
df = load_results()
|
12 |
|
13 |
tasks = ['avg'] + [f"qa{i}" for i in range(1, 11)]
|
14 |
-
columns = ["model_name", "
|
15 |
|
16 |
st.title("🔎📚🪡📚❓ BABILong Leaderboard 🏆")
|
17 |
st.markdown(PAGE_INFO)
|
@@ -25,6 +25,9 @@ def draw_leaderboard():
|
|
25 |
for i, tab in enumerate(tabs):
|
26 |
with tab:
|
27 |
task_df = df[df.task == tasks[i]][columns]
|
|
|
|
|
|
|
28 |
|
29 |
if search_term:
|
30 |
task_df = task_df[task_df['model_name'].str.contains(search_term, case=False)]
|
@@ -37,7 +40,7 @@ def draw_leaderboard():
|
|
37 |
|
38 |
st.dataframe(
|
39 |
styled_df,
|
40 |
-
width=
|
41 |
height=height,
|
42 |
)
|
43 |
|
|
|
11 |
df = load_results()
|
12 |
|
13 |
tasks = ['avg'] + [f"qa{i}" for i in range(1, 11)]
|
14 |
+
columns = ["model_name", "≤32k", "≤128k"] + LENGTHS
|
15 |
|
16 |
st.title("🔎📚🪡📚❓ BABILong Leaderboard 🏆")
|
17 |
st.markdown(PAGE_INFO)
|
|
|
25 |
for i, tab in enumerate(tabs):
|
26 |
with tab:
|
27 |
task_df = df[df.task == tasks[i]][columns]
|
28 |
+
if i == 0: # do not display models with no evals ≤1k for avg task
|
29 |
+
print(task_df.loc[task_df[task_df.columns[:5]].isna().any(axis=1)])
|
30 |
+
task_df = task_df.loc[~task_df[task_df.columns[:5]].isna().any(axis=1)]
|
31 |
|
32 |
if search_term:
|
33 |
task_df = task_df[task_df['model_name'].str.contains(search_term, case=False)]
|
|
|
40 |
|
41 |
st.dataframe(
|
42 |
styled_df,
|
43 |
+
width=1100,
|
44 |
height=height,
|
45 |
)
|
46 |
|
draw_utils.py
CHANGED
@@ -16,7 +16,7 @@ PAGE_MARKDOWN = """
|
|
16 |
|
17 |
PAGE_INFO = """[](https://huggingface.co/datasets/RMT-team/babilong) | [GitHub](https://github.com/booydar/babilong) | [Paper](https://arxiv.org/abs/2406.10149) | [HF Dataset](https://huggingface.co/datasets/RMT-team/babilong) | [HF Dataset 1k samples per task](https://huggingface.co/datasets/RMT-team/babilong-1k-samples) |"""
|
18 |
|
19 |
-
LENGTHS = ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k', '128k', '512k', '1M', '2M']
|
20 |
LENGTHS_32k = ['0k', '1k', '2k', '4k', '8k', '16k', '32k']
|
21 |
LENGTHS_128k = ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k', '128k']
|
22 |
|
@@ -41,8 +41,8 @@ def load_results():
|
|
41 |
res.drop('normalized_name', axis=1, inplace=True)
|
42 |
|
43 |
res.replace(-1, np.nan, inplace=True)
|
44 |
-
res['
|
45 |
-
res['
|
46 |
|
47 |
# Calculate the maximum length with non-NaN values for each model
|
48 |
res['max_eval_length_idx'] = res.apply(
|
@@ -50,7 +50,7 @@ def load_results():
|
|
50 |
res['max_eval_length'] = res['max_eval_length_idx'].apply(lambda x: LENGTHS[x])
|
51 |
|
52 |
# Sort first by max length (descending) and then by average score (descending)
|
53 |
-
res.sort_values(['max_eval_length_idx', '
|
54 |
|
55 |
return res
|
56 |
|
|
|
16 |
|
17 |
PAGE_INFO = """[](https://huggingface.co/datasets/RMT-team/babilong) | [GitHub](https://github.com/booydar/babilong) | [Paper](https://arxiv.org/abs/2406.10149) | [HF Dataset](https://huggingface.co/datasets/RMT-team/babilong) | [HF Dataset 1k samples per task](https://huggingface.co/datasets/RMT-team/babilong-1k-samples) |"""
|
18 |
|
19 |
+
LENGTHS = ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k', '128k', '512k', '1M', '2M', '10M']
|
20 |
LENGTHS_32k = ['0k', '1k', '2k', '4k', '8k', '16k', '32k']
|
21 |
LENGTHS_128k = ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k', '128k']
|
22 |
|
|
|
41 |
res.drop('normalized_name', axis=1, inplace=True)
|
42 |
|
43 |
res.replace(-1, np.nan, inplace=True)
|
44 |
+
res['≤32k'] = res[LENGTHS_32k].mean(axis=1)
|
45 |
+
res['≤128k'] = res[LENGTHS_128k].mean(axis=1)
|
46 |
|
47 |
# Calculate the maximum length with non-NaN values for each model
|
48 |
res['max_eval_length_idx'] = res.apply(
|
|
|
50 |
res['max_eval_length'] = res['max_eval_length_idx'].apply(lambda x: LENGTHS[x])
|
51 |
|
52 |
# Sort first by max length (descending) and then by average score (descending)
|
53 |
+
res.sort_values(['max_eval_length_idx', '≤128k'], ascending=[False, False], inplace=True)
|
54 |
|
55 |
return res
|
56 |
|