booydar committed
Commit 4821c71 · 1 Parent(s): 275a638

add 10M results & prettify naming & do not display models with few evals on the avg tab

Files changed (2):
  1. app.py +5 -2
  2. draw_utils.py +4 -4
app.py CHANGED
@@ -11,7 +11,7 @@ def draw_leaderboard():
     df = load_results()
 
     tasks = ['avg'] + [f"qa{i}" for i in range(1, 11)]
-    columns = ["model_name", "<=32k", "<=128k"] + LENGTHS
+    columns = ["model_name", "≤32k", "≤128k"] + LENGTHS
 
     st.title("🔎📚🪡📚❓ BABILong Leaderboard 🏆")
     st.markdown(PAGE_INFO)
@@ -25,6 +25,9 @@ def draw_leaderboard():
     for i, tab in enumerate(tabs):
         with tab:
             task_df = df[df.task == tasks[i]][columns]
+            if i == 0:  # do not display models with no evals ≤1k for the avg task
+                print(task_df.loc[task_df[task_df.columns[:5]].isna().any(axis=1)])
+                task_df = task_df.loc[~task_df[task_df.columns[:5]].isna().any(axis=1)]
 
             if search_term:
                 task_df = task_df[task_df['model_name'].str.contains(search_term, case=False)]
@@ -37,7 +40,7 @@
 
             st.dataframe(
                 styled_df,
-                width=1030,
+                width=1100,
                 height=height,
             )
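For reviewers, a minimal self-contained sketch of what the new avg-tab branch does, on a hypothetical toy frame. The column layout mirrors app.py: the first five columns are model_name, ≤32k, ≤128k, 0k, 1k, so a model missing any of them has no complete evals at lengths ≤1k.

# Toy illustration (not part of the commit): the avg tab keeps only
# models with values in all of the first five columns.
import numpy as np
import pandas as pd

task_df = pd.DataFrame({
    "model_name": ["model-a", "model-b", "model-c"],  # hypothetical models
    "≤32k":  [0.71, 0.64, np.nan],
    "≤128k": [0.58, 0.52, 0.33],
    "0k":    [0.92, 0.88, np.nan],
    "1k":    [0.85, np.nan, 0.35],
    "2k":    [0.80, 0.74, 0.30],
})

# Same expression as the added lines in app.py, applied when i == 0:
mask = task_df[task_df.columns[:5]].isna().any(axis=1)
task_df = task_df.loc[~mask]
print(task_df["model_name"].tolist())  # ['model-a'] — b and c are dropped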
draw_utils.py CHANGED
@@ -16,7 +16,7 @@ PAGE_MARKDOWN = """
 
 PAGE_INFO = """[![Dataset on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/dataset-on-hf-lg.svg)](https://huggingface.co/datasets/RMT-team/babilong) | [GitHub](https://github.com/booydar/babilong) | [Paper](https://arxiv.org/abs/2406.10149) | [HF Dataset](https://huggingface.co/datasets/RMT-team/babilong) | [HF Dataset 1k samples per task](https://huggingface.co/datasets/RMT-team/babilong-1k-samples) |"""
 
-LENGTHS = ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k', '128k', '512k', '1M', '2M']
+LENGTHS = ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k', '128k', '512k', '1M', '2M', '10M']
 LENGTHS_32k = ['0k', '1k', '2k', '4k', '8k', '16k', '32k']
 LENGTHS_128k = ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k', '128k']
 
@@ -41,8 +41,8 @@ def load_results():
     res.drop('normalized_name', axis=1, inplace=True)
 
     res.replace(-1, np.nan, inplace=True)
-    res['<=32k'] = res[LENGTHS_32k].mean(axis=1)
-    res['<=128k'] = res[LENGTHS_128k].mean(axis=1)
+    res['≤32k'] = res[LENGTHS_32k].mean(axis=1)
+    res['≤128k'] = res[LENGTHS_128k].mean(axis=1)
 
     # Calculate the maximum length with non-NaN values for each model
     res['max_eval_length_idx'] = res.apply(
@@ -50,7 +50,7 @@ def load_results():
     res['max_eval_length'] = res['max_eval_length_idx'].apply(lambda x: LENGTHS[x])
 
     # Sort first by max length (descending) and then by average score (descending)
-    res.sort_values(['max_eval_length_idx', '<=128k'], ascending=[False, False], inplace=True)
+    res.sort_values(['max_eval_length_idx', '≤128k'], ascending=[False, False], inplace=True)
 
     return res
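And for context on the renaming, a minimal sketch of how load_results() builds the ≤32k/≤128k aggregates and the sort order. Scores are hypothetical, and since the real res.apply(...) lambda is truncated in the diff above, the one below is an assumed equivalent, not the commit's code.

# Toy illustration (not part of the commit) of the aggregate columns
# and the leaderboard ordering, including the new '10M' length.
import numpy as np
import pandas as pd

LENGTHS = ['0k', '1k', '2k', '4k', '8k', '16k', '32k',
           '64k', '128k', '512k', '1M', '2M', '10M']
LENGTHS_32k = LENGTHS[:7]    # up to and including '32k'
LENGTHS_128k = LENGTHS[:9]   # up to and including '128k'

res = pd.DataFrame(
    [[90, 85, 80, 75, 70, 60, 50, 40, 30, np.nan, np.nan, np.nan, np.nan],
     [80, 75, 70, 65, 60, 55, 45, 35, 25, 20, 15, 10, 5]],
    columns=LENGTHS, index=['model-a', 'model-b'])  # hypothetical scores

# mean(axis=1) skips NaN, so partially evaluated models still get aggregates
res['≤32k'] = res[LENGTHS_32k].mean(axis=1)
res['≤128k'] = res[LENGTHS_128k].mean(axis=1)

# Index of the longest length that has a score, then its label
# (assumed equivalent of the truncated lambda in draw_utils.py)
res['max_eval_length_idx'] = res[LENGTHS].apply(
    lambda row: max(i for i, v in enumerate(row) if pd.notna(v)), axis=1)
res['max_eval_length'] = res['max_eval_length_idx'].apply(lambda x: LENGTHS[x])

# Longest evaluated context first, ties broken by the ≤128k average
res.sort_values(['max_eval_length_idx', '≤128k'],
                ascending=[False, False], inplace=True)
print(res[['max_eval_length', '≤32k', '≤128k']])
# model-b (evaluated up to 10M) ranks above model-a (up to 128k)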