laiviet committed on
Commit
8c2ee0f
·
1 Parent(s): 13a280b

Add search capability and language names

Browse files
Files changed (3) hide show
  1. app.py +70 -6
  2. content.py +1 -1
  3. css.py +13 -0
app.py CHANGED
@@ -2,8 +2,10 @@ import os
2
  import json
3
  import glob
4
  from collections import defaultdict
 
5
  import gradio as gr
6
  from content import *
 
7
  import glob
8
 
9
  ARC = "arc"
@@ -14,6 +16,42 @@ BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
14
 
15
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  def collect_results():
19
  performance_dict = defaultdict(dict)
@@ -52,6 +90,7 @@ def collect_results():
52
  def get_leaderboard_df(performance_dict, pretrained_models):
53
  df = list()
54
  for (pretrained, lang), perfs in performance_dict.items():
 
55
  arc_perf = perfs.get(ARC, 0.0)
56
  hellaswag_perf = perfs.get(HELLASWAG, 0.0)
57
  mmlu_perf = perfs.get(MMLU, 0.0)
@@ -60,26 +99,40 @@ def get_leaderboard_df(performance_dict, pretrained_models):
60
  if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0:
61
  continue
62
  avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
63
- row = [pretrained, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
 
64
  df.append(row)
 
 
 
 
 
65
  return df
66
 
67
 
 
 
 
 
 
 
68
  MODEL_COL = "Model"
69
  LANG_COL = "Language"
 
70
  AVERAGE_COL = "Average"
71
  ARC_COL = "ARC (25-shot)"
72
  HELLASWAG_COL = "HellaSwag (10-shot)️"
73
  MMLU_COL = "MMLU (5-shot)"
74
  TRUTHFULQA_COL = "TruthfulQA (0-shot)"
 
75
 
76
- COLS = [MODEL_COL, LANG_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
77
- TYPES = ["str", "str", "number", "number", "number", "number", "number"]
78
 
79
  args = collect_results()
80
- leaderboard_df = get_leaderboard_df(*args)
81
 
82
- demo = gr.Blocks()
83
  with demo:
84
  gr.HTML(TITLE)
85
  gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
@@ -91,13 +144,24 @@ with demo:
91
  )
92
 
93
  leaderboard_table = gr.components.Dataframe(
94
- value=leaderboard_df,
95
  headers=COLS,
96
  datatype=TYPES,
97
  max_rows=5,
98
  elem_id="leaderboard-table",
99
  )
100
 
 
 
 
 
 
 
 
 
 
 
 
101
  gr.Markdown(CREDIT, elem_classes="markdown-text")
102
  gr.Markdown(CITATION, elem_classes="markdown-text")
103
 
 
2
  import json
3
  import glob
4
  from collections import defaultdict
5
+ import pandas as pd
6
  import gradio as gr
7
  from content import *
8
+ from css import *
9
  import glob
10
 
11
  ARC = "arc"
 
16
 
17
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
18
 
19
+ LANGS = 'ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh'.split(',')
20
+
21
+ LANG_NAME = {
22
+ 'ar': 'Arabic',
23
+ 'bn': 'Bengali',
24
+ 'ca': 'Catalan',
25
+ 'da': 'Danish',
26
+ 'de': 'German',
27
+ 'es': 'Spanish',
28
+ 'eu': 'Basque',
29
+ 'fr': 'French',
30
+ 'gu': 'Gujarati',
31
+ 'hi': 'Hindi',
32
+ 'hr': 'Croatian',
33
+ 'hu': 'Hungarian',
34
+ 'hy': 'Armenian',
35
+ 'id': 'Indonesian',
36
+ 'it': 'Italian',
37
+ 'kn': 'Kannada',
38
+ 'ml': 'Malayalam',
39
+ 'mr': 'Marathi',
40
+ 'ne': 'Nepali',
41
+ 'nl': 'Dutch',
42
+ 'pt': 'Portuguese',
43
+ 'ro': 'Romanian',
44
+ 'ru': 'Russian',
45
+ 'sk': 'Slovak',
46
+ 'sr': 'Serbian',
47
+ 'sv': 'Swedish',
48
+ 'ta': 'Tamil',
49
+ 'te': 'Telugu',
50
+ 'uk': 'Ukrainian',
51
+ 'vi': 'Vietnamese',
52
+ 'zh': 'Chinese'
53
+ }
54
+
55
 
56
  def collect_results():
57
  performance_dict = defaultdict(dict)
 
90
  def get_leaderboard_df(performance_dict, pretrained_models):
91
  df = list()
92
  for (pretrained, lang), perfs in performance_dict.items():
93
+ lang_name = LANG_NAME[lang]
94
  arc_perf = perfs.get(ARC, 0.0)
95
  hellaswag_perf = perfs.get(HELLASWAG, 0.0)
96
  mmlu_perf = perfs.get(MMLU, 0.0)
 
99
  if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0:
100
  continue
101
  avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
102
+ notes = ' '.join([pretrained, lang_name, lang])
103
+ row = [pretrained, lang_name, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
104
  df.append(row)
105
+
106
+ df = pd.DataFrame.from_records(df, columns=COLS)
107
+ df = df.sort_values(by=[AVERAGE_COL], ascending=False)
108
+ df = df[COLS]
109
+
110
  return df
111
 
112
 
113
+ def search_table(df, query):
114
+ filtered_df = df[df[NOTES_COL].str.contains(query, case=False)]
115
+ return filtered_df
116
+
117
+
118
+
119
  MODEL_COL = "Model"
120
  LANG_COL = "Language"
121
+ CODE_COL = "Code"
122
  AVERAGE_COL = "Average"
123
  ARC_COL = "ARC (25-shot)"
124
  HELLASWAG_COL = "HellaSwag (10-shot)️"
125
  MMLU_COL = "MMLU (5-shot)"
126
  TRUTHFULQA_COL = "TruthfulQA (0-shot)"
127
+ NOTES_COL = "Notes" # For search only
128
 
129
+ COLS = [MODEL_COL, LANG_COL, CODE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
130
+ TYPES = ["str", "str", "str", "number", "number", "number", "number", "number", "str"]
131
 
132
  args = collect_results()
133
+ original_df = get_leaderboard_df(*args)
134
 
135
+ demo = gr.Blocks(css=CUSTOM_CSS)
136
  with demo:
137
  gr.HTML(TITLE)
138
  gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
 
144
  )
145
 
146
  leaderboard_table = gr.components.Dataframe(
147
+ value=original_df,
148
  headers=COLS,
149
  datatype=TYPES,
150
  max_rows=5,
151
  elem_id="leaderboard-table",
152
  )
153
 
154
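+ # Hidden copy of the full leaderboard, used as the search input so that every query (including one shortened with the backspace key) filters the original rows rather than the already-filtered visible table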
155
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
156
+ value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
157
+ )
158
+
159
+ search_bar.change(
160
+ search_table,
161
+ [hidden_leaderboard_table_for_search, search_bar],
162
+ leaderboard_table,
163
+ )
164
+
165
  gr.Markdown(CREDIT, elem_classes="markdown-text")
166
  gr.Markdown(CITATION, elem_classes="markdown-text")
167
 
content.py CHANGED
@@ -3,7 +3,7 @@ TITLE = '<h1 align="center" id="space-title">Open Multilingual LLM Evaluation Le
3
  INTRO_TEXT = f"""
4
  ## About
5
 
6
- This leaderboard shows the performance of pretrained models in 29 languages on four benchmarks:
7
 
8
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
9
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
 
3
  INTRO_TEXT = f"""
4
  ## About
5
 
6
+ This leaderboard shows the performance of pretrained models in 31 languages, including Arabic, Armenian, Basque, Bengali, Catalan, Chinese, Croatian, Danish, Dutch, French, German, Gujarati, Hindi, Hungarian, Indonesian, Italian, Kannada, Malayalam, Marathi, Nepali, Portuguese, Romanian, Russian, Serbian, Slovak, Spanish, Swedish, Tamil, Telugu, Ukrainian, and Vietnamese, on four benchmarks:
7
 
8
  - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
9
  - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
css.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CUSTOM_CSS= """
2
+ /* Hides the final column */
3
+ table td:last-child,
4
+ table th:last-child {
5
+ display: none;
6
+ }
7
+ # table td:first-child,
8
+ # table th:first-child {
9
+ # max-width: 400px;
10
+ # overflow: auto;
11
+ # white-space: nowrap;
12
+ # }
13
+ """