feat: add rank and language dropdown lists
Files changed:
- app.py (+14 −10)
- src/display/utils.py (+4 −0)
- src/leaderboard/read_evals.py (+11 −2)
- utils.py (+23 −18)
app.py (CHANGED)

@@ -12,7 +12,7 @@ from src.display.css_html_js import custom_css
 from src.leaderboard.read_evals import get_raw_eval_results, get_leaderboard_df
 
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
-from utils import update_table, update_metric, update_table_long_doc, upload_file
+from utils import update_table, update_metric, update_table_long_doc, upload_file, get_default_cols
 from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, metric_list
 
 
@@ -39,7 +39,12 @@ print(f'QA data loaded: {original_df_qa.shape}')
 print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
 
 leaderboard_df_qa = original_df_qa.copy()
+shown_columns_qa = get_default_cols('qa', leaderboard_df_qa.columns, add_fix_cols=True)
+leaderboard_df_qa = leaderboard_df_qa[shown_columns_qa]
+
 leaderboard_df_long_doc = original_df_long_doc.copy()
+shown_columns_long_doc = get_default_cols('long_doc', leaderboard_df_long_doc.columns, add_fix_cols=True)
+leaderboard_df_long_doc = leaderboard_df_long_doc[shown_columns_long_doc]
 
 
 def update_metric_qa(
@@ -97,11 +102,12 @@ with demo:
             )
             # select language
             with gr.Row():
-                selected_langs = gr.
+                selected_langs = gr.Dropdown(
                     choices=LANG_COLS_QA,
                     value=LANG_COLS_QA,
                     label="Select the languages",
                     elem_id="language-column-select",
+                    multiselect=True,
                     interactive=True
                 )
             # select reranking model
@@ -117,8 +123,6 @@ with demo:
 
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df_qa,
-                # headers=shown_columns,
-                # datatype=TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,
                 visible=True,
@@ -205,11 +209,12 @@ with demo:
             )
             # select language
             with gr.Row():
-                selected_langs = gr.
+                selected_langs = gr.Dropdown(
                     choices=LANG_COLS_LONG_DOC,
                     value=LANG_COLS_LONG_DOC,
                     label="Select the languages",
                     elem_id="language-column-select-long-doc",
+                    multiselect=True,
                     interactive=True
                 )
             # select reranking model
@@ -225,8 +230,6 @@ with demo:
 
             leaderboard_table_long_doc = gr.components.Dataframe(
                 value=leaderboard_df_long_doc,
-                # headers=shown_columns,
-                # datatype=TYPES,
                 elem_id="leaderboard-table-long-doc",
                 interactive=False,
                 visible=True,
@@ -235,8 +238,6 @@ with demo:
             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_leaderboard_table_for_search = gr.components.Dataframe(
                 value=leaderboard_df_long_doc,
-                # headers=COLS,
-                # datatype=TYPES,
                 visible=False,
             )
 
@@ -293,7 +294,10 @@ with demo:
         with gr.Row():
             with gr.Column():
                 benchmark_version = gr.Dropdown(
-                    [
+                    ["AIR-Bench_24.04",],
+                    value="AIR-Bench_24.04",
+                    interactive=True,
+                    label="AIR-Bench Version")
             with gr.Column():
                 model_name_textbox = gr.Textbox(label="Model name")
             with gr.Column():
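The UI change above is easier to see in isolation: a `gr.Dropdown` with `multiselect=True` lets the user pick several languages at once, and its change event can re-filter the displayed table. The snippet below is a minimal, self-contained sketch of that interaction, not the app's actual wiring; the toy dataframe and the `filter_by_langs` helper are invented for illustration.

```python
import gradio as gr
import pandas as pd

# Toy data standing in for the leaderboard dataframe.
df = pd.DataFrame({
    "Retrieval Model": ["model-a", "model-b"],
    "en": [0.71, 0.64],
    "zh": [0.55, 0.80],
})

def filter_by_langs(langs: list) -> pd.DataFrame:
    # Keep the fixed column plus whichever language columns are selected.
    return df[["Retrieval Model"] + [lang for lang in langs if lang in df.columns]]

with gr.Blocks() as demo:
    selected_langs = gr.Dropdown(
        choices=["en", "zh"],
        value=["en", "zh"],
        label="Select the languages",
        multiselect=True,
        interactive=True,
    )
    table = gr.Dataframe(value=df, interactive=False)
    # Re-filter the table whenever the language selection changes.
    selected_langs.change(filter_by_langs, inputs=[selected_langs], outputs=[table])

if __name__ == "__main__":
    demo.launch()
```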
src/display/utils.py (CHANGED)

@@ -22,6 +22,7 @@ class ColumnContent:
 COL_NAME_AVG = "Average ⬆️"
 COL_NAME_RETRIEVAL_MODEL = "Retrieval Model"
 COL_NAME_RERANKING_MODEL = "Reranking Model"
+COL_NAME_RANK = "Rank 🏆"
 
 def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
     ## Leaderboard columns
@@ -36,6 +37,9 @@ def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
     auto_eval_column_dict.append(
         ["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)]
     )
+    auto_eval_column_dict.append(
+        ["rank", ColumnContent, ColumnContent(COL_NAME_RANK, "number", True)]
+    )
     for benchmark in benchmarks:
         auto_eval_column_dict.append(
             [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
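The `[attribute, type, default]` triples appended above match the field-spec format of `dataclasses.make_dataclass`, which is how the standard Hugging Face leaderboard template builds its auto-eval column classes. Assuming this repo follows that pattern, the new rank entry behaves as in the trimmed-down sketch below; the `ColumnContent` fields and the `AutoEvalColumn` name here are illustrative, not copied from the file.

```python
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool

# Field triples in the same [attribute, type, default] shape as in the diff.
auto_eval_column_dict = [
    ["rank", ColumnContent, ColumnContent("Rank 🏆", "number", True)],
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
]

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.rank.name)     # -> Rank 🏆
print(AutoEvalColumn.average.name)  # -> Average ⬆️
```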
src/leaderboard/read_evals.py (CHANGED)

@@ -9,8 +9,16 @@ import pandas as pd
 
 from src.benchmarks import get_safe_name
 from src.display.formatting import has_no_nan_values
-from src.display.utils import
-
+from src.display.utils import (
+    COL_NAME_RERANKING_MODEL,
+    COL_NAME_RETRIEVAL_MODEL,
+    COLS_QA,
+    QA_BENCHMARK_COLS,
+    COLS_LONG_DOC,
+    LONG_DOC_BENCHMARK_COLS,
+    COL_NAME_AVG,
+    COL_NAME_RANK
+)
 
 
 @dataclass
@@ -158,6 +166,7 @@ def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -
     df[COL_NAME_AVG] = df[list(_benchmark_cols)].mean(axis=1).round(decimals=2)
     df = df.sort_values(by=[COL_NAME_AVG], ascending=False)
     df.reset_index(inplace=True)
+    df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="dense")
 
     _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
     df = df[_cols].round(decimals=2)
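The added line derives the new rank column from the average score. With `method="dense"`, tied averages share a rank and the next distinct average takes the next integer, so ranks have no gaps. A quick pandas illustration with made-up numbers:

```python
import pandas as pd

# Made-up averages with a tie, standing in for df[COL_NAME_AVG].
avg = pd.Series([74.2, 71.5, 71.5, 69.0])
print(avg.rank(ascending=False, method="dense").tolist())
# -> [1.0, 2.0, 2.0, 3.0]
```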
utils.py (CHANGED)

@@ -3,7 +3,7 @@ from typing import List
 import pandas as pd
 
 from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, BenchmarksQA, BenchmarksLongDoc
-from src.display.utils import AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS_QA, COLS_LONG_DOC
+from src.display.utils import AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS_QA, COLS_LONG_DOC, COL_NAME_RANK, COL_NAME_AVG, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
 from src.leaderboard.read_evals import FullEvalResult, get_leaderboard_df
 
 
@@ -37,25 +37,28 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
     return df[(df[AutoEvalColumnQA.retrieval_model.name].str.contains(query, case=False))]
 
 
-def
+def get_default_cols(task: str, columns: list, add_fix_cols: bool=True) -> list:
     if task == "qa":
-
-            AutoEvalColumnQA.retrieval_model.name,
-            AutoEvalColumnQA.reranking_model.name,
-            AutoEvalColumnQA.average.name
-        ]
-        cols = list(frozenset(COLS_QA).intersection(frozenset(BENCHMARK_COLS_QA)))
+        cols = list(frozenset(COLS_QA).intersection(frozenset(BENCHMARK_COLS_QA)).intersection(frozenset(columns)))
     elif task == "long_doc":
-
-
-
-
-
-
+        cols = list(frozenset(COLS_LONG_DOC).intersection(frozenset(BENCHMARK_COLS_LONG_DOC)).intersection(frozenset(columns)))
+    else:
+        raise NotImplemented
+    if add_fix_cols:
+        cols = FIXED_COLS + cols
+    return cols
+
+
+FIXED_COLS = [
+    COL_NAME_RANK,
+    COL_NAME_RETRIEVAL_MODEL,
+    COL_NAME_RERANKING_MODEL,
+    COL_NAME_AVG,
+]
+
+
+def select_columns(df: pd.DataFrame, domain_query: list, language_query: list, task: str = "qa") -> pd.DataFrame:
+    cols = get_default_cols(task=task, columns=df.columns, add_fix_cols=False)
     selected_cols = []
     for c in cols:
-        if c not in df.columns:
-            continue
         if task == "qa":
             eval_col = BenchmarksQA[c].value
         elif task == "long_doc":
@@ -66,8 +69,10 @@ def select_columns(df: pd.DataFrame, domain_query: list, language_query: list, task: str = "qa") -> pd.DataFrame:
             continue
         selected_cols.append(c)
     # We use COLS to maintain sorting
-    filtered_df = df[
-    filtered_df[
+    filtered_df = df[FIXED_COLS + selected_cols]
+    filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].mean(axis=1).round(decimals=2)
+    filtered_df[COL_NAME_RANK] = filtered_df[COL_NAME_AVG].rank(ascending=False, method="dense")
+
     return filtered_df
 
 
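Taken together, `get_default_cols` and `select_columns` keep the four fixed columns (rank, retrieval model, reranking model, average) and then recompute the average and dense rank over only the benchmark columns the user selected. A toy walk-through of that subset-then-rerank behavior, with illustrative model names, benchmark columns, and scores rather than real leaderboard data:

```python
import pandas as pd

FIXED_COLS = ["Rank 🏆", "Retrieval Model", "Reranking Model", "Average ⬆️"]

# Invented scores: the full-table ranking has retriever-a on top.
df = pd.DataFrame({
    "Rank 🏆": [1, 2],
    "Retrieval Model": ["retriever-a", "retriever-b"],
    "Reranking Model": ["none", "none"],
    "Average ⬆️": [70.0, 68.0],
    "wiki_en": [75.0, 60.0],
    "news_zh": [65.0, 76.0],
})

# Suppose the user narrows the domain/language selection to one column.
selected_cols = ["news_zh"]
filtered_df = df[FIXED_COLS + selected_cols].copy()
filtered_df["Average ⬆️"] = filtered_df[selected_cols].mean(axis=1).round(2)
filtered_df["Rank 🏆"] = filtered_df["Average ⬆️"].rank(ascending=False, method="dense")
print(filtered_df)  # retriever-b now ranks first on the news_zh-only view
```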