Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
7ff98ba
1 Parent(s): 165d25c

feat: make the model name clickable

Browse files
Files changed (3) hide show
  1. app.py +5 -0
  2. src/display/utils.py +3 -2
  3. src/leaderboard/read_evals.py +9 -5
app.py CHANGED
@@ -14,6 +14,7 @@ from src.leaderboard.read_evals import get_raw_eval_results, get_leaderboard_df
14
  from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
15
  from utils import update_table, update_metric, update_table_long_doc, upload_file, get_default_cols
16
  from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, metric_list
 
17
 
18
 
19
  def restart_space():
@@ -122,6 +123,7 @@ with demo:
122
 
123
  leaderboard_table = gr.components.Dataframe(
124
  value=leaderboard_df_qa,
 
125
  elem_id="leaderboard-table",
126
  interactive=False,
127
  visible=True,
@@ -130,6 +132,7 @@ with demo:
130
  # Dummy leaderboard for handling the case when the user uses backspace key
131
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
132
  value=leaderboard_df_qa,
 
133
  # headers=COLS,
134
  # datatype=TYPES,
135
  visible=False,
@@ -229,6 +232,7 @@ with demo:
229
 
230
  leaderboard_table_long_doc = gr.components.Dataframe(
231
  value=leaderboard_df_long_doc,
 
232
  elem_id="leaderboard-table-long-doc",
233
  interactive=False,
234
  visible=True,
@@ -237,6 +241,7 @@ with demo:
237
  # Dummy leaderboard for handling the case when the user uses backspace key
238
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
239
  value=leaderboard_df_long_doc,
 
240
  visible=False,
241
  )
242
 
 
14
  from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
15
  from utils import update_table, update_metric, update_table_long_doc, upload_file, get_default_cols
16
  from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, metric_list
17
+ from src.display.utils import TYPES_QA, TYPES_LONG_DOC
18
 
19
 
20
  def restart_space():
 
123
 
124
  leaderboard_table = gr.components.Dataframe(
125
  value=leaderboard_df_qa,
126
+ datatype=TYPES_QA,
127
  elem_id="leaderboard-table",
128
  interactive=False,
129
  visible=True,
 
132
  # Dummy leaderboard for handling the case when the user uses backspace key
133
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
134
  value=leaderboard_df_qa,
135
+ datatype=TYPES_QA,
136
  # headers=COLS,
137
  # datatype=TYPES,
138
  visible=False,
 
232
 
233
  leaderboard_table_long_doc = gr.components.Dataframe(
234
  value=leaderboard_df_long_doc,
235
+ datatype=TYPES_LONG_DOC,
236
  elem_id="leaderboard-table-long-doc",
237
  interactive=False,
238
  visible=True,
 
241
  # Dummy leaderboard for handling the case when the user uses backspace key
242
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
243
  value=leaderboard_df_long_doc,
244
+ datatype=TYPES_LONG_DOC,
245
  visible=False,
246
  )
247
 
src/display/utils.py CHANGED
@@ -66,9 +66,10 @@ AutoEvalColumnLongDoc = make_autoevalcolumn(
66
  # Column selection
67
  COLS_QA = [c.name for c in fields(AutoEvalColumnQA) if not c.hidden]
68
  COLS_LONG_DOC = [c.name for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
69
- TYPES = [c.type for c in fields(AutoEvalColumnQA) if not c.hidden]
 
70
  COLS_LITE = [c.name for c in fields(AutoEvalColumnQA) if c.displayed_by_default and not c.hidden]
71
 
72
  QA_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksQA]
73
 
74
- LONG_DOC_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksLongDoc]
 
66
  # Column selection
67
  COLS_QA = [c.name for c in fields(AutoEvalColumnQA) if not c.hidden]
68
  COLS_LONG_DOC = [c.name for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
69
+ TYPES_QA = [c.type for c in fields(AutoEvalColumnQA) if not c.hidden]
70
+ TYPES_LONG_DOC = [c.type for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
71
  COLS_LITE = [c.name for c in fields(AutoEvalColumnQA) if c.displayed_by_default and not c.hidden]
72
 
73
  QA_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksQA]
74
 
75
+ LONG_DOC_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksLongDoc]
src/leaderboard/read_evals.py CHANGED
@@ -4,7 +4,6 @@ from collections import defaultdict
4
  from dataclasses import dataclass
5
  from typing import List
6
 
7
- import dateutil.parser._parser
8
  import pandas as pd
9
 
10
  from src.benchmarks import get_safe_name
@@ -22,6 +21,8 @@ from src.display.utils import (
22
  COL_NAME_RANK
23
  )
24
 
 
 
25
 
26
  @dataclass
27
  class EvalResult:
@@ -100,8 +101,10 @@ class FullEvalResult:
100
  if eval_result.task != task:
101
  continue
102
  results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
103
- results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL] = self.retrieval_model
104
- results[eval_result.eval_name][COL_NAME_RERANKING_MODEL] = self.reranking_model
 
 
105
  results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL_LINK] = self.retrieval_model_link
106
  results[eval_result.eval_name][COL_NAME_RERANKING_MODEL_LINK] = self.reranking_model_link
107
 
@@ -177,16 +180,17 @@ def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -
177
  df = pd.DataFrame.from_records(all_data_json)
178
  print(f'dataframe created: {df.shape}')
179
 
180
- # calculate the average score for selected benchmarks
181
  _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
 
 
182
  df[COL_NAME_AVG] = df[list(_benchmark_cols)].mean(axis=1).round(decimals=2)
183
  df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
184
  df.reset_index(inplace=True, drop=True)
185
- df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
186
 
187
  _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
188
  df = df[_cols].round(decimals=2)
189
 
190
  # filter out if any of the benchmarks have not been produced
191
  df = df[has_no_nan_values(df, _benchmark_cols)]
 
192
  return df
 
4
  from dataclasses import dataclass
5
  from typing import List
6
 
 
7
  import pandas as pd
8
 
9
  from src.benchmarks import get_safe_name
 
21
  COL_NAME_RANK
22
  )
23
 
24
+ from src.display.formatting import make_clickable_model
25
+
26
 
27
  @dataclass
28
  class EvalResult:
 
101
  if eval_result.task != task:
102
  continue
103
  results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
104
+ results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL] = (
105
+ make_clickable_model(self.retrieval_model, self.retrieval_model_link))
106
+ results[eval_result.eval_name][COL_NAME_RERANKING_MODEL] = (
107
+ make_clickable_model(self.reranking_model, self.reranking_model_link))
108
  results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL_LINK] = self.retrieval_model_link
109
  results[eval_result.eval_name][COL_NAME_RERANKING_MODEL_LINK] = self.reranking_model_link
110
 
 
180
  df = pd.DataFrame.from_records(all_data_json)
181
  print(f'dataframe created: {df.shape}')
182
 
 
183
  _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
184
+
185
+ # calculate the average score for selected benchmarks
186
  df[COL_NAME_AVG] = df[list(_benchmark_cols)].mean(axis=1).round(decimals=2)
187
  df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
188
  df.reset_index(inplace=True, drop=True)
 
189
 
190
  _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
191
  df = df[_cols].round(decimals=2)
192
 
193
  # filter out if any of the benchmarks have not been produced
194
  df = df[has_no_nan_values(df, _benchmark_cols)]
195
+ df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
196
  return df