
nan committed
Commit: af8395f
1 Parent(s): 9400714

feat: implement anonymous displaying for qa

Files changed (3)
  1. app.py (+23 -13)
  2. src/read_evals.py (+7 -4)
  3. src/utils.py (+19 -6)
app.py CHANGED
@@ -11,6 +11,7 @@ from src.about import (
 from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, METRIC_LIST, \
     DEFAULT_METRIC
 from src.display.css_html_js import custom_css
+from src.display.utils import COL_NAME_IS_ANONYMOUS
 from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
 from src.read_evals import get_raw_eval_results, get_leaderboard_df
 from src.utils import update_table, update_metric, update_table_long_doc, upload_file, get_default_cols, submit_results
@@ -20,13 +21,14 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)


-try:
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
-        token=TOKEN
-    )
-except Exception:
-    restart_space()
+# try:
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
+#         token=TOKEN
+#     )
+# except Exception as e:
+#     print(f'failed to download')
+#     restart_space()

 raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")

@@ -40,7 +42,7 @@ print(f'Long-Doc data loaded: {len(original_df_long_doc)}')

 leaderboard_df_qa = original_df_qa.copy()
 shown_columns_qa, types_qa = get_default_cols('qa', leaderboard_df_qa.columns, add_fix_cols=True)
-leaderboard_df_qa = leaderboard_df_qa[shown_columns_qa]
+leaderboard_df_qa = leaderboard_df_qa[~leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]

 leaderboard_df_long_doc = original_df_long_doc.copy()
 shown_columns_long_doc, types_long_doc = get_default_cols('long-doc', leaderboard_df_long_doc.columns,
@@ -54,8 +56,9 @@ def update_metric_qa(
         langs: list,
         reranking_model: list,
         query: str,
+        show_anonymous: bool
 ):
-    return update_metric(raw_data, 'qa', metric, domains, langs, reranking_model, query)
+    return update_metric(raw_data, 'qa', metric, domains, langs, reranking_model, query, show_anonymous)


 def update_metric_long_doc(
@@ -123,6 +126,12 @@ with demo:
                         multiselect=True,
                         interactive=True
                     )
+                with gr.Row():
+                    show_anonymous = gr.Checkbox(
+                        label="Show anonymous submissions",
+                        value=False,
+                        info="The anonymous submissions might have invalid model information."
+                    )

                 leaderboard_table = gr.components.Dataframe(
                     value=leaderboard_df_qa,
@@ -134,10 +143,8 @@ with demo:

                 # Dummy leaderboard for handling the case when the user uses backspace key
                 hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                    value=leaderboard_df_qa,
+                    value=original_df_qa,
                     datatype=types_qa,
-                    # headers=COLS,
-                    # datatype=TYPES,
                     visible=False,
                 )

@@ -150,13 +157,14 @@ with demo:
                     selected_langs,
                     selected_rerankings,
                     search_bar,
+                    show_anonymous,
                 ],
                 leaderboard_table,
             )

             # Set column-wise listener
             for selector in [
-                selected_domains, selected_langs, selected_rerankings
+                selected_domains, selected_langs, selected_rerankings, show_anonymous
             ]:
                 selector.change(
                     update_table,
@@ -166,6 +174,7 @@ with demo:
                         selected_langs,
                         selected_rerankings,
                         search_bar,
+                        show_anonymous,
                     ],
                     leaderboard_table,
                     queue=True,
@@ -180,6 +189,7 @@ with demo:
                     selected_langs,
                     selected_rerankings,
                     search_bar,
+                    show_anonymous,
                 ],
                 leaderboard_table,
                 queue=True
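
For context, the snippet below is a minimal, self-contained sketch (not the Space's actual code) of the UI pattern app.py gains in this commit: a gr.Checkbox whose change event re-filters the leaderboard table, so anonymous rows stay hidden by default. The dataframe contents, the "Anonymous Submission" column name, and the filter_table helper are illustrative stand-ins for COL_NAME_IS_ANONYMOUS and the update_table wiring.

import gradio as gr
import pandas as pd

# Toy leaderboard data; stands in for leaderboard_df_qa.
df = pd.DataFrame({
    "Model": ["model-a", "model-b", "model-c"],
    "Average": [52.3, 48.1, 44.7],
    "Anonymous Submission": [False, True, False],  # stand-in for COL_NAME_IS_ANONYMOUS
})

def filter_table(show_anonymous: bool) -> pd.DataFrame:
    # Hide anonymous rows unless the checkbox is ticked, mirroring
    # leaderboard_df_qa[~leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]] in the commit.
    shown = df if show_anonymous else df[~df["Anonymous Submission"]]
    return shown.drop(columns=["Anonymous Submission"])

with gr.Blocks() as demo:
    show_anonymous = gr.Checkbox(
        label="Show anonymous submissions",
        value=False,
        info="Anonymous submissions might have invalid model information.",
    )
    table = gr.Dataframe(value=filter_table(False))
    # Same wiring idea as the listener loop in app.py: the checkbox is one of
    # the inputs that triggers a table refresh.
    show_anonymous.change(filter_table, inputs=[show_anonymous], outputs=table)

if __name__ == "__main__":
    demo.launch()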
src/read_evals.py CHANGED
@@ -15,6 +15,7 @@ from src.display.utils import (
     COL_NAME_RETRIEVAL_MODEL_LINK,
     COL_NAME_REVISION,
     COL_NAME_TIMESTAMP,
+    COL_NAME_IS_ANONYMOUS,
     COLS_QA,
     QA_BENCHMARK_COLS,
     COLS_LONG_DOC,
@@ -90,7 +91,7 @@ class FullEvalResult:
                 metric=config["metric"],
                 timestamp=config.get("timestamp", "2024-05-12T12:24:02Z"),
                 revision=config.get("revision", "3a2ba9dcad796a48a02ca1147557724e"),
-                is_anonymous=config.get("is_anonymous", False)
+                is_anonymous=config.get("is_anonymous", True)
             )
             result_list.append(eval_result)
         return cls(
@@ -124,6 +125,7 @@ class FullEvalResult:
             results[eval_result.eval_name][COL_NAME_RERANKING_MODEL_LINK] = self.reranking_model_link
             results[eval_result.eval_name][COL_NAME_REVISION] = self.revision
             results[eval_result.eval_name][COL_NAME_TIMESTAMP] = self.timestamp
+            results[eval_result.eval_name][COL_NAME_IS_ANONYMOUS] = self.is_anonymous

             # print(f'result loaded: {eval_result.eval_name}')
             for result in eval_result.results:
@@ -183,11 +185,12 @@ def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -
     """
     Creates a dataframe from all the individual experiment results
     """
+    cols = [COL_NAME_IS_ANONYMOUS, ]
     if task == "qa":
-        cols = COLS_QA
+        cols += COLS_QA
         benchmark_cols = QA_BENCHMARK_COLS
     elif task == "long-doc":
-        cols = COLS_LONG_DOC
+        cols += COLS_LONG_DOC
         benchmark_cols = LONG_DOC_BENCHMARK_COLS
     else:
         raise NotImplemented
@@ -195,7 +198,7 @@ def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -
     for v in raw_data:
         all_data_json += v.to_dict(task=task, metric=metric)
     df = pd.DataFrame.from_records(all_data_json)
-    print(f'dataframe created: {df.shape}')
+    # print(f'dataframe created: {df.shape}')

     _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))

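
To make the data path concrete, here is a toy sketch (simplified stand-ins, not the repository's real classes) of how the is_anonymous flag now travels from a result's config into the leaderboard dataframe: a missing flag defaults to True (anonymous), and the anonymity column is prepended to the task columns so it is carried through get_leaderboard_df even though the UI does not display it.

import pandas as pd

# Illustrative column names; the real ones live in src/display/utils.py.
COL_NAME_IS_ANONYMOUS = "Anonymous Submission"
COLS_QA = ["Retrieval Model", "Average"]

def parse_config(config: dict) -> dict:
    # Missing flags now default to True, matching the commit's change from
    # config.get("is_anonymous", False) to config.get("is_anonymous", True).
    return {
        "Retrieval Model": config["retrieval_model"],
        "Average": config["average"],
        COL_NAME_IS_ANONYMOUS: config.get("is_anonymous", True),
    }

def get_leaderboard_df(raw_configs: list) -> pd.DataFrame:
    # Prepend the anonymity column to the task columns so every row keeps it.
    cols = [COL_NAME_IS_ANONYMOUS, ] + COLS_QA
    df = pd.DataFrame.from_records([parse_config(c) for c in raw_configs])
    return df[cols]

if __name__ == "__main__":
    rows = [
        {"retrieval_model": "model-a", "average": 52.3, "is_anonymous": False},
        {"retrieval_model": "model-b", "average": 48.1},  # no flag -> anonymous
    ]
    print(get_leaderboard_df(rows))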
src/utils.py CHANGED
@@ -8,7 +8,7 @@ import pandas as pd
 from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, BenchmarksQA, BenchmarksLongDoc
 from src.display.formatting import styled_message, styled_error
 from src.display.utils import COLS_QA, TYPES_QA, COLS_LONG_DOC, TYPES_LONG_DOC, COL_NAME_RANK, COL_NAME_AVG, \
-    COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, get_default_auto_eval_column_dict
+    COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, COL_NAME_IS_ANONYMOUS, get_default_auto_eval_column_dict
 from src.envs import API, SEARCH_RESULTS_REPO
 from src.read_evals import FullEvalResult, get_leaderboard_df

@@ -77,7 +77,7 @@ FIXED_COLS_TYPES = [c.type for _, _, c in fixed_cols]


 def select_columns(df: pd.DataFrame, domain_query: list, language_query: list, task: str = "qa") -> pd.DataFrame:
-    cols = get_default_cols(task=task, columns=df.columns, add_fix_cols=False)
+    cols, _ = get_default_cols(task=task, columns=df.columns, add_fix_cols=False)
     selected_cols = []
     for c in cols:
         if task == "qa":
@@ -91,7 +91,7 @@ def select_columns(df: pd.DataFrame, domain_query: list, language_query: list, t
             selected_cols.append(c)
     # We use COLS to maintain sorting
     filtered_df = df[FIXED_COLS + selected_cols]
-    filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].mean(axis=1).round(decimals=2)
+    filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].mean(axis=1, numeric_only=True).round(decimals=2)
     filtered_df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
     filtered_df.reset_index(inplace=True, drop=True)
     filtered_df[COL_NAME_RANK] = filtered_df[COL_NAME_AVG].rank(ascending=False, method="min")
@@ -105,8 +105,15 @@ def update_table(
         langs: list,
         reranking_query: list,
         query: str,
+        show_anonymous: bool
 ):
-    filtered_df = filter_models(hidden_df, reranking_query)
+    print(f"shown_anonymous: {show_anonymous}")
+    filtered_df = hidden_df
+    if not show_anonymous:
+        print(filtered_df[COL_NAME_IS_ANONYMOUS])
+        filtered_df = filtered_df[~filtered_df[COL_NAME_IS_ANONYMOUS]]
+    print(f"filtered_df: {len(filtered_df)}")
+    filtered_df = filter_models(filtered_df, reranking_query)
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, domains, langs)
     return df
@@ -118,10 +125,13 @@ def update_table_long_doc(
         langs: list,
         reranking_query: list,
         query: str,
+        # show_anonymous: bool
 ):
     filtered_df = filter_models(hidden_df, reranking_query)
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, domains, langs, task='long_doc')
+    # if not show_anonymous:
+    #     df = df[~df[COL_NAME_IS_ANONYMOUS]]
     return df


@@ -133,6 +143,7 @@ def update_metric(
         langs: list,
         reranking_model: list,
         query: str,
+        show_anonymous: bool
 ) -> pd.DataFrame:
     if task == 'qa':
         leaderboard_df = get_leaderboard_df(raw_data, task=task, metric=metric)
@@ -141,7 +152,8 @@
             domains,
             langs,
             reranking_model,
-            query
+            query,
+            show_anonymous
         )
     elif task == "long-doc":
         leaderboard_df = get_leaderboard_df(raw_data, task=task, metric=metric)
@@ -150,7 +162,8 @@
             domains,
             langs,
             reranking_model,
-            query
+            query,
+            # show_anonymous
         )

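
Finally, a small sketch of the filtering and averaging behaviour that update_table and select_columns implement after this change. Column names and benchmark columns are illustrative; numeric_only=True mirrors the commit's change and presumably guards the row mean now that a boolean anonymity column rides along in the hidden dataframe.

import pandas as pd

# Illustrative stand-ins for the display-column constants.
COL_NAME_IS_ANONYMOUS = "Anonymous Submission"
COL_NAME_AVG = "Average"
COL_NAME_RANK = "Rank"

def update_table(hidden_df: pd.DataFrame, benchmark_cols: list, show_anonymous: bool) -> pd.DataFrame:
    filtered_df = hidden_df
    if not show_anonymous:
        # Boolean-mask out anonymous rows, as the commit does with
        # filtered_df[~filtered_df[COL_NAME_IS_ANONYMOUS]].
        filtered_df = filtered_df[~filtered_df[COL_NAME_IS_ANONYMOUS]]
    filtered_df = filtered_df.copy()
    # Row-wise average over the benchmark columns; numeric_only=True keeps the
    # mean well-defined if a non-numeric column slips into the selection.
    filtered_df[COL_NAME_AVG] = filtered_df[benchmark_cols].mean(axis=1, numeric_only=True).round(2)
    filtered_df = filtered_df.sort_values(by=[COL_NAME_AVG], ascending=False).reset_index(drop=True)
    filtered_df[COL_NAME_RANK] = filtered_df[COL_NAME_AVG].rank(ascending=False, method="min")
    return filtered_df

if __name__ == "__main__":
    df = pd.DataFrame({
        "Retrieval Model": ["a", "b", "c"],
        "wiki_en": [55.0, 60.0, 40.0],
        "web_en": [50.0, 58.0, 42.0],
        COL_NAME_IS_ANONYMOUS: [False, True, False],
    })
    # With show_anonymous=False, row "b" is dropped before ranking.
    print(update_table(df, ["wiki_en", "web_en"], show_anonymous=False))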