Spaces: AIR-Bench

nan committed
Commit 649e0fb
1 Parent(s): 4eb64b4

refactor: refactor the benchmarks

app.py CHANGED
@@ -1,46 +1,28 @@
 import gradio as gr
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
 
 from src.about import (
     INTRODUCTION_TEXT,
-    BENCHMARKS_TEXT,
-    TITLE,
-    EVALUATION_QUEUE_TEXT
+    TITLE
 )
 from src.benchmarks import (
-    DOMAIN_COLS_QA,
-    LANG_COLS_QA,
-    DOMAIN_COLS_LONG_DOC,
-    LANG_COLS_LONG_DOC,
+    qa_benchmark_dict,
+    long_doc_benchmark_dict,
     METRIC_LIST,
     DEFAULT_METRIC_QA,
     DEFAULT_METRIC_LONG_DOC
 )
 from src.display.css_html_js import custom_css
-from src.display.column_names import COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_REVISION, \
-    COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
 from src.envs import (
     API,
     EVAL_RESULTS_PATH,
-    REPO_ID,
-    RESULTS_REPO,
-    TOKEN,
-    BM25_LINK,
-    BENCHMARK_VERSION_LIST,
-    LATEST_BENCHMARK_VERSION
+    REPO_ID
 )
-from src.read_evals import (
-    get_raw_eval_results,
-    get_leaderboard_df
+from src.loaders import (
+    load_eval_results
 )
 from src.utils import (
-    update_metric,
-    upload_file,
-    get_default_cols,
-    submit_results,
-    reset_rank,
-    remove_html
+    update_metric
 )
 from src.display.gradio_formatting import (
     get_version_dropdown,
@@ -51,8 +33,7 @@ from src.display.gradio_formatting import (
     get_language_dropdown,
     get_anonymous_checkbox,
     get_revision_and_ts_checkbox,
-    get_leaderboard_table,
-    get_noreranking_dropdown
+    get_leaderboard_table
 )
 from src.display.gradio_listener import set_listeners
 
@@ -69,65 +50,6 @@ def restart_space():
 # print(f'failed to download')
 # restart_space()
 
-from dataclasses import dataclass
-import pandas as pd
-from typing import Optional
-
-
-@dataclass
-class LeaderboardDataStore:
-    raw_data: Optional[list]
-    raw_df_qa: Optional[pd.DataFrame]
-    raw_df_long_doc: Optional[pd.DataFrame]
-    leaderboard_df_qa: Optional[pd.DataFrame]
-    leaderboard_df_long_doc: Optional[pd.DataFrame]
-    reranking_models: Optional[list]
-    types_qa: Optional[list]
-    types_long_doc: Optional[list]
-
-
-def load_leaderboard_data(file_path) -> LeaderboardDataStore:
-    lb_data_store = LeaderboardDataStore(None, None, None, None, None, None, None, None)
-    lb_data_store.raw_data = get_raw_eval_results(file_path)
-    print(f'raw data: {len(lb_data_store.raw_data)}')
-
-    lb_data_store.raw_df_qa = get_leaderboard_df(
-        lb_data_store.raw_data, task='qa', metric=DEFAULT_METRIC_QA)
-    lb_data_store.leaderboard_df_qa = lb_data_store.raw_df_qa.copy()
-    # leaderboard_df_qa = leaderboard_df_qa[has_no_nan_values(df, _benchmark_cols)]
-    print(f'QA data loaded: {lb_data_store.raw_df_qa.shape}')
-    shown_columns_qa, types_qa = get_default_cols(
-        'qa', lb_data_store.leaderboard_df_qa.columns, add_fix_cols=True)
-    lb_data_store.types_qa = types_qa
-    lb_data_store.leaderboard_df_qa = \
-        lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
-    lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
-
-    lb_data_store.raw_df_long_doc = get_leaderboard_df(
-        lb_data_store.raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
-    print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
-    lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
-    shown_columns_long_doc, types_long_doc = get_default_cols(
-        'long-doc', lb_data_store.leaderboard_df_long_doc.columns, add_fix_cols=True)
-    lb_data_store.types_long_doc = types_long_doc
-    lb_data_store.leaderboard_df_long_doc = \
-        lb_data_store.leaderboard_df_long_doc[~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][
-            shown_columns_long_doc]
-    lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
-
-    lb_data_store.reranking_models = sorted(
-        list(frozenset([eval_result.reranking_model for eval_result in lb_data_store.raw_data])))
-    return lb_data_store
-
-
-def load_eval_results(file_path: str):
-    output = {}
-    versions = ("AIR-Bench_24.04",)
-    for version in versions:
-        fn = f"{file_path}/{version}"
-        output[version] = load_leaderboard_data(fn)
-    return output
-
 
 data = load_eval_results(EVAL_RESULTS_PATH)
 
@@ -157,6 +79,12 @@ def update_metric_long_doc(
     return update_metric(data["AIR-Bench_24.04"].raw_data, "long-doc", metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
 
 
+DOMAIN_COLS_QA = list(frozenset([c.domain for c in qa_benchmark_dict.values()]))
+LANG_COLS_QA = list(frozenset([c.lang for c in qa_benchmark_dict.values()]))
+
+DOMAIN_COLS_LONG_DOC = list(frozenset([c.domain for c in long_doc_benchmark_dict.values()]))
+LANG_COLS_LONG_DOC = list(frozenset([c.lang for c in long_doc_benchmark_dict.values()]))
+
 demo = gr.Blocks(css=custom_css)
 
 with demo:
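
For orientation, a minimal sketch (not part of the commit) of how app.py consumes the new src.loaders module after this refactor; the version key and attribute names are taken from the diff above, and the "update_metric_*" callbacks are the ones whose hunk header appears there.

# Sketch only: how the refactored entry point uses load_eval_results.
from src.envs import EVAL_RESULTS_PATH
from src.loaders import load_eval_results

# load_eval_results returns {version: LeaderboardDataStore}; this commit loads
# only the "AIR-Bench_24.04" version.
data = load_eval_results(EVAL_RESULTS_PATH)
datastore = data["AIR-Bench_24.04"]

# The pre-filtered frames and column types feed the Gradio tables, while the
# update_metric_* callbacks re-filter datastore.raw_data on demand.
print(datastore.leaderboard_df_qa.shape)
print(datastore.types_qa)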
src/benchmarks.py CHANGED
@@ -1,16 +1,10 @@
 from dataclasses import dataclass
 from enum import Enum
-from air_benchmark.tasks.tasks import BenchmarkTable
-
 
-def get_safe_name(name: str):
-    """Get RFC 1123 compatible safe name"""
-    name = name.replace('-', '_')
-    return ''.join(
-        character.lower()
-        for character in name
-        if (character.isalnum() or character == '_'))
+from air_benchmark.tasks.tasks import BenchmarkTable
 
+DEFAULT_METRIC_QA = "ndcg_at_10"
+DEFAULT_METRIC_LONG_DOC = "recall_at_10"
 
 METRIC_LIST = [
     "ndcg_at_1",
@@ -46,6 +40,15 @@ METRIC_LIST = [
 ]
 
 
+def get_safe_name(name: str):
+    """Get RFC 1123 compatible safe name"""
+    name = name.replace('-', '_')
+    return ''.join(
+        character.lower()
+        for character in name
+        if (character.isalnum() or character == '_'))
+
+
 @dataclass
 class Benchmark:
     name: str  # [domain]_[language]_[metric], task_key in the json file,
@@ -78,15 +81,3 @@ for task, domain_dict in BenchmarkTable['AIR-Bench_24.04'].items():
 
 BenchmarksQA = Enum('BenchmarksQA', qa_benchmark_dict)
 BenchmarksLongDoc = Enum('BenchmarksLongDoc', long_doc_benchmark_dict)
-
-BENCHMARK_COLS_QA = [c.col_name for c in qa_benchmark_dict.values()]
-BENCHMARK_COLS_LONG_DOC = [c.col_name for c in long_doc_benchmark_dict.values()]
-
-DOMAIN_COLS_QA = list(frozenset([c.domain for c in qa_benchmark_dict.values()]))
-LANG_COLS_QA = list(frozenset([c.lang for c in qa_benchmark_dict.values()]))
-
-DOMAIN_COLS_LONG_DOC = list(frozenset([c.domain for c in long_doc_benchmark_dict.values()]))
-LANG_COLS_LONG_DOC = list(frozenset([c.lang for c in long_doc_benchmark_dict.values()]))
-
-DEFAULT_METRIC_QA = "ndcg_at_10"
-DEFAULT_METRIC_LONG_DOC = "recall_at_10"
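
get_safe_name itself is unchanged; it only moves below METRIC_LIST. As a quick reference, a hand-worked example of its behavior (inputs chosen for illustration, not taken from repository data):

from src.benchmarks import get_safe_name

# '-' becomes '_', every other non-alphanumeric character is dropped,
# and the result is lowercased.
assert get_safe_name("AIR-Bench_24.04") == "air_bench_2404"
assert get_safe_name("wiki_en") == "wiki_en"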
src/display/gradio_formatting.py CHANGED
@@ -64,7 +64,7 @@ def get_domain_dropdown(domain_list, default_domains):
 def get_language_dropdown(language_list, default_languages):
     return gr.Dropdown(
         choices=language_list,
-        value=language_list,
+        value=default_languages,
         label="Select the languages",
         multiselect=True,
         interactive=True
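
The one-line fix above changes which options start selected: value=language_list pre-selected every choice, whereas value=default_languages pre-selects only the defaults while keeping all languages available. A minimal sketch with made-up language codes:

import gradio as gr

language_list = ["en", "zh", "es", "fr"]   # illustrative values only
default_languages = ["en", "zh"]

dropdown = gr.Dropdown(
    choices=language_list,        # everything remains selectable
    value=default_languages,      # only the defaults start selected
    label="Select the languages",
    multiselect=True,
    interactive=True,
)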
src/display/utils.py CHANGED
@@ -57,7 +57,7 @@ def get_default_auto_eval_column_dict():
 
 def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
     auto_eval_column_dict = get_default_auto_eval_column_dict()
-    ## Leaderboard columns
+    # Leaderboard columns
     for benchmark in benchmarks:
         auto_eval_column_dict.append(
             [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
src/{read_evals.py → loaders.py} RENAMED
@@ -3,23 +3,18 @@ from typing import List
 
 import pandas as pd
 
-from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
-from src.display.utils import COLS_QA, COLS_LONG_DOC
-from src.display.column_names import COL_NAME_AVG, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_IS_ANONYMOUS
+from src.benchmarks import DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC
+from src.display.column_names import COL_NAME_REVISION, COL_NAME_IS_ANONYMOUS, \
+    COL_NAME_TIMESTAMP
 
-from src.models import FullEvalResult
+from src.models import FullEvalResult, LeaderboardDataStore
+from src.utils import get_default_cols, get_leaderboard_df
 
 pd.options.mode.copy_on_write = True
 
 
-def calculate_mean(row):
-    if pd.isna(row).any():
-        return -1
-    else:
-        return row.mean()
 
-
-def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
+def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
     """
     Load the evaluation results from a json file
     """
@@ -58,41 +53,44 @@ def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
     return results
 
 
-def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -> pd.DataFrame:
-    """
-    Creates a dataframe from all the individual experiment results
-    """
-    cols = [COL_NAME_IS_ANONYMOUS, ]
-    if task == "qa":
-        cols += COLS_QA
-        benchmark_cols = [t.value.col_name for t in BenchmarksQA]
-    elif task == "long-doc":
-        cols += COLS_LONG_DOC
-        benchmark_cols = [t.value.col_name for t in BenchmarksLongDoc]
-    else:
-        raise NotImplemented
-    all_data_json = []
-    for v in raw_data:
-        all_data_json += v.to_dict(task=task, metric=metric)
-    df = pd.DataFrame.from_records(all_data_json)
-    # print(f'dataframe created: {df.shape}')
-
-    _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
-
-    # calculate the average score for selected benchmarks
-    df[COL_NAME_AVG] = df[list(_benchmark_cols)].apply(calculate_mean, axis=1).round(decimals=2)
-    df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
-    df.reset_index(inplace=True, drop=True)
-
-    _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
-    df = df[_cols].round(decimals=2)
-
-    # filter out if any of the benchmarks have not been produced
-    df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
-
-    # shorten the revision
-    df[COL_NAME_REVISION] = df[COL_NAME_REVISION].str[:6]
-
-    # # replace "0" with "-" for average score
-    # df[COL_NAME_AVG] = df[COL_NAME_AVG].replace(0, "-")
-    return df
+def load_leaderboard_datastore(file_path) -> LeaderboardDataStore:
+    lb_data_store = LeaderboardDataStore(None, None, None, None, None, None, None, None)
+    lb_data_store.raw_data = load_raw_eval_results(file_path)
+    print(f'raw data: {len(lb_data_store.raw_data)}')
+
+    lb_data_store.raw_df_qa = get_leaderboard_df(
+        lb_data_store.raw_data, task='qa', metric=DEFAULT_METRIC_QA)
+    lb_data_store.leaderboard_df_qa = lb_data_store.raw_df_qa.copy()
+    # leaderboard_df_qa = leaderboard_df_qa[has_no_nan_values(df, _benchmark_cols)]
+    print(f'QA data loaded: {lb_data_store.raw_df_qa.shape}')
+    shown_columns_qa, types_qa = get_default_cols(
+        'qa', lb_data_store.leaderboard_df_qa.columns, add_fix_cols=True)
+    lb_data_store.types_qa = types_qa
+    lb_data_store.leaderboard_df_qa = \
+        lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
+    lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+
+    lb_data_store.raw_df_long_doc = get_leaderboard_df(
+        lb_data_store.raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
+    print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
+    lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
+    shown_columns_long_doc, types_long_doc = get_default_cols(
+        'long-doc', lb_data_store.leaderboard_df_long_doc.columns, add_fix_cols=True)
+    lb_data_store.types_long_doc = types_long_doc
+    lb_data_store.leaderboard_df_long_doc = \
+        lb_data_store.leaderboard_df_long_doc[~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][
+            shown_columns_long_doc]
+    lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+
+    lb_data_store.reranking_models = sorted(
+        list(frozenset([eval_result.reranking_model for eval_result in lb_data_store.raw_data])))
+    return lb_data_store
+
+
+def load_eval_results(file_path: str):
+    output = {}
+    versions = ("AIR-Bench_24.04",)
+    for version in versions:
+        fn = f"{file_path}/{version}"
+        output[version] = load_leaderboard_datastore(fn)
+    return output
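
load_leaderboard_datastore now imports LeaderboardDataStore from src.models instead of the app defining it inline. src/models.py is not part of this diff, but the dataclass deleted from app.py above implies a definition along these lines (a sketch, not the committed source):

from dataclasses import dataclass
from typing import Optional

import pandas as pd


@dataclass
class LeaderboardDataStore:
    raw_data: Optional[list]
    raw_df_qa: Optional[pd.DataFrame]
    raw_df_long_doc: Optional[pd.DataFrame]
    leaderboard_df_qa: Optional[pd.DataFrame]
    leaderboard_df_long_doc: Optional[pd.DataFrame]
    reranking_models: Optional[list]
    types_qa: Optional[list]
    types_long_doc: Optional[list]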
src/utils.py CHANGED
@@ -6,18 +6,23 @@ from typing import List
 
 import pandas as pd
 
-from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, BenchmarksQA, BenchmarksLongDoc
+from src.benchmarks import qa_benchmark_dict, long_doc_benchmark_dict, BenchmarksQA, BenchmarksLongDoc
 from src.display.formatting import styled_message, styled_error
 from src.display.utils import COLS_QA, TYPES_QA, COLS_LONG_DOC, TYPES_LONG_DOC, get_default_auto_eval_column_dict
 from src.display.column_names import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, \
     COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
 from src.envs import API, SEARCH_RESULTS_REPO, LATEST_BENCHMARK_VERSION
-from src.read_evals import get_leaderboard_df, calculate_mean
 from src.models import FullEvalResult
 
 import re
 
 
+def calculate_mean(row):
+    if pd.isna(row).any():
+        return -1
+    else:
+        return row.mean()
+
 def remove_html(input_str):
     # Regular expression for finding HTML tags
     clean = re.sub(r'<.*?>', '', input_str)
@@ -63,11 +68,11 @@ def get_default_cols(task: str, columns: list=[], add_fix_cols: bool=True) -> li
     if task == "qa":
         cols_list = COLS_QA
         types_list = TYPES_QA
-        benchmark_list = BENCHMARK_COLS_QA
+        benchmark_list = [c.col_name for c in qa_benchmark_dict.values()]
     elif task == "long-doc":
         cols_list = COLS_LONG_DOC
         types_list = TYPES_LONG_DOC
-        benchmark_list = BENCHMARK_COLS_LONG_DOC
+        benchmark_list = [c.col_name for c in long_doc_benchmark_dict.values()]
     else:
         raise NotImplemented
     for col_name, col_type in zip(cols_list, types_list):
@@ -318,3 +323,43 @@ def submit_results(
 def reset_rank(df):
     df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
     return df
+
+
+def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -> pd.DataFrame:
+    """
+    Creates a dataframe from all the individual experiment results
+    """
+    cols = [COL_NAME_IS_ANONYMOUS, ]
+    if task == "qa":
+        cols += COLS_QA
+        benchmark_cols = [t.value.col_name for t in BenchmarksQA]
+    elif task == "long-doc":
+        cols += COLS_LONG_DOC
+        benchmark_cols = [t.value.col_name for t in BenchmarksLongDoc]
+    else:
+        raise NotImplemented
+    all_data_json = []
+    for v in raw_data:
+        all_data_json += v.to_dict(task=task, metric=metric)
+    df = pd.DataFrame.from_records(all_data_json)
+    # print(f'dataframe created: {df.shape}')
+
+    _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
+
+    # calculate the average score for selected benchmarks
+    df[COL_NAME_AVG] = df[list(_benchmark_cols)].apply(calculate_mean, axis=1).round(decimals=2)
+    df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
+    df.reset_index(inplace=True, drop=True)
+
+    _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
+    df = df[_cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
+
+    # shorten the revision
+    df[COL_NAME_REVISION] = df[COL_NAME_REVISION].str[:6]
+
+    # # replace "0" with "-" for average score
+    # df[COL_NAME_AVG] = df[COL_NAME_AVG].replace(0, "-")
+    return df
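
calculate_mean, relocated here from read_evals.py, still maps any row with a missing benchmark score to -1 so that incomplete submissions sink below fully evaluated ones once the average column is sorted. A toy illustration (hand-worked values, not repository data):

import pandas as pd

from src.utils import calculate_mean

complete = pd.Series([0.50, 0.70, 0.60])
partial = pd.Series([0.50, None, 0.60])

print(calculate_mean(complete))   # ~0.6 (the plain mean)
print(calculate_mean(partial))    # -1   (any NaN pushes the row to the bottom)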
tests/src/test_read_evals.py CHANGED
@@ -1,6 +1,7 @@
 from pathlib import Path
 
-from src.read_evals import get_raw_eval_results, get_leaderboard_df
+from src.read_evals import load_raw_eval_results
+from src.utils import get_leaderboard_df
 from src.models import FullEvalResult
 
 cur_fp = Path(__file__)
@@ -30,7 +31,7 @@ def test_to_dict():
 
 def test_get_raw_eval_results():
     results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
-    results = get_raw_eval_results(results_path)
+    results = load_raw_eval_results(results_path)
     # only load the latest results
     assert len(results) == 4
     assert results[0].eval_name == "bge-base-en-v1.5_NoReranker"
@@ -41,7 +42,7 @@
 
 def test_get_leaderboard_df():
     results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
-    raw_data = get_raw_eval_results(results_path)
+    raw_data = load_raw_eval_results(results_path)
     df = get_leaderboard_df(raw_data, 'qa', 'ndcg_at_10')
     assert df.shape[0] == 4
     # the results contain only one embedding model
@@ -56,7 +57,7 @@ def test_get_leaderboard_df():
 
 def test_get_leaderboard_df_long_doc():
     results_path = cur_fp.parents[2] / "toydata" / "test_results"
-    raw_data = get_raw_eval_results(results_path)
+    raw_data = load_raw_eval_results(results_path)
     df = get_leaderboard_df(raw_data, 'long-doc', 'ndcg_at_1')
     assert df.shape[0] == 2
     # the results contain only one embedding model