hi-melnikov committed on
Commit
d0e8be9
·
1 Parent(s): a70555b

ruff format everything

Browse files
app.py CHANGED
@@ -24,39 +24,33 @@ from src.envs import (
24
  )
25
  from src.leaderboard.build_leaderboard import build_leadearboard_df
26
 
27
- os.environ['GRADIO_ANALYTICS_ENABLED']='false'
28
 
29
  # Configure logging
30
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
31
 
32
  # Start ephemeral Spaces on PRs (see config in README.md)
33
  enable_space_ci()
34
 
 
35
  def restart_space():
36
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
37
 
38
 
39
  def build_demo():
40
- demo = gr.Blocks(
41
- title = "Chatbot Arena Leaderboard",
42
- css=custom_css
43
- )
44
  leaderboard_df = build_leadearboard_df()
45
  with demo:
46
  gr.HTML(TITLE)
47
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
48
 
49
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
50
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
51
- leaderboard = Leaderboard(
52
  value=leaderboard_df,
53
  datatype=[c.type for c in fields(AutoEvalColumn)],
54
  select_columns=SelectColumns(
55
- default_selection=[
56
- c.name
57
- for c in fields(AutoEvalColumn)
58
- if c.displayed_by_default
59
- ],
60
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
61
  label="Select Columns to Display:",
62
  ),
@@ -67,50 +61,59 @@ def build_demo():
67
  ],
68
  )
69
 
70
- #with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
71
  # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
72
- #with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=2):
73
  # gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
74
 
75
  with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=3):
76
-
77
  with gr.Row():
78
  gr.Markdown("# ✨ Submit your model here!", elem_classes="markdown-text")
79
 
80
  with gr.Column():
81
- model_name_textbox = gr.Textbox(label="Model name")
82
- def upload_file(file):
83
- file_path = file.name.split('/')[-1] if '/' in file.name else file.name
84
- logging.info("New submition: file saved to %s", file_path)
85
- API.upload_file(path_or_fileobj=file.name,path_in_repo='./external/'+file_path,repo_id='Vikhrmodels/openbench-eval',repo_type='dataset')
86
- os.environ[RESET_JUDGEMENT_ENV] = '1'
87
- return file.name
88
-
89
- if model_name_textbox:
90
- file_output = gr.File()
91
- upload_button = gr.UploadButton("Click to Upload & Submit Answers", file_types=['*'], file_count="single")
92
- upload_button.upload(upload_file, upload_button, file_output)
93
-
 
 
 
 
 
 
 
 
94
  return demo
95
-
 
96
  # print(os.system('cd src/gen && ../../.venv/bin/python gen_judgment.py'))
97
  # print(os.system('cd src/gen/ && python show_result.py --output'))
98
-
 
99
  def update_board():
100
  need_reset = os.environ.get(RESET_JUDGEMENT_ENV)
101
- if need_reset != '1':
102
  return
103
- os.environ[RESET_JUDGEMENT_ENV] = '0'
104
- subprocess.run(['python', 'src/gen/gen_judgement.py'], check = False)
105
- subprocess.Popen('python3.src/gen/show_result.py --output')
106
 
107
 
108
  if __name__ == "__main__":
109
- os.environ[RESET_JUDGEMENT_ENV] = '1'
110
-
111
  scheduler = BackgroundScheduler()
112
- scheduler.add_job(update_board, "interval", minutes=10)
113
  scheduler.start()
114
-
115
  demo_app = build_demo()
116
  demo_app.launch(debug=True)
 
24
  )
25
  from src.leaderboard.build_leaderboard import build_leadearboard_df
26
 
27
+ os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"
28
 
29
  # Configure logging
30
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
31
 
32
  # Start ephemeral Spaces on PRs (see config in README.md)
33
  enable_space_ci()
34
 
35
+
36
  def restart_space():
37
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
38
 
39
 
40
  def build_demo():
41
+ demo = gr.Blocks(title="Chatbot Arena Leaderboard", css=custom_css)
 
 
 
42
  leaderboard_df = build_leadearboard_df()
43
  with demo:
44
  gr.HTML(TITLE)
45
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
46
 
47
+ with gr.Tabs(elem_classes="tab-buttons"):
48
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
49
+ Leaderboard(
50
  value=leaderboard_df,
51
  datatype=[c.type for c in fields(AutoEvalColumn)],
52
  select_columns=SelectColumns(
53
+ default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
 
 
 
 
54
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
55
  label="Select Columns to Display:",
56
  ),
 
61
  ],
62
  )
63
 
64
+ # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
65
  # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
66
+ # with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=2):
67
  # gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
68
 
69
  with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=3):
 
70
  with gr.Row():
71
  gr.Markdown("# ✨ Submit your model here!", elem_classes="markdown-text")
72
 
73
  with gr.Column():
74
+ model_name_textbox = gr.Textbox(label="Model name")
75
+
76
+ def upload_file(file):
77
+ file_path = file.name.split("/")[-1] if "/" in file.name else file.name
78
+ logging.info("New submition: file saved to %s", file_path)
79
+ API.upload_file(
80
+ path_or_fileobj=file.name,
81
+ path_in_repo="./external/" + file_path,
82
+ repo_id="Vikhrmodels/openbench-eval",
83
+ repo_type="dataset",
84
+ )
85
+ os.environ[RESET_JUDGEMENT_ENV] = "1"
86
+ return file.name
87
+
88
+ if model_name_textbox:
89
+ file_output = gr.File()
90
+ upload_button = gr.UploadButton(
91
+ "Click to Upload & Submit Answers", file_types=["*"], file_count="single"
92
+ )
93
+ upload_button.upload(upload_file, upload_button, file_output)
94
+
95
  return demo
96
+
97
+
98
  # print(os.system('cd src/gen && ../../.venv/bin/python gen_judgment.py'))
99
  # print(os.system('cd src/gen/ && python show_result.py --output'))
100
+
101
+
102
  def update_board():
103
  need_reset = os.environ.get(RESET_JUDGEMENT_ENV)
104
+ if need_reset != "1":
105
  return
106
+ os.environ[RESET_JUDGEMENT_ENV] = "0"
107
+ subprocess.run(["python", "src/gen/gen_judgement.py"], check=False)
108
+ subprocess.Popen("python3.src/gen/show_result.py --output")
109
 
110
 
111
  if __name__ == "__main__":
112
+ os.environ[RESET_JUDGEMENT_ENV] = "1"
113
+
114
  scheduler = BackgroundScheduler()
115
+ scheduler.add_job(update_board, "interval", minutes=10)
116
  scheduler.start()
117
+
118
  demo_app = build_demo()
119
  demo_app.launch(debug=True)
src/display/css_html_js.py CHANGED
@@ -88,4 +88,4 @@ get_window_url_params = """
88
  url_params = Object.fromEntries(params);
89
  return url_params;
90
  }
91
- """
 
88
  url_params = Object.fromEntries(params);
89
  return url_params;
90
  }
91
+ """
src/display/utils.py CHANGED
@@ -7,7 +7,8 @@ import pandas as pd
7
 
8
 
9
  # Configure logging
10
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
11
 
12
  def parse_datetime(datetime_str):
13
  formats = [
@@ -25,6 +26,7 @@ def parse_datetime(datetime_str):
25
  logging.error(f"No valid date format found for: {datetime_str}")
26
  return datetime(1970, 1, 1)
27
 
 
28
  def load_json_data(file_path):
29
  """Safely load JSON data from a file."""
30
  try:
@@ -98,7 +100,6 @@ auto_eval_column_dict.append(["score", ColumnContent, ColumnContent("score", "nu
98
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
99
 
100
 
101
-
102
  @dataclass(frozen=True)
103
  class EvalQueueColumn: # Queue column
104
  model = ColumnContent("model", "markdown", True)
 
7
 
8
 
9
  # Configure logging
10
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
11
+
12
 
13
  def parse_datetime(datetime_str):
14
  formats = [
 
26
  logging.error(f"No valid date format found for: {datetime_str}")
27
  return datetime(1970, 1, 1)
28
 
29
+
30
  def load_json_data(file_path):
31
  """Safely load JSON data from a file."""
32
  try:
 
100
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
101
 
102
 
 
103
  @dataclass(frozen=True)
104
  class EvalQueueColumn: # Queue column
105
  model = ColumnContent("model", "markdown", True)
src/gen/gen_answer.py CHANGED
@@ -33,7 +33,14 @@ from utils import (
33
 
34
 
35
  def get_answer(
36
- question: dict, model: str, endpoint_info: dict, num_choices: int, max_tokens: int, temperature: float, answer_file: str, api_dict: dict
 
 
 
 
 
 
 
37
  ):
38
  if question["category"] in temperature_config:
39
  temperature = temperature_config[question["category"]]
@@ -54,49 +61,56 @@ def get_answer(
54
  for j in range(len(question["turns"])):
55
  conv.append({"role": "user", "content": question["turns"][j]["content"]})
56
  if api_type == "anthropic":
57
- output = chat_completion_anthropic(model=endpoint_info["model_name"],
58
- messages=conv,
59
- temperature=temperature,
60
- max_tokens=max_tokens)
61
  elif api_type == "mistral":
62
- output = chat_completion_mistral(model=endpoint_info["model_name"],
63
- messages=conv,
64
- temperature=temperature,
65
- max_tokens=max_tokens)
66
  elif api_type == "yandex":
67
- output = chat_completion_yandex(model=endpoint_info["model_name"],
68
- messages=conv,
69
- temperature=temperature,
70
- max_tokens=max_tokens,
71
- api_dict=api_dict)
 
 
72
  elif api_type == "gigachat":
73
- output = chat_completion_gigachat(model=endpoint_info["model_name"],
74
- messages=conv,
75
- temperature=temperature,
76
- max_tokens=max_tokens,
77
- api_dict=api_dict)
 
 
78
  elif api_type == "gemini":
79
- output = chat_completion_gemini(model=endpoint_info["model_name"],
80
- messages=question["turns"][j]["content"],
81
- temperature=temperature,
82
- max_tokens=max_tokens)
 
 
83
  elif api_type == "azure":
84
- output = chat_completion_openai_azure(model=endpoint_info["model_name"],
85
- messages=conv,
86
- temperature=temperature,
87
- max_tokens=max_tokens,
88
- api_dict=api_dict)
 
 
89
  elif api_type == "cohere":
90
- output = chat_completion_cohere(model=endpoint_info["model_name"],
91
- messages=conv,
92
- temperature=temperature,
93
- max_tokens=max_tokens)
94
  else:
95
- output = chat_completion_openai(model=endpoint_info["model_name"],
96
- messages=conv,
97
- temperature=temperature,
98
- max_tokens=max_tokens,
99
- api_dict=api_dict)
 
 
100
  conv.append({"role": "assistant", "content": output})
101
 
102
  turns.append({"content": output, "token_len": len(encoding.encode(output))})
@@ -118,12 +132,8 @@ def get_answer(
118
 
119
  if __name__ == "__main__":
120
  parser = argparse.ArgumentParser()
121
- parser.add_argument(
122
- "--setting-file", type=str, default="config/gen_answer_config.yaml"
123
- )
124
- parser.add_argument(
125
- "--endpoint-file", type=str, default="config/api_config.yaml"
126
- )
127
  args = parser.parse_args()
128
 
129
  settings = make_config(args.setting_file)
@@ -187,9 +197,7 @@ if __name__ == "__main__":
187
  futures.append(future)
188
  if count > 0:
189
  print(f"{count} number of existing answers")
190
- for future in tqdm.tqdm(
191
- concurrent.futures.as_completed(futures), total=len(futures)
192
- ):
193
  future.result()
194
 
195
  reorg_answer_file(answer_file)
 
33
 
34
 
35
  def get_answer(
36
+ question: dict,
37
+ model: str,
38
+ endpoint_info: dict,
39
+ num_choices: int,
40
+ max_tokens: int,
41
+ temperature: float,
42
+ answer_file: str,
43
+ api_dict: dict,
44
  ):
45
  if question["category"] in temperature_config:
46
  temperature = temperature_config[question["category"]]
 
61
  for j in range(len(question["turns"])):
62
  conv.append({"role": "user", "content": question["turns"][j]["content"]})
63
  if api_type == "anthropic":
64
+ output = chat_completion_anthropic(
65
+ model=endpoint_info["model_name"], messages=conv, temperature=temperature, max_tokens=max_tokens
66
+ )
 
67
  elif api_type == "mistral":
68
+ output = chat_completion_mistral(
69
+ model=endpoint_info["model_name"], messages=conv, temperature=temperature, max_tokens=max_tokens
70
+ )
 
71
  elif api_type == "yandex":
72
+ output = chat_completion_yandex(
73
+ model=endpoint_info["model_name"],
74
+ messages=conv,
75
+ temperature=temperature,
76
+ max_tokens=max_tokens,
77
+ api_dict=api_dict,
78
+ )
79
  elif api_type == "gigachat":
80
+ output = chat_completion_gigachat(
81
+ model=endpoint_info["model_name"],
82
+ messages=conv,
83
+ temperature=temperature,
84
+ max_tokens=max_tokens,
85
+ api_dict=api_dict,
86
+ )
87
  elif api_type == "gemini":
88
+ output = chat_completion_gemini(
89
+ model=endpoint_info["model_name"],
90
+ messages=question["turns"][j]["content"],
91
+ temperature=temperature,
92
+ max_tokens=max_tokens,
93
+ )
94
  elif api_type == "azure":
95
+ output = chat_completion_openai_azure(
96
+ model=endpoint_info["model_name"],
97
+ messages=conv,
98
+ temperature=temperature,
99
+ max_tokens=max_tokens,
100
+ api_dict=api_dict,
101
+ )
102
  elif api_type == "cohere":
103
+ output = chat_completion_cohere(
104
+ model=endpoint_info["model_name"], messages=conv, temperature=temperature, max_tokens=max_tokens
105
+ )
 
106
  else:
107
+ output = chat_completion_openai(
108
+ model=endpoint_info["model_name"],
109
+ messages=conv,
110
+ temperature=temperature,
111
+ max_tokens=max_tokens,
112
+ api_dict=api_dict,
113
+ )
114
  conv.append({"role": "assistant", "content": output})
115
 
116
  turns.append({"content": output, "token_len": len(encoding.encode(output))})
 
132
 
133
  if __name__ == "__main__":
134
  parser = argparse.ArgumentParser()
135
+ parser.add_argument("--setting-file", type=str, default="config/gen_answer_config.yaml")
136
+ parser.add_argument("--endpoint-file", type=str, default="config/api_config.yaml")
 
 
 
 
137
  args = parser.parse_args()
138
 
139
  settings = make_config(args.setting_file)
 
197
  futures.append(future)
198
  if count > 0:
199
  print(f"{count} number of existing answers")
200
+ for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
 
 
201
  future.result()
202
 
203
  reorg_answer_file(answer_file)
src/gen/gen_judgment.py CHANGED
@@ -55,12 +55,7 @@ def judgment(**args):
55
 
56
  num_games = 2 if configs["pairwise"] else 1
57
 
58
- output = {
59
- "question_id":question["question_id"],
60
- "model":answer["model_id"],
61
- "judge": model,
62
- "games":[]
63
- }
64
 
65
  for game in range(num_games):
66
  conv = [{"role": "system", "content": configs["system_prompt"]}]
@@ -73,7 +68,7 @@ def judgment(**args):
73
  base = 1
74
 
75
  if baseline:
76
- if game % 2 == 1: # swap position
77
  temp = baseline
78
  baseline = answer
79
  answer = temp
@@ -103,7 +98,7 @@ def judgment(**args):
103
  args["endpoint_dict"],
104
  )
105
 
106
- judgment += ("\n" + new_judgment)
107
 
108
  score, try_again = get_score(judgment, args["regex_pattern"])
109
 
@@ -112,18 +107,21 @@ def judgment(**args):
112
  if not try_again:
113
  break
114
 
115
- conv.append({"role": "user", "content": "continue your judgment and finish by outputting a final verdict label"})
 
 
116
 
117
- result = {
118
- "user_prompt": conv[1]["content"],
119
- "judgment": judgment,
120
- "score":score
121
- }
122
  output["games"].append(result)
123
 
124
  with open(output_file, "a") as f:
125
  f.write(json.dumps(output, ensure_ascii=False) + "\n")
126
- huggingface_hub.HfApi().upload_file(output_file, path_in_repo=f'model_judgment/{configs['judge_model']}/{output_file.split('/')[-1]}', repo_id='Vikhrmodels/openbench-eval', repo_type='dataset')
 
 
 
 
 
127
 
128
 
129
  if __name__ == "__main__":
@@ -136,8 +134,10 @@ if __name__ == "__main__":
136
  configs = make_config(args.setting_file)
137
  endpoint_list = make_config(args.endpoint_file)
138
 
139
- print(f'judge model: {configs["judge_model"]}, baseline: {configs["baseline"]}, baseline model: {configs["baseline_model"]}, reference: {configs["reference"]}, '
140
- + f'reference models: {configs["ref_model"]}, temperature: {configs["temperature"]}, max tokens: {configs["max_tokens"]}, pairwise: {configs["pairwise"]}')
 
 
141
 
142
  if configs["regex_pattern"]:
143
  pattern = re.compile(configs["regex_pattern"])
@@ -150,12 +150,15 @@ if __name__ == "__main__":
150
  questions = load_questions(question_file)
151
  model_answers_external = load_model_answers(external_dir)
152
  model_answers_internal = load_model_answers(internal_dir)
153
-
154
  # internal has priority
155
  model_answers = {**model_answers_external, **model_answers_internal}
156
 
157
  # if user choose a set of models, only judge those models
158
- models = [model.split('/')[-1].split('.')[0] for model in glob.glob('./data/arena-hard-v0.1/model_answer/external/*.jsonl')]
 
 
 
159
 
160
  ref_answers = None
161
  if configs["reference"]:
@@ -214,7 +217,5 @@ if __name__ == "__main__":
214
  if count > 0:
215
  print(f"{count} number of existing judgments")
216
 
217
- for future in tqdm(
218
- concurrent.futures.as_completed(futures), total=len(futures)
219
- ):
220
  future.result()
 
55
 
56
  num_games = 2 if configs["pairwise"] else 1
57
 
58
+ output = {"question_id": question["question_id"], "model": answer["model_id"], "judge": model, "games": []}
 
 
 
 
 
59
 
60
  for game in range(num_games):
61
  conv = [{"role": "system", "content": configs["system_prompt"]}]
 
68
  base = 1
69
 
70
  if baseline:
71
+ if game % 2 == 1: # swap position
72
  temp = baseline
73
  baseline = answer
74
  answer = temp
 
98
  args["endpoint_dict"],
99
  )
100
 
101
+ judgment += "\n" + new_judgment
102
 
103
  score, try_again = get_score(judgment, args["regex_pattern"])
104
 
 
107
  if not try_again:
108
  break
109
 
110
+ conv.append(
111
+ {"role": "user", "content": "continue your judgment and finish by outputting a final verdict label"}
112
+ )
113
 
114
+ result = {"user_prompt": conv[1]["content"], "judgment": judgment, "score": score}
 
 
 
 
115
  output["games"].append(result)
116
 
117
  with open(output_file, "a") as f:
118
  f.write(json.dumps(output, ensure_ascii=False) + "\n")
119
+ huggingface_hub.HfApi().upload_file(
120
+ output_file,
121
+ path_in_repo=f'model_judgment/{configs['judge_model']}/{output_file.split('/')[-1]}',
122
+ repo_id="Vikhrmodels/openbench-eval",
123
+ repo_type="dataset",
124
+ )
125
 
126
 
127
  if __name__ == "__main__":
 
134
  configs = make_config(args.setting_file)
135
  endpoint_list = make_config(args.endpoint_file)
136
 
137
+ print(
138
+ f'judge model: {configs["judge_model"]}, baseline: {configs["baseline"]}, baseline model: {configs["baseline_model"]}, reference: {configs["reference"]}, '
139
+ + f'reference models: {configs["ref_model"]}, temperature: {configs["temperature"]}, max tokens: {configs["max_tokens"]}, pairwise: {configs["pairwise"]}'
140
+ )
141
 
142
  if configs["regex_pattern"]:
143
  pattern = re.compile(configs["regex_pattern"])
 
150
  questions = load_questions(question_file)
151
  model_answers_external = load_model_answers(external_dir)
152
  model_answers_internal = load_model_answers(internal_dir)
153
+
154
  # internal has priority
155
  model_answers = {**model_answers_external, **model_answers_internal}
156
 
157
  # if user choose a set of models, only judge those models
158
+ models = [
159
+ model.split("/")[-1].split(".")[0]
160
+ for model in glob.glob("./data/arena-hard-v0.1/model_answer/external/*.jsonl")
161
+ ]
162
 
163
  ref_answers = None
164
  if configs["reference"]:
 
217
  if count > 0:
218
  print(f"{count} number of existing judgments")
219
 
220
+ for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
 
 
221
  future.result()
src/gen/show_result.py CHANGED
@@ -2,7 +2,6 @@ import pandas as pd
2
  import numpy as np
3
  import plotly.express as px
4
 
5
- import tiktoken
6
  import datetime
7
  import argparse
8
  import os
@@ -15,6 +14,7 @@ from sklearn.linear_model import LogisticRegression
15
  from collections import defaultdict
16
  from utils import load_model_answers
17
 
 
18
  def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
19
  models = pd.concat([df["model_a"], df["model_b"]]).unique()
20
  models = pd.Series(np.arange(len(models)), index=models)
@@ -35,18 +35,18 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
35
  # one tie => one A win + one B win
36
  # find tie + tie (both bad) index
37
  tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
38
- tie_idx[len(tie_idx)//2:] = False
39
  Y[tie_idx] = 1.0
40
 
41
  lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
42
- lr.fit(X,Y)
43
 
44
  elo_scores = SCALE * lr.coef_[0] + INIT_RATING
45
 
46
  # set anchor as gpt-3.5-turbo-0125 = 1000
47
  if "gpt-3.5-turbo-0125" in models.index:
48
  elo_scores += 1000 - elo_scores[models["gpt-3.5-turbo-0125"]]
49
- return pd.Series(elo_scores, index = models.index).sort_values(ascending=False)
50
 
51
 
52
  def get_bootstrap_result(battles, func_compute_elo, num_round):
@@ -58,9 +58,14 @@ def get_bootstrap_result(battles, func_compute_elo, num_round):
58
 
59
 
60
  def preety_print_two_ratings(ratings_1, ratings_2, column_names):
61
- df = pd.DataFrame([
62
- [n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()
63
- ], columns=["Model", column_names[0], column_names[1]]).sort_values(column_names[0], ascending=False).reset_index(drop=True)
 
 
 
 
 
64
  df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
65
  df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
66
  df.index = df.index + 1
@@ -68,18 +73,24 @@ def preety_print_two_ratings(ratings_1, ratings_2, column_names):
68
 
69
 
70
  def visualize_bootstrap_scores(df, title):
71
- bars = pd.DataFrame(dict(
72
- lower = df.quantile(.025),
73
- rating = df.quantile(.5),
74
- upper = df.quantile(.975))).reset_index(names="model").sort_values("rating", ascending=False)
75
- bars['error_y'] = bars['upper'] - bars["rating"]
76
- bars['error_y_minus'] = bars['rating'] - bars["lower"]
77
- bars['rating_rounded'] = np.round(bars['rating'], 2)
78
- fig = px.scatter(bars, x="model", y="rating", error_y="error_y",
79
- error_y_minus="error_y_minus", text="rating_rounded",
80
- title=title)
81
- fig.update_layout(xaxis_title="Model", yaxis_title="Rating",
82
- height=600)
 
 
 
 
 
 
83
  return fig
84
 
85
 
@@ -92,10 +103,7 @@ def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
92
  wins[a][b] = ea
93
  wins[b][a] = 1 - ea
94
 
95
- data = {
96
- a: [wins[a][b] if a != b else np.NAN for b in names]
97
- for a in names
98
- }
99
 
100
  df = pd.DataFrame(data, index=names)
101
  df.index.name = "model_a"
@@ -121,9 +129,7 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3):
121
 
122
  for _, row in df.iterrows():
123
  # game 1
124
- output = {"question_id": row["question_id"],
125
- "model_a": "gpt-3.5-turbo-0125",
126
- "model_b": row["model"]}
127
 
128
  game = row["games"][0]
129
 
@@ -148,9 +154,7 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3):
148
 
149
  if not first_game_only:
150
  # game 2
151
- output = {"question_id": row["question_id"],
152
- "model_a": "gpt-3.5-turbo-0125",
153
- "model_b": row["model"]}
154
 
155
  game = row["games"][1]
156
 
@@ -190,7 +194,9 @@ if __name__ == "__main__":
190
  parser.add_argument("--first-game-only", action="store_true")
191
  args = parser.parse_args()
192
  print(args)
193
- assert not args.load_bootstrap or (args.load_battles and args.load_bootstrap), "If loading prexisting bootstrapping data, you must also load preexisting battles."
 
 
194
 
195
  answer_dir = os.path.join("data", args.bench_name, "model_answer/external")
196
  model_answers = load_model_answers(answer_dir)
@@ -203,7 +209,6 @@ if __name__ == "__main__":
203
 
204
  bootstrap_online_elo = compute_mle_elo(battles)
205
 
206
-
207
  if args.load_bootstrap:
208
  bootstrap_elo_lu = pd.read_json("data/bootstrapping_results.jsonl", lines=True)
209
  else:
@@ -213,7 +218,7 @@ if __name__ == "__main__":
213
 
214
  stats = pd.DataFrame()
215
  stats["results"] = None
216
- stats["results"] = stats['results'].astype('object')
217
 
218
  for i, model in enumerate(bootstrap_online_elo.index):
219
  assert model in bootstrap_elo_lu.columns
@@ -241,18 +246,24 @@ if __name__ == "__main__":
241
  decimal = 1
242
  else:
243
  decimal = 0
244
- stats = stats.astype({"score" : int, "lower" : int, "upper" : int})
245
 
246
  stats.sort_values(by="score", ascending=False, inplace=True)
247
  for _, row in stats.iterrows():
248
- interval = str((round(row['lower'] - row['score'], decimal), round(row['upper'] - row['score'], decimal)))
249
- print(f"{row['model'] : <30} | score: {round(row['score'], decimal) : ^5} | 95% CI: {interval : ^12} | average #tokens: {int(row['avg_tokens'])}")
 
 
250
 
251
  if args.output:
252
  cur_date = datetime.datetime.now()
253
  date_str = cur_date.strftime("%Y%m%d")
254
  stats.to_json(f"arena_hard_leaderboard_{date_str}.json", orient="records", indent=4)
255
  import huggingface_hub
256
- huggingface_hub.HfApi().upload_file(path_or_fileobj=f"arena_hard_leaderboard_{date_str}.json",path_in_repo='evals/upd.json',
257
- repo_id='Vikhrmodels/openbench-eval',
258
- repo_type='dataset')
 
 
 
 
 
2
  import numpy as np
3
  import plotly.express as px
4
 
 
5
  import datetime
6
  import argparse
7
  import os
 
14
  from collections import defaultdict
15
  from utils import load_model_answers
16
 
17
+
18
  def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
19
  models = pd.concat([df["model_a"], df["model_b"]]).unique()
20
  models = pd.Series(np.arange(len(models)), index=models)
 
35
  # one tie => one A win + one B win
36
  # find tie + tie (both bad) index
37
  tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
38
+ tie_idx[len(tie_idx) // 2 :] = False
39
  Y[tie_idx] = 1.0
40
 
41
  lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
42
+ lr.fit(X, Y)
43
 
44
  elo_scores = SCALE * lr.coef_[0] + INIT_RATING
45
 
46
  # set anchor as gpt-3.5-turbo-0125 = 1000
47
  if "gpt-3.5-turbo-0125" in models.index:
48
  elo_scores += 1000 - elo_scores[models["gpt-3.5-turbo-0125"]]
49
+ return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
50
 
51
 
52
  def get_bootstrap_result(battles, func_compute_elo, num_round):
 
58
 
59
 
60
  def preety_print_two_ratings(ratings_1, ratings_2, column_names):
61
+ df = (
62
+ pd.DataFrame(
63
+ [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
64
+ columns=["Model", column_names[0], column_names[1]],
65
+ )
66
+ .sort_values(column_names[0], ascending=False)
67
+ .reset_index(drop=True)
68
+ )
69
  df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
70
  df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
71
  df.index = df.index + 1
 
73
 
74
 
75
  def visualize_bootstrap_scores(df, title):
76
+ bars = (
77
+ pd.DataFrame(dict(lower=df.quantile(0.025), rating=df.quantile(0.5), upper=df.quantile(0.975)))
78
+ .reset_index(names="model")
79
+ .sort_values("rating", ascending=False)
80
+ )
81
+ bars["error_y"] = bars["upper"] - bars["rating"]
82
+ bars["error_y_minus"] = bars["rating"] - bars["lower"]
83
+ bars["rating_rounded"] = np.round(bars["rating"], 2)
84
+ fig = px.scatter(
85
+ bars,
86
+ x="model",
87
+ y="rating",
88
+ error_y="error_y",
89
+ error_y_minus="error_y_minus",
90
+ text="rating_rounded",
91
+ title=title,
92
+ )
93
+ fig.update_layout(xaxis_title="Model", yaxis_title="Rating", height=600)
94
  return fig
95
 
96
 
 
103
  wins[a][b] = ea
104
  wins[b][a] = 1 - ea
105
 
106
+ data = {a: [wins[a][b] if a != b else np.NAN for b in names] for a in names}
 
 
 
107
 
108
  df = pd.DataFrame(data, index=names)
109
  df.index.name = "model_a"
 
129
 
130
  for _, row in df.iterrows():
131
  # game 1
132
+ output = {"question_id": row["question_id"], "model_a": "gpt-3.5-turbo-0125", "model_b": row["model"]}
 
 
133
 
134
  game = row["games"][0]
135
 
 
154
 
155
  if not first_game_only:
156
  # game 2
157
+ output = {"question_id": row["question_id"], "model_a": "gpt-3.5-turbo-0125", "model_b": row["model"]}
 
 
158
 
159
  game = row["games"][1]
160
 
 
194
  parser.add_argument("--first-game-only", action="store_true")
195
  args = parser.parse_args()
196
  print(args)
197
+ assert not args.load_bootstrap or (
198
+ args.load_battles and args.load_bootstrap
199
+ ), "If loading prexisting bootstrapping data, you must also load preexisting battles."
200
 
201
  answer_dir = os.path.join("data", args.bench_name, "model_answer/external")
202
  model_answers = load_model_answers(answer_dir)
 
209
 
210
  bootstrap_online_elo = compute_mle_elo(battles)
211
 
 
212
  if args.load_bootstrap:
213
  bootstrap_elo_lu = pd.read_json("data/bootstrapping_results.jsonl", lines=True)
214
  else:
 
218
 
219
  stats = pd.DataFrame()
220
  stats["results"] = None
221
+ stats["results"] = stats["results"].astype("object")
222
 
223
  for i, model in enumerate(bootstrap_online_elo.index):
224
  assert model in bootstrap_elo_lu.columns
 
246
  decimal = 1
247
  else:
248
  decimal = 0
249
+ stats = stats.astype({"score": int, "lower": int, "upper": int})
250
 
251
  stats.sort_values(by="score", ascending=False, inplace=True)
252
  for _, row in stats.iterrows():
253
+ interval = str((round(row["lower"] - row["score"], decimal), round(row["upper"] - row["score"], decimal)))
254
+ print(
255
+ f"{row['model'] : <30} | score: {round(row['score'], decimal) : ^5} | 95% CI: {interval : ^12} | average #tokens: {int(row['avg_tokens'])}"
256
+ )
257
 
258
  if args.output:
259
  cur_date = datetime.datetime.now()
260
  date_str = cur_date.strftime("%Y%m%d")
261
  stats.to_json(f"arena_hard_leaderboard_{date_str}.json", orient="records", indent=4)
262
  import huggingface_hub
263
+
264
+ huggingface_hub.HfApi().upload_file(
265
+ path_or_fileobj=f"arena_hard_leaderboard_{date_str}.json",
266
+ path_in_repo="evals/upd.json",
267
+ repo_id="Vikhrmodels/openbench-eval",
268
+ repo_type="dataset",
269
+ )
src/gen/utils.py CHANGED
@@ -77,9 +77,7 @@ def get_endpoint(endpoint_list):
77
  return None
78
  assert endpoint_list is not None
79
  # randomly pick one
80
- api_dict = random.choices(
81
- endpoint_list
82
- )[0]
83
  return api_dict
84
 
85
 
@@ -91,9 +89,11 @@ def make_config(config_file: str) -> dict:
91
 
92
  return config_kwargs
93
 
 
94
  def chat_completion_gigachat(model, messages, temperature, max_tokens, api_dict=None):
95
  from gigachat import GigaChat
96
  from gigachat.models import Chat, Messages
 
97
  assert api_dict is not None, "no api settings provided!"
98
  auth_token = api_dict.get("auth_token", os.environ.get(api_dict["auth_token"], ""))
99
  client = GigaChat(credentials=auth_token, model=model, verify_ssl_certs=False)
@@ -115,15 +115,13 @@ def chat_completion_gigachat(model, messages, temperature, max_tokens, api_dict=
115
 
116
  return output
117
 
 
118
  def chat_completion_yandex(model, messages, temperature, max_tokens, api_dict=None):
119
  from yandex_gpt import YandexGPT, YandexGPTConfigManagerForIAMToken
 
120
  assert api_dict is not None, "no api settings provided!"
121
  iam_token = api_dict.get("iam_token", os.environ.get(api_dict["iam_token_ENV"], ""))
122
- config = YandexGPTConfigManagerForIAMToken(
123
- model_type=model,
124
- catalog_id=api_dict["catalog_id"],
125
- iam_token=iam_token
126
- )
127
  client = YandexGPT(config_manager=config)
128
 
129
  messages = [{"role": m["role"], "text": m["content"]} for m in messages]
@@ -147,6 +145,7 @@ def chat_completion_yandex(model, messages, temperature, max_tokens, api_dict=No
147
 
148
  def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
149
  import openai
 
150
  api_key = api_dict.get("api_key", os.environ.get(api_dict["api_key_ENV"], ""))
151
  if api_dict:
152
  client = openai.OpenAI(
@@ -165,8 +164,8 @@ def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=No
165
  messages=messages,
166
  temperature=temperature,
167
  max_tokens=max_tokens,
168
- stop=["</s>", "<eos>", "<|eot_id|>"]
169
- )
170
  output = completion.choices[0].message.content
171
  break
172
  except openai.RateLimitError as e:
@@ -175,7 +174,7 @@ def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=No
175
  except openai.BadRequestError as e:
176
  print(messages)
177
  print(type(e), e)
178
- except KeyError:
179
  print(type(e), e)
180
  break
181
 
@@ -189,11 +188,7 @@ def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_d
189
  api_base = api_dict["api_base"]
190
  api_key = api_dict.get("api_key", os.environ.get(api_dict["api_key_ENV"], ""))
191
  client = AzureOpenAI(
192
- azure_endpoint = api_base,
193
- api_key= api_key,
194
- api_version=api_dict["api_version"],
195
- timeout=240,
196
- max_retries=2
197
  )
198
 
199
  output = API_ERROR_OUTPUT
@@ -215,7 +210,7 @@ def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_d
215
  except openai.BadRequestError as e:
216
  print(type(e), e)
217
  break
218
- except KeyError:
219
  print(type(e), e)
220
  break
221
 
@@ -246,7 +241,7 @@ def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict
246
  stop_sequences=[anthropic.HUMAN_PROMPT],
247
  max_tokens=max_tokens,
248
  temperature=temperature,
249
- system=sys_msg
250
  )
251
  output = response.content[0].text
252
  break
@@ -286,25 +281,14 @@ def chat_completion_mistral(model, messages, temperature, max_tokens):
286
 
287
  def chat_completion_gemini(model, messages, temperature, max_tokens):
288
  import google.generativeai as genai
 
289
  genai.configure(api_key=os.environ["GEMINI_API_KEY"])
290
 
291
  safety_settings = [
292
- {
293
- "category": "HARM_CATEGORY_HARASSMENT",
294
- "threshold": "BLOCK_NONE"
295
- },
296
- {
297
- "category": "HARM_CATEGORY_HATE_SPEECH",
298
- "threshold": "BLOCK_NONE"
299
- },
300
- {
301
- "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
302
- "threshold": "BLOCK_NONE"
303
- },
304
- {
305
- "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
306
- "threshold": "BLOCK_NONE"
307
- },
308
  ]
309
 
310
  # Set up the model
@@ -319,9 +303,8 @@ def chat_completion_gemini(model, messages, temperature, max_tokens):
319
  for _ in range(API_MAX_RETRY):
320
  try:
321
  gemini = genai.GenerativeModel(
322
- model_name=model,
323
- generation_config=generation_config,
324
- safety_settings=safety_settings)
325
 
326
  convo = gemini.start_chat(history=[])
327
 
@@ -344,9 +327,7 @@ def chat_completion_cohere(model, messages, temperature, max_tokens):
344
  co = cohere.Client(os.environ["COHERE_API_KEY"])
345
  assert len(messages) > 0
346
 
347
- template_map = {"system":"SYSTEM",
348
- "assistant":"CHATBOT",
349
- "user":"USER"}
350
 
351
  assert messages[-1]["role"] == "user"
352
  prompt = messages[-1]["content"]
@@ -354,7 +335,7 @@ def chat_completion_cohere(model, messages, temperature, max_tokens):
354
  if len(messages) > 1:
355
  history = []
356
  for message in messages[:-1]:
357
- history.append({"role":template_map[message["role"]], "message":message["content"]})
358
  else:
359
  history = None
360
 
@@ -384,9 +365,9 @@ def reorg_answer_file(answer_file):
384
  """Sort by question id and de-duplication"""
385
  answers = {}
386
  with open(answer_file, "r") as fin:
387
- for l in fin:
388
- qid = json.loads(l)["question_id"]
389
- answers[qid] = l
390
 
391
  qids = sorted(list(answers.keys()))
392
  with open(answer_file, "w") as fout:
 
77
  return None
78
  assert endpoint_list is not None
79
  # randomly pick one
80
+ api_dict = random.choices(endpoint_list)[0]
 
 
81
  return api_dict
82
 
83
 
 
89
 
90
  return config_kwargs
91
 
92
+
93
  def chat_completion_gigachat(model, messages, temperature, max_tokens, api_dict=None):
94
  from gigachat import GigaChat
95
  from gigachat.models import Chat, Messages
96
+
97
  assert api_dict is not None, "no api settings provided!"
98
  auth_token = api_dict.get("auth_token", os.environ.get(api_dict["auth_token"], ""))
99
  client = GigaChat(credentials=auth_token, model=model, verify_ssl_certs=False)
 
115
 
116
  return output
117
 
118
+
119
  def chat_completion_yandex(model, messages, temperature, max_tokens, api_dict=None):
120
  from yandex_gpt import YandexGPT, YandexGPTConfigManagerForIAMToken
121
+
122
  assert api_dict is not None, "no api settings provided!"
123
  iam_token = api_dict.get("iam_token", os.environ.get(api_dict["iam_token_ENV"], ""))
124
+ config = YandexGPTConfigManagerForIAMToken(model_type=model, catalog_id=api_dict["catalog_id"], iam_token=iam_token)
 
 
 
 
125
  client = YandexGPT(config_manager=config)
126
 
127
  messages = [{"role": m["role"], "text": m["content"]} for m in messages]
 
145
 
146
  def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
147
  import openai
148
+
149
  api_key = api_dict.get("api_key", os.environ.get(api_dict["api_key_ENV"], ""))
150
  if api_dict:
151
  client = openai.OpenAI(
 
164
  messages=messages,
165
  temperature=temperature,
166
  max_tokens=max_tokens,
167
+ stop=["</s>", "<eos>", "<|eot_id|>"],
168
+ )
169
  output = completion.choices[0].message.content
170
  break
171
  except openai.RateLimitError as e:
 
174
  except openai.BadRequestError as e:
175
  print(messages)
176
  print(type(e), e)
177
+ except KeyError as e:
178
  print(type(e), e)
179
  break
180
 
 
188
  api_base = api_dict["api_base"]
189
  api_key = api_dict.get("api_key", os.environ.get(api_dict["api_key_ENV"], ""))
190
  client = AzureOpenAI(
191
+ azure_endpoint=api_base, api_key=api_key, api_version=api_dict["api_version"], timeout=240, max_retries=2
 
 
 
 
192
  )
193
 
194
  output = API_ERROR_OUTPUT
 
210
  except openai.BadRequestError as e:
211
  print(type(e), e)
212
  break
213
+ except KeyError as e:
214
  print(type(e), e)
215
  break
216
 
 
241
  stop_sequences=[anthropic.HUMAN_PROMPT],
242
  max_tokens=max_tokens,
243
  temperature=temperature,
244
+ system=sys_msg,
245
  )
246
  output = response.content[0].text
247
  break
 
281
 
282
  def chat_completion_gemini(model, messages, temperature, max_tokens):
283
  import google.generativeai as genai
284
+
285
  genai.configure(api_key=os.environ["GEMINI_API_KEY"])
286
 
287
  safety_settings = [
288
+ {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
289
+ {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
290
+ {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
291
+ {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
 
 
 
 
 
 
 
 
 
 
 
 
292
  ]
293
 
294
  # Set up the model
 
303
  for _ in range(API_MAX_RETRY):
304
  try:
305
  gemini = genai.GenerativeModel(
306
+ model_name=model, generation_config=generation_config, safety_settings=safety_settings
307
+ )
 
308
 
309
  convo = gemini.start_chat(history=[])
310
 
 
327
  co = cohere.Client(os.environ["COHERE_API_KEY"])
328
  assert len(messages) > 0
329
 
330
+ template_map = {"system": "SYSTEM", "assistant": "CHATBOT", "user": "USER"}
 
 
331
 
332
  assert messages[-1]["role"] == "user"
333
  prompt = messages[-1]["content"]
 
335
  if len(messages) > 1:
336
  history = []
337
  for message in messages[:-1]:
338
+ history.append({"role": template_map[message["role"]], "message": message["content"]})
339
  else:
340
  history = None
341
 
 
365
  """Sort by question id and de-duplication"""
366
  answers = {}
367
  with open(answer_file, "r") as fin:
368
+ for line in fin:
369
+ qid = json.loads(line)["question_id"]
370
+ answers[qid] = line
371
 
372
  qids = sorted(list(answers.keys()))
373
  with open(answer_file, "w") as fout:
src/leaderboard/build_leaderboard.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import json
3
  import logging
4
  import os
@@ -11,7 +10,8 @@ from huggingface_hub import snapshot_download
11
  from src.envs import EVAL_RESULTS_PATH
12
 
13
  # Configure logging
14
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
15
 
16
  def time_diff_wrapper(func):
17
  def wrapper(*args, **kwargs):
@@ -21,15 +21,17 @@ def time_diff_wrapper(func):
21
  diff = end_time - start_time
22
  logging.info(f"Time taken for {func.__name__}: {diff} seconds")
23
  return result
 
24
  return wrapper
25
 
 
26
  @time_diff_wrapper
27
  def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
28
  """Download dataset with exponential backoff retries."""
29
  attempt = 0
30
  while attempt < max_attempts:
31
  try:
32
- logging.info(f"Downloading {repo_id} to {local_dir}")
33
  snapshot_download(
34
  repo_id=repo_id,
35
  local_dir=local_dir,
@@ -42,27 +44,41 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
42
  logging.info("Download successful")
43
  return
44
  except Exception as e:
45
- wait_time = backoff_factor ** attempt
46
  logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
47
  time.sleep(wait_time)
48
  attempt += 1
49
  logging.error(f"Failed to download {repo_id} after {max_attempts} attempts")
50
 
 
51
  def build_leadearboard_df():
52
  """Initializes the application space, loading only necessary data."""
53
- # Check ENV LEADERBOARD_DOWNLOAD if wee need to download the leaderboard
54
- if os.getenv("LEADERBOARD_DOWNLOAD", "True") == "True":
55
- # These downloads only occur on full initialization
56
- # try:
57
- # download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
58
- # download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
59
- download_dataset("Vikhrmodels/openbench-eval", EVAL_RESULTS_PATH)
60
- # print(subprocess.Popen('ls src'))
61
- subprocess.run(['rsync', '-avzP', '--ignore-existing', f'{EVAL_RESULTS_PATH[2:]}/external/*', 'src/gen/data/arena-hard-v0.1/model_answer/'], check=False)
62
- subprocess.run(['rsync', '-avzP', '--ignore-existing', f'{EVAL_RESULTS_PATH[2:]}/model_judgment/*', 'src/gen/data/arena-hard-v0.1/model_judgement/'], check=False)
63
- # except Exception:
64
- # restart_space()
65
 
66
- # Always retrieve the leaderboard DataFrame
67
- leaderboard_df = pd.DataFrame.from_records(json.load(open('eval-results/evals/upd.json','r')))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  return leaderboard_df.copy()
 
 
1
  import json
2
  import logging
3
  import os
 
10
  from src.envs import EVAL_RESULTS_PATH
11
 
12
  # Configure logging
13
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
14
+
15
 
16
  def time_diff_wrapper(func):
17
  def wrapper(*args, **kwargs):
 
21
  diff = end_time - start_time
22
  logging.info(f"Time taken for {func.__name__}: {diff} seconds")
23
  return result
24
+
25
  return wrapper
26
 
27
+
28
  @time_diff_wrapper
29
  def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
30
  """Download dataset with exponential backoff retries."""
31
  attempt = 0
32
  while attempt < max_attempts:
33
  try:
34
+ logging.info("Downloading %s to %s", repo_id, local_dir)
35
  snapshot_download(
36
  repo_id=repo_id,
37
  local_dir=local_dir,
 
44
  logging.info("Download successful")
45
  return
46
  except Exception as e:
47
+ wait_time = backoff_factor**attempt
48
  logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
49
  time.sleep(wait_time)
50
  attempt += 1
51
  logging.error(f"Failed to download {repo_id} after {max_attempts} attempts")
52
 
53
+
54
  def build_leadearboard_df():
55
  """Initializes the application space, loading only necessary data."""
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
+ # download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
58
+ # download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
59
+ download_dataset("Vikhrmodels/openbench-eval", EVAL_RESULTS_PATH)
60
+ # print(subprocess.Popen('ls src'))
61
+ subprocess.run(
62
+ [
63
+ "rsync",
64
+ "-avzP",
65
+ "--ignore-existing",
66
+ f"{EVAL_RESULTS_PATH}/external/*",
67
+ "src/gen/data/arena-hard-v0.1/model_answer/",
68
+ ],
69
+ check=False,
70
+ )
71
+ subprocess.run(
72
+ [
73
+ "rsync",
74
+ "-avzP",
75
+ "--ignore-existing",
76
+ f"{EVAL_RESULTS_PATH}/model_judgment/*",
77
+ "src/gen/data/arena-hard-v0.1/model_judgement/",
78
+ ],
79
+ check=False,
80
+ )
81
+
82
+ # Retrieve the leaderboard DataFrame
83
+ leaderboard_df = pd.DataFrame.from_records(json.load(open("eval-results/evals/upd.json", "r")))
84
  return leaderboard_df.copy()
src/leaderboard/filter_models.py CHANGED
@@ -137,9 +137,9 @@ def flag_models(leaderboard_data: list[dict]):
137
  if model_data[AutoEvalColumn.not_flagged.name]:
138
  flag_key = model_data[AutoEvalColumn.fullname.name]
139
  else:
140
- # Merges and moes are flagged
141
  flag_key = "merged"
142
-
143
  # Reverse the logic: Check for non-flagged models instead
144
  if flag_key in FLAGGED_MODELS:
145
  issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
@@ -147,9 +147,9 @@ def flag_models(leaderboard_data: list[dict]):
147
  FLAGGED_MODELS[flag_key],
148
  f"See discussion #{issue_num}",
149
  )
150
- model_data[AutoEvalColumn.model.name] = (
151
- f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
152
- )
153
  model_data[AutoEvalColumn.not_flagged.name] = False
154
  else:
155
  model_data[AutoEvalColumn.not_flagged.name] = True
@@ -171,4 +171,3 @@ def remove_forbidden_models(leaderboard_data: list[dict]):
171
  def filter_models_flags(leaderboard_data: list[dict]):
172
  leaderboard_data = remove_forbidden_models(leaderboard_data)
173
  flag_models(leaderboard_data)
174
-
 
137
  if model_data[AutoEvalColumn.not_flagged.name]:
138
  flag_key = model_data[AutoEvalColumn.fullname.name]
139
  else:
140
+ # Merges and moes are flagged
141
  flag_key = "merged"
142
+
143
  # Reverse the logic: Check for non-flagged models instead
144
  if flag_key in FLAGGED_MODELS:
145
  issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
 
147
  FLAGGED_MODELS[flag_key],
148
  f"See discussion #{issue_num}",
149
  )
150
+ model_data[
151
+ AutoEvalColumn.model.name
152
+ ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
153
  model_data[AutoEvalColumn.not_flagged.name] = False
154
  else:
155
  model_data[AutoEvalColumn.not_flagged.name] = True
 
171
  def filter_models_flags(leaderboard_data: list[dict]):
172
  leaderboard_data = remove_forbidden_models(leaderboard_data)
173
  flag_models(leaderboard_data)
 
src/leaderboard/read_evals.py CHANGED
@@ -16,36 +16,36 @@ from src.display.formatting import make_clickable_model
16
  from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
17
 
18
  # Configure logging
19
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
20
 
21
  @dataclass
22
  class EvalResult:
23
  # Also see src.display.utils.AutoEvalColumn for what will be displayed.
24
- eval_name: str # org_model_precision (uid)
25
- full_model: str # org/model (path on hub)
26
  org: Optional[str]
27
  model: str
28
- revision: str # commit hash, "" if main
29
  results: Dict[str, float]
30
  precision: Precision = Precision.Unknown
31
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
32
  weight_type: WeightType = WeightType.Original
33
- architecture: str = "Unknown" # From config file
34
  license: str = "?"
35
  likes: int = 0
36
  num_params: int = 0
37
- date: str = "" # submission date of request file
38
  still_on_hub: bool = True
39
  is_merge: bool = False
40
  not_flagged: bool = False
41
  status: str = "FINISHED"
42
  # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
43
  tags: List[str] = field(default_factory=list)
44
-
45
-
46
  @classmethod
47
- def init_from_json_file(cls, json_filepath: str) -> 'EvalResult':
48
- with open(json_filepath, 'r') as fp:
49
  data = json.load(fp)
50
 
51
  config = data.get("config_general", {})
@@ -72,7 +72,7 @@ class EvalResult:
72
  model=model,
73
  results=results,
74
  precision=precision,
75
- revision=config.get("model_sha", "")
76
  )
77
 
78
  @staticmethod
@@ -118,9 +118,8 @@ class EvalResult:
118
 
119
  mean_acc = np.mean(accs) * 100.0
120
  results[task.benchmark] = mean_acc
121
-
122
- return results
123
 
 
124
 
125
  def update_with_request_file(self, requests_path):
126
  """Finds the relevant request file for the current model and updates info with it."""
@@ -130,17 +129,17 @@ class EvalResult:
130
  logging.warning(f"No request file for {self.org}/{self.model}")
131
  self.status = "FAILED"
132
  return
133
-
134
  with open(request_file, "r") as f:
135
  request = json.load(f)
136
-
137
  self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
138
  self.weight_type = WeightType[request.get("weight_type", "Original")]
139
  self.num_params = int(request.get("params", 0)) # Ensuring type safety
140
  self.date = request.get("submitted_time", "")
141
  self.architecture = request.get("architectures", "Unknown")
142
  self.status = request.get("status", "FAILED")
143
-
144
  except FileNotFoundError:
145
  self.status = "FAILED"
146
  logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
@@ -154,7 +153,6 @@ class EvalResult:
154
  self.status = "FAILED"
155
  logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
156
 
157
-
158
  def update_with_dynamic_file_dict(self, file_dict):
159
  """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
160
  # Default values set for optional or potentially missing keys.
@@ -162,11 +160,10 @@ class EvalResult:
162
  self.likes = int(file_dict.get("likes", 0)) # Ensure likes is treated as an integer
163
  self.still_on_hub = file_dict.get("still_on_hub", False) # Default to False if key is missing
164
  self.tags = file_dict.get("tags", [])
165
-
166
  # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
167
  self.not_flagged = not (any("flagged" in tag for tag in self.tags))
168
 
169
-
170
  def to_dict(self):
171
  """Converts the Eval Result to a dict compatible with our dataframe display"""
172
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
@@ -185,8 +182,10 @@ class EvalResult:
185
  AutoEvalColumn.likes.name: self.likes,
186
  AutoEvalColumn.params.name: self.num_params,
187
  AutoEvalColumn.still_on_hub.name: self.still_on_hub,
188
- AutoEvalColumn.merged.name: not( "merge" in self.tags if self.tags else False),
189
- AutoEvalColumn.moe.name: not ( ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()) ,
 
 
190
  AutoEvalColumn.not_flagged.name: self.not_flagged,
191
  }
192
 
@@ -194,16 +193,16 @@ class EvalResult:
194
  data_dict[task.value.col_name] = self.results[task.value.benchmark]
195
 
196
  return data_dict
197
-
198
 
199
  def get_request_file_for_model(requests_path, model_name, precision):
200
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
201
  requests_path = Path(requests_path)
202
  pattern = f"{model_name}_eval_request_*.json"
203
-
204
  # Using pathlib to find files matching the pattern
205
  request_files = list(requests_path.glob(pattern))
206
-
207
  # Sort the files by name in descending order to mimic 'reverse=True'
208
  request_files.sort(reverse=True)
209
 
@@ -214,7 +213,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
214
  req_content = json.load(f)
215
  if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
216
  request_file = str(request_file)
217
-
218
  # Return empty string if no file found that matches criteria
219
  return request_file
220
 
@@ -223,9 +222,9 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
223
  """From the path of the results folder root, extract all needed info for results"""
224
  with open(dynamic_path) as f:
225
  dynamic_data = json.load(f)
226
-
227
  results_path = Path(results_path)
228
- model_files = list(results_path.rglob('results_*.json'))
229
  model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))
230
 
231
  eval_results = {}
@@ -260,4 +259,3 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
260
  continue
261
 
262
  return results
263
-
 
16
  from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
17
 
18
  # Configure logging
19
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
20
+
21
 
22
  @dataclass
23
  class EvalResult:
24
  # Also see src.display.utils.AutoEvalColumn for what will be displayed.
25
+ eval_name: str # org_model_precision (uid)
26
+ full_model: str # org/model (path on hub)
27
  org: Optional[str]
28
  model: str
29
+ revision: str # commit hash, "" if main
30
  results: Dict[str, float]
31
  precision: Precision = Precision.Unknown
32
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
33
  weight_type: WeightType = WeightType.Original
34
+ architecture: str = "Unknown" # From config file
35
  license: str = "?"
36
  likes: int = 0
37
  num_params: int = 0
38
+ date: str = "" # submission date of request file
39
  still_on_hub: bool = True
40
  is_merge: bool = False
41
  not_flagged: bool = False
42
  status: str = "FINISHED"
43
  # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
44
  tags: List[str] = field(default_factory=list)
45
+
 
46
  @classmethod
47
+ def init_from_json_file(cls, json_filepath: str) -> "EvalResult":
48
+ with open(json_filepath, "r") as fp:
49
  data = json.load(fp)
50
 
51
  config = data.get("config_general", {})
 
72
  model=model,
73
  results=results,
74
  precision=precision,
75
+ revision=config.get("model_sha", ""),
76
  )
77
 
78
  @staticmethod
 
118
 
119
  mean_acc = np.mean(accs) * 100.0
120
  results[task.benchmark] = mean_acc
 
 
121
 
122
+ return results
123
 
124
  def update_with_request_file(self, requests_path):
125
  """Finds the relevant request file for the current model and updates info with it."""
 
129
  logging.warning(f"No request file for {self.org}/{self.model}")
130
  self.status = "FAILED"
131
  return
132
+
133
  with open(request_file, "r") as f:
134
  request = json.load(f)
135
+
136
  self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
137
  self.weight_type = WeightType[request.get("weight_type", "Original")]
138
  self.num_params = int(request.get("params", 0)) # Ensuring type safety
139
  self.date = request.get("submitted_time", "")
140
  self.architecture = request.get("architectures", "Unknown")
141
  self.status = request.get("status", "FAILED")
142
+
143
  except FileNotFoundError:
144
  self.status = "FAILED"
145
  logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
 
153
  self.status = "FAILED"
154
  logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
155
 
 
156
  def update_with_dynamic_file_dict(self, file_dict):
157
  """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
158
  # Default values set for optional or potentially missing keys.
 
160
  self.likes = int(file_dict.get("likes", 0)) # Ensure likes is treated as an integer
161
  self.still_on_hub = file_dict.get("still_on_hub", False) # Default to False if key is missing
162
  self.tags = file_dict.get("tags", [])
163
+
164
  # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
165
  self.not_flagged = not (any("flagged" in tag for tag in self.tags))
166
 
 
167
  def to_dict(self):
168
  """Converts the Eval Result to a dict compatible with our dataframe display"""
169
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
 
182
  AutoEvalColumn.likes.name: self.likes,
183
  AutoEvalColumn.params.name: self.num_params,
184
  AutoEvalColumn.still_on_hub.name: self.still_on_hub,
185
+ AutoEvalColumn.merged.name: not ("merge" in self.tags if self.tags else False),
186
+ AutoEvalColumn.moe.name: not (
187
+ ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()
188
+ ),
189
  AutoEvalColumn.not_flagged.name: self.not_flagged,
190
  }
191
 
 
193
  data_dict[task.value.col_name] = self.results[task.value.benchmark]
194
 
195
  return data_dict
196
+
197
 
198
  def get_request_file_for_model(requests_path, model_name, precision):
199
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
200
  requests_path = Path(requests_path)
201
  pattern = f"{model_name}_eval_request_*.json"
202
+
203
  # Using pathlib to find files matching the pattern
204
  request_files = list(requests_path.glob(pattern))
205
+
206
  # Sort the files by name in descending order to mimic 'reverse=True'
207
  request_files.sort(reverse=True)
208
 
 
213
  req_content = json.load(f)
214
  if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
215
  request_file = str(request_file)
216
+
217
  # Return empty string if no file found that matches criteria
218
  return request_file
219
 
 
222
  """From the path of the results folder root, extract all needed info for results"""
223
  with open(dynamic_path) as f:
224
  dynamic_data = json.load(f)
225
+
226
  results_path = Path(results_path)
227
+ model_files = list(results_path.rglob("results_*.json"))
228
  model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))
229
 
230
  eval_results = {}
 
259
  continue
260
 
261
  return results
 
src/populate.py CHANGED
@@ -1,5 +1,3 @@
1
- import json
2
- import os
3
  import pathlib
4
  import pandas as pd
5
  from src.display.formatting import has_no_nan_values, make_clickable_model
@@ -21,7 +19,7 @@ def get_evaluation_queue_df(save_path, cols):
21
  save_path = pathlib.Path(save_path)
22
  all_evals = []
23
 
24
- for path in save_path.rglob('*.json'):
25
  data = load_json_data(path)
26
  if data:
27
  all_evals.append(_process_model_data(data))
 
 
 
1
  import pathlib
2
  import pandas as pd
3
  from src.display.formatting import has_no_nan_values, make_clickable_model
 
19
  save_path = pathlib.Path(save_path)
20
  all_evals = []
21
 
22
+ for path in save_path.rglob("*.json"):
23
  data = load_json_data(path)
24
  if data:
25
  all_evals.append(_process_model_data(data))
src/scripts/create_request_file.py CHANGED
@@ -47,7 +47,7 @@ def main():
47
  eval_entry = {
48
  "model": model_name,
49
  "base_model": base_model,
50
- "revision": model_info.sha, # force to use the exact model commit
51
  "private": False,
52
  "precision": precision,
53
  "weight_type": weight_type,
 
47
  eval_entry = {
48
  "model": model_name,
49
  "base_model": base_model,
50
+ "revision": model_info.sha, # force to use the exact model commit
51
  "private": False,
52
  "precision": precision,
53
  "weight_type": weight_type,
src/scripts/update_all_request_files.py CHANGED
@@ -91,6 +91,6 @@ def update_models(file_path, models_on_the_hub):
91
 
92
  def update_dynamic_files():
93
  # from gen import gen_answer,gen_judgment\
94
- subprocess.Popen('python3 ../gen/gen_judgement.py')
95
 
96
- subprocess.Popen('python3 ../gen/show_result.py --output')
 
91
 
92
  def update_dynamic_files():
93
  # from gen import gen_answer,gen_judgment\
94
+ subprocess.Popen("python3 ../gen/gen_judgement.py")
95
 
96
+ subprocess.Popen("python3 ../gen/show_result.py --output")
src/submission/check_validity.py CHANGED
@@ -49,7 +49,7 @@ def is_model_on_hub(
49
  ) # , force_download=True)
50
  if test_tokenizer:
51
  try:
52
- tk = AutoTokenizer.from_pretrained(
53
  model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
54
  )
55
  except ValueError as e:
 
49
  ) # , force_download=True)
50
  if test_tokenizer:
51
  try:
52
+ AutoTokenizer.from_pretrained(
53
  model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
54
  )
55
  except ValueError as e:
src/submission/submit.py CHANGED
@@ -1,21 +1,4 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
-
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.display.formatting import styled_error, styled_message, styled_warning
8
- from src.envs import (
9
- API,
10
- DYNAMIC_INFO_FILE_PATH,
11
- DYNAMIC_INFO_PATH,
12
- DYNAMIC_INFO_REPO,
13
- EVAL_REQUESTS_PATH,
14
- H4_TOKEN,
15
- QUEUE_REPO,
16
- RATE_LIMIT_PERIOD,
17
- RATE_LIMIT_QUOTA,
18
- )
19
  # from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
20
  # from src.submission.check_validity import (
21
  # already_submitted_models,
@@ -38,7 +21,6 @@ def add_new_eval(
38
  # if not REQUESTED_MODELS:
39
  # REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
40
 
41
-
42
  # user_name = ""
43
  # model_path = model
44
  # if "/" in model:
@@ -186,6 +168,4 @@ def add_new_eval(
186
  # # Remove the local file
187
  # os.remove(out_path)
188
 
189
- return styled_message(
190
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour."
191
- )
 
1
+ from src.display.formatting import styled_message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  # from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
3
  # from src.submission.check_validity import (
4
  # already_submitted_models,
 
21
  # if not REQUESTED_MODELS:
22
  # REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
23
 
 
24
  # user_name = ""
25
  # model_path = model
26
  # if "/" in model:
 
168
  # # Remove the local file
169
  # os.remove(out_path)
170
 
171
+ return styled_message("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour.")
 
 
src/tools/plots.py CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
3
  import plotly.express as px
4
  from plotly.graph_objs import Figure
5
 
6
- from src.display.utils import AutoEvalColumn, Task, Tasks
7
  from src.display.utils import human_baseline_row as HUMAN_BASELINE
8
  from src.leaderboard.filter_models import FLAGGED_MODELS
9
  from src.leaderboard.read_evals import EvalResult
 
3
  import plotly.express as px
4
  from plotly.graph_objs import Figure
5
 
6
+ from src.display.utils import AutoEvalColumn, Task, Tasks
7
  from src.display.utils import human_baseline_row as HUMAN_BASELINE
8
  from src.leaderboard.filter_models import FLAGGED_MODELS
9
  from src.leaderboard.read_evals import EvalResult