Yotam-Perlitz committed on
Commit 363d8ae
1 Parent(s): 566ad63

improve bench upload code


Signed-off-by: Yotam-Perlitz <[email protected]>

Files changed (1)
  1. app.py +76 -31
app.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 import plotly.express as px
 import streamlit as st
 from bat import Benchmark, Config, Reporter, Tester
+from datetime import datetime
 
 
 holistic_scenarios = [
@@ -58,7 +59,7 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="
         label="Select Correlation type", options=["kendall", "pearson"], index=0
     )
 
-    aggragate_scenario_whitelist = aggragate_scenarios
+    aggregate_scenario_whitelist = aggragate_scenarios
     # [
     #     scen
     #     for scen in all_scenarios_for_aggragate
@@ -98,7 +99,38 @@ with st.expander("Add your benchmarks here!", icon="🔥"):
     my_benchmark = Benchmark()
     if uploaded_file is not None:
         df = pd.read_csv(uploaded_file)
-        my_benchmark.assign_df(df, data_source="Uploaded Benchmark")
+
+        my_benchmark.assign_df(
+            df,
+            data_source=f"uploaded_benchmark_{datetime.now().strftime('%y%m%d')}.csv",
+        )
+
+        allbench = Benchmark()
+        allbench.load_local_catalog()
+
+        allbench.add_aggregate(
+            new_col_name="aggregate",
+            agg_source_name="aggregate",
+            scenario_whitelist=aggregate_scenario_whitelist,
+            min_scenario_for_models_to_appear_in_agg=1
+            if len(aggregate_scenario_whitelist) == 1
+            else 3,
+        )
+
+        uploaded_models = my_benchmark.df[
+            my_benchmark.df["source"].str.contains("uploaded")
+        ]["model"].unique()
+        aggregate_models = allbench.df[allbench.df["source"].str.contains("aggregate")][
+            "model"
+        ].unique()
+
+        # Find the intersection (overlap) of models
+        overlap_models = set(aggregate_models).intersection(uploaded_models)
+        if len(overlap_models) < n_models_taken_list[0]:
+            st.warning(
+                f"You have just {len(overlap_models)} models intersecting with the aggregate! "
+                f"Here are some models you should run your benchmark over: {aggregate_models}"
+            )
 
 
 def run_load(
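
Note: judging from the columns this commit queries ("model", "score", "source", and "scenario" elsewhere in app.py), the uploaded CSV is expected in long format, one row per model-scenario pair; the "source" column is filled in by assign_df. A minimal sketch of a valid upload, with hypothetical models and scores:

import pandas as pd

# Hypothetical benchmark results; column names are inferred from the
# queries in this commit, not from an official schema.
df = pd.DataFrame(
    {
        "model": ["gpt-4o", "llama-3-70b", "claude-3-opus"],
        "scenario": ["my_bench"] * 3,
        "score": [0.81, 0.74, 0.78],
    }
)
df.to_csv("my_benchmark.csv", index=False)  # the file to upload in the expander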
@@ -108,7 +140,7 @@ def run_load(
     corr_types=["kendall"],
     n_exps=10,
     my_benchmark=Benchmark(),
-    use_caching=True,
+    use_caching=False,
 ):
     # Create a hash of the inputs to generate a unique cache file for each set of inputs
     input_str = (
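
The comment above describes the caching pattern: the call inputs are stringified and hashed, and the hash keys a per-configuration cache file. A generic sketch of that pattern (the function and path names are hypothetical, not the app's actual code):

import hashlib
import os
import pickle

def load_cached_results(input_str: str, cache_dir: str = "cache"):
    # One cache file per unique input configuration.
    key = hashlib.md5(input_str.encode()).hexdigest()
    path = os.path.join(cache_dir, f"{key}.pkl")
    if os.path.exists(path):
        with open(path, "rb") as f:
            return pickle.load(f)  # cached results found
    return None  # not cached: caller computes and writes the file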
@@ -147,24 +179,6 @@ def run_load(
     else:
         print("Cached results not found, calculating")
 
-    cfg = Config(
-        exp_to_run="example",
-        n_models_taken_list=n_models_taken_list,
-        model_select_strategy_list=model_select_strategy_list,
-        corr_types=corr_types,
-        n_exps=n_exps if n_models_taken_list != [0] else 1,
-    )
-
-    # holistic = Benchmark()
-    # holistic.load_local_catalog()
-    # holistic.df = holistic.df.query("scenario in @holistic_scenarios")
-
-    # holistic.clear_repeated_scenarios()
-
-    # aggragate_scores = holistic.df.query('scenario=="aggregate"')[
-    #     ["model", "score"]
-    # ].sort_values(by="score", ascending=False)
-
     allbench = Benchmark()
     allbench.load_local_catalog()
 
@@ -172,22 +186,43 @@ def run_load(
         new_col_name="aggregate",
         agg_source_name="aggregate",
         scenario_whitelist=aggregate_scenario_whitelist,
-        min_scenario_for_models_to_appear_in_agg=1,
+        min_scenario_for_models_to_appear_in_agg=1
+        if len(aggregate_scenario_whitelist) == 1
+        else 2,
     )
 
+    allbench.extend(my_benchmark)
+    allbench.clear_repeated_scenarios()
+
     aggragate_scores = allbench.df.query('scenario=="aggregate"')[
         ["model", "score"]
     ].sort_values(by="score", ascending=False)
 
-    # allbench.df = allbench.df[~allbench.df["source"].str.contains("livebench")]
+    if not my_benchmark.is_empty:
+        aggragate_scores["in_uploaded"] = aggragate_scores["model"].apply(
+            lambda x: x in my_benchmark.df["model"].unique()
+        )
 
-    allbench.extend(my_benchmark)
-    # allbench.df = allbench.df.drop(columns=["tag"])
-    allbench.clear_repeated_scenarios()
+        # Get unique models for each scenario
+        uploaded_models = allbench.df[
+            allbench.df["source"].str.contains("uploaded")
+        ]["model"].unique()
+        aggregate_models = allbench.df[
+            allbench.df["source"].str.contains("aggregate")
+        ]["model"].unique()
 
-    # removing and adding the holistic scenarios
-    # allbench.df = allbench.df.query("scenario not in @holistic_scenarios")
-    # allbench = allbench.extend(holistic)
+        # Find the intersection (overlap) of models
+        n_overlap_models = len(set(aggregate_models).intersection(uploaded_models))
+        # don't ask for more models than the uploaded benchmark and the aggregate share
+        n_models_taken_list = [min(n_models_taken_list[0], n_overlap_models)]
+
+    cfg = Config(
+        exp_to_run="example",
+        n_models_taken_list=n_models_taken_list,
+        model_select_strategy_list=model_select_strategy_list,
+        corr_types=corr_types,
+        n_exps=n_exps if n_models_taken_list != [0] else 1,
+    )
 
     tester = Tester(cfg=cfg)
 
@@ -205,7 +240,7 @@ def run_load(
 
 
 agreements, aggragare_score_df = run_load(
-    aggregate_scenario_whitelist=aggragate_scenario_whitelist,
+    aggregate_scenario_whitelist=aggregate_scenario_whitelist,
    n_models_taken_list=n_models_taken_list,
    model_select_strategy_list=[model_select_strategy],
    corr_types=[corr_type],
@@ -255,7 +290,7 @@ data = (
 
 # Apply coloring based on 'Z' values
 def highlight_uploaded_benchmark(row):
-    if row["Source"] == "Uploaded Benchmark":
+    if "uploaded_benchmark" in row["Source"]:
         return ["background-color: rgba(100,100,100,0.1)"] * len(row)
     else:
         return [""] * len(row)
@@ -314,6 +349,16 @@ with st.expander(label="Model scored by the aggragate"):
 with st.expander(label="Citations"):
     st.code(
         r"""
+
+        @misc{berkeley-function-calling-leaderboard,
+            title={Berkeley Function Calling Leaderboard},
+            author={Fanjia Yan and Huanzhi Mao and Charlie Cheng-Jie Ji
+                    and Tianjun Zhang and Shishir G. Patil and Ion Stoica and Joseph E.
+                    Gonzalez},
+            howpublished={\url{https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html}},
+            year={2024},
+        }
+
         @misc{liu2023agentbenchevaluatingllmsagents,
             title={AgentBench: Evaluating LLMs as Agents},
             author={Xiao Liu and Hao Yu and Hanchen Zhang and Yifan Xu and Xuanyu Lei and Hanyu Lai and Yu Gu and Hangliang Ding and Kaiwen Men and Kejuan Yang and Shudan Zhang and Xiang Deng and Aohan Zeng and Zhengxiao Du and Chenhui Zhang and Sheng Shen and Tianjun Zhang and Yu Su and Huan Sun and Minlie Huang and Yuxiao Dong and Jie Tang},
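
Taken together, the upload flow added by this commit reduces to a set intersection between the uploaded models and the models the aggregate covers, plus a clamp on how many models may be requested. A standalone sketch of that logic in plain pandas (all data hypothetical):

import pandas as pd

uploaded = pd.DataFrame(
    {"model": ["gpt-4o", "llama-3-70b"], "scenario": "my_bench", "score": [0.81, 0.74]}
)
aggregate = pd.DataFrame(
    {"model": ["gpt-4o", "claude-3-opus"], "scenario": "aggregate", "score": [0.88, 0.86]}
)

overlap_models = set(aggregate["model"]).intersection(uploaded["model"])
n_models_requested = 5  # stand-in for n_models_taken_list[0]

if len(overlap_models) < n_models_requested:
    print(
        f"You have just {len(overlap_models)} models intersecting with the aggregate! "
        f"Here are some models you should run your benchmark over: "
        f"{sorted(set(aggregate['model']) - overlap_models)}"
    )

# As in run_load, the request is then clamped to what is actually available:
n_models_taken_list = [min(n_models_requested, len(overlap_models))]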