natolambert committed on
Commit
51d7804
·
1 Parent(s): 74240b0
Files changed (1) hide show
  1. app.py +39 -11
app.py CHANGED
@@ -12,14 +12,6 @@ from leaderboard.css import custom_css
12
  from leaderboard.md import *
13
  from leaderboard.utils import load_all_data
14
 
15
- # get v1 data
16
- rb_orig_snapshot = pd.read_csv("leaderboard/final-rbv1-data.csv")
17
- # rename column "Unnamed: 0" to ""
18
- rb_orig_snapshot = rb_orig_snapshot.rename(columns={"Unnamed: 0": ""})
19
- # rb_orig_snapshot = rb_orig_snapshot.drop(columns=["Unnamed: 0", ''])
20
- rb_orig_snapshot.reset_index(drop=True, inplace=True)
21
-
22
- # import ipdb; ipdb.set_trace()
23
  #######################################################
24
  # Setup #
25
  #######################################################
@@ -153,18 +145,51 @@ def prep_df(df):
153
 
154
  return df
155
 
 
 
 
 
 
 
 
156
  rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by="average", ascending=False)
157
- rewardbench_data_avg = avg_over_rewardbench_v2(rewardbench_data).sort_values(by="average", ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
  # add count column to all dataframes
160
  rewardbench_data = prep_df(rewardbench_data)
161
  rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
162
 
163
-
 
 
 
 
164
 
165
  col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
166
  col_types_rewardbench_v1 = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rb_orig_snapshot.columns) - 1)
167
 
 
 
168
  ###########################################
169
  # Leaderboard Helpers & Setting #
170
  ###########################################
@@ -256,6 +281,9 @@ def regex_table(dataframe, regex, filter_button, style=True):
256
  update_scores = True
257
  # remove the column "Prior Sets (0.5 weight)" from the outputted table
258
  dataframe = dataframe.drop(columns=["Prior Sets (0.5 weight)"])
 
 
 
259
  if "Seq. Classifiers" not in filter_button:
260
  dataframe = dataframe[~dataframe["Model Type"].str.contains("Seq. Classifier", case=False, na=False)]
261
  if "DPO" not in filter_button:
@@ -354,7 +382,7 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
354
  show_label=False,
355
  )
356
  model_types_1 = gr.CheckboxGroup(
357
- ["Seq. Classifiers", "Custom Classifiers", "Generative"],
358
  value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
359
  label="Model Types",
360
  show_label=False,
 
12
  from leaderboard.md import *
13
  from leaderboard.utils import load_all_data
14
 
 
 
 
 
 
 
 
 
15
  #######################################################
16
  # Setup #
17
  #######################################################
 
145
 
146
  return df
147
 
148
+ # get v1 data
149
+ rb_orig_snapshot = pd.read_csv("leaderboard/final-rbv1-data.csv")
150
+ # rename column "Unnamed: 0" to ""
151
+ rb_orig_snapshot = rb_orig_snapshot.rename(columns={"Unnamed: 0": ""})
152
+ # rb_orig_snapshot = rb_orig_snapshot.drop(columns=["Unnamed: 0", ''])
153
+ rb_orig_snapshot.reset_index(drop=True, inplace=True)
154
+
155
  rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by="average", ascending=False)
156
+ rewardbench_data_avg_intermediate = avg_over_rewardbench_v2(rewardbench_data.copy())
157
+
158
+ # Prepare RBv1 scores for merging
159
+ rb_v1_scores_to_merge = rb_orig_snapshot[['Model', 'Score']].copy()
160
+
161
+ # strip any " ⚠️" marker from the RBv1 model names (presumably so they match the v2 "model" keys used in the merge below)
162
+ rb_v1_scores_to_merge["Model"] = rb_v1_scores_to_merge["Model"].str.replace(" ⚠️", "", regex=False)
163
+
164
+ rb_v1_scores_to_merge.rename(columns={'Score': 'RBv1'}, inplace=True)
165
+ # rename rb_v1 "Model" to "model"
166
+ rb_v1_scores_to_merge.rename(columns={'Model': 'model'}, inplace=True)
167
+
168
+ # Merge RBv1 scores into the v2 data
169
+ rewardbench_data_avg = pd.merge(rewardbench_data_avg_intermediate, rb_v1_scores_to_merge, on='model', how='left')
170
+
171
+ # Drop any models with only RBv1 scores and no v2 scores
172
+ rewardbench_data_avg = rewardbench_data_avg.dropna(subset=['average'])
173
+
174
+ # Sort by the v2 average
175
+ rewardbench_data_avg = rewardbench_data_avg.sort_values(by="average", ascending=False)
176
+
177
 
178
  # add count column to all dataframes
179
  rewardbench_data = prep_df(rewardbench_data)
180
  rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
181
 
182
+ # Ensure RBv1 is the last column if it's not already (merge usually places it at the end of non-key columns)
183
+ # If 'RBv1' is present and not last, move it to be the last column.
184
+ if 'RBv1' in rewardbench_data_avg.columns:
185
+ rbv1_col = rewardbench_data_avg.pop('RBv1')
186
+ rewardbench_data_avg['RBv1'] = rbv1_col
187
 
188
  col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
189
  col_types_rewardbench_v1 = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rb_orig_snapshot.columns) - 1)
190
 
191
+ # import ipdb; ipdb.set_trace()
192
+
193
  ###########################################
194
  # Leaderboard Helpers & Setting #
195
  ###########################################
 
281
  update_scores = True
282
  # remove the column "Prior Sets (0.5 weight)" from the outputted table
283
  dataframe = dataframe.drop(columns=["Prior Sets (0.5 weight)"])
284
+ if "RBv1" not in filter_button and "RBv1" in dataframe.columns:
285
+ # remove the column "RBv1" from the outputted table when "RBv1" is not among the selected filters
286
+ dataframe = dataframe.drop(columns=["RBv1"])
287
  if "Seq. Classifiers" not in filter_button:
288
  dataframe = dataframe[~dataframe["Model Type"].str.contains("Seq. Classifier", case=False, na=False)]
289
  if "DPO" not in filter_button:
 
382
  show_label=False,
383
  )
384
  model_types_1 = gr.CheckboxGroup(
385
+ ["Seq. Classifiers", "Custom Classifiers", "Generative", "RBv1"],
386
  value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
387
  label="Model Types",
388
  show_label=False,