root committed on
Commit c1765cf · 1 Parent(s): 99399ee
Files changed (1)
  1. app.py +3 -13
app.py CHANGED
@@ -33,16 +33,6 @@ repo = snapshot_download(
 )
 
 def avg_over_rewardbench_v2(dataframe_core):
-    """
-    Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
-
-    We average over 4 core sections (per prompt weighting):
-    1. Chat: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
-    2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
-    3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
-    4. Reasoning: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
-    5. Prior Sets (0.5 weight): Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
-    """
     domain_cols = ['factuality', 'coconot/safety', 'math', 'precise instruction following']
     new_df = dataframe_core.copy()
 
@@ -165,8 +155,8 @@ def length_bias_check(dataframe):
 
 
 rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
-rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
-prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
+# rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
+# prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
 # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
 
 rewardbench_data_avg = avg_over_rewardbenc_v2(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)
@@ -193,7 +183,7 @@ rewardbench_data = prep_df(rewardbench_data)
 rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
 # adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others
 
-rewardbench_data_length = prep_df(rewardbench_data_length)
+# rewardbench_data_length = prep_df(rewardbench_data_length)
 prefs_data = prep_df(prefs_data)
 
 col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
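The docstring deleted in the first hunk described the RewardBench v1 sections (Chat, Chat Hard, Safety, Reasoning, Prior Sets), which appear stale next to the v2 domain columns the function's context lines name. The function body is not part of this diff, so the following is only a minimal sketch of an equal-weight per-domain average over those columns; the helper name and the mean are assumptions, not the app's actual implementation.

import pandas as pd

def avg_over_domains(dataframe_core: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical sketch only: equal-weight mean of the four v2 domain
    # columns named in the diff's context lines. The real avg_over_rewardbench_v2
    # body is not shown in this commit.
    domain_cols = ['factuality', 'coconot/safety', 'math', 'precise instruction following']
    new_df = dataframe_core.copy()
    new_df['average'] = new_df[domain_cols].mean(axis=1)  # assumes numeric scores
    return new_df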
 
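The comment kept at new line 184 ("adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others") matches the weighting scheme the deleted docstring described. As a worked illustration of that scheme only, not the app's code, combining four core sections at weight 1.0 with Prior Sets at 0.5 could look like:

import numpy as np

def overall_score(chat, chat_hard, safety, reasoning, prior_sets):
    # Illustrative only: four core sections at weight 1.0, Prior Sets at 0.5,
    # i.e. (chat + chat_hard + safety + reasoning + 0.5 * prior_sets) / 4.5.
    scores = [chat, chat_hard, safety, reasoning, prior_sets]
    weights = [1.0, 1.0, 1.0, 1.0, 0.5]
    return float(np.average(scores, weights=weights))

As a quick sanity check, when every section score is equal, the weighted mean reduces to that same value.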