Spaces:
Running
Running
root
committed on
Commit
·
c1765cf
1
Parent(s):
99399ee
fix
Browse files
app.py
CHANGED
@@ -33,16 +33,6 @@ repo = snapshot_download(
|
|
33 |
)
|
34 |
|
35 |
def avg_over_rewardbench_v2(dataframe_core):
|
36 |
-
"""
|
37 |
-
Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
|
38 |
-
|
39 |
-
We average over 4 core sections (per prompt weighting):
|
40 |
-
1. Chat: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
|
41 |
-
2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
|
42 |
-
3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
|
43 |
-
4. Reasoning: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
|
44 |
-
5. Prior Sets (0.5 weight): Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
|
45 |
-
"""
|
46 |
domain_cols = ['factuality', 'coconot/safety', 'math', 'precise instruction following']
|
47 |
new_df = dataframe_core.copy()
|
48 |
|
@@ -165,8 +155,8 @@ def length_bias_check(dataframe):
|
|
165 |
|
166 |
|
167 |
rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
|
168 |
-
rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
|
169 |
-
prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
|
170 |
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
|
171 |
|
172 |
rewardbench_data_avg = avg_over_rewardbenc_v2(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)
|
@@ -193,7 +183,7 @@ rewardbench_data = prep_df(rewardbench_data)
|
|
193 |
rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
|
194 |
# adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others
|
195 |
|
196 |
-
rewardbench_data_length = prep_df(rewardbench_data_length)
|
197 |
prefs_data = prep_df(prefs_data)
|
198 |
|
199 |
col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
|
|
|
33 |
)
|
34 |
|
35 |
def avg_over_rewardbench_v2(dataframe_core):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
domain_cols = ['factuality', 'coconot/safety', 'math', 'precise instruction following']
|
37 |
new_df = dataframe_core.copy()
|
38 |
|
|
|
155 |
|
156 |
|
157 |
rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
|
158 |
+
# rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
|
159 |
+
# prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
|
160 |
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
|
161 |
|
162 |
rewardbench_data_avg = avg_over_rewardbenc_v2(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)
|
|
|
183 |
rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
|
184 |
# adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others
|
185 |
|
186 |
+
# rewardbench_data_length = prep_df(rewardbench_data_length)
|
187 |
prefs_data = prep_df(prefs_data)
|
188 |
|
189 |
col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
|