rahulnair23 committed · Commit 18e32a8 · 1 Parent(s): ea18bb3

synthetic cases

Changed files:
- app.py +100 -191
- assets/synth.md +3 -0
- executors.py +272 -0
- selfrank/algos/triplet.py +3 -2
app.py CHANGED
@@ -1,15 +1,7 @@
 import gradio as gr
-import pandas as pd
-import numpy as np
-from rouge_score import rouge_scorer
-from joblib import Parallel, delayed
-from selfrank.algos.greedy import SelfRankGreedy
-from selfrank.algos.iterative import SelfRank
-from selfrank.algos.baseline import MCARank
-from selfrank.algos.triplet import equality, rouge
-import matplotlib.pyplot as plt
-from itertools import zip_longest
+from executors import benchmark_executor, synth_executor
 
+from gradio_rangeslider import RangeSlider
 class UI:
 
     def __init__(self):
@@ -32,36 +24,37 @@ class UI:
         gr.Markdown(
             """Using inference data gathered from [HELM](https://crfm.stanford.edu/helm/classic/latest/) we first show how our estimated rankings compare to rankings derived from using ground-truth or reference data."""
         )
+
         with gr.Column(variant="compact"):
             self.data = gr.Dropdown(
-
-
-
-
-
-
-
+                choices=["CNN/DM", "XSUM", "MMLU"],
+                multiselect=False,
+                value="CNN/DM",
+                label="Choose a dataset.",
+                info="The dataset describes a specific task, either summarization (CNN/DM, XSUM) or multiple choice (MMLU).",
+                interactive=True,
+            )
             self.mmlu = gr.Dropdown(visible=False)
             self.evaluation = gr.Dropdown(
-
-
-
-
-
-
-
+                choices=["Rouge", "Equality"],
+                multiselect=False,
+                value="Rouge",
+                interactive=True,
+                label="Evaluation function",
+                info="How should the Judge model decide the winner? Demo limited to use 'Rouge' for generative tasks like summarization, and 'equality' for multiple choice or classification tasks. In practice you can use any function that compares judge responses to the contestant models.",
+            )
 
         def update_mmlu(v):
             if v == "MMLU":
                 return gr.Dropdown(
-
-
-
-
-
-
-
-
+                    choices=list(['abstract_algebra', 'college_chemistry', 'computer_security', 'econometrics', 'us_foreign_policy']),
+                    value='us_foreign_policy',
+                    multiselect=False,
+                    label="Choose MMLU subject.",
+                    info="MMLU subject area.",
+                    interactive=True,
+                    visible=True,
+                ), gr.Dropdown(choices=['Equality'], value='Equality')
             else:
                 return gr.Dropdown(visible=False), gr.Dropdown(choices=['Rouge'], value='Rouge')
 
@@ -69,30 +62,31 @@ class UI:
 
 
             self.nmodels = gr.Dropdown(
-
-
-
-
-
-
+                choices=["All", 10, 20, 30],
+                label="Number of models",
+                info="Sample a subset of LLMs to rank.",
+                value=10,
+                interactive=True,
+            )
            self.nrows = gr.Dropdown(
-
-
-
-
-
-
+                choices=["All", 10, 20, 30],
+                label="Number of instances",
+                info="Sample a subset of instances to evaluate (smaller is faster).",
+                value=10,
+                interactive=True,
+            )
            self.method = gr.Dropdown(
-
-
-
-
-
-
+                choices=["Greedy", "Full"],
+                label="Algorithm variant to use",
+                info="Choose from one of two variants. 'Full' (FTR in the paper) runs all triplet combinations, recommended when evaluations are cheap or for smaller datasets, or 'greedy' (GTR) a faster variant suggested for more complex evaluations.",
+                value="Full",
+                interactive=True,
+            )
             self.btn_execute = gr.Button("Run")
 
     def output_panel(self):
         """Plots/leaderboard/bump charts"""
+
         with gr.Column(variant="default"):
             gr.Markdown("""<h2 style='color: purple;'> Estimated ranking </h2> """)
             self.leaderboard = gr.DataFrame(headers=["rank", "model"],
@@ -110,8 +104,47 @@ class UI:
         """Synthetic data experiments"""
         gr.Markdown("<br>")
         gr.Markdown("---")
-
-
+        with open("assets/synth.md", "r") as f:
+            content = f.read()
+
+        gr.Markdown(content)
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                with gr.Column(variant='compact'):
+                    self.synth_range = RangeSlider(10, 100, value=(50, 90), step=1, label="Model Accuracy Range (%)", interactive=True)
+                    self.synth_nmodels = gr.Slider(3, 50, value=10, step=1, label="Number of models to synthesise.", info="Equally spaced in the accuracy range.", interactive=True)
+                    self.synth_nanswers = gr.Slider(2, 50, value=10, step=1, label="Number of possible (discrete) answers per prompt.", interactive=True)
+                    self.synth_nquestions = gr.Slider(10, 100, step=10, label="Number of prompts to simulate.", interactive=True)
+                    self.synth_noise = gr.Slider(0, 1, value=0, label='Noise in evaluation (p)', info="Evaluation function decisions flipped with probability p. p=0 implies no noise.", interactive=True)
+                    self.synth_method = gr.Dropdown(
+                        choices=["Greedy", "Full"],
+                        label="Algorithm variant to use",
+                        info="Choose from one of two variants. 'Full' (FTR in the paper) runs all triplet combinations, recommended when evaluations are cheap or for smaller datasets, or 'greedy' (GTR) a faster variant suggested for more complex evaluations.",
+                        value="Full",
+                        interactive=True,
+                    )
+
+                    examples = gr.Examples([[(10, 30), 10, 10, 10, 0, "Full"],
+                                            [(10, 30), 10, 10, 10, 0.5, "Full"],
+                                            [[10, 30], 10, 2, 10, 0, "Full"]],
+                                           [self.synth_range, self.synth_nmodels, self.synth_nanswers, self.synth_nquestions, self.synth_noise, self.synth_method],
+                                           label='Some interesting cases (click and run)', example_labels=["Rankings recovered for low accuracy models",
+                                                                                                           "Robust recovery when evaluations have noise",
+                                                                                                           "Binary outcomes are challenging"
+                                                                                                           ] )
+
+                    self.synth_execute = gr.Button("Run")
+            with gr.Column(scale=1):
+                with gr.Column(variant="default"):
+                    gr.Markdown(
+                        """<h2 style='color: purple;'> Estimated vs. true ranking </h2> """
+                    )
+
+                    self.synth_bumpchart = gr.Image()
+            with gr.Column(scale=1):
+                self.synth_eval_metrics = gr.Markdown()
+
 
     def byod_panel(self):
         """Instructions panel"""
@@ -141,12 +174,12 @@ class UI:
             # Output panel/leaderboard
             self.output_panel()
 
-
+            self.synth_panel()
             self.byod_panel()
 
             # Register event listeners
             self.btn_execute.click(
-                fn=
+                fn=benchmark_executor,
                 inputs=[
                     self.data,
                     self.mmlu,
@@ -157,145 +190,21 @@ class UI:
                 ],
                 outputs=[self.leaderboard, self.bumpchart, self.eval_metrics],
             )
-
-
-
-
-
-
-
-
-
-
-
-
-            match data:
-                case "MMLU":
-                    adf = pd.read_pickle(f"data/mmlu_subject_{mmlu_subject}.pkl")
-
-                case "CNN/DM":
-                    adf = pd.read_pickle(f"data/cnndm.pkl")
-
-                case "XSUM":
-                    adf = pd.read_pickle(f"data/xsum.pkl")
-
-                case _:
-                    raise ValueError(f"'{data}' not understood.")
-
-            MODELS = adf.model.unique()
-
-            # Sample fewer models if so needed
-            if nmodels != "All":
-                if nmodels < len(MODELS):
-
-                    MODELS = np.random.choice(MODELS, nmodels, replace=False).tolist()
-                    adf = adf[adf.model.isin(MODELS)]
-
-            match data:
-                case "MMLU":
-                    keys = [
-                        "id",
-                        "trial_id",
-                        "perturbation",
-                    ] # MMLU has this extra parameter
-                case "CNN/DM" | "XSUM":
-                    keys = ["id", "trial_id"]
-                case _:
-                    pass
-
-            df = adf.pivot_table(
-                columns="model",
-                index=keys,
-                values="output",
-                aggfunc="first",
-            )
-
-            # Filter by number of rows
-            df.dropna(inplace=True)
-            if nrows != "All":
-                if nrows < df.shape[0]:
-                    df = df.sample(nrows, random_state=seed)
-
-            # Compute true ranking
-            adf = adf.set_index(keys).loc[df.index].reset_index()
-
-            if evaluation == "Rouge":
-
-                def __true_rouge(x, scorer):
-                    return scorer.score(x["reference"], x["output"])["rouge2"].fmeasure
-
-                scorer = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True)
-                adf["rouge"] = Parallel(n_jobs=-1, batch_size=128)(
-                    delayed(__true_rouge)(i, scorer) for _, i in adf.iterrows()
-                )
-
-                # Method 2 - look at "win rates" - for each question, see which model
-                # wins (i.e. has the best ROUGE score)
-                idx = adf.groupby(["id", "trial_id"])["rouge"].idxmax()
-                win_rates = adf.loc[idx].model.value_counts()
-                win_rate_rank = win_rates.index.tolist()
-
-                # include models with nowins at the bottom
-                no_wins = list(set(MODELS) - set(win_rate_rank))
-                true_ranking = win_rate_rank + no_wins
-                evaluator = rouge
-
-            elif evaluation == "Equality":
-
-                # Compute the true ranking (multiple choice - so use equality between
-                # LLM response and reference-value)
-                adf["C"] = (adf.output == adf.reference).astype(int)
-                true_ranking = (
-                    adf.groupby("model")["C"]
-                    .apply(lambda x: sum(x) / len(x))
-                    .sort_values(ascending=False)
-                    .index.tolist()
-                )
-
-
-            else:
-                raise ValueError(f"'{evaluation}' not understood.")
-
-            match method:
-                case "Full":
-                    ranker = SelfRank(MODELS, evaluator, true_ranking)
-
-                case "Greedy":
-                    ranker = SelfRankGreedy(MODELS, evaluator, true_ranking)
-
-                case "MCA":
-                    raise NotImplementedError
-                case _:
-                    raise ValueError(f"'{method}' not understood.")
-
-            # generate outputs
-            ranker.fit(df)
-            ranks = ranker.ranking
-
-            ranks = [
-                j + i for i, j in zip_longest(ranks, ["🥇 ", "🥈 ", "🥉 "], fillvalue="")
-            ]
-            out_df = pd.DataFrame({"rank": range(1, len(true_ranking) + 1), "model": ranks})
-
-            out_metrics = {
-                "rbo": ranker.measure(metric="rbo"),
-                "map-1": ranker.measure(metric="mapk", k=1),
-                "map-3": ranker.measure(metric="mapk", k=3),
-                "map-5": ranker.measure(metric="mapk", k=5),
-                "map-10": ranker.measure(metric="mapk", k=10),
-                "evaluations": evaluator.calls,
-            }
-            eval_metrics = (
-                f"<h2> Evaluation measures </h2>"
-                f"Rank-Biased Overlap: {out_metrics['rbo']:0.3f}<br>"
-                f"MAP-3 : {out_metrics['map-3']:0.3f}<br>"
-                f"MAP-5 : {out_metrics['map-5']:0.3f}<br>"
-                f"MAP-10 : {out_metrics['map-10']: 0.3f}."
-            )
-
-            out_plot = ranker.plot()
+            self.synth_execute.click(
+                fn=synth_executor,
+                inputs=[
+                    self.synth_range,
+                    self.synth_nmodels,
+                    self.synth_nanswers,
+                    self.synth_nquestions,
+                    self.synth_noise,
+                    self.synth_method,
+                ],
+                outputs=[self.synth_bumpchart, self.synth_eval_metrics],
+            )
+        return demo
 
-
+
 
     def run(self):
         self.ui = self.layout()
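The synthetic panel above introduces the third-party gradio_rangeslider component and wires a Run button to an executor function. A minimal, self-contained sketch of that wiring pattern, assuming the same component arguments used in this commit (the callback and variable names here are illustrative, not app.py itself):

```python
# Minimal sketch of the RangeSlider + button wiring used above (illustrative, not app.py).
import gradio as gr
from gradio_rangeslider import RangeSlider


def run_synth(acc_range, nmodels):
    lo, hi = acc_range  # RangeSlider delivers a (low, high) tuple to the callback
    return f"Would synthesise {nmodels} models with accuracy in [{lo}, {hi}]%."


with gr.Blocks() as demo:
    acc = RangeSlider(10, 100, value=(50, 90), step=1, label="Model Accuracy Range (%)")
    nmodels = gr.Slider(3, 50, value=10, step=1, label="Number of models")
    out = gr.Markdown()
    gr.Button("Run").click(fn=run_synth, inputs=[acc, nmodels], outputs=[out])

if __name__ == "__main__":
    demo.launch()
```

The tuple-valued slider is why synth_executor below unpacks its first argument as `min_acc, max_acc = acc_range`.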
assets/synth.md ADDED
@@ -0,0 +1,3 @@
<h1 style='color: purple;'>Synthetic multiple choice </h1>

To analyse our methods, we synthesise data from models with known accuracy in a multiple-choice setting, i.e. a discrete set of possible responses. Several parameters (number of models, model accuracy, number of prompts, number of possible answers, and noise in comparisons) can affect the quality of results. Rankings can be recovered for a range of challenging cases, for instance when the accuracy of the underlying models is low or when the evaluation function is noisy and imperfect. When the number of possible answers is low, for example in binary-choice settings, recovering rankings becomes challenging: in general, low variance in wrong answers causes triplet evaluations to treat a wrong answer as the right one.
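As a rough illustration of the setup described above (a sketch only, not the Space's own generate_data; the function and parameter names are assumptions): each synthetic model answers every prompt, matching the ground truth on roughly its target fraction of prompts and picking some other answer elsewhere.

```python
# Sketch only: synthesise multiple-choice answers for models with known accuracy.
# Names (synth_answers, n_questions, ...) are illustrative, not the repo's API.
import numpy as np


def synth_answers(accuracies, n_questions=100, n_answers=10, seed=0):
    rng = np.random.default_rng(seed)
    truth = rng.integers(n_answers, size=n_questions)      # ground-truth choice per prompt
    answers = {}
    for m, acc in enumerate(accuracies, start=1):
        correct = rng.random(n_questions) < acc             # prompts this model gets right
        # shift by 1..n_answers-1 so a "wrong" answer never equals the truth
        wrong = (truth + rng.integers(1, n_answers, size=n_questions)) % n_answers
        answers[f"M{m}"] = np.where(correct, truth, wrong)
    return truth, answers


truth, answers = synth_answers([0.9, 0.7, 0.5], n_questions=200, n_answers=4)
print({m: round(float((a == truth).mean()), 2) for m, a in answers.items()})
```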
executors.py ADDED
@@ -0,0 +1,272 @@

import pandas as pd
import numpy as np
from rouge_score import rouge_scorer
from joblib import Parallel, delayed
from selfrank.algos.greedy import SelfRankGreedy
from selfrank.algos.iterative import SelfRank
from selfrank.algos.baseline import MCARank
from selfrank.algos.triplet import equality, rouge, noisy_equality
import matplotlib.pyplot as plt
from itertools import zip_longest
from uuid import uuid4
import csv, os
from functools import partial


def generate_data(max_acc, min_acc, nmodels, nanswers, nquestions) -> tuple[pd.DataFrame, list]:

    np.random.seed(42)
    # Spread model accuracies between min and max
    model_acc = np.linspace(max_acc, min_acc, nmodels)

    gt_and_model_ans = np.zeros(
        (nquestions, nmodels + 1), dtype=int
    ) # array to store ground truth and model ans

    # Create ground truth answers i.e. first column
    for i in range(nquestions):
        gt_and_model_ans[i][0] = np.random.randint(nanswers)

    for i in range(0, nmodels):
        no_of_entries_frm_gt = np.ceil(model_acc[i] / 100 * (nquestions)).astype(int)
        # print(no_of_entries_frm_gt)
        offsets_to_match = np.random.permutation(nquestions)[0:no_of_entries_frm_gt]
        # print(offsets_to_match)
        for j in range(nquestions):
            if j in offsets_to_match:
                gt_and_model_ans[j][i + 1] = gt_and_model_ans[j][0]
            else:
                lst_wo_gt = list(range(nanswers))
                lst_wo_gt.remove(gt_and_model_ans[j][0])
                gt_and_model_ans[j][i + 1] = lst_wo_gt[np.random.randint(nanswers - 1)]

    # print(gt_and_model_ans)
    filename = str(uuid4())

    fields = ["GT"]
    for i in range(nmodels):
        fields.append("M" + str(i + 1))

    # writing to csv file
    with open(filename, "w") as csvfile:
        # creating a csv writer object
        csvwriter = csv.writer(csvfile)

        # writing the fields
        csvwriter.writerow(fields)

        # writing the data rows
        csvwriter.writerows(gt_and_model_ans)

    df = pd.read_csv(filename)
    os.remove(filename)

    true_ranking = [f"M{i}" for i in range(1, nmodels + 1)]

    return df, true_ranking

def synth_executor(acc_range: tuple[float, float], nmodels, nanswers, nquestions, noise, method) -> tuple[str, dict]:

    min_acc, max_acc = acc_range
    df, true_ranking = generate_data(max_acc, min_acc, nmodels, nanswers, nquestions)

    if noise == 0.:
        comp = equality
    else:
        comp = partial(noisy_equality, p=noise)

    df = df.drop(columns=["GT"])
    MODELS = df.columns.tolist()

    if method == "Full":
        ranker = SelfRank(MODELS, comp, true_ranking)
        ranker.fit(df)

        # outputs of interest
        out = {
            "true_ranking": true_ranking,
            "estimated_ranking": ranker.ranking,
            "rbo": ranker.measure(metric="rbo"),
            "map-1": ranker.measure(metric='mapk', k=1),
            "map-3": ranker.measure(metric='mapk', k=3),
            "map-5": ranker.measure(metric='mapk', k=5),
            "map-10": ranker.measure(metric='mapk', k=10)
        }

    elif method == "Greedy":
        ranker = SelfRankGreedy(MODELS, comp, true_ranking)
        ranker.fit(df)
        out = {
            "true_ranking": true_ranking,
            "estimated_ranking": ranker.ranking,
            "rbo": ranker.measure(metric="rbo"),
            "map-1": ranker.measure(metric='mapk', k=1),
            "map-3": ranker.measure(metric='mapk', k=3),
            "map-5": ranker.measure(metric='mapk', k=5),
            "map-10": ranker.measure(metric='mapk', k=10)
        }
    elif method == 'MCA':
        ranker = MCARank(MODELS, comp, true_ranking)
        ranker.fit(df, measure='noisy_equality', p=noise)
        out = {
            "true_ranking": true_ranking,
            "estimated_ranking": ranker.ranking,
            "rbo": ranker.measure(metric="rbo"),
            "map-1": ranker.measure(metric='mapk', k=1),
            "map-3": ranker.measure(metric='mapk', k=3),
            "map-5": ranker.measure(metric='mapk', k=5),
            "map-10": ranker.measure(metric='mapk', k=10)
        }
    else:
        raise ValueError(f"{method} not understood.")

    eval_metrics = (
        f"<h2 style='color: purple;'> Evaluation measures </h2>"
        f"Rank-Biased Overlap: {out['rbo']:0.3f}<br>"
        f"MAP-3 : {out['map-3']:0.3f}<br>"
        f"MAP-5 : {out['map-5']:0.3f}<br>"
        f"MAP-10 : {out['map-10']: 0.3f}."
    )

    out_plot = ranker.plot("synth")

    return "synth.png", eval_metrics



def benchmark_executor(data, mmlu_subject, evaluation, nmodels, nrows, method
) -> tuple[pd.DataFrame, plt.figure]:
    """Main execution flow for benchmarks"""

    # gr.Info(f"Loaded run config: {data}, {evaluation}, {nmodels}.")
    seed = 40
    np.random.seed(seed)

    match data:
        case "MMLU":
            adf = pd.read_pickle(f"data/mmlu_subject_{mmlu_subject}.pkl")

        case "CNN/DM":
            adf = pd.read_pickle(f"data/cnndm.pkl")

        case "XSUM":
            adf = pd.read_pickle(f"data/xsum.pkl")

        case _:
            raise ValueError(f"'{data}' not understood.")

    MODELS = adf.model.unique()

    # Sample fewer models if so needed
    if nmodels != "All":
        if nmodels < len(MODELS):

            MODELS = np.random.choice(MODELS, nmodels, replace=False).tolist()
            adf = adf[adf.model.isin(MODELS)]

    match data:
        case "MMLU":
            keys = [
                "id",
                "trial_id",
                "perturbation",
            ] # MMLU has this extra parameter
        case "CNN/DM" | "XSUM":
            keys = ["id", "trial_id"]
        case _:
            pass

    df = adf.pivot_table(
        columns="model",
        index=keys,
        values="output",
        aggfunc="first",
    )

    # Filter by number of rows
    df.dropna(inplace=True)
    if nrows != "All":
        if nrows < df.shape[0]:
            df = df.sample(nrows, random_state=seed)

    # Compute true ranking
    adf = adf.set_index(keys).loc[df.index].reset_index()

    if evaluation == "Rouge":

        def __true_rouge(x, scorer):
            return scorer.score(x["reference"], x["output"])["rouge2"].fmeasure

        scorer = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True)
        adf["rouge"] = Parallel(n_jobs=-1, batch_size=128)(
            delayed(__true_rouge)(i, scorer) for _, i in adf.iterrows()
        )

        # Method 2 - look at "win rates" - for each question, see which model
        # wins (i.e. has the best ROUGE score)
        idx = adf.groupby(["id", "trial_id"])["rouge"].idxmax()
        win_rates = adf.loc[idx].model.value_counts()
        win_rate_rank = win_rates.index.tolist()

        # include models with nowins at the bottom
        no_wins = list(set(MODELS) - set(win_rate_rank))
        true_ranking = win_rate_rank + no_wins
        evaluator = rouge

    elif evaluation == "Equality":

        # Compute the true ranking (multiple choice - so use equality between
        # LLM response and reference-value)
        adf["C"] = (adf.output == adf.reference).astype(int)
        true_ranking = (
            adf.groupby("model")["C"]
            .apply(lambda x: sum(x) / len(x))
            .sort_values(ascending=False)
            .index.tolist()
        )
        evaluator = equality

    else:
        raise ValueError(f"'{evaluation}' not understood.")

    match method:
        case "Full":
            ranker = SelfRank(MODELS, evaluator, true_ranking)

        case "Greedy":
            ranker = SelfRankGreedy(MODELS, evaluator, true_ranking)

        case "MCA":
            raise NotImplementedError
        case _:
            raise ValueError(f"'{method}' not understood.")

    # generate outputs
    ranker.fit(df)
    ranks = ranker.ranking

    ranks = [
        j + i for i, j in zip_longest(ranks, ["🥇 ", "🥈 ", "🥉 "], fillvalue="")
    ]
    out_df = pd.DataFrame({"rank": range(1, len(true_ranking) + 1), "model": ranks})

    out_metrics = {
        "rbo": ranker.measure(metric="rbo"),
        "map-1": ranker.measure(metric="mapk", k=1),
        "map-3": ranker.measure(metric="mapk", k=3),
        "map-5": ranker.measure(metric="mapk", k=5),
        "map-10": ranker.measure(metric="mapk", k=10),
        "evaluations": evaluator.calls,
    }
    eval_metrics = (
        f"<h2 style='color: purple;'> Evaluation measures </h2>"
        f"Rank-Biased Overlap: {out_metrics['rbo']:0.3f}<br>"
        f"MAP-3 : {out_metrics['map-3']:0.3f}<br>"
        f"MAP-5 : {out_metrics['map-5']:0.3f}<br>"
        f"MAP-10 : {out_metrics['map-10']: 0.3f}."
    )

    out_plot = ranker.plot()

    return out_df, "output.png", eval_metrics
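For orientation, a small driver showing how the two executors above could be called outside Gradio. The argument order follows the signatures in this file and the synthetic arguments mirror the demo's first example case; it assumes the Space's selfrank package and data/ pickles are available locally.

```python
# Sketch: invoking the executors directly, mirroring the Gradio callbacks.
# Assumes the Space's selfrank package and data/ pickles are available locally.
from executors import benchmark_executor, synth_executor

# Synthetic case from the demo examples: models spread over 10-30% accuracy,
# 10 models, 10 possible answers, 10 prompts, no evaluation noise, full triplet ranking.
image_path, metrics_md = synth_executor((10, 30), 10, 10, 10, 0.0, "Full")
print(image_path)   # "synth.png", produced via ranker.plot("synth")
print(metrics_md)   # HTML summary with RBO and MAP@k

# Benchmark case (requires data/cnndm.pkl from the Space):
# leaderboard, bump_path, metrics_md = benchmark_executor("CNN/DM", None, "Rouge", 10, 10, "Full")
```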
selfrank/algos/triplet.py CHANGED
@@ -9,6 +9,7 @@ import logging
 from .plots import bcolors
 import random
 
+
 logger = logging.getLogger(__name__)
 
 # Local only for now
@@ -114,7 +115,7 @@ def equality(a: str, b:str, c:str, df:pd.DataFrame) -> int:
 
     simple heuristic as the answers are multiple choice, so use equality.
     """
-
+
     ties = df[a] == df[b]
     a_wins = sum((df[a] == df[c]) & ~(ties))
     b_wins = sum((df[b] == df[c]) & ~(ties))
@@ -132,7 +133,7 @@ def noisy_equality(a: str, b:str, c:str, df:pd.DataFrame, p: float) -> int:
     noisy version of equality - where evaluations are flipped independently with
     probability p (p=1 will always flip, p=0, will never)
     """
-
+    random.seed(42)
    perturb = lambda x: not x if (random.random() <= p) else x
 
    ties = (df[a] == df[b])
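The only functional change here is seeding the RNG so that noisy comparisons are reproducible across runs. A short sketch of how these comparators are used as triplet evaluators, mirroring the partial(noisy_equality, p=...) construction in executors.py (the toy DataFrame and column names are illustrative):

```python
# Sketch: building the triplet comparators, as executors.py does.
# The toy DataFrame and the column names M1-M3 are illustrative.
from functools import partial

import pandas as pd

from selfrank.algos.triplet import equality, noisy_equality

# one row per prompt, one column of (discrete) answers per model
df = pd.DataFrame({"M1": [0, 1, 2, 3], "M2": [0, 1, 2, 0], "M3": [0, 0, 2, 3]})

comp = equality                          # noise-free comparisons
noisy = partial(noisy_equality, p=0.2)   # each decision flipped with probability 0.2

print(comp("M1", "M2", "M3", df))        # judge M3 decides whether M1 or M2 wins
print(noisy("M1", "M2", "M3", df))       # reproducible across runs thanks to random.seed(42)
```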