rahulnair23 committed
Commit 18e32a8 · 1 Parent(s): ea18bb3

synthetic cases

Files changed (4)
  1. app.py +100 -191
  2. assets/synth.md +3 -0
  3. executors.py +272 -0
  4. selfrank/algos/triplet.py +3 -2
app.py CHANGED
@@ -1,15 +1,7 @@
1
  import gradio as gr
2
- import pandas as pd
3
- import numpy as np
4
- from rouge_score import rouge_scorer
5
- from joblib import Parallel, delayed
6
- from selfrank.algos.greedy import SelfRankGreedy
7
- from selfrank.algos.iterative import SelfRank
8
- from selfrank.algos.baseline import MCARank
9
- from selfrank.algos.triplet import equality, rouge
10
- import matplotlib.pyplot as plt
11
- from itertools import zip_longest
12
 
 
13
  class UI:
14
 
15
  def __init__(self):
@@ -32,36 +24,37 @@ class UI:
32
  gr.Markdown(
33
  """Using inference data gathered from [HELM](https://crfm.stanford.edu/helm/classic/latest/) we first show how our estimated rankings compare to rankings derived from using ground-truth or reference data."""
34
  )
 
35
  with gr.Column(variant="compact"):
36
  self.data = gr.Dropdown(
37
- choices=["CNN/DM", "XSUM", "MMLU"],
38
- multiselect=False,
39
- value="CNN/DM",
40
- label="Choose a dataset.",
41
- info="The dataset describes a specific task, either summarization (CNN/DM, XSUM) or multiple choice (MMLU).",
42
- interactive=True,
43
- )
44
  self.mmlu = gr.Dropdown(visible=False)
45
  self.evaluation = gr.Dropdown(
46
- choices=["Rouge", "Equality"],
47
- multiselect=False,
48
- value="Rouge",
49
- interactive=True,
50
- label="Evaluation function",
51
- info="How should the Judge model decide the winner? Demo limited to use 'Rouge' for generative tasks like summarization, and 'equality' for multiple choice or classification tasks. In practice you can use any function that compares judge responses to the contestant models.",
52
- )
53
 
54
  def update_mmlu(v):
55
  if v == "MMLU":
56
  return gr.Dropdown(
57
- choices=list(['abstract_algebra', 'college_chemistry', 'computer_security', 'econometrics', 'us_foreign_policy']),
58
- value='us_foreign_policy',
59
- multiselect=False,
60
- label="Choose MMLU subject.",
61
- info="MMLU subject area.",
62
- interactive=True,
63
- visible=True,
64
- ), gr.Dropdown(choices=['Equality'], value='Equality')
65
  else:
66
  return gr.Dropdown(visible=False), gr.Dropdown(choices=['Rouge'], value='Rouge')
67
 
@@ -69,30 +62,31 @@ class UI:
69
 
70
 
71
  self.nmodels = gr.Dropdown(
72
- choices=["All", 10, 20, 30],
73
- label="Number of models",
74
- info="Sample a subset of LLMs to rank.",
75
- value=10,
76
- interactive=True,
77
- )
78
  self.nrows = gr.Dropdown(
79
- choices=["All", 10, 20, 30],
80
- label="Number of instances",
81
- info="Sample a subset of instances to evaluate (smaller is faster).",
82
- value=10,
83
- interactive=True,
84
- )
85
  self.method = gr.Dropdown(
86
- choices=["Greedy", "Full"],
87
- label="Algorithm variant to use",
88
- info="Choose from one of two variants. 'Full' (FTR in the paper) runs all triplet combinations, recommended when evaluations are cheap or for smaller datasets, or 'greedy' (GTR) a faster variant suggested for more complex evaluations.",
89
- value="Full",
90
- interactive=True,
91
- )
92
  self.btn_execute = gr.Button("Run")
93
 
94
  def output_panel(self):
95
  """Plots/leaderboard/bump charts"""
 
96
  with gr.Column(variant="default"):
97
  gr.Markdown("""<h2 style='color: purple;'> Estimated ranking </h2> """)
98
  self.leaderboard = gr.DataFrame(headers=["rank", "model"],
@@ -110,8 +104,47 @@ class UI:
110
  """Synthetic data experiments"""
111
  gr.Markdown("<br>")
112
  gr.Markdown("---")
113
- gr.Markdown("""<h1 style='color: purple;'>Synthetic multiple choice </h1> """)
114
- gr.Markdown("Coming soon.")
 
115
 
116
  def byod_panel(self):
117
  """Instructions panel"""
@@ -141,12 +174,12 @@ class UI:
141
  # Output panel/leaderboard
142
  self.output_panel()
143
 
144
- # TODO: self.synth_panel()
145
  self.byod_panel()
146
 
147
  # Register event listeners
148
  self.btn_execute.click(
149
- fn=self.benchmark_executor,
150
  inputs=[
151
  self.data,
152
  self.mmlu,
@@ -157,145 +190,21 @@ class UI:
157
  ],
158
  outputs=[self.leaderboard, self.bumpchart, self.eval_metrics],
159
  )
160
-
161
- return demo
162
-
163
- def benchmark_executor(
164
- self, data, mmlu_subject, evaluation, nmodels, nrows, method
165
- ) -> tuple[pd.DataFrame, plt.figure]:
166
- """Main execution flow for benchmarks"""
167
-
168
- # gr.Info(f"Loaded run config: {data}, {evaluation}, {nmodels}.")
169
- seed = 40
170
- np.random.seed(seed)
171
-
172
- match data:
173
- case "MMLU":
174
- adf = pd.read_pickle(f"data/mmlu_subject_{mmlu_subject}.pkl")
175
-
176
- case "CNN/DM":
177
- adf = pd.read_pickle(f"data/cnndm.pkl")
178
-
179
- case "XSUM":
180
- adf = pd.read_pickle(f"data/xsum.pkl")
181
-
182
- case _:
183
- raise ValueError(f"'{data}' not understood.")
184
-
185
- MODELS = adf.model.unique()
186
-
187
- # Sample fewer models if so needed
188
- if nmodels != "All":
189
- if nmodels < len(MODELS):
190
-
191
- MODELS = np.random.choice(MODELS, nmodels, replace=False).tolist()
192
- adf = adf[adf.model.isin(MODELS)]
193
-
194
- match data:
195
- case "MMLU":
196
- keys = [
197
- "id",
198
- "trial_id",
199
- "perturbation",
200
- ] # MMLU has this extra parameter
201
- case "CNN/DM" | "XSUM":
202
- keys = ["id", "trial_id"]
203
- case _:
204
- pass
205
-
206
- df = adf.pivot_table(
207
- columns="model",
208
- index=keys,
209
- values="output",
210
- aggfunc="first",
211
- )
212
-
213
- # Filter by number of rows
214
- df.dropna(inplace=True)
215
- if nrows != "All":
216
- if nrows < df.shape[0]:
217
- df = df.sample(nrows, random_state=seed)
218
-
219
- # Compute true ranking
220
- adf = adf.set_index(keys).loc[df.index].reset_index()
221
-
222
- if evaluation == "Rouge":
223
-
224
- def __true_rouge(x, scorer):
225
- return scorer.score(x["reference"], x["output"])["rouge2"].fmeasure
226
-
227
- scorer = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True)
228
- adf["rouge"] = Parallel(n_jobs=-1, batch_size=128)(
229
- delayed(__true_rouge)(i, scorer) for _, i in adf.iterrows()
230
- )
231
-
232
- # Method 2 - look at "win rates" - for each question, see which model
233
- # wins (i.e. has the best ROUGE score)
234
- idx = adf.groupby(["id", "trial_id"])["rouge"].idxmax()
235
- win_rates = adf.loc[idx].model.value_counts()
236
- win_rate_rank = win_rates.index.tolist()
237
-
238
- # include models with nowins at the bottom
239
- no_wins = list(set(MODELS) - set(win_rate_rank))
240
- true_ranking = win_rate_rank + no_wins
241
- evaluator = rouge
242
-
243
- elif evaluation == "Equality":
244
-
245
- # Compute the true ranking (multiple choice - so use equality between
246
- # LLM response and reference-value)
247
- adf["C"] = (adf.output == adf.reference).astype(int)
248
- true_ranking = (
249
- adf.groupby("model")["C"]
250
- .apply(lambda x: sum(x) / len(x))
251
- .sort_values(ascending=False)
252
- .index.tolist()
253
  )
254
- evaluator = equality
255
-
256
- else:
257
- raise ValueError(f"'{evaluation}' not understood.")
258
-
259
- match method:
260
- case "Full":
261
- ranker = SelfRank(MODELS, evaluator, true_ranking)
262
-
263
- case "Greedy":
264
- ranker = SelfRankGreedy(MODELS, evaluator, true_ranking)
265
-
266
- case "MCA":
267
- raise NotImplementedError
268
- case _:
269
- raise ValueError(f"'{method}' not understood.")
270
-
271
- # generate outputs
272
- ranker.fit(df)
273
- ranks = ranker.ranking
274
-
275
- ranks = [
276
- j + i for i, j in zip_longest(ranks, ["🥇 ", "🥈 ", "🥉 "], fillvalue="")
277
- ]
278
- out_df = pd.DataFrame({"rank": range(1, len(true_ranking) + 1), "model": ranks})
279
-
280
- out_metrics = {
281
- "rbo": ranker.measure(metric="rbo"),
282
- "map-1": ranker.measure(metric="mapk", k=1),
283
- "map-3": ranker.measure(metric="mapk", k=3),
284
- "map-5": ranker.measure(metric="mapk", k=5),
285
- "map-10": ranker.measure(metric="mapk", k=10),
286
- "evaluations": evaluator.calls,
287
- }
288
- eval_metrics = (
289
- f"<h2> Evaluation measures </h2>"
290
- f"Rank-Biased Overlap: {out_metrics['rbo']:0.3f}<br>"
291
- f"MAP-3 : {out_metrics['map-3']:0.3f}<br>"
292
- f"MAP-5 : {out_metrics['map-5']:0.3f}<br>"
293
- f"MAP-10 : {out_metrics['map-10']: 0.3f}."
294
- )
295
-
296
- out_plot = ranker.plot()
297
 
298
- return out_df, "output.png", eval_metrics
299
 
300
  def run(self):
301
  self.ui = self.layout()
 
1
  import gradio as gr
2
+ from executors import benchmark_executor, synth_executor
 
3
 
4
+ from gradio_rangeslider import RangeSlider
5
  class UI:
6
 
7
  def __init__(self):
 
24
  gr.Markdown(
25
  """Using inference data gathered from [HELM](https://crfm.stanford.edu/helm/classic/latest/) we first show how our estimated rankings compare to rankings derived from using ground-truth or reference data."""
26
  )
27
+
28
  with gr.Column(variant="compact"):
29
  self.data = gr.Dropdown(
30
+ choices=["CNN/DM", "XSUM", "MMLU"],
31
+ multiselect=False,
32
+ value="CNN/DM",
33
+ label="Choose a dataset.",
34
+ info="The dataset describes a specific task, either summarization (CNN/DM, XSUM) or multiple choice (MMLU).",
35
+ interactive=True,
36
+ )
37
  self.mmlu = gr.Dropdown(visible=False)
38
  self.evaluation = gr.Dropdown(
39
+ choices=["Rouge", "Equality"],
40
+ multiselect=False,
41
+ value="Rouge",
42
+ interactive=True,
43
+ label="Evaluation function",
44
+ info="How should the Judge model decide the winner? Demo limited to use 'Rouge' for generative tasks like summarization, and 'equality' for multiple choice or classification tasks. In practice you can use any function that compares judge responses to the contestant models.",
45
+ )
46
 
47
  def update_mmlu(v):
48
  if v == "MMLU":
49
  return gr.Dropdown(
50
+ choices=list(['abstract_algebra', 'college_chemistry', 'computer_security', 'econometrics', 'us_foreign_policy']),
51
+ value='us_foreign_policy',
52
+ multiselect=False,
53
+ label="Choose MMLU subject.",
54
+ info="MMLU subject area.",
55
+ interactive=True,
56
+ visible=True,
57
+ ), gr.Dropdown(choices=['Equality'], value='Equality')
58
  else:
59
  return gr.Dropdown(visible=False), gr.Dropdown(choices=['Rouge'], value='Rouge')
60
 
 
62
 
63
 
64
  self.nmodels = gr.Dropdown(
65
+ choices=["All", 10, 20, 30],
66
+ label="Number of models",
67
+ info="Sample a subset of LLMs to rank.",
68
+ value=10,
69
+ interactive=True,
70
+ )
71
  self.nrows = gr.Dropdown(
72
+ choices=["All", 10, 20, 30],
73
+ label="Number of instances",
74
+ info="Sample a subset of instances to evaluate (smaller is faster).",
75
+ value=10,
76
+ interactive=True,
77
+ )
78
  self.method = gr.Dropdown(
79
+ choices=["Greedy", "Full"],
80
+ label="Algorithm variant to use",
81
+ info="Choose from one of two variants. 'Full' (FTR in the paper) runs all triplet combinations, recommended when evaluations are cheap or for smaller datasets, or 'greedy' (GTR) a faster variant suggested for more complex evaluations.",
82
+ value="Full",
83
+ interactive=True,
84
+ )
85
  self.btn_execute = gr.Button("Run")
86
 
87
  def output_panel(self):
88
  """Plots/leaderboard/bump charts"""
89
+
90
  with gr.Column(variant="default"):
91
  gr.Markdown("""<h2 style='color: purple;'> Estimated ranking </h2> """)
92
  self.leaderboard = gr.DataFrame(headers=["rank", "model"],
 
104
  """Synthetic data experiments"""
105
  gr.Markdown("<br>")
106
  gr.Markdown("---")
107
+ with open("assets/synth.md", "r") as f:
108
+ content = f.read()
109
+
110
+ gr.Markdown(content)
111
+
112
+ with gr.Row():
113
+ with gr.Column(scale=1):
114
+ with gr.Column(variant='compact'):
115
+ self.synth_range = RangeSlider(10, 100, value=(50, 90), step=1, label="Model Accuracy Range (%)", interactive=True)
116
+ self.synth_nmodels = gr.Slider(3, 50, value=10, step=1, label="Number of models to synthesise.", info="Equally spaced in the accuracy range.", interactive=True)
117
+ self.synth_nanswers = gr.Slider(2, 50, value=10, step=1, label="Number of possible (discrete) answers per prompt.", interactive=True)
118
+ self.synth_nquestions = gr.Slider(10, 100, step=10, label="Number of prompts to simulate.", interactive=True)
119
+ self.synth_noise = gr.Slider(0, 1, value=0, label='Noise in evaluation (p)', info="Evaluation function decisions flipped with probability p. p=0 implies no noise.", interactive=True)
120
+ self.synth_method = gr.Dropdown(
121
+ choices=["Greedy", "Full"],
122
+ label="Algorithm variant to use",
123
+ info="Choose from one of two variants. 'Full' (FTR in the paper) runs all triplet combinations, recommended when evaluations are cheap or for smaller datasets, or 'greedy' (GTR) a faster variant suggested for more complex evaluations.",
124
+ value="Full",
125
+ interactive=True,
126
+ )
127
+
128
+ examples = gr.Examples([[(10, 30), 10, 10, 10, 0, "Full"],
129
+ [(10, 30), 10, 10, 10, 0.5, "Full"],
130
+ [[10, 30], 10, 2, 10, 0, "Full"]],
131
+ [self.synth_range, self.synth_nmodels, self.synth_nanswers, self.synth_nquestions, self.synth_noise, self.synth_method],
132
+ label='Some interesting cases (click and run)', example_labels=["Rankings recovered for low accuracy models",
133
+ "Robust recovery when evaluations have noise",
134
+ "Binary outcomes are challenging"
135
+ ] )
136
+
137
+ self.synth_execute = gr.Button("Run")
138
+ with gr.Column(scale=1):
139
+ with gr.Column(variant="default"):
140
+ gr.Markdown(
141
+ """<h2 style='color: purple;'> Estimated vs. true ranking </h2> """
142
+ )
143
+
144
+ self.synth_bumpchart = gr.Image()
145
+ with gr.Column(scale=1):
146
+ self.synth_eval_metrics = gr.Markdown()
147
+
148
 
149
  def byod_panel(self):
150
  """Instructions panel"""
 
174
  # Output panel/leaderboard
175
  self.output_panel()
176
 
177
+ self.synth_panel()
178
  self.byod_panel()
179
 
180
  # Register event listeners
181
  self.btn_execute.click(
182
+ fn=benchmark_executor,
183
  inputs=[
184
  self.data,
185
  self.mmlu,
 
190
  ],
191
  outputs=[self.leaderboard, self.bumpchart, self.eval_metrics],
192
  )
193
+ self.synth_execute.click(
194
+ fn=synth_executor,
195
+ inputs=[
196
+ self.synth_range,
197
+ self.synth_nmodels,
198
+ self.synth_nanswers,
199
+ self.synth_nquestions,
200
+ self.synth_noise,
201
+ self.synth_method,
202
+ ],
203
+ outputs=[self.synth_bumpchart, self.synth_eval_metrics],
204
  )
205
+ return demo
206
 
207
+
208
 
209
  def run(self):
210
  self.ui = self.layout()
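For context on the new control wired above: `gradio_rangeslider`'s `RangeSlider` delivers its value to the click handler as a `(low, high)` tuple, which is why `synth_executor` unpacks `acc_range`. A minimal, hypothetical sketch (not part of this commit):

```python
import gradio as gr
from gradio_rangeslider import RangeSlider

def handler(acc_range):
    low, high = acc_range  # RangeSlider passes a (low, high) tuple
    return f"Accuracy range: {low}-{high}%"

with gr.Blocks() as demo:
    rs = RangeSlider(10, 100, value=(50, 90), step=1, label="Model Accuracy Range (%)")
    out = gr.Markdown()
    gr.Button("Run").click(fn=handler, inputs=rs, outputs=out)

demo.launch()
```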
assets/synth.md ADDED
@@ -0,0 +1,3 @@
1
+ <h1 style='color: purple;'>Synthetic multiple choice </h1>
2
+
3
+ To analyse our methods, we synthesise data from models with known accuracy in a multiple-choice setting, i.e. a discrete set of possible responses. Several parameters (number of models, model accuracy, number of prompts, number of possible answers, and noise in comparisons) can affect the quality of the results. Rankings can be recovered for a range of challenging cases, for instance when the accuracy of the underlying models is low or when the evaluation function is noisy and imperfect. When the number of possible answers is low, for example in binary-choice settings, recovering rankings becomes challenging: low variance among wrong answers causes triplet evaluations to treat a wrong answer as the right one.
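A minimal sketch of the synthesis described above (hypothetical helper, not the committed `generate_data` in `executors.py` below; it draws each model's correct answers with probability equal to its target accuracy rather than fixing the exact count):

```python
import numpy as np
import pandas as pd

def synth_answers(accuracies, nanswers, nquestions, seed=42):
    """Ground truth plus one answer column per model with a target accuracy (%)."""
    rng = np.random.default_rng(seed)
    gt = rng.integers(nanswers, size=nquestions)       # ground-truth choice per prompt
    data = {"GT": gt}
    for m, acc in enumerate(accuracies, start=1):
        correct = rng.random(nquestions) < acc / 100   # prompts this model answers correctly
        # pick uniformly among the nanswers - 1 wrong choices
        wrong = (gt + rng.integers(1, nanswers, size=nquestions)) % nanswers
        data[f"M{m}"] = np.where(correct, gt, wrong)
    return pd.DataFrame(data)

df = synth_answers(np.linspace(90, 50, 10), nanswers=10, nquestions=100)
```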
executors.py ADDED
@@ -0,0 +1,272 @@
1
+
2
+ import pandas as pd
3
+ import numpy as np
4
+ from rouge_score import rouge_scorer
5
+ from joblib import Parallel, delayed
6
+ from selfrank.algos.greedy import SelfRankGreedy
7
+ from selfrank.algos.iterative import SelfRank
8
+ from selfrank.algos.baseline import MCARank
9
+ from selfrank.algos.triplet import equality, rouge, noisy_equality
10
+ import matplotlib.pyplot as plt
11
+ from itertools import zip_longest
12
+ from uuid import uuid4
13
+ import csv, os
14
+ from functools import partial
15
+
16
+
17
+ def generate_data(max_acc, min_acc, nmodels, nanswers, nquestions) -> tuple[pd.DataFrame, list]:
18
+
19
+ np.random.seed(42)
20
+ # Spread model accuracies between min and max
21
+ model_acc = np.linspace(max_acc, min_acc, nmodels)
22
+
23
+ gt_and_model_ans = np.zeros(
24
+ (nquestions, nmodels + 1), dtype=int
25
+ ) # array to store ground truth and model ans
26
+
27
+ # Create ground truth answers i.e. first column
28
+ for i in range(nquestions):
29
+ gt_and_model_ans[i][0] = np.random.randint(nanswers)
30
+
31
+ for i in range(0, nmodels):
32
+ no_of_entries_frm_gt = np.ceil(model_acc[i] / 100 * (nquestions)).astype(int)
33
+ # print(no_of_entries_frm_gt)
34
+ offsets_to_match = np.random.permutation(nquestions)[0:no_of_entries_frm_gt]
35
+ # print(offsets_to_match)
36
+ for j in range(nquestions):
37
+ if j in offsets_to_match:
38
+ gt_and_model_ans[j][i + 1] = gt_and_model_ans[j][0]
39
+ else:
40
+ lst_wo_gt = list(range(nanswers))
41
+ lst_wo_gt.remove(gt_and_model_ans[j][0])
42
+ gt_and_model_ans[j][i + 1] = lst_wo_gt[np.random.randint(nanswers - 1)]
43
+
44
+ # print(gt_and_model_ans)
45
+ filename = str(uuid4())
46
+
47
+ fields = ["GT"]
48
+ for i in range(nmodels):
49
+ fields.append("M" + str(i + 1))
50
+
51
+ # writing to csv file
52
+ with open(filename, "w") as csvfile:
53
+ # creating a csv writer object
54
+ csvwriter = csv.writer(csvfile)
55
+
56
+ # writing the fields
57
+ csvwriter.writerow(fields)
58
+
59
+ # writing the data rows
60
+ csvwriter.writerows(gt_and_model_ans)
61
+
62
+ df = pd.read_csv(filename)
63
+ os.remove(filename)
64
+
65
+ true_ranking = [f"M{i}" for i in range(1, nmodels + 1)]
66
+
67
+ return df, true_ranking
68
+
69
+ def synth_executor(acc_range: tuple[float, float], nmodels, nanswers, nquestions, noise, method) -> tuple[str, str]:
70
+
71
+ min_acc, max_acc = acc_range
72
+ df, true_ranking = generate_data(max_acc, min_acc, nmodels, nanswers, nquestions)
73
+
74
+ if noise == 0.:
75
+ comp = equality
76
+ else:
77
+ comp = partial(noisy_equality, p=noise)
78
+
79
+ df = df.drop(columns=["GT"])
80
+ MODELS = df.columns.tolist()
81
+
82
+ if method == "Full":
83
+ ranker = SelfRank(MODELS, comp, true_ranking)
84
+ ranker.fit(df)
85
+
86
+ # outputs of interest
87
+ out = {
88
+ "true_ranking": true_ranking,
89
+ "estimated_ranking": ranker.ranking,
90
+ "rbo": ranker.measure(metric="rbo"),
91
+ "map-1": ranker.measure(metric='mapk', k=1),
92
+ "map-3": ranker.measure(metric='mapk', k=3),
93
+ "map-5": ranker.measure(metric='mapk', k=5),
94
+ "map-10": ranker.measure(metric='mapk', k=10)
95
+ }
96
+
97
+ elif method == "Greedy":
98
+ ranker = SelfRankGreedy(MODELS, comp, true_ranking)
99
+ ranker.fit(df)
100
+ out = {
101
+ "true_ranking": true_ranking,
102
+ "estimated_ranking": ranker.ranking,
103
+ "rbo": ranker.measure(metric="rbo"),
104
+ "map-1": ranker.measure(metric='mapk', k=1),
105
+ "map-3": ranker.measure(metric='mapk', k=3),
106
+ "map-5": ranker.measure(metric='mapk', k=5),
107
+ "map-10": ranker.measure(metric='mapk', k=10)
108
+ }
109
+ elif method == 'MCA':
110
+ ranker = MCARank(MODELS, comp, true_ranking)
111
+ ranker.fit(df, measure='noisy_equality', p=noise)
112
+ out = {
113
+ "true_ranking": true_ranking,
114
+ "estimated_ranking": ranker.ranking,
115
+ "rbo": ranker.measure(metric="rbo"),
116
+ "map-1": ranker.measure(metric='mapk', k=1),
117
+ "map-3": ranker.measure(metric='mapk', k=3),
118
+ "map-5": ranker.measure(metric='mapk', k=5),
119
+ "map-10": ranker.measure(metric='mapk', k=10)
120
+ }
121
+ else:
122
+ raise ValueError(f"{method} not understood.")
123
+
124
+ eval_metrics = (
125
+ f"<h2 style='color: purple;'> Evaluation measures </h2>"
126
+ f"Rank-Biased Overlap: {out['rbo']:0.3f}<br>"
127
+ f"MAP-3 : {out['map-3']:0.3f}<br>"
128
+ f"MAP-5 : {out['map-5']:0.3f}<br>"
129
+ f"MAP-10 : {out['map-10']: 0.3f}."
130
+ )
131
+
132
+ out_plot = ranker.plot("synth")
133
+
134
+ return "synth.png", eval_metrics
135
+
136
+
137
+
138
+ def benchmark_executor(data, mmlu_subject, evaluation, nmodels, nrows, method
139
+ ) -> tuple[pd.DataFrame, str, str]:
140
+ """Main execution flow for benchmarks"""
141
+
142
+ # gr.Info(f"Loaded run config: {data}, {evaluation}, {nmodels}.")
143
+ seed = 40
144
+ np.random.seed(seed)
145
+
146
+ match data:
147
+ case "MMLU":
148
+ adf = pd.read_pickle(f"data/mmlu_subject_{mmlu_subject}.pkl")
149
+
150
+ case "CNN/DM":
151
+ adf = pd.read_pickle(f"data/cnndm.pkl")
152
+
153
+ case "XSUM":
154
+ adf = pd.read_pickle(f"data/xsum.pkl")
155
+
156
+ case _:
157
+ raise ValueError(f"'{data}' not understood.")
158
+
159
+ MODELS = adf.model.unique()
160
+
161
+ # Sample fewer models if so needed
162
+ if nmodels != "All":
163
+ if nmodels < len(MODELS):
164
+
165
+ MODELS = np.random.choice(MODELS, nmodels, replace=False).tolist()
166
+ adf = adf[adf.model.isin(MODELS)]
167
+
168
+ match data:
169
+ case "MMLU":
170
+ keys = [
171
+ "id",
172
+ "trial_id",
173
+ "perturbation",
174
+ ] # MMLU has this extra parameter
175
+ case "CNN/DM" | "XSUM":
176
+ keys = ["id", "trial_id"]
177
+ case _:
178
+ pass
179
+
180
+ df = adf.pivot_table(
181
+ columns="model",
182
+ index=keys,
183
+ values="output",
184
+ aggfunc="first",
185
+ )
186
+
187
+ # Filter by number of rows
188
+ df.dropna(inplace=True)
189
+ if nrows != "All":
190
+ if nrows < df.shape[0]:
191
+ df = df.sample(nrows, random_state=seed)
192
+
193
+ # Compute true ranking
194
+ adf = adf.set_index(keys).loc[df.index].reset_index()
195
+
196
+ if evaluation == "Rouge":
197
+
198
+ def __true_rouge(x, scorer):
199
+ return scorer.score(x["reference"], x["output"])["rouge2"].fmeasure
200
+
201
+ scorer = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True)
202
+ adf["rouge"] = Parallel(n_jobs=-1, batch_size=128)(
203
+ delayed(__true_rouge)(i, scorer) for _, i in adf.iterrows()
204
+ )
205
+
206
+ # Method 2 - look at "win rates" - for each question, see which model
207
+ # wins (i.e. has the best ROUGE score)
208
+ idx = adf.groupby(["id", "trial_id"])["rouge"].idxmax()
209
+ win_rates = adf.loc[idx].model.value_counts()
210
+ win_rate_rank = win_rates.index.tolist()
211
+
212
+ # include models with nowins at the bottom
213
+ no_wins = list(set(MODELS) - set(win_rate_rank))
214
+ true_ranking = win_rate_rank + no_wins
215
+ evaluator = rouge
216
+
217
+ elif evaluation == "Equality":
218
+
219
+ # Compute the true ranking (multiple choice - so use equality between
220
+ # LLM response and reference-value)
221
+ adf["C"] = (adf.output == adf.reference).astype(int)
222
+ true_ranking = (
223
+ adf.groupby("model")["C"]
224
+ .apply(lambda x: sum(x) / len(x))
225
+ .sort_values(ascending=False)
226
+ .index.tolist()
227
+ )
228
+ evaluator = equality
229
+
230
+ else:
231
+ raise ValueError(f"'{evaluation}' not understood.")
232
+
233
+ match method:
234
+ case "Full":
235
+ ranker = SelfRank(MODELS, evaluator, true_ranking)
236
+
237
+ case "Greedy":
238
+ ranker = SelfRankGreedy(MODELS, evaluator, true_ranking)
239
+
240
+ case "MCA":
241
+ raise NotImplementedError
242
+ case _:
243
+ raise ValueError(f"'{method}' not understood.")
244
+
245
+ # generate outputs
246
+ ranker.fit(df)
247
+ ranks = ranker.ranking
248
+
249
+ ranks = [
250
+ j + i for i, j in zip_longest(ranks, ["🥇 ", "🥈 ", "🥉 "], fillvalue="")
251
+ ]
252
+ out_df = pd.DataFrame({"rank": range(1, len(true_ranking) + 1), "model": ranks})
253
+
254
+ out_metrics = {
255
+ "rbo": ranker.measure(metric="rbo"),
256
+ "map-1": ranker.measure(metric="mapk", k=1),
257
+ "map-3": ranker.measure(metric="mapk", k=3),
258
+ "map-5": ranker.measure(metric="mapk", k=5),
259
+ "map-10": ranker.measure(metric="mapk", k=10),
260
+ "evaluations": evaluator.calls,
261
+ }
262
+ eval_metrics = (
263
+ f"<h2 style='color: purple;'> Evaluation measures </h2>"
264
+ f"Rank-Biased Overlap: {out_metrics['rbo']:0.3f}<br>"
265
+ f"MAP-3 : {out_metrics['map-3']:0.3f}<br>"
266
+ f"MAP-5 : {out_metrics['map-5']:0.3f}<br>"
267
+ f"MAP-10 : {out_metrics['map-10']: 0.3f}."
268
+ )
269
+
270
+ out_plot = ranker.plot()
271
+
272
+ return out_df, "output.png", eval_metrics
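A hedged usage sketch of the new executors outside the Gradio UI (assumes this repo's `selfrank` package and `data/` pickles are available; argument values mirror the UI defaults):

```python
from executors import benchmark_executor, synth_executor

# Synthetic case: 10 models with accuracies spread over 50-90%, 10 possible answers,
# 100 prompts, noise-free comparisons, full triplet variant.
img_path, metrics_md = synth_executor((50, 90), nmodels=10, nanswers=10,
                                      nquestions=100, noise=0.0, method="Full")

# HELM-based case: CNN/DM summaries judged with ROUGE (mmlu_subject is unused here).
out_df, plot_path, metrics = benchmark_executor("CNN/DM", None, "Rouge",
                                                nmodels=10, nrows=10, method="Full")
```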
selfrank/algos/triplet.py CHANGED
@@ -9,6 +9,7 @@ import logging
9
  from .plots import bcolors
10
  import random
11
 
 
12
  logger = logging.getLogger(__name__)
13
 
14
  # Local only for now
@@ -114,7 +115,7 @@ def equality(a: str, b:str, c:str, df:pd.DataFrame) -> int:
114
 
115
  simple heuristic as the answers are multiple choice, so use equality.
116
  """
117
-
118
  ties = df[a] == df[b]
119
  a_wins = sum((df[a] == df[c]) & ~(ties))
120
  b_wins = sum((df[b] == df[c]) & ~(ties))
@@ -132,7 +133,7 @@ def noisy_equality(a: str, b:str, c:str, df:pd.DataFrame, p: float) -> int:
132
  noisy version of equality - where evaluations are flipped independently with
133
  probability p (p=1 will always flip, p=0, will never)
134
  """
135
-
136
  perturb = lambda x: not x if (random.random() <= p) else x
137
 
138
  ties = (df[a] == df[b])
 
9
  from .plots import bcolors
10
  import random
11
 
12
+
13
  logger = logging.getLogger(__name__)
14
 
15
  # Local only for now
 
115
 
116
  simple heuristic as the answers are multiple choice, so use equality.
117
  """
118
+
119
  ties = df[a] == df[b]
120
  a_wins = sum((df[a] == df[c]) & ~(ties))
121
  b_wins = sum((df[b] == df[c]) & ~(ties))
 
133
  noisy version of equality - where evaluations are flipped independently with
134
  probability p (p=1 will always flip, p=0, will never)
135
  """
136
+ random.seed(42)
137
  perturb = lambda x: not x if (random.random() <= p) else x
138
 
139
  ties = (df[a] == df[b])
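An illustrative sketch of how the triplet comparisons touched above are invoked: each takes two contestant columns, a judge column, and the response DataFrame, and `noisy_equality` additionally flips each row-level agreement with probability `p` (the `random.seed(42)` added here makes those flips reproducible). The tiny DataFrame is made up for illustration:

```python
import pandas as pd
from selfrank.algos.triplet import equality, noisy_equality

# Answers from two contestant models (M1, M2) and a judge model (M3) on four prompts.
df = pd.DataFrame({"M1": [0, 1, 2, 3], "M2": [0, 1, 1, 0], "M3": [0, 1, 2, 0]})

print(equality("M1", "M2", "M3", df))               # judge M3 adjudicates M1 vs M2
print(noisy_equality("M1", "M2", "M3", df, p=0.2))  # same, with each row flipped w.p. 0.2
```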