rahulnair23 committed on
Commit
d39c67a
•
1 Parent(s): 771a43c

instructions

Files changed (4)
  1. app.py +32 -23
  2. assets/header.md +6 -0
  3. assets/instructions.md +74 -0
  4. css.css → style.css +0 -0
app.py CHANGED
@@ -14,32 +14,29 @@ class UI:
14
 
15
  def __init__(self):
16
  """Load any static assets"""
17
- pass
18
 
19
  def header_block(self):
20
  """Title/description"""
21
 
22
- gr.Markdown(
23
- """<h1 style='text-align: center; color: black;'>πŸ₯‡ Ranking LLMs without ground truth </h1>"""
24
- )
25
- gr.Markdown(
26
- "This space demonstrates reference-free ranking of large language models describe in our ACL Findings paper [Ranking Large Language Models without Ground Truth](https://arxiv.org/abs/2402.14860). <br>"
27
- "Inspired by real life where both an expert and a knowledgeable person can identify a novice the main idea is to consider triplets of models, where each one of them evaluates the other two, correctly identifying the worst model in the triplet with high probability. Iteratively performing such evaluations yields a estimated ranking that doesn't require ground truth/reference data which can be expensive to gather. The methods are a viable low-resource ranking mechanism for practical use.<br>"
28
- "[Source code](https://huggingface.co/spaces/ibm/llm-rank-themselves/tree/main).<br>"
29
- )
30
  gr.Markdown('---')
31
  gr.Markdown('<br>')
32
-
33
 
34
  def selection_panel(self):
35
  """user selections"""
36
- gr.Markdown("""<h2 style='color: purple;'> Benchmark experiments </h2> """)
 
37
  with gr.Column(variant='compact'):
38
  self.data = gr.Dropdown(
39
  choices=["CNN/DM", "XSUM", "MMLU"],
40
  multiselect=False, value='CNN/DM',
41
  label="Choose a dataset.",
42
- info="The dataset describes a task",
43
  interactive=True,
44
  )
45
  self.evaluation = gr.Dropdown(
@@ -50,14 +47,14 @@ class UI:
50
  info="How should the Judge model decide the winner? Demo limited to use 'Rouge' for generative tasks like summarization, and 'equality' for multiple choice or classification tasks. In practice you can use any function that compares judge responses to the contestant models.",
51
  )
52
  self.nmodels = gr.Dropdown(
53
- choices=[None, 10, 20, 30],
54
  label="Number of models",
55
  info="Sample a subset of LLMs to rank.",
56
  value=10,
57
  interactive=True,
58
  )
59
  self.nrows = gr.Dropdown(
60
- choices=[None, 10, 20, 30],
61
  label="Number of instances",
62
  info="Sample a subset of instances to evaluate (smaller is faster).",
63
  value=10,
@@ -89,14 +86,21 @@ class UI:
89
  """ Synthetic data experiments """
90
  gr.Markdown('<br>')
91
  gr.Markdown('---')
92
- gr.Markdown("""<h2 style='color: purple;'>Synthetic multiple choice </h2> """)
 
93
 
94
  def byod_panel(self):
95
- """ Synthetic data experiments """
96
  gr.Markdown('<br>')
97
  gr.Markdown('---')
98
- gr.Markdown("""<h2 style='color: purple;'>BYOD </h2> """)
99
 
100
 
101
  def layout(self):
102
  """ Assemble the overall layout """
@@ -113,7 +117,7 @@ class UI:
113
  # Output panel/leaderboard
114
  self.output_panel()
115
 
116
- self.synth_panel()
117
  self.byod_panel()
118
 
119
  # Register event listeners
@@ -128,6 +132,8 @@ class UI:
128
  """ Main execution flow for benchmarks """
129
 
130
  #gr.Info(f"Loaded run config: {data}, {evaluation}, {nmodels}.")
131
 
132
  match data:
133
  case 'MMLU':
@@ -145,7 +151,7 @@ class UI:
145
  raise ValueError(f"'{data}' not understood.")
146
 
147
  # Sample fewer models if so needed
148
- if nmodels is not None:
149
  if nmodels < len(MODELS):
150
 
151
  MODELS = np.random.choice(MODELS, nmodels, replace=False).tolist()
@@ -168,9 +174,9 @@ class UI:
168
 
169
  # Filter by number of rows
170
  df.dropna(inplace=True)
171
- if nrows is not None:
172
  if nrows < df.shape[0]:
173
- df = df.sample(nrows)
174
 
175
  # Compute true ranking
176
  adf = adf.set_index(keys).loc[df.index].reset_index()
@@ -227,7 +233,10 @@ class UI:
227
 
228
  # generate outputs
229
  ranker.fit(df)
230
- out_df = pd.DataFrame({'rank': range(1, len(true_ranking)+1), 'model': ranker.ranking})
231
 
232
  out_metrics = {"rbo": ranker.measure(metric="rbo"),
233
  "map-1": ranker.measure(metric="mapk", k=1),
@@ -236,7 +245,7 @@ class UI:
236
  "map-10": ranker.measure(metric="mapk", k=10),
237
  "evaluations": evaluator.calls
238
  }
239
- eval_metrics = (f"Evaluation measures: <br>"
240
  f"Rank-Biased Overlap: {out_metrics['rbo']:0.3f}<br>"
241
  f"MAP-3 : {out_metrics['map-3']:0.3f}<br>"
242
  f"MAP-5 : {out_metrics['map-5']:0.3f}<br>"
 
14
 
15
  def __init__(self):
16
  """Load any static assets"""
17
+ self.load_css()
18
 
19
  def header_block(self):
20
  """Title/description"""
21
 
22
+ with open("assets/header.md", 'r') as f:
23
+ content = f.read()
24
+
25
+ gr.Markdown(content)
26
  gr.Markdown('---')
27
  gr.Markdown('<br>')
28
+
29
 
30
  def selection_panel(self):
31
  """user selections"""
32
+ gr.Markdown("""<h1 style='color: purple;'> Ranking with benchmarks </h1> """)
33
+ gr.Markdown("""Using inference data gathered from [HELM](https://crfm.stanford.edu/helm/classic/latest/) we first show how our estimated rankings compare to rankings derived from using ground-truth or reference data.""")
34
  with gr.Column(variant='compact'):
35
  self.data = gr.Dropdown(
36
  choices=["CNN/DM", "XSUM", "MMLU"],
37
  multiselect=False, value='CNN/DM',
38
  label="Choose a dataset.",
39
+ info="The dataset describes a specific task, either summarization (CNN/DM, XSUM) or multiple choice (MMLU).",
40
  interactive=True,
41
  )
42
  self.evaluation = gr.Dropdown(
 
47
  info="How should the Judge model decide the winner? Demo limited to use 'Rouge' for generative tasks like summarization, and 'equality' for multiple choice or classification tasks. In practice you can use any function that compares judge responses to the contestant models.",
48
  )
49
  self.nmodels = gr.Dropdown(
50
+ choices=["All", 10, 20, 30],
51
  label="Number of models",
52
  info="Sample a subset of LLMs to rank.",
53
  value=10,
54
  interactive=True,
55
  )
56
  self.nrows = gr.Dropdown(
57
+ choices=["All", 10, 20, 30],
58
  label="Number of instances",
59
  info="Sample a subset of instances to evaluate (smaller is faster).",
60
  value=10,
 
86
  """ Synthetic data experiments """
87
  gr.Markdown('<br>')
88
  gr.Markdown('---')
89
+ gr.Markdown("""<h1 style='color: purple;'>Synthetic multiple choice </h1> """)
90
+ gr.Markdown("Coming soon.")
91
 
92
  def byod_panel(self):
93
+ """ Instructions panel """
94
  gr.Markdown('<br>')
95
  gr.Markdown('---')
96
+ with open("assets/instructions.md", 'r') as f:
97
+ content = f.read()
98
+ gr.Markdown(content)
99
+ gr.Markdown('---')
100
 
101
+ def load_css(self):
102
+ with open('style.css', 'r') as file:
103
+ self.css = file.read()
104
 
105
  def layout(self):
106
  """ Assemble the overall layout """
 
117
  # Output panel/leaderboard
118
  self.output_panel()
119
 
120
+ #TODO: self.synth_panel()
121
  self.byod_panel()
122
 
123
  # Register event listeners
 
132
  """ Main execution flow for benchmarks """
133
 
134
  #gr.Info(f"Loaded run config: {data}, {evaluation}, {nmodels}.")
135
+ seed = 40
136
+ np.random.seed(seed)
137
 
138
  match data:
139
  case 'MMLU':
 
151
  raise ValueError(f"'{data}' not understood.")
152
 
153
  # Sample fewer models if so needed
154
+ if nmodels != "All":
155
  if nmodels < len(MODELS):
156
 
157
  MODELS = np.random.choice(MODELS, nmodels, replace=False).tolist()
 
174
 
175
  # Filter by number of rows
176
  df.dropna(inplace=True)
177
+ if nrows != "All":
178
  if nrows < df.shape[0]:
179
+ df = df.sample(nrows, random_state=seed)
180
 
181
  # Compute true ranking
182
  adf = adf.set_index(keys).loc[df.index].reset_index()
 
233
 
234
  # generate outputs
235
  ranker.fit(df)
236
+ ranks = ranker.ranking
237
+ from itertools import zip_longest
238
+ ranks = [j + i for i, j in zip_longest(ranks, ["🥇 ", "🥈 ", "🥉 "], fillvalue='')]
239
+ out_df = pd.DataFrame({'rank': range(1, len(true_ranking)+1), 'model': ranks})
240
 
241
  out_metrics = {"rbo": ranker.measure(metric="rbo"),
242
  "map-1": ranker.measure(metric="mapk", k=1),
 
245
  "map-10": ranker.measure(metric="mapk", k=10),
246
  "evaluations": evaluator.calls
247
  }
248
+ eval_metrics = (f"<h2> Evaluation measures </h2>"
249
  f"Rank-Biased Overlap: {out_metrics['rbo']:0.3f}<br>"
250
  f"MAP-3 : {out_metrics['map-3']:0.3f}<br>"
251
  f"MAP-5 : {out_metrics['map-5']:0.3f}<br>"
assets/header.md ADDED
@@ -0,0 +1,6 @@
1
+ <h1 style='text-align: center; color: black;'>🥇 Ranking LLMs without ground truth </h1>
2
+
3
+
4
+ This space demonstrates reference-free ranking of large language models described in our ACL Findings paper [Ranking Large Language Models without Ground Truth](https://arxiv.org/abs/2402.14860). <br>
5
+
6
+ Inspired by real life, where both an expert and a knowledgeable person can identify a novice, the main idea is to consider triplets of models, where each one evaluates the other two, correctly identifying the worst model in the triplet with high probability. Iteratively performing such evaluations yields an estimated ranking that doesn't require ground-truth/reference data, which can be expensive to gather. This makes the method a viable low-resource ranking mechanism for practical use. [Source code](https://huggingface.co/spaces/ibm/llm-rank-themselves/tree/main) is included as part of this space; installation and usage instructions are provided below.<br>
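For intuition, here is a minimal sketch of one such triplet evaluation (illustrative only, not code from this repository): each model in the triplet judges the other two, and the model that loses the most judgements is flagged as worst. The `judge(a, b, c, df)` callable is assumed to follow the evaluator signature described in the instructions below; the selection rule actually used by the package may differ.

```python
import pandas as pd

def worst_of_triplet(a: str, b: str, c: str, df: pd.DataFrame, judge) -> str:
    """Illustrative only: flag the weakest model among {a, b, c}.
    `judge(x, y, z, df)` returns 1 if judge model z prefers x over y, else 0."""
    losses = {a: 0, b: 0, c: 0}
    for x, y, z in [(a, b, c), (b, c, a), (c, a, b)]:  # z judges x vs. y
        if judge(x, y, z, df) == 1:
            losses[y] += 1  # y lost this comparison
        else:
            losses[x] += 1
    return max(losses, key=losses.get)  # model with the most losses

```

Repeating such triplet evaluations across the pool of models, and aggregating which models keep being flagged as worst, is what yields the reference-free ranking described above.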
assets/instructions.md ADDED
@@ -0,0 +1,74 @@
1
+ <h1 style='color: purple;'> Using on your own data </h1>
2
+
3
+ The source code is available as a pip-installable Python package.
4
+
5
+ ## Installation
6
+
7
+ Use of a virtual environment is recommended.
8
+ ```bash
9
+ $ conda create -n selfrank python=3.10
10
+ ```
11
+
12
+ To install,
13
+ ```bash
14
+ $ conda activate selfrank
15
+ $ pip install git+https://huggingface.co/spaces/ibm/llm-rank-themselves.git
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ Start by gathering model inferences for the same question/prompt across all models you want to rank. The ranking method expects a pandas dataframe, with a row for each prompt, and a column for each model, i.e.
21
+ | | M1 | M2 | M3 | ... |
22
+ |:-----------|:-----|:-----|:-----|:------|
23
+ | Q1 | a | a | b | ... |
24
+ | Q2 | a | b | b | ... |
25
+ | ... | ... | ... | ... | ... |
26
+
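For concreteness, here is one hypothetical way to assemble such a frame from raw inference outputs (the model names, answers, and file name below are purely illustrative):

```python
import pandas as pd

# Hypothetical inference outputs: one answer per prompt (rows) for each model (columns).
inferences = {
    "M1": ["a", "a", "b"],
    "M2": ["a", "b", "b"],
    "M3": ["b", "b", "a"],
}
df = pd.DataFrame(inferences, index=["Q1", "Q2", "Q3"])
df.to_csv("inferences.csv", index=False)  # same layout as the file read in the snippet below
```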
27
+
28
+ With this data, the self-ranking procedure can be invoked as follows:
29
+
30
+ ```python
31
+ import pandas as pd
32
+ from algos.iterative import SelfRank # The full ranking algorithm
33
+ from algos.greedy import SelfRankGreedy # The greedy version
34
+ from algos.triplet import rouge, equality
35
+
36
+ f = "inferences.csv"
37
+ df = pd.read_csv(f)
38
+
39
+ models_to_rank = df.columns.tolist()
40
+ evaluator = rouge
41
+ true_ranking = None
42
+
43
+ r = SelfRank(models_to_rank, evaluator, true_ranking)
44
+ # or, for the greedy version
45
+ # r = SelfRankGreedy(models_to_rank, evaluator, true_ranking)
46
+ r.fit(df)
47
+ print(r.ranking)
48
+ ```
49
+
50
+ This should output the estimated ranking (best to worst), e.g. `['M5', 'M2', 'M1', ...]`. If the true ranking is known, evaluation measures can be computed with `r.measure(metric='rbo')` (rank-biased overlap) or `r.measure(metric='mapk')` (mean average precision).
51
+
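As a sketch of that evaluation path (the gold ordering below is made up; the `metric` and `k` arguments mirror those used by this Space's leaderboard):

```python
# Hypothetical reference ordering, best to worst, for the models being ranked.
true_ranking = ["M5", "M2", "M1", "M4", "M3"]

r = SelfRank(models_to_rank, evaluator, true_ranking)
r.fit(df)
print(r.measure(metric="rbo"))        # rank-biased overlap against true_ranking
print(r.measure(metric="mapk", k=3))  # MAP@3, as reported in the demo above
```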
52
+ We provide implementations of a few evaluation functions, i.e. the function the judge model uses to evaluate the contestant models. While `rouge` is recommended for generative tasks like summarization, `equality` is more appropriate for multiple-choice settings (like MMLU) or classification tasks with a discrete set of outcomes.
53
+
54
+ You can also pass an arbitrary function to the ranker, as long as it has the following signature:
55
+ ```python
56
+ def user_function(a: str, b: str, c: str, df: pd.DataFrame) -> int:
57
+ """
58
+ use model c to evaluate a vs. b
59
+ df: a dataframe with the inferences of all models
60
+ returns 1 if a is preferred or 0 if b is preferred
61
+ """
62
+
63
+ # In this example, we count the number of times a or b agrees with c (excluding ties)
64
+ ties = df[a] == df[b]
65
+ a_wins = sum((df[a] == df[c]) & ~(ties))
66
+ b_wins = sum((df[b] == df[c]) & ~(ties))
67
+
68
+ if a_wins >= b_wins:
69
+ return 1
70
+ else:
71
+ return 0
72
+
73
+ ```
74
+ <br>
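Such a function is then passed to the ranker in place of the bundled evaluators, for example (a sketch reusing the names from the snippets above):

```python
r = SelfRank(models_to_rank, user_function, None)  # None: no true ranking available
r.fit(df)
print(r.ranking)
```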
css.css → style.css RENAMED
File without changes