rahulnair23 committed • d39c67a • 1 parent: 771a43c

instructions

Files changed:
- app.py (+32 -23)
- assets/header.md (+6 -0)
- assets/instructions.md (+74 -0)
- css.css → style.css (+0 -0)
app.py
CHANGED
@@ -14,32 +14,29 @@ class UI:
 
     def __init__(self):
         """Load any static assets"""
-
+        self.load_css()
 
     def header_block(self):
         """Title/description"""
 
-
-
-
-        gr.Markdown(
-            "This space demonstrates reference-free ranking of large language models describe in our ACL Findings paper [Ranking Large Language Models without Ground Truth](https://arxiv.org/abs/2402.14860). <br>"
-            "Inspired by real life where both an expert and a knowledgeable person can identify a novice the main idea is to consider triplets of models, where each one of them evaluates the other two, correctly identifying the worst model in the triplet with high probability. Iteratively performing such evaluations yields a estimated ranking that doesn't require ground truth/reference data which can be expensive to gather. The methods are a viable low-resource ranking mechanism for practical use.<br>"
-            "[Source code](https://huggingface.co/spaces/ibm/llm-rank-themselves/tree/main).<br>"
-        )
+        with open("assets/header.md", 'r') as f:
+            content = f.read()
+
+        gr.Markdown(content)
         gr.Markdown('---')
         gr.Markdown('<br>')
-
+
 
     def selection_panel(self):
         """user selections"""
-        gr.Markdown("""<
+        gr.Markdown("""<h1 style='color: purple;'> Ranking with benchmarks </h1> """)
+        gr.Markdown("""Using inference data gathered from [HELM](https://crfm.stanford.edu/helm/classic/latest/) we first show how our estimated rankings compare to rankings derived from using ground-truth or reference data.""")
         with gr.Column(variant='compact'):
             self.data = gr.Dropdown(
                 choices=["CNN/DM", "XSUM", "MMLU"],
                 multiselect=False, value='CNN/DM',
                 label="Choose a dataset.",
-                info="The dataset describes a task",
+                info="The dataset describes a specific task, either summarization (CNN/DM, XSUM) or multiple choice (MMLU).",
                 interactive=True,
             )
             self.evaluation = gr.Dropdown(
@@ -50,14 +47,14 @@ class UI:
                 info="How should the Judge model decide the winner? Demo limited to use 'Rouge' for generative tasks like summarization, and 'equality' for multiple choice or classification tasks. In practice you can use any function that compares judge responses to the contestant models.",
             )
             self.nmodels = gr.Dropdown(
-                choices=[
+                choices=["All", 10, 20, 30],
                 label="Number of models",
                 info="Sample a subset of LLMs to rank.",
                 value=10,
                 interactive=True,
             )
             self.nrows = gr.Dropdown(
-                choices=[
+                choices=["All", 10, 20, 30],
                 label="Number of instances",
                 info="Sample a subset of instances to evaluate (smaller is faster).",
                 value=10,
@@ -89,14 +86,21 @@ class UI:
         """ Synthetic data experiments """
         gr.Markdown('<br>')
         gr.Markdown('---')
-        gr.Markdown("""<
+        gr.Markdown("""<h1 style='color: purple;'>Synthetic multiple choice </h1> """)
+        gr.Markdown("Coming soon.")
 
     def byod_panel(self):
-        """
+        """ Instructions panel """
         gr.Markdown('<br>')
         gr.Markdown('---')
-
+        with open("assets/instructions.md", 'r') as f:
+            content = f.read()
+        gr.Markdown(content)
+        gr.Markdown('---')
 
+    def load_css(self):
+        with open('style.css', 'r') as file:
+            self.css = file.read()
 
     def layout(self):
         """ Assemble the overall layout """
@@ -113,7 +117,7 @@ class UI:
         # Output panel/leaderboard
         self.output_panel()
 
-        self.synth_panel()
+        #TODO: self.synth_panel()
         self.byod_panel()
 
         # Register event listeners
@@ -128,6 +132,8 @@ class UI:
         """ Main execution flow for benchmarks """
 
         #gr.Info(f"Loaded run config: {data}, {evaluation}, {nmodels}.")
+        seed = 40
+        np.random.seed(seed)
 
         match data:
             case 'MMLU':
@@ -145,7 +151,7 @@ class UI:
                 raise ValueError(f"'{data}' not understood.")
 
         # Sample fewer models if so needed
-        if nmodels
+        if nmodels != "All":
            if nmodels < len(MODELS):
 
                MODELS = np.random.choice(MODELS, nmodels, replace=False).tolist()
@@ -168,9 +174,9 @@ class UI:
 
        # Filter by number of rows
        df.dropna(inplace=True)
-        if nrows
+        if nrows != "All":
            if nrows < df.shape[0]:
-                df = df.sample(nrows)
+                df = df.sample(nrows, random_state=seed)
 
        # Compute true ranking
        adf = adf.set_index(keys).loc[df.index].reset_index()
@@ -227,7 +233,10 @@ class UI:
 
        # generate outputs
        ranker.fit(df)
-
+        ranks = ranker.ranking
+        from itertools import zip_longest
+        ranks = [j + i for i, j in zip_longest(ranks, ["🥇 ", "🥈 ", "🥉 "], fillvalue='')]
+        out_df = pd.DataFrame({'rank': range(1, len(true_ranking)+1), 'model': ranks})
 
        out_metrics = {"rbo": ranker.measure(metric="rbo"),
                       "map-1": ranker.measure(metric="mapk", k=1),
@@ -236,7 +245,7 @@ class UI:
                       "map-10": ranker.measure(metric="mapk", k=10),
                       "evaluations": evaluator.calls
                       }
-        eval_metrics = (f"Evaluation measures
+        eval_metrics = (f"<h2> Evaluation measures </h2>"
                        f"Rank-Biased Overlap: {out_metrics['rbo']:0.3f}<br>"
                        f"MAP-3 : {out_metrics['map-3']:0.3f}<br>"
                        f"MAP-5 : {out_metrics['map-5']:0.3f}<br>"
assets/header.md
ADDED
@@ -0,0 +1,6 @@
<h1 style='text-align: center; color: black;'>🥇 Ranking LLMs without ground truth </h1>


This space demonstrates reference-free ranking of large language models, described in our ACL Findings paper [Ranking Large Language Models without Ground Truth](https://arxiv.org/abs/2402.14860). <br>

Inspired by real life, where both an expert and a knowledgeable person can identify a novice, the main idea is to consider triplets of models, where each one evaluates the other two and correctly identifies the worst model in the triplet with high probability. Iteratively performing such evaluations yields an estimated ranking that doesn't require ground-truth/reference data, which can be expensive to gather. This makes the approach a viable low-resource ranking mechanism for practical use. [Source code](https://huggingface.co/spaces/ibm/llm-rank-themselves/tree/main) is included as part of this space. Installation and usage instructions are provided below.<br>
assets/instructions.md
ADDED
@@ -0,0 +1,74 @@
<h1 style='color: purple;'> Using with your own data </h1>

The source code is available as a pip-installable Python package.

## Installation

Use of a virtual environment is recommended.
```bash
$ conda create -n selfrank python=3.10
```

To install:
```bash
$ conda activate selfrank
$ pip install git+https://huggingface.co/spaces/ibm/llm-rank-themselves.git
```

## Usage

Start by gathering model inferences for the same question/prompt across all models you want to rank. The ranking method expects a pandas DataFrame with a row for each prompt and a column for each model, i.e.

|            | M1   | M2   | M3   | ...   |
|:-----------|:-----|:-----|:-----|:------|
| Q1         | a    | a    | b    | ...   |
| Q2         | a    | b    | b    | ...   |
| ...        | ...  | ...  | ...  | ...   |

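For concreteness, a toy table of this shape can be built directly in pandas; the model names and answers below are made-up placeholders, not real inferences:

```python
import pandas as pd

# One row per prompt/question, one column per model being ranked.
df = pd.DataFrame(
    {
        "M1": ["a", "a", "c"],
        "M2": ["a", "b", "c"],
        "M3": ["b", "b", "c"],
    },
    index=["Q1", "Q2", "Q3"],
)
print(df)
```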
With this data, the self-ranking procedure can be invoked as follows:

```python
import pandas as pd
from algos.iterative import SelfRank        # The full ranking algorithm
from algos.greedy import SelfRankGreedy     # The greedy version
from algos.triplet import rouge, equality

f = "inferences.csv"
df = pd.read_csv(f)

models_to_rank = df.columns.tolist()
evaluator = rouge
true_ranking = None

r = SelfRank(models_to_rank, evaluator, true_ranking)
# or, for the greedy version:
# r = SelfRankGreedy(models_to_rank, evaluator, true_ranking)
r.fit(df)
print(r.ranking)
```

This should output the estimated ranking (best to worst): `['M5', 'M2', 'M1', ...]`. If true rankings are known, evaluation measures can be computed with `r.measure(metric='rbo')` (for rank-biased overlap) or `r.measure(metric='mapk')` (for mean average precision).
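As a minimal sketch of that last point, assuming a reference ordering is available (the ranking below is a made-up placeholder):

```python
# Hypothetical ground-truth ordering, best to worst, over the same models.
true_ranking = ["M5", "M2", "M1", "M4", "M3"]

r = SelfRank(models_to_rank, evaluator, true_ranking)
r.fit(df)

print(r.measure(metric="rbo"))        # rank-biased overlap against true_ranking
print(r.measure(metric="mapk", k=3))  # mean average precision at k=3
```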

We provide implementations of a few evaluation functions, i.e. the function the judge model uses to evaluate the contestant models. While `rouge` is recommended for generative tasks like summarization, `equality` is more appropriate for multiple-choice settings (like MMLU) or classification tasks with a discrete set of outcomes.

You can also pass an arbitrary function to the ranker, as long as it has the following signature:
```python
def user_function(a: str, b: str, c: str, df: pd.DataFrame) -> int:
    """
    Use model c to evaluate a vs. b.
    df: a dataframe with the inferences of all models
    returns 1 if a is preferred, or 0 if b is preferred
    """

    # In this example, we count the number of times a/b gives the same answer as c
    ties = df[a] == df[b]
    a_wins = sum((df[a] == df[c]) & ~(ties))
    b_wins = sum((df[b] == df[c]) & ~(ties))

    if a_wins >= b_wins:
        return 1
    else:
        return 0

```
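A judge function like this can then be passed to the ranker in place of the built-in evaluators, e.g. continuing the setup above:

```python
# Rank using the custom judge; no reference ranking is supplied here.
r = SelfRank(models_to_rank, user_function, None)
r.fit(df)
print(r.ranking)
```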
<br>
css.css → style.css
RENAMED
File without changes