Spaces:
Sleeping
Sleeping
Commit
·
d4dddf1
1
Parent(s):
180caf6
Allow Alpaca and Arena results to be presented in the same space
Browse files- app.py +21 -7
- docs/{disclaimer.md → alpaca/disclaimer.md} +0 -0
- docs/{readme.md → alpaca/readme.md} +0 -0
- docs/arena/disclaimer.md +27 -0
- docs/arena/readme.md +12 -0
app.py
CHANGED
@@ -13,6 +13,7 @@ from datasets import load_dataset
|
|
13 |
from scipy.special import expit
|
14 |
|
15 |
HDI = cl.namedtuple('HDI', 'lower, upper')
|
|
|
16 |
|
17 |
#
|
18 |
# See https://cran.r-project.org/package=HDInterval
|
@@ -46,7 +47,7 @@ def load(repo):
|
|
46 |
model,
|
47 |
'value',
|
48 |
]
|
49 |
-
dataset = load_dataset(repo)
|
50 |
|
51 |
return (dataset
|
52 |
.get('train')
|
@@ -190,11 +191,10 @@ class DocumentationReader:
|
|
190 |
#
|
191 |
#
|
192 |
#
|
193 |
-
|
194 |
-
df = load('jerome-white
|
195 |
-
docs = DocumentationReader(Path('docs'))
|
196 |
|
197 |
-
gr.Markdown('# Alpaca Bradley–Terry')
|
198 |
with gr.Row():
|
199 |
with gr.Column():
|
200 |
gr.Markdown(docs['readme'])
|
@@ -232,8 +232,9 @@ with gr.Blocks() as demo:
|
|
232 |
|
233 |
''')
|
234 |
with gr.Column():
|
235 |
-
models =
|
236 |
-
|
|
|
237 |
inputs = [ drops(label=f'Model {x}') for x in range(1, 3) ]
|
238 |
|
239 |
button = gr.Button(value='Compare!')
|
@@ -242,4 +243,17 @@ with gr.Blocks() as demo:
|
|
242 |
with gr.Accordion('Disclaimer', open=False):
|
243 |
gr.Markdown(docs['disclaimer'])
|
244 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
demo.launch()
|
|
|
13 |
from scipy.special import expit
|
14 |
|
15 |
HDI = cl.namedtuple('HDI', 'lower, upper')
|
16 |
+
TabGroup = cl.namedtuple('TabGroup', 'name, docs, dataset')
|
17 |
|
18 |
#
|
19 |
# See https://cran.r-project.org/package=HDInterval
|
|
|
47 |
model,
|
48 |
'value',
|
49 |
]
|
50 |
+
dataset = load_dataset(str(repo))
|
51 |
|
52 |
return (dataset
|
53 |
.get('train')
|
|
|
191 |
#
|
192 |
#
|
193 |
#
|
194 |
+
def layout(tab):
|
195 |
+
df = load(Path('jerome-white', tab.dataset))
|
196 |
+
docs = DocumentationReader(Path('docs', tab.docs))  # read from the `tab` parameter, not the module-level loop variable `t`
|
197 |
|
|
|
198 |
with gr.Row():
|
199 |
with gr.Column():
|
200 |
gr.Markdown(docs['readme'])
|
|
|
232 |
|
233 |
''')
|
234 |
with gr.Column():
|
235 |
+
models = df['model'].unique()
|
236 |
+
choices = sorted(models, key=lambda x: x.lower())
|
237 |
+
drops = ft.partial(gr.Dropdown, choices=choices)
|
238 |
inputs = [ drops(label=f'Model {x}') for x in range(1, 3) ]
|
239 |
|
240 |
button = gr.Button(value='Compare!')
|
|
|
243 |
with gr.Accordion('Disclaimer', open=False):
|
244 |
gr.Markdown(docs['disclaimer'])
|
245 |
|
246 |
+
#
|
247 |
+
#
|
248 |
+
#
|
249 |
+
with gr.Blocks() as demo:
|
250 |
+
tabs = it.starmap(TabGroup, (
|
251 |
+
('Alpaca', 'alpaca', 'alpaca-bt-stan'),
|
252 |
+
('Chatbot Arena', 'arena', 'arena-bt-stan'),
|
253 |
+
))
|
254 |
+
|
255 |
+
for t in tabs:
|
256 |
+
with gr.Tab(t.name):
|
257 |
+
layout(t)
|
258 |
+
|
259 |
demo.launch()
|
docs/{disclaimer.md → alpaca/disclaimer.md}
RENAMED
File without changes
|
docs/{readme.md → alpaca/readme.md}
RENAMED
File without changes
|
docs/arena/disclaimer.md
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Disclaimer
|
2 |
+
|
3 |
+
This Space is primarily intended for exploration. For now, its results
|
4 |
+
should be treated as points of reference rather than absolute
|
5 |
+
facts. Viewers are encouraged to study the pipeline and understand the
|
6 |
+
model to help put the results into context.
|
7 |
+
|
8 |
+
Suggestions for improving this Space from those familiar with Chatbot
|
9 |
+
Arena or Bayesian data analysis are welcome! Please use the
|
10 |
+
[community](https://huggingface.co/spaces/jerome-white/arena-bradley-terry/discussions)
|
11 |
+
to do so.
|
12 |
+
|
13 |
+
## Resources
|
14 |
+
|
15 |
+
* [Source code](https://github.com/jerome-white/alpaca-bda/tree/chatbot-arena) for
|
16 |
+
producing results
|
17 |
+
|
18 |
+
## TODO
|
19 |
+
|
20 |
+
* Extend the Stan model to incorporate ties and response presentation
|
21 |
+
ordering
|
22 |
+
|
23 |
+
* Add details of the MCMC chains
|
24 |
+
|
25 |
+
* Automate data processing
|
26 |
+
|
27 |
+
* Document the process explicitly
|
docs/arena/readme.md
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[LMSYS Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is an
|
2 |
+
LLM evaluation platform. This Space presents an alternative method of
|
3 |
+
ranking based on the [Bradley–Terry
|
4 |
+
model](https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model)
|
5 |
+
(BT). This Space takes a Bayesian approach to BT parameter estimation,
|
6 |
+
unlike the MLE approach used by the LMSYS organization.
|
7 |
+
|
8 |
+
This Space is divided into two primary sections: the first presents a
|
9 |
+
ranking of models based on estimated ability. The figure on the right
|
10 |
+
visualizes this ranking for the top 10 models, while the table below
|
11 |
+
presents the full set. The second section estimates the probability
|
12 |
+
that one model will be preferred to another.
|