Yu (Hope) Hou committed · commit 7b5f267 · "update full instructions V1"
Parent(s): 5683255

Files changed:
- app.py (+10 -10)
- src/about.py (+74 -6)
app.py
CHANGED
@@ -287,7 +287,7 @@ with demo:
 
     with gr.Row():
         with gr.Column():
-            model_name_textbox = gr.Textbox(label="QA
+            model_name_textbox = gr.Textbox(label="QA model name")
             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
             model_type = gr.Dropdown(
                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
@@ -330,15 +330,15 @@ with demo:
             submission_result,
         )
 
-    with gr.Row():
-        with gr.Accordion("📙 More about the task", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
+    # with gr.Row():
+    #     with gr.Accordion("📙 More about the task", open=False):
+    #         citation_button = gr.Textbox(
+    #             value=CITATION_BUTTON_TEXT,
+    #             label=CITATION_BUTTON_LABEL,
+    #             lines=20,
+    #             elem_id="citation-button",
+    #             show_copy_button=True,
+    #         )
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
src/about.py
CHANGED
@@ -25,20 +25,88 @@ TITLE = """<h1 align="center" id="space-title">Grounded QA leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Build an open-domain QA system that can answer any question posed by humans!
+Build an open-domain QA system that can answer any question posed by humans! For more: https://sites.google.com/view/qanta/home
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-##
-If you are working on a generative QA model, you are expected to submit your system by filling the `Model name`.
-
-
+## QA variants
 
+### Generative QA
+This type of QA system generates an answer to a given question directly.
+
+#### Input
+(1) `question` string
+
+```
+E.g. qa_pipe(question)
+```
+
+#### Output
+Return in JSON format: (1) a `guess` string, and (2) a `confidence` score, which should be a float representing the probability (0-1) of your guess.
+
+```
+E.g. {'guess': 'Apple', 'confidence': 0.02}
+```
+
+Reminder: feel free to check the provided tutorial to see how you can calculate the probability of the generated tokens!
+
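To make that reminder concrete, here is a minimal illustrative sketch (not the tutorial's recipe) of one way a generative pipeline could turn the probabilities of its generated tokens into a `confidence` value; the checkpoint name, generation settings, and the geometric-mean aggregation are all assumptions.

```
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Illustrative assumption: any seq2seq QA checkpoint could stand in here.
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def answer_with_confidence(question: str) -> dict:
    inputs = tokenizer(question, return_tensors="pt")
    out = model.generate(
        **inputs,
        max_new_tokens=10,
        output_scores=True,
        return_dict_in_generate=True,
    )
    # Log-probabilities of the tokens that were actually generated.
    log_probs = model.compute_transition_scores(
        out.sequences, out.scores, normalize_logits=True
    )
    # Geometric mean of the token probabilities as a crude sequence probability.
    confidence = float(torch.exp(log_probs[0].mean()))
    guess = tokenizer.decode(out.sequences[0], skip_special_tokens=True)
    return {"guess": guess, "confidence": confidence}

print(answer_with_confidence("Where is the University of Maryland located?"))
```

Whatever aggregation you choose, the final `confidence` should land in the 0-1 range described above.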
+### Extractive QA
+This type of QA system extracts an answer span from a context passage for a given question.
+
+#### Input
+(1) `question` string, and (2) `context` string
+
+```
+E.g. qa_pipe(question=question, context=context)
+```
+
+#### Output
+Return in JSON format: (1) a `guess` string, and (2) a `confidence` score, which should be a float representing the probability (0-1) of your guess.
+
+```
+E.g. {'guess': 'Apple', 'confidence': 0.02}
+```
+
+Reminder: if you are already playing around with an extractive QA model, HF QA models already output a `score`, so you only need to wrap that `score` as `confidence`.
+
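As an illustration of that wrapping (a minimal sketch; the checkpoint below is an arbitrary example, not a requirement), a stock HF extractive QA pipeline only needs its `answer`/`score` output renamed to `guess`/`confidence`:

```
from transformers import pipeline

# Arbitrary example checkpoint; any extractive QA model works the same way.
base_pipe = pipeline(
    "question-answering", model="distilbert-base-cased-distilled-squad"
)

def qa_pipe(question: str, context: str) -> dict:
    pred = base_pipe(question=question, context=context)
    # The pipeline already returns a probability-like `score` in [0, 1];
    # we only rename it to `confidence` and the answer span to `guess`.
    return {"guess": pred["answer"], "confidence": float(pred["score"])}

print(qa_pipe(question="Where is UMD?", context="UMD is in Maryland."))
```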
+#### Customized retriever
+If you do not submit anything for the retriever, we will fill the `context` string with our pre-loaded context. However, we do provide the option to customize your retriever model with the dataset you wish to retrieve from. Please check the tutorial example for more details.
+
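As a rough illustration only (the tutorial example defines the actual interface the leaderboard expects), a customized retriever can be as simple as a dense encoder that picks the best-matching passage to serve as `context`; the encoder checkpoint and toy corpus below are assumptions.

```
from sentence_transformers import SentenceTransformer, util

# Illustrative assumptions: encoder checkpoint and a toy two-passage corpus.
retriever = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
corpus = [
    "UMD is in Maryland.",
    "The Eiffel Tower is in Paris.",
]
corpus_embeddings = retriever.encode(corpus, convert_to_tensor=True)

def retrieve_context(question: str) -> str:
    query_embedding = retriever.encode(question, convert_to_tensor=True)
    hit = util.semantic_search(query_embedding, corpus_embeddings, top_k=1)[0][0]
    return corpus[hit["corpus_id"]]

# The retrieved passage is what would be fed to an extractive pipe as `context`.
context = retrieve_context("Where is UMD?")
```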
+## Evaluation Metric
+For each question in the test set, we parse it into multiple runs and feed each run as the question to your pipeline. We then use the confidence scores calculated across all runs to get the Buzz Confidence.
+
+## FAQ
+What if my system type is not specified here or not supported yet?
+- Please make a private post to the instructors so we can check how to adapt the leaderboard for your purpose. Thanks!
+
+I don't understand where I could start to build a QA system for submission.
+- Please check our submission tutorials. From there, you can fine-tune or build anything on top of the base models.
 """
 
 EVALUATION_QUEUE_TEXT = """
-
+**Step 1: Make sure it works locally**
+After you have a QA system uploaded to HuggingFace (with the license specified), please run the following example code to check that your pipe returns the guess and confidence score in **JSON** format.
+
+```
+from transformers import pipeline
+qa_pipe = pipeline(model="...", trust_remote_code=True)
+
+# If it is a Generative QA pipeline
+qa_pipe("Where is UMD?")
+
+# If it is an Extractive QA pipeline
+qa_pipe(question="Where is UMD?", context="UMD is in Maryland.")
+```
+
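Continuing from the snippet above (purely illustrative, not part of the required submission code), a quick sanity check on the returned shape can catch problems before you submit:

```
# Confirm the pipe returns the expected JSON-style dict (extractive call shown;
# drop the `context` argument for a generative pipe).
result = qa_pipe(question="Where is UMD?", context="UMD is in Maryland.")
assert isinstance(result, dict)
assert isinstance(result["guess"], str)
assert 0.0 <= float(result["confidence"]) <= 1.0
```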
+**Step 2: Fill in the submission form**
+(1) Fill in the `QA model name`.
+(2) Fill in the `Revision commit`: if you leave it empty, it defaults to `main`.
+(3) Fill in the `Model type`.
+(4) `Precision` is `float16` by default. You can update it as needed.
+(5) If you have a trained retriever and want to submit an Extractive QA system, please also fill in the `Retrieved dataset name` and `Retriever model`.
+
+Here is a tutorial on how you can make pipe wrappers for submissions: [Colab](https://colab.research.google.com/drive/1bCt2870SdY6tI4uE3JPG8_3nLmNJXX6_?usp=sharing)
 """
 
 CITATION_BUTTON_LABEL = "Copy the following link to check more details"
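The Colab notebook above is the authoritative guide to those pipe wrappers. Purely as an illustration of the general shape, a submission repo might register a custom pipeline roughly like this; the task name, checkpoint, generation settings, and confidence heuristic are all assumptions, not the required interface.

```
import torch
from transformers import AutoModelForSeq2SeqLM, Pipeline, pipeline
from transformers.pipelines import PIPELINE_REGISTRY

class GenerativeQAPipeline(Pipeline):
    """Toy wrapper returning a guess/confidence dict for a question string."""

    def _sanitize_parameters(self, **kwargs):
        return {}, {}, {}

    def preprocess(self, question):
        return self.tokenizer(question, return_tensors="pt")

    def _forward(self, model_inputs):
        return self.model.generate(
            **model_inputs,
            max_new_tokens=10,
            output_scores=True,
            return_dict_in_generate=True,
        )

    def postprocess(self, outputs):
        guess = self.tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
        log_probs = self.model.compute_transition_scores(
            outputs.sequences, outputs.scores, normalize_logits=True
        )
        return {"guess": guess, "confidence": float(torch.exp(log_probs[0].mean()))}

# Register the custom task and try it locally; the checkpoint is an assumption.
PIPELINE_REGISTRY.register_pipeline(
    "generative-qa",
    pipeline_class=GenerativeQAPipeline,
    pt_model=AutoModelForSeq2SeqLM,
)
qa_pipe = pipeline("generative-qa", model="google/flan-t5-small")
print(qa_pipe("Where is UMD?"))
```

After pushing such a wrapper to the Hub (e.g. with `qa_pipe.push_to_hub(...)`), the Step 1 call `pipeline(model="...", trust_remote_code=True)` can load it and return the `guess`/`confidence` dict directly.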