Spaces:
Runtime error
Runtime error
Upload 10 files
Browse files
- app.py +53 -31
- data/eval_board.csv +11 -7
- src/__pycache__/css_html.cpython-38.pyc +0 -0
- src/__pycache__/demo.cpython-38.pyc +0 -0
- src/__pycache__/utils.cpython-38.pyc +0 -0
- src/css_html.py +4 -0
- src/demo.py +35 -10
- src/utils.py +1 -0
app.py
CHANGED
@@ -11,6 +11,7 @@ from src.utils import (
|
|
11 |
from src.demo import (
|
12 |
generate,
|
13 |
random_examples,
|
|
|
14 |
)
|
15 |
|
16 |
|
@@ -66,14 +67,21 @@ demo = gr.Blocks(css=custom_css)
|
|
66 |
with demo:
|
67 |
with gr.Row():
|
68 |
gr.Markdown(
|
69 |
-
"""<div
|
70 |
-
|
71 |
-
|
72 |
|
73 |
-
|
74 |
-
</p>""",
|
75 |
elem_classes="markdown-text",
|
76 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
79 |
with gr.TabItem("🔢 Data", id=0):
|
@@ -145,19 +153,48 @@ with demo:
|
|
145 |
)
|
146 |
|
147 |
with gr.TabItem("Submit results 🚀", id=3):
|
148 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
149 |
|
150 |
with gr.Column():
|
151 |
gr.Markdown(
|
152 |
-
"""<div style="text-align: center;"><
|
153 |
<br>\
|
154 |
""",
|
155 |
elem_classes="markdown-text",
|
156 |
)
|
157 |
|
158 |
-
output_box = gr.Textbox(lines=10, max_lines=10, label="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
|
160 |
-
input_box = gr.Textbox(lines=12, max_lines=12, label="Input")
|
161 |
|
162 |
with gr.Row():
|
163 |
re2text_easy_btn = gr.Button("Random Re2Text Easy Example 😄")
|
@@ -188,28 +225,13 @@ with demo:
|
|
188 |
outputs = input_box,
|
189 |
)
|
190 |
|
191 |
-
with gr.Accordion("Additional Inputs", open=False):
|
192 |
-
sys_prompt = gr.Textbox(label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6)
|
193 |
-
|
194 |
-
gr.Slider(
|
195 |
-
label="Max new tokens",
|
196 |
-
minimum=1,
|
197 |
-
maximum=MAX_MAX_NEW_TOKENS,
|
198 |
-
step=1,
|
199 |
-
value=DEFAULT_MAX_NEW_TOKENS,
|
200 |
-
)
|
201 |
-
|
202 |
-
gr.Slider(
|
203 |
-
label="Temperature",
|
204 |
-
minimum=0,
|
205 |
-
maximum=4.0,
|
206 |
-
step=0.05,
|
207 |
-
value=0,
|
208 |
-
)
|
209 |
-
|
210 |
with gr.Row():
|
211 |
gr.ClearButton([input_box, output_box])
|
212 |
-
submit_btn = gr.Button("Submit")
|
213 |
-
submit_btn.click(generate, inputs=[input_box, sys_prompt], outputs=[output_box])
|
|
|
|
|
|
|
|
|
214 |
|
215 |
-
demo.launch()
|
|
|
11 |
from src.demo import (
|
12 |
generate,
|
13 |
random_examples,
|
14 |
+
return_ground_truth,
|
15 |
)
|
16 |
|
17 |
|
|
|
67 |
with demo:
|
68 |
with gr.Row():
|
69 |
gr.Markdown(
|
70 |
+
"""<div align= "center">
|
71 |
+
<h1>🤖 ConvRe 🤯 <span style='color: #e6b800;'> Leaderboard</span></h1>
|
72 |
+
</div>
|
73 |
|
74 |
+
""",
|
|
|
75 |
elem_classes="markdown-text",
|
76 |
)
|
77 |
+
|
78 |
+
gr.Markdown("""🤖**ConvRe**🤯 is the benchmark proposed in our EMNLP 2023 main conference paper: [An Investigation of LLMs’ Inefficacy in Understanding Converse Relations]().
|
79 |
+
It aims to evaluate LLMs' ability to understand converse relations.
|
80 |
+
A converse relation is defined as the opposite of the original semantic relation, while the surface form of the triple is kept unchanged.
|
81 |
+
For example, the triple `(x, has part, y)` is interpreted as "x has a part called y" in the normal relation, but as "y has a part called x" in the converse relation 🔁.
|
82 |
+
|
83 |
+
The experiments in our paper suggest that LLMs often resort to shortcut learning (or superficial correlations), and that even powerful models like GPT-4 still face challenges on our 🤖ConvRe🤯 benchmark.
|
84 |
+
""", elem_classes="markdown-text")
|
85 |
|
86 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
87 |
with gr.TabItem("🔢 Data", id=0):
|
|
|
153 |
)
|
154 |
|
155 |
with gr.TabItem("Submit results 🚀", id=3):
|
156 |
+
gr.Markdown("""<div align= "center">
|
157 |
+
<h1>Coming Soon ❤️</h1>
|
158 |
+
</div>
|
159 |
+
|
160 |
+
""")
|
161 |
|
162 |
with gr.Column():
|
163 |
gr.Markdown(
|
164 |
+
"""<div style="text-align: center;"><h1> 🤖ConvRe🤯 Demo (Llama-2-Chat-7B🦙) </h1></div>\
|
165 |
<br>\
|
166 |
""",
|
167 |
elem_classes="markdown-text",
|
168 |
)
|
169 |
|
170 |
+
output_box = gr.Textbox(lines=10, max_lines=10, label="Llama-2-Chat-7B Answer", interactive=False)
|
171 |
+
|
172 |
+
input_box = gr.Textbox(lines=12, max_lines=12, label="User Input")
|
173 |
+
|
174 |
+
ground_truth_display = gr.Textbox("", lines=1, max_lines=1, label="😊Correct Answer😊", interactive=False)
|
175 |
+
|
176 |
+
with gr.Column():
|
177 |
+
|
178 |
+
|
179 |
+
with gr.Accordion("Additional Inputs", open=False):
|
180 |
+
sys_prompt = gr.Textbox(label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6)
|
181 |
+
|
182 |
+
max_new_tokens=gr.Slider(
|
183 |
+
label="Max new tokens",
|
184 |
+
minimum=1,
|
185 |
+
maximum=MAX_MAX_NEW_TOKENS,
|
186 |
+
step=1,
|
187 |
+
value=DEFAULT_MAX_NEW_TOKENS,
|
188 |
+
)
|
189 |
+
|
190 |
+
temperature = gr.Slider(
|
191 |
+
label="Temperature",
|
192 |
+
minimum=0.1,
|
193 |
+
maximum=4.0,
|
194 |
+
step=0.1,
|
195 |
+
value=0.1,
|
196 |
+
)
|
197 |
|
|
|
198 |
|
199 |
with gr.Row():
|
200 |
re2text_easy_btn = gr.Button("Random Re2Text Easy Example 😄")
|
|
|
225 |
outputs = input_box,
|
226 |
)
|
227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
with gr.Row():
|
229 |
gr.ClearButton([input_box, output_box])
|
230 |
+
submit_btn = gr.Button("Submit🏃")
|
231 |
+
submit_btn.click(generate, inputs=[input_box, sys_prompt, temperature, max_new_tokens], outputs=[output_box])
|
232 |
+
|
233 |
+
answer_btn = gr.Button("Answer🤔")
|
234 |
+
answer_btn.click(return_ground_truth, inputs=[], outputs=[ground_truth_display])
|
235 |
+
|
236 |
|
237 |
+
demo.queue(max_size=32).launch(enable_queue=True)
|
data/eval_board.csv
CHANGED
@@ -1,7 +1,11 @@
|
|
1 |
-
Models,Re2Text-Easy,Text2Re-Easy,Re2Text-Hard,Text2Re-Hard,Avg,Links
|
2 |
-
gpt-4-0314,
|
3 |
-
gpt-3.5-turbo,83.
|
4 |
-
text-davinci-003,85.
|
5 |
-
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
1 |
+
Models,Re2Text-Easy,Text2Re-Easy,Re2Text-Hard,Text2Re-Hard,Avg,Model Size,Links
|
2 |
+
gpt-4-0314,98.7,93.6,16.4,17.1,56.5,unknown,https://openai.com/research/gpt-4
|
3 |
+
gpt-3.5-turbo-0301,83.5,60.7,59.0,39.0,60.6,unknown,https://chat.openai.com/
|
4 |
+
text-davinci-003,85.4,83.8,55.8,34.8,65.0,175B,https://platform.openai.com/docs/models/gpt-3-5
|
5 |
+
claude-instant-1.1,65.7,87.2,52.3,26.2,57.9,unknown,https://www.anthropic.com/index/introducing-claude
|
6 |
+
claude-1.3,89.7,82.3,37.3,56.6,66.5,unknown,https://www.anthropic.com/index/introducing-claude
|
7 |
+
flan-t5-xxl,79.4,96.8,20.7,4.8,50.4,11B,https://huggingface.co/google/flan-t5-xxl
|
8 |
+
flan-t5-xl,91.5,90.6,7.9,17.8,52.0,3B,https://huggingface.co/google/flan-t5-xl
|
9 |
+
flan-t5-large,71.5,77.3,26.2,29.6,51.2,780M,https://huggingface.co/google/flan-t5-large
|
10 |
+
flan-t5-base,84.6,51.2,17.0,50.2,50.8,250M,https://huggingface.co/google/flan-t5-base
|
11 |
+
flan-t5-small,51.8,50.1,46.5,49.5,49.5,60M,https://huggingface.co/google/flan-t5-small
|
src/__pycache__/css_html.cpython-38.pyc
CHANGED
Binary files a/src/__pycache__/css_html.cpython-38.pyc and b/src/__pycache__/css_html.cpython-38.pyc differ
|
|
src/__pycache__/demo.cpython-38.pyc
CHANGED
Binary files a/src/__pycache__/demo.cpython-38.pyc and b/src/__pycache__/demo.cpython-38.pyc differ
|
|
src/__pycache__/utils.cpython-38.pyc
CHANGED
Binary files a/src/__pycache__/utils.cpython-38.pyc and b/src/__pycache__/utils.cpython-38.pyc differ
|
|
src/css_html.py
CHANGED
@@ -12,6 +12,10 @@ custom_css = """
|
|
12 |
font-size: 16px !important;
|
13 |
}
|
14 |
|
|
|
|
|
|
|
|
|
15 |
#models-to-add-text {
|
16 |
font-size: 18px !important;
|
17 |
}
|
|
|
12 |
font-size: 16px !important;
|
13 |
}
|
14 |
|
15 |
+
#answer-text {
|
16 |
+
font-size: 28px !important;
|
17 |
+
}
|
18 |
+
|
19 |
#models-to-add-text {
|
20 |
font-size: 18px !important;
|
21 |
}
|
src/demo.py
CHANGED
@@ -6,12 +6,12 @@ from typing import Iterable
|
|
6 |
import torch
|
7 |
from huggingface_hub import HfApi
|
8 |
from datasets import load_dataset
|
9 |
-
from transformers import
|
10 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM
|
11 |
|
12 |
|
13 |
-
|
14 |
|
|
|
15 |
|
16 |
type2dataset = {
|
17 |
"re2text-easy": load_dataset('3B-Group/ConvRe', "en-re2text", token=TOKEN, split="prompt1"),
|
@@ -24,10 +24,15 @@ model_id = "meta-llama/Llama-2-7b-chat-hf"
|
|
24 |
tokenizer = AutoTokenizer.from_pretrained(model_id, token=TOKEN)
|
25 |
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, token=TOKEN, device_map="auto").eval()
|
26 |
|
|
|
|
|
|
|
|
|
|
|
27 |
# type2dataset = {}
|
28 |
|
29 |
|
30 |
-
def generate(input_text, sys_prompt) -> str:
|
31 |
sys_prompt = f'''[INST] <<SYS>>
|
32 |
{sys_prompt}
|
33 |
<</SYS>>
|
@@ -35,24 +40,44 @@ def generate(input_text, sys_prompt) -> str:
|
|
35 |
'''
|
36 |
input_str = sys_prompt + input_text + " [/INST]"
|
37 |
|
38 |
-
input_ids = tokenizer(input_str, return_tensors="pt").
|
39 |
-
outputs = model.generate(input_ids, max_length=512)
|
40 |
|
41 |
-
|
42 |
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
|
49 |
def random_examples(dataset_key) -> str:
|
|
|
|
|
50 |
# target_dataset = type2dataset[f"{task.lower()}-{type.lower()}"]
|
51 |
target_dataset = type2dataset[dataset_key]
|
52 |
|
53 |
idx = random.randint(0, len(target_dataset) - 1)
|
54 |
item = target_dataset[idx]
|
|
|
|
|
|
|
|
|
55 |
return item['query']
|
56 |
|
57 |
|
|
|
|
|
|
|
58 |
|
|
|
6 |
import torch
|
7 |
from huggingface_hub import HfApi
|
8 |
from datasets import load_dataset
|
9 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
|
|
|
10 |
|
11 |
|
12 |
+
ground_truth = ""
|
13 |
|
14 |
+
TOKEN = os.environ.get("HF_TOKEN", None)
|
15 |
|
16 |
type2dataset = {
|
17 |
"re2text-easy": load_dataset('3B-Group/ConvRe', "en-re2text", token=TOKEN, split="prompt1"),
|
|
|
24 |
tokenizer = AutoTokenizer.from_pretrained(model_id, token=TOKEN)
|
25 |
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, token=TOKEN, device_map="auto").eval()
|
26 |
|
27 |
+
|
28 |
+
# model_id = "google/flan-t5-base"
|
29 |
+
# tokenizer = T5Tokenizer.from_pretrained(model_id)
|
30 |
+
# model = T5ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
|
31 |
+
|
32 |
# type2dataset = {}
|
33 |
|
34 |
|
35 |
+
def generate(input_text, sys_prompt, temperature, max_new_tokens) -> str:
|
36 |
sys_prompt = f'''[INST] <<SYS>>
|
37 |
{sys_prompt}
|
38 |
<</SYS>>
|
|
|
40 |
'''
|
41 |
input_str = sys_prompt + input_text + " [/INST]"
|
42 |
|
43 |
+
input_ids = tokenizer(input_str, return_tensors="pt").to('cuda')
|
|
|
44 |
|
45 |
+
streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
|
46 |
|
47 |
+
generate_kwargs = dict(
|
48 |
+
input_ids,
|
49 |
+
streamer=streamer,
|
50 |
+
max_new_tokens=max_new_tokens,
|
51 |
+
do_sample=True,
|
52 |
+
temperature=float(temperature)
|
53 |
+
)
|
54 |
+
t = Thread(target=model.generate, kwargs=generate_kwargs)
|
55 |
+
t.start()
|
56 |
|
57 |
+
# Pull the generated text from the streamer, and update the model output.
|
58 |
+
model_output = ""
|
59 |
+
for new_text in streamer:
|
60 |
+
model_output += new_text
|
61 |
+
yield model_output
|
62 |
+
return model_output
|
63 |
|
64 |
|
65 |
def random_examples(dataset_key) -> str:
|
66 |
+
|
67 |
+
|
68 |
# target_dataset = type2dataset[f"{task.lower()}-{type.lower()}"]
|
69 |
target_dataset = type2dataset[dataset_key]
|
70 |
|
71 |
idx = random.randint(0, len(target_dataset) - 1)
|
72 |
item = target_dataset[idx]
|
73 |
+
|
74 |
+
global ground_truth
|
75 |
+
ground_truth = item['answer']
|
76 |
+
|
77 |
return item['query']
|
78 |
|
79 |
|
80 |
+
def return_ground_truth() -> str:
|
81 |
+
correct_answer = ground_truth
|
82 |
+
return correct_answer
|
83 |
|
src/utils.py
CHANGED
@@ -22,6 +22,7 @@ class AutoEvalColumn: # Auto evals column
|
|
22 |
re2text_hard = ColumnContent("Re2Text-Hard", "number", True)
|
23 |
text2re_hard = ColumnContent("Text2Re-Hard", "number", True)
|
24 |
avg = ColumnContent("Avg", "number", True)
|
|
|
25 |
|
26 |
link = ColumnContent("Links", "str", False)
|
27 |
|
|
|
22 |
re2text_hard = ColumnContent("Re2Text-Hard", "number", True)
|
23 |
text2re_hard = ColumnContent("Text2Re-Hard", "number", True)
|
24 |
avg = ColumnContent("Avg", "number", True)
|
25 |
+
model_size = ColumnContent("Model Size", "markdown", True)
|
26 |
|
27 |
link = ColumnContent("Links", "str", False)
|
28 |
|