Commit d0408b3 (1 parent: 67efd29)
Update app.py
app.py CHANGED
@@ -109,9 +109,9 @@ def get_figure(df, model_name, occ):
     ax.bar(xs, ys)
     ax.axis('tight')
     ax.set_xlabel("Sentence number")
-    ax.set_ylabel("
+    ax.set_ylabel("Specification Metric")
     ax.set_title(
-        f"Task
+        f"Task Specification Metric on {MODEL_NAME_DICT[model_name]} for '{occ}' sentences")
     return fig
 
 
@@ -122,7 +122,7 @@ def predict_gender_pronouns(
     texts,
     occ,
 ):
-    """Run inference on input_text for selected model type, returning Task
+    """Run inference on input_text for selected model type, returning Task Specification metric results.
     """
 
     # TODO: make these selectable by user
@@ -209,6 +209,7 @@ def predict_gender_pronouns(
 demo = gr.Blocks()
 with demo:
     input_texts = gr.Variable([])
+    gr.Markdown("**Detect Task Specification at Inference-time.**")
     # gr.Markdown("LLMs are pretty good at reporting task underspecification. We just need to ask the right way.")
     # gr.Markdown("Using our Underspecification Metric informed by applying causal inference techniques, \
     # we are able to identify likely spurious correlations and exploit them in \
@@ -219,10 +220,16 @@ with demo:
     # only two of the sentences are well-specified while the rest remain underspecified.")
     # gr.Markdown("If a model can reliably report the underspecification of an inference-time task, an AI systems can replace only those task predictions with\
     # an appropriate heuristic or information retrieval process.")
-    gr.Markdown("Follow the numbered steps below to test
-
+    gr.Markdown("*Follow the numbered steps below to test one of the pre-loaded options.* Once you get the hang of it, you can load a new model and/or provide your own input texts.")
+
+    gr.Markdown("1) Pick a preloaded BERT-like model (note: RoBERTa-large performance is best).")
+    gr.Markdown(f"2) Pick an Occupation type from the Winogender Schemas evaluation set, or select '{PICK_YOUR_OWN_LABEL}' (it need not be about an occupation).")
+    gr.Markdown("3) Click button to load input texts. Read the sentences to determine which two are well-specified for gendered pronoun coreference resolution. The rest are gender-unspecified.")
+    gr.Markdown("4) Click button to get Task Specification Metric results!")
 
 
+
+
     with gr.Row():
         model_name = gr.Radio(
             MODEL_NAMES,
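For readers unfamiliar with the Gradio API this commit leans on, here is a minimal, self-contained sketch of the Blocks layout the diff converges on. MODEL_NAMES and PICK_YOUR_OWN_LABEL are placeholder values (app.py defines the real ones elsewhere), the radio label is assumed, and the step-3/step-4 buttons and the figure output are omitted; this is a sketch of the pattern, not the full app.

```python
# Minimal sketch of the Blocks layout this commit builds up; not the full app.
# MODEL_NAMES and PICK_YOUR_OWN_LABEL are placeholders -- app.py defines the
# real values elsewhere.
import gradio as gr

MODEL_NAMES = ["bert-base-uncased", "roberta-large"]  # placeholder choices
PICK_YOUR_OWN_LABEL = "Pick your own"                 # placeholder label

demo = gr.Blocks()
with demo:
    # gr.Variable holds per-session state in the Gradio version this app
    # targets (later Gradio releases renamed it gr.State).
    input_texts = gr.Variable([])

    gr.Markdown("**Detect Task Specification at Inference-time.**")
    gr.Markdown(
        "*Follow the numbered steps below to test one of the pre-loaded options.* "
        "Once you get the hang of it, you can load a new model and/or provide "
        "your own input texts."
    )
    gr.Markdown("1) Pick a preloaded BERT-like model (note: RoBERTa-large performance is best).")
    gr.Markdown(
        f"2) Pick an Occupation type from the Winogender Schemas evaluation set, "
        f"or select '{PICK_YOUR_OWN_LABEL}' (it need not be about an occupation)."
    )
    gr.Markdown("3) Click button to load input texts.")
    gr.Markdown("4) Click button to get Task Specification Metric results!")

    with gr.Row():
        # Radio choices come first positionally; the label here is assumed.
        model_name = gr.Radio(MODEL_NAMES, label="Model")

demo.launch()
```

Interleaving gr.Markdown calls with input components inside a single gr.Blocks context is what lets the commit place the numbered instructions directly above the controls they describe, which gr.Interface's fixed layout would not allow.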