Spaces:
Runtime error
Runtime error
Commit
·
2aed722
1
Parent(s):
816b8b8
Update app.py
Browse files
app.py
CHANGED
@@ -109,9 +109,9 @@ def get_figure(df, model_name, occ):
|
|
109 |
ax.bar(xs, ys)
|
110 |
ax.axis('tight')
|
111 |
ax.set_xlabel("Sentence number")
|
112 |
-
ax.set_ylabel("
|
113 |
ax.set_title(
|
114 |
-
f"{MODEL_NAME_DICT[model_name]}
|
115 |
return fig
|
116 |
|
117 |
|
@@ -122,7 +122,7 @@ def predict_gender_pronouns(
|
|
122 |
texts,
|
123 |
occ,
|
124 |
):
|
125 |
-
"""Run inference on input_text for selected model type, returning
|
126 |
"""
|
127 |
|
128 |
# TODO: make these selectable by user
|
@@ -209,27 +209,26 @@ def predict_gender_pronouns(
|
|
209 |
demo = gr.Blocks()
|
210 |
with demo:
|
211 |
input_texts = gr.Variable([])
|
212 |
-
gr.Markdown("
|
213 |
-
gr.Markdown(
|
214 |
-
|
215 |
-
gr.Markdown("Using our underspecification metric informed by applying causal inference techniques, \
|
216 |
we are able to identify likely spurious correlations and exploit them in \
|
217 |
the scenario of gender underspecified tasks. (Note that introspecting softmax probabilities alone is insufficient, as in the sentences \
|
218 |
below, LLMs may report a softmax prob of ~0.9 despite the task being underspecified.)")
|
219 |
gr.Markdown("We extend the [Winogender Schemas](https://github.com/rudinger/winogender-schemas) evaluation set to produce\
|
220 |
eight syntactically similar sentences. However semantically, \
|
221 |
only two of the sentences are well-specified while the rest remain underspecified.")
|
222 |
-
gr.Markdown("If a model can reliably
|
223 |
an appropriate heuristic or information retrieval process.")
|
224 |
|
225 |
-
gr.Markdown("
|
226 |
gr.Markdown("Follow steps below to test out one of the pre-loaded options. Once you get the hang of it, you can load a new model and/or provide your own input texts.")
|
227 |
|
228 |
with gr.Row():
|
229 |
model_name = gr.Radio(
|
230 |
MODEL_NAMES,
|
231 |
type="value",
|
232 |
-
label="1) Pick a preloaded BERT-like model
|
233 |
)
|
234 |
own_model_name = gr.Textbox(
|
235 |
label=f"...Or, if you selected an '{OWN_MODEL_NAME}' model, put any Hugging Face pipeline model name \
|
@@ -261,7 +260,7 @@ with demo:
|
|
261 |
)
|
262 |
|
263 |
with gr.Row():
|
264 |
-
uncertain_btn = gr.Button("4) Get
|
265 |
gr.Markdown(
|
266 |
"If there is an * by a sentence number, then at least one top prediction for that sentence was non-gendered.")
|
267 |
|
|
|
109 |
ax.bar(xs, ys)
|
110 |
ax.axis('tight')
|
111 |
ax.set_xlabel("Sentence number")
|
112 |
+
ax.set_ylabel("Underspecification Metric")
|
113 |
ax.set_title(
|
114 |
+
f"Task Underspecification Metric on {MODEL_NAME_DICT[model_name]} for '{occ}' sentences")
|
115 |
return fig
|
116 |
|
117 |
|
|
|
122 |
texts,
|
123 |
occ,
|
124 |
):
|
125 |
+
"""Run inference on input_text for selected model type, returning Task Underspecification metric results.
|
126 |
"""
|
127 |
|
128 |
# TODO: make these selectable by user
|
|
|
209 |
demo = gr.Blocks()
|
210 |
with demo:
|
211 |
input_texts = gr.Variable([])
|
212 |
+
gr.Markdown("**Are you certain?**")
|
213 |
+
gr.Markdown("LLMs are pretty good at reporting task underspecification. We just need to ask the right way.")
|
214 |
+
gr.Markdown("Using our Underspecification Metric informed by applying causal inference techniques, \
|
|
|
215 |
we are able to identify likely spurious correlations and exploit them in \
|
216 |
the scenario of gender underspecified tasks. (Note that introspecting softmax probabilities alone is insufficient, as in the sentences \
|
217 |
below, LLMs may report a softmax prob of ~0.9 despite the task being underspecified.)")
|
218 |
gr.Markdown("We extend the [Winogender Schemas](https://github.com/rudinger/winogender-schemas) evaluation set to produce\
|
219 |
eight syntactically similar sentences. However semantically, \
|
220 |
only two of the sentences are well-specified while the rest remain underspecified.")
|
221 |
+
gr.Markdown("If a model can reliably report the underspecification of an inference-time task, an AI systems can replace only those task predictions with\
|
222 |
an appropriate heuristic or information retrieval process.")
|
223 |
|
224 |
+
gr.Markdown("**TL;DR**")
|
225 |
gr.Markdown("Follow steps below to test out one of the pre-loaded options. Once you get the hang of it, you can load a new model and/or provide your own input texts.")
|
226 |
|
227 |
with gr.Row():
|
228 |
model_name = gr.Radio(
|
229 |
MODEL_NAMES,
|
230 |
type="value",
|
231 |
+
label="1) Pick a preloaded BERT-like model (note: RoBERTa-large performance is best)...",
|
232 |
)
|
233 |
own_model_name = gr.Textbox(
|
234 |
label=f"...Or, if you selected an '{OWN_MODEL_NAME}' model, put any Hugging Face pipeline model name \
|
|
|
260 |
)
|
261 |
|
262 |
with gr.Row():
|
263 |
+
uncertain_btn = gr.Button("4) Get Underspecification Metric results!")
|
264 |
gr.Markdown(
|
265 |
"If there is an * by a sentence number, then at least one top prediction for that sentence was non-gendered.")
|
266 |
|