anonymousauthorsanonymous committed on
Commit
2aed722
·
1 Parent(s): 816b8b8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -11
app.py CHANGED
@@ -109,9 +109,9 @@ def get_figure(df, model_name, occ):
109
  ax.bar(xs, ys)
110
  ax.axis('tight')
111
  ax.set_xlabel("Sentence number")
112
- ax.set_ylabel("Uncertainty metric")
113
  ax.set_title(
114
- f"{MODEL_NAME_DICT[model_name]} gender pronoun uncertainty in '{occ}' sentences")
115
  return fig
116
 
117
 
@@ -122,7 +122,7 @@ def predict_gender_pronouns(
122
  texts,
123
  occ,
124
  ):
125
- """Run inference on input_text for selected model type, returning uncertainty results.
126
  """
127
 
128
  # TODO: make these selectable by user
@@ -209,27 +209,26 @@ def predict_gender_pronouns(
209
  demo = gr.Blocks()
210
  with demo:
211
  input_texts = gr.Variable([])
212
- gr.Markdown("## Are you certain?")
213
- gr.Markdown(
214
- "#### LLMs are pretty good at reporting their uncertainty. We just need to ask the right way.")
215
- gr.Markdown("Using our underspecification metric informed by applying causal inference techniques, \
216
  we are able to identify likely spurious correlations and exploit them in \
217
  the scenario of gender underspecified tasks. (Note that introspecting softmax probabilities alone is insufficient, as in the sentences \
218
  below, LLMs may report a softmax prob of ~0.9 despite the task being underspecified.)")
219
  gr.Markdown("We extend the [Winogender Schemas](https://github.com/rudinger/winogender-schemas) evaluation set to produce\
220
  eight syntactically similar sentences. However semantically, \
221
  only two of the sentences are well-specified while the rest remain underspecified.")
222
- gr.Markdown("If a model can reliably tell us when it is uncertain about its predictions, one can replace only those uncertain predictions with\
223
  an appropriate heuristic or information retrieval process.")
224
 
225
- gr.Markdown("#### TL;DR")
226
  gr.Markdown("Follow steps below to test out one of the pre-loaded options. Once you get the hang of it, you can load a new model and/or provide your own input texts.")
227
 
228
  with gr.Row():
229
  model_name = gr.Radio(
230
  MODEL_NAMES,
231
  type="value",
232
- label="1) Pick a preloaded BERT-like model for uncertainty evaluation (note: RoBERTa-large performance is best)...",
233
  )
234
  own_model_name = gr.Textbox(
235
  label=f"...Or, if you selected an '{OWN_MODEL_NAME}' model, put any Hugging Face pipeline model name \
@@ -261,7 +260,7 @@ with demo:
261
  )
262
 
263
  with gr.Row():
264
- uncertain_btn = gr.Button("4) Get uncertainty results!")
265
  gr.Markdown(
266
  "If there is an * by a sentence number, then at least one top prediction for that sentence was non-gendered.")
267
 
 
109
  ax.bar(xs, ys)
110
  ax.axis('tight')
111
  ax.set_xlabel("Sentence number")
112
+ ax.set_ylabel("Underspecification Metric")
113
  ax.set_title(
114
+ f"Task Underspecification Metric on {MODEL_NAME_DICT[model_name]} for '{occ}' sentences")
115
  return fig
116
 
117
 
 
122
  texts,
123
  occ,
124
  ):
125
+ """Run inference on input_text for selected model type, returning Task Underspecification metric results.
126
  """
127
 
128
  # TODO: make these selectable by user
 
209
  demo = gr.Blocks()
210
  with demo:
211
  input_texts = gr.Variable([])
212
+ gr.Markdown("**Are you certain?**")
213
+ gr.Markdown("LLMs are pretty good at reporting task underspecification. We just need to ask the right way.")
214
+ gr.Markdown("Using our Underspecification Metric informed by applying causal inference techniques, \
 
215
  we are able to identify likely spurious correlations and exploit them in \
216
  the scenario of gender underspecified tasks. (Note that introspecting softmax probabilities alone is insufficient, as in the sentences \
217
  below, LLMs may report a softmax prob of ~0.9 despite the task being underspecified.)")
218
  gr.Markdown("We extend the [Winogender Schemas](https://github.com/rudinger/winogender-schemas) evaluation set to produce\
219
  eight syntactically similar sentences. However semantically, \
220
  only two of the sentences are well-specified while the rest remain underspecified.")
221
+ gr.Markdown("If a model can reliably report the underspecification of an inference-time task, an AI systems can replace only those task predictions with\
222
  an appropriate heuristic or information retrieval process.")
223
 
224
+ gr.Markdown("**TL;DR**")
225
  gr.Markdown("Follow steps below to test out one of the pre-loaded options. Once you get the hang of it, you can load a new model and/or provide your own input texts.")
226
 
227
  with gr.Row():
228
  model_name = gr.Radio(
229
  MODEL_NAMES,
230
  type="value",
231
+ label="1) Pick a preloaded BERT-like model (note: RoBERTa-large performance is best)...",
232
  )
233
  own_model_name = gr.Textbox(
234
  label=f"...Or, if you selected an '{OWN_MODEL_NAME}' model, put any Hugging Face pipeline model name \
 
260
  )
261
 
262
  with gr.Row():
263
+ uncertain_btn = gr.Button("4) Get Underspecification Metric results!")
264
  gr.Markdown(
265
  "If there is an * by a sentence number, then at least one top prediction for that sentence was non-gendered.")
266