anonymousauthorsanonymous committed
Commit 67efd29 · 1 Parent(s): f86ff7a

Update app.py

Files changed (1)
  1. app.py +17 -15
app.py CHANGED
@@ -194,7 +194,7 @@ def predict_gender_pronouns(
                 / num_ave), DECIMAL_PLACES)
 
     uncertain_df = pd.DataFrame.from_dict(
-        all_uncertainty_f, orient='index', columns=['Underspecification Metric'])
+        all_uncertainty_f, orient='index', columns=['Specification Metric'])
 
     uncertain_df = uncertain_df.reset_index().rename(
         columns={'index': 'Sentence number'})
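
Editor's note: for context, a minimal standalone sketch of the DataFrame construction this hunk renames. The sample scores are invented, and all_uncertainty_f here is a stand-in for the app's real per-sentence results:

import pandas as pd

# Invented stand-in for the app's results: sentence number -> metric value.
all_uncertainty_f = {0: 0.12, 1: 0.87, 2: 0.91}

# orient='index' turns each dict key into a row label; columns= names the value column.
uncertain_df = pd.DataFrame.from_dict(
    all_uncertainty_f, orient='index', columns=['Specification Metric'])

# Promote the row labels to a regular column with a readable header.
uncertain_df = uncertain_df.reset_index().rename(
    columns={'index': 'Sentence number'})

print(uncertain_df)
#    Sentence number  Specification Metric
# 0                0                  0.12
# 1                1                  0.87
# 2                2                  0.91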
@@ -209,23 +209,25 @@ def predict_gender_pronouns(
 demo = gr.Blocks()
 with demo:
     input_texts = gr.Variable([])
-    gr.Markdown("LLMs are pretty good at reporting task underspecification. We just need to ask the right way.")
-    gr.Markdown("Using our Underspecification Metric informed by applying causal inference techniques, \
-        we are able to identify likely spurious correlations and exploit them in \
-        the scenario of gender underspecified tasks. (Note that introspecting softmax probabilities alone is insufficient, as in the sentences \
-        below, LLMs may report a softmax prob of ~0.9 despite the task being underspecified.)")
-    gr.Markdown("We extend the [Winogender Schemas](https://github.com/rudinger/winogender-schemas) evaluation set to produce\
-        eight syntactically similar sentences. However semantically, \
-        only two of the sentences are well-specified while the rest remain underspecified.")
-    gr.Markdown("If a model can reliably report the underspecification of an inference-time task, an AI systems can replace only those task predictions with\
-        an appropriate heuristic or information retrieval process.")
-    gr.Markdown("**TL;DR**: Follow steps below to test out one of the pre-loaded options. Once you get the hang of it, you can load a new model and/or provide your own input texts.")
+    # gr.Markdown("LLMs are pretty good at reporting task underspecification. We just need to ask the right way.")
+    # gr.Markdown("Using our Underspecification Metric informed by applying causal inference techniques, \
+    #     we are able to identify likely spurious correlations and exploit them in \
+    #     the scenario of gender underspecified tasks. (Note that introspecting softmax probabilities alone is insufficient, as in the sentences \
+    #     below, LLMs may report a softmax prob of ~0.9 despite the task being underspecified.)")
+    # gr.Markdown("We extend the [Winogender Schemas](https://github.com/rudinger/winogender-schemas) evaluation set to produce\
+    #     eight syntactically similar sentences. However semantically, \
+    #     only two of the sentences are well-specified while the rest remain underspecified.")
+    # gr.Markdown("If a model can reliably report the underspecification of an inference-time task, an AI systems can replace only those task predictions with\
+    #     an appropriate heuristic or information retrieval process.")
+    gr.Markdown("Follow the numbered steps below to test out one of the pre-loaded options. Once you get the hang of it, you can load a new model and/or provide your own input texts.")
+
+
 
     with gr.Row():
         model_name = gr.Radio(
             MODEL_NAMES,
             type="value",
-            label="1) Pick a preloaded BERT-like model (note: RoBERTa-large performance is best)...",
+            label="1) Pick a preloaded BERT-like model (note: RoBERTa-large performance is best).",
         )
         own_model_name = gr.Textbox(
             label=f"...Or, if you selected an '{OWN_MODEL_NAME}' model, put any Hugging Face pipeline model name \
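
Editor's note: the UI around this hunk uses Gradio's Blocks API. Below is a minimal, self-contained sketch of the same layout pattern (model picker plus free-text fallback). The MODEL_NAMES values and labels are made up, and gr.State is the current Gradio name for the older gr.Variable the app uses:

import gradio as gr

# Made-up choices; the app defines its own MODEL_NAMES and OWN_MODEL_NAME.
MODEL_NAMES = ["roberta-large", "bert-base-uncased", "add-a-model"]

demo = gr.Blocks()
with demo:
    # Session-scoped holder for the loaded input texts between events.
    input_texts = gr.State([])
    gr.Markdown("Follow the numbered steps below to test out one of the pre-loaded options.")

    with gr.Row():
        # type="value" makes the callback receive the selected string itself.
        model_name = gr.Radio(
            MODEL_NAMES,
            type="value",
            label="1) Pick a preloaded BERT-like model.",
        )
        own_model_name = gr.Textbox(
            label="...Or put any Hugging Face fill-mask model name here.")

demo.launch()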
@@ -246,7 +248,7 @@ with demo:
     )
 
     with gr.Row():
-        get_text_btn = gr.Button("3) Load input texts")
+        get_text_btn = gr.Button("3) Load input texts. Read the sentences to determine which two are well-specified for gendered pronoun coreference resolution. The rest are gender-unspecified.")
 
     get_text_btn.click(
         fn=display_input_texts,
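
Editor's note: the get_text_btn.click(...) call continuing below this hunk follows Gradio's standard event wiring: fn runs on click, inputs are passed as arguments, and the return value fills outputs. A minimal sketch of the pattern, with a made-up handler standing in for the app's display_input_texts:

import gradio as gr

def display_input_texts(source: str) -> str:
    # Made-up loader: return the sentences to show in the output box.
    return "\n".join(f"Sentence {i} from {source}." for i in range(3))

with gr.Blocks() as demo:
    source = gr.Textbox(label="Input source")
    loaded = gr.Textbox(label="Loaded texts")
    with gr.Row():
        get_text_btn = gr.Button("3) Load input texts")
    # Click event: call fn with the inputs, write its return value to outputs.
    get_text_btn.click(fn=display_input_texts, inputs=[source], outputs=[loaded])

demo.launch()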
@@ -257,7 +259,7 @@ with demo:
     )
 
     with gr.Row():
-        uncertain_btn = gr.Button("4) Get Underspecification Metric results!")
+        uncertain_btn = gr.Button("4) Get Task Specification Metric results!")
     gr.Markdown(
         "If there is an * by a sentence number, then at least one top prediction for that sentence was non-gendered.")
 
 
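Editor's note: the asterisk convention described in the final gr.Markdown could be implemented along these lines. This is a guess at the logic, not the app's actual code, and top_preds plus GENDERED are invented sample data:

# Invented sample: sentence number -> top predicted pronouns for that sentence.
top_preds = {0: ["she"], 1: ["they"], 2: ["he"]}
GENDERED = {"he", "she", "him", "her", "his", "hers"}

def sentence_label(num, preds):
    # Append '*' when at least one top prediction is non-gendered.
    starred = any(p.lower() not in GENDERED for p in preds)
    return f"{num}{'*' if starred else ''}"

print([sentence_label(n, p) for n, p in top_preds.items()])  # ['0', '1*', '2']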