anonymousauthorsanonymous committed on
Commit
2aed722
·
1 Parent(s): 816b8b8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -11
app.py CHANGED
@@ -109,9 +109,9 @@ def get_figure(df, model_name, occ):
109
  ax.bar(xs, ys)
110
  ax.axis('tight')
111
  ax.set_xlabel("Sentence number")
112
- ax.set_ylabel("Uncertainty metric")
113
  ax.set_title(
114
- f"{MODEL_NAME_DICT[model_name]} gender pronoun uncertainty in '{occ}' sentences")
115
  return fig
116
 
117
 
@@ -122,7 +122,7 @@ def predict_gender_pronouns(
122
  texts,
123
  occ,
124
  ):
125
- """Run inference on input_text for selected model type, returning uncertainty results.
126
  """
127
 
128
  # TODO: make these selectable by user
@@ -209,27 +209,26 @@ def predict_gender_pronouns(
209
  demo = gr.Blocks()
210
  with demo:
211
  input_texts = gr.Variable([])
212
- gr.Markdown("## Are you certain?")
213
- gr.Markdown(
214
- "#### LLMs are pretty good at reporting their uncertainty. We just need to ask the right way.")
215
- gr.Markdown("Using our underspecification metric informed by applying causal inference techniques, \
216
  we are able to identify likely spurious correlations and exploit them in \
217
  the scenario of gender underspecified tasks. (Note that introspecting softmax probabilities alone is insufficient, as in the sentences \
218
  below, LLMs may report a softmax prob of ~0.9 despite the task being underspecified.)")
219
  gr.Markdown("We extend the [Winogender Schemas](https://github.com/rudinger/winogender-schemas) evaluation set to produce\
220
  eight syntactically similar sentences. However semantically, \
221
  only two of the sentences are well-specified while the rest remain underspecified.")
222
- gr.Markdown("If a model can reliably tell us when it is uncertain about its predictions, one can replace only those uncertain predictions with\
223
  an appropriate heuristic or information retrieval process.")
224
 
225
- gr.Markdown("#### TL;DR")
226
  gr.Markdown("Follow steps below to test out one of the pre-loaded options. Once you get the hang of it, you can load a new model and/or provide your own input texts.")
227
 
228
  with gr.Row():
229
  model_name = gr.Radio(
230
  MODEL_NAMES,
231
  type="value",
232
- label="1) Pick a preloaded BERT-like model for uncertainty evaluation (note: RoBERTa-large performance is best)...",
233
  )
234
  own_model_name = gr.Textbox(
235
  label=f"...Or, if you selected an '{OWN_MODEL_NAME}' model, put any Hugging Face pipeline model name \
@@ -261,7 +260,7 @@ with demo:
261
  )
262
 
263
  with gr.Row():
264
- uncertain_btn = gr.Button("4) Get uncertainty results!")
265
  gr.Markdown(
266
  "If there is an * by a sentence number, then at least one top prediction for that sentence was non-gendered.")
267
 
 
109
  ax.bar(xs, ys)
110
  ax.axis('tight')
111
  ax.set_xlabel("Sentence number")
112
+ ax.set_ylabel("Underspecification Metric")
113
  ax.set_title(
114
+ f"Task Underspecification Metric on {MODEL_NAME_DICT[model_name]} for '{occ}' sentences")
115
  return fig
116
 
117
 
 
122
  texts,
123
  occ,
124
  ):
125
+ """Run inference on input_text for selected model type, returning Task Underspecification metric results.
126
  """
127
 
128
  # TODO: make these selectable by user
 
209
  demo = gr.Blocks()
210
  with demo:
211
  input_texts = gr.Variable([])
212
+ gr.Markdown("**Are you certain?**")
213
+ gr.Markdown("LLMs are pretty good at reporting task underspecification. We just need to ask the right way.")
214
+ gr.Markdown("Using our Underspecification Metric informed by applying causal inference techniques, \
 
215
  we are able to identify likely spurious correlations and exploit them in \
216
  the scenario of gender underspecified tasks. (Note that introspecting softmax probabilities alone is insufficient, as in the sentences \
217
  below, LLMs may report a softmax prob of ~0.9 despite the task being underspecified.)")
218
  gr.Markdown("We extend the [Winogender Schemas](https://github.com/rudinger/winogender-schemas) evaluation set to produce\
219
  eight syntactically similar sentences. However semantically, \
220
  only two of the sentences are well-specified while the rest remain underspecified.")
221
+ gr.Markdown("If a model can reliably report the underspecification of an inference-time task, an AI systems can replace only those task predictions with\
222
  an appropriate heuristic or information retrieval process.")
223
 
224
+ gr.Markdown("**TL;DR**")
225
  gr.Markdown("Follow steps below to test out one of the pre-loaded options. Once you get the hang of it, you can load a new model and/or provide your own input texts.")
226
 
227
  with gr.Row():
228
  model_name = gr.Radio(
229
  MODEL_NAMES,
230
  type="value",
231
+ label="1) Pick a preloaded BERT-like model (note: RoBERTa-large performance is best)...",
232
  )
233
  own_model_name = gr.Textbox(
234
  label=f"...Or, if you selected an '{OWN_MODEL_NAME}' model, put any Hugging Face pipeline model name \
 
260
  )
261
 
262
  with gr.Row():
263
+ uncertain_btn = gr.Button("4) Get Underspecification Metric results!")
264
  gr.Markdown(
265
  "If there is an * by a sentence number, then at least one top prediction for that sentence was non-gendered.")
266