anonymousauthorsanonymous committed
Commit 67efd29 · 1 Parent(s): f86ff7a

Update app.py

Files changed (1)
  1. app.py +17 -15
app.py CHANGED
@@ -194,7 +194,7 @@ def predict_gender_pronouns(
                 / num_ave), DECIMAL_PLACES)
 
     uncertain_df = pd.DataFrame.from_dict(
-        all_uncertainty_f, orient='index', columns=['Underspecification Metric'])
+        all_uncertainty_f, orient='index', columns=['Specification Metric'])
 
     uncertain_df = uncertain_df.reset_index().rename(
         columns={'index': 'Sentence number'})
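
Editor's note: for context, a minimal standalone sketch of the DataFrame construction this hunk renames. The sample scores are invented, and all_uncertainty_f here is a stand-in for the app's real per-sentence results:

import pandas as pd

# Invented stand-in for the app's results: sentence number -> metric value.
all_uncertainty_f = {0: 0.12, 1: 0.87, 2: 0.91}

# orient='index' turns each dict key into a row label; columns= names the value column.
uncertain_df = pd.DataFrame.from_dict(
    all_uncertainty_f, orient='index', columns=['Specification Metric'])

# Promote the row labels to a regular column with a readable header.
uncertain_df = uncertain_df.reset_index().rename(
    columns={'index': 'Sentence number'})

print(uncertain_df)
#    Sentence number  Specification Metric
# 0                0                  0.12
# 1                1                  0.87
# 2                2                  0.91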
@@ -209,23 +209,25 @@ def predict_gender_pronouns(
 demo = gr.Blocks()
 with demo:
     input_texts = gr.Variable([])
-    gr.Markdown("LLMs are pretty good at reporting task underspecification. We just need to ask the right way.")
-    gr.Markdown("Using our Underspecification Metric informed by applying causal inference techniques, \
-        we are able to identify likely spurious correlations and exploit them in \
-        the scenario of gender underspecified tasks. (Note that introspecting softmax probabilities alone is insufficient, as in the sentences \
-        below, LLMs may report a softmax prob of ~0.9 despite the task being underspecified.)")
-    gr.Markdown("We extend the [Winogender Schemas](https://github.com/rudinger/winogender-schemas) evaluation set to produce\
-        eight syntactically similar sentences. However semantically, \
-        only two of the sentences are well-specified while the rest remain underspecified.")
-    gr.Markdown("If a model can reliably report the underspecification of an inference-time task, an AI systems can replace only those task predictions with\
-        an appropriate heuristic or information retrieval process.")
-    gr.Markdown("**TL;DR**: Follow steps below to test out one of the pre-loaded options. Once you get the hang of it, you can load a new model and/or provide your own input texts.")
+    # gr.Markdown("LLMs are pretty good at reporting task underspecification. We just need to ask the right way.")
+    # gr.Markdown("Using our Underspecification Metric informed by applying causal inference techniques, \
+    #     we are able to identify likely spurious correlations and exploit them in \
+    #     the scenario of gender underspecified tasks. (Note that introspecting softmax probabilities alone is insufficient, as in the sentences \
+    #     below, LLMs may report a softmax prob of ~0.9 despite the task being underspecified.)")
+    # gr.Markdown("We extend the [Winogender Schemas](https://github.com/rudinger/winogender-schemas) evaluation set to produce\
+    #     eight syntactically similar sentences. However semantically, \
+    #     only two of the sentences are well-specified while the rest remain underspecified.")
+    # gr.Markdown("If a model can reliably report the underspecification of an inference-time task, an AI systems can replace only those task predictions with\
+    #     an appropriate heuristic or information retrieval process.")
+    gr.Markdown("Follow the numbered steps below to test out one of the pre-loaded options. Once you get the hang of it, you can load a new model and/or provide your own input texts.")
+
+
 
     with gr.Row():
         model_name = gr.Radio(
             MODEL_NAMES,
             type="value",
-            label="1) Pick a preloaded BERT-like model (note: RoBERTa-large performance is best)...",
+            label="1) Pick a preloaded BERT-like model (note: RoBERTa-large performance is best).",
         )
         own_model_name = gr.Textbox(
             label=f"...Or, if you selected an '{OWN_MODEL_NAME}' model, put any Hugging Face pipeline model name \
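
Editor's note: the UI around this hunk uses Gradio's Blocks API. Below is a minimal, self-contained sketch of the same layout pattern (model picker plus free-text fallback). The MODEL_NAMES values and labels are made up, and gr.State is the current Gradio name for the older gr.Variable the app uses:

import gradio as gr

# Made-up choices; the app defines its own MODEL_NAMES and OWN_MODEL_NAME.
MODEL_NAMES = ["roberta-large", "bert-base-uncased", "add-a-model"]

demo = gr.Blocks()
with demo:
    # Session-scoped holder for the loaded input texts between events.
    input_texts = gr.State([])
    gr.Markdown("Follow the numbered steps below to test out one of the pre-loaded options.")

    with gr.Row():
        # type="value" makes the callback receive the selected string itself.
        model_name = gr.Radio(
            MODEL_NAMES,
            type="value",
            label="1) Pick a preloaded BERT-like model.",
        )
        own_model_name = gr.Textbox(
            label="...Or put any Hugging Face fill-mask model name here.")

demo.launch()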
@@ -246,7 +248,7 @@ with demo:
     )
 
     with gr.Row():
-        get_text_btn = gr.Button("3) Load input texts")
+        get_text_btn = gr.Button("3) Load input texts. Read the sentences to determine which two are well-specified for gendered pronoun coreference resolution. The rest are gender-unspecified.")
 
     get_text_btn.click(
         fn=display_input_texts,
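
Editor's note: the get_text_btn.click(...) call continuing below this hunk follows Gradio's standard event wiring: fn runs on click, inputs are passed as arguments, and the return value fills outputs. A minimal sketch of the pattern, with a made-up handler standing in for the app's display_input_texts:

import gradio as gr

def display_input_texts(source: str) -> str:
    # Made-up loader: return the sentences to show in the output box.
    return "\n".join(f"Sentence {i} from {source}." for i in range(3))

with gr.Blocks() as demo:
    source = gr.Textbox(label="Input source")
    loaded = gr.Textbox(label="Loaded texts")
    with gr.Row():
        get_text_btn = gr.Button("3) Load input texts")
    # Click event: call fn with the inputs, write its return value to outputs.
    get_text_btn.click(fn=display_input_texts, inputs=[source], outputs=[loaded])

demo.launch()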
@@ -257,7 +259,7 @@ with demo:
     )
 
     with gr.Row():
-        uncertain_btn = gr.Button("4) Get Underspecification Metric results!")
+        uncertain_btn = gr.Button("4) Get Task Specification Metric results!")
     gr.Markdown(
         "If there is an * by a sentence number, then at least one top prediction for that sentence was non-gendered.")
 
 
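Editor's note: the asterisk convention described in the final gr.Markdown could be implemented along these lines. This is a guess at the logic, not the app's actual code, and top_preds plus GENDERED are invented sample data:

# Invented sample: sentence number -> top predicted pronouns for that sentence.
top_preds = {0: ["she"], 1: ["they"], 2: ["he"]}
GENDERED = {"he", "she", "him", "her", "his", "hers"}

def sentence_label(num, preds):
    # Append '*' when at least one top prediction is non-gendered.
    starred = any(p.lower() not in GENDERED for p in preds)
    return f"{num}{'*' if starred else ''}"

print([sentence_label(n, p) for n, p in top_preds.items()])  # ['0', '1*', '2']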