Spaces:

anonymousauthorsanonymous
/

uncertainty

Runtime error

App Files Files Community

Anon Anon committed on Nov 11, 2022

Commit

de623fb

1 Parent(s): 15ca15a

Create winogender_sentences.py

Browse files

Files changed (1) hide show

winogender_sentences.py +105 -0

winogender_sentences.py ADDED Viewed

	@@ -0,0 +1,105 @@

+######################################################################
+##
+## This script is a lightly modified version of that provided in winogender-schemas
+## https://github.com/rudinger/winogender-schemas
+##
+######################################################################
+import csv
+import os
+from pathlib import Path
+from collections import OrderedDict
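+# This script instantiates the 120 templates in winogender_schema/templates.tsv
+# (a directory next to this file) to generate the masked sentences written to
+# winogender_schema/all_sentences.tsv: four per template (the original
+# participant, "someone", "man" and "woman"), i.e. 480 rather than the 720
+# gendered sentences of the upstream script, with every pronoun slot replaced
+# by the literal token MASK.
+# Unlike the upstream script it does not print to stdout: call get_sentences()
+# (no arguments) to write the file and get the sentences back as an OrderedDict.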
def load_templates(path):
    """Read the tab-separated template file at *path*.

    The first line is treated as a header and skipped. Every following
    non-blank line must contain at least four tab-separated fields:
    occupation, other participant, answer and the template sentence.

    Args:
        path: Location of the templates TSV file.

    Returns:
        A list of ``(occupation, other_participant, answer, sentence)``
        tuples, in file order. An empty file yields an empty list.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
    """
    templates = []
    # The context manager guarantees the handle is closed, even if a row
    # turns out to be malformed part-way through the file.
    with open(path, 'r') as fp:
        next(fp, None)  # skip the header line (tolerate an empty file)
        for raw_line in fp:
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at EOF) carry no template.
                continue
            fields = stripped.split('\t')
            templates.append((fields[0], fields[1], fields[2], fields[3]))
    return templates
def generate(occupation, other_participant, sentence, second_ref="", context=None):
    """Instantiate one template sentence with its pronoun slots masked.

    The template is split on single spaces. ``$OCCUPATION`` and
    ``$PARTICIPANT`` must each appear as a standalone token.

    Args:
        occupation: Text substituted for the ``$OCCUPATION`` token.
        other_participant: Text substituted for ``$PARTICIPANT`` when
            *second_ref* is empty.
        sentence: The template sentence.
        second_ref: Optional replacement for the participant. An empty
            value uses *other_participant*; ``'someone'`` drops the token
            before ``$PARTICIPANT`` (expected to be a determiner such as
            "the") and inserts "someone" ("Someone" at sentence start);
            any other value is substituted verbatim.
        context: Unused; kept so existing callers keep working.

    Returns:
        The sentence with ``$NOM_PRONOUN``, ``$POSS_PRONOUN`` and
        ``$ACC_PRONOUN`` tokens each replaced by the literal ``"MASK"``.

    Raises:
        ValueError: If ``$OCCUPATION`` or ``$PARTICIPANT`` is not a
            standalone token of *sentence*.
    """
    toks = sentence.split(" ")
    occ_index = toks.index("$OCCUPATION")
    part_index = toks.index("$PARTICIPANT")
    toks[occ_index] = occupation

    if not second_ref:
        # Use the instantiated participant, e.g. "client", "patient", ...
        toks[part_index] = other_participant
    elif second_ref != 'someone':
        toks[part_index] = second_ref
    else:
        # Bleached NP "someone": drop the token preceding $PARTICIPANT
        # (i.e. "the"). Only do so when such a token exists -- with
        # $PARTICIPANT at index 0 a slice ending at part_index - 1 == -1
        # would splice in nearly the whole sentence a second time.
        if part_index > 0:
            del toks[part_index - 1]
            part_index -= 1
        toks[part_index] = "Someone" if part_index == 0 else "someone"

    # Every pronoun placeholder collapses to the same mask token.
    pronoun_slots = {"$NOM_PRONOUN", "$POSS_PRONOUN", "$ACC_PRONOUN"}
    return " ".join("MASK" if tok in pronoun_slots else tok for tok in toks)
+# %%
def get_sentences():
    """Build all masked sentences, write them to disk and return them.

    Templates are read from ``winogender_schema/templates.tsv`` next to this
    file (the directory is created if missing). For every template four
    sentences are generated -- with the template's own participant, with
    "someone", with "man" and with "woman" -- and written, under a
    ``sentid``/``sentence`` header, to ``winogender_schema/all_sentences.tsv``.

    Returns:
        An ``OrderedDict`` mapping each sentence id
        (``"<occupation>_<participant label>_<answer>"``) to its masked
        sentence, in the order the rows are written.
    """
    schema_dir = os.path.join(os.path.dirname(__file__), "winogender_schema")
    Path(schema_dir).mkdir(parents=True, exist_ok=True)

    templates = load_templates(os.path.join(schema_dir, "templates.tsv"))
    output_path = os.path.join(schema_dir, "all_sentences.tsv")

    sentence_dict = OrderedDict()
    with open(output_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t')
        writer.writerow(['sentid', 'sentence'])

        for occupation, other_participant, answer, sentence in templates:
            # (label used in the sentence id, second_ref handed to generate);
            # an empty second_ref selects the template's own participant.
            variants = (
                (other_participant, ""),
                ("someone", "someone"),
                ("man", "man"),
                ("woman", "woman"),
            )
            # Generate all four variants before writing any of them, so a
            # failing template never leaves a partial group in the file.
            rows = [
                (
                    f"{occupation}_{label}_{answer}",
                    generate(occupation, other_participant, sentence,
                             second_ref=ref),
                )
                for label, ref in variants
            ]
            for sentid, masked in rows:
                sentence_dict[sentid] = masked
            writer.writerows(rows)

    return sentence_dict