Spaces:
Runtime error
Runtime error
Anon Anon
committed on
Commit
·
de623fb
1
Parent(s):
15ca15a
Create winogender_sentences.py
Browse files- winogender_sentences.py +105 -0
winogender_sentences.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
######################################################################
|
2 |
+
##
|
3 |
+
## This script is a lightly modified version of that provided in winogender-schemas
|
4 |
+
## https://github.com/rudinger/winogender-schemas
|
5 |
+
##
|
6 |
+
######################################################################
|
7 |
+
|
8 |
+
import csv
|
9 |
+
import os
|
10 |
+
from pathlib import Path
|
11 |
+
from collections import OrderedDict
|
12 |
+
|
13 |
+
# This script instantiates the templates in winogender_schema/templates.tsv
# (a directory next to this file) and writes four pronoun-masked sentences
# per template to winogender_schema/all_sentences.tsv.
# Call get_sentences() with no arguments; it also returns the sentences as
# an OrderedDict keyed by sentence id.
|
16 |
+
|
17 |
+
def load_templates(path):
    """Read the template TSV at ``path`` into a list of tuples.

    The first line is treated as a header and skipped. Every remaining
    non-blank line is stripped, split on tabs, and its first four fields are
    returned as an ``(occupation, other_participant, answer, sentence)``
    tuple. Any additional fields on a line are ignored.

    Args:
        path: Location of the tab-separated template file.

    Returns:
        A list of 4-tuples of strings, in file order. Empty if the file has
        no data lines (or is completely empty).

    Raises:
        IndexError: If a non-blank data line has fewer than four fields.
    """
    templates = []
    # The context manager guarantees the handle is closed, even when a
    # malformed row raises part-way through the file.
    with open(path, 'r') as fp:
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(fp, None)
        for line in fp:
            fields = line.strip().split('\t')
            if fields == ['']:
                # Blank line (typically a trailing newline at end of file).
                continue
            occupation, other_participant, answer, sentence = (
                fields[0], fields[1], fields[2], fields[3])
            templates.append((occupation, other_participant, answer, sentence))
    return templates
|
27 |
+
|
28 |
+
def generate(occupation, other_participant, sentence, second_ref="", context=None):
    """Instantiate one template sentence and mask its pronoun slot.

    The sentence is split on single spaces. The ``$OCCUPATION`` token is
    replaced by ``occupation`` and the ``$PARTICIPANT`` token is replaced
    according to ``second_ref``. Every ``$NOM_PRONOUN``, ``$POSS_PRONOUN``
    and ``$ACC_PRONOUN`` token is then replaced by the literal ``"MASK"``.

    Args:
        occupation: Text substituted for the ``$OCCUPATION`` token.
        other_participant: Text substituted for ``$PARTICIPANT`` when
            ``second_ref`` is empty.
        sentence: Template string containing the placeholder tokens as
            standalone space-separated words.
        second_ref: Selects the participant wording. Empty (the default)
            uses ``other_participant``. ``'someone'`` drops the token right
            before ``$PARTICIPANT`` (e.g. "the") and inserts "someone", or
            "Someone" when it ends up sentence-initial. Any other value is
            inserted verbatim.
        context: Unused; kept so existing callers remain valid.

    Returns:
        The instantiated sentence with pronoun placeholders replaced by
        ``"MASK"``.

    Raises:
        ValueError: If ``$OCCUPATION`` or ``$PARTICIPANT`` is not present as
            a standalone token in ``sentence``.
    """
    toks = sentence.split(" ")
    occ_index = toks.index("$OCCUPATION")
    part_index = toks.index("$PARTICIPANT")
    toks[occ_index] = occupation

    if not second_ref:
        # Use the instantiated participant, e.g. "client", "patient", ...
        toks[part_index] = other_participant
    elif second_ref != 'someone':
        toks[part_index] = second_ref
    else:
        # Bleached NP "someone": drop the token preceding $PARTICIPANT
        # (i.e. "the"). Only do so when such a token exists; with
        # part_index == 0 a slice ending at -1 would duplicate the sentence
        # instead of removing anything.
        if part_index > 0:
            del toks[part_index - 1]
            part_index -= 1
        toks[part_index] = "Someone" if part_index == 0 else "someone"

    pronoun_slots = {"$NOM_PRONOUN", "$POSS_PRONOUN", "$ACC_PRONOUN"}
    return " ".join("MASK" if tok in pronoun_slots else tok for tok in toks)
|
57 |
+
# %%
|
58 |
+
|
59 |
+
|
60 |
+
def get_sentences():
    """Generate every masked sentence variant and save them as a TSV.

    Works inside the ``winogender_schema`` directory located next to this
    file (created if missing): templates are read from ``templates.tsv`` and
    the results are written to ``all_sentences.tsv`` with a
    ``sentid``/``sentence`` header row.

    For each template four sentences are produced, in this order: one using
    the template's own participant, then one each with the participant
    replaced by "someone", "man" and "woman". Sentence ids have the form
    ``{occupation}_{participant-label}_{answer}``.

    Returns:
        An ``OrderedDict`` mapping sentence id to masked sentence, in the
        same order the rows are written.
    """
    schema_dir = os.path.join(os.path.dirname(__file__), "winogender_schema")
    Path(schema_dir).mkdir(parents=True, exist_ok=True)

    templates = load_templates(os.path.join(schema_dir, "templates.tsv"))

    out_path = os.path.join(schema_dir, "all_sentences.tsv")
    with open(out_path, 'w', newline='') as csvfile:
        sentence_writer = csv.writer(csvfile, delimiter='\t')
        sentence_writer.writerow(['sentid', 'sentence'])
        sentence_dict = OrderedDict()

        for occupation, other_participant, answer, sentence in templates:
            # (label used in the sentence id, second_ref handed to generate)
            variants = (
                (other_participant, ""),
                ("someone", "someone"),
                ("man", "man"),
                ("woman", "woman"),
            )
            rows = [
                [
                    f"{occupation}_{label}_{answer}",
                    generate(occupation, other_participant, sentence,
                             second_ref=ref),
                ]
                for label, ref in variants
            ]

            for sentid, text in rows:
                sentence_dict[sentid] = text
            sentence_writer.writerows(rows)

    return sentence_dict
|
104 |
+
|
105 |
+
|