Anon Anon committed on
Commit
de623fb
·
1 Parent(s): 15ca15a

Create winogender_sentences.py

Browse files
Files changed (1) hide show
  1. winogender_sentences.py +105 -0
winogender_sentences.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ######################################################################
2
+ ##
3
+ ## This script is a lightly modified version of that provided in winogender-schemas
4
+ ## https://github.com/rudinger/winogender-schemas
5
+ ##
6
+ ######################################################################
7
+
8
+ import csv
9
+ import os
10
+ from pathlib import Path
11
+ from collections import OrderedDict
12
+
13
+ # This script fully instantiates the 120 templates in ../data/templates.tsv
14
+ # to generate the 720 sentences in ../data/all_sentences.tsv
15
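+ # This version does not print to stdout: call get_sentences() (no arguments), which writes winogender_schema/all_sentences.tsv next to this script and returns the sentences as a dict.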
16
+
17
def load_templates(path):
    """Read the template TSV at *path* and return its rows as tuples.

    The first line is treated as a header and skipped.  Every remaining
    non-blank line is stripped of surrounding whitespace, split on tabs, and
    its first four fields are kept.

    Args:
        path: Location of the tab-separated template file.

    Returns:
        A list of ``(occupation, other_participant, answer, sentence)``
        tuples of strings, in file order.  Empty if the file has no data rows.

    Raises:
        IndexError: If a non-blank row has fewer than four tab-separated
            fields.
    """
    templates = []
    # The context manager guarantees the handle is closed, even when a
    # malformed row raises part-way through the file.
    with open(path, 'r', encoding='utf-8') as fp:
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(fp, None)
        for line in fp:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at EOF) carry no data.
                continue
            fields = stripped.split('\t')
            templates.append((fields[0], fields[1], fields[2], fields[3]))
    return templates
27
+
28
def generate(occupation, other_participant, sentence, second_ref="", context=None):
    """Instantiate one template sentence and mask its pronoun slot.

    The sentence is split on single spaces.  The first ``$OCCUPATION`` token
    is replaced by *occupation* and the first ``$PARTICIPANT`` token by the
    chosen participant reference.  Every token that is exactly
    ``$NOM_PRONOUN``, ``$POSS_PRONOUN`` or ``$ACC_PRONOUN`` becomes ``MASK``.

    Args:
        occupation: Text substituted for ``$OCCUPATION``.
        other_participant: Text substituted for ``$PARTICIPANT`` when
            *second_ref* is empty.
        sentence: Space-separated template containing both placeholders.
        second_ref: Alternative participant reference.  ``""`` uses
            *other_participant*; ``"someone"`` drops the token preceding
            ``$PARTICIPANT`` (its article) and inserts "someone", capitalised
            when it ends up sentence-initial; any other value is inserted
            verbatim.
        context: Unused; kept so existing callers remain compatible.

    Returns:
        The instantiated sentence with pronoun placeholders masked.

    Raises:
        ValueError: If ``$OCCUPATION`` or ``$PARTICIPANT`` is not a token of
            *sentence*.
    """
    toks = sentence.split(" ")
    occ_index = toks.index("$OCCUPATION")
    part_index = toks.index("$PARTICIPANT")
    toks[occ_index] = occupation

    if not second_ref:
        # Use the instantiated participant, e.g. "client", "patient", ...
        toks[part_index] = other_participant
    elif second_ref != 'someone':
        toks[part_index] = second_ref
    else:
        # Bleached NP "someone": remove the token before $PARTICIPANT (the
        # article).  Guard against $PARTICIPANT being the very first token;
        # slicing with index -1 there would duplicate most of the sentence.
        if part_index > 0:
            del toks[part_index - 1]
            part_index -= 1
        toks[part_index] = "Someone" if part_index == 0 else "someone"

    pronoun_slots = {"$NOM_PRONOUN", "$POSS_PRONOUN", "$ACC_PRONOUN"}
    return " ".join("MASK" if tok in pronoun_slots else tok for tok in toks)
57
+ # %%
58
+
59
+
60
def get_sentences():
    """Build every masked sentence variant and save them as a TSV file.

    Templates are loaded from ``winogender_schema/templates.tsv`` located
    beside this script (the directory is created if missing).  For each
    template four sentences are generated: one with the template's own
    participant, and one each with "someone", "man" and "woman" as the
    participant.  All of them are written to
    ``winogender_schema/all_sentences.tsv`` under a ``sentid``/``sentence``
    header.

    Returns:
        An ``OrderedDict`` mapping each sentence id
        (``<occupation>_<participant label>_<answer>``) to its sentence, in
        generation order.
    """
    schema_dir = os.path.join(os.path.dirname(__file__), "winogender_schema")
    Path(schema_dir).mkdir(parents=True, exist_ok=True)

    templates = load_templates(os.path.join(schema_dir, "templates.tsv"))
    output_path = os.path.join(schema_dir, "all_sentences.tsv")

    sentence_dict = OrderedDict()
    with open(output_path, 'w', newline='') as out_file:
        writer = csv.writer(out_file, delimiter='\t')
        writer.writerow(['sentid', 'sentence'])

        for occupation, other_participant, answer, sentence in templates:
            # (label used in the sentence id, second_ref passed to generate)
            variants = (
                (other_participant, ""),
                ("someone", 'someone'),
                ("man", 'man'),
                ("woman", 'woman'),
            )
            rows = []
            for label, ref in variants:
                sentid = f"{occupation}_{label}_{answer}"
                text = generate(
                    occupation, other_participant, sentence, second_ref=ref)
                sentence_dict[sentid] = text
                rows.append([sentid, text])
            # Rows for a template are written only once all four variants
            # have been generated.
            writer.writerows(rows)

    return sentence_dict
104
+
105
+