data and utils
- data/definitional_pairs.json +1 -0
- data/equalize_pairs.json +1 -0
- data/female_word_file.txt +221 -0
- data/gender_specific_full.json +1 -0
- data/male_word_file.txt +221 -0
- eval.py +424 -0
- utils.py +406 -0
data/definitional_pairs.json
ADDED
@@ -0,0 +1 @@
[["woman", "man"], ["girl", "boy"], ["she", "he"], ["mother", "father"], ["daughter", "son"], ["gal", "guy"], ["female", "male"], ["her", "his"], ["herself", "himself"], ["Mary", "John"]]
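These ten definitional pairs are the seed set commonly used to locate a gender direction in the embedding space. A minimal sketch of one typical use, assuming an embedding matrix wv, a word-to-index map w2i, and the doPCA helper from utils.py below:

import json

from utils import doPCA

with open('data/definitional_pairs.json') as f:
    definitional_pairs = json.load(f)  # [["woman", "man"], ...]

# doPCA centers each pair around its midpoint and fits PCA on the residuals;
# the top principal component serves as the gender direction.
pca = doPCA(definitional_pairs, wv, w2i)
gender_direction = pca.components_[0]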
data/equalize_pairs.json
ADDED
@@ -0,0 +1 @@
[["monastery", "convent"], ["spokesman", "spokeswoman"], ["Catholic_priest", "nun"], ["Dad", "Mom"], ["Men", "Women"], ["councilman", "councilwoman"], ["grandpa", "grandma"], ["grandsons", "granddaughters"], ["prostate_cancer", "ovarian_cancer"], ["testosterone", "estrogen"], ["uncle", "aunt"], ["husbands", "wives"], ["Father", "Mother"], ["Grandpa", "Grandma"], ["He", "She"], ["boy", "girl"], ["boys", "girls"], ["brother", "sister"], ["brothers", "sisters"], ["businessman", "businesswoman"], ["chairman", "chairwoman"], ["colt", "filly"], ["congressman", "congresswoman"], ["dad", "mom"], ["dads", "moms"], ["dudes", "gals"], ["ex_boyfriend", "ex_girlfriend"], ["father", "mother"], ["fatherhood", "motherhood"], ["fathers", "mothers"], ["fella", "granny"], ["fraternity", "sorority"], ["gelding", "mare"], ["gentleman", "lady"], ["gentlemen", "ladies"], ["grandfather", "grandmother"], ["grandson", "granddaughter"], ["he", "she"], ["himself", "herself"], ["his", "her"], ["king", "queen"], ["kings", "queens"], ["male", "female"], ["males", "females"], ["man", "woman"], ["men", "women"], ["nephew", "niece"], ["prince", "princess"], ["schoolboy", "schoolgirl"], ["son", "daughter"], ["sons", "daughters"], ["twin_brother", "twin_sister"]]
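Equalize pairs drive the equalize step of hard debiasing: after gender-neutral words are projected off the gender direction, each pair above is recentered so both members are exactly equidistant from that direction. A rough sketch of that step (not part of this commit), assuming unit-length word vectors and a unit-length gender direction g, and reusing drop from utils.py below:

import numpy as np

from utils import drop

def equalize_pair(a, b, wv, w2i, g):
    # gender-neutral part of the pair's midpoint
    mu = (wv[w2i[a]] + wv[w2i[b]]) / 2.0
    nu = drop(mu, g)
    # re-place both words symmetrically around nu along g, keeping unit norm
    z = np.sqrt(max(1.0 - nu.dot(nu), 0.0))
    if (wv[w2i[a]] - wv[w2i[b]]).dot(g) < 0:
        z = -z
    wv[w2i[a]] = nu + z * g
    wv[w2i[b]] = nu - z * g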
data/female_word_file.txt
ADDED
@@ -0,0 +1,221 @@
countrywoman
sororal
witches
maidservant
mothers
diva
actress
spinster
mama
duchesses
barwoman
countrywomen
dowry
hostesses
suitors
airwomen
menopause
clitoris
princess
governesses
abbess
women
widow
ladies
sorceresses
madam
brides
baroness
housewives
godesses
niece
widows
lady
sister
brides
nun
adultresses
obstetrics
bellgirls
her
marchioness
princesses
empresses
mare
chairwoman
convent
priestesses
girlhood
ladies
queen
gals
mommies
maid
female_ejaculation
spokeswoman
seamstress
cowgirls
chick
spinsters
hair_salon
empress
mommy
feminism
gals
enchantress
gal
motherhood
estrogen
camerawomen
godmother
strongwoman
goddess
matriarch
aunt
chairwomen
ma'am
sisterhood
hostess
estradiol
wife
mom
stewardess
females
viagra
spokeswomen
ma
belle
minx
maiden
witch
miss
nieces
mothered
cow
belles
councilwomen
landlords
granddaughter
fiancees
stepmothers
horsemen
grandmothers
adultress
schoolgirl
hen
granddaughters
bachelorette
camerawoman
moms
her
mistress
lass
policewoman
nun
actresses
saleswomen
girlfriend
councilwoman
lady
stateswoman
maternal
lass
landlady
sistren
ladies
wenches
sorority
bellgirl
duchess
ballerina
chicks
fiancee
fillies
wives
suitress
paternity
she
businesswoman
masseuses
heroine
doe
busgirls
girlfriends
queens
sisters
mistresses
stepmother
daughter
minxes
cowgirl
lady
daughters
mezzo
saleswoman
mistress
hostess
nuns
maids
mrs.
headmistresses
lasses
congresswoman
airwoman
housewife
priestess
barwomen
barnoesses
abbesses
handywoman
toque
sororities
stewardesses
filly
czarina
stepdaughters
herself
girls
lionesses
lady
vagina
hers
masseuse
cows
aunts
wench
toques
wife
lioness
sorceress
effeminate
mother
lesbians
female
waitresses
ovum
skene_gland
stepdaughter
womb
businesswomen
heiress
waitress
headmistress
woman
governess
godess
bride
grandma
bride
gal
lesbian
ladies
girl
grandmother
mare
hens
uterus
nuns
maidservants
seamstress'
busgirl
heroines
data/gender_specific_full.json
ADDED
@@ -0,0 +1 @@
["he", "his", "He", "her", "she", "him", "She", "man", "women", "men", "His", "woman", "spokesman", "wife", "himself", "son", "mother", "father", "chairman", "daughter", "husband", "guy", "girls", "girl", "Her", "boy", "King", "boys", "brother", "Chairman", "spokeswoman", "female", "sister", "Women", "Man", "male", "herself", "Lions", "Lady", "brothers", "dad", "actress", "mom", "sons", "girlfriend", "Kings", "Men", "daughters", "Prince", "Queen", "teenager", "lady", "Bulls", "boyfriend", "sisters", "Colts", "mothers", "Sir", "king", "businessman", "Boys", "grandmother", "grandfather", "deer", "cousin", "Woman", "ladies", "Girls", "Father", "uncle", "PA", "Boy", "Councilman", "mum", "Brothers", "MA", "males", "Girl", "Mom", "Guy", "Queens", "congressman", "Dad", "Mother", "grandson", "twins", "bull", "queen", "businessmen", "wives", "widow", "nephew", "bride", "females", "aunt", "Congressman", "prostate_cancer", "lesbian", "chairwoman", "fathers", "Son", "moms", "Ladies", "maiden", "granddaughter", "younger_brother", "Princess", "Guys", "lads", "Ma", "Sons", "lion", "Bachelor", "gentleman", "fraternity", "bachelor", "niece", "Lion", "Sister", "bulls", "husbands", "prince", "colt", "salesman", "Bull", "Sisters", "hers", "dude", "Spokesman", "beard", "filly", "Actress", "Him", "princess", "Brother", "lesbians", "councilman", "actresses", "Viagra", "gentlemen", "stepfather", "Deer", "monks", "Beard", "Uncle", "ex_girlfriend", "lad", "sperm", "Daddy", "testosterone", "MAN", "Female", "nephews", "maid", "daddy", "mare", "fiance", "Wife", "fiancee", "kings", "dads", "waitress", "Male", "maternal", "heroine", "feminist", "Mama", "nieces", "girlfriends", "Councilwoman", "sir", "stud", "Mothers", "mistress", "lions", "estranged_wife", "womb", "Brotherhood", "Statesman", "grandma", "maternity", "estrogen", "ex_boyfriend", "widows", "gelding", "diva", "teenage_girls", "nuns", "Daughter", "czar", "ovarian_cancer", "HE", "Monk", "countrymen", "Grandma", "teenage_girl", "penis", "bloke", "nun", "Husband", "brides", "housewife", "spokesmen", "suitors", "menopause", "monastery", "patriarch", "Beau", "motherhood", "brethren", "stepmother", "Dude", "prostate", "Moms", "hostess", "twin_brother", "Colt", "schoolboy", "eldest", "brotherhood", "Godfather", "fillies", "stepson", "congresswoman", "Chairwoman", "Daughters", "uncles", "witch", "Mommy", "monk", "viagra", "paternity", "suitor", "chick", "Pa", "fianc\u00e9", "sorority", "macho", "Spokeswoman", "businesswoman", "eldest_son", "gal", "statesman", "schoolgirl", "fathered", "goddess", "hubby", "mares", "stepdaughter", "blokes", "dudes", "socialite", "strongman", "Witch", "fianc\u00e9e", "uterus", "grandsons", "Bride", "studs", "mama", "Aunt", "godfather", "hens", "hen", "mommy", "Babe", "estranged_husband", "Fathers", "elder_brother", "boyhood", "baritone", "Diva", "Lesbian", "grandmothers", "grandpa", "boyfriends", "feminism", "countryman", "stallion", "heiress", "queens", "Grandpa", "witches", "aunts", "semen", "fella", "granddaughters", "chap", "knight", "widower", "Maiden", "salesmen", "convent", "KING", "vagina", "beau", "babe", "HIS", "beards", "handyman", "twin_sister", "maids", "gals", "housewives", "Gentlemen", "horsemen", "Businessman", "obstetrics", "fatherhood", "beauty_queen", "councilwoman", "princes", "matriarch", "colts", "manly", "ma", "fraternities", "Spokesmen", "pa", "fellas", "Gentleman", "councilmen", "dowry", "barbershop", "Monks", "WOMAN", "fraternal", "ballerina", "manhood", "Dads", "heroines", "granny", "gynecologist", 
"princesses", "Goddess", "yo", "Granny", "knights", "eldest_daughter", "HER", "underage_girls", "masculinity", "Girlfriend", "bro", "Grandmother", "grandfathers", "crown_prince", "Restless", "paternal", "Queen_Mother", "Boyfriend", "womens", "Males", "SHE", "Countess", "stepchildren", "Belles", "bachelors", "matron", "momma", "Legs", "maidens", "goddesses", "landlady", "sisterhood", "Grandfather", "Fraternity", "Majesty", "Babes", "lass", "maternal_grandmother", "blondes", "ma'am", "Womens", "divorcee", "Momma", "fathering", "Effie", "Lad", "womanhood", "missus", "Sisterhood", "granddad", "Mens", "papa", "gf", "sis", "Husbands", "Hen", "womanizer", "gynecological", "stepsister", "Handsome", "Prince_Charming", "BOY", "stepdad", "teen_ager", "GIRL", "dame", "Sorority", "beauty_pageants", "raspy", "harem", "maternal_grandfather", "Hes", "deliveryman", "septuagenarian", "damsel", "paternal_grandmother", "paramour", "paternal_grandparents", "Nun", "DAD", "mothering", "shes", "HE_'S", "Nuns", "teenage_daughters", "auntie", "widowed_mother", "Girlfriends", "FATHER", "virile", "COUPLE", "grandmas", "Hubby", "nan", "vixen", "Joan_Crawford", "stepdaughters", "endometrial_cancer", "stepsons", "loins", "Grandson", "Mitchells", "erections", "Matron", "Fella", "daddies", "ter", "Sweetie", "Dudes", "Princesses", "Lads", "lioness", "Mamma", "virility", "bros", "womenfolk", "Heir", "BROTHERS", "manliness", "patriarchs", "earl", "sisterly", "Whore", "Gynaecology", "countess", "convents", "Oratory", "witch_doctor", "mamas", "yah", "aunty", "aunties", "Heiress", "lasses", "Breasts", "fairer_sex", "sorority_sisters", "WIFE", "Laurels", "penile", "nuh", "mah", "toms", "mam", "Granddad", "premenopausal_women", "Granddaddy", "nana", "coeds", "dames", "herdsman", "Mammy", "Fellas", "Niece", "menfolk", "Grandad", "bloods", "Gramps", "damsels", "Granddaughter", "mamma", "concubine", "Oros", "Blarney", "filial", "broads", "Ethel_Kennedy", "ACTRESS", "Tit", "fianc", "Hunk", "Night_Shift", "wifey", "Lothario", "Holy_Roman_Emperor", "horse_breeder", "grandnephew", "Lewises", "Muscular", "feminist_movement", "Sanan", "women\u00e2_\u20ac_\u2122", "Fiancee", "dowries", "Carmelite", "rah", "n_roller", "bay_filly", "belles", "Uncles", "PRINCESS", "womans", "Homeboy", "Blokes", "Charmer", "codger", "Delta_Zeta", "courtesans", "grandaughter", "SISTER", "Highness", "grandbabies", "crone", "Skip_Away", "noblewoman", "bf", "jane", "philandering_husband", "Sisqo", "mammy", "daugher", "director_Skip_Bertman", "DAUGHTER", "Royal_Highness", "mannish", "spinsters", "Missus", "madame", "Godfathers", "saleswomen", "beaus", "Risha", "luh", "sah", "negligee", "Women\u00e2_\u20ac_\u2122", "Hos", "salesgirl", "grandmom", "Grandmas", "Lawsons", "countrywomen", "Booby", "darlin", "Sheiks", "boyz", "wifes", "Bayi", "Il_Duce", "\u00e2_\u20ac_\u0153My", "fem", "daugther", "Potti", "hussy", "tch", "Gelding", "stemmed_roses", "Damson", "puh", "Tylers", "neice", "Mutha", "GRANDMOTHER", "youse", "spurned_lover", "mae", "Britt_Ekland", "clotheshorse", "Carlita_Kilpatrick", "Cambest", "Pretty_Polly", "banshees", "male_chauvinist", "Arliss", "mommas", "maidservant", "Gale_Harold", "Little_Bo_Peep", "Cleavers", "hags", "blowsy", "Queen_Elizabeth_I.", "lassies", "papas", "BABE", "ugly_ducklings", "Jims", "hellion", "Beautician", "coalminer", "relaxin", "El_Mahroug", "Victoria_Secret_Angel", "shepherdess", "Mosco", "Slacks", "nanna", "wifely", "tomboys", "LAH", "hast", "apo", "Kaplans", "milkmaid", "Robin_Munis", "John_Barleycorn", "royal_highness", 
"Meanie", "NAH", "trollop", "roh", "Jewess", "Sheik_Hamad", "mumsy", "Big_Pussy", "chil_dren", "Aunt_Bea", "basso", "sista", "girlies", "nun_Sister", "chica", "Bubbas", "massa", "Southern_belles", "Nephews", "castrations", "Mister_Ed", "Grandsons", "Calaf", "Malachy_McCourt", "Shamash", "hey_hey", "Harmen", "sonofabitch", "Donovans", "Grannie", "Kalinka", "hisself", "Devean", "goatherd", "hinds", "El_Corredor", "Kens", "notorious_womanizer", "goh", "Mommas", "washerwoman", "Samaira", "Coo_Coo", "Governess", "grandsire", "PRINCE_WILLIAM", "gramma", "him.He", "Coptic_priest", "Corbie", "Kennys", "thathe", "Pa_Pa", "Bristols", "Hotep", "snowy_haired", "El_Prado_Ire", "Girl_hitmaker", "Hurleys", "St._Meinrad", "sexually_perverted", "authoress", "Prudie", "raven_haired_beauty", "Bonos", "domestic_shorthair", "brothas", "nymphet", "Neelma", "Seita", "stud_muffin", "St._Judes", "yenta", "bare_shouldered", "Pinkney_Sr.", "PRINCE_CHARLES", "Bisutti", "sistas", "Blanche_Devereaux", "Momoa", "Quiff", "Scotswoman", "balaclava_clad_men", "Louis_Leakey", "dearie", "vacuum_cleaner_salesman", "grandads", "postulant", "SARAH_JESSICA_PARKER", "AUNT", "Prince_Dauntless", "Dalys", "Darkie", "Czar_Nicholas", "Lion_Hearted", "Boy_recliner", "baby_mamas", "giantess", "Lawd", "GRANNY", "fianc_e", "Bilqis", "WCTU", "famly", "Ellas", "feminazis", "Pentheus", "MAMAS", "Town_Criers", "Saggy", "youngman", "grandam", "divorc\u00e9", "bosomed", "roon", "Simmentals", "eponymous_heroine", "LEYLAND", "REE'", "cain't", "Evelynn", "WAH'", "sistah", "Horners", "Elsie_Poncher", "Coochie", "rat_terriers", "Limousins", "Buchinski", "Schicchi", "Carpitcher", "Khwezi", "HAH'", "Shazza", "Mackeson", "ROH'", "kuya", "novice_nun", "Shei", "Elmasri", "ladykiller", "6yo", "Yenta", "SHEL", "pater", "Souse", "Tahirah", "comedian_Rodney_Dangerfield", "Shottle", "carryin", "Sath", "fa'afafine", "royal_consort", "hus_band", "maternal_uncles", "dressing_provocatively", "dreamgirl", "millionaire_industrialist", "Georgie_Girl", "Must_Be_Obeyed", "joh", "Arabian_stallion", "ahr", "mso_para_margin_0in", "SOO'", "Biddles", "Chincoteague_Volunteer_Fire", "Lisa_Miceli", "gorgeous_brunette", "fianc\u017d", "Moved_fluently", "Afternoon_Deelites", "biker_dude", "Vito_Spatafore", "MICK_JAGGER", "Adesida", "Reineman", "witz", "Djamila", "Glenroe", "daddys", "Romanzi", "gentlewomen", "Dandie_Dinmont_terrier", "Excess_Ire", "By_SYVJ_Staff", "zan", "CONFESSIONS", "Magees", "wimmin", "tash", "Theatrical_Ire", "Prince_Charmings", "chocolate_eclair", "bron", "daughers", "Felly", "fiftyish", "Spritely", "GRANDPA", "distaffer", "Norbertines", "DAH'", "leader_Muammar_Gadaffi", "swains", "Prince_Tomohito", "Honneur", "Soeur", "jouster", "Pharaoh_Amenhotep_III", "QUEEN_ELIZABETH_II", "Ne'er", "Galileo_Ire", "Fools_Crow", "Lannisters", "Devines", "gonzales", "columnist_Ann_Landers", "Moseleys", "hiz", "busch", "roastee", "toyboys", "Sheffields", "grandaunt", "Galvins", "Giongo", "geh", "flame_haired_actress", "Grammarian", "Greg_Evigan", "frontierswoman", "Debele", "rabs", "nymphets", "aai", "BREE", "Shaqs", "ZAY", "pappa", "Housa", "refrigerator_repairman", "artificial_inseminations", "chickie", "Rippa", "teenager_Tracy_Turnblad", "homebred_colt", "Abigaille", "hen_pecked_husband", "businesman", "her.She", "Kaikeyi", "Stittsworth", "self_proclaimed_redneck", "Khella", "NeW", "Evers_Swindell", "Asmerom_Gebreselassie", "Boy_recliners", "Cliff_Claven", "Legge_Bourke", "Costos", "d'_honneur", "sistahs", "Cabble", "sahn", "CROW_AGENCY_Mont", "jezebel", "Harrolds", 
"ROSARIO_DAWSON", "INXS_frontman_Michael_Hutchence", "Gursikh", "Dadas", "VIAGA", "keen_horsewoman", "Theodoric", "Eldery", "lihn", "Alice_Kramden", "Santarina", "radical_cleric_al_Sadr", "Curleys", "SY'", "Fidaa", "Saptapadi", "Actor_Sean_Astin", "Kellita_Smith", "Doly", "Libertina", "Money_McBags", "Chief_Bearhart", "choirgirl", "chestnut_stallion", "VIGRA", "BY_JIM_McCONNELL", "Sal_Vitale", "Trivia_buffs", "kumaris", "fraternal_lodge", "galpals", "Borino_Quinn", "lina", "LATEST_Rapper", "Bezar", "Manro", "bakla", "Grisetti", "blond_bimbo", "spinster_aunt", "gurls", "hiswife", "paleface", "Charlye", "hippie_chicks", "Khalifas", "Picture_JUSTIN_SANSON", "Hepburns", "yez", "ALDER", "Sanussi", "Lil_Sis", "McLoughlins", "Barbra_Jean", "Lulua", "thatshe", "actress_Shohreh_Aghdashloo", "SIR_ANTHONY_HOPKINS", "Gloddy", "ZAH'", "ORANGE_'S", "Danielle_Bimber", "grandmum", "Kulkis", "Brazington", "Marisa_Lenhard_CFA", "SIR_JOHN", "Clareman", "Aqila", "Heavily_tattooed", "Libbys", "thim", "elocutionist", "submissives", "Inja", "rahm", "Agnes_Gooch", "fake_tits", "nancy_boys", "Swaidan", "SHAH'", "ain'ta_bed", "Shumail_Raj", "Duchesse", "diethylstilbestrol_DES", "colt_foal", "unfaithful_lover", "Maseri", "nevah", "SAHN", "Barths", "Toughkenamon", "GUEST_STARS", "him.But", "Donna_Claspell", "gingham_dresses", "Massage_Parlour", "wae", "Wasacz", "Magistra", "vihl", "Smriti_Iraani", "boyish_haircut", "workingwoman", "borthers", "Capuchin_friars", "Nejma", "yes_sirs", "bivocational_pastor", "Grafters", "HOPWOOD", "Nicknamed_Godzilla", "yos", "Berkenfield", "Missis", "sitcom_Designing_Women", "Kafoa", "trainer_Emma_Lavelle", "sadomasochistic_dungeon", "iht", "desperates", "predessor", "wolf_cub", "indigenous_Peruvians", "Livia_Soprano", "troh", "colt_sired", "BOND_HILL", "ihl", "Drydens", "rahs", "Piserchia", "Sonny_Corinthos", "bankrobber", "Fwank", "feisty_redhead", "booze_guzzling", "COOPERS", "actress_Q'orianka_Kilcher", "Cortezar", "twe", "Jacoub", "Cindy_Iannarelli", "Hell_Raiser", "Fondly_referred", "Bridal_Shoppe", "Noleta", "Christinas", "IAGRA", "LaTanya_Richardson", "Sang_Bender", "Assasins", "sorrel_gelding", "septugenarian", "Hissy", "Muqtada_al_Sadr_mook", "Pfeni", "MADRID_AFX_Banco_Santander", "tuchis", "LeVaughn", "Gadzicki", "transvestite_hooker", "Fame_jockey_Laffit", "nun_Sister_Mary", "SAMSONOV", "Mayflower_Madam", "Shaque", "well.He", "Trainer_Julio_Canani", "sorrel_mare", "minivehicle_joint_venture", "wife_Dwina", "Aasiya_AH'_see", "Baratheon", "Rick_O'Shay", "Mammies", "goatie", "Nell_Gwynne", "charmingly_awkward", "Slamma", "DEHL", "Lorenzo_Borghese", "ALMA_Wis.", "Anne_Scurria", "father_Peruvians_alternately", "JULIE_ANDREWS", "Slim_Pickins", "Victoria_Secret_stunner", "BY'", "Sanam_Devdas", "pronounced_luh", "Pasha_Selim", "\u4e2d\u534e", "rson", "maternal_grandmothers", "IOWA_CITY_Ia", "Madame_de_Tourvel", "JAY'", "Sheika_Mozah_bint_Nasser", "Hotsy_Totsy", "D'_Ginto", "singer_Johnny_Paycheck", "uterine_prolapse_surgery", "SCOTTDALE_Pa.", "AdelaideNow_reports", "Marcus_Schenkenberg", "Clyse", "Obiter_Dicta", "comic_Sam_Kinison", "bitties", "ROCKVILLE_Ind.", "swimsuit_calendars", "Decicio_Smith", "Ma_ma", "Rie_Miyazawa", "celibate_chastity", "gwah", "ZAY'", "HER_Majesty", "Defrere", "Las_Madrinas", "\u7c3f_\u8042_\u7ffb", "Bea_Hamill", "ARCADIA_Calif._Trainer", "Bold_Badgett", "stakes_victress", "Hoppin_Frog", "Narumiya", "Flayfil", "hardman_Vinnie_Jones", "Marilyn_Monroe_lookalike", "Kivanc_Tatlitug", "Persis_Khambatta", "SINKING_SPRING_Pa.", "len_3rd", "DEAR_TRYING", 
"Farndon_Cheshire", "Krishna_Madiga", "daughter_Princess_Chulabhorn", "Marshall_Rooster_Cogburn", "Kitty_Kiernan", "Yokich", "Jarou", "Serdaris", "ee_ay", "Montifiore", "Chuderewicz", "Samuel_Le_Bihan", "filly_Proud_Spell", "Umm_Hiba", "pronounced_koo", "Sandy_Fonzo", "KOR'", "Fielder_Civil_kisses", "Federalsburg_Maryland", "Nikah_ceremony", "Brinke_Stevens", "Yakama_Tribal_Council", "Capuchin_Father", "wife_Callista_Bisek", "Beau_Dare", "Bedoni", "Arjun_Punj", "JOHNNY_KNOXVILLE", "cap_tain", "Alderwood_Boys", "Chi_Eta_Phi", "ringleader_Charles_Graner", "Savoies", "Lalla_Salma", "Mrs._Potiphar", "fahn", "name_Taylor_Sumers", "Vernita_Green", "Bollywood_baddie", "BENBROOK_Texas", "Assemblyman_Lou_Papan", "virgin_brides", "Cho_Eun", "CATHY_Freeman", "Uncle_Saul", "Lao_Brewery", "Ibo_tribe", "ruf", "rival_Edurne_Pasaban", "Hei_Shangri_La", "Mommy_dearest", "interest_Angola_Sonogal", "Ger_Monsun", "PUSSYCAT_DOLL", "Crown_Jewels_Condoms", "Lord_Marke", "Patootie", "Nora_Bey", "huntin_shootin", "Minister_Raymond_Tshibanda", "La_Nina_la_NEEN", "signature_Whoppers", "estranged_hubby_Kevin_Federline", "UR'", "pill_poppin", "GEHR'", "purebred_Arabians", "husbandly_duties", "VIAGRA_TIMING", "Hereford_heifer", "hushed_monotone_voice", "Pola_Uddin", "Wee_Jimmy_Krankie", "Kwakwanso", "Our_Galvinator", "shoh", "Codependency_Anonymous_Group", "LA'", "Taufa'ahau", "Invincible_Spirit_colt", "SAH'_dur", "MOUNT_CARMEL_Pa.", "watches_attentively", "SNL_spinoffs", "Seth_Nitschke", "Duns_Berwickshire", "defendant_Colleen_LaRose", "Silky_O'Sullivan", "Highcliff_Farm", "REN'", "Comestar", "Satisfied_Frog", "Jai_Maharashtra", "ATTICA_Ind.", "lover_Larry_Birkhead", "Tami_Megal", "chauvinist_pigs", "Phi_sorority", "Micronesian_immigrant", "Lia_Boldt", "Sugar_Tits", "actress_Kathy_Najimy", "zhoo", "Colombo_underboss", "Katsav_accusers", "Bess_Houdini", "rap_mogul_Diddy", "companions_Khin_Khin", "Van_Het", "Mastoi_tribe", "VITALY", "ROLLING_STONES_rocker", "womanizing_cad", "LILY_COLE", "paternal_grandfathers", "Lt._Col._Kurt_Kosmatka", "Kasseem_Jr.", "Ji_Ji", "Wilburforce", "VIAGRA_DOSE", "English_Sheepdogs", "pronounced_Kah", "Htet_Htet_Oo", "Brisk_Breeze", "Eau_du", "BY_MELANIE_EVANS", "Neovasc_Medical", "British_funnyman_RICKY", "4YO_mare", "Hemaida", "MONKTON", "Mrs_Mujuru", "BaGhana_BaGhana", "Shaaban_Abdel_Rahim", "Edward_Jazlowiecki_lawyer", "Ajman_Stud", "manly_pharaoh_even", "Serra_Madeira_Islands", "FRAY'", "panto_dames", "Khin_Myo", "dancer_Karima_El_Mahroug", "CROWN_Princess", "Baseball_HOFer", "Hasta_la_Pasta", "GIRLS_NEXT_DOOR", "Benedict_Groeschel", "Bousamra", "Ruby_Rubacuori_Ruby", "Monde_Bleu", "Un_homme_qui", "Taylor_Sumers", "Rapper_EMINEM", "Joe_Menchetti", "VAY'", "supermodel_NAOMI_CAMPBELL", "Supermodel_GISELE_BUNDCHEN", "Au_Lait", "Radar_Installed", "THOMAS_TOWNSHIP_Mich.", "Rafinesque", "Herman_Weinrich", "Abraxas_Antelope", "raspy_voiced_rocker", "Manurewa_Cosmopolitan_Club", "Paraone", "THE_LEOPARD", "Boy_Incorporated_LZB", "Dansili_filly", "Lumpy_Rutherford", "unwedded_bliss", "Bhavna_Sharma", "Scarvagh", "en_flagrante", "Mottu_Maid", "Dowager_Queen", "NEEN", "model_Monika_Zsibrita", "ROSIE_PEREZ", "Mattock_Ranger", "Valorous", "Surpreme", "Marwari_businessmen", "Grandparents_aunts", "Kimberley_Vlaeminck", "Lyn_Treece_Boys", "PDX_Update", "Virsa_Punjab", "eyelash_fluttering", "Pi_fraternity", "HUNTLEIGH_Mo.", "novelist_Jilly_Cooper", "Naha_Shuri_temple", "Yasmine_Al_Massri", "Mu_Gamma_Xi", "Mica_Ertegun", "Ocleppo", "VIAGRA_CONTRAINDICATIONS", "daughter_PEACHES", "trainer_Geoff_Wragg", 
"OVERNIGHT_DELIVERY", "Fitts_retiree", "de_Tourvel", "Lil_Lad", "north_easterner", "Aol_Weird_News", "Somewhat_improbably", "Sikh_panth", "Worcester_2m_7f", "Zainab_Jah", "OLYMPIC_medalist", "Enoch_Petrucelly", "collie_Lassie", "LOW'", "clumsiness_Holloway", "ayr", "OHR'", "ROLLING_STONES_guitarist", "LAH'_nee", "Ian_Beefy_Botham", "Awapuni_trainer", "Glamorous_Granny", "Chiang_Ching", "MidAtlantic_Cardiovascular_Associates", "Yeke", "Seaforth_Huron_Expositor", "Westley_Cary_Elwes", "Cate_Blanchett_Veronica_Guerin", "Bellas_Gate", "witch_Glinda", "wives_mistresses", "Woodsville_Walmart", "2YO_colt", "Manav_Sushant_Singh", "Pupi_Avati_Il", "Sigma_Beta_Rho", "Bishop_Christopher_Senyonjo", "Vodou_priest", "Rubel_Chowdhury", "Claddagh_Ring", "TAH'_duh_al", "al_Sadr_mook_TAH'", "ROBIN_GIBB", "GAHN'", "BY_THOMAS_RANSON", "sister_Carine_Jena", "Lyphard_mare", "summa_cum", "Semenya_grandmother_Maputhi", "Clare_Nuns", "Talac", "sex_hormones_androgens", "majeste", "Saint_Ballado_mare", "Carrie_Huchel", "Mae_Dok", "wife_Dieula", "Earnest_Sirls", "spoof_bar_mitzvah", "von_Boetticher", "Audwin_Mosby", "Case_presentationWe", "Vincent_Papandrea", "KRAY'", "Sergi_Benavent", "Le_Poisson", "Von_Cramm", "Patti_Mell", "Raymi_Coya", "Benjamin_BeBe_Winans", "Nana_Akosua", "Auld_Acquaintance", "Desire_Burunga", "Company_Wrangler_Nestea", "ask_Krisy_Plourde", "JUANITA_BYNUM", "livia", "GAMB", "Gail_Rosario_Dawson", "Ramgarhia_Sikh", "Catholic_nun_Sister", "FOUR_WEDDINGS_AND", "Robyn_Scherer", "brother_King_Athelstan", "Santo_Loquasto_Fences", "Wee_Frees", "MARISOL", "Soliloquy_Stakes", "Whatever_Spoetzl", "Marc'Aurelio", "mon_petit", "Sabbar_al_Mashhadani", "KAY'_lee", "m_zah_MAH'", "BY_TAMI_ALTHOFF", "hobbit_Samwise_Gamgee", "Bahiya_Hariri_sister", "daddy_Larry_Birkhead", "Sow_Tracey_Ullman", "coach_Viljo_Nousiainen", "Carmen_Lebbos", "conjoined_twins_Zainab", "Rob_Komosa", "ample_bosomed", "Ageing_rocker", "psychic_Oda"]
data/male_word_file.txt
ADDED
@@ -0,0 +1,221 @@
countryman
fraternal
wizards
manservant
fathers
divo
actor
bachelor
papa
dukes
barman
countrymen
brideprice
hosts
potential_suitors
airmen
andropause
penis
prince
governors
abbot
men
widower
gentlemen
sorcerers
sir
bridegrooms
baron
househusbands
gods
nephew
widowers
lord
brother
grooms
priest
adultors
andrology
bellboys
his
marquis
princes
emperors
stallion
chairman
monastery
priests
boyhood
fellas
king
dudes
daddies
manservant
semen
spokesman
tailor
cowboys
dude
bachelors
barbershop
emperor
daddy
masculism
guys
enchanter
guy
fatherhood
androgen
cameramen
godfather
strongman
god
patriarch
uncle
chairmen
sir
brotherhood
host
testosterone
husband
dad
steward
males
cialis
spokesmen
pa
beau
stud
bachelor
wizard
sir
nephews
fathered
bull
beaus
councilmen
landladies
grandson
fiances
stepfathers
horsewomen
grandfathers
adultor
schoolboy
rooster
grandsons
bachelor
cameraman
dads
him
master
lad
policeman
monk
actors
salesmen
boyfriend
councilman
fella
statesman
paternal
chap
landlord
brethren
lords
blokes
fraternity
bellboy
duke
ballet_dancer
dudes
fiance
colts
husbands
suitor
maternity
he
businessman
masseurs
hero
deer
busboys
boyfriends
kings
brothers
masters
stepfather
son
studs
cowboy
mentleman
sons
baritone
salesman
paramour
male_host
monks
menservants
mr.
headmasters
lads
congressman
airman
househusband
priest
barmen
barons
abbots
handyman
beard
fraternities
stewards
colt
czar
stepsons
himself
boys
lions
gentleman
penis
his
masseur
bulls
uncles
bloke
beards
hubby
lion
sorcerer
macho
father
gays
male
waiters
sperm
prostate
stepson
prostatic_utricle
businessmen
heir
waiter
headmaster
man
governor
god
bridegroom
grandpa
groom
dude
gay
gents
boy
grandfather
gelding
roosters
prostatic_utricle
priests
manservants
stailor
busboy
heros
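The two word files line up entry by entry as male/female counterparts, and the gender-specific list marks vocabulary that debiasing should leave untouched. A small loading sketch (wv, w2i, and vocab are assumed to come from an embedding loaded elsewhere; limit_vocab is defined in utils.py below):

import json

from utils import limit_vocab

with open('data/male_word_file.txt') as f:
    male_words = [line.strip() for line in f if line.strip()]
with open('data/female_word_file.txt') as f:
    female_words = [line.strip() for line in f if line.strip()]
with open('data/gender_specific_full.json') as f:
    gender_specific = json.load(f)

# restrict to well-formed, frequent words, minus the gender-specific ones
vocab_limited, wv_limited, w2i_limited = limit_vocab(wv, w2i, vocab, exclude=gender_specific)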
eval.py
ADDED
@@ -0,0 +1,424 @@
import logging

import numpy as np
import scipy.stats
from six import iteritems
from sklearn.cluster import AgglomerativeClustering, KMeans
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999, fetch_MTurk, fetch_RG65, fetch_RW, fetch_TR9856
from web.datasets.categorization import fetch_AP, fetch_battig, fetch_BLESS, fetch_ESSLI_1a, fetch_ESSLI_2b, \
    fetch_ESSLI_2c
from web.analogy import *
from web.embedding import Embedding
# evaluate_similarity and evaluate_categorization are re-implemented below,
# so only the helpers actually used are imported here.
from web.evaluate import calculate_purity, evaluate_on_semeval_2012_2

logger = logging.getLogger(__name__)


def evaluate_similarity_pearson(w, X, y):
    """
    Calculate Pearson correlation between cosine similarity of the model
    and human rated similarity of word pairs.

    Parameters
    ----------
    w : Embedding or dict
        Embedding or dict instance.

    X: array, shape: (n_samples, 2)
        Word pairs.

    y: vector, shape: (n_samples,)
        Human ratings.

    Returns
    -------
    cor: float
        Pearson correlation.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    missing_words = 0
    words = w.vocabulary.word_id
    for query in X:
        for query_word in query:
            if query_word not in words:
                missing_words += 1
    if missing_words > 0:
        print("Missing {} words. Will replace them with mean vector".format(missing_words))

    # keep only pairs where both words are in the vocabulary
    new_x = []
    new_y = []
    for i in range(len(X)):
        if X[i, 0] in words and X[i, 1] in words:
            new_x.append(X[i])
            new_y.append(y[i])

    X = np.array(new_x)
    y = np.array(new_y)

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])
    scores = np.array([v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2)) for v1, v2 in zip(A, B)])
    return scipy.stats.pearsonr(scores, y.squeeze())


def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs.

    Parameters
    ----------
    w : Embedding or dict
        Embedding or dict instance.

    X: array, shape: (n_samples, 2)
        Word pairs.

    y: vector, shape: (n_samples,)
        Human ratings.

    Returns
    -------
    cor: float
        Spearman correlation.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    missing_words = 0
    words = w.vocabulary.word_id
    for query in X:
        for query_word in query:
            if query_word not in words:
                missing_words += 1
    # if missing_words > 0:
    #     print("Missing {} words. Will replace them with mean vector".format(missing_words))

    new_x = []
    new_y = []
    exist_cnt = 0

    for i in range(len(X)):
        if X[i, 0] in words and X[i, 1] in words:
            new_x.append(X[i])
            new_y.append(y[i])
            exist_cnt += 1

    print('exist {} in {}'.format(exist_cnt, len(X)))
    X = np.array(new_x)
    y = np.array(new_y)

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])
    # scores = np.array([v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2)) for v1, v2 in zip(A, B)])
    scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation


def evaluate_simi(wv, w2i, vocab):
    wv_dict = dict()
    for w in vocab:
        wv_dict[w] = wv[w2i[w], :]
    w = Embedding.from_dict(wv_dict)

    # Calculate results on similarity
    print("Calculating similarity benchmarks")
    similarity_tasks = {
        "WS353": fetch_WS353(),
        "RG65": fetch_RG65(),
        # "WS353R": fetch_WS353(which="relatedness"),
        # "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "MTurk": fetch_MTurk(),
        "RW": fetch_RW(),
        "MEN": fetch_MEN(),
    }

    for name, data in iteritems(similarity_tasks):
        print("Sample data from {}, num of samples: {} : pair \"{}\" and \"{}\" is assigned score {}".format(
            name, len(data.X), data.X[0][0], data.X[0][1], data.y[0]))
        score = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, score))
        # score, p_value = evaluate_similarity_pearson(w, data.X, data.y)
        # print("Pearson correlation of scores on {} {}, p value: {}".format(name, score, p_value))


def evaluate_categorization(w, X, y, method="kmeans", seed=None):
    """
    Evaluate embeddings on categorization task.

    Parameters
    ----------
    w: Embedding or dict
        Embedding to test.

    X: vector, shape: (n_samples, )
        Vector of words.

    y: vector, shape: (n_samples, )
        Vector of cluster assignments.

    method: string, default: "kmeans"
        What method to use. Possible values are "agglomerative", "kmeans", "all".
        If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
        hyperparameter tuning to avoid overfitting).
        If "kmeans" is passed, method will fit KMeans.
        In both cases the number of clusters is preset to the correct value.

    seed: int, default: None
        Seed passed to KMeans.

    Returns
    -------
    purity: float
        Purity of the best obtained clustering.

    Notes
    -----
    KMedoids method was excluded as empirically it didn't improve over KMeans (for categorization
    tasks available in the package).
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert method in ["all", "kmeans", "agglomerative"], "Unrecognized method"

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    new_x = []
    new_y = []
    exist_cnt = 0

    for idx, word in enumerate(X.flatten()):
        if word in w:
            new_x.append(X[idx])
            new_y.append(y[idx])
            exist_cnt += 1

    print('exist {} in {}'.format(exist_cnt, len(X)))
    X = np.array(new_x)
    y = np.array(new_y)

    words = np.vstack([w.get(word, mean_vector) for word in X.flatten()])
    ids = np.random.RandomState(seed).choice(range(len(X)), len(X), replace=False)

    # Evaluate clustering on several hyperparameters of AgglomerativeClustering and KMeans
    best_purity = 0

    if method == "all" or method == "agglomerative":
        best_purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)),
                                                                       affinity="euclidean",
                                                                       linkage="ward").fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using affinity={} linkage={}".format(best_purity, 'euclidean', 'ward'))
        for affinity in ["cosine", "euclidean"]:
            for linkage in ["average", "complete"]:
                purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)),
                                                                          affinity=affinity,
                                                                          linkage=linkage).fit_predict(words[ids]))
                logger.debug("Purity={:.3f} using affinity={} linkage={}".format(purity, affinity, linkage))
                best_purity = max(best_purity, purity)

    if method == "all" or method == "kmeans":
        purity = calculate_purity(y[ids], KMeans(random_state=seed, n_init=10,
                                                 n_clusters=len(set(y))).fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using KMeans".format(purity))
        best_purity = max(purity, best_purity)

    return best_purity


def evaluate_cate(wv, w2i, vocab, method="all", seed=None):
    """
    method: string, default: "all"
        What method to use. Possible values are "agglomerative", "kmeans", "all".
        If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
        hyperparameter tuning to avoid overfitting).
        If "kmeans" is passed, method will fit KMeans.
        In both cases the number of clusters is preset to the correct value.
    seed: int, default: None
        Seed passed to KMeans.
    """
    wv_dict = dict()
    for w in vocab:
        wv_dict[w] = wv[w2i[w], :]
    w = Embedding.from_dict(wv_dict)

    # Calculate results on categorization
    print("Calculating categorization benchmarks")
    categorization_tasks = {
        "AP": fetch_AP(),
        "ESSLI_2c": fetch_ESSLI_2c(),
        "ESSLI_2b": fetch_ESSLI_2b(),
        "ESSLI_1a": fetch_ESSLI_1a(),
        "Battig": fetch_battig(),
        "BLESS": fetch_BLESS(),
    }

    categorization_results = {}

    # Calculate results using helper function
    for name, data in iteritems(categorization_tasks):
        print("Sample data from {}, num of samples: {} : \"{}\" is assigned class {}".format(
            name, len(data.X), data.X[0], data.y[0]))
        categorization_results[name] = evaluate_categorization(w, data.X, data.y, method=method, seed=seed)
        print("Cluster purity on {} {}".format(name, categorization_results[name]))


def evaluate_analogy_google(W, vocab):
    """Evaluate the trained word vectors on the Google analogy tasks."""
    filenames = [
        'capital-common-countries.txt', 'capital-world.txt', 'currency.txt',
        'city-in-state.txt', 'family.txt', 'gram1-adjective-to-adverb.txt',
        'gram2-opposite.txt', 'gram3-comparative.txt', 'gram4-superlative.txt',
        'gram5-present-participle.txt', 'gram6-nationality-adjective.txt',
        'gram7-past-tense.txt', 'gram8-plural.txt', 'gram9-plural-verbs.txt',
    ]
    prefix = '/zf15/tw8cb/summer_2019/code/GloVe/eval/question-data/'

    # to avoid memory overflow, could be increased/decreased
    # depending on system and vocab size
    split_size = 100

    correct_sem = 0  # count correct semantic questions
    correct_syn = 0  # count correct syntactic questions
    correct_tot = 0  # count correct questions
    count_sem = 0  # count all semantic questions
    count_syn = 0  # count all syntactic questions
    count_tot = 0  # count all questions
    full_count = 0  # count all questions, including those with unknown words

    for i in range(len(filenames)):
        with open('%s/%s' % (prefix, filenames[i]), 'r') as f:
            full_data = [line.rstrip().split(' ') for line in f]
            full_count += len(full_data)
            data = [x for x in full_data if all(word in vocab for word in x)]

        indices = np.array([[vocab[word] for word in row] for row in data])
        ind1, ind2, ind3, ind4 = indices.T

        predictions = np.zeros((len(indices),))
        num_iter = int(np.ceil(len(indices) / float(split_size)))
        for j in range(num_iter):
            subset = np.arange(j * split_size, min((j + 1) * split_size, len(ind1)))

            pred_vec = (W[ind2[subset], :] - W[ind1[subset], :]
                        + W[ind3[subset], :])
            # cosine similarity if input W has been normalized
            dist = np.dot(W, pred_vec.T)

            # exclude the three question words from the candidates
            for k in range(len(subset)):
                dist[ind1[subset[k]], k] = -np.inf
                dist[ind2[subset[k]], k] = -np.inf
                dist[ind3[subset[k]], k] = -np.inf

            # predicted word index
            predictions[subset] = np.argmax(dist, 0).flatten()

        val = (ind4 == predictions)  # correct predictions
        count_tot = count_tot + len(ind1)
        correct_tot = correct_tot + sum(val)
        if i < 5:
            count_sem = count_sem + len(ind1)
            correct_sem = correct_sem + sum(val)
        else:
            count_syn = count_syn + len(ind1)
            correct_syn = correct_syn + sum(val)

        print("%s:" % filenames[i])
        print('ACCURACY TOP1: %.2f%% (%d/%d)' %
              (np.mean(val) * 100, np.sum(val), len(val)))

    print('Questions seen/total: %.2f%% (%d/%d)' %
          (100 * count_tot / float(full_count), count_tot, full_count))
    print('Semantic accuracy: %.2f%% (%i/%i)' %
          (100 * correct_sem / float(count_sem), correct_sem, count_sem))
    print('Syntactic accuracy: %.2f%% (%i/%i)' %
          (100 * correct_syn / float(count_syn), correct_syn, count_syn))
    print('Total accuracy: %.2f%% (%i/%i)' % (100 * correct_tot / float(count_tot), correct_tot, count_tot))


def evaluate_analogy_msr(W, vocab, file_name='EN-MSR.txt'):
    """Evaluate the trained word vectors on the MSR analogy task."""
    prefix = '/zf15/tw8cb/summer_2019/code/GloVe/eval/question-data/'

    # to avoid memory overflow, could be increased/decreased
    # depending on system and vocab size
    split_size = 100

    correct_tot = 0  # count correct questions
    count_tot = 0  # count all questions
    full_count = 0  # count all questions, including those with unknown words

    with open('%s/%s' % (prefix, file_name), 'r') as f:
        full_data = []
        for line in f:
            tokens = line.rstrip().split(' ')
            full_data.append([tokens[0], tokens[1], tokens[2], tokens[4]])
        full_count += len(full_data)
        data = [x for x in full_data if all(word in vocab for word in x)]

    indices = np.array([[vocab[word] for word in row] for row in data])
    ind1, ind2, ind3, ind4 = indices.T

    predictions = np.zeros((len(indices),))
    num_iter = int(np.ceil(len(indices) / float(split_size)))
    for j in range(num_iter):
        subset = np.arange(j * split_size, min((j + 1) * split_size, len(ind1)))

        pred_vec = (W[ind2[subset], :] - W[ind1[subset], :]
                    + W[ind3[subset], :])
        # cosine similarity if input W has been normalized
        dist = np.dot(W, pred_vec.T)

        # exclude the three question words from the candidates
        for k in range(len(subset)):
            dist[ind1[subset[k]], k] = -np.inf
            dist[ind2[subset[k]], k] = -np.inf
            dist[ind3[subset[k]], k] = -np.inf

        # predicted word index
        predictions[subset] = np.argmax(dist, 0).flatten()

    val = (ind4 == predictions)  # correct predictions
    count_tot = count_tot + len(ind1)
    correct_tot = correct_tot + sum(val)

    print(len(val))
    print('ACCURACY TOP1-MSR: %.2f%% (%d/%d)' %
          (np.mean(val) * 100, np.sum(val), len(val)))


def evaluate_analogy_semeval2012(w_dict):
    score = evaluate_on_semeval_2012_2(w_dict)['all']
    print("Analogy prediction accuracy on {} {}".format("SemEval2012", score))


def evaluate_ana(wv, w2i, vocab):
    # row-normalize so dot products below are cosine similarities
    d = (np.sum(wv ** 2, 1) ** 0.5)
    W_norm = (wv.T / d).T

    evaluate_analogy_msr(W_norm, w2i)
    evaluate_analogy_google(W_norm, w2i)

    wv_dict = dict()
    for w in vocab:
        wv_dict[w] = W_norm[w2i[w], :]
    w = Embedding.from_dict(wv_dict)
    evaluate_analogy_semeval2012(w)

    # analogy_tasks = {
    #     "Google": fetch_google_analogy(),
    #     "MSR": fetch_msr_analogy()
    # }

    # analogy_results = {}

    # for name, data in iteritems(analogy_tasks):
    #     analogy_results[name] = evaluate_analogy(w, data.X, data.y)
    #     print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))
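A minimal driver for the benchmarks above, assuming the embedding has already been parsed into a matrix wv, an index w2i, and a word list vocab (the loading code is not part of this commit):

from eval import evaluate_simi, evaluate_cate, evaluate_ana

# similarity benchmarks (WS353, RG65, SimLex999, MTurk, RW, MEN)
evaluate_simi(wv, w2i, vocab)

# categorization benchmarks (AP, ESSLI, Battig, BLESS)
evaluate_cate(wv, w2i, vocab, method="all")

# analogy benchmarks (MSR, Google, SemEval2012); rows are normalized internally
evaluate_ana(wv, w2i, vocab)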
utils.py
ADDED
@@ -0,0 +1,406 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import string
|
2 |
+
from tqdm import tqdm
|
3 |
+
import pickle
|
4 |
+
|
5 |
+
import scipy
|
6 |
+
import numpy as np
|
7 |
+
from numpy import linalg as LA
|
8 |
+
from sklearn.decomposition import PCA
|
9 |
+
|
10 |
+
# Experiment 1
|
11 |
+
WEAT_words = {
|
12 |
+
'A':['John', 'Paul', 'Mike', 'Kevin', 'Steve', 'Greg', 'Jeff', 'Bill'],
|
13 |
+
'B':['Amy', 'Joan', 'Lisa', 'Sarah', 'Diana', 'Kate', 'Ann', 'Donna'],
|
14 |
+
'C':['executive', 'management', 'professional', 'corporation', 'salary', 'office', 'business', 'career'],
|
15 |
+
'D':['home', 'parents', 'children', 'family', 'cousins', 'marriage', 'wedding', 'relatives'],
|
16 |
+
'E':['math', 'algebra', 'geometry', 'calculus', 'equations', 'computation', 'numbers', 'addition'],
|
17 |
+
'F':['poetry', 'art', 'dance', 'literature', 'novel', 'symphony', 'drama', 'sculpture'],
|
18 |
+
'G':['science', 'technology', 'physics', 'chemistry', 'einstein', 'nasa', 'experiment', 'astronomy'],
|
19 |
+
'H':['poetry', 'art', 'shakespeare', 'dance', 'literature', 'novel', 'symphony', 'drama'],
|
20 |
+
}
|
21 |
+
|
22 |
+
|
23 |
+
def has_punct(w):
|
24 |
+
|
25 |
+
if any([c in string.punctuation for c in w]):
|
26 |
+
return True
|
27 |
+
return False
|
28 |
+
|
29 |
+
def has_digit(w):
|
30 |
+
|
31 |
+
if any([c in '0123456789' for c in w]):
|
32 |
+
return True
|
33 |
+
return False
|
34 |
+
|
35 |
+
def limit_vocab(wv, w2i, vocab, exclude = None):
|
36 |
+
vocab_limited = []
|
37 |
+
for w in tqdm(vocab[:50000]):
|
38 |
+
if w.lower() != w:
|
39 |
+
continue
|
40 |
+
if len(w) >= 20:
|
41 |
+
continue
|
42 |
+
if has_digit(w):
|
43 |
+
continue
|
44 |
+
if '_' in w:
|
45 |
+
p = [has_punct(subw) for subw in w.split('_')]
|
46 |
+
if not any(p):
|
47 |
+
vocab_limited.append(w)
|
48 |
+
continue
|
49 |
+
if has_punct(w):
|
50 |
+
continue
|
51 |
+
vocab_limited.append(w)
|
52 |
+
|
53 |
+
if exclude:
|
54 |
+
vocab_limited = list(set(vocab_limited) - set(exclude))
|
55 |
+
|
56 |
+
print("size of vocabulary:", len(vocab_limited))
|
57 |
+
|
58 |
+
wv_limited = np.zeros((len(vocab_limited), len(wv[0, :])))
|
59 |
+
for i,w in enumerate(vocab_limited):
|
60 |
+
wv_limited[i,:] = wv[w2i[w],:]
|
61 |
+
|
62 |
+
w2i_limited = {w: i for i, w in enumerate(vocab_limited)}
|
63 |
+
|
64 |
+
return vocab_limited, wv_limited, w2i_limited
|
65 |
+
|
66 |
+
def norm_stand(wv):
|
67 |
+
W_norm = np.zeros(wv.shape)
|
68 |
+
d = (np.sum(wv ** 2, 1) ** (0.5))
|
69 |
+
W_norm = (wv.T / d).T
|
70 |
+
return W_norm
|
71 |
+
|
72 |
+
def normalize(wv):
|
73 |
+
|
74 |
+
# normalize vectors
|
75 |
+
norms = np.apply_along_axis(LA.norm, 1, wv)
|
76 |
+
wv = wv / norms[:, np.newaxis]
|
77 |
+
return wv
|
78 |
+
|
79 |
+
|
80 |
+
def topK(w, wv, w2i, vocab, k=10):
|
81 |
+
|
82 |
+
# extract the word vector for word w
|
83 |
+
idx = w2i[w]
|
84 |
+
vec = wv[idx, :]
|
85 |
+
|
86 |
+
# compute similarity of w with all words in the vocabulary
|
87 |
+
sim = wv.dot(vec)
|
88 |
+
# sim = []
|
89 |
+
# for i in range(len(wv)):
|
90 |
+
# sim.append(1-scipy.spatial.distance.cosine(wv[i, :], vec))
|
91 |
+
# sim = np.array(sim)
|
92 |
+
|
93 |
+
# sort similarities by descending order
|
94 |
+
sort_sim = (sim.argsort())[::-1]
|
95 |
+
|
96 |
+
# choose topK
|
97 |
+
best = sort_sim[:(k+1)]
|
98 |
+
|
99 |
+
return [vocab[i] for i in best if i!=idx]
|
100 |
+
|
101 |
+
|
102 |
+
def similarity(w1, w2, wv, w2i):
|
103 |
+
|
104 |
+
i1 = w2i[w1]
|
105 |
+
i2 = w2i[w2]
|
106 |
+
vec1 = wv[i1, :]
|
107 |
+
vec2 = wv[i2, :]
|
108 |
+
|
109 |
+
return 1-scipy.spatial.distance.cosine(vec1, vec2)
|
110 |
+
|
111 |
+
|
112 |
+
|
113 |
+
def drop(u, v):
|
114 |
+
return u - v * u.dot(v) / v.dot(v)
|
115 |
+
|
116 |
+
from sklearn.decomposition import PCA
|
117 |
+
from sklearn import preprocessing
|
118 |
+
|
119 |
+
def doPCA(pairs, wv, w2i):
|
120 |
+
|
121 |
+
matrix = []
|
122 |
+
cnt = 0
|
123 |
+
|
124 |
+
if type(pairs[0]) is list:
|
125 |
+
for a, b in pairs:
|
126 |
+
if not (a in w2i and b in w2i): continue
|
127 |
+
center = (wv[w2i[a], :] + wv[w2i[b], :])/2
|
128 |
+
matrix.append(wv[w2i[a], :] - center)
|
129 |
+
matrix.append(wv[w2i[b], :] - center)
|
130 |
+
cnt += 1
|
131 |
+
else:
|
132 |
+
for a in pairs:
|
133 |
+
if not (a in w2i): continue
|
134 |
+
matrix.append(wv[w2i[a], :])
|
135 |
+
cnt += 1
|
136 |
+
|
137 |
+
embeds = np.array(matrix)
|
138 |
+
wv_mean = np.mean(np.array(embeds), axis=0)
|
139 |
+
wv_hat = np.zeros(embeds.shape).astype(float)
|
140 |
+
|
141 |
+
for i in range(len(embeds)):
|
142 |
+
wv_hat[i, :] = embeds[i, :] - wv_mean
|
143 |
+
matrix = wv_hat
|
144 |
+
|
145 |
+
matrix = np.array(matrix)
|
146 |
+
pca = PCA()
|
147 |
+
pca.fit(matrix)
|
148 |
+
print('pairs used in PCA: ', cnt)
|
149 |
+
return pca
|
150 |
+
|
151 |
+
# get tuples of biases and counts of masculine/feminine NN for each word (for bias-by-neighbors)
import operator

def bias_by_neighbors(wv, w2i, vocab, gender_bias_bef, size, neighbours_num=100):

    tuples = []

    # take the `size` most female- and most male-biased words by
    # bias-by-projection; with size <= 0, use the whole vocabulary
    sorted_g = sorted(gender_bias_bef.items(), key=operator.itemgetter(1))
    female = [item[0] for item in sorted_g[:size]]
    male = [item[0] for item in sorted_g[-size:]]
    selected = female + male if size > 0 else vocab

    for w in selected:

        top = topK(w, wv, w2i, vocab, k=neighbours_num + 5)[:neighbours_num]

        # count male- vs. female-biased words among the nearest neighbors
        m = 0
        f = 0
        for t in top:
            if gender_bias_bef[t] > 0:
                m += 1
            else:
                f += 1

        tuples.append((w, gender_bias_bef[w], m, f))

    return tuples
def get_tuples_prof(wv, w2i, vocab, words, gender_bias_dict):

    wv = normalize(wv)

    tuples = []
    for w in words:
        if w not in gender_bias_dict:
            continue

        top = topK(w, wv, w2i, vocab, k=105)[:100]

        m = 0
        f = 0
        for t in top:
            if gender_bias_dict[t] > 0:
                m += 1
            else:
                f += 1

        tuples.append((w, gender_bias_dict[w], m, f))

    return tuples
# compute correlation between bias-by-projection and bias-by-neighbors

import scipy.stats

def pearson(a, b):
    return scipy.stats.pearsonr(a, b)

def compute_corr(tuples, i1, i2):

    a = []
    b = []
    for t in tuples:
        a.append(t[i1])
        b.append(t[i2])
    assert(len(a) == len(b))
    print('pearson: ', scipy.stats.pearsonr(a, b))
    print('spearman: ', scipy.stats.spearmanr(a, b))
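
# Hedged usage sketch (not part of the original file): correlate
# bias-by-projection (index 1 in each tuple) with the male-neighbor count
# (index 2) over the 500 most biased words per side; the size is illustrative.
def example_neighbor_correlation(wv, w2i, vocab, gender_bias_dict):
    tuples = bias_by_neighbors(wv, w2i, vocab, gender_bias_dict, size=500)
    compute_corr(tuples, 1, 2)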
# Auxiliary functions

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

def visualize(vectors, y_true, y_pred, ax, title, random_state, num_clusters=2):

    # project the vectors to 2D with t-SNE, then plot them with the marker
    # encoding the true label and the color encoding the predicted cluster
    X_embedded = TSNE(n_components=2, random_state=random_state).fit_transform(vectors)
    for x, p, y in zip(X_embedded, y_pred, y_true):
        if p:
            if y:
                ax.scatter(x[0], x[1], marker='.', c='c')
            else:
                ax.scatter(x[0], x[1], marker='x', c='c')
        else:
            if y:
                ax.scatter(x[0], x[1], marker='.', c='darkviolet')
            else:
                ax.scatter(x[0], x[1], marker='x', c='darkviolet')

    ax.text(.01, .9, title, transform=ax.transAxes, fontsize=15)


def extract_vectors(words, wv, w2i):

    X = [wv[w2i[x], :] for x in words]

    return X
def cluster_and_visualize(words, X, random_state, y_true, num=2):

    y_pred = KMeans(n_clusters=num, random_state=random_state).fit_predict(X)
    # fig, axs = plt.subplots(figsize=(6, 3))
    # visualize(X, y_true, y_pred, axs, 'Original', random_state)

    # alignment between cluster labels and true labels, taking the better of
    # the two possible label assignments
    correct = [1 if item1 == item2 else 0 for (item1, item2) in zip(y_true, y_pred)]
    print('precision', max(sum(correct) / float(len(correct)), 1 - sum(correct) / float(len(correct))))
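
# Hedged usage sketch (not part of the original file): cluster the 500 most
# male- and female-biased words and report how well the clusters align with
# the gender labels; the list size and random seed are illustrative.
def example_cluster_biased_words(wv, w2i, gender_bias_dict, size=500):
    import operator
    sorted_g = sorted(gender_bias_dict.items(), key=operator.itemgetter(1))
    female = [w for w, _ in sorted_g[:size]]
    male = [w for w, _ in sorted_g[-size:]]
    words = male + female
    X = extract_vectors(words, wv, w2i)
    y_true = [1] * size + [0] * size
    cluster_and_visualize(words, X, 1, y_true)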
from sklearn import svm

def train_and_predict(wv, w2i, vocab, size_train, size_test, males, females):

    # train on the first size_train male/female words, test on the rest
    X_train = [wv[w2i[w], :] for w in males[:size_train] + females[:size_train]]
    Y_train = [1] * size_train + [0] * size_train
    X_test = [wv[w2i[w], :] for w in males[size_train:] + females[size_train:]]
    Y_test = [1] * size_test + [0] * size_test

    clf = svm.SVC(gamma='auto')
    clf.fit(X_train, Y_train)

    preds = clf.predict(X_test)

    accuracy = [1 if y == z else 0 for y, z in zip(preds, Y_test)]
    acc = float(sum(accuracy)) / len(accuracy)
    print('accuracy:', acc)

    return acc
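
# Hedged usage sketch (not part of the original file): train an SVM to
# predict the bias label of held-out biased words. The 5000-per-side lists
# and the 4000/1000 train/test split are illustrative, not prescribed here.
def example_train_and_predict(wv, w2i, vocab, gender_bias_dict):
    import operator
    sorted_g = sorted(gender_bias_dict.items(), key=operator.itemgetter(1))
    females = [w for w, _ in sorted_g[:5000]]
    males = [w for w, _ in sorted_g[-5000:]]
    return train_and_predict(wv, w2i, vocab, 4000, 1000, males, females)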
# Auxiliary functions for experiments by Caliskan et al.

import scipy
import scipy.special   # for scipy.special.comb in p_value_sample
import itertools
import random          # for the seeded sampling in p_value_sample
def s_word(w, A, B, wv, w2i, vocab, all_s_words):

    # WEAT association of w: mean similarity to attribute set A minus mean
    # similarity to attribute set B, memoized in all_s_words
    if w in all_s_words:
        return all_s_words[w]

    mean_a = []
    mean_b = []

    for a in A:
        mean_a.append(similarity(w, a, wv, w2i))
    for b in B:
        mean_b.append(similarity(w, b, wv, w2i))

    mean_a = sum(mean_a) / float(len(mean_a))
    mean_b = sum(mean_b) / float(len(mean_b))

    all_s_words[w] = mean_a - mean_b

    return all_s_words[w]
def s_group(X, Y, A, B, wv, w2i, vocab, all_s_words):

    # WEAT test statistic: summed association of X minus that of Y
    total = 0
    for x in X:
        total += s_word(x, A, B, wv, w2i, vocab, all_s_words)
    for y in Y:
        total -= s_word(y, A, B, wv, w2i, vocab, all_s_words)

    return total
def p_value_exhust(X, Y, A, B, wv, w2i, vocab):

    if len(X) > 10:
        print('might take too long, use sampled version: p_value')
        return

    assert(len(X) == len(Y))

    all_s_words = {}
    s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)

    # exhaustive permutation test: the fraction of equal-size repartitions of
    # X u Y whose test statistic exceeds the observed one
    union = set(X + Y)
    subset_size = int(len(union) / 2)

    larger = 0
    total = 0
    for subset in set(itertools.combinations(union, subset_size)):
        total += 1
        Xi = list(set(subset))
        Yi = list(union - set(subset))
        if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
            larger += 1
    print('num of samples', total)
    return larger / float(total)
def association_diff(t, A, B, wv, w2i):

    mean_a = []
    mean_b = []

    for a in A:
        mean_a.append(similarity(t, a, wv, w2i))
    for b in B:
        mean_b.append(similarity(t, b, wv, w2i))

    mean_a = sum(mean_a) / float(len(mean_a))
    mean_b = sum(mean_b) / float(len(mean_b))

    return mean_a - mean_b
def effect_size(X, Y, A, B, wv, w2i, vocab):

    assert(len(X) == len(Y))
    assert(len(A) == len(B))

    norm_x = []
    norm_y = []

    for x in X:
        norm_x.append(association_diff(x, A, B, wv, w2i))
    for y in Y:
        norm_y.append(association_diff(y, A, B, wv, w2i))

    # Cohen's-d-style effect size: difference of the mean associations,
    # divided by the standard deviation over all target words
    std = np.std(norm_x + norm_y, ddof=1)
    norm_x = sum(norm_x) / float(len(norm_x))
    norm_y = sum(norm_y) / float(len(norm_y))

    return (norm_x - norm_y) / std
def p_value_sample(X, Y, A, B, wv, w2i, vocab):

    random.seed(10)
    np.random.seed(10)
    all_s_words = {}

    assert(len(X) == len(Y))
    length = len(X)

    s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)

    num_of_samples = min(1000000, int(scipy.special.comb(length * 2, length) * 100))
    print('num of samples', num_of_samples)
    larger = 0
    for i in range(num_of_samples):
        permute = np.random.permutation(X + Y)
        Xi = permute[:length]
        Yi = permute[length:]
        if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
            larger += 1

    return larger / float(num_of_samples)
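
# Hedged usage sketch (not part of the original file): run a small WEAT-style
# test. The target/attribute sets below are illustrative placeholders; real
# experiments use the word sets from Caliskan et al. (2017).
def example_weat(wv, w2i, vocab):
    X = ['science', 'technology', 'physics', 'chemistry']  # target set 1
    Y = ['poetry', 'art', 'dance', 'literature']           # target set 2
    A = ['he', 'man', 'his', 'himself']                    # attribute set 1
    B = ['she', 'woman', 'her', 'herself']                 # attribute set 2
    print('effect size:', effect_size(X, Y, A, B, wv, w2i, vocab))
    print('p-value:', p_value_exhust(X, Y, A, B, wv, w2i, vocab))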