data and utils
- data/definitional_pairs.json +1 -0
- data/equalize_pairs.json +1 -0
- data/female_word_file.txt +221 -0
- data/gender_specific_full.json +1 -0
- data/male_word_file.txt +221 -0
- eval.py +424 -0
- utils.py +406 -0
data/definitional_pairs.json
ADDED
@@ -0,0 +1 @@
[["woman", "man"], ["girl", "boy"], ["she", "he"], ["mother", "father"], ["daughter", "son"], ["gal", "guy"], ["female", "male"], ["her", "his"], ["herself", "himself"], ["Mary", "John"]]
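These ten definitional pairs are the seed set commonly used to locate a gender direction in the embedding space. A minimal sketch of one typical use, assuming an embedding matrix wv, a word-to-index map w2i, and the doPCA helper from utils.py below:

import json

from utils import doPCA

with open('data/definitional_pairs.json') as f:
    definitional_pairs = json.load(f)  # [["woman", "man"], ...]

# doPCA centers each pair around its midpoint and fits PCA on the residuals;
# the top principal component serves as the gender direction.
pca = doPCA(definitional_pairs, wv, w2i)
gender_direction = pca.components_[0]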
data/equalize_pairs.json
ADDED
@@ -0,0 +1 @@
[["monastery", "convent"], ["spokesman", "spokeswoman"], ["Catholic_priest", "nun"], ["Dad", "Mom"], ["Men", "Women"], ["councilman", "councilwoman"], ["grandpa", "grandma"], ["grandsons", "granddaughters"], ["prostate_cancer", "ovarian_cancer"], ["testosterone", "estrogen"], ["uncle", "aunt"], ["husbands", "wives"], ["Father", "Mother"], ["Grandpa", "Grandma"], ["He", "She"], ["boy", "girl"], ["boys", "girls"], ["brother", "sister"], ["brothers", "sisters"], ["businessman", "businesswoman"], ["chairman", "chairwoman"], ["colt", "filly"], ["congressman", "congresswoman"], ["dad", "mom"], ["dads", "moms"], ["dudes", "gals"], ["ex_boyfriend", "ex_girlfriend"], ["father", "mother"], ["fatherhood", "motherhood"], ["fathers", "mothers"], ["fella", "granny"], ["fraternity", "sorority"], ["gelding", "mare"], ["gentleman", "lady"], ["gentlemen", "ladies"], ["grandfather", "grandmother"], ["grandson", "granddaughter"], ["he", "she"], ["himself", "herself"], ["his", "her"], ["king", "queen"], ["kings", "queens"], ["male", "female"], ["males", "females"], ["man", "woman"], ["men", "women"], ["nephew", "niece"], ["prince", "princess"], ["schoolboy", "schoolgirl"], ["son", "daughter"], ["sons", "daughters"], ["twin_brother", "twin_sister"]]
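Equalize pairs drive the equalize step of hard debiasing: after gender-neutral words are projected off the gender direction, each pair above is recentered so both members are exactly equidistant from that direction. A rough sketch of that step (not part of this commit), assuming unit-length word vectors and a unit-length gender direction g, and reusing drop from utils.py below:

import numpy as np

from utils import drop

def equalize_pair(a, b, wv, w2i, g):
    # gender-neutral part of the pair's midpoint
    mu = (wv[w2i[a]] + wv[w2i[b]]) / 2.0
    nu = drop(mu, g)
    # re-place both words symmetrically around nu along g, keeping unit norm
    z = np.sqrt(max(1.0 - nu.dot(nu), 0.0))
    if (wv[w2i[a]] - wv[w2i[b]]).dot(g) < 0:
        z = -z
    wv[w2i[a]] = nu + z * g
    wv[w2i[b]] = nu - z * g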
data/female_word_file.txt
ADDED
@@ -0,0 +1,221 @@
countrywoman
sororal
witches
maidservant
mothers
diva
actress
spinster
mama
duchesses
barwoman
countrywomen
dowry
hostesses
suitors
airwomen
menopause
clitoris
princess
governesses
abbess
women
widow
ladies
sorceresses
madam
brides
baroness
housewives
godesses
niece
widows
lady
sister
brides
nun
adultresses
obstetrics
bellgirls
her
marchioness
princesses
empresses
mare
chairwoman
convent
priestesses
girlhood
ladies
queen
gals
mommies
maid
female_ejaculation
spokeswoman
seamstress
cowgirls
chick
spinsters
hair_salon
empress
mommy
feminism
gals
enchantress
gal
motherhood
estrogen
camerawomen
godmother
strongwoman
goddess
matriarch
aunt
chairwomen
ma'am
sisterhood
hostess
estradiol
wife
mom
stewardess
females
viagra
spokeswomen
ma
belle
minx
maiden
witch
miss
nieces
mothered
cow
belles
councilwomen
landlords
granddaughter
fiancees
stepmothers
horsemen
grandmothers
adultress
schoolgirl
hen
granddaughters
bachelorette
camerawoman
moms
her
mistress
lass
policewoman
nun
actresses
saleswomen
girlfriend
councilwoman
lady
stateswoman
maternal
lass
landlady
sistren
ladies
wenches
sorority
bellgirl
duchess
ballerina
chicks
fiancee
fillies
wives
suitress
paternity
she
businesswoman
masseuses
heroine
doe
busgirls
girlfriends
queens
sisters
mistresses
stepmother
daughter
minxes
cowgirl
lady
daughters
mezzo
saleswoman
mistress
hostess
nuns
maids
mrs.
headmistresses
lasses
congresswoman
airwoman
housewife
priestess
barwomen
barnoesses
abbesses
handywoman
toque
sororities
stewardesses
filly
czarina
stepdaughters
herself
girls
lionesses
lady
vagina
hers
masseuse
cows
aunts
wench
toques
wife
lioness
sorceress
effeminate
mother
lesbians
female
waitresses
ovum
skene_gland
stepdaughter
womb
businesswomen
heiress
waitress
headmistress
woman
governess
godess
bride
grandma
bride
gal
lesbian
ladies
girl
grandmother
mare
hens
uterus
nuns
maidservants
seamstress'
busgirl
heroines
data/gender_specific_full.json
ADDED
@@ -0,0 +1 @@
["he", "his", "He", "her", "she", "him", "She", "man", "women", "men", "His", "woman", "spokesman", "wife", "himself", "son", "mother", "father", "chairman", "daughter", "husband", "guy", "girls", "girl", "Her", "boy", "King", "boys", "brother", "Chairman", "spokeswoman", "female", "sister", "Women", "Man", "male", "herself", "Lions", "Lady", "brothers", "dad", "actress", "mom", "sons", "girlfriend", "Kings", "Men", "daughters", "Prince", "Queen", "teenager", "lady", "Bulls", "boyfriend", "sisters", "Colts", "mothers", "Sir", "king", "businessman", "Boys", "grandmother", "grandfather", "deer", "cousin", "Woman", "ladies", "Girls", "Father", "uncle", "PA", "Boy", "Councilman", "mum", "Brothers", "MA", "males", "Girl", "Mom", "Guy", "Queens", "congressman", "Dad", "Mother", "grandson", "twins", "bull", "queen", "businessmen", "wives", "widow", "nephew", "bride", "females", "aunt", "Congressman", "prostate_cancer", "lesbian", "chairwoman", "fathers", "Son", "moms", "Ladies", "maiden", "granddaughter", "younger_brother", "Princess", "Guys", "lads", "Ma", "Sons", "lion", "Bachelor", "gentleman", "fraternity", "bachelor", "niece", "Lion", "Sister", "bulls", "husbands", "prince", "colt", "salesman", "Bull", "Sisters", "hers", "dude", "Spokesman", "beard", "filly", "Actress", "Him", "princess", "Brother", "lesbians", "councilman", "actresses", "Viagra", "gentlemen", "stepfather", "Deer", "monks", "Beard", "Uncle", "ex_girlfriend", "lad", "sperm", "Daddy", "testosterone", "MAN", "Female", "nephews", "maid", "daddy", "mare", "fiance", "Wife", "fiancee", "kings", "dads", "waitress", "Male", "maternal", "heroine", "feminist", "Mama", "nieces", "girlfriends", "Councilwoman", "sir", "stud", "Mothers", "mistress", "lions", "estranged_wife", "womb", "Brotherhood", "Statesman", "grandma", "maternity", "estrogen", "ex_boyfriend", "widows", "gelding", "diva", "teenage_girls", "nuns", "Daughter", "czar", "ovarian_cancer", "HE", "Monk", "countrymen", "Grandma", "teenage_girl", "penis", "bloke", "nun", "Husband", "brides", "housewife", "spokesmen", "suitors", "menopause", "monastery", "patriarch", "Beau", "motherhood", "brethren", "stepmother", "Dude", "prostate", "Moms", "hostess", "twin_brother", "Colt", "schoolboy", "eldest", "brotherhood", "Godfather", "fillies", "stepson", "congresswoman", "Chairwoman", "Daughters", "uncles", "witch", "Mommy", "monk", "viagra", "paternity", "suitor", "chick", "Pa", "fianc\u00e9", "sorority", "macho", "Spokeswoman", "businesswoman", "eldest_son", "gal", "statesman", "schoolgirl", "fathered", "goddess", "hubby", "mares", "stepdaughter", "blokes", "dudes", "socialite", "strongman", "Witch", "fianc\u00e9e", "uterus", "grandsons", "Bride", "studs", "mama", "Aunt", "godfather", "hens", "hen", "mommy", "Babe", "estranged_husband", "Fathers", "elder_brother", "boyhood", "baritone", "Diva", "Lesbian", "grandmothers", "grandpa", "boyfriends", "feminism", "countryman", "stallion", "heiress", "queens", "Grandpa", "witches", "aunts", "semen", "fella", "granddaughters", "chap", "knight", "widower", "Maiden", "salesmen", "convent", "KING", "vagina", "beau", "babe", "HIS", "beards", "handyman", "twin_sister", "maids", "gals", "housewives", "Gentlemen", "horsemen", "Businessman", "obstetrics", "fatherhood", "beauty_queen", "councilwoman", "princes", "matriarch", "colts", "manly", "ma", "fraternities", "Spokesmen", "pa", "fellas", "Gentleman", "councilmen", "dowry", "barbershop", "Monks", "WOMAN", "fraternal", "ballerina", "manhood", "Dads", "heroines", "granny", "gynecologist", 
"princesses", "Goddess", "yo", "Granny", "knights", "eldest_daughter", "HER", "underage_girls", "masculinity", "Girlfriend", "bro", "Grandmother", "grandfathers", "crown_prince", "Restless", "paternal", "Queen_Mother", "Boyfriend", "womens", "Males", "SHE", "Countess", "stepchildren", "Belles", "bachelors", "matron", "momma", "Legs", "maidens", "goddesses", "landlady", "sisterhood", "Grandfather", "Fraternity", "Majesty", "Babes", "lass", "maternal_grandmother", "blondes", "ma'am", "Womens", "divorcee", "Momma", "fathering", "Effie", "Lad", "womanhood", "missus", "Sisterhood", "granddad", "Mens", "papa", "gf", "sis", "Husbands", "Hen", "womanizer", "gynecological", "stepsister", "Handsome", "Prince_Charming", "BOY", "stepdad", "teen_ager", "GIRL", "dame", "Sorority", "beauty_pageants", "raspy", "harem", "maternal_grandfather", "Hes", "deliveryman", "septuagenarian", "damsel", "paternal_grandmother", "paramour", "paternal_grandparents", "Nun", "DAD", "mothering", "shes", "HE_'S", "Nuns", "teenage_daughters", "auntie", "widowed_mother", "Girlfriends", "FATHER", "virile", "COUPLE", "grandmas", "Hubby", "nan", "vixen", "Joan_Crawford", "stepdaughters", "endometrial_cancer", "stepsons", "loins", "Grandson", "Mitchells", "erections", "Matron", "Fella", "daddies", "ter", "Sweetie", "Dudes", "Princesses", "Lads", "lioness", "Mamma", "virility", "bros", "womenfolk", "Heir", "BROTHERS", "manliness", "patriarchs", "earl", "sisterly", "Whore", "Gynaecology", "countess", "convents", "Oratory", "witch_doctor", "mamas", "yah", "aunty", "aunties", "Heiress", "lasses", "Breasts", "fairer_sex", "sorority_sisters", "WIFE", "Laurels", "penile", "nuh", "mah", "toms", "mam", "Granddad", "premenopausal_women", "Granddaddy", "nana", "coeds", "dames", "herdsman", "Mammy", "Fellas", "Niece", "menfolk", "Grandad", "bloods", "Gramps", "damsels", "Granddaughter", "mamma", "concubine", "Oros", "Blarney", "filial", "broads", "Ethel_Kennedy", "ACTRESS", "Tit", "fianc", "Hunk", "Night_Shift", "wifey", "Lothario", "Holy_Roman_Emperor", "horse_breeder", "grandnephew", "Lewises", "Muscular", "feminist_movement", "Sanan", "women\u00e2_\u20ac_\u2122", "Fiancee", "dowries", "Carmelite", "rah", "n_roller", "bay_filly", "belles", "Uncles", "PRINCESS", "womans", "Homeboy", "Blokes", "Charmer", "codger", "Delta_Zeta", "courtesans", "grandaughter", "SISTER", "Highness", "grandbabies", "crone", "Skip_Away", "noblewoman", "bf", "jane", "philandering_husband", "Sisqo", "mammy", "daugher", "director_Skip_Bertman", "DAUGHTER", "Royal_Highness", "mannish", "spinsters", "Missus", "madame", "Godfathers", "saleswomen", "beaus", "Risha", "luh", "sah", "negligee", "Women\u00e2_\u20ac_\u2122", "Hos", "salesgirl", "grandmom", "Grandmas", "Lawsons", "countrywomen", "Booby", "darlin", "Sheiks", "boyz", "wifes", "Bayi", "Il_Duce", "\u00e2_\u20ac_\u0153My", "fem", "daugther", "Potti", "hussy", "tch", "Gelding", "stemmed_roses", "Damson", "puh", "Tylers", "neice", "Mutha", "GRANDMOTHER", "youse", "spurned_lover", "mae", "Britt_Ekland", "clotheshorse", "Carlita_Kilpatrick", "Cambest", "Pretty_Polly", "banshees", "male_chauvinist", "Arliss", "mommas", "maidservant", "Gale_Harold", "Little_Bo_Peep", "Cleavers", "hags", "blowsy", "Queen_Elizabeth_I.", "lassies", "papas", "BABE", "ugly_ducklings", "Jims", "hellion", "Beautician", "coalminer", "relaxin", "El_Mahroug", "Victoria_Secret_Angel", "shepherdess", "Mosco", "Slacks", "nanna", "wifely", "tomboys", "LAH", "hast", "apo", "Kaplans", "milkmaid", "Robin_Munis", "John_Barleycorn", "royal_highness", 
"Meanie", "NAH", "trollop", "roh", "Jewess", "Sheik_Hamad", "mumsy", "Big_Pussy", "chil_dren", "Aunt_Bea", "basso", "sista", "girlies", "nun_Sister", "chica", "Bubbas", "massa", "Southern_belles", "Nephews", "castrations", "Mister_Ed", "Grandsons", "Calaf", "Malachy_McCourt", "Shamash", "hey_hey", "Harmen", "sonofabitch", "Donovans", "Grannie", "Kalinka", "hisself", "Devean", "goatherd", "hinds", "El_Corredor", "Kens", "notorious_womanizer", "goh", "Mommas", "washerwoman", "Samaira", "Coo_Coo", "Governess", "grandsire", "PRINCE_WILLIAM", "gramma", "him.He", "Coptic_priest", "Corbie", "Kennys", "thathe", "Pa_Pa", "Bristols", "Hotep", "snowy_haired", "El_Prado_Ire", "Girl_hitmaker", "Hurleys", "St._Meinrad", "sexually_perverted", "authoress", "Prudie", "raven_haired_beauty", "Bonos", "domestic_shorthair", "brothas", "nymphet", "Neelma", "Seita", "stud_muffin", "St._Judes", "yenta", "bare_shouldered", "Pinkney_Sr.", "PRINCE_CHARLES", "Bisutti", "sistas", "Blanche_Devereaux", "Momoa", "Quiff", "Scotswoman", "balaclava_clad_men", "Louis_Leakey", "dearie", "vacuum_cleaner_salesman", "grandads", "postulant", "SARAH_JESSICA_PARKER", "AUNT", "Prince_Dauntless", "Dalys", "Darkie", "Czar_Nicholas", "Lion_Hearted", "Boy_recliner", "baby_mamas", "giantess", "Lawd", "GRANNY", "fianc_e", "Bilqis", "WCTU", "famly", "Ellas", "feminazis", "Pentheus", "MAMAS", "Town_Criers", "Saggy", "youngman", "grandam", "divorc\u00e9", "bosomed", "roon", "Simmentals", "eponymous_heroine", "LEYLAND", "REE'", "cain't", "Evelynn", "WAH'", "sistah", "Horners", "Elsie_Poncher", "Coochie", "rat_terriers", "Limousins", "Buchinski", "Schicchi", "Carpitcher", "Khwezi", "HAH'", "Shazza", "Mackeson", "ROH'", "kuya", "novice_nun", "Shei", "Elmasri", "ladykiller", "6yo", "Yenta", "SHEL", "pater", "Souse", "Tahirah", "comedian_Rodney_Dangerfield", "Shottle", "carryin", "Sath", "fa'afafine", "royal_consort", "hus_band", "maternal_uncles", "dressing_provocatively", "dreamgirl", "millionaire_industrialist", "Georgie_Girl", "Must_Be_Obeyed", "joh", "Arabian_stallion", "ahr", "mso_para_margin_0in", "SOO'", "Biddles", "Chincoteague_Volunteer_Fire", "Lisa_Miceli", "gorgeous_brunette", "fianc\u017d", "Moved_fluently", "Afternoon_Deelites", "biker_dude", "Vito_Spatafore", "MICK_JAGGER", "Adesida", "Reineman", "witz", "Djamila", "Glenroe", "daddys", "Romanzi", "gentlewomen", "Dandie_Dinmont_terrier", "Excess_Ire", "By_SYVJ_Staff", "zan", "CONFESSIONS", "Magees", "wimmin", "tash", "Theatrical_Ire", "Prince_Charmings", "chocolate_eclair", "bron", "daughers", "Felly", "fiftyish", "Spritely", "GRANDPA", "distaffer", "Norbertines", "DAH'", "leader_Muammar_Gadaffi", "swains", "Prince_Tomohito", "Honneur", "Soeur", "jouster", "Pharaoh_Amenhotep_III", "QUEEN_ELIZABETH_II", "Ne'er", "Galileo_Ire", "Fools_Crow", "Lannisters", "Devines", "gonzales", "columnist_Ann_Landers", "Moseleys", "hiz", "busch", "roastee", "toyboys", "Sheffields", "grandaunt", "Galvins", "Giongo", "geh", "flame_haired_actress", "Grammarian", "Greg_Evigan", "frontierswoman", "Debele", "rabs", "nymphets", "aai", "BREE", "Shaqs", "ZAY", "pappa", "Housa", "refrigerator_repairman", "artificial_inseminations", "chickie", "Rippa", "teenager_Tracy_Turnblad", "homebred_colt", "Abigaille", "hen_pecked_husband", "businesman", "her.She", "Kaikeyi", "Stittsworth", "self_proclaimed_redneck", "Khella", "NeW", "Evers_Swindell", "Asmerom_Gebreselassie", "Boy_recliners", "Cliff_Claven", "Legge_Bourke", "Costos", "d'_honneur", "sistahs", "Cabble", "sahn", "CROW_AGENCY_Mont", "jezebel", "Harrolds", 
"ROSARIO_DAWSON", "INXS_frontman_Michael_Hutchence", "Gursikh", "Dadas", "VIAGA", "keen_horsewoman", "Theodoric", "Eldery", "lihn", "Alice_Kramden", "Santarina", "radical_cleric_al_Sadr", "Curleys", "SY'", "Fidaa", "Saptapadi", "Actor_Sean_Astin", "Kellita_Smith", "Doly", "Libertina", "Money_McBags", "Chief_Bearhart", "choirgirl", "chestnut_stallion", "VIGRA", "BY_JIM_McCONNELL", "Sal_Vitale", "Trivia_buffs", "kumaris", "fraternal_lodge", "galpals", "Borino_Quinn", "lina", "LATEST_Rapper", "Bezar", "Manro", "bakla", "Grisetti", "blond_bimbo", "spinster_aunt", "gurls", "hiswife", "paleface", "Charlye", "hippie_chicks", "Khalifas", "Picture_JUSTIN_SANSON", "Hepburns", "yez", "ALDER", "Sanussi", "Lil_Sis", "McLoughlins", "Barbra_Jean", "Lulua", "thatshe", "actress_Shohreh_Aghdashloo", "SIR_ANTHONY_HOPKINS", "Gloddy", "ZAH'", "ORANGE_'S", "Danielle_Bimber", "grandmum", "Kulkis", "Brazington", "Marisa_Lenhard_CFA", "SIR_JOHN", "Clareman", "Aqila", "Heavily_tattooed", "Libbys", "thim", "elocutionist", "submissives", "Inja", "rahm", "Agnes_Gooch", "fake_tits", "nancy_boys", "Swaidan", "SHAH'", "ain'ta_bed", "Shumail_Raj", "Duchesse", "diethylstilbestrol_DES", "colt_foal", "unfaithful_lover", "Maseri", "nevah", "SAHN", "Barths", "Toughkenamon", "GUEST_STARS", "him.But", "Donna_Claspell", "gingham_dresses", "Massage_Parlour", "wae", "Wasacz", "Magistra", "vihl", "Smriti_Iraani", "boyish_haircut", "workingwoman", "borthers", "Capuchin_friars", "Nejma", "yes_sirs", "bivocational_pastor", "Grafters", "HOPWOOD", "Nicknamed_Godzilla", "yos", "Berkenfield", "Missis", "sitcom_Designing_Women", "Kafoa", "trainer_Emma_Lavelle", "sadomasochistic_dungeon", "iht", "desperates", "predessor", "wolf_cub", "indigenous_Peruvians", "Livia_Soprano", "troh", "colt_sired", "BOND_HILL", "ihl", "Drydens", "rahs", "Piserchia", "Sonny_Corinthos", "bankrobber", "Fwank", "feisty_redhead", "booze_guzzling", "COOPERS", "actress_Q'orianka_Kilcher", "Cortezar", "twe", "Jacoub", "Cindy_Iannarelli", "Hell_Raiser", "Fondly_referred", "Bridal_Shoppe", "Noleta", "Christinas", "IAGRA", "LaTanya_Richardson", "Sang_Bender", "Assasins", "sorrel_gelding", "septugenarian", "Hissy", "Muqtada_al_Sadr_mook", "Pfeni", "MADRID_AFX_Banco_Santander", "tuchis", "LeVaughn", "Gadzicki", "transvestite_hooker", "Fame_jockey_Laffit", "nun_Sister_Mary", "SAMSONOV", "Mayflower_Madam", "Shaque", "well.He", "Trainer_Julio_Canani", "sorrel_mare", "minivehicle_joint_venture", "wife_Dwina", "Aasiya_AH'_see", "Baratheon", "Rick_O'Shay", "Mammies", "goatie", "Nell_Gwynne", "charmingly_awkward", "Slamma", "DEHL", "Lorenzo_Borghese", "ALMA_Wis.", "Anne_Scurria", "father_Peruvians_alternately", "JULIE_ANDREWS", "Slim_Pickins", "Victoria_Secret_stunner", "BY'", "Sanam_Devdas", "pronounced_luh", "Pasha_Selim", "\u4e2d\u534e", "rson", "maternal_grandmothers", "IOWA_CITY_Ia", "Madame_de_Tourvel", "JAY'", "Sheika_Mozah_bint_Nasser", "Hotsy_Totsy", "D'_Ginto", "singer_Johnny_Paycheck", "uterine_prolapse_surgery", "SCOTTDALE_Pa.", "AdelaideNow_reports", "Marcus_Schenkenberg", "Clyse", "Obiter_Dicta", "comic_Sam_Kinison", "bitties", "ROCKVILLE_Ind.", "swimsuit_calendars", "Decicio_Smith", "Ma_ma", "Rie_Miyazawa", "celibate_chastity", "gwah", "ZAY'", "HER_Majesty", "Defrere", "Las_Madrinas", "\u7c3f_\u8042_\u7ffb", "Bea_Hamill", "ARCADIA_Calif._Trainer", "Bold_Badgett", "stakes_victress", "Hoppin_Frog", "Narumiya", "Flayfil", "hardman_Vinnie_Jones", "Marilyn_Monroe_lookalike", "Kivanc_Tatlitug", "Persis_Khambatta", "SINKING_SPRING_Pa.", "len_3rd", "DEAR_TRYING", 
"Farndon_Cheshire", "Krishna_Madiga", "daughter_Princess_Chulabhorn", "Marshall_Rooster_Cogburn", "Kitty_Kiernan", "Yokich", "Jarou", "Serdaris", "ee_ay", "Montifiore", "Chuderewicz", "Samuel_Le_Bihan", "filly_Proud_Spell", "Umm_Hiba", "pronounced_koo", "Sandy_Fonzo", "KOR'", "Fielder_Civil_kisses", "Federalsburg_Maryland", "Nikah_ceremony", "Brinke_Stevens", "Yakama_Tribal_Council", "Capuchin_Father", "wife_Callista_Bisek", "Beau_Dare", "Bedoni", "Arjun_Punj", "JOHNNY_KNOXVILLE", "cap_tain", "Alderwood_Boys", "Chi_Eta_Phi", "ringleader_Charles_Graner", "Savoies", "Lalla_Salma", "Mrs._Potiphar", "fahn", "name_Taylor_Sumers", "Vernita_Green", "Bollywood_baddie", "BENBROOK_Texas", "Assemblyman_Lou_Papan", "virgin_brides", "Cho_Eun", "CATHY_Freeman", "Uncle_Saul", "Lao_Brewery", "Ibo_tribe", "ruf", "rival_Edurne_Pasaban", "Hei_Shangri_La", "Mommy_dearest", "interest_Angola_Sonogal", "Ger_Monsun", "PUSSYCAT_DOLL", "Crown_Jewels_Condoms", "Lord_Marke", "Patootie", "Nora_Bey", "huntin_shootin", "Minister_Raymond_Tshibanda", "La_Nina_la_NEEN", "signature_Whoppers", "estranged_hubby_Kevin_Federline", "UR'", "pill_poppin", "GEHR'", "purebred_Arabians", "husbandly_duties", "VIAGRA_TIMING", "Hereford_heifer", "hushed_monotone_voice", "Pola_Uddin", "Wee_Jimmy_Krankie", "Kwakwanso", "Our_Galvinator", "shoh", "Codependency_Anonymous_Group", "LA'", "Taufa'ahau", "Invincible_Spirit_colt", "SAH'_dur", "MOUNT_CARMEL_Pa.", "watches_attentively", "SNL_spinoffs", "Seth_Nitschke", "Duns_Berwickshire", "defendant_Colleen_LaRose", "Silky_O'Sullivan", "Highcliff_Farm", "REN'", "Comestar", "Satisfied_Frog", "Jai_Maharashtra", "ATTICA_Ind.", "lover_Larry_Birkhead", "Tami_Megal", "chauvinist_pigs", "Phi_sorority", "Micronesian_immigrant", "Lia_Boldt", "Sugar_Tits", "actress_Kathy_Najimy", "zhoo", "Colombo_underboss", "Katsav_accusers", "Bess_Houdini", "rap_mogul_Diddy", "companions_Khin_Khin", "Van_Het", "Mastoi_tribe", "VITALY", "ROLLING_STONES_rocker", "womanizing_cad", "LILY_COLE", "paternal_grandfathers", "Lt._Col._Kurt_Kosmatka", "Kasseem_Jr.", "Ji_Ji", "Wilburforce", "VIAGRA_DOSE", "English_Sheepdogs", "pronounced_Kah", "Htet_Htet_Oo", "Brisk_Breeze", "Eau_du", "BY_MELANIE_EVANS", "Neovasc_Medical", "British_funnyman_RICKY", "4YO_mare", "Hemaida", "MONKTON", "Mrs_Mujuru", "BaGhana_BaGhana", "Shaaban_Abdel_Rahim", "Edward_Jazlowiecki_lawyer", "Ajman_Stud", "manly_pharaoh_even", "Serra_Madeira_Islands", "FRAY'", "panto_dames", "Khin_Myo", "dancer_Karima_El_Mahroug", "CROWN_Princess", "Baseball_HOFer", "Hasta_la_Pasta", "GIRLS_NEXT_DOOR", "Benedict_Groeschel", "Bousamra", "Ruby_Rubacuori_Ruby", "Monde_Bleu", "Un_homme_qui", "Taylor_Sumers", "Rapper_EMINEM", "Joe_Menchetti", "VAY'", "supermodel_NAOMI_CAMPBELL", "Supermodel_GISELE_BUNDCHEN", "Au_Lait", "Radar_Installed", "THOMAS_TOWNSHIP_Mich.", "Rafinesque", "Herman_Weinrich", "Abraxas_Antelope", "raspy_voiced_rocker", "Manurewa_Cosmopolitan_Club", "Paraone", "THE_LEOPARD", "Boy_Incorporated_LZB", "Dansili_filly", "Lumpy_Rutherford", "unwedded_bliss", "Bhavna_Sharma", "Scarvagh", "en_flagrante", "Mottu_Maid", "Dowager_Queen", "NEEN", "model_Monika_Zsibrita", "ROSIE_PEREZ", "Mattock_Ranger", "Valorous", "Surpreme", "Marwari_businessmen", "Grandparents_aunts", "Kimberley_Vlaeminck", "Lyn_Treece_Boys", "PDX_Update", "Virsa_Punjab", "eyelash_fluttering", "Pi_fraternity", "HUNTLEIGH_Mo.", "novelist_Jilly_Cooper", "Naha_Shuri_temple", "Yasmine_Al_Massri", "Mu_Gamma_Xi", "Mica_Ertegun", "Ocleppo", "VIAGRA_CONTRAINDICATIONS", "daughter_PEACHES", "trainer_Geoff_Wragg", 
"OVERNIGHT_DELIVERY", "Fitts_retiree", "de_Tourvel", "Lil_Lad", "north_easterner", "Aol_Weird_News", "Somewhat_improbably", "Sikh_panth", "Worcester_2m_7f", "Zainab_Jah", "OLYMPIC_medalist", "Enoch_Petrucelly", "collie_Lassie", "LOW'", "clumsiness_Holloway", "ayr", "OHR'", "ROLLING_STONES_guitarist", "LAH'_nee", "Ian_Beefy_Botham", "Awapuni_trainer", "Glamorous_Granny", "Chiang_Ching", "MidAtlantic_Cardiovascular_Associates", "Yeke", "Seaforth_Huron_Expositor", "Westley_Cary_Elwes", "Cate_Blanchett_Veronica_Guerin", "Bellas_Gate", "witch_Glinda", "wives_mistresses", "Woodsville_Walmart", "2YO_colt", "Manav_Sushant_Singh", "Pupi_Avati_Il", "Sigma_Beta_Rho", "Bishop_Christopher_Senyonjo", "Vodou_priest", "Rubel_Chowdhury", "Claddagh_Ring", "TAH'_duh_al", "al_Sadr_mook_TAH'", "ROBIN_GIBB", "GAHN'", "BY_THOMAS_RANSON", "sister_Carine_Jena", "Lyphard_mare", "summa_cum", "Semenya_grandmother_Maputhi", "Clare_Nuns", "Talac", "sex_hormones_androgens", "majeste", "Saint_Ballado_mare", "Carrie_Huchel", "Mae_Dok", "wife_Dieula", "Earnest_Sirls", "spoof_bar_mitzvah", "von_Boetticher", "Audwin_Mosby", "Case_presentationWe", "Vincent_Papandrea", "KRAY'", "Sergi_Benavent", "Le_Poisson", "Von_Cramm", "Patti_Mell", "Raymi_Coya", "Benjamin_BeBe_Winans", "Nana_Akosua", "Auld_Acquaintance", "Desire_Burunga", "Company_Wrangler_Nestea", "ask_Krisy_Plourde", "JUANITA_BYNUM", "livia", "GAMB", "Gail_Rosario_Dawson", "Ramgarhia_Sikh", "Catholic_nun_Sister", "FOUR_WEDDINGS_AND", "Robyn_Scherer", "brother_King_Athelstan", "Santo_Loquasto_Fences", "Wee_Frees", "MARISOL", "Soliloquy_Stakes", "Whatever_Spoetzl", "Marc'Aurelio", "mon_petit", "Sabbar_al_Mashhadani", "KAY'_lee", "m_zah_MAH'", "BY_TAMI_ALTHOFF", "hobbit_Samwise_Gamgee", "Bahiya_Hariri_sister", "daddy_Larry_Birkhead", "Sow_Tracey_Ullman", "coach_Viljo_Nousiainen", "Carmen_Lebbos", "conjoined_twins_Zainab", "Rob_Komosa", "ample_bosomed", "Ageing_rocker", "psychic_Oda"]
data/male_word_file.txt
ADDED
@@ -0,0 +1,221 @@
countryman
fraternal
wizards
manservant
fathers
divo
actor
bachelor
papa
dukes
barman
countrymen
brideprice
hosts
potential_suitors
airmen
andropause
penis
prince
governors
abbot
men
widower
gentlemen
sorcerers
sir
bridegrooms
baron
househusbands
gods
nephew
widowers
lord
brother
grooms
priest
adultors
andrology
bellboys
his
marquis
princes
emperors
stallion
chairman
monastery
priests
boyhood
fellas
king
dudes
daddies
manservant
semen
spokesman
tailor
cowboys
dude
bachelors
barbershop
emperor
daddy
masculism
guys
enchanter
guy
fatherhood
androgen
cameramen
godfather
strongman
god
patriarch
uncle
chairmen
sir
brotherhood
host
testosterone
husband
dad
steward
males
cialis
spokesmen
pa
beau
stud
bachelor
wizard
sir
nephews
fathered
bull
beaus
councilmen
landladies
grandson
fiances
stepfathers
horsewomen
grandfathers
adultor
schoolboy
rooster
grandsons
bachelor
cameraman
dads
him
master
lad
policeman
monk
actors
salesmen
boyfriend
councilman
fella
statesman
paternal
chap
landlord
brethren
lords
blokes
fraternity
bellboy
duke
ballet_dancer
dudes
fiance
colts
husbands
suitor
maternity
he
businessman
masseurs
hero
deer
busboys
boyfriends
kings
brothers
masters
stepfather
son
studs
cowboy
mentleman
sons
baritone
salesman
paramour
male_host
monks
menservants
mr.
headmasters
lads
congressman
airman
househusband
priest
barmen
barons
abbots
handyman
beard
fraternities
stewards
colt
czar
stepsons
himself
boys
lions
gentleman
penis
his
masseur
bulls
uncles
bloke
beards
hubby
lion
sorcerer
macho
father
gays
male
waiters
sperm
prostate
stepson
prostatic_utricle
businessmen
heir
waiter
headmaster
man
governor
god
bridegroom
grandpa
groom
dude
gay
gents
boy
grandfather
gelding
roosters
prostatic_utricle
priests
manservants
stailor
busboy
heros
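The two word files line up entry by entry as male/female counterparts, and the gender-specific list marks vocabulary that debiasing should leave untouched. A small loading sketch (wv, w2i, and vocab are assumed to come from an embedding loaded elsewhere; limit_vocab is defined in utils.py below):

import json

from utils import limit_vocab

with open('data/male_word_file.txt') as f:
    male_words = [line.strip() for line in f if line.strip()]
with open('data/female_word_file.txt') as f:
    female_words = [line.strip() for line in f if line.strip()]
with open('data/gender_specific_full.json') as f:
    gender_specific = json.load(f)

# restrict to well-formed, frequent words, minus the gender-specific ones
vocab_limited, wv_limited, w2i_limited = limit_vocab(wv, w2i, vocab, exclude=gender_specific)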
eval.py
ADDED
@@ -0,0 +1,424 @@
import logging

import numpy as np
import scipy.stats
from six import iteritems
from sklearn.cluster import AgglomerativeClustering, KMeans
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999, fetch_MTurk, fetch_RG65, fetch_RW, fetch_TR9856
from web.datasets.categorization import fetch_AP, fetch_battig, fetch_BLESS, fetch_ESSLI_1a, fetch_ESSLI_2b, \
    fetch_ESSLI_2c
from web.analogy import *
from web.embedding import Embedding
# evaluate_similarity and evaluate_categorization are re-implemented below,
# so only the helpers actually used are imported here.
from web.evaluate import calculate_purity, evaluate_on_semeval_2012_2

logger = logging.getLogger(__name__)


def evaluate_similarity_pearson(w, X, y):
    """
    Calculate Pearson correlation between cosine similarity of the model
    and human rated similarity of word pairs.

    Parameters
    ----------
    w : Embedding or dict
        Embedding or dict instance.

    X: array, shape: (n_samples, 2)
        Word pairs.

    y: vector, shape: (n_samples,)
        Human ratings.

    Returns
    -------
    cor: float
        Pearson correlation.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    missing_words = 0
    words = w.vocabulary.word_id
    for query in X:
        for query_word in query:
            if query_word not in words:
                missing_words += 1
    if missing_words > 0:
        print("Missing {} words. Will replace them with mean vector".format(missing_words))

    # keep only pairs where both words are in the vocabulary
    new_x = []
    new_y = []
    for i in range(len(X)):
        if X[i, 0] in words and X[i, 1] in words:
            new_x.append(X[i])
            new_y.append(y[i])

    X = np.array(new_x)
    y = np.array(new_y)

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])
    scores = np.array([v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2)) for v1, v2 in zip(A, B)])
    return scipy.stats.pearsonr(scores, y.squeeze())


def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs.

    Parameters
    ----------
    w : Embedding or dict
        Embedding or dict instance.

    X: array, shape: (n_samples, 2)
        Word pairs.

    y: vector, shape: (n_samples,)
        Human ratings.

    Returns
    -------
    cor: float
        Spearman correlation.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    missing_words = 0
    words = w.vocabulary.word_id
    for query in X:
        for query_word in query:
            if query_word not in words:
                missing_words += 1
    # if missing_words > 0:
    #     print("Missing {} words. Will replace them with mean vector".format(missing_words))

    new_x = []
    new_y = []
    exist_cnt = 0

    for i in range(len(X)):
        if X[i, 0] in words and X[i, 1] in words:
            new_x.append(X[i])
            new_y.append(y[i])
            exist_cnt += 1

    print('exist {} in {}'.format(exist_cnt, len(X)))
    X = np.array(new_x)
    y = np.array(new_y)

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])
    # scores = np.array([v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2)) for v1, v2 in zip(A, B)])
    scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation


def evaluate_simi(wv, w2i, vocab):
    wv_dict = dict()
    for w in vocab:
        wv_dict[w] = wv[w2i[w], :]
    w = Embedding.from_dict(wv_dict)

    # Calculate results on similarity
    print("Calculating similarity benchmarks")
    similarity_tasks = {
        "WS353": fetch_WS353(),
        "RG65": fetch_RG65(),
        # "WS353R": fetch_WS353(which="relatedness"),
        # "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "MTurk": fetch_MTurk(),
        "RW": fetch_RW(),
        "MEN": fetch_MEN(),
    }

    for name, data in iteritems(similarity_tasks):
        print("Sample data from {}, num of samples: {} : pair \"{}\" and \"{}\" is assigned score {}".format(
            name, len(data.X), data.X[0][0], data.X[0][1], data.y[0]))
        score = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, score))
        # score, p_value = evaluate_similarity_pearson(w, data.X, data.y)
        # print("Pearson correlation of scores on {} {}, p value: {}".format(name, score, p_value))


def evaluate_categorization(w, X, y, method="kmeans", seed=None):
    """
    Evaluate embeddings on categorization task.

    Parameters
    ----------
    w: Embedding or dict
        Embedding to test.

    X: vector, shape: (n_samples, )
        Vector of words.

    y: vector, shape: (n_samples, )
        Vector of cluster assignments.

    method: string, default: "kmeans"
        What method to use. Possible values are "agglomerative", "kmeans", "all".
        If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
        hyperparameter tuning to avoid overfitting).
        If "kmeans" is passed, method will fit KMeans.
        In both cases the number of clusters is preset to the correct value.

    seed: int, default: None
        Seed passed to KMeans.

    Returns
    -------
    purity: float
        Purity of the best obtained clustering.

    Notes
    -----
    KMedoids method was excluded as empirically it didn't improve over KMeans (for categorization
    tasks available in the package).
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert method in ["all", "kmeans", "agglomerative"], "Unrecognized method"

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    new_x = []
    new_y = []
    exist_cnt = 0

    for idx, word in enumerate(X.flatten()):
        if word in w:
            new_x.append(X[idx])
            new_y.append(y[idx])
            exist_cnt += 1

    print('exist {} in {}'.format(exist_cnt, len(X)))
    X = np.array(new_x)
    y = np.array(new_y)

    words = np.vstack([w.get(word, mean_vector) for word in X.flatten()])
    ids = np.random.RandomState(seed).choice(range(len(X)), len(X), replace=False)

    # Evaluate clustering on several hyperparameters of AgglomerativeClustering and KMeans
    best_purity = 0

    if method == "all" or method == "agglomerative":
        best_purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)),
                                                                       affinity="euclidean",
                                                                       linkage="ward").fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using affinity={} linkage={}".format(best_purity, 'euclidean', 'ward'))
        for affinity in ["cosine", "euclidean"]:
            for linkage in ["average", "complete"]:
                purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)),
                                                                          affinity=affinity,
                                                                          linkage=linkage).fit_predict(words[ids]))
                logger.debug("Purity={:.3f} using affinity={} linkage={}".format(purity, affinity, linkage))
                best_purity = max(best_purity, purity)

    if method == "all" or method == "kmeans":
        purity = calculate_purity(y[ids], KMeans(random_state=seed, n_init=10,
                                                 n_clusters=len(set(y))).fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using KMeans".format(purity))
        best_purity = max(purity, best_purity)

    return best_purity


def evaluate_cate(wv, w2i, vocab, method="all", seed=None):
    """
    method: string, default: "all"
        What method to use. Possible values are "agglomerative", "kmeans", "all".
        If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
        hyperparameter tuning to avoid overfitting).
        If "kmeans" is passed, method will fit KMeans.
        In both cases the number of clusters is preset to the correct value.
    seed: int, default: None
        Seed passed to KMeans.
    """
    wv_dict = dict()
    for w in vocab:
        wv_dict[w] = wv[w2i[w], :]
    w = Embedding.from_dict(wv_dict)

    # Calculate results on categorization
    print("Calculating categorization benchmarks")
    categorization_tasks = {
        "AP": fetch_AP(),
        "ESSLI_2c": fetch_ESSLI_2c(),
        "ESSLI_2b": fetch_ESSLI_2b(),
        "ESSLI_1a": fetch_ESSLI_1a(),
        "Battig": fetch_battig(),
        "BLESS": fetch_BLESS(),
    }

    categorization_results = {}

    # Calculate results using helper function
    for name, data in iteritems(categorization_tasks):
        print("Sample data from {}, num of samples: {} : \"{}\" is assigned class {}".format(
            name, len(data.X), data.X[0], data.y[0]))
        categorization_results[name] = evaluate_categorization(w, data.X, data.y, method=method, seed=seed)
        print("Cluster purity on {} {}".format(name, categorization_results[name]))


def evaluate_analogy_google(W, vocab):
    """Evaluate the trained word vectors on the Google analogy tasks."""
    filenames = [
        'capital-common-countries.txt', 'capital-world.txt', 'currency.txt',
        'city-in-state.txt', 'family.txt', 'gram1-adjective-to-adverb.txt',
        'gram2-opposite.txt', 'gram3-comparative.txt', 'gram4-superlative.txt',
        'gram5-present-participle.txt', 'gram6-nationality-adjective.txt',
        'gram7-past-tense.txt', 'gram8-plural.txt', 'gram9-plural-verbs.txt',
    ]
    prefix = '/zf15/tw8cb/summer_2019/code/GloVe/eval/question-data/'

    # to avoid memory overflow, could be increased/decreased
    # depending on system and vocab size
    split_size = 100

    correct_sem = 0  # count correct semantic questions
    correct_syn = 0  # count correct syntactic questions
    correct_tot = 0  # count correct questions
    count_sem = 0  # count all semantic questions
    count_syn = 0  # count all syntactic questions
    count_tot = 0  # count all questions
    full_count = 0  # count all questions, including those with unknown words

    for i in range(len(filenames)):
        with open('%s/%s' % (prefix, filenames[i]), 'r') as f:
            full_data = [line.rstrip().split(' ') for line in f]
            full_count += len(full_data)
            data = [x for x in full_data if all(word in vocab for word in x)]

        indices = np.array([[vocab[word] for word in row] for row in data])
        ind1, ind2, ind3, ind4 = indices.T

        predictions = np.zeros((len(indices),))
        num_iter = int(np.ceil(len(indices) / float(split_size)))
        for j in range(num_iter):
            subset = np.arange(j * split_size, min((j + 1) * split_size, len(ind1)))

            pred_vec = (W[ind2[subset], :] - W[ind1[subset], :]
                        + W[ind3[subset], :])
            # cosine similarity if input W has been normalized
            dist = np.dot(W, pred_vec.T)

            # exclude the three question words from the candidates
            for k in range(len(subset)):
                dist[ind1[subset[k]], k] = -np.inf
                dist[ind2[subset[k]], k] = -np.inf
                dist[ind3[subset[k]], k] = -np.inf

            # predicted word index
            predictions[subset] = np.argmax(dist, 0).flatten()

        val = (ind4 == predictions)  # correct predictions
        count_tot = count_tot + len(ind1)
        correct_tot = correct_tot + sum(val)
        if i < 5:
            count_sem = count_sem + len(ind1)
            correct_sem = correct_sem + sum(val)
        else:
            count_syn = count_syn + len(ind1)
            correct_syn = correct_syn + sum(val)

        print("%s:" % filenames[i])
        print('ACCURACY TOP1: %.2f%% (%d/%d)' %
              (np.mean(val) * 100, np.sum(val), len(val)))

    print('Questions seen/total: %.2f%% (%d/%d)' %
          (100 * count_tot / float(full_count), count_tot, full_count))
    print('Semantic accuracy: %.2f%% (%i/%i)' %
          (100 * correct_sem / float(count_sem), correct_sem, count_sem))
    print('Syntactic accuracy: %.2f%% (%i/%i)' %
          (100 * correct_syn / float(count_syn), correct_syn, count_syn))
    print('Total accuracy: %.2f%% (%i/%i)' % (100 * correct_tot / float(count_tot), correct_tot, count_tot))


def evaluate_analogy_msr(W, vocab, file_name='EN-MSR.txt'):
    """Evaluate the trained word vectors on the MSR analogy task."""
    prefix = '/zf15/tw8cb/summer_2019/code/GloVe/eval/question-data/'

    # to avoid memory overflow, could be increased/decreased
    # depending on system and vocab size
    split_size = 100

    correct_tot = 0  # count correct questions
    count_tot = 0  # count all questions
    full_count = 0  # count all questions, including those with unknown words

    with open('%s/%s' % (prefix, file_name), 'r') as f:
        full_data = []
        for line in f:
            tokens = line.rstrip().split(' ')
            full_data.append([tokens[0], tokens[1], tokens[2], tokens[4]])
        full_count += len(full_data)
        data = [x for x in full_data if all(word in vocab for word in x)]

    indices = np.array([[vocab[word] for word in row] for row in data])
    ind1, ind2, ind3, ind4 = indices.T

    predictions = np.zeros((len(indices),))
    num_iter = int(np.ceil(len(indices) / float(split_size)))
    for j in range(num_iter):
        subset = np.arange(j * split_size, min((j + 1) * split_size, len(ind1)))

        pred_vec = (W[ind2[subset], :] - W[ind1[subset], :]
                    + W[ind3[subset], :])
        # cosine similarity if input W has been normalized
        dist = np.dot(W, pred_vec.T)

        # exclude the three question words from the candidates
        for k in range(len(subset)):
            dist[ind1[subset[k]], k] = -np.inf
            dist[ind2[subset[k]], k] = -np.inf
            dist[ind3[subset[k]], k] = -np.inf

        # predicted word index
        predictions[subset] = np.argmax(dist, 0).flatten()

    val = (ind4 == predictions)  # correct predictions
    count_tot = count_tot + len(ind1)
    correct_tot = correct_tot + sum(val)

    print(len(val))
    print('ACCURACY TOP1-MSR: %.2f%% (%d/%d)' %
          (np.mean(val) * 100, np.sum(val), len(val)))


def evaluate_analogy_semeval2012(w_dict):
    score = evaluate_on_semeval_2012_2(w_dict)['all']
    print("Analogy prediction accuracy on {} {}".format("SemEval2012", score))


def evaluate_ana(wv, w2i, vocab):
    # row-normalize so dot products below are cosine similarities
    d = (np.sum(wv ** 2, 1) ** 0.5)
    W_norm = (wv.T / d).T

    evaluate_analogy_msr(W_norm, w2i)
    evaluate_analogy_google(W_norm, w2i)

    wv_dict = dict()
    for w in vocab:
        wv_dict[w] = W_norm[w2i[w], :]
    w = Embedding.from_dict(wv_dict)
    evaluate_analogy_semeval2012(w)

    # analogy_tasks = {
    #     "Google": fetch_google_analogy(),
    #     "MSR": fetch_msr_analogy()
    # }

    # analogy_results = {}

    # for name, data in iteritems(analogy_tasks):
    #     analogy_results[name] = evaluate_analogy(w, data.X, data.y)
    #     print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))
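A minimal driver for the benchmarks above, assuming the embedding has already been parsed into a matrix wv, an index w2i, and a word list vocab (the loading code is not part of this commit):

from eval import evaluate_simi, evaluate_cate, evaluate_ana

# similarity benchmarks (WS353, RG65, SimLex999, MTurk, RW, MEN)
evaluate_simi(wv, w2i, vocab)

# categorization benchmarks (AP, ESSLI, Battig, BLESS)
evaluate_cate(wv, w2i, vocab, method="all")

# analogy benchmarks (MSR, Google, SemEval2012); rows are normalized internally
evaluate_ana(wv, w2i, vocab)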
utils.py
ADDED
@@ -0,0 +1,406 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import string
|
2 |
+
from tqdm import tqdm
|
3 |
+
import pickle
|
4 |
+
|
5 |
+
import scipy
|
6 |
+
import numpy as np
|
7 |
+
from numpy import linalg as LA
|
8 |
+
from sklearn.decomposition import PCA
|
9 |
+
|
10 |
+
# Experiment 1
|
11 |
+
WEAT_words = {
|
12 |
+
'A':['John', 'Paul', 'Mike', 'Kevin', 'Steve', 'Greg', 'Jeff', 'Bill'],
|
13 |
+
'B':['Amy', 'Joan', 'Lisa', 'Sarah', 'Diana', 'Kate', 'Ann', 'Donna'],
|
14 |
+
'C':['executive', 'management', 'professional', 'corporation', 'salary', 'office', 'business', 'career'],
|
15 |
+
'D':['home', 'parents', 'children', 'family', 'cousins', 'marriage', 'wedding', 'relatives'],
|
16 |
+
'E':['math', 'algebra', 'geometry', 'calculus', 'equations', 'computation', 'numbers', 'addition'],
|
17 |
+
'F':['poetry', 'art', 'dance', 'literature', 'novel', 'symphony', 'drama', 'sculpture'],
|
18 |
+
'G':['science', 'technology', 'physics', 'chemistry', 'einstein', 'nasa', 'experiment', 'astronomy'],
|
19 |
+
'H':['poetry', 'art', 'shakespeare', 'dance', 'literature', 'novel', 'symphony', 'drama'],
|
20 |
+
}
|
21 |
+
|
22 |
+
|
23 |
+
def has_punct(w):
|
24 |
+
|
25 |
+
if any([c in string.punctuation for c in w]):
|
26 |
+
return True
|
27 |
+
return False
|
28 |
+
|
29 |
+
def has_digit(w):
|
30 |
+
|
31 |
+
if any([c in '0123456789' for c in w]):
|
32 |
+
return True
|
33 |
+
return False
|
34 |
+
|
35 |
+
def limit_vocab(wv, w2i, vocab, exclude = None):
|
36 |
+
vocab_limited = []
|
37 |
+
for w in tqdm(vocab[:50000]):
|
38 |
+
if w.lower() != w:
|
39 |
+
continue
|
40 |
+
if len(w) >= 20:
|
41 |
+
continue
|
42 |
+
if has_digit(w):
|
43 |
+
continue
|
44 |
+
if '_' in w:
|
45 |
+
p = [has_punct(subw) for subw in w.split('_')]
|
46 |
+
if not any(p):
|
47 |
+
vocab_limited.append(w)
|
48 |
+
continue
|
49 |
+
if has_punct(w):
|
50 |
+
continue
|
51 |
+
vocab_limited.append(w)
|
52 |
+
|
53 |
+
if exclude:
|
54 |
+
vocab_limited = list(set(vocab_limited) - set(exclude))
|
55 |
+
|
56 |
+
print("size of vocabulary:", len(vocab_limited))
|
57 |
+
|
58 |
+
wv_limited = np.zeros((len(vocab_limited), len(wv[0, :])))
|
59 |
+
for i,w in enumerate(vocab_limited):
|
60 |
+
wv_limited[i,:] = wv[w2i[w],:]
|
61 |
+
|
62 |
+
w2i_limited = {w: i for i, w in enumerate(vocab_limited)}
|
63 |
+
|
64 |
+
return vocab_limited, wv_limited, w2i_limited
|
65 |
+
|
66 |
+
def norm_stand(wv):
|
67 |
+
W_norm = np.zeros(wv.shape)
|
68 |
+
d = (np.sum(wv ** 2, 1) ** (0.5))
|
69 |
+
W_norm = (wv.T / d).T
|
70 |
+
return W_norm
|
71 |
+
|
72 |
+
def normalize(wv):
|
73 |
+
|
74 |
+
# normalize vectors
|
75 |
+
norms = np.apply_along_axis(LA.norm, 1, wv)
|
76 |
+
wv = wv / norms[:, np.newaxis]
|
77 |
+
return wv
|
78 |
+
|
79 |
+
|
80 |
+
def topK(w, wv, w2i, vocab, k=10):
|
81 |
+
|
82 |
+
# extract the word vector for word w
|
83 |
+
idx = w2i[w]
|
84 |
+
vec = wv[idx, :]
|
85 |
+
|
86 |
+
# compute similarity of w with all words in the vocabulary
|
87 |
+
sim = wv.dot(vec)
|
88 |
+
# sim = []
|
89 |
+
# for i in range(len(wv)):
|
90 |
+
# sim.append(1-scipy.spatial.distance.cosine(wv[i, :], vec))
|
91 |
+
# sim = np.array(sim)
|
92 |
+
|
93 |
+
# sort similarities by descending order
|
94 |
+
sort_sim = (sim.argsort())[::-1]
|
95 |
+
|
96 |
+
# choose topK
|
97 |
+
best = sort_sim[:(k+1)]
|
98 |
+
|
99 |
+
return [vocab[i] for i in best if i!=idx]
|
100 |
+
|
101 |
+
|
102 |
+
def similarity(w1, w2, wv, w2i):
|
103 |
+
|
104 |
+
i1 = w2i[w1]
|
105 |
+
i2 = w2i[w2]
|
106 |
+
vec1 = wv[i1, :]
|
107 |
+
vec2 = wv[i2, :]
|
108 |
+
|
109 |
+
return 1-scipy.spatial.distance.cosine(vec1, vec2)
|
110 |
+
|
111 |
+
|
112 |
+
|
113 |
+
def drop(u, v):
|
114 |
+
return u - v * u.dot(v) / v.dot(v)
|
115 |
+
|
116 |
+
from sklearn.decomposition import PCA
|
117 |
+
from sklearn import preprocessing
|
118 |
+
|
119 |
+
def doPCA(pairs, wv, w2i):
|
120 |
+
|
121 |
+
matrix = []
|
122 |
+
cnt = 0
|
123 |
+
|
124 |
+
if type(pairs[0]) is list:
|
125 |
+
for a, b in pairs:
|
126 |
+
if not (a in w2i and b in w2i): continue
|
127 |
+
center = (wv[w2i[a], :] + wv[w2i[b], :])/2
|
128 |
+
matrix.append(wv[w2i[a], :] - center)
|
129 |
+
matrix.append(wv[w2i[b], :] - center)
|
130 |
+
cnt += 1
|
131 |
+
else:
|
132 |
+
for a in pairs:
|
133 |
+
if not (a in w2i): continue
|
134 |
+
matrix.append(wv[w2i[a], :])
|
135 |
+
cnt += 1
|
136 |
+
|
137 |
+
embeds = np.array(matrix)
|
138 |
+
wv_mean = np.mean(np.array(embeds), axis=0)
|
139 |
+
wv_hat = np.zeros(embeds.shape).astype(float)
|
140 |
+
|
141 |
+
for i in range(len(embeds)):
|
142 |
+
wv_hat[i, :] = embeds[i, :] - wv_mean
|
143 |
+
matrix = wv_hat
|
144 |
+
|
145 |
+
matrix = np.array(matrix)
|
146 |
+
pca = PCA()
|
147 |
+
pca.fit(matrix)
|
148 |
+
print('pairs used in PCA: ', cnt)
|
149 |
+
return pca
|
150 |
+
|
151 |
+
# get tuples of biases and counts of masculine/feminine NN for each word (for bias-by-neighbors)
import operator

def bias_by_neighbors(wv, w2i, vocab, gender_bias_bef, size, neighbours_num=100):

    tuples = []

    # take the `size` most female- and most male-biased words by
    # bias-by-projection; with size <= 0, use the whole vocabulary
    sorted_g = sorted(gender_bias_bef.items(), key=operator.itemgetter(1))
    female = [item[0] for item in sorted_g[:size]]
    male = [item[0] for item in sorted_g[-size:]]
    selected = female + male if size > 0 else vocab

    for w in selected:

        top = topK(w, wv, w2i, vocab, k=neighbours_num + 5)[:neighbours_num]

        # count male- vs. female-biased words among the nearest neighbors
        m = 0
        f = 0
        for t in top:
            if gender_bias_bef[t] > 0:
                m += 1
            else:
                f += 1

        tuples.append((w, gender_bias_bef[w], m, f))

    return tuples
def get_tuples_prof(wv, w2i, vocab, words, gender_bias_dict):

    wv = normalize(wv)

    tuples = []
    for w in words:
        if w not in gender_bias_dict:
            continue

        top = topK(w, wv, w2i, vocab, k=105)[:100]

        m = 0
        f = 0
        for t in top:
            if gender_bias_dict[t] > 0:
                m += 1
            else:
                f += 1

        tuples.append((w, gender_bias_dict[w], m, f))

    return tuples
# compute correlation between bias-by-projection and bias-by-neighbors

import scipy.stats

def pearson(a, b):
    return scipy.stats.pearsonr(a, b)

def compute_corr(tuples, i1, i2):

    a = []
    b = []
    for t in tuples:
        a.append(t[i1])
        b.append(t[i2])
    assert(len(a) == len(b))
    print('pearson: ', scipy.stats.pearsonr(a, b))
    print('spearman: ', scipy.stats.spearmanr(a, b))
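
# Hedged usage sketch (not part of the original file): correlate
# bias-by-projection (index 1 in each tuple) with the male-neighbor count
# (index 2) over the 500 most biased words per side; the size is illustrative.
def example_neighbor_correlation(wv, w2i, vocab, gender_bias_dict):
    tuples = bias_by_neighbors(wv, w2i, vocab, gender_bias_dict, size=500)
    compute_corr(tuples, 1, 2)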
# Auxiliary functions

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

def visualize(vectors, y_true, y_pred, ax, title, random_state, num_clusters=2):

    # project the vectors to 2D with t-SNE, then plot them with the marker
    # encoding the true label and the color encoding the predicted cluster
    X_embedded = TSNE(n_components=2, random_state=random_state).fit_transform(vectors)
    for x, p, y in zip(X_embedded, y_pred, y_true):
        if p:
            if y:
                ax.scatter(x[0], x[1], marker='.', c='c')
            else:
                ax.scatter(x[0], x[1], marker='x', c='c')
        else:
            if y:
                ax.scatter(x[0], x[1], marker='.', c='darkviolet')
            else:
                ax.scatter(x[0], x[1], marker='x', c='darkviolet')

    ax.text(.01, .9, title, transform=ax.transAxes, fontsize=15)


def extract_vectors(words, wv, w2i):

    X = [wv[w2i[x], :] for x in words]

    return X
def cluster_and_visualize(words, X, random_state, y_true, num=2):

    y_pred = KMeans(n_clusters=num, random_state=random_state).fit_predict(X)
    # fig, axs = plt.subplots(figsize=(6, 3))
    # visualize(X, y_true, y_pred, axs, 'Original', random_state)

    # alignment between cluster labels and true labels, taking the better of
    # the two possible label assignments
    correct = [1 if item1 == item2 else 0 for (item1, item2) in zip(y_true, y_pred)]
    print('precision', max(sum(correct) / float(len(correct)), 1 - sum(correct) / float(len(correct))))
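
# Hedged usage sketch (not part of the original file): cluster the 500 most
# male- and female-biased words and report how well the clusters align with
# the gender labels; the list size and random seed are illustrative.
def example_cluster_biased_words(wv, w2i, gender_bias_dict, size=500):
    import operator
    sorted_g = sorted(gender_bias_dict.items(), key=operator.itemgetter(1))
    female = [w for w, _ in sorted_g[:size]]
    male = [w for w, _ in sorted_g[-size:]]
    words = male + female
    X = extract_vectors(words, wv, w2i)
    y_true = [1] * size + [0] * size
    cluster_and_visualize(words, X, 1, y_true)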
from sklearn import svm

def train_and_predict(wv, w2i, vocab, size_train, size_test, males, females):

    # train on the first size_train male/female words, test on the rest
    X_train = [wv[w2i[w], :] for w in males[:size_train] + females[:size_train]]
    Y_train = [1] * size_train + [0] * size_train
    X_test = [wv[w2i[w], :] for w in males[size_train:] + females[size_train:]]
    Y_test = [1] * size_test + [0] * size_test

    clf = svm.SVC(gamma='auto')
    clf.fit(X_train, Y_train)

    preds = clf.predict(X_test)

    accuracy = [1 if y == z else 0 for y, z in zip(preds, Y_test)]
    acc = float(sum(accuracy)) / len(accuracy)
    print('accuracy:', acc)

    return acc
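
# Hedged usage sketch (not part of the original file): train an SVM to
# predict the bias label of held-out biased words. The 5000-per-side lists
# and the 4000/1000 train/test split are illustrative, not prescribed here.
def example_train_and_predict(wv, w2i, vocab, gender_bias_dict):
    import operator
    sorted_g = sorted(gender_bias_dict.items(), key=operator.itemgetter(1))
    females = [w for w, _ in sorted_g[:5000]]
    males = [w for w, _ in sorted_g[-5000:]]
    return train_and_predict(wv, w2i, vocab, 4000, 1000, males, females)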
# Auxiliary functions for experiments by Caliskan et al.

import scipy
import scipy.special   # for scipy.special.comb in p_value_sample
import itertools
import random          # for the seeded sampling in p_value_sample
def s_word(w, A, B, wv, w2i, vocab, all_s_words):

    # WEAT association of w: mean similarity to attribute set A minus mean
    # similarity to attribute set B, memoized in all_s_words
    if w in all_s_words:
        return all_s_words[w]

    mean_a = []
    mean_b = []

    for a in A:
        mean_a.append(similarity(w, a, wv, w2i))
    for b in B:
        mean_b.append(similarity(w, b, wv, w2i))

    mean_a = sum(mean_a) / float(len(mean_a))
    mean_b = sum(mean_b) / float(len(mean_b))

    all_s_words[w] = mean_a - mean_b

    return all_s_words[w]
def s_group(X, Y, A, B, wv, w2i, vocab, all_s_words):

    # WEAT test statistic: summed association of X minus that of Y
    total = 0
    for x in X:
        total += s_word(x, A, B, wv, w2i, vocab, all_s_words)
    for y in Y:
        total -= s_word(y, A, B, wv, w2i, vocab, all_s_words)

    return total
def p_value_exhust(X, Y, A, B, wv, w2i, vocab):

    if len(X) > 10:
        print('might take too long, use sampled version: p_value')
        return

    assert(len(X) == len(Y))

    all_s_words = {}
    s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)

    # exhaustive permutation test: the fraction of equal-size repartitions of
    # X u Y whose test statistic exceeds the observed one
    union = set(X + Y)
    subset_size = int(len(union) / 2)

    larger = 0
    total = 0
    for subset in set(itertools.combinations(union, subset_size)):
        total += 1
        Xi = list(set(subset))
        Yi = list(union - set(subset))
        if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
            larger += 1
    print('num of samples', total)
    return larger / float(total)
def association_diff(t, A, B, wv, w2i):

    mean_a = []
    mean_b = []

    for a in A:
        mean_a.append(similarity(t, a, wv, w2i))
    for b in B:
        mean_b.append(similarity(t, b, wv, w2i))

    mean_a = sum(mean_a) / float(len(mean_a))
    mean_b = sum(mean_b) / float(len(mean_b))

    return mean_a - mean_b
def effect_size(X, Y, A, B, wv, w2i, vocab):

    assert(len(X) == len(Y))
    assert(len(A) == len(B))

    norm_x = []
    norm_y = []

    for x in X:
        norm_x.append(association_diff(x, A, B, wv, w2i))
    for y in Y:
        norm_y.append(association_diff(y, A, B, wv, w2i))

    # Cohen's-d-style effect size: difference of the mean associations,
    # divided by the standard deviation over all target words
    std = np.std(norm_x + norm_y, ddof=1)
    norm_x = sum(norm_x) / float(len(norm_x))
    norm_y = sum(norm_y) / float(len(norm_y))

    return (norm_x - norm_y) / std
def p_value_sample(X, Y, A, B, wv, w2i, vocab):

    random.seed(10)
    np.random.seed(10)
    all_s_words = {}

    assert(len(X) == len(Y))
    length = len(X)

    s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)

    num_of_samples = min(1000000, int(scipy.special.comb(length * 2, length) * 100))
    print('num of samples', num_of_samples)
    larger = 0
    for i in range(num_of_samples):
        permute = np.random.permutation(X + Y)
        Xi = permute[:length]
        Yi = permute[length:]
        if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
            larger += 1

    return larger / float(num_of_samples)
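
# Hedged usage sketch (not part of the original file): run a small WEAT-style
# test. The target/attribute sets below are illustrative placeholders; real
# experiments use the word sets from Caliskan et al. (2017).
def example_weat(wv, w2i, vocab):
    X = ['science', 'technology', 'physics', 'chemistry']  # target set 1
    Y = ['poetry', 'art', 'dance', 'literature']           # target set 2
    A = ['he', 'man', 'his', 'himself']                    # attribute set 1
    B = ['she', 'woman', 'her', 'herself']                 # attribute set 2
    print('effect size:', effect_size(X, Y, A, B, wv, w2i, vocab))
    print('p-value:', p_value_exhust(X, Y, A, B, wv, w2i, vocab))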