biasaware / scripts /gender_profession_bias.py
freyam's picture
Add Plot for Gender Divide
0946447
raw
history blame
3.5 kB
import re
import json
import pandas as pd
import plotly.express as px
import multiprocessing.pool
from spacy.lang.en import English
# Lexicons are loaded once at import time. The paths are relative to the
# process working directory. Context managers guarantee the file handles are
# closed, and the encoding is pinned so parsing does not depend on the locale.
with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
    gender_lexicons = json.load(lexicon_file)
with open("config/profession_lexicons.json", "r", encoding="utf-8") as lexicon_file:
    profession_lexicons = json.load(lexicon_file)

# Shared spaCy English pipeline; the only component added here is the
# "sentencizer", which get_split_text relies on for `doc.sents`.
nlp = English()
nlp.add_pipe("sentencizer")
def get_split_text(text):
    """Split *text* into sentences with the module-level ``nlp`` pipeline.

    Returns:
        A list of the sentence objects yielded by ``doc.sents``. These are
        not plain strings; callers convert them with ``str()``.
    """
    return list(nlp(text).sents)
def compile_regex_patterns(patterns):
    """Compile one case-insensitive, whole-word regex per collection of terms.

    Each collection is turned into ``\\b(term1|term2|...)\\b``. Terms are
    inserted verbatim (they are not escaped), so regex metacharacters inside a
    term keep their regex meaning.

    Args:
        patterns: Iterable of term collections (each an iterable of strings).

    Returns:
        A list of compiled patterns, in the same order as ``patterns``.
    """
    compiled = []
    for pattern in patterns:
        terms = list(pattern)
        if terms:
            regex = r"\b({})\b".format("|".join(terms))
        else:
            # An empty alternation would give ``\b()\b``, which matches the
            # empty string at every word boundary and makes ``findall`` report
            # spurious hits. Use a pattern that can never match instead.
            regex = r"(?!)"
        compiled.append(re.compile(regex, flags=re.IGNORECASE))
    return compiled
def get_gender_prof_match_details(df_text):
    """Find pronoun and profession mentions in each sentence of *df_text*.

    The text is split into sentences with ``get_split_text``; each sentence is
    searched with patterns built from the ``male_pronouns``,
    ``female_pronouns`` and ``professions`` lexicon entries.

    Returns:
        A list with one tuple per sentence:
        ``(sentence, male_matches, female_matches, profession_matches, both)``
        where the three ``*_matches`` fields are comma-joined strings of the
        matched words (empty when nothing matched) and ``both`` is ``"Yes"``
        when the sentence contains a profession together with at least one
        male or female pronoun, otherwise ``"No"``.
    """
    male_pat, female_pat, prof_pat = compile_regex_patterns(
        [
            gender_lexicons.get("male_pronouns"),
            gender_lexicons.get("female_pronouns"),
            profession_lexicons.get("professions"),
        ]
    )

    rows = []
    for sentence in get_split_text(df_text):
        sentence_str = str(sentence)
        male_hits = male_pat.findall(sentence_str)
        female_hits = female_pat.findall(sentence_str)
        prof_hits = prof_pat.findall(sentence_str)

        # A sentence counts as a joint match when a profession co-occurs with
        # a pronoun of either gender.
        if prof_hits and (male_hits or female_hits):
            both = "Yes"
        else:
            both = "No"

        rows.append(
            (
                sentence_str,
                ",".join(male_hits),
                ",".join(female_hits),
                ",".join(prof_hits),
                both,
            )
        )
    return rows
def call_multiprocessing_pool(df_text):
    """Run ``get_gender_prof_match_details`` over every text using a thread pool.

    Args:
        df_text: Iterable of texts (e.g. a pandas Series of strings).

    Returns:
        A DataFrame with one row per detected sentence and the columns
        ``Split Text``, ``Male Pronoun``, ``Female Pronoun``, ``Profession``
        and ``Both Match``. Row order follows the input order.
    """
    max_workers = 2000
    texts = list(df_text)
    # Never start more threads than there are texts to process; ThreadPool
    # requires at least one worker, even for empty input.
    workers = max(1, min(max_workers, len(texts)))

    # The context manager tears the pool down even if a worker raises.
    # `map` blocks until every result is available, so nothing is lost when
    # the pool is terminated on exit.
    with multiprocessing.pool.ThreadPool(processes=workers) as pool:
        result_list = pool.map(get_gender_prof_match_details, texts, chunksize=1)

    # Each call returns a list of per-sentence tuples; flatten them into rows.
    flat_return_list = [item for sublist in result_list for item in sublist]
    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(flat_return_list, columns=cols)
def get_statistics(result):
    """Summarise the per-sentence match table as a dict of string counts.

    Args:
        result: DataFrame with the columns ``Both Match``, ``Male Pronoun``,
            ``Female Pronoun`` and ``Profession``. An empty string in a
            pronoun/profession column means "no match".

    Returns:
        A dict mapping metric name to the count rendered as a string.
    """
    is_joint_match = result["Both Match"] == "Yes"
    has_male = result["Male Pronoun"] != ""
    has_female = result["Female Pronoun"] != ""
    has_profession = result["Profession"] != ""

    counts = [
        ("both_gender_prof_match", is_joint_match.sum()),
        ("count_male_pronoun", has_male.sum()),
        ("count_female_pronoun", has_female.sum()),
        ("count_male_pronoun_profession", (has_male & has_profession).sum()),
        ("count_female_pronoun_profession", (has_female & has_profession).sum()),
        ("total_sentence", len(result)),
    ]
    return {metric: str(value) for metric, value in counts}
def get_plot(result_df):
    """Placeholder plot builder: ignores *result_df* and returns ``None``."""
    return None
def eval_gender_profession(data):
    """Evaluate gender/profession co-occurrence for the first column of *data*.

    The first column is lower-cased and stripped, analysed sentence by
    sentence, and summarised.

    Returns:
        A tuple ``(result_df, result_plot, result_conclusion)``:
        ``result_df`` is a two-column ``Metric``/``Value`` DataFrame built
        from the statistics dict, ``result_plot`` is whatever ``get_plot``
        returns for that dict, and ``result_conclusion`` is an empty string.
    """
    first_column = data.columns[0]
    texts = data[first_column].str.lower().str.strip()

    result_json = get_statistics(call_multiprocessing_pool(texts))
    result_plot = get_plot(result_json)

    # One row per metric: the dict keys become the "Metric" column and the
    # single value column (labelled 0 by from_dict) becomes "Value".
    stats_frame = pd.DataFrame.from_dict(result_json, orient="index")
    stats_frame = stats_frame.reset_index()
    result_df = stats_frame.rename(columns={"index": "Metric", 0: "Value"})

    result_conclusion = ""
    return result_df, result_plot, result_conclusion