|
import re |
|
import json |
|
|
|
import pandas as pd |
|
import plotly.express as px |
|
import multiprocessing.pool |
|
from spacy.lang.en import English |
|
|
|
# Lexicons are loaded once at import time. The paths are relative to the
# current working directory. Context managers ensure the file handles are
# closed as soon as the JSON has been parsed.
with open("config/gender_lexicons.json", "r") as lexicon_file:
    gender_lexicons = json.load(lexicon_file)

with open("config/profession_lexicons.json", "r") as lexicon_file:
    profession_lexicons = json.load(lexicon_file)

# English pipeline with the "sentencizer" component added; used by
# get_split_text to break a text into sentences.
nlp = English()
nlp.add_pipe("sentencizer")
|
|
|
|
|
def get_split_text(text):
    """Split *text* into sentences with the module-level ``nlp`` pipeline.

    Returns:
        A list of the sentence spans yielded by ``doc.sents``.
    """
    return list(nlp(text).sents)
|
|
|
|
|
def compile_regex_patterns(patterns):
    """Compile one case-insensitive, whole-word regex per term collection.

    Args:
        patterns: Iterable of term collections (e.g. lists of words). Each
            collection is turned into a single alternation pattern whose
            terms are matched literally.

    Returns:
        A list of compiled patterns, in the same order as ``patterns``.
    """
    compiled = []
    for pattern in patterns:
        # Escape each term so characters such as "." or "+" in a lexicon
        # entry are matched literally instead of being read as regex syntax.
        terms = [re.escape(term) for term in pattern]
        if not terms:
            # An empty alternation, r"\b()\b", matches the empty string at
            # every word boundary; use a pattern that can never match.
            compiled.append(re.compile(r"(?!)"))
            continue
        compiled.append(
            re.compile(r"\b({})\b".format("|".join(terms)), flags=re.IGNORECASE)
        )
    return compiled
|
|
|
|
|
def get_gender_prof_match_details(df_text):
    """Find pronoun and profession mentions in every sentence of one text.

    The text is split into sentences with ``get_split_text``. Each sentence
    is searched with patterns built from the ``male_pronouns``,
    ``female_pronouns`` and ``professions`` lexicon entries.

    Returns:
        A list with one 5-tuple per sentence:
        ``(sentence, male matches, female matches, profession matches, flag)``.
        The three match fields are comma-joined strings. ``flag`` is ``"Yes"``
        when the sentence has a profession match together with at least one
        male or female pronoun match, otherwise ``"No"``.
    """
    lexicon_terms = [
        gender_lexicons.get("male_pronouns"),
        gender_lexicons.get("female_pronouns"),
        profession_lexicons.get("professions"),
    ]
    male_re, female_re, profession_re = compile_regex_patterns(lexicon_terms)

    rows = []
    for sentence in get_split_text(df_text):
        sentence_str = str(sentence)

        males = male_re.findall(sentence_str)
        females = female_re.findall(sentence_str)
        jobs = profession_re.findall(sentence_str)

        flag = "Yes" if jobs and (males or females) else "No"

        rows.append(
            (
                sentence_str,
                ",".join(males),
                ",".join(females),
                ",".join(jobs),
                flag,
            )
        )

    return rows
|
|
|
|
|
def call_multiprocessing_pool(df_text):
    """Run ``get_gender_prof_match_details`` over every text in a thread pool.

    Args:
        df_text: Iterable of texts (for example a pandas Series of strings).

    Returns:
        A DataFrame with one row per sentence and the columns
        ``Split Text``, ``Male Pronoun``, ``Female Pronoun``, ``Profession``
        and ``Both Match``. Rows follow the order of the input texts.
    """
    texts = list(df_text)

    concurrent = 2000
    # The pool starts all of its worker threads up front, so never ask for
    # more workers than there are texts. It needs at least one worker.
    workers = max(1, min(concurrent, len(texts)))

    # The context manager releases the pool even when a worker raises.
    with multiprocessing.pool.ThreadPool(processes=workers) as pool:
        result_list = pool.map(get_gender_prof_match_details, texts, chunksize=1)

    # Each text yields a list of per-sentence tuples; flatten to one row list.
    flat_return_list = [item for sublist in result_list for item in sublist]

    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(flat_return_list, columns=cols)
|
|
|
|
|
def get_statistics(result):
    """Summarise the per-sentence match table as a dict of string counts.

    Args:
        result: DataFrame with the columns ``Both Match``, ``Male Pronoun``,
            ``Female Pronoun`` and ``Profession``.

    Returns:
        A dict mapping each metric name to its count rendered as a string.
    """
    flagged = result["Both Match"] == "Yes"
    has_male = result["Male Pronoun"] != ""
    has_female = result["Female Pronoun"] != ""
    has_profession = result["Profession"] != ""

    # Kept as an ordered sequence so the metrics appear in a fixed order.
    counts = (
        ("both_gender_prof_match", flagged.sum()),
        ("count_male_pronoun", has_male.sum()),
        ("count_female_pronoun", has_female.sum()),
        ("count_male_pronoun_profession", (has_male & has_profession).sum()),
        ("count_female_pronoun_profession", (has_female & has_profession).sum()),
        ("total_sentence", len(result)),
    )

    return {metric: str(value) for metric, value in counts}
|
|
|
|
|
def get_plot(result_df):
    """Placeholder for plot generation; builds nothing and returns ``None``."""
    return None
|
|
|
|
|
def eval_gender_profession(data):
    """Compute pronoun/profession match statistics for the first column of *data*.

    Args:
        data: DataFrame whose first column holds the texts to analyse. The
            column is read through the ``.str`` accessor, so string values
            are expected.

    Returns:
        A 3-tuple ``(result_df, result_plot, result_conclusion)``:
        a two-column ``Metric``/``Value`` DataFrame of the statistics, the
        value returned by ``get_plot``, and an empty conclusion string.
    """
    # Missing values would stay NaN through the ``.str`` operations and are
    # not valid input for the sentence splitter, so drop them up front.
    texts = data[data.columns[0]].dropna().str.lower().str.strip()

    result = call_multiprocessing_pool(texts)

    result_json = get_statistics(result)
    result_plot = get_plot(result_json)

    # One row per metric: the dict keys become "Metric", the counts "Value".
    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )

    result_conclusion = ""

    return result_df, result_plot, result_conclusion
|
|