|
import re |
|
import json |
|
|
|
import pandas as pd |
|
import plotly.express as px |
|
import multiprocessing.pool |
|
from spacy.lang.en import English |
|
|
|
|
|
# Module-level spaCy English pipeline, created once at import time and used by
# get_gender_prof_match_details.
nlp = English()

# The sentencizer component is what makes sentence boundaries available via
# `doc.sents`, which the matcher iterates over.
nlp.add_pipe("sentencizer")
|
|
|
|
|
def call_multiprocessing_pool(df_text):
    """Run the gender/profession matcher over every text using a thread pool.

    Args:
        df_text: Iterable of texts (for example a pandas Series). Each item is
            passed on its own to ``get_gender_prof_match_details``.

    Returns:
        pandas.DataFrame with one row per sentence found across all texts and
        the columns "Split Text", "Male Pronoun", "Female Pronoun",
        "Profession" and "Both Match".
    """
    worker_count = multiprocessing.cpu_count()

    # The context manager tears the worker threads down even when a worker
    # raises, so a failing text cannot leave an open pool behind.
    with multiprocessing.pool.ThreadPool(processes=worker_count) as pool:
        result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)

    # Each worker returns a list of per-sentence tuples; flatten them into one
    # list of rows.
    flat_return_list = [item for sublist in result_list for item in sublist]

    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(flat_return_list, columns=cols)
|
|
|
|
|
def get_gender_prof_match_details(df_text):
    """Split a text into sentences and record pronoun/profession matches.

    The lexicons are read from ``config/gender_lexicons.json`` (keys
    ``male_pronouns`` and ``female_pronouns``) and
    ``config/profession_lexicons.json`` (key ``professions``). Each lexicon is
    turned into a case-insensitive, whole-word alternation pattern.

    Args:
        df_text: The text to analyse; it is passed to the module-level ``nlp``
            pipeline for sentence splitting.

    Returns:
        A list with one tuple per sentence:
        ``(sentence, male_matches, female_matches, profession_matches,
        both_match)``. The three match fields are comma-joined strings (empty
        when nothing matched) and ``both_match`` is "Yes" when the sentence
        contains a profession together with at least one male or female
        pronoun, otherwise "No".
    """
    # Context managers close the lexicon files promptly; the encoding is
    # pinned to UTF-8 so parsing does not depend on the platform locale.
    with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
        gender_lexicons = json.load(lexicon_file)
    with open("config/profession_lexicons.json", "r", encoding="utf-8") as lexicon_file:
        profession_lexicons = json.load(lexicon_file)

    male_pronouns = gender_lexicons.get("male_pronouns")
    female_pronouns = gender_lexicons.get("female_pronouns")
    professions = profession_lexicons.get("professions")

    # NOTE(review): lexicon entries are interpolated into the pattern
    # unescaped, so an entry containing regex metacharacters is interpreted as
    # a pattern rather than literal text - confirm that this is intended.
    male_pronoun_pat, female_pronoun_pat, professions_pat = (
        re.compile(r"\b({})\b".format("|".join(terms)), flags=re.IGNORECASE)
        for terms in (male_pronouns, female_pronouns, professions)
    )

    results = []
    for sent in nlp(df_text).sents:
        # Convert the sentence span to plain text once and reuse it.
        sentence = str(sent)

        male_pronoun_match = male_pronoun_pat.findall(sentence)
        female_pronoun_match = female_pronoun_pat.findall(sentence)
        prof_match = professions_pat.findall(sentence)

        # A sentence counts as a combined match when a profession co-occurs
        # with a pronoun of either gender.
        has_pronoun = bool(male_pronoun_match or female_pronoun_match)
        both_match = "Yes" if prof_match and has_pronoun else "No"

        results.append(
            (
                sentence,
                ",".join(male_pronoun_match),
                ",".join(female_pronoun_match),
                ",".join(prof_match),
                both_match,
            )
        )

    return results
|
|
|
|
|
def get_statistics(result):
    """Summarise the per-sentence match table as a dict of string counts.

    Args:
        result: DataFrame with the columns "Male Pronoun", "Female Pronoun",
            "Profession" (empty string meaning "no match") and "Both Match"
            ("Yes"/"No").

    Returns:
        dict mapping each metric name to its count rendered as a string.
    """
    # Boolean masks, one per condition of interest.
    has_male = result["Male Pronoun"] != ""
    has_female = result["Female Pronoun"] != ""
    has_profession = result["Profession"] != ""
    is_both = result["Both Match"] == "Yes"

    counts = {
        "both_gender_prof_match": is_both.sum(),
        "count_male_pronoun": has_male.sum(),
        "count_female_pronoun": has_female.sum(),
        "count_male_pronoun_profession": (has_male & has_profession).sum(),
        "count_female_pronoun_profession": (has_female & has_profession).sum(),
        "total_sentence": len(result),
    }

    # Every value is reported as a string.
    return {metric: str(count) for metric, count in counts.items()}
|
|
|
|
|
def get_plot(result_json):
    """Build a pie chart from the statistics dictionary.

    Args:
        result_json: Mapping that holds, under the keys listed below, values
            convertible with ``int()``.

    Returns:
        The figure produced by ``px.pie`` with one slice per metric.
    """
    # (statistics key, slice label) pairs in display order.
    slices = (
        ("both_gender_prof_match", "Both Gender & Profession Match"),
        ("count_male_pronoun", "Male Pronoun"),
        ("count_female_pronoun", "Female Pronoun"),
        ("count_male_pronoun_profession", "Male Pronoun & Profession"),
        ("count_female_pronoun_profession", "Female Pronoun & Profession"),
    )

    chart_data = {
        "Labels": [label for _, label in slices],
        "Values": [int(result_json[key]) for key, _ in slices],
    }

    return px.pie(
        chart_data,
        names="Labels",
        values="Values",
        title="Gender & Profession Match Statistics",
    )
|
|
|
|
|
def eval_gender_profession(data):
    """Evaluate gender/profession co-occurrence for the first column of *data*.

    Args:
        data: pandas.DataFrame whose first column holds the texts to analyse.

    Returns:
        Tuple ``(result_df, result_plot)``: ``result_df`` is a two-column
        DataFrame ("Metric", "Value") built from the statistics dictionary,
        and ``result_plot`` is the figure returned by ``get_plot``.
    """
    # Missing cells are dropped and remaining values coerced to str so that
    # only real strings reach the sentence splitter; a NaN or non-string cell
    # would otherwise abort the whole run. A dropped cell contributes no
    # sentences, so the statistics are unaffected.
    texts = data[data.columns[0]].dropna().astype(str).str.lower().str.strip()

    result = call_multiprocessing_pool(texts)

    result_json = get_statistics(result)
    result_plot = get_plot(result_json)

    # Turn the {metric: value} mapping into a tidy two-column table.
    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )

    return result_df, result_plot
|
|