"""Count sentences in which gendered pronouns and profession terms co-occur."""

import functools
import json
import multiprocessing.pool
import re

import pandas as pd
import plotly.express as px
from spacy.lang.en import English

# English pipeline whose only added component is the sentencizer; it is used
# here solely to split each input text into sentences.
nlp = English()
nlp.add_pipe("sentencizer")

_GENDER_LEXICON_PATH = "config/gender_lexicons.json"
_PROFESSION_LEXICON_PATH = "config/profession_lexicons.json"

_RESULT_COLUMNS = [
    "Split Text",
    "Male Pronoun",
    "Female Pronoun",
    "Profession",
    "Both Match",
]

# Pattern that can never match; stands in for an empty lexicon.
_NEVER_MATCHES = re.compile(r"(?!)")


def _read_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, "r") as handle:
        return json.load(handle)


def _compile_lexicon(terms):
    """Compile *terms* into one case-insensitive, whole-word alternation.

    Entries are inserted into the pattern verbatim, so any regex
    metacharacters in a lexicon entry are interpreted as regex syntax.
    """
    alternation = "|".join(terms)
    if not alternation:
        # r"\b()\b" would match the empty string at every word boundary and
        # report bogus hits, so an empty lexicon must match nothing instead.
        return _NEVER_MATCHES
    return re.compile(r"\b({})\b".format(alternation), flags=re.IGNORECASE)


@functools.lru_cache(maxsize=1)
def _load_patterns():
    """Return the (male pronoun, female pronoun, profession) patterns.

    The lexicon files are read and compiled on first use and then cached for
    the life of the process, instead of once per input row. Concurrent first
    calls may each load the files; they produce equivalent results.
    """
    gender_lexicons = _read_json(_GENDER_LEXICON_PATH)
    profession_lexicons = _read_json(_PROFESSION_LEXICON_PATH)
    return (
        _compile_lexicon(gender_lexicons.get("male_pronouns")),
        _compile_lexicon(gender_lexicons.get("female_pronouns")),
        _compile_lexicon(profession_lexicons.get("professions")),
    )


def call_multiprocessing_pool(df_text) -> pd.DataFrame:
    """Analyse every text in *df_text* on a thread pool.

    Args:
        df_text: Iterable of text strings (the caller in this module passes a
            pandas Series).

    Returns:
        A DataFrame with one row per sentence and the columns
        "Split Text", "Male Pronoun", "Female Pronoun", "Profession" and
        "Both Match".
    """
    worker_count = multiprocessing.cpu_count()
    # map() blocks until every result is available, so leaving the `with`
    # block (which terminates the pool) cannot drop pending work.
    with multiprocessing.pool.ThreadPool(processes=worker_count) as pool:
        per_text_rows = pool.map(
            get_gender_prof_match_details, df_text, chunksize=1
        )

    sentence_rows = [row for rows in per_text_rows for row in rows]
    return pd.DataFrame(sentence_rows, columns=_RESULT_COLUMNS)


def get_gender_prof_match_details(df_text: str) -> list:
    """Split *df_text* into sentences and record lexicon hits for each one.

    Args:
        df_text: A single text to analyse.

    Returns:
        A list with one tuple per sentence:
        ``(sentence, male_pronouns, female_pronouns, professions, both)``.
        The three middle fields are comma-joined matches ("" when there are
        none). ``both`` is "Yes" when the sentence contains a profession
        term together with at least one male or female pronoun, else "No".
    """
    male_pat, female_pat, profession_pat = _load_patterns()

    results = []
    for sentence in nlp(df_text).sents:
        sentence_text = str(sentence)
        male_hits = male_pat.findall(sentence_text)
        female_hits = female_pat.findall(sentence_text)
        profession_hits = profession_pat.findall(sentence_text)

        has_pronoun_and_profession = bool(profession_hits) and bool(
            male_hits or female_hits
        )
        results.append(
            (
                sentence_text,
                ",".join(male_hits),
                ",".join(female_hits),
                ",".join(profession_hits),
                "Yes" if has_pronoun_and_profession else "No",
            )
        )
    return results


def get_statistics(result: pd.DataFrame) -> dict:
    """Summarise the per-sentence frame from `call_multiprocessing_pool`.

    Args:
        result: DataFrame with the columns "Male Pronoun", "Female Pronoun",
            "Profession" and "Both Match".

    Returns:
        A dict mapping metric name to its count. Every count is stored as a
        string.
    """
    has_male = result["Male Pronoun"] != ""
    has_female = result["Female Pronoun"] != ""
    has_profession = result["Profession"] != ""

    stats = {
        "both_gender_prof_match": str((result["Both Match"] == "Yes").sum()),
        "count_male_pronoun": str(has_male.sum()),
        "count_female_pronoun": str(has_female.sum()),
        "count_male_pronoun_profession": str((has_male & has_profession).sum()),
        "count_female_pronoun_profession": str(
            (has_female & has_profession).sum()
        ),
        "total_sentence": str(len(result)),
    }
    return stats


def get_plot(result_json):
    """Build a pie chart from the statistics dict.

    Args:
        result_json: Mapping as returned by `get_statistics`; the five count
            entries are converted with ``int()``. "total_sentence" is not
            plotted.

    Returns:
        The figure produced by ``plotly.express.pie``.
    """
    # (slice label, statistics key) pairs, in display order.
    slices = [
        ("Both Gender & Profession Match", "both_gender_prof_match"),
        ("Male Pronoun", "count_male_pronoun"),
        ("Female Pronoun", "count_female_pronoun"),
        ("Male Pronoun & Profession", "count_male_pronoun_profession"),
        ("Female Pronoun & Profession", "count_female_pronoun_profession"),
    ]
    data = {
        "Labels": [label for label, _ in slices],
        "Values": [int(result_json[key]) for _, key in slices],
    }
    fig = px.pie(
        data,
        names="Labels",
        values="Values",
        title="Gender & Profession Match Statistics",
    )
    return fig


def eval_gender_profession(data: pd.DataFrame):
    """Run the gender/profession analysis on the first column of *data*.

    Args:
        data: DataFrame whose first column holds the texts to analyse.

    Returns:
        A ``(result_df, result_plot)`` tuple: a two-column
        ("Metric", "Value") DataFrame of the statistics, and the pie chart
        built from them.
    """
    # Missing cells would otherwise survive the .str calls as NaN and be
    # handed to the spaCy pipeline, which only accepts strings; skip them.
    texts = data[data.columns[0]].dropna().astype(str).str.lower().str.strip()

    result = call_multiprocessing_pool(texts)
    result_json = get_statistics(result)
    result_plot = get_plot(result_json)
    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )
    return result_df, result_plot