"""Sentence-level co-occurrence statistics for gendered pronouns and professions."""

import functools
import json
import multiprocessing.pool
import re

import pandas as pd
import plotly.express as px  # not used by the code in this file yet (get_plot is a stub)
from spacy.lang.en import English


def _load_lexicon(path):
    """Parse the JSON lexicon at *path*, closing the file once it is read."""
    with open(path, "r") as handle:
        return json.load(handle)


gender_lexicons = _load_lexicon("config/gender_lexicons.json")
profession_lexicons = _load_lexicon("config/profession_lexicons.json")

# English pipeline whose only added component is the sentencizer; it is used
# purely to split raw text into sentences.
nlp = English()
nlp.add_pipe("sentencizer")

# Pattern that can never match; stands in for an empty lexicon.
_NEVER_MATCH = re.compile(r"(?!)")


def get_split_text(text):
    """Return the sentences of *text* as a list of spaCy spans."""
    return list(nlp(text).sents)


def compile_regex_patterns(patterns):
    """Compile one case-insensitive, whole-word regex per term list.

    Args:
        patterns: Iterable of term lists. Each list becomes a single
            alternation wrapped in word boundaries.

    Returns:
        A list of compiled patterns, in the same order as *patterns*.

    NOTE(review): terms are inserted into the pattern unescaped, so any regex
    metacharacter inside a lexicon entry is interpreted as regex syntax.
    """
    compiled = []
    for terms in patterns:
        if terms is not None and not terms:
            # An empty alternation would compile to r"\b()\b", which matches
            # the empty string at every word boundary and inflates all counts.
            compiled.append(_NEVER_MATCH)
            continue
        compiled.append(
            re.compile(r"\b({})\b".format("|".join(terms)), flags=re.IGNORECASE)
        )
    return compiled


@functools.lru_cache(maxsize=1)
def _lexicon_patterns():
    """Build the (male, female, profession) patterns once and reuse them."""
    return tuple(
        compile_regex_patterns(
            [
                gender_lexicons.get("male_pronouns"),
                gender_lexicons.get("female_pronouns"),
                profession_lexicons.get("professions"),
            ]
        )
    )


def get_gender_prof_match_details(df_text):
    """Find pronoun and profession matches in every sentence of *df_text*.

    Args:
        df_text: A single text; it is split into sentences with
            ``get_split_text``.

    Returns:
        A list with one tuple per sentence:
        ``(sentence, male_matches, female_matches, profession_matches, both)``.
        The three match fields are comma-joined strings (empty when nothing
        matched). ``both`` is ``"Yes"`` when the sentence contains a
        profession term together with at least one male or female pronoun,
        otherwise ``"No"``.
    """
    male_pat, female_pat, prof_pat = _lexicon_patterns()

    results = []
    for sentence in get_split_text(df_text):
        sentence_text = str(sentence)
        male_hits = male_pat.findall(sentence_text)
        female_hits = female_pat.findall(sentence_text)
        prof_hits = prof_pat.findall(sentence_text)

        has_pronoun = bool(male_hits or female_hits)
        both_match = "Yes" if prof_hits and has_pronoun else "No"

        results.append(
            (
                sentence_text,
                ",".join(male_hits),
                ",".join(female_hits),
                ",".join(prof_hits),
                both_match,
            )
        )
    return results


def call_multiprocessing_pool(df_text, concurrent=2000):
    """Run ``get_gender_prof_match_details`` over every text using a thread pool.

    Args:
        df_text: Iterable of texts (for example a pandas Series of strings).
        concurrent: Upper bound on the number of worker threads.

    Returns:
        A DataFrame with one row per sentence and the columns
        ``Split Text``, ``Male Pronoun``, ``Female Pronoun``, ``Profession``
        and ``Both Match``.
    """
    texts = list(df_text)
    # Never start more threads than there are texts to process.
    workers = max(1, min(concurrent, len(texts)))

    # The context manager tears the pool down once map() has returned, so the
    # worker threads do not outlive this call.
    with multiprocessing.pool.ThreadPool(processes=workers) as pool:
        per_text_rows = pool.map(get_gender_prof_match_details, texts, chunksize=1)

    rows = [row for text_rows in per_text_rows for row in text_rows]
    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(rows, columns=cols)


def get_statistics(result):
    """Summarise the per-sentence match table as a dict of counts.

    Args:
        result: DataFrame as produced by ``call_multiprocessing_pool``.

    Returns:
        A dict mapping metric name to its count. Every count is rendered as a
        string.
    """
    has_male = result["Male Pronoun"] != ""
    has_female = result["Female Pronoun"] != ""
    has_profession = result["Profession"] != ""

    return {
        "both_gender_prof_match": str((result["Both Match"] == "Yes").sum()),
        "count_male_pronoun": str(has_male.sum()),
        "count_female_pronoun": str(has_female.sum()),
        "count_male_pronoun_profession": str((has_male & has_profession).sum()),
        "count_female_pronoun_profession": str((has_female & has_profession).sum()),
        "total_sentence": str(len(result)),
    }


def get_plot(result_df):
    """Placeholder for a plot of the statistics; currently always returns None."""
    return None


def eval_gender_profession(data):
    """Compute gender/profession co-occurrence statistics for a DataFrame.

    Only the first column of *data* is analysed. Its values are lower-cased
    and stripped before being split into sentences.

    Args:
        data: DataFrame whose first column holds the texts.

    Returns:
        A tuple ``(result_df, result_plot, result_conclusion)``: a two-column
        ``Metric``/``Value`` DataFrame, the output of ``get_plot`` (currently
        None) and an empty conclusion string.
    """
    # Drop missing cells and coerce the rest to str: a NaN reaching the
    # sentence splitter would raise, and a non-string column has no .str.
    texts = data[data.columns[0]].dropna().astype(str).str.lower().str.strip()

    result = call_multiprocessing_pool(texts)
    result_json = get_statistics(result)
    result_plot = get_plot(result_json)

    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )
    result_conclusion = ""
    return result_df, result_plot, result_conclusion