import re
import json
import pandas as pd
import plotly.express as px
import multiprocessing.pool
from spacy.lang.en import English
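
# The two JSON lexicons are expected to map a key to a flat list of terms,
# e.g. {"male_pronouns": ["he", "him", "his"]} and {"professions": ["doctor", ...]}.
# The keys match the .get() calls below; the term lists shown here are
# illustrative only, not taken from the shipped config files.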
gender_lexicons = json.load(open("config/gender_lexicons.json", "r"))
profession_lexicons = json.load(open("config/profession_lexicons.json", "r"))

# Minimal spaCy English pipeline with only a sentence splitter attached.
nlp = English()
nlp.add_pipe("sentencizer")


def get_split_text(text):
    """Split a raw text into spaCy sentence spans."""
    doc = nlp(text)
    sentences = [sent for sent in doc.sents]
    return sentences


def compile_regex_patterns(patterns):
    """Compile each term list into one case-insensitive, word-bounded pattern."""
    return [
        re.compile(r"\b({})\b".format("|".join(pattern)), flags=re.IGNORECASE)
        for pattern in patterns
    ]


def get_gender_prof_match_details(df_text):
    """Scan each sentence of ``df_text`` for pronoun and profession matches."""
    male_pronouns = gender_lexicons.get("male_pronouns")
    female_pronouns = gender_lexicons.get("female_pronouns")
    professions = profession_lexicons.get("professions")
    male_pronoun_pat, female_pronoun_pat, professions_pat = compile_regex_patterns(
        [male_pronouns, female_pronouns, professions]
    )

    split_text = get_split_text(df_text)

    results = []
    for text in split_text:
        male_pronoun_match = male_pronoun_pat.findall(str(text))
        female_pronoun_match = female_pronoun_pat.findall(str(text))
        prof_match = professions_pat.findall(str(text))

        # A sentence counts as "Both Match" when it contains a profession
        # together with a pronoun of either gender.
        both_match = "No"
        if len(male_pronoun_match) != 0 and len(prof_match) != 0:
            both_match = "Yes"
        if len(female_pronoun_match) != 0 and len(prof_match) != 0:
            both_match = "Yes"

        male_pronoun_match = ",".join(male_pronoun_match)
        female_pronoun_match = ",".join(female_pronoun_match)
        prof_match = ",".join(prof_match)

        results.append(
            (
                str(text),
                male_pronoun_match,
                female_pronoun_match,
                prof_match,
                both_match,
            )
        )
    return results


def call_multiprocessing_pool(df_text):
    """Run the sentence-level matcher over all texts with a thread pool."""
    concurrent = 2000
    pool = multiprocessing.pool.ThreadPool(processes=concurrent)
    # map() blocks until every text has been processed.
    result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
    pool.close()
    # Each worker returns a list of per-sentence tuples; flatten them into one list.
    flat_return_list = [item for sublist in result_list for item in sublist]
    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return_df = pd.DataFrame(flat_return_list, columns=cols)
    return return_df


def get_statistics(result):
    """Aggregate the sentence-level matches into summary counts."""
    stats = {
        "both_gender_prof_match": str((result["Both Match"] == "Yes").sum()),
        "count_male_pronoun": str((result["Male Pronoun"] != "").sum()),
        "count_female_pronoun": str((result["Female Pronoun"] != "").sum()),
        "count_male_pronoun_profession": str(
            ((result["Male Pronoun"] != "") & (result["Profession"] != "")).sum()
        ),
        "count_female_pronoun_profession": str(
            ((result["Female Pronoun"] != "") & (result["Profession"] != "")).sum()
        ),
        "total_sentence": str(len(result)),
    }
    return stats


def get_plot(result_df):
    # Plot generation is not implemented; this stub returns None.
    return


def eval_gender_profession(data):
    """Evaluate gender-profession co-occurrence for the first column of ``data``."""
    data = data[data.columns[0]].str.lower().str.strip()
    result = call_multiprocessing_pool(data)
    result_json = get_statistics(result)
    result_plot = get_plot(result_json)
    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )
    result_conclusion = ""
    return result_df, result_plot, result_conclusion
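

# Minimal usage sketch -- not part of the original module. It assumes the two
# lexicon JSON files under config/ are present, and it builds a tiny one-column
# DataFrame in place of a real dataset; the sample sentence is illustrative only.
if __name__ == "__main__":
    sample = pd.DataFrame(
        {"text": ["He worked as a doctor. She said the nurse helped her."]}
    )
    metrics_df, plot, conclusion = eval_gender_profession(sample)
    print(metrics_df)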