|
import re |
|
import json |
|
|
|
# Load the gender lexicons once at import time. The context manager makes
# sure the file handle is closed instead of being left to the garbage
# collector, and the explicit encoding avoids depending on the platform
# default (JSON documents are UTF-8 by convention).
with open("config/gender_lexicons.json", "r", encoding="utf-8") as _lexicon_file:
    gender_lexicons = json.load(_lexicon_file)
|
|
|
|
|
def count_gender_terms(text, gender_terms):
    """Count whole-word occurrences of any of ``gender_terms`` in ``text``.

    Args:
        text: Value to search. It is converted with ``str()`` before
            matching, so non-string inputs are accepted.
        gender_terms: Iterable of literal terms. ``None``, an empty
            iterable, or an iterable containing only empty strings
            yields a count of 0.

    Returns:
        The number of non-overlapping matches found, as an ``int``.
    """
    if not gender_terms:
        return 0

    # Escape each term so regex metacharacters (".", "+", "(" ...) are
    # matched literally, and drop empty terms: an empty alternative would
    # match the empty string at every word boundary and inflate the count.
    escaped_terms = [re.escape(term) for term in gender_terms if term]
    if not escaped_terms:
        return 0

    pattern = r"\b(?:{})\b".format("|".join(escaped_terms))
    return len(re.findall(pattern, str(text)))
|
|
|
|
|
def get_gender_tag(count_m_term, count_f_term):
    """Classify a text by the balance of male and female term counts.

    Args:
        count_m_term: Number of male terms found.
        count_f_term: Number of female terms found.

    Returns:
        One of:
        * ``"No Gender"`` when both counts sum to zero.
        * ``"Equal Gender"`` when the two counts are equal.
        * ``"Male Strongly Positive Gender"`` /
          ``"Female Strongly Positive Gender"`` when that side holds at
          least 75% of the terms.
        * ``"Male Positive Gender"`` / ``"Female Positive Gender"`` when
          that side holds more than 50% but less than 75%.
    """
    total_terms = count_m_term + count_f_term
    if total_terms == 0:
        return "No Gender"

    # Ties must be handled before the proportion checks; otherwise a 50/50
    # split satisfies the male ">= 50" test and "Equal Gender" can never
    # be returned.
    if count_m_term == count_f_term:
        return "Equal Gender"

    if count_m_term > count_f_term:
        m_proportion = (count_m_term / total_terms) * 100
        if m_proportion >= 75:
            return "Male Strongly Positive Gender"
        return "Male Positive Gender"

    f_proportion = (count_f_term / total_terms) * 100
    if f_proportion >= 75:
        return "Female Strongly Positive Gender"
    return "Female Positive Gender"
|
|
|
|
|
def get_pg_spg(sample_df):
    """Summarise how many rows carry each gender category label.

    Args:
        sample_df: Object supporting ``sample_df["gender_cat"]`` whose
            result exposes ``value_counts()`` (e.g. a pandas DataFrame).

    Returns:
        A dict mapping every known label, in a fixed order, to its row
        count rendered as a string. Labels absent from the data map to
        ``"0"``.
    """
    # NOTE(review): the bare "Gender" label does not appear to be produced
    # by the tagging logic visible in this file, so it presumably always
    # reports "0" - confirm whether that is intended.
    ordered_labels = (
        "Gender",
        "No Gender",
        "Equal Gender",
        "Female Positive Gender",
        "Male Positive Gender",
        "Female Strongly Positive Gender",
        "Male Strongly Positive Gender",
    )

    tallies = sample_df["gender_cat"].value_counts()

    summary = {}
    for label in ordered_labels:
        summary[label] = str(tallies.get(label, 0))
    return summary
|
|
|
|
|
def eval_gender_divide(data):
    """Tag each row's text by gender balance and summarise the tags.

    The first column of ``data`` is treated as the text column.

    Side effects:
        ``data`` is modified in place: the first column is lower-cased and
        stripped, and the columns ``count_male_term``,
        ``count_female_term`` and ``gender_cat`` are added or overwritten.

    Args:
        data: pandas DataFrame whose first column holds string values
            (the ``.str`` accessor is used on it).

    Returns:
        The per-label count mapping produced by ``get_pg_spg(data)``.
    """
    male_terms = gender_lexicons.get("male_lexicons")
    female_terms = gender_lexicons.get("female_lexicons")

    # Resolve the text column once, before new columns are appended.
    text_column = data.columns[0]
    data[text_column] = data[text_column].str.lower().str.strip()

    # Count both lexicons against the same text cell. The female count used
    # to run on the string repr of the whole row (column names, the male
    # count and possibly truncated text), which skewed the result.
    data["count_male_term"] = data[text_column].apply(
        lambda text: count_gender_terms(text, male_terms)
    )
    data["count_female_term"] = data[text_column].apply(
        lambda text: count_gender_terms(text, female_terms)
    )

    # Pairwise tagging; building a list keeps this working for an empty
    # frame, where a row-wise DataFrame.apply would not yield a Series.
    data["gender_cat"] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            data["count_male_term"], data["count_female_term"]
        )
    ]

    return get_pg_spg(data)
|
|