|
import re |
|
import json |
|
import plotly.express as px |
|
import pandas as pd |
|
|
|
|
|
def load_gender_lexicons():
    """Load the gender lexicons from ``config/gender_lexicons.json``.

    The path is relative to the current working directory.

    Returns:
        The parsed JSON document. ``analyze_text`` reads its
        ``"male_lexicons"`` and ``"female_lexicons"`` entries.

    Raises:
        FileNotFoundError: If the lexicon file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; relying on the platform's default encoding would
    # mis-decode non-ASCII lexicon terms on systems with another locale.
    with open(
        "config/gender_lexicons.json", "r", encoding="utf-8"
    ) as lexicon_file:
        return json.load(lexicon_file)
|
|
|
|
|
def count_gender_terms(text, gender_pattern):
    """Return the number of non-overlapping matches of a pattern in a text.

    Args:
        text: String to scan.
        gender_pattern: Compiled regular expression to search with.

    Returns:
        How many matches of ``gender_pattern`` were found in ``text``.
    """
    match_total = 0
    for _ in gender_pattern.finditer(text):
        match_total += 1
    return match_total
|
|
|
|
|
def get_gender_tag(count_male_terms, count_female_terms):
    """Classify a text by the balance of male and female terms it contains.

    Args:
        count_male_terms: Number of male lexicon matches.
        count_female_terms: Number of female lexicon matches.

    Returns:
        ``"No Gender"`` when both counts are zero, ``"Equal Gender"`` when
        they are equal, otherwise ``"Male ..."`` or ``"Female ..."`` followed
        by ``"Strongly Positive Gender"`` (majority share of at least 75%) or
        ``"Positive Gender"`` (majority share below 75%).
    """
    total_terms = count_male_terms + count_female_terms
    if total_terms == 0:
        return "No Gender"

    # The tie has to be handled before the majority tests: the two shares
    # always sum to 100, so a ">= 50" test on either side would swallow the
    # 50/50 case and leave "Equal Gender" unreachable.
    if count_male_terms == count_female_terms:
        return "Equal Gender"

    male_proportion = (count_male_terms / total_terms) * 100
    if male_proportion >= 75:
        return "Male Strongly Positive Gender"
    if male_proportion > 50:
        return "Male Positive Gender"

    # Not a tie and not a male majority, so female terms dominate.
    female_proportion = (count_female_terms / total_terms) * 100
    if female_proportion >= 75:
        return "Female Strongly Positive Gender"
    return "Female Positive Gender"
|
|
|
|
|
def _compile_lexicon_pattern(terms):
    """Compile a whole-term matcher for ``terms``; return ``None`` if empty."""
    # The text is lowercased before matching, so the terms must be as well.
    normalized = {term.lower().strip() for term in terms or ()}
    # An empty alternative would match at every position and inflate counts.
    normalized.discard("")
    if not normalized:
        return None

    # Longest term first (ties alphabetical): set iteration order is
    # arbitrary, and with overlapping multi-word terms the alternation order
    # decides which term consumes the text.
    ordered_terms = sorted(normalized, key=lambda term: (-len(term), term))
    # Lookarounds instead of \b: they agree with \b for terms that start and
    # end with word characters, and also work for terms ending in
    # punctuation, where \b cannot match before a following space.
    return re.compile(
        r"(?<!\w)(?:{})(?!\w)".format("|".join(map(re.escape, ordered_terms)))
    )


def analyze_text(text, gender_lexicons):
    """Count male and female lexicon terms in a text and tag the result.

    Matching is case-insensitive and only whole terms are counted.

    Args:
        text: Text to analyze. A non-string value (for example a missing
            cell) is treated as empty text.
        gender_lexicons: Mapping whose ``"male_lexicons"`` and
            ``"female_lexicons"`` entries are iterables of term strings. A
            missing or empty entry contributes a count of zero.

    Returns:
        A ``(count_male_terms, count_female_terms, gender_category)`` tuple,
        where ``gender_category`` is the label from ``get_gender_tag``.
    """
    male_pattern = _compile_lexicon_pattern(gender_lexicons.get("male_lexicons"))
    female_pattern = _compile_lexicon_pattern(
        gender_lexicons.get("female_lexicons")
    )

    text = text.lower().strip() if isinstance(text, str) else ""

    count_male_terms = (
        count_gender_terms(text, male_pattern) if male_pattern is not None else 0
    )
    count_female_terms = (
        count_gender_terms(text, female_pattern)
        if female_pattern is not None
        else 0
    )
    gender_category = get_gender_tag(count_male_terms, count_female_terms)

    return count_male_terms, count_female_terms, gender_category
|
|
|
|
|
def plot_gender_category_counts(labels, values):
    """Build a pie chart titled "Gender Distribution".

    Args:
        labels: Slice names, one per category.
        values: Slice sizes, in the same order as ``labels``.

    Returns:
        The plotly figure, with every slice pulled out from the center,
        percentage and label text on the slices, a thin black slice outline
        and the legend hidden.
    """
    fig = px.pie(
        values=values,
        names=labels,
        title="Gender Distribution",
    )

    fig.update_traces(
        # A scalar pull applies to every slice, so the chart no longer
        # depends on the caller passing exactly six labels.
        pull=0.1,
        textinfo="percent+label",
        marker=dict(
            line=dict(color="#000000", width=1),
        ),
    )

    fig.update_layout(showlegend=False)

    return fig
|
|
|
|
|
def eval_gender_distribution(data):
    """Tag every text in the first column of ``data`` and summarize the tags.

    Side effect: adds ``count_male_terms``, ``count_female_terms`` and
    ``gender_category`` columns to ``data`` in place.

    Args:
        data: DataFrame whose first column holds the texts to analyze.

    Returns:
        A ``(result_df, result_plot)`` tuple. ``result_df`` has one row per
        gender category with ``Metric`` (the label) and ``Value`` (the number
        of rows with that label, 0 if none). ``result_plot`` is the figure
        built by ``plot_gender_category_counts`` from the same counts.
    """
    gender_lexicons = load_gender_lexicons()

    analyses = [
        analyze_text(text, gender_lexicons) for text in data[data.columns[0]]
    ]
    if analyses:
        male_counts, female_counts, categories = zip(*analyses)
    else:
        # zip(*[]) yields nothing, so the three-way unpacking above would
        # raise ValueError for a frame without rows.
        male_counts, female_counts, categories = (), (), ()

    data["count_male_terms"] = list(male_counts)
    data["count_female_terms"] = list(female_counts)
    data["gender_category"] = list(categories)

    gender_labels = [
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    ]

    # reindex fixes the row order and reports absent categories as 0.
    category_counts = (
        data["gender_category"].value_counts().reindex(gender_labels, fill_value=0)
    )

    result_df = pd.DataFrame(
        {"Metric": category_counts.index, "Value": category_counts.values}
    )
    result_plot = plot_gender_category_counts(gender_labels, category_counts)

    return result_df, result_plot
|
|