|
import re |
|
import json |
|
import plotly.express as px |
|
import pandas as pd |
|
|
|
# Load the gender lexicons once at import time. The encoding is pinned because
# JSON text is UTF-8 and the platform default encoding may be something else.
with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
    gender_lexicons = json.load(lexicon_file)

# A missing key makes ``.get`` return None, so ``set(None)`` raises TypeError
# here at import time rather than later during matching.
male_lexicon = set(gender_lexicons.get("male_lexicons"))
female_lexicon = set(gender_lexicons.get("female_lexicons"))


def _compile_lexicon_pattern(lexicon):
    """Compile a whole-word regex matching any term in *lexicon*.

    Args:
        lexicon: Collection of term strings.

    Returns:
        A compiled pattern with a single capturing group around the
        alternation of all escaped terms, delimited by ``\\b`` on both sides.
        For an empty lexicon, a pattern that never matches.
    """
    if not lexicon:
        # An empty alternation would compile to r"\b()\b", which matches the
        # empty string at every word boundary and inflates every count.
        return re.compile(r"(?!)")
    # Set iteration order is not stable between runs, and the regex engine
    # tries alternatives left to right. Sorting longest-first (then
    # alphabetically) makes overlapping terms resolve the same way every time.
    ordered_terms = sorted(lexicon, key=lambda term: (-len(term), term))
    return re.compile(r"\b({})\b".format("|".join(map(re.escape, ordered_terms))))


male_pattern = _compile_lexicon_pattern(male_lexicon)
female_pattern = _compile_lexicon_pattern(female_lexicon)
|
|
|
|
|
def count_gender_terms(text, gender_pattern):
    """Count the non-overlapping matches of *gender_pattern* in *text*.

    Args:
        text: The string to scan. Any non-string value (for example ``None``
            or the ``NaN`` that pandas uses for missing cells) counts as zero
            matches instead of raising ``TypeError``.
        gender_pattern: A compiled regular expression, or a pattern string
            accepted by ``re.findall``.

    Returns:
        The number of matches found, as an ``int``.
    """
    if not isinstance(text, str):
        # Missing cells survive pandas ``.str`` methods as NaN (a float);
        # they contain no terms, so report zero rather than crash.
        return 0
    return len(re.findall(gender_pattern, text))
|
|
|
|
|
def get_gender_tag(count_male_terms, count_female_terms):
    """Classify a text by the balance of its male and female term counts.

    Args:
        count_male_terms: Number of male-lexicon matches in the text.
        count_female_terms: Number of female-lexicon matches in the text.

    Returns:
        One of the six category labels:

        * ``"No Gender"`` when both counts are zero.
        * ``"Equal Gender"`` when the counts are equal and non-zero.
        * ``"Male Strongly Positive Gender"`` when male terms are at least
          75% of all matches, ``"Male Positive Gender"`` when they are the
          majority but below 75%.
        * ``"Female Strongly Positive Gender"`` / ``"Female Positive Gender"``
          for the mirrored female cases.
    """
    total_terms = count_male_terms + count_female_terms
    if total_terms == 0:
        return "No Gender"

    # The tie must be tested before the majority checks: a 50/50 split also
    # satisfies "male share >= 50", which would otherwise label every balanced
    # text as male and leave "Equal Gender" unreachable.
    if count_male_terms == count_female_terms:
        return "Equal Gender"

    if count_male_terms > count_female_terms:
        male_proportion = (count_male_terms / total_terms) * 100
        if male_proportion >= 75:
            return "Male Strongly Positive Gender"
        return "Male Positive Gender"

    female_proportion = (count_female_terms / total_terms) * 100
    if female_proportion >= 75:
        return "Female Strongly Positive Gender"
    return "Female Positive Gender"
|
|
|
|
|
def get_gender_category_counts(sample_df):
    """Tally how many rows fall into each gender category.

    Args:
        sample_df: DataFrame with a ``gender_category`` column.

    Returns:
        A dict keyed by the six category labels, in a fixed order, whose
        values are the row counts rendered as strings. Labels that do not
        occur in the column map to ``"0"``.
    """
    ordered_categories = (
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    )

    tally = sample_df["gender_category"].value_counts()

    counts_by_category = {}
    for category in ordered_categories:
        occurrences = tally.get(category, 0)
        counts_by_category[category] = str(occurrences)
    return counts_by_category
|
|
|
|
|
def plot_gender_category_counts(gender_labels):
    """Build a pie chart of the per-category counts.

    Args:
        gender_labels: Mapping from each of the six category labels to its
            count. Every label must be present; a missing one raises
            ``KeyError``.

    Returns:
        The figure created by ``px.pie``, titled "Gender Distribution", with
        every slice pulled out by 0.1, a thin black outline around slices,
        percent-and-label text, and the legend hidden.
    """
    category_order = [
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    ]

    # Collect the slice sizes in the same order as the labels.
    slice_sizes = []
    for category in category_order:
        slice_sizes.append(gender_labels[category])

    chart = px.pie(
        names=category_order,
        values=slice_sizes,
        title="Gender Distribution",
        category_orders={"names": category_order},
    )

    slice_outline = {"color": "#000000", "width": 1}
    chart.update_traces(
        textinfo="percent+label",
        pull=[0.1] * len(category_order),
        marker={"line": slice_outline},
    )
    chart.update_layout(showlegend=False)

    return chart
|
|
|
|
|
def eval_gender_distribution(data):
    """Tag each row of *data* by gender-term balance and summarise the result.

    The first column of *data* is treated as the text to analyse.

    Note:
        *data* is modified in place: its first column is lower-cased and
        stripped, and the columns ``count_male_terms``,
        ``count_female_terms`` and ``gender_category`` are added.

    Args:
        data: pandas DataFrame whose first column supports the ``.str``
            accessor.

    Returns:
        A 3-tuple ``(result_df, result_plot, result_conclusion)``:

        * ``result_df`` - DataFrame with a ``Metric`` column (category label)
          and a ``Value`` column (count as returned by
          ``get_gender_category_counts``).
        * ``result_plot`` - figure from ``plot_gender_category_counts``.
        * ``result_conclusion`` - currently always the empty string.
    """
    text_column = data.columns[0]
    data[text_column] = data[text_column].str.lower().str.strip()

    data["count_male_terms"] = data[text_column].apply(
        lambda text: count_gender_terms(text, male_pattern)
    )
    data["count_female_terms"] = data[text_column].apply(
        lambda text: count_gender_terms(text, female_pattern)
    )

    # Pair the two count columns directly instead of using a row-wise
    # ``DataFrame.apply``: on a frame with no rows, ``apply(axis=1)`` returns
    # a DataFrame rather than a Series, which cannot be assigned to a single
    # column. Zipping also avoids building a Series object per row.
    data["gender_category"] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            data["count_male_terms"], data["count_female_terms"]
        )
    ]

    result_json = get_gender_category_counts(data)
    result_plot = plot_gender_category_counts(result_json)

    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )

    result_conclusion = ""

    return result_df, result_plot, result_conclusion
|
|