biasaware / scripts /gender_distribution.py
freyam's picture
Add Plot for Gender Divide
0946447
raw
history blame
3.23 kB
import re
import json
import plotly.express as px
import pandas as pd
def _compile_term_pattern(terms):
    """Compile a whole-word regex that matches any term in *terms*.

    Args:
        terms: A collection of literal strings (each is escaped before use).

    Returns:
        A compiled pattern. For an empty collection the pattern never matches.
    """
    if not terms:
        # An empty alternation would yield r"\b()\b", which matches the empty
        # string at every word boundary and would inflate the counts.
        return re.compile(r"(?!)")
    # Sets of strings iterate in a different order from run to run, so sort
    # the terms to always build the same pattern. Longest-first lets a longer
    # term win over a shorter term that is its prefix.
    ordered_terms = sorted(terms, key=lambda term: (-len(term), term))
    return re.compile(r"\b({})\b".format("|".join(map(re.escape, ordered_terms))))


# Load the gendered word lists once at import time. JSON files are UTF-8, so
# do not depend on the platform's default encoding.
with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
    gender_lexicons = json.load(lexicon_file)

# NOTE: a missing key makes .get() return None, and set(None) raises TypeError.
male_lexicon = set(gender_lexicons.get("male_lexicons"))
female_lexicon = set(gender_lexicons.get("female_lexicons"))

# Whole-word patterns used to count gendered terms in text.
male_pattern = _compile_term_pattern(male_lexicon)
female_pattern = _compile_term_pattern(female_lexicon)
def count_gender_terms(text, gender_pattern):
    """Count the non-overlapping matches of *gender_pattern* in *text*.

    Args:
        text: The string to scan. Any value that is not a ``str`` (for
            example ``None`` or the float NaN pandas uses for missing cells)
            is treated as containing no terms.
        gender_pattern: A compiled regular expression or a pattern string.

    Returns:
        The number of matches as an ``int``.
    """
    # re.findall raises TypeError on non-string input; a missing value simply
    # contains no gendered terms.
    if not isinstance(text, str):
        return 0
    return len(re.findall(gender_pattern, text))
def get_gender_tag(count_male_terms, count_female_terms):
    """Classify a sample by the balance of male and female terms it contains.

    Args:
        count_male_terms: Number of male terms found in the sample.
        count_female_terms: Number of female terms found in the sample.

    Returns:
        One of the labels "No Gender", "Equal Gender",
        "Male Positive Gender", "Male Strongly Positive Gender",
        "Female Positive Gender" or "Female Strongly Positive Gender".
    """
    total_terms = count_male_terms + count_female_terms
    if total_terms == 0:
        return "No Gender"

    # A tie must be handled before the proportion thresholds; otherwise an
    # even split satisfies "male >= 50%" and is reported as male-leaning.
    if count_male_terms == count_female_terms:
        return "Equal Gender"

    # From here on one side holds a strict majority (more than 50%).
    if count_male_terms > count_female_terms:
        male_proportion = (count_male_terms / total_terms) * 100
        if male_proportion >= 75:
            return "Male Strongly Positive Gender"
        return "Male Positive Gender"

    female_proportion = (count_female_terms / total_terms) * 100
    if female_proportion >= 75:
        return "Female Strongly Positive Gender"
    return "Female Positive Gender"
def get_gender_category_counts(sample_df):
    """Tally how many rows of *sample_df* fall into each gender category.

    Args:
        sample_df: A DataFrame with a "gender_category" column.

    Returns:
        A dict keyed by the six category labels, in a fixed order, whose
        values are the row counts rendered as strings ("0" when a label does
        not occur).
    """
    tally = sample_df["gender_category"].value_counts()

    counts_by_label = {}
    for category in (
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    ):
        # Labels absent from the column fall back to zero.
        counts_by_label[category] = str(tally.get(category, 0))
    return counts_by_label
def plot_gender_category_counts(gender_labels):
    """Draw a pie chart of the per-category counts.

    Args:
        gender_labels: A mapping from each of the six category labels to its
            count. Every label must be present; a missing one raises
            ``KeyError``.

    Returns:
        The figure created by ``px.pie``, titled "Gender Distribution", with
        the legend hidden and every slice labelled with its percentage and
        name.
    """
    category_order = [
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    ]

    slice_values = []
    for category in category_order:
        slice_values.append(gender_labels[category])

    fig = px.pie(
        values=slice_values,
        names=category_order,
        title="Gender Distribution",
        category_orders={"names": category_order},
    )

    # Thin black outline around every slice.
    slice_outline = dict(line=dict(color="#000000", width=1))
    fig.update_traces(
        # Same pull-out offset for each of the slices.
        pull=[0.1] * len(category_order),
        textinfo="percent+label",
        marker=slice_outline,
    )
    fig.update_layout(showlegend=False)
    return fig
def eval_gender_distribution(data):
    """Evaluate the gender distribution of the text in the first column.

    The frame is modified in place: the first column is lower-cased and
    stripped (missing cells become empty strings), and the columns
    "count_male_terms", "count_female_terms" and "gender_category" are added.

    Args:
        data: A DataFrame whose first column holds the text samples.

    Returns:
        A tuple ``(result_df, result_plot, result_conclusion)``:
        a two-column "Metric"/"Value" DataFrame of per-category counts, the
        pie chart built from those counts, and an empty conclusion string.
    """
    text_column = data.columns[0]

    # Missing cells stay NaN after the .str operations and would make the
    # regex search raise; treat them as empty text instead.
    data[text_column] = data[text_column].str.lower().str.strip().fillna("")

    data["count_male_terms"] = data[text_column].apply(
        lambda text: count_gender_terms(text, male_pattern)
    )
    data["count_female_terms"] = data[text_column].apply(
        lambda text: count_gender_terms(text, female_pattern)
    )

    # Pair the two count columns directly rather than using a row-wise
    # DataFrame.apply, which does not return a Series for an empty frame
    # (so the column assignment would fail).
    data["gender_category"] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            data["count_male_terms"], data["count_female_terms"]
        )
    ]

    result_json = get_gender_category_counts(data)
    result_plot = plot_gender_category_counts(result_json)

    # One row per category: the label under "Metric", its count under "Value".
    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )

    # Placeholder: no textual conclusion is produced.
    result_conclusion = ""

    return result_df, result_plot, result_conclusion