# biasaware/scripts/gender_distribution.py
# Last commit: 8ab9329 by freyam, "Add sample size limit and AVID report"
# File size: 2.85 kB
import re
import json
import plotly.express as px
import pandas as pd
def load_gender_lexicons():
    """Load the gender lexicons from ``config/gender_lexicons.json``.

    The path is relative to the current working directory.

    Returns:
        The parsed JSON document. ``analyze_text`` reads the
        ``"male_lexicons"`` and ``"female_lexicons"`` keys from it.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; without an explicit encoding, open()
    # falls back to the locale default and can mis-decode non-ASCII terms.
    with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
        return json.load(lexicon_file)
def count_gender_terms(text, gender_pattern):
    """Return how many non-overlapping matches ``gender_pattern`` has in ``text``.

    Args:
        text: The string to scan.
        gender_pattern: A compiled regular expression.
    """
    occurrences = 0
    for _ in gender_pattern.finditer(text):
        occurrences += 1
    return occurrences
def get_gender_tag(count_male_terms, count_female_terms):
    """Classify a text by the balance of its male and female term counts.

    Args:
        count_male_terms: Number of male-lexicon matches.
        count_female_terms: Number of female-lexicon matches.

    Returns:
        One of:
        - ``"No Gender"`` when both counts are zero;
        - ``"Equal Gender"`` when the counts are equal and non-zero;
        - ``"Male Strongly Positive Gender"`` / ``"Female Strongly Positive
          Gender"`` when that side holds at least 75% of all matches;
        - ``"Male Positive Gender"`` / ``"Female Positive Gender"`` when that
          side holds more than 50% but less than 75%.
    """
    total_terms = count_male_terms + count_female_terms
    if total_terms == 0:
        return "No Gender"

    # A tie must be handled before the majority checks; otherwise a 50/50
    # split would be reported as a majority for one side.
    if count_male_terms == count_female_terms:
        return "Equal Gender"

    if count_male_terms > count_female_terms:
        male_proportion = (count_male_terms / total_terms) * 100
        if male_proportion >= 75:
            return "Male Strongly Positive Gender"
        return "Male Positive Gender"

    female_proportion = (count_female_terms / total_terms) * 100
    if female_proportion >= 75:
        return "Female Strongly Positive Gender"
    return "Female Positive Gender"
def _compile_lexicon_pattern(terms):
    """Compile a whole-word regex matching any of ``terms``.

    Terms are lowercased (the analysed text is lowercased too), de-duplicated
    and ordered longest-first so the alternation is deterministic and prefers
    the longest term when one term is a prefix of another. A missing or empty
    lexicon yields a pattern that never matches.
    """
    normalized = {term.lower() for term in (terms or ())}
    # An empty alternative would match the empty string at every word boundary.
    normalized.discard("")
    if not normalized:
        return re.compile(r"(?!)")
    ordered = sorted(normalized, key=lambda term: (-len(term), term))
    return re.compile(r"\b({})\b".format("|".join(map(re.escape, ordered))))


def analyze_text(text, gender_lexicons):
    """Count male and female lexicon terms in ``text`` and tag the result.

    Args:
        text: The string to analyse; it is lowercased and stripped before
            matching.
        gender_lexicons: Mapping with the term collections under the keys
            ``"male_lexicons"`` and ``"female_lexicons"``. A missing or empty
            entry contributes zero matches.

    Returns:
        A tuple ``(count_male_terms, count_female_terms, gender_category)``
        where ``gender_category`` is the value returned by ``get_gender_tag``.
    """
    male_pattern = _compile_lexicon_pattern(gender_lexicons.get("male_lexicons"))
    female_pattern = _compile_lexicon_pattern(
        gender_lexicons.get("female_lexicons")
    )

    text = text.lower().strip()

    count_male_terms = count_gender_terms(text, male_pattern)
    count_female_terms = count_gender_terms(text, female_pattern)
    gender_category = get_gender_tag(count_male_terms, count_female_terms)

    return count_male_terms, count_female_terms, gender_category
def plot_gender_category_counts(labels, values):
    """Build the "Gender Distribution" pie chart.

    Args:
        labels: Slice names, passed to ``px.pie`` as ``names``.
        values: Slice sizes, passed to ``px.pie`` as ``values``.

    Returns:
        The figure created by ``px.pie``, with the legend hidden and each
        slice labelled with its percentage and name.
    """
    slice_outline = dict(line=dict(color="#000000", width=1))

    figure = px.pie(values=values, names=labels, title="Gender Distribution")
    figure.update_traces(
        # One pull value per expected category (six in total).
        pull=[0.1] * 6,
        textinfo="percent+label",
        marker=slice_outline,
    )
    figure.update_layout(showlegend=False)
    return figure
def _cell_to_text(value):
    """Return ``value`` as a string; missing scalar values become ``""``."""
    if isinstance(value, str):
        return value
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def eval_gender_distribution(data):
    """Tag every row of ``data`` by gender balance and summarise the tags.

    The first column of ``data`` is analysed with ``analyze_text``. Non-string
    cells are converted to text first, and missing values are analysed as
    empty text.

    Args:
        data: A pandas DataFrame. It is modified in place: the columns
            ``count_male_terms``, ``count_female_terms`` and
            ``gender_category`` are added.

    Returns:
        A tuple ``(result_df, result_plot)``: a DataFrame with ``Metric`` and
        ``Value`` columns holding the count for each gender category (zero
        for categories that do not occur), and the pie chart built by
        ``plot_gender_category_counts``.
    """
    gender_lexicons = load_gender_lexicons()

    analyses = [
        analyze_text(_cell_to_text(cell), gender_lexicons)
        for cell in data[data.columns[0]]
    ]
    if analyses:
        male_counts, female_counts, categories = map(list, zip(*analyses))
    else:
        # zip(*[]) yields nothing, so a three-way unpack would fail on an
        # empty frame.
        male_counts, female_counts, categories = [], [], []

    data["count_male_terms"] = male_counts
    data["count_female_terms"] = female_counts
    # Explicit object dtype keeps the column string-like even when empty.
    data["gender_category"] = pd.Series(categories, index=data.index, dtype=object)

    gender_labels = [
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    ]

    # Reindex so every category appears, in a fixed order, even with no rows.
    result_json = (
        data["gender_category"].value_counts().reindex(gender_labels, fill_value=0)
    )

    result_df = pd.DataFrame({"Metric": result_json.index, "Value": result_json.values})
    result_plot = plot_gender_category_counts(gender_labels, result_json)

    return result_df, result_plot