"""Gender-distribution evaluation for a text dataset.

Counts occurrences of male and female lexicon terms in the first column of a
DataFrame, tags every row with a gender category, and summarises the
categories as a table and a pie chart.

The lexicons are read from ``config/gender_lexicons.json`` at import time.
"""

import json
import re

import pandas as pd
import plotly.express as px

# Every category get_gender_tag() can emit, in the order used for the summary
# table and the pie chart.
GENDER_LABELS = [
    "No Gender",
    "Equal Gender",
    "Male Positive Gender",
    "Male Strongly Positive Gender",
    "Female Positive Gender",
    "Female Strongly Positive Gender",
]

# JSON files are UTF-8 by specification; do not depend on the platform default.
with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
    gender_lexicons = json.load(lexicon_file)

male_lexicon = set(gender_lexicons.get("male_lexicons"))
female_lexicon = set(gender_lexicons.get("female_lexicons"))


def _compile_lexicon_pattern(lexicon):
    """Compile a whole-word regex that matches any term in *lexicon*."""
    # Regex alternation takes the first alternative that matches, and the
    # iteration order of a set of strings can differ between interpreter runs.
    # Sorting (longest term first, then alphabetically) makes the pattern
    # deterministic and lets a longer term such as "a-b" win over its prefix "a".
    terms = sorted(lexicon, key=lambda term: (-len(term), term))
    if not terms:
        # r"\b()\b" would match the empty string at every word boundary and
        # report bogus hits; an empty lexicon must match nothing instead.
        return re.compile(r"(?!)")
    return re.compile(r"\b({})\b".format("|".join(map(re.escape, terms))))


male_pattern = _compile_lexicon_pattern(male_lexicon)
female_pattern = _compile_lexicon_pattern(female_lexicon)


def count_gender_terms(text, gender_pattern) -> int:
    """Return the number of non-overlapping matches of *gender_pattern* in *text*.

    Args:
        text: The text to scan. Values that are not ``str``/``bytes`` (for
            example ``None`` or a NaN cell) are counted as containing no terms.
        gender_pattern: A compiled pattern or pattern string accepted by
            ``re.findall``.

    Returns:
        The match count, ``0`` when *text* is not text.
    """
    if not isinstance(text, (str, bytes)):
        return 0
    return len(re.findall(gender_pattern, text))


def get_gender_tag(count_male_terms, count_female_terms) -> str:
    """Classify a pair of term counts into one of the ``GENDER_LABELS``.

    Args:
        count_male_terms: Number of male lexicon terms found.
        count_female_terms: Number of female lexicon terms found.

    Returns:
        * ``"No Gender"`` when both counts are zero.
        * ``"Equal Gender"`` when the counts are equal and non-zero.
        * ``"Male Strongly Positive Gender"`` / ``"Female Strongly Positive
          Gender"`` when that gender holds at least 75% of the terms.
        * ``"Male Positive Gender"`` / ``"Female Positive Gender"`` when that
          gender holds more than 50% but less than 75% of the terms.
    """
    total_terms = count_male_terms + count_female_terms
    if total_terms == 0:
        return "No Gender"

    # A tie has to be handled before the proportion thresholds; otherwise a
    # 50/50 split is reported as male-leaning and "Equal Gender" is unreachable.
    if count_male_terms == count_female_terms:
        return "Equal Gender"

    if count_male_terms > count_female_terms:
        male_proportion = (count_male_terms / total_terms) * 100
        if male_proportion >= 75:
            return "Male Strongly Positive Gender"
        return "Male Positive Gender"

    female_proportion = (count_female_terms / total_terms) * 100
    if female_proportion >= 75:
        return "Female Strongly Positive Gender"
    return "Female Positive Gender"


def get_gender_category_counts(sample_df) -> dict:
    """Count the rows of *sample_df* in each gender category.

    Args:
        sample_df: DataFrame with a ``"gender_category"`` column.

    Returns:
        A dict with one entry per label in ``GENDER_LABELS`` (in that order).
        Each value is the row count rendered as a string; labels that do not
        occur map to ``"0"``.
    """
    gender_counts = sample_df["gender_category"].value_counts()
    return {label: str(gender_counts.get(label, 0)) for label in GENDER_LABELS}


def plot_gender_category_counts(gender_labels):
    """Build a pie chart of the per-category counts.

    Args:
        gender_labels: Mapping from every label in ``GENDER_LABELS`` to its
            count, as returned by ``get_gender_category_counts``.

    Returns:
        The Plotly figure.

    Raises:
        KeyError: If a label from ``GENDER_LABELS`` is missing in
            *gender_labels*.
    """
    values = [gender_labels[label] for label in GENDER_LABELS]
    fig = px.pie(
        values=values,
        names=GENDER_LABELS,
        title="Gender Distribution",
        category_orders={"names": GENDER_LABELS},
    )
    fig.update_traces(
        pull=[0.1] * len(GENDER_LABELS),
        textinfo="percent+label",
        marker=dict(line=dict(color="#000000", width=1)),
    )
    # Slices already carry their label, so the legend is switched off.
    fig.update_layout(showlegend=False)
    return fig


def eval_gender_distribution(data):
    """Evaluate the gender distribution of the text in the first column of *data*.

    NOTE: *data* is modified in place. Its first column is lower-cased and
    stripped, and the columns ``"count_male_terms"``, ``"count_female_terms"``
    and ``"gender_category"`` are added.

    Args:
        data: DataFrame whose first column holds the text to analyse.

    Returns:
        A tuple ``(result_df, result_plot, result_conclusion)``:

        * ``result_df`` - two-column DataFrame (``"Metric"``, ``"Value"``)
          with one row per gender category.
        * ``result_plot`` - pie chart from ``plot_gender_category_counts``.
        * ``result_conclusion`` - currently always the empty string.
    """
    text_column = data.columns[0]
    data[text_column] = data[text_column].str.lower().str.strip()

    data["count_male_terms"] = data[text_column].apply(
        lambda text: count_gender_terms(text, male_pattern)
    )
    data["count_female_terms"] = data[text_column].apply(
        lambda text: count_gender_terms(text, female_pattern)
    )
    # Pair the two count columns positionally. Unlike DataFrame.apply(axis=1),
    # this also yields a well-formed (empty) column for a DataFrame without rows.
    data["gender_category"] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            data["count_male_terms"], data["count_female_terms"]
        )
    ]

    result_json = get_gender_category_counts(data)
    result_plot = plot_gender_category_counts(result_json)
    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )
    result_conclusion = ""

    return result_df, result_plot, result_conclusion