File size: 3,226 Bytes
0946447
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import re
import json
import plotly.express as px
import pandas as pd

# Load the male/female term lists once, at import time. The path is relative,
# so it is resolved against the current working directory of the process.
# JSON text is UTF-8; naming the encoding keeps decoding independent of the
# platform's locale default.
with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
    gender_lexicons = json.load(lexicon_file)

male_lexicon = set(gender_lexicons.get("male_lexicons"))
female_lexicon = set(gender_lexicons.get("female_lexicons"))


def _compile_lexicon_pattern(lexicon):
    """Compile a whole-word alternation regex over the terms in *lexicon*.

    Regex alternation takes the first alternative that succeeds, and set
    iteration order for strings is not stable between interpreter runs.
    Ordering the terms longest-first (ties broken alphabetically) makes the
    compiled pattern deterministic and lets a longer term win over a shorter
    term that is a prefix of it.

    Args:
        lexicon: Iterable of term strings; each is escaped, so it is matched
            literally.

    Returns:
        A compiled pattern of the form ``\\b(term1|term2|...)\\b``.
    """
    ordered_terms = sorted(lexicon, key=lambda term: (-len(term), term))
    return re.compile(r"\b({})\b".format("|".join(map(re.escape, ordered_terms))))


male_pattern = _compile_lexicon_pattern(male_lexicon)
female_pattern = _compile_lexicon_pattern(female_lexicon)


def count_gender_terms(text, gender_pattern):
    """Count the non-overlapping matches of *gender_pattern* in *text*.

    Args:
        text: The text to scan. A value that is not a ``str`` (for example
            ``None`` or the ``NaN`` pandas uses for a missing cell) is treated
            as containing no terms instead of raising ``TypeError``.
        gender_pattern: A compiled regular expression or a pattern string, as
            accepted by ``re.findall``.

    Returns:
        The number of matches found; ``0`` for non-string input.
    """
    # re.findall only accepts str/bytes; missing cells reach here as float NaN.
    if not isinstance(text, str):
        return 0

    return len(re.findall(gender_pattern, text))


def get_gender_tag(count_male_terms, count_female_terms):
    """Classify a text by the balance of its male and female term counts.

    Args:
        count_male_terms: Number of male lexicon matches in the text.
        count_female_terms: Number of female lexicon matches in the text.

    Returns:
        One of the category labels:
        ``"No Gender"`` when both counts are zero,
        ``"Equal Gender"`` when the counts are equal and non-zero,
        ``"Male Strongly Positive Gender"`` / ``"Female Strongly Positive Gender"``
        when that gender accounts for at least 75% of the terms, and
        ``"Male Positive Gender"`` / ``"Female Positive Gender"`` when that
        gender has the majority but less than 75%.
    """
    total_terms = count_male_terms + count_female_terms

    if total_terms == 0:
        return "No Gender"

    # The tie has to be decided before any majority test: at exactly 50% a
    # ">= 50" check on either gender would claim the text for that gender and
    # leave the "Equal Gender" label unreachable.
    if count_male_terms == count_female_terms:
        return "Equal Gender"

    if count_male_terms > count_female_terms:
        male_proportion = (count_male_terms / total_terms) * 100
        if male_proportion >= 75:
            return "Male Strongly Positive Gender"
        return "Male Positive Gender"

    female_proportion = (count_female_terms / total_terms) * 100
    if female_proportion >= 75:
        return "Female Strongly Positive Gender"
    return "Female Positive Gender"


def get_gender_category_counts(sample_df):
    """Tally how many rows fall into each gender category.

    Args:
        sample_df: DataFrame with a ``"gender_category"`` column holding the
            category label of each row.

    Returns:
        A dict keyed by the six category labels, in a fixed order, whose values
        are the row counts rendered as strings; a label with no rows maps to
        ``"0"``.
    """
    tally = sample_df["gender_category"].value_counts()

    ordered_labels = (
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    )

    summary = {}
    for category in ordered_labels:
        # Labels absent from the data still get an entry, with a zero count.
        summary[category] = str(tally.get(category, 0))
    return summary


def plot_gender_category_counts(gender_labels):
    """Build a pie chart of the per-category counts.

    Args:
        gender_labels: Mapping from category label to its count. It is indexed
            with each of the six category labels, so all of them must be
            present.

    Returns:
        The Plotly Express pie figure, titled "Gender Distribution", with the
        legend hidden and every slice pulled out and outlined in black.
    """
    slice_names = [
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    ]

    slice_sizes = []
    for name in slice_names:
        slice_sizes.append(gender_labels[name])

    chart = px.pie(
        values=slice_sizes,
        names=slice_names,
        title="Gender Distribution",
        category_orders={"names": slice_names},
    )

    # Same offset for every slice; label and percentage are drawn on the slice.
    trace_style = {
        "pull": [0.1] * len(slice_names),
        "textinfo": "percent+label",
        "marker": {"line": {"color": "#000000", "width": 1}},
    }
    chart.update_traces(**trace_style)

    chart.update_layout(showlegend=False)

    return chart


def eval_gender_distribution(data):
    """Tag every row of *data* with a gender category and summarise the result.

    The first column of *data* is taken as the text column.

    NOTE: *data* is modified in place. Its first column is lower-cased and
    stripped, and the columns ``"count_male_terms"``, ``"count_female_terms"``
    and ``"gender_category"`` are added to it.

    Args:
        data: DataFrame whose first column holds the texts to analyse.

    Returns:
        A 3-tuple ``(result_df, result_plot, result_conclusion)``:
        ``result_df`` is a DataFrame with a ``"Metric"`` column (category
        label) and a ``"Value"`` column (count, as produced by
        ``get_gender_category_counts``); ``result_plot`` is the figure from
        ``plot_gender_category_counts``; ``result_conclusion`` is currently
        always the empty string.
    """
    text_column = data.columns[0]
    data[text_column] = data[text_column].str.lower().str.strip()

    data["count_male_terms"] = data[text_column].apply(
        lambda x: count_gender_terms(x, male_pattern)
    )
    data["count_female_terms"] = data[text_column].apply(
        lambda x: count_gender_terms(x, female_pattern)
    )

    # Built positionally from the two count columns rather than with a
    # row-wise DataFrame.apply: on a frame with no rows, apply(axis=1) returns
    # a DataFrame instead of a Series, which cannot be assigned to one column.
    data["gender_category"] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            data["count_male_terms"], data["count_female_terms"]
        )
    ]

    result_json = get_gender_category_counts(data)
    result_plot = plot_gender_category_counts(result_json)

    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )

    result_conclusion = ""

    return result_df, result_plot, result_conclusion