File size: 2,845 Bytes
0946447
 
 
 
 
 
7192c24
 
 
 
0946447
 
 
7192c24
0946447
 
 
 
 
 
 
 
7192c24
 
0946447
 
 
 
7192c24
0946447
 
 
 
 
 
7192c24
 
 
0946447
7192c24
 
 
 
 
 
0946447
7192c24
 
 
 
0946447
7192c24
0946447
 
7192c24
0946447
 
 
 
 
 
 
 
 
7192c24
 
 
0946447
 
 
 
 
 
 
 
7192c24
0946447
7192c24
 
0946447
 
7192c24
 
 
 
 
 
 
 
0946447
8ab9329
7192c24
 
0946447
8ab9329
 
0946447
8ab9329
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import re
import json
import plotly.express as px
import pandas as pd


def load_gender_lexicons():
    """Load the gender lexicons from ``config/gender_lexicons.json``.

    The path is relative to the current working directory.

    Returns:
        The parsed JSON content of the lexicon file.

    Raises:
        FileNotFoundError: If the lexicon file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which would mis-decode non-ASCII lexicon terms on some systems.
    with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
        return json.load(lexicon_file)


def count_gender_terms(text, gender_pattern):
    """Return how many non-overlapping matches of *gender_pattern* occur in *text*.

    Args:
        text: The string to search.
        gender_pattern: A compiled regular expression.

    Returns:
        The number of matches found.
    """
    matches = gender_pattern.findall(text)
    return len(matches)


def get_gender_tag(count_male_terms, count_female_terms):
    """Classify a text by the balance of its male and female term counts.

    Args:
        count_male_terms: Number of male-lexicon matches.
        count_female_terms: Number of female-lexicon matches.

    Returns:
        One of the following labels:

        * ``"No Gender"`` - no gendered terms at all.
        * ``"Equal Gender"`` - both counts are the same (and non-zero).
        * ``"Male Strongly Positive Gender"`` / ``"Female Strongly Positive
          Gender"`` - that gender holds at least 75% of the terms.
        * ``"Male Positive Gender"`` / ``"Female Positive Gender"`` - that
          gender holds the majority, but less than 75%.
    """
    total_terms = count_male_terms + count_female_terms
    if total_terms == 0:
        return "No Gender"

    # A 50/50 split has to be handled before the threshold checks; otherwise
    # it satisfies the "male >= 50%" test and "Equal Gender" is never returned.
    if count_male_terms == count_female_terms:
        return "Equal Gender"

    male_proportion = (count_male_terms / total_terms) * 100
    female_proportion = (count_female_terms / total_terms) * 100

    if male_proportion >= 75:
        return "Male Strongly Positive Gender"
    if male_proportion > 50:
        return "Male Positive Gender"
    if female_proportion >= 75:
        return "Female Strongly Positive Gender"
    # Counts differ and males are not the majority, so females are.
    return "Female Positive Gender"


def _compile_lexicon_pattern(terms):
    """Compile a whole-word regex for *terms*; return None if there are none."""
    # The analysed text is lower-cased, so the terms must be as well or any
    # capitalised lexicon entry could never match.
    normalized = {term.lower() for term in terms or ()}
    # An empty alternative would match at every word boundary.
    normalized.discard("")
    if not normalized:
        return None

    # Longest-first, then alphabetical: gives a deterministic pattern in which
    # multi-word terms win over their shorter prefixes. Joining an unordered
    # set would make the counts depend on hash ordering.
    ordered_terms = sorted(normalized, key=lambda term: (-len(term), term))
    return re.compile(
        r"\b(?:{})\b".format("|".join(map(re.escape, ordered_terms)))
    )


def analyze_text(text, gender_lexicons):
    """Count male and female lexicon terms in *text* and tag its gender balance.

    Args:
        text: The string to analyse; it is lower-cased and stripped before
            matching.
        gender_lexicons: Mapping with the keys ``"male_lexicons"`` and
            ``"female_lexicons"``, each holding an iterable of terms. A
            missing or empty lexicon contributes a count of zero.

    Returns:
        A tuple ``(count_male_terms, count_female_terms, gender_category)``
        where ``gender_category`` is the label from ``get_gender_tag``.
    """
    male_pattern = _compile_lexicon_pattern(gender_lexicons.get("male_lexicons"))
    female_pattern = _compile_lexicon_pattern(
        gender_lexicons.get("female_lexicons")
    )

    text = text.lower().strip()
    # No pattern means no terms to look for, hence zero matches.
    count_male_terms = (
        count_gender_terms(text, male_pattern) if male_pattern else 0
    )
    count_female_terms = (
        count_gender_terms(text, female_pattern) if female_pattern else 0
    )
    gender_category = get_gender_tag(count_male_terms, count_female_terms)

    return count_male_terms, count_female_terms, gender_category


def plot_gender_category_counts(labels, values):
    """Build a pie chart of gender-category counts.

    Args:
        labels: Category names, one per slice.
        values: Counts corresponding to *labels*.

    Returns:
        The pie-chart figure created by ``px.pie``, titled
        "Gender Distribution", with the legend hidden.
    """
    slice_outline = dict(color="#000000", width=1)

    figure = px.pie(values=values, names=labels, title="Gender Distribution")

    # Six equal pull values, applied through the trace update below.
    figure.update_traces(
        pull=[0.1] * 6,
        textinfo="percent+label",
        marker=dict(line=slice_outline),
    )
    figure.update_layout(showlegend=False)

    return figure


def eval_gender_distribution(data):
    """Tag every text in the first column of *data* and summarise the result.

    The DataFrame is modified in place: the columns ``count_male_terms``,
    ``count_female_terms`` and ``gender_category`` are added (or overwritten).

    Args:
        data: DataFrame whose first column holds the texts to analyse.

    Returns:
        A tuple ``(result_df, result_plot)``: ``result_df`` has the columns
        ``"Metric"`` (category label) and ``"Value"`` (number of rows in that
        category, 0 when absent); ``result_plot`` is the figure produced by
        ``plot_gender_category_counts``.
    """
    gender_lexicons = load_gender_lexicons()

    analyses = [
        analyze_text(text, gender_lexicons) for text in data[data.columns[0]]
    ]
    # zip(*[]) yields nothing, so unpacking it into three names would raise
    # ValueError for a DataFrame without rows; fall back to empty columns.
    male_counts, female_counts, categories = (
        zip(*analyses) if analyses else ((), (), ())
    )
    data["count_male_terms"] = list(male_counts)
    data["count_female_terms"] = list(female_counts)
    data["gender_category"] = list(categories)

    gender_labels = [
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    ]

    # Reindex so every label is present, in a fixed order, even with no rows.
    category_counts = (
        data["gender_category"].value_counts().reindex(gender_labels, fill_value=0)
    )

    result_df = pd.DataFrame(
        {"Metric": category_counts.index, "Value": category_counts.values}
    )
    result_plot = plot_gender_category_counts(gender_labels, category_counts)

    return result_df, result_plot