File size: 3,502 Bytes
e0db39e
 
 
 
0946447
e0db39e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d2d9db
 
 
 
 
 
 
 
 
 
 
e0db39e
 
 
 
 
0946447
 
 
 
e0db39e
 
 
 
 
0946447
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import re
import json

import pandas as pd
import plotly.express as px
import multiprocessing.pool
from spacy.lang.en import English

# Load the pronoun and profession word lists once at import time. The files
# are opened in context managers so the handles are closed right after
# parsing, and the encoding is pinned to UTF-8 so the result does not depend
# on the platform's default locale encoding.
with open("config/gender_lexicons.json", "r", encoding="utf-8") as _lexicon_file:
    gender_lexicons = json.load(_lexicon_file)
with open("config/profession_lexicons.json", "r", encoding="utf-8") as _lexicon_file:
    profession_lexicons = json.load(_lexicon_file)

# English pipeline with a "sentencizer" component added; get_split_text reads
# sentence boundaries from it through ``doc.sents``.
nlp = English()
nlp.add_pipe("sentencizer")


def get_split_text(text):
    """Run *text* through the module-level pipeline and return its sentences.

    Returns:
        A list holding every item yielded by ``doc.sents``, in order.
    """
    return list(nlp(text).sents)


def compile_regex_patterns(patterns):
    """Compile one case-insensitive, whole-word regex per lexicon.

    Args:
        patterns: Iterable of lexicons. Each lexicon is an iterable of
            strings that are joined with ``|`` as regex alternatives. Terms
            are inserted verbatim (they are not escaped), so any regex
            metacharacters inside a term keep their regex meaning.

    Returns:
        A list of compiled patterns, one per lexicon, in the same order.
        A lexicon with no non-empty terms yields a pattern that never
        matches anything.
    """
    compiled = []
    for terms in patterns:
        # Empty terms are dropped: an empty alternative would let the group
        # match the empty string at every word boundary, which downstream
        # code would count as a hit.
        alternation = "|".join(term for term in terms if term)
        if alternation:
            regex = re.compile(
                r"\b({})\b".format(alternation), flags=re.IGNORECASE
            )
        else:
            # Empty negative lookahead: fails at every position.
            regex = re.compile(r"(?!)")
        compiled.append(regex)
    return compiled


def get_gender_prof_match_details(df_text):
    """Find pronoun and profession mentions in each sentence of *df_text*.

    The text is split with ``get_split_text`` and every sentence is searched
    with the male-pronoun, female-pronoun and profession patterns built from
    the module-level lexicons.

    Returns:
        A list with one tuple per sentence:
        ``(sentence, male_hits, female_hits, profession_hits, both_match)``.
        Each ``*_hits`` value is a comma-joined string of the matches (empty
        when there are none). ``both_match`` is ``"Yes"`` when the sentence
        has at least one profession match together with at least one male or
        female pronoun match, otherwise ``"No"``.
    """
    male_re, female_re, profession_re = compile_regex_patterns(
        [
            gender_lexicons.get("male_pronouns"),
            gender_lexicons.get("female_pronouns"),
            profession_lexicons.get("professions"),
        ]
    )

    rows = []
    for sentence in get_split_text(df_text):
        sentence_str = str(sentence)

        male_hits = male_re.findall(sentence_str)
        female_hits = female_re.findall(sentence_str)
        profession_hits = profession_re.findall(sentence_str)

        # A sentence counts as paired when a profession co-occurs with a
        # pronoun of either gender.
        if profession_hits and (male_hits or female_hits):
            paired = "Yes"
        else:
            paired = "No"

        rows.append(
            (
                sentence_str,
                ",".join(male_hits),
                ",".join(female_hits),
                ",".join(profession_hits),
                paired,
            )
        )

    return rows


def call_multiprocessing_pool(df_text):
    """Analyse every text in *df_text* on a thread pool and tabulate the rows.

    Args:
        df_text: Iterable of texts; each item is passed to
            ``get_gender_prof_match_details``.

    Returns:
        A ``pandas.DataFrame`` with the columns "Split Text", "Male Pronoun",
        "Female Pronoun", "Profession" and "Both Match", holding the
        per-sentence rows of all texts in input order.
    """
    texts = list(df_text)

    # Upper bound on threads; never start more workers than there are texts,
    # since the pool creates all of its workers up front.
    max_workers = 2000
    worker_count = max(1, min(max_workers, len(texts)))

    # The context manager tears the pool down even if a worker raises;
    # map() has already collected every result by the time the block exits.
    with multiprocessing.pool.ThreadPool(processes=worker_count) as pool:
        result_list = pool.map(get_gender_prof_match_details, texts, chunksize=1)

    flat_return_list = [item for sublist in result_list for item in sublist]

    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(flat_return_list, columns=cols)


def get_statistics(result):
    """Summarise the per-sentence match table as a dict of string counts.

    Args:
        result: DataFrame with the columns "Male Pronoun", "Female Pronoun",
            "Profession" and "Both Match". A cell counts as a match when it
            is not the empty string; "Both Match" counts when it equals
            "Yes".

    Returns:
        A dict mapping each metric name to its count rendered with ``str``.
    """
    has_male = result["Male Pronoun"].ne("")
    has_female = result["Female Pronoun"].ne("")
    has_profession = result["Profession"].ne("")
    is_paired = result["Both Match"].eq("Yes")

    counts = {
        "both_gender_prof_match": is_paired.sum(),
        "count_male_pronoun": has_male.sum(),
        "count_female_pronoun": has_female.sum(),
        "count_male_pronoun_profession": (has_male & has_profession).sum(),
        "count_female_pronoun_profession": (has_female & has_profession).sum(),
        "total_sentence": len(result),
    }

    # Every value is reported as a string, in the insertion order above.
    return {metric: str(count) for metric, count in counts.items()}


def get_plot(result_df):
    """Placeholder for a plot of the results; currently always returns None.

    The argument is accepted but not used.
    """
    return None


def eval_gender_profession(data):
    """Evaluate pronoun/profession co-occurrence over the first column of *data*.

    Args:
        data: DataFrame whose first column holds the texts to analyse.

    Returns:
        A tuple ``(result_df, result_plot, result_conclusion)``:
        ``result_df`` is a two-column DataFrame ("Metric", "Value") built
        from ``get_statistics``; ``result_plot`` is whatever ``get_plot``
        returns; ``result_conclusion`` is currently an empty string.
    """
    # Drop missing values first: the .str accessors leave them as NaN, and a
    # NaN reaching the sentence splitter would abort the whole evaluation.
    texts = data[data.columns[0]].dropna().str.lower().str.strip()

    result = call_multiprocessing_pool(texts)

    result_json = get_statistics(result)
    # NOTE(review): get_plot's parameter is named result_df but receives the
    # statistics dict here; confirm the intended input once it is implemented.
    result_plot = get_plot(result_json)

    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )

    result_conclusion = ""

    return result_df, result_plot, result_conclusion