import re
import json
import pandas as pd
import plotly.express as px
import multiprocessing.pool
from spacy.lang.en import English
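
# The two JSON lexicons are expected to map a key to a flat list of terms,
# e.g. {"male_pronouns": ["he", "him", "his"]} and {"professions": ["doctor", ...]}.
# The keys match the .get() calls below; the term lists shown here are
# illustrative only, not taken from the shipped config files.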
gender_lexicons = json.load(open("config/gender_lexicons.json", "r"))
profession_lexicons = json.load(open("config/profession_lexicons.json", "r"))

# Minimal spaCy English pipeline with only a sentence splitter attached.
nlp = English()
nlp.add_pipe("sentencizer")


def get_split_text(text):
    """Split a raw text into spaCy sentence spans."""
    doc = nlp(text)
    sentences = [sent for sent in doc.sents]
    return sentences


def compile_regex_patterns(patterns):
    """Compile each term list into one case-insensitive, word-bounded pattern."""
    return [
        re.compile(r"\b({})\b".format("|".join(pattern)), flags=re.IGNORECASE)
        for pattern in patterns
    ]


def get_gender_prof_match_details(df_text):
    """Scan each sentence of ``df_text`` for pronoun and profession matches."""
    male_pronouns = gender_lexicons.get("male_pronouns")
    female_pronouns = gender_lexicons.get("female_pronouns")
    professions = profession_lexicons.get("professions")
    male_pronoun_pat, female_pronoun_pat, professions_pat = compile_regex_patterns(
        [male_pronouns, female_pronouns, professions]
    )

    split_text = get_split_text(df_text)

    results = []
    for text in split_text:
        male_pronoun_match = male_pronoun_pat.findall(str(text))
        female_pronoun_match = female_pronoun_pat.findall(str(text))
        prof_match = professions_pat.findall(str(text))

        # A sentence counts as "Both Match" when it contains a profession
        # together with a pronoun of either gender.
        both_match = "No"
        if len(male_pronoun_match) != 0 and len(prof_match) != 0:
            both_match = "Yes"
        if len(female_pronoun_match) != 0 and len(prof_match) != 0:
            both_match = "Yes"

        male_pronoun_match = ",".join(male_pronoun_match)
        female_pronoun_match = ",".join(female_pronoun_match)
        prof_match = ",".join(prof_match)

        results.append(
            (
                str(text),
                male_pronoun_match,
                female_pronoun_match,
                prof_match,
                both_match,
            )
        )
    return results


def call_multiprocessing_pool(df_text):
    """Run the sentence-level matcher over all texts with a thread pool."""
    concurrent = 2000
    pool = multiprocessing.pool.ThreadPool(processes=concurrent)
    # map() blocks until every text has been processed.
    result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
    pool.close()
    # Each worker returns a list of per-sentence tuples; flatten them into one list.
    flat_return_list = [item for sublist in result_list for item in sublist]
    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return_df = pd.DataFrame(flat_return_list, columns=cols)
    return return_df


def get_statistics(result):
    """Aggregate the sentence-level matches into summary counts."""
    stats = {
        "both_gender_prof_match": str((result["Both Match"] == "Yes").sum()),
        "count_male_pronoun": str((result["Male Pronoun"] != "").sum()),
        "count_female_pronoun": str((result["Female Pronoun"] != "").sum()),
        "count_male_pronoun_profession": str(
            ((result["Male Pronoun"] != "") & (result["Profession"] != "")).sum()
        ),
        "count_female_pronoun_profession": str(
            ((result["Female Pronoun"] != "") & (result["Profession"] != "")).sum()
        ),
        "total_sentence": str(len(result)),
    }
    return stats


def get_plot(result_df):
    # Plot generation is not implemented; this stub returns None.
    return


def eval_gender_profession(data):
    """Evaluate gender-profession co-occurrence for the first column of ``data``."""
    data = data[data.columns[0]].str.lower().str.strip()
    result = call_multiprocessing_pool(data)
    result_json = get_statistics(result)
    result_plot = get_plot(result_json)
    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )
    result_conclusion = ""
    return result_df, result_plot, result_conclusion
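

# Minimal usage sketch -- not part of the original module. It assumes the two
# lexicon JSON files under config/ are present, and it builds a tiny one-column
# DataFrame in place of a real dataset; the sample sentence is illustrative only.
if __name__ == "__main__":
    sample = pd.DataFrame(
        {"text": ["He worked as a doctor. She said the nurse helped her."]}
    )
    metrics_df, plot, conclusion = eval_gender_profession(sample)
    print(metrics_df)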