File size: 2,845 Bytes
0946447 7192c24 0946447 7192c24 0946447 7192c24 0946447 7192c24 0946447 7192c24 0946447 7192c24 0946447 7192c24 0946447 7192c24 0946447 7192c24 0946447 7192c24 0946447 7192c24 0946447 7192c24 0946447 7192c24 0946447 8ab9329 7192c24 0946447 8ab9329 0946447 8ab9329 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import re
import json
import plotly.express as px
import pandas as pd
def load_gender_lexicons():
    """Load the gender lexicons from ``config/gender_lexicons.json``.

    The path is relative to the current working directory.

    Returns:
        The parsed JSON document. ``analyze_text`` reads the
        ``"male_lexicons"`` and ``"female_lexicons"`` keys from it.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Pin the encoding: without it, open() falls back to the platform locale
    # and non-ASCII lexicon terms can be decoded incorrectly (or fail).
    with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
        return json.load(lexicon_file)
def count_gender_terms(text, gender_pattern):
    """Return how many non-overlapping matches of *gender_pattern* occur in *text*.

    Args:
        text: The string to scan.
        gender_pattern: A compiled regular expression.

    Returns:
        The number of matches found, ``0`` when there are none.
    """
    return sum(1 for _ in gender_pattern.finditer(text))
def get_gender_tag(count_male_terms, count_female_terms):
    """Classify a text by the balance of male and female term counts.

    Args:
        count_male_terms: Number of male lexicon matches.
        count_female_terms: Number of female lexicon matches.

    Returns:
        One of:
        ``"No Gender"`` when both counts are zero;
        ``"Equal Gender"`` when the counts are equal and non-zero;
        ``"Male Strongly Positive Gender"`` when male terms are >= 75%;
        ``"Male Positive Gender"`` when male terms are > 50% and < 75%;
        ``"Female Strongly Positive Gender"`` when female terms are >= 75%;
        ``"Female Positive Gender"`` when female terms are > 50% and < 75%.
    """
    total_terms = count_male_terms + count_female_terms
    if total_terms == 0:
        return "No Gender"

    # Check the tie first. The two proportions always sum to 100, so a
    # ">= 50" test on the male side would swallow the 50/50 case and the
    # "Equal Gender" label could never be produced.
    if count_male_terms == count_female_terms:
        return "Equal Gender"

    male_proportion = (count_male_terms / total_terms) * 100
    female_proportion = (count_female_terms / total_terms) * 100

    if male_proportion >= 75:
        return "Male Strongly Positive Gender"
    if male_proportion > 50:
        return "Male Positive Gender"
    if female_proportion >= 75:
        return "Female Strongly Positive Gender"
    # Only a female majority below 75% remains at this point.
    return "Female Positive Gender"
def _compile_lexicon_pattern(terms):
    """Return a compiled whole-word pattern for *terms*, or ``None`` if there are none."""
    # Lower-case the terms because the text is lower-cased before matching;
    # drop empty strings because an empty alternative matches at every word
    # boundary. Sorting keeps the generated pattern deterministic.
    normalized_terms = sorted({term.lower() for term in terms if term})
    if not normalized_terms:
        return None
    return re.compile(
        r"\b({})\b".format("|".join(map(re.escape, normalized_terms)))
    )


def analyze_text(text, gender_lexicons):
    """Count male and female lexicon terms in *text* and tag the result.

    Args:
        text: The string to analyse. It is lower-cased and stripped before
            matching.
        gender_lexicons: Mapping whose ``"male_lexicons"`` and
            ``"female_lexicons"`` entries are iterables of terms. A missing
            or empty entry contributes a count of zero.

    Returns:
        A tuple ``(count_male_terms, count_female_terms, gender_category)``
        where ``gender_category`` is the label from ``get_gender_tag``.
    """
    # `or ()` covers both a missing key and an explicit null in the config.
    male_pattern = _compile_lexicon_pattern(
        gender_lexicons.get("male_lexicons") or ()
    )
    female_pattern = _compile_lexicon_pattern(
        gender_lexicons.get("female_lexicons") or ()
    )

    text = text.lower().strip()

    # An empty lexicon has no pattern; compiling r"\b()\b" instead would
    # count every word boundary in the text as a match.
    count_male_terms = (
        count_gender_terms(text, male_pattern) if male_pattern is not None else 0
    )
    count_female_terms = (
        count_gender_terms(text, female_pattern)
        if female_pattern is not None
        else 0
    )

    gender_category = get_gender_tag(count_male_terms, count_female_terms)
    return count_male_terms, count_female_terms, gender_category
def plot_gender_category_counts(labels, values):
    """Build a pie chart of gender-category counts.

    Args:
        labels: Slice names, one per category.
        values: Slice sizes, in the same order as *labels*.

    Returns:
        The Plotly figure, titled "Gender Distribution", with the legend
        hidden and each slice labelled with its name and percentage.
    """
    fig = px.pie(
        values=values,
        names=labels,
        title="Gender Distribution",
    )
    fig.update_traces(
        # A scalar pull offsets every slice equally, so the chart no longer
        # depends on exactly six labels being passed in.
        pull=0.1,
        textinfo="percent+label",
        marker=dict(
            line=dict(color="#000000", width=1),
        ),
    )
    fig.update_layout(showlegend=False)
    return fig
def eval_gender_distribution(data):
    """Tag every text in the first column of *data* and summarise the tags.

    Note that *data* is modified in place: the columns
    ``count_male_terms``, ``count_female_terms`` and ``gender_category``
    are added to (or overwritten on) the DataFrame that was passed in.

    Args:
        data: DataFrame whose first column holds the texts to analyse.

    Returns:
        A tuple ``(result_df, result_plot)``. ``result_df`` has a ``Metric``
        column with the six gender labels and a ``Value`` column with how
        many rows received each label (zero when a label is absent).
        ``result_plot`` is the pie chart built from the same counts.
    """
    gender_lexicons = load_gender_lexicons()

    analyses = [
        analyze_text(text, gender_lexicons) for text in data[data.columns[0]]
    ]
    if analyses:
        male_counts, female_counts, categories = zip(*analyses)
    else:
        # zip(*[]) yields nothing, so unpacking it into three names would
        # raise ValueError for an empty DataFrame.
        male_counts, female_counts, categories = (), (), ()

    data["count_male_terms"] = list(male_counts)
    data["count_female_terms"] = list(female_counts)
    data["gender_category"] = list(categories)

    gender_labels = [
        "No Gender",
        "Equal Gender",
        "Male Positive Gender",
        "Male Strongly Positive Gender",
        "Female Positive Gender",
        "Female Strongly Positive Gender",
    ]

    # Reindex so every label is reported, in a fixed order, even when no row
    # received it.
    result_json = (
        data["gender_category"].value_counts().reindex(gender_labels, fill_value=0)
    )
    result_df = pd.DataFrame({"Metric": result_json.index, "Value": result_json.values})
    result_plot = plot_gender_category_counts(gender_labels, result_json)

    return result_df, result_plot
|