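# File: biasaware/scripts/gender_divide.py
# Repository-viewer header captured together with the source:
#   avatar caption "freyam's picture"; commit 6d2d9db, "Optimise evaluation logic";
#   page links "raw" / "history" / "blame"; file size 1.93 kB.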
import re
import json
# Load the gender lexicons once at import time. The context manager closes the
# file handle as soon as the JSON has been parsed instead of leaving it open
# until garbage collection.
with open("config/gender_lexicons.json", "r", encoding="utf-8") as _lexicon_file:
    gender_lexicons = json.load(_lexicon_file)
def count_gender_terms(text, gender_terms):
    """Count whole-word occurrences of any of ``gender_terms`` in ``text``.

    Matching is case-sensitive and terms are treated as literal strings, not
    as regular expressions.

    Args:
        text: Value to search. It is converted with ``str()`` first, so
            non-string values are searched by their string form.
        gender_terms: Iterable of literal terms to look for.

    Returns:
        The number of non-overlapping matches, or 0 when ``gender_terms`` is
        empty.
    """
    # Escape each term so regex metacharacters (".", "+", "(" ...) in the
    # lexicon are matched literally rather than interpreted as patterns.
    escaped_terms = [re.escape(term) for term in gender_terms]

    # An empty alternation would compile to r"\b()\b", which matches the empty
    # string at every word boundary and yields a bogus non-zero count.
    if not escaped_terms:
        return 0

    pattern = r"\b(?:{})\b".format("|".join(escaped_terms))
    return len(re.findall(pattern, str(text)))
def get_gender_tag(count_m_term, count_f_term):
    """Classify a sample by the balance of male and female term counts.

    Args:
        count_m_term: Number of male terms found in the sample.
        count_f_term: Number of female terms found in the sample.

    Returns:
        One of:
        * ``"No Gender"`` when both counts are zero;
        * ``"Equal Gender"`` when the counts are equal and non-zero;
        * ``"Male Strongly Positive Gender"`` / ``"Female Strongly Positive
          Gender"`` when that side holds at least 75% of the terms;
        * ``"Male Positive Gender"`` / ``"Female Positive Gender"`` when that
          side holds the majority but less than 75%.
    """
    total_terms = count_m_term + count_f_term
    if total_terms == 0:
        return "No Gender"

    # Handle the exact tie before the proportion checks; otherwise a 50/50
    # split satisfies the ">= 50" male test and "Equal Gender" is never
    # returned.
    if count_m_term == count_f_term:
        return "Equal Gender"

    m_proportion = (count_m_term / total_terms) * 100
    if m_proportion >= 75:
        return "Male Strongly Positive Gender"
    if m_proportion > 50:
        return "Male Positive Gender"

    # From here on the female terms are the strict majority.
    f_proportion = (count_f_term / total_terms) * 100
    if f_proportion >= 75:
        return "Female Strongly Positive Gender"
    return "Female Positive Gender"
def get_pg_spg(sample_df):
    """Tally the rows of ``sample_df`` per gender category label.

    Args:
        sample_df: Object whose ``"gender_cat"`` column supports
            ``value_counts()`` (a pandas DataFrame in this module).

    Returns:
        A dict that maps every known category label, in a fixed order, to its
        row count rendered as a string. Labels that do not occur in the
        column map to ``"0"``.
    """
    known_labels = (
        "Gender",
        "No Gender",
        "Equal Gender",
        "Female Positive Gender",
        "Male Positive Gender",
        "Female Strongly Positive Gender",
        "Male Strongly Positive Gender",
    )
    tallies = sample_df["gender_cat"].value_counts()

    summary = {}
    for label in known_labels:
        # Counts are stringified so every value in the result is a str.
        summary[label] = str(tallies.get(label, 0))
    return summary
def eval_gender_divide(data):
    """Tag each row of ``data`` with a gender category and summarise the tags.

    The first column of ``data`` is treated as the text to analyse.

    Side effects:
        ``data`` is modified in place: its first column is lower-cased and
        stripped, and the columns ``"count_male_term"``,
        ``"count_female_term"`` and ``"gender_cat"`` are added or overwritten.

    Args:
        data: pandas DataFrame whose first column holds string values.

    Returns:
        The dict produced by ``get_pg_spg`` for the tagged frame.
    """
    male_terms = gender_lexicons.get("male_lexicons")
    female_terms = gender_lexicons.get("female_lexicons")

    text_column = data.columns[0]
    # Normalise the text so lower-case lexicon entries can match.
    data[text_column] = data[text_column].str.lower().str.strip()

    # Count both lexicons on the text column only. Counting on the whole row
    # (``x[:]``) would search the row's string representation, which contains
    # index labels and other columns and may truncate long text.
    data["count_male_term"] = data[text_column].apply(
        lambda text: count_gender_terms(text, male_terms)
    )
    data["count_female_term"] = data[text_column].apply(
        lambda text: count_gender_terms(text, female_terms)
    )

    # Build the tags with a plain comprehension: it also works on an empty
    # frame, where a row-wise DataFrame.apply does not return a Series.
    data["gender_cat"] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            data["count_male_term"], data["count_female_term"]
        )
    ]

    return get_pg_spg(data)