import re
import json

# Gender lexicons are loaded once at import time. eval_gender_divide reads the
# "male_lexicons" and "female_lexicons" keys from this mapping.
with open("config/gender_lexicons.json", "r", encoding="utf-8") as _lexicon_file:
    gender_lexicons = json.load(_lexicon_file)


def _build_term_pattern(gender_terms):
    """Compile a whole-word alternation regex for *gender_terms*.

    Returns ``None`` when the collection is empty, because an empty
    alternation (``\\b()\\b``) would match at every word boundary.
    Raises ``TypeError`` when *gender_terms* is ``None`` (not iterable).
    """
    terms = list(gender_terms)
    if not terms:
        return None
    # Terms are interpolated as-is (not escaped), so a lexicon entry that is
    # itself a regex fragment keeps working exactly as before.
    return re.compile(r"\b(?:{})\b".format("|".join(terms)))


def _count_matches(pattern, text):
    """Return how many times the compiled *pattern* matches ``str(text)``."""
    if pattern is None:
        return 0
    return len(pattern.findall(str(text)))


def count_gender_terms(text, gender_terms):
    """Count whole-word occurrences of any of *gender_terms* in *text*.

    Args:
        text: Value to search; it is converted with ``str()`` first.
        gender_terms: Iterable of term strings, joined into one regex
            alternation surrounded by ``\\b`` word boundaries.

    Returns:
        The number of non-overlapping matches; 0 when *gender_terms* is empty.
    """
    return _count_matches(_build_term_pattern(gender_terms), text)


def get_gender_tag(count_m_term, count_f_term):
    """Classify a text by the share of male vs. female term counts.

    Args:
        count_m_term: Number of male-term matches.
        count_f_term: Number of female-term matches.

    Returns:
        ``"No Gender"`` when both counts are zero, ``"Equal Gender"`` when
        they are equal, ``"<Male|Female> Strongly Positive Gender"`` when one
        side holds at least 75% of the matches, and
        ``"<Male|Female> Positive Gender"`` when it holds more than 50%.
    """
    total_terms = count_m_term + count_f_term
    if total_terms == 0:
        return "No Gender"

    # An even split must be handled before the majority checks; otherwise it
    # would be reported as male-leaning and "Equal Gender" could never occur.
    if count_m_term == count_f_term:
        return "Equal Gender"

    m_proportion = (count_m_term / total_terms) * 100
    if m_proportion >= 75:
        return "Male Strongly Positive Gender"
    if m_proportion > 50:
        return "Male Positive Gender"

    # From here on the female share is strictly greater than 50%.
    f_proportion = (count_f_term / total_terms) * 100
    if f_proportion >= 75:
        return "Female Strongly Positive Gender"
    return "Female Positive Gender"


def get_pg_spg(sample_df):
    """Summarise how many rows carry each gender label.

    Args:
        sample_df: DataFrame with a ``"gender_cat"`` column.

    Returns:
        A dict mapping every known label to its row count rendered as a
        string; labels absent from the column map to ``"0"``.
    """
    # NOTE(review): get_gender_tag never returns "Gender", so this entry is "0"
    # for frames tagged by eval_gender_divide - confirm whether it was meant
    # to be an aggregate of the gendered categories.
    gender_labels = [
        "Gender",
        "No Gender",
        "Equal Gender",
        "Female Positive Gender",
        "Male Positive Gender",
        "Female Strongly Positive Gender",
        "Male Strongly Positive Gender",
    ]
    gender_counts = sample_df["gender_cat"].value_counts()
    return {label: str(gender_counts.get(label, 0)) for label in gender_labels}


def eval_gender_divide(data):
    """Tag each row of *data* by gender-term balance and tally the labels.

    The first column of *data* is treated as the text column. *data* is
    modified in place: the text column is lower-cased and stripped, and the
    columns ``"count_male_term"``, ``"count_female_term"`` and
    ``"gender_cat"`` are added.

    Args:
        data: DataFrame whose first column holds strings.

    Returns:
        The label -> count-string dict produced by ``get_pg_spg``.
    """
    male_terms = gender_lexicons.get("male_lexicons")
    female_terms = gender_lexicons.get("female_lexicons")

    text_col = data.columns[0]
    data[text_col] = data[text_col].str.lower().str.strip()

    # Compile each lexicon once instead of rebuilding the pattern per row.
    male_pattern = _build_term_pattern(male_terms)
    female_pattern = _build_term_pattern(female_terms)

    # Both counts are taken from the text column only. Counting over the whole
    # row's string form would also scan index labels and other column values.
    data["count_male_term"] = data[text_col].apply(
        lambda text: _count_matches(male_pattern, text)
    )
    data["count_female_term"] = data[text_col].apply(
        lambda text: _count_matches(female_pattern, text)
    )
    data["gender_cat"] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            data["count_male_term"], data["count_female_term"]
        )
    ]

    return get_pg_spg(data)