"""Count gendered lexicon terms in sentences and summarise the gender tags."""

import json
import re

# Relative to the process working directory.
_LEXICON_PATH = "config/gender_lexicons.json"

# Tag strings are produced by get_gender_tag and matched again by get_pg_spg;
# keeping them in one place prevents the two from drifting apart.
_TAG_NO_GENDER = "No Gender"
_TAG_EQUAL_GENDER = "Equal Gender"
_TAG_MALE_PG = "Male Positive Gender"
_TAG_MALE_SPG = "Male Strongly Positive Gender"
_TAG_FEMALE_PG = "Female Positive Gender"
_TAG_FEMALE_SPG = "Female Strongly Positive Gender"


def _load_gender_lexicons(path=_LEXICON_PATH):
    """Parse the lexicon JSON file at *path*, closing the file afterwards."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Loaded once at import time; a missing or malformed file fails the import.
gender_lexicons = _load_gender_lexicons()


def _compile_terms(terms):
    """Return a whole-word regex matching any of *terms*, or None if empty.

    Empty strings are dropped: an empty alternative would match at every
    word boundary and inflate the count.
    """
    # NOTE(review): terms are interpolated as regex fragments, not escaped.
    # If the lexicon is meant to hold only literal words, re.escape() them.
    alternatives = [term for term in terms if term]
    if not alternatives:
        return None
    return re.compile(r"\b(?:{})\b".format("|".join(alternatives)))


def _count_matches(text, pattern):
    """Count non-overlapping matches of *pattern* in ``str(text)``."""
    if pattern is None:
        return 0
    return len(pattern.findall(str(text)))


def count_male_terms(text, male_terms):
    """Return how many whole-word occurrences of *male_terms* are in *text*.

    *text* is converted with ``str()`` first. Matching is case-sensitive.
    An empty term list yields 0.
    """
    return _count_matches(text, _compile_terms(male_terms))


def count_female_terms(text, female_terms):
    """Return how many whole-word occurrences of *female_terms* are in *text*.

    *text* is converted with ``str()`` first. Matching is case-sensitive.
    An empty term list yields 0.
    """
    return _count_matches(text, _compile_terms(female_terms))


def get_gender_tag(count_m_term, count_f_term):
    """Classify a sentence from its male and female term counts.

    Returns "No Gender" when both counts are zero, "Equal Gender" when they
    are equal, and otherwise "<Male|Female> Positive Gender" or
    "<Male|Female> Strongly Positive Gender" for the larger side, the latter
    when that side holds at least 75% of all gendered terms. Returns "" when
    the counts cannot be ordered or the share falls below 50%.
    """
    if count_m_term == 0 and count_f_term == 0:
        return _TAG_NO_GENDER
    if count_m_term == count_f_term:
        return _TAG_EQUAL_GENDER

    if count_m_term > count_f_term:
        dominant, positive, strongly = count_m_term, _TAG_MALE_PG, _TAG_MALE_SPG
    elif count_m_term < count_f_term:
        dominant, positive, strongly = (
            count_f_term,
            _TAG_FEMALE_PG,
            _TAG_FEMALE_SPG,
        )
    else:
        # Neither ==, > nor < held (e.g. NaN input).
        return ""

    share = (dominant / (count_m_term + count_f_term)) * 100
    if share >= 75:
        return strongly
    if share >= 50:
        return positive
    return ""


def get_pg_spg(sample_df):
    """Summarise the "gender_cat" column of *sample_df*.

    Returns a dict mapping "gender", "no gender", "equal gender",
    "female pg", "male pg", "female spg" and "male spg" to the number of
    rows in that category, each rendered as a string. "gender" counts every
    non-null row whose tag is not "No Gender". Null tags are ignored.
    """
    # One pass over the column; null values are excluded by value_counts.
    tag_counts = sample_df["gender_cat"].value_counts()

    def occurrences(tag):
        return int(tag_counts.get(tag, 0))

    no_gender = occurrences(_TAG_NO_GENDER)
    with_gender = int(tag_counts.sum()) - no_gender

    return {
        "gender": str(with_gender),
        "no gender": str(no_gender),
        "equal gender": str(occurrences(_TAG_EQUAL_GENDER)),
        "female pg": str(occurrences(_TAG_FEMALE_PG)),
        "male pg": str(occurrences(_TAG_MALE_PG)),
        "female spg": str(occurrences(_TAG_FEMALE_SPG)),
        "male spg": str(occurrences(_TAG_MALE_SPG)),
    }


def eval_gender_divide(data):
    """Tag every row of *data* by gender balance and return the summary.

    The first column of *data* is treated as the sentence text. *data* is
    modified in place: the text column is lower-cased and stripped, and the
    columns "count_male_term", "count_female_term" and "gender_cat" are
    added. Returns the dict produced by get_pg_spg.
    """
    # Compile each lexicon once instead of once per row.
    male_pattern = _compile_terms(gender_lexicons.get("male_lexicons"))
    female_pattern = _compile_terms(gender_lexicons.get("female_lexicons"))

    text_col = data.columns[0]
    # Text is lower-cased but matching is case-sensitive, so only lower-case
    # lexicon entries can ever match.
    data[text_col] = data[text_col].str.lower().str.strip()

    data["count_male_term"] = data[text_col].apply(
        lambda text: _count_matches(text, male_pattern)
    )
    # Count on the text column only. Stringifying the whole row would search
    # the row's printed representation instead of the sentence.
    data["count_female_term"] = data[text_col].apply(
        lambda text: _count_matches(text, female_pattern)
    )
    data["gender_cat"] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            data["count_male_term"], data["count_female_term"]
        )
    ]

    return get_pg_spg(data)