# Import required libraries
import re

import pandas as pd

from utils.read_config import get_args
from utils.load_csv import load_sample

# Gender tag labels. They are shared by get_gender_tag (which produces them)
# and get_pg_spg (which counts them) so the two can never drift apart.
_NO_GENDER = "No Gender"
_EQUAL_GENDER = "Equal Gender"
_MALE_PG = "Male Positive Gender"
_MALE_SPG = "Male Strongly Positive Gender"
_FEMALE_PG = "Female Positive Gender"
_FEMALE_SPG = "Female Strongly Positive Gender"

# Share (in percent) of the majority gender at or above which the tag
# becomes "Strongly Positive".
_STRONG_THRESHOLD = 75


def _count_terms(text, terms):
    """Return the number of whole-word occurrences of any of *terms* in *text*.

    *text* is converted with ``str()`` before matching, so non-string values
    (numbers, NaN, None) are accepted. The terms are joined verbatim into a
    regex alternation, so regex metacharacters inside a term are interpreted
    as regex syntax.
    """
    terms = list(terms)
    if not terms:
        # An empty alternation would match at every word boundary and report
        # a bogus, large count.
        return 0
    pattern = r"\b({})\b".format("|".join(terms))
    return len(re.findall(pattern, str(text)))


# Function to get count of male terms in text
def count_male_terms(text, male_terms):
    """Return how many male terms occur in *text* as whole words."""
    return _count_terms(text, male_terms)


# Function to get count of female terms in text
def count_female_terms(text, female_terms):
    """Return how many female terms occur in *text* as whole words."""
    return _count_terms(text, female_terms)


# Function to get gender tag categories
def get_gender_tag(count_m_term, count_f_term):
    """Classify a text by its male and female term counts.

    Returns:
        "No Gender" when both counts are zero, "Equal Gender" when they are
        equal and non-zero, otherwise "<Male|Female> Positive Gender" or
        "<Male|Female> Strongly Positive Gender" for the majority gender.
        "Strongly" is used when the majority holds at least 75% of all
        gendered terms. An empty string is returned if the counts cannot be
        ordered (e.g. NaN inputs).
    """
    if count_m_term == 0 and count_f_term == 0:
        return _NO_GENDER
    if count_m_term == count_f_term:
        return _EQUAL_GENDER

    total = count_m_term + count_f_term
    if count_m_term > count_f_term:
        majority_share = (count_m_term / total) * 100
        return _MALE_SPG if majority_share >= _STRONG_THRESHOLD else _MALE_PG
    if count_m_term < count_f_term:
        majority_share = (count_f_term / total) * 100
        return _FEMALE_SPG if majority_share >= _STRONG_THRESHOLD else _FEMALE_PG
    return ""


# Function to calculate PG and SPG
def get_pg_spg(sample_df):
    """Summarise the ``gender_cat`` column of *sample_df*.

    Returns:
        A dict mapping each statistic name to its count rendered as a string.
        "gender" is the number of non-null tags other than "No Gender"; the
        remaining keys count one tag each.
    """
    # One pass over the column; value_counts ignores null entries.
    tag_counts = sample_df["gender_cat"].value_counts()

    def tally(tag):
        return int(tag_counts.get(tag, 0))

    no_gender = tally(_NO_GENDER)
    gendered = int(tag_counts.sum()) - no_gender

    return {
        "gender": str(gendered),
        "no gender": str(no_gender),
        "equal gender": str(tally(_EQUAL_GENDER)),
        "female pg": str(tally(_FEMALE_PG)),
        "male pg": str(tally(_MALE_PG)),
        "female spg": str(tally(_FEMALE_SPG)),
        "male spg": str(tally(_MALE_SPG)),
    }


# Function to load dataset and get the analysis done
def load_dataset_and_analyze_gender_tag(df, sample_method, col_name, num_sample_records):
    """Sample *df*, tag every sampled text by gender and return the statistics.

    Args:
        df: Source data; passed through to ``load_sample``.
        sample_method: Sampling strategy; passed through to ``load_sample``.
        col_name: Name of the text column to analyse.
        num_sample_records: Sample size; passed through to ``load_sample``.

    Returns:
        The dict produced by ``get_pg_spg`` for the tagged sample.
    """
    # Read config file
    # NOTE(review): presumably each entry is a list of lowercase words - confirm
    # against the config, since the text is lowercased before matching.
    male_terms = get_args("male_terms")
    female_terms = get_args("female_terms")

    # Load sample. Work on a copy so the caller's frame is not modified.
    # NOTE(review): assumes load_sample returns a pandas DataFrame.
    sample_df = load_sample(num_sample_records, sample_method, df, col_name).copy()

    # Lowercase of text
    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()

    # Get new columns of count - male terms and female terms.
    # Both counts are computed from the text column only.
    texts = sample_df[col_name]
    sample_df["count_male_term"] = texts.apply(
        lambda text: count_male_terms(text, male_terms)
    )
    sample_df["count_female_term"] = texts.apply(
        lambda text: count_female_terms(text, female_terms)
    )

    # Get tag categories
    sample_df["gender_cat"] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            sample_df["count_male_term"], sample_df["count_female_term"]
        )
    ]

    # Get statistics
    return get_pg_spg(sample_df)