File size: 3,543 Bytes
b20457e a365da6 b20457e a365da6 b20457e a365da6 b20457e a365da6 b20457e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
# Import required libraries
import pandas as pd
import re
from utils.read_config import get_args
from utils.load_csv import load_sample
# Function to get count of male terms in text
def count_male_terms(text, male_terms):
    """Count whole-word occurrences of male terms in a piece of text.

    Args:
        text: Value to search. It is converted with ``str()`` before
            matching; ``None`` and float NaN are treated as empty text.
        male_terms: Iterable of term strings. The terms are joined with
            ``|`` and inserted into the regular expression as-is (they are
            not escaped). ``None`` or an empty iterable means "no terms".

    Returns:
        int: Number of non-overlapping matches, or 0 when the text is
        missing or there are no usable terms.
    """
    # Missing values would otherwise be searched as the literal strings
    # "None" / "nan" (``text != text`` is only true for NaN).
    if text is None or (isinstance(text, float) and text != text):
        return 0

    # Drop empty terms: an empty alternative matches the empty string at
    # every word boundary and would inflate the count.
    terms = [term for term in (male_terms or ()) if term]
    if not terms:
        return 0

    pattern = r"\b(?:{})\b".format("|".join(terms))
    return len(re.findall(pattern, str(text)))
# Function to get count of female terms in text
def count_female_terms(text, female_terms):
    """Count whole-word occurrences of female terms in a piece of text.

    Args:
        text: Value to search. It is converted with ``str()`` before
            matching; ``None`` and float NaN are treated as empty text.
        female_terms: Iterable of term strings. The terms are joined with
            ``|`` and inserted into the regular expression as-is (they are
            not escaped). ``None`` or an empty iterable means "no terms".

    Returns:
        int: Number of non-overlapping matches, or 0 when the text is
        missing or there are no usable terms.
    """
    # Missing values would otherwise be searched as the literal strings
    # "None" / "nan" (``text != text`` is only true for NaN).
    if text is None or (isinstance(text, float) and text != text):
        return 0

    # Drop empty terms: an empty alternative matches the empty string at
    # every word boundary and would inflate the count.
    terms = [term for term in (female_terms or ()) if term]
    if not terms:
        return 0

    pattern = r"\b(?:{})\b".format("|".join(terms))
    return len(re.findall(pattern, str(text)))
# Function to get gender tag categories
def get_gender_tag(count_m_term, count_f_term):
    """Map a pair of male/female term counts to a gender tag.

    Args:
        count_m_term: Number of male terms found.
        count_f_term: Number of female terms found.

    Returns:
        str: "No Gender" when both counts are zero, "Equal Gender" when the
        counts are equal, otherwise a "<Male|Female> Positive Gender" tag
        when the larger count makes up 50% to under 75% of the total, or a
        "<Male|Female> Strongly Positive Gender" tag at 75% or more.
        An empty string is returned if none of the conditions hold.
    """
    if count_m_term == 0 and count_f_term == 0:
        return "No Gender"
    if count_m_term == count_f_term:
        return "Equal Gender"

    # Pick the dominant side and the two labels it can receive.
    if count_m_term > count_f_term:
        dominant = count_m_term
        labels = ("Male Positive Gender", "Male Strongly Positive Gender")
    elif count_m_term < count_f_term:
        dominant = count_f_term
        labels = ("Female Positive Gender", "Female Strongly Positive Gender")
    else:
        # Counts that cannot be ordered fall through with no tag.
        return ''

    share = (dominant / (count_m_term + count_f_term)) * 100
    if 50 <= share < 75:
        return labels[0]
    if share >= 75:
        return labels[1]
    return ''
# Function to count sentences per gender tag, including Positive Gender (PG) and Strongly Positive Gender (SPG)
def get_pg_spg(sample_df):
    """Summarise how many rows fall into each gender tag category.

    Args:
        sample_df: DataFrame with a ``gender_cat`` column holding the tag
            strings ("No Gender", "Equal Gender", "Male Positive Gender",
            "Male Strongly Positive Gender", "Female Positive Gender",
            "Female Strongly Positive Gender").

    Returns:
        dict: Counts as strings under the keys "gender" (every non-null row
        that is not "No Gender"), "no gender", "equal gender", "female pg",
        "male pg", "female spg" and "male spg".
    """
    # One pass over the column; value_counts() ignores null entries, which
    # matches the non-null semantics of Series.count().
    tag_counts = sample_df["gender_cat"].value_counts()

    def tally(label):
        return tag_counts.get(label, 0)

    no_gender = tally("No Gender")
    gendered = tag_counts.sum() - no_gender

    return {
        "gender": str(gendered),
        "no gender": str(no_gender),
        "equal gender": str(tally("Equal Gender")),
        "female pg": str(tally("Female Positive Gender")),
        "male pg": str(tally("Male Positive Gender")),
        # The label must match the tag spelling exactly; a misspelled label
        # silently yields a count of 0.
        "female spg": str(tally("Female Strongly Positive Gender")),
        "male spg": str(tally("Male Strongly Positive Gender")),
    }
# Function to load dataset and get the analysis done
def load_dataset_and_analyze_gender_tag(df, sample_method, col_name, num_sample_records):
    """Sample a dataset, tag each text by gender-term balance, and summarise.

    Args:
        df: Source DataFrame, passed through to ``load_sample``.
        sample_method: Sampling method, passed through to ``load_sample``.
        col_name: Name of the text column to analyse.
        num_sample_records: Number of records, passed through to
            ``load_sample``.

    Returns:
        dict: The per-category counts produced by ``get_pg_spg``.
    """
    # Term lists come from the project configuration helper.
    male_terms = get_args("male_terms")
    female_terms = get_args("female_terms")

    # Work on a copy so the columns added below cannot leak into the
    # caller's frame should load_sample return it (or a view of it).
    sample_df = load_sample(num_sample_records, sample_method, df, col_name).copy()

    # Normalise the text: lowercase and strip surrounding whitespace.
    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()

    # Count terms against the text column only. Both counters must receive
    # the same text; passing the whole row would also search the other
    # columns' names and values.
    texts = sample_df[col_name]
    sample_df['count_male_term'] = texts.apply(
        lambda text: count_male_terms(text, male_terms)
    )
    sample_df['count_female_term'] = texts.apply(
        lambda text: count_female_terms(text, female_terms)
    )

    # Derive the tag category for each row from its two counts.
    sample_df['gender_cat'] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            sample_df['count_male_term'], sample_df['count_female_term']
        )
    ]

    # Aggregate the tags into summary statistics.
    return get_pg_spg(sample_df)
|