|
|
|
import pandas as pd |
|
import re |
|
from utils.read_config import get_args |
|
|
|
|
|
def count_male_terms(text, male_terms):
    """Count whole-word occurrences of any male term in *text*.

    Args:
        text: Value to search. It is converted with ``str()`` first, so
            non-string values (e.g. ``None``) are searched by their string
            representation.
        male_terms: Iterable of literal terms to look for.

    Returns:
        Number of non-overlapping matches, ``0`` when there are no usable
        terms.
    """
    # Escape each term so regex metacharacters are matched literally, and drop
    # empty terms: an empty alternative would match at every word boundary.
    escaped_terms = [re.escape(term) for term in male_terms if term]
    if not escaped_terms:
        return 0

    pattern = r"\b(?:{})\b".format("|".join(escaped_terms))
    return len(re.findall(pattern, str(text)))
|
|
|
|
|
def count_female_terms(text, female_terms):
    """Count whole-word occurrences of any female term in *text*.

    Args:
        text: Value to search. It is converted with ``str()`` first, so
            non-string values (e.g. ``None``) are searched by their string
            representation.
        female_terms: Iterable of literal terms to look for.

    Returns:
        Number of non-overlapping matches, ``0`` when there are no usable
        terms.
    """
    # Escape each term so regex metacharacters are matched literally, and drop
    # empty terms: an empty alternative would match at every word boundary.
    escaped_terms = [re.escape(term) for term in female_terms if term]
    if not escaped_terms:
        return 0

    pattern = r"\b(?:{})\b".format("|".join(escaped_terms))
    return len(re.findall(pattern, str(text)))
|
|
|
|
|
def get_gender_tag(count_m_term, count_f_term):
    """Classify a pair of male/female term counts into a gender tag.

    Returns:
        "No Gender" when both counts are zero, "Equal Gender" when they are
        equal, otherwise a "Male ..." or "Female ..." tag for the larger
        count: "Positive Gender" when its share of the total is in
        [50, 75) percent, "Strongly Positive Gender" at 75 percent or more.
        An empty string is returned if none of the branches applies.
    """
    if count_m_term == 0 and count_f_term == 0:
        return "No Gender"

    if count_m_term == count_f_term:
        return "Equal Gender"

    # Pick the dominant side and the pair of labels that belongs to it.
    if count_m_term > count_f_term:
        dominant = count_m_term
        positive, strongly_positive = (
            "Male Positive Gender",
            "Male Strongly Positive Gender",
        )
    elif count_m_term < count_f_term:
        dominant = count_f_term
        positive, strongly_positive = (
            "Female Positive Gender",
            "Female Strongly Positive Gender",
        )
    else:
        # Counts that compare neither equal nor ordered get no tag.
        return ''

    share = (dominant / (count_m_term + count_f_term)) * 100

    if 50 <= share < 75:
        return positive
    if share >= 75:
        return strongly_positive
    return ''
|
|
|
|
|
|
|
def load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name):
    """Return a single-column sample of *df* as an independent DataFrame.

    Args:
        sample_first_records: Maximum number of rows to keep.
        sample_random_seed: Seed passed to ``DataFrame.sample`` as
            ``random_state`` when ``sample_method`` is ``"random_pick"``.
        sample_method: ``"first_record"`` keeps the leading rows,
            ``"random_pick"`` draws rows at random. Any other value
            disables sampling.
        df: Source DataFrame.
        col_name: Name of the column to keep.

    Returns:
        A new DataFrame holding only ``col_name``. When *df* has more rows
        than ``sample_first_records`` and a known method is given, the result
        is cut down to that many rows and its index is reset; ``reset_index``
        is called with its defaults, so the previous index is kept as an
        extra column. Otherwise all rows are returned with their original
        index.
    """
    # Always hand back an explicit copy: callers add columns to the result,
    # and writing to a bare column selection of ``df`` can raise
    # SettingWithCopyWarning.
    subset = df[[col_name]].copy()

    # Sampling only applies when there are more rows than requested.
    if subset.shape[0] <= sample_first_records:
        return subset

    if sample_method == "first_record":
        return subset.iloc[:sample_first_records].reset_index()

    if sample_method == "random_pick":
        picked = subset.sample(sample_first_records, random_state=sample_random_seed)
        return picked.reset_index()

    return subset
|
|
|
|
|
def get_pg_spg(sample_df):
    """Summarise how many rows of *sample_df* fall into each gender category.

    Args:
        sample_df: DataFrame with a ``"gender_cat"`` column holding the tags
            produced by the gender tagging step.

    Returns:
        Dict with the keys ``"gender"`` (rows whose tag is not "No Gender"),
        ``"no gender"``, ``"equal gender"``, ``"female pg"``, ``"male pg"``,
        ``"female spg"`` and ``"male spg"``, each mapped to a row count.
        Missing values in ``"gender_cat"`` are not counted anywhere.
    """
    categories = sample_df["gender_cat"]

    def tally(label):
        # Number of rows tagged exactly with ``label``.
        return categories[categories == label].count()

    return {
        "gender": categories[categories != "No Gender"].count(),
        "no gender": tally("No Gender"),
        "equal gender": tally("Equal Gender"),
        "female pg": tally("Female Positive Gender"),
        "male pg": tally("Male Positive Gender"),
        # The label was previously misspelled ("Stronly"), which made this
        # count always zero.
        "female spg": tally("Female Strongly Positive Gender"),
        "male spg": tally("Male Strongly Positive Gender"),
    }
|
|
|
|
|
def load_dataset_and_analyze_gender_tag(df, sample_method, col_name):
    """Sample *df*, tag each text by gender-term balance and summarise the tags.

    The term lists and sampling settings are read through ``get_args`` using
    the keys ``"male_terms"``, ``"female_terms"``, ``"first_records"`` and
    ``"random_seed"``.

    Args:
        df: Source DataFrame.
        sample_method: Sampling method forwarded to ``load_sample``.
        col_name: Name of the text column to analyse.

    Returns:
        The category-count dict produced by ``get_pg_spg``.
    """
    male_terms = get_args("male_terms")
    female_terms = get_args("female_terms")
    sample_first_records = get_args("first_records")
    sample_random_seed = get_args("random_seed")

    sample_df = load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name)

    # Normalise the text before matching: lower-case and trim whitespace.
    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()
    texts = sample_df[col_name]

    # Both counts are taken from the text column only. The female count used
    # to be computed on the string form of the whole row, which also contains
    # column names and the other cell values.
    sample_df['count_male_term'] = texts.apply(lambda text: count_male_terms(text, male_terms))
    sample_df['count_female_term'] = texts.apply(lambda text: count_female_terms(text, female_terms))

    sample_df['gender_cat'] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            sample_df['count_male_term'], sample_df['count_female_term']
        )
    ]

    return get_pg_spg(sample_df)
|
|
|
|
|
|
|
|