File size: 3,543 Bytes
b20457e
 
 
 
a365da6
b20457e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a365da6
 
 
 
 
 
 
b20457e
 
 
a365da6
b20457e
 
 
a365da6
 
b20457e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Import required libraries
import pandas as pd
import re
from utils.read_config import get_args
from utils.load_csv import load_sample

# Function to get count of male terms in text
def count_male_terms(text, male_terms):
    """Count whole-term occurrences of any male term in ``text``.

    Args:
        text: Value to search; it is converted with ``str()`` before matching.
        male_terms: Iterable of literal terms (strings) to look for.

    Returns:
        The number of non-overlapping matches, or 0 when no usable term
        is supplied.
    """
    # Escape every term so characters such as "." are matched literally
    # (an unescaped "mr." would also match "mrs"); skip empty entries.
    alternatives = [re.escape(term) for term in male_terms if term]
    if not alternatives:
        # An empty alternation would match at every word boundary and
        # produce a bogus non-zero count.
        return 0

    # The lookarounds act like \b for terms made of word characters, and
    # also work for terms that begin or end with punctuation.
    pattern = r"(?<!\w)(?:{})(?!\w)".format("|".join(alternatives))
    return len(re.findall(pattern, str(text)))

# Function to get count of female terms in text
def count_female_terms(text, female_terms):
    """Count whole-term occurrences of any female term in ``text``.

    Args:
        text: Value to search; it is converted with ``str()`` before matching.
        female_terms: Iterable of literal terms (strings) to look for.

    Returns:
        The number of non-overlapping matches, or 0 when no usable term
        is supplied.
    """
    # Escape every term so characters such as "." are matched literally
    # (an unescaped "ms." would also match "mss"); skip empty entries.
    alternatives = [re.escape(term) for term in female_terms if term]
    if not alternatives:
        # An empty alternation would match at every word boundary and
        # produce a bogus non-zero count.
        return 0

    # The lookarounds act like \b for terms made of word characters, and
    # also work for terms that begin or end with punctuation.
    pattern = r"(?<!\w)(?:{})(?!\w)".format("|".join(alternatives))
    return len(re.findall(pattern, str(text)))

# Function to get gender tag categories
def get_gender_tag(count_m_term, count_f_term):
    """Classify a text by the balance of its male and female term counts.

    Args:
        count_m_term: Number of male terms found in the text.
        count_f_term: Number of female terms found in the text.

    Returns:
        "No Gender" when both counts are zero, "Equal Gender" when they
        match, otherwise a "Positive" (dominant share from 50% up to but
        not including 75%) or "Strongly Positive" (75% or more) tag for
        the dominant gender. An empty string is returned when none of
        these cases applies.
    """
    if count_m_term == 0 and count_f_term == 0:
        return "No Gender"
    if count_m_term == count_f_term:
        return "Equal Gender"

    # (positive label, strongly positive label) per dominant gender.
    labels = {
        "male": ("Male Positive Gender", "Male Strongly Positive Gender"),
        "female": ("Female Positive Gender", "Female Strongly Positive Gender"),
    }

    if count_m_term > count_f_term:
        dominant_key, dominant_count = "male", count_m_term
    elif count_m_term < count_f_term:
        dominant_key, dominant_count = "female", count_f_term
    else:
        # Counts that cannot be ordered (e.g. NaN) get no tag.
        return ''

    positive_label, strong_label = labels[dominant_key]
    share = (dominant_count / (count_m_term + count_f_term)) * 100

    if share >= 75:
        return strong_label
    if share >= 50:
        return positive_label
    return ''


# Function to calculate PG and SPG
def get_pg_spg(sample_df):
    """Summarize how many rows fall into each gender tag category.

    Args:
        sample_df: DataFrame with a "gender_cat" column holding the tags
            assigned to each row.

    Returns:
        A dict mapping each summary key ("gender", "no gender",
        "equal gender", "female pg", "male pg", "female spg", "male spg")
        to its row count rendered as a string. "gender" counts every
        non-null row whose tag is not "No Gender".
    """
    tags = sample_df["gender_cat"]

    def tally(tag):
        # Number of rows carrying exactly this tag.
        return int((tags == tag).sum())

    no_gender = tally("No Gender")
    # Every non-null row that is not "No Gender" counts as gendered.
    gendered = int(tags.count()) - no_gender

    return {
        "gender": str(gendered),
        "no gender": str(no_gender),
        "equal gender": str(tally("Equal Gender")),
        "female pg": str(tally("Female Positive Gender")),
        "male pg": str(tally("Male Positive Gender")),
        # The tag must be spelled "Strongly"; a misspelled label never
        # matches and silently reports zero.
        "female spg": str(tally("Female Strongly Positive Gender")),
        "male spg": str(tally("Male Strongly Positive Gender")),
    }

# Function to load dataset and get the analysis done
def load_dataset_and_analyze_gender_tag(df, sample_method, col_name, num_sample_records):
    """Sample ``df``, tag each sampled text by gender-term balance, and summarize.

    Args:
        df: Source data, passed through to ``load_sample``.
        sample_method: Sampling method, passed through to ``load_sample``.
        col_name: Name of the text column to analyze.
        num_sample_records: Sample size, passed through to ``load_sample``.

    Returns:
        The summary dictionary produced by ``get_pg_spg`` for the tagged
        sample.
    """
    # Term lists come from the config file
    # (presumably lists of strings; confirm against utils.read_config).
    male_terms = get_args("male_terms")
    female_terms = get_args("female_terms")

    # Load sample
    sample_df = load_sample(num_sample_records, sample_method, df, col_name)

    # Lowercase and trim the text so matching is case-insensitive.
    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()
    texts = sample_df[col_name]

    # Count terms in the text column only. Passing the whole row would
    # search the row's string representation, which truncates long text
    # and includes the other columns.
    sample_df['count_male_term'] = texts.apply(
        lambda text: count_male_terms(text, male_terms)
    )
    sample_df['count_female_term'] = texts.apply(
        lambda text: count_female_terms(text, female_terms)
    )

    # Get tag categories
    sample_df['gender_cat'] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            sample_df['count_male_term'], sample_df['count_female_term']
        )
    ]

    # Get statistics
    return get_pg_spg(sample_df)