Spaces:

avid-ml
/

biasaware

Sleeping

App Files Files Community

biasaware / scripts /gender_tagging.py

sudipta002

Add backend scripts

b20457e almost 2 years ago

raw

history blame

4.09 kB

	# Import required libraries
	import pandas as pd
	import re
	from utils.read_config import get_args

	# Function to get count of male terms in text
	def count_male_terms(text, male_terms):
	    """Count whole-word occurrences of any male term in ``text``.

	    Matching is case-sensitive; callers that want case-insensitive counts
	    should lowercase both the text and the terms beforehand.

	    Args:
	        text: Value to search. It is converted with ``str()``, so non-string
	            values (e.g. ``None`` or NaN) are searched as their string form.
	        male_terms: Iterable of literal terms to look for.

	    Returns:
	        Number of non-overlapping whole-word matches (``int``).
	    """
	    # Terms are literals, not regex fragments, so escape them. Empty terms are
	    # dropped because an empty alternative matches at every word boundary.
	    escaped_terms = [re.escape(str(term)) for term in male_terms if str(term)]
	    if not escaped_terms:
	        return 0
	    # Join with a plain "|" (regex alternation). An escaped "\|" would only
	    # match a literal pipe character and make every count zero.
	    pattern = r"\b(?:{})\b".format("|".join(escaped_terms))
	    return len(re.findall(pattern, str(text)))

	# Function to get count of female terms in text
	def count_female_terms(text, female_terms):
	    """Count whole-word occurrences of any female term in ``text``.

	    Matching is case-sensitive; callers that want case-insensitive counts
	    should lowercase both the text and the terms beforehand.

	    Args:
	        text: Value to search. It is converted with ``str()``, so non-string
	            values (e.g. ``None`` or NaN) are searched as their string form.
	        female_terms: Iterable of literal terms to look for.

	    Returns:
	        Number of non-overlapping whole-word matches (``int``).
	    """
	    # Terms are literals, not regex fragments, so escape them. Empty terms are
	    # dropped because an empty alternative matches at every word boundary.
	    escaped_terms = [re.escape(str(term)) for term in female_terms if str(term)]
	    if not escaped_terms:
	        return 0
	    # Join with a plain "|" (regex alternation). An escaped "\|" would only
	    # match a literal pipe character and make every count zero.
	    pattern = r"\b(?:{})\b".format("|".join(escaped_terms))
	    return len(re.findall(pattern, str(text)))

	# Function to get gender tag categories
	def get_gender_tag(count_m_term, count_f_term):
	    """Map a pair of male/female term counts to a gender tag.

	    Rules, checked in order:
	      * both counts zero            -> "No Gender"
	      * counts equal                -> "Equal Gender"
	      * one side larger: that side's share of the total, as a percentage,
	        picks "<side> Positive Gender" (50 <= share < 75) or
	        "<side> Strongly Positive Gender" (share >= 75)

	    Returns an empty string when none of the rules applies.
	    """
	    if count_m_term == 0 and count_f_term == 0:
	        return "No Gender"
	    if count_m_term == count_f_term:
	        return "Equal Gender"
	    # Pick the dominant side and the two labels it can produce.
	    if count_m_term > count_f_term:
	        dominant = count_m_term
	        positive_tag = "Male Positive Gender"
	        strong_tag = "Male Strongly Positive Gender"
	    elif count_m_term < count_f_term:
	        dominant = count_f_term
	        positive_tag = "Female Positive Gender"
	        strong_tag = "Female Strongly Positive Gender"
	    else:
	        # Counts that are neither equal nor ordered fall through untagged.
	        return ''
	    share = (dominant / (count_m_term + count_f_term)) * 100
	    if share >= 75:
	        return strong_tag
	    if share >= 50:
	        return positive_tag
	    return ''


	# Function to load sample of dataset
	def load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name):
	    """Return a one-column working copy of ``df``, sampled down if it is too large.

	    Args:
	        sample_first_records: Maximum number of rows to keep.
	        sample_random_seed: ``random_state`` used for ``"random_pick"``.
	        sample_method: ``"first_record"`` keeps the leading rows,
	            ``"random_pick"`` draws a random sample. Any other value keeps
	            every row.
	        df: Source DataFrame; it is never modified.
	        col_name: Name of the column to keep.

	    Returns:
	        A new DataFrame. When sampling happens the index is reset and the
	        previous row labels are kept in an extra ``index`` column; otherwise
	        the frame holds only ``col_name`` with its original index.
	    """
	    # Keep only the required column. The explicit copy lets callers add or
	    # overwrite columns without pandas chained-assignment warnings.
	    subset = df[[col_name]].copy()
	    needs_sampling = subset.shape[0] > sample_first_records
	    if needs_sampling and sample_method == "first_record":
	        return subset.iloc[:sample_first_records].reset_index()
	    if needs_sampling and sample_method == "random_pick":
	        return subset.sample(sample_first_records, random_state=sample_random_seed).reset_index()
	    return subset

	# Function to calculate PG and SPG
	def get_pg_spg(sample_df):
	    """Summarise how many rows carry each gender tag.

	    Args:
	        sample_df: DataFrame with a ``gender_cat`` column of tag strings.

	    Returns:
	        Dict of plain ``int`` counts keyed by ``"gender"`` (every non-null
	        tag other than "No Gender"), ``"no gender"``, ``"equal gender"``,
	        ``"female pg"``, ``"male pg"``, ``"female spg"`` and ``"male spg"``.
	    """
	    # One pass over the column; null tags are excluded by value_counts.
	    tag_counts = sample_df["gender_cat"].value_counts()
	    def tally(tag):
	        # Tags absent from the data count as zero.
	        return int(tag_counts.get(tag, 0))
	    count_no_gender_sentences = tally("No Gender")
	    return {
	        "gender": int(tag_counts.sum()) - count_no_gender_sentences,
	        "no gender": count_no_gender_sentences,
	        "equal gender": tally("Equal Gender"),
	        "female pg": tally("Female Positive Gender"),
	        "male pg": tally("Male Positive Gender"),
	        # The label must match get_gender_tag's spelling ("Strongly").
	        "female spg": tally("Female Strongly Positive Gender"),
	        "male spg": tally("Male Strongly Positive Gender"),
	    }

	# Function to load dataset and get the analysis done
	def load_dataset_and_analyze_gender_tag(df, sample_method, col_name):
	    """Sample ``df``, tag each text in ``col_name`` by gender, and summarise.

	    The term lists, sample size and random seed are read through
	    ``get_args`` under the keys ``"male_terms"``, ``"female_terms"``,
	    ``"first_records"`` and ``"random_seed"``.

	    Args:
	        df: Source DataFrame; it is not modified.
	        sample_method: Sampling strategy forwarded to ``load_sample``.
	        col_name: Name of the text column to analyse.

	    Returns:
	        The tag-count dict produced by ``get_pg_spg``.
	    """
	    # Read config file
	    male_terms = get_args("male_terms")
	    female_terms = get_args("female_terms")
	    sample_first_records = get_args("first_records")
	    sample_random_seed = get_args("random_seed")
	    # Work on a private copy so the column assignments below never touch,
	    # or warn about, the caller's frame.
	    sample_df = load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name).copy()
	    # Normalise the text: lowercase and strip surrounding whitespace.
	    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()
	    # Count terms on the text cell itself. Passing the whole row would
	    # search the row's string repr, which includes other columns and
	    # truncates long text.
	    texts = sample_df[col_name]
	    sample_df['count_male_term'] = texts.apply(lambda text: count_male_terms(text, male_terms))
	    sample_df['count_female_term'] = texts.apply(lambda text: count_female_terms(text, female_terms))
	    # Tag row by row; a plain loop also copes with an empty frame.
	    sample_df['gender_cat'] = [
	        get_gender_tag(male_count, female_count)
	        for male_count, female_count in zip(sample_df['count_male_term'], sample_df['count_female_term'])
	    ]
	    # Get statistics
	    return get_pg_spg(sample_df)