Spaces:

avid-ml
/

biasaware

Sleeping

App Files Files Community

sudipta002 committed on Aug 31, 2023

Commit

b20457e

1 Parent(s): 239a9e5

Add backend scripts

Browse files

Files changed (8) hide show

.gitignore +2 -0
requirements.txt +3 -2
scripts/.keep +0 -0
scripts/gender_profession_tagging.py +140 -0
scripts/gender_tagging.py +109 -0
setup.py +7 -0
utils/config.json +160 -0
utils/read_config.py +13 -0

.gitignore CHANGED Viewed

@@ -1,5 +1,7 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class

 # Byte-compiled / optimized / DLL files
 __pycache__/
+testing/
+check_gender_tagging.py
 *.py[cod]
 *$py.class

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
-gradio==3.41.2
 gradio_client==0.5.0
 numpy==1.25.2
-pandas==2.0.3

+gradio==3.40.1
 gradio_client==0.5.0
 numpy==1.25.2
+pandas==2.0.3
+spacy

scripts/.keep DELETED Viewed

File without changes

scripts/gender_profession_tagging.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import pandas as pd
+import re
+import spacy
+from spacy.lang.en import English
+import time
+from tqdm import tqdm
+import multiprocessing.pool
+import warnings
+warnings.filterwarnings("ignore")
+from utils.read_config import get_args
+# Module-level spaCy English pipeline with the "sentencizer" component added.
+# It is built once at import time and used by get_split_text() to split a
+# text into sentences.
+nlp = English()
+nlp.add_pipe("sentencizer")
+# Function to split sentences
+def get_split_text(text):
+    """Split *text* into sentences with the module-level ``nlp`` pipeline.
+    Returns a list holding every item yielded by ``doc.sents``; callers
+    convert the items with ``str()`` when they need plain text.
+    """
+    return list(nlp(text).sents)
+def get_gender_prof_match_details(df_text):
+    """Tag each sentence of *df_text* with its pronoun and profession matches.
+    The term lists are read from the config file, turned into regex patterns
+    and applied to every sentence produced by ``get_split_text``.
+    Returns a list with one tuple per sentence:
+    ``(sentence, male_matches, female_matches, profession_matches, both_match)``
+    where the three match fields are comma-joined strings (empty when nothing
+    matched) and ``both_match`` is "Yes" when a profession appears together
+    with a male or a female pronoun, otherwise "No".
+    """
+    # Term lists come from the config file
+    male_terms = get_args("male_pronoun")
+    female_terms = get_args("female_pronoun")
+    profession_terms = get_args("professions")
+    male_pat, female_pat, prof_pat = get_regex_pattern(male_terms, female_terms, profession_terms)
+    rows = []
+    for sentence in get_split_text(df_text):
+        sentence_str = str(sentence)
+        male_hits = re.findall(male_pat, sentence_str)
+        female_hits = re.findall(female_pat, sentence_str)
+        prof_hits = re.findall(prof_pat, sentence_str)
+        # A profession must co-occur with a pronoun of either gender
+        has_pair = bool(prof_hits) and bool(male_hits or female_hits)
+        rows.append((
+            sentence_str,
+            ",".join(male_hits),
+            ",".join(female_hits),
+            ",".join(prof_hits),
+            "Yes" if has_pair else "No",
+        ))
+    return rows
+# Function to call multiprocessing threadpool
+def call_multiprocessing_pool(df_text):
+    """Run ``get_gender_prof_match_details`` over every text in a thread pool.
+    Args:
+        df_text: iterable of texts (the callers pass a pandas Series).
+    Returns:
+        A DataFrame with one row per sentence and the columns
+        "Split_Text", "Male Pronoun", "Female Pronoun", "Profession" and
+        "Both Match". It is empty (columns only) when *df_text* is empty.
+    """
+    cols = ["Split_Text", 'Male Pronoun', 'Female Pronoun', 'Profession', "Both Match"]
+    texts = list(df_text)
+    result_list = []
+    if texts:
+        # Upper bound on worker threads; never start more threads than texts
+        max_workers = 2000
+        workers = min(max_workers, len(texts))
+        # The context manager tears the pool down even when map() raises,
+        # so worker threads are not leaked on errors
+        with multiprocessing.pool.ThreadPool(processes=workers) as pool:
+            result_list = pool.map(get_gender_prof_match_details, texts, chunksize=1)
+    # result_list is nested (one list per text) -- flatten it
+    flat_return_list = [item for sublist in result_list for item in sublist]
+    return pd.DataFrame(flat_return_list, columns=cols)
+# Function to get statistics
+def get_statistics(results_df):
+    """Summarise the per-sentence tagging results.
+    *results_df* must have the columns "Male Pronoun", "Female Pronoun",
+    "Profession" and "Both Match". Returns a dict with the total number of
+    sentences, the number flagged "Yes" in "Both Match", the number with a
+    non-empty male / female pronoun field, and the number where such a
+    pronoun field is accompanied by a non-empty profession field.
+    """
+    # Row masks shared by the counts below
+    has_male = results_df["Male Pronoun"] != ""
+    has_female = results_df["Female Pronoun"] != ""
+    has_profession = results_df["Profession"] != ""
+    is_both = results_df["Both Match"] == "Yes"
+    stats = {}
+    stats["total_sentence"] = results_df.shape[0]
+    stats["both_gender_prof_match"] = results_df.loc[is_both, "Both Match"].count()
+    stats["count_male_pronoun"] = results_df.loc[has_male, "Male Pronoun"].count()
+    stats["count_female_pronoun"] = results_df.loc[has_female, "Female Pronoun"].count()
+    stats["count_male_pronoun_profession"] = results_df.loc[has_male & has_profession, "Male Pronoun"].count()
+    stats["count_female_pronoun_profession"] = results_df.loc[has_female & has_profession, "Female Pronoun"].count()
+    return stats
+# Function to return regular expression patterns
+def _build_term_pattern(terms):
+    """Build a whole-word alternation pattern matching any of *terms*."""
+    # dict.fromkeys removes duplicates while keeping first-seen order
+    unique_terms = list(dict.fromkeys(terms))
+    if not unique_terms:
+        # An empty alternation would match at every word boundary;
+        # use a pattern that can never match instead
+        return r'(?!)'
+    # Longest term first, otherwise a term that is a prefix of a multi-word
+    # term ("physician" vs "physician assistant") always wins the alternation
+    ordered = sorted(unique_terms, key=len, reverse=True)
+    # Escape the terms so they are matched literally
+    return r'\b({})\b'.format("|".join(re.escape(term) for term in ordered))
+def get_regex_pattern(male_pronoun, female_pronoun, professions):
+    """Return the regex patterns for male pronouns, female pronouns and professions.
+    Args:
+        male_pronoun: list of male pronoun strings.
+        female_pronoun: list of female pronoun strings.
+        professions: list of profession names; they are lower-cased before
+            the pattern is built.
+    Returns:
+        A tuple ``(male_pronoun_pat, female_pronoun_pat, professions_pat)`` of
+        pattern strings. Each matches any of its terms as a whole word, with
+        longer terms preferred over shorter ones that they contain.
+    """
+    male_pronoun_pat = _build_term_pattern(male_pronoun)
+    female_pronoun_pat = _build_term_pattern(female_pronoun)
+    # Lower-case the professions
+    professions_pat = _build_term_pattern([prof.lower() for prof in professions])
+    return male_pronoun_pat, female_pronoun_pat, professions_pat
+# Function to load sample of dataset
+def load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name):
+    """Return a sample of column *col_name* of *df* as a one-column DataFrame.
+    Args:
+        sample_first_records: maximum number of rows to keep.
+        sample_random_seed: seed passed to ``DataFrame.sample`` for "random_pick".
+        sample_method: "first_record" keeps the leading rows, "random_pick"
+            draws rows at random; any other value keeps every row.
+        df: source DataFrame.
+        col_name: name of the column to keep.
+    Returns:
+        An independent DataFrame holding only *col_name*, indexed 0..n-1.
+        Sampling happens only when *df* has more rows than *sample_first_records*.
+    """
+    # Keep only the required column; copy so callers can assign to the result
+    # without writing into (or getting warnings about) a slice of *df*
+    subset = df[[col_name]].copy()
+    if subset.shape[0] > sample_first_records:
+        if sample_method == "first_record":
+            subset = subset.iloc[:sample_first_records]
+        elif sample_method == "random_pick":
+            subset = subset.sample(sample_first_records, random_state=sample_random_seed)
+    # drop=True: do not insert the old index as an extra "index" column, which
+    # would also fail when the kept column itself is named "index"
+    return subset.reset_index(drop=True)
+def load_dataset_and_analyze_gender_profession(df, sample_method, col_name):
+    """Sample *df*, tag pronoun/profession co-occurrences and return the statistics.
+    The sample size and random seed are read from the config file. The text
+    column *col_name* is lower-cased and stripped before tagging. Returns the
+    dict produced by ``get_statistics``.
+    """
+    # Sampling parameters come from the config file
+    first_n = get_args("first_records")
+    seed = get_args("random_seed")
+    sample_df = load_sample(first_n, seed, sample_method, df, col_name)
+    # Normalise the text: lower-case, then strip surrounding whitespace
+    normalised = sample_df[col_name].str.lower().str.strip()
+    sample_df[col_name] = normalised
+    # Tag every text in the thread pool, then aggregate
+    return get_statistics(call_multiprocessing_pool(sample_df[col_name]))

scripts/gender_tagging.py ADDED Viewed

	@@ -0,0 +1,109 @@

+# Import required libraries
+import pandas as pd
+import re
+from utils.read_config import get_args
+# Function to get count of male terms in text
+def count_male_terms(text, male_terms):
+    """Return the number of whole-word occurrences of any of *male_terms* in ``str(text)``."""
+    alternation = "|".join(male_terms)
+    hits = re.findall(r"\b({})\b".format(alternation), str(text))
+    return len(hits)
+# Function to get count of female terms in text
+def count_female_terms(text, female_terms):
+    """Return the number of whole-word occurrences of any of *female_terms* in ``str(text)``."""
+    alternation = "|".join(female_terms)
+    hits = re.findall(r"\b({})\b".format(alternation), str(text))
+    return len(hits)
+# Function to get gender tag categories
+def get_gender_tag(count_m_term, count_f_term):
+    """Map a pair of male/female term counts to a gender category label.
+    Returns "No Gender" when both counts are zero and "Equal Gender" when they
+    are equal. Otherwise the side with the larger count is labelled
+    "Positive" when its share of all terms is in [50, 75) percent and
+    "Strongly Positive" when the share is 75 percent or more. An empty string
+    is returned when none of these conditions holds.
+    """
+    if count_m_term == 0 and count_f_term == 0:
+        return "No Gender"
+    if count_m_term == count_f_term:
+        return "Equal Gender"
+    if count_m_term > count_f_term:
+        larger = count_m_term
+        positive, strongly = "Male Positive Gender", "Male Strongly Positive Gender"
+    elif count_m_term < count_f_term:
+        larger = count_f_term
+        positive, strongly = "Female Positive Gender", "Female Strongly Positive Gender"
+    else:
+        # Counts that cannot be ordered stay untagged
+        return ''
+    # Share of the dominant side, as a percentage of all gendered terms
+    share = (larger / (count_m_term + count_f_term)) * 100
+    if share >= 75:
+        return strongly
+    if share >= 50:
+        return positive
+    return ''
+# Function to load sample of dataset
+def load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name):
+    """Return a sample of column *col_name* of *df* as a one-column DataFrame.
+    Args:
+        sample_first_records: maximum number of rows to keep.
+        sample_random_seed: seed passed to ``DataFrame.sample`` for "random_pick".
+        sample_method: "first_record" keeps the leading rows, "random_pick"
+            draws rows at random; any other value keeps every row.
+        df: source DataFrame.
+        col_name: name of the column to keep.
+    Returns:
+        An independent DataFrame holding only *col_name*, indexed 0..n-1.
+        Sampling happens only when *df* has more rows than *sample_first_records*.
+    """
+    # Keep only the required column; copy so callers can add columns to the
+    # result without writing into (or getting warnings about) a slice of *df*
+    subset = df[[col_name]].copy()
+    if subset.shape[0] > sample_first_records:
+        if sample_method == "first_record":
+            subset = subset.iloc[:sample_first_records]
+        elif sample_method == "random_pick":
+            subset = subset.sample(sample_first_records, random_state=sample_random_seed)
+    # drop=True: do not insert the old index as an extra "index" column, which
+    # would also fail when the kept column itself is named "index"
+    return subset.reset_index(drop=True)
+# Function to calculate PG and SPG
+def get_pg_spg(sample_df):
+    """Count the rows of *sample_df* in each gender category.
+    Args:
+        sample_df: DataFrame with a "gender_cat" column holding the labels
+            produced by ``get_gender_tag``.
+    Returns:
+        A dict of plain ints with the keys "gender" (rows whose label is not
+        "No Gender"), "no gender", "equal gender", "female pg", "male pg",
+        "female spg" and "male spg". Missing labels count as 0.
+    """
+    categories = sample_df["gender_cat"]
+    # One pass over the column; labels that never occur are simply absent
+    tally = categories.value_counts()
+    def _count(label):
+        return int(tally.get(label, 0))
+    count_no_gender_sentences = _count("No Gender")
+    # Every non-null label except "No Gender"
+    count_gender_sentences = int(categories.count()) - count_no_gender_sentences
+    return {
+        "gender" : count_gender_sentences,
+        "no gender" : count_no_gender_sentences,
+        "equal gender" : _count("Equal Gender"),
+        "female pg" : _count("Female Positive Gender"),
+        "male pg" : _count("Male Positive Gender"),
+        # Label must match get_gender_tag exactly ("Strongly", not "Stronly"),
+        # otherwise this count is always 0
+        "female spg" : _count("Female Strongly Positive Gender"),
+        "male spg" : _count("Male Strongly Positive Gender")
+    }
+# Function to load dataset and get the analysis done
+def load_dataset_and_analyze_gender_tag(df, sample_method, col_name):
+    """Sample *df*, count gendered terms per text and return category statistics.
+    Args:
+        df: source DataFrame.
+        sample_method: sampling method forwarded to ``load_sample``.
+        col_name: name of the text column to analyse.
+    Returns:
+        The dict produced by ``get_pg_spg`` for the sampled rows.
+    """
+    # Read config file
+    male_terms = get_args("male_terms")
+    female_terms = get_args("female_terms")
+    sample_first_records = get_args("first_records")
+    sample_random_seed = get_args("random_seed")
+    sample_df = load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name)
+    # Lowercase of text
+    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()
+    texts = sample_df[col_name]
+    # Count terms on the text column only. Counting on the whole row would
+    # also scan the string form of the other columns and the row's repr.
+    sample_df['count_male_term'] = texts.apply(lambda text: count_male_terms(text, male_terms))
+    sample_df['count_female_term'] = texts.apply(lambda text: count_female_terms(text, female_terms))
+    # Get tag categories (built as a list so an empty sample also works)
+    sample_df['gender_cat'] = [
+        get_gender_tag(m_count, f_count)
+        for m_count, f_count in zip(sample_df['count_male_term'], sample_df['count_female_term'])
+    ]
+    # Get statistics
+    return get_pg_spg(sample_df)

setup.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from setuptools import setup, find_packages
+# Package definition: distribution name 'BIASAWARE', version '0.1'.
+# The packages to include are whatever find_packages() returns.
+setup(
+    name='BIASAWARE',
+    version='0.1',
+    packages=find_packages(),
+)

utils/config.json ADDED Viewed

	@@ -0,0 +1,160 @@

+{
+"first_records" : 200,
+"random_seed" : 42,
+"male_terms" : ["man", "boy", "male", "he", "son", "his", "himself", "guy", "father", "john"],
+"female_terms" : ["woman", "girl", "female", "she", "daughter", "her", "herself", "gal", "mother", "mary"],
+"male_pronoun" : ["he", "him", "his"],
+"female_pronoun" : ["she", "her", "hers"],
+"professions" : ["Accountant",
+"Actor",
+"Actress",
+"Aerospace Engineer",
+"Agricultural Scientist",
+"Air Traffic Controller",
+"Aircraft Mechanic",
+"Animator",
+"Architect",
+"Art Director",
+"Attorney",
+"Lawyer",
+"Audiologist",
+"Author",
+"Writer",
+"Baker",
+"Barber",
+"Hairdresser",
+"Bartender",
+"Biomedical Engineer",
+"Botanist",
+"Broadcast Journalist",
+"Business Analyst",
+"Carpenter",
+"Chef",
+"Cook",
+"Chemist",
+"Civil Engineer",
+"Clinical Psychologist",
+"Commercial Diver",
+"Computer Programmer",
+"Construction Worker",
+"Corporate Trainer",
+"Cosmetologist",
+"Counselor",
+"Therapist",
+"Court Reporter",
+"Creative Director",
+"Criminologist",
+"Customer Service Representative",
+"Data Analyst",
+"Dental Assistant",
+"Dentist",
+"Dermatologist",
+"Dietician",
+"Nutritionist",
+"Doctor",
+"Physician",
+"Economist",
+"Electrician",
+"Elementary School Teacher",
+"Emergency Medical Technician",
+"Engineer",
+"Environmental Scientist",
+"Event Planner",
+"Fashion Designer",
+"Film Director",
+"Financial Analyst",
+"Firefighter",
+"Fisherman",
+"Fitness Trainer",
+"Flight Attendant",
+"Florist",
+"Food Scientist",
+"Forensic Scientist",
+"Furniture Maker",
+"Game Developer",
+"Gardener",
+"Landscaper",
+"Geologist",
+"Graphic Designer",
+"Hair Stylist",
+"Historian",
+"Home Health Aide",
+"Hotel Manager",
+"Human Resources Manager",
+"Immigration Lawyer",
+"Industrial Designer",
+"Insurance Agent",
+"Interior Designer",
+"Interpreter",
+"Translator",
+"Investment Banker",
+"IT Specialist",
+"Journalist",
+"Judge",
+"Kindergarten Teacher",
+"Land Surveyor",
+"Landscape Architect",
+"Lawyer",
+"Attorney",
+"Librarian",
+"Life Coach",
+"Linguist",
+"Makeup Artist",
+"Management Consultant",
+"Manufacturing Engineer",
+"Marine Biologist",
+"Marketing Manager",
+"Massage Therapist",
+"Mechanical Engineer",
+"Medical Assistant",
+"Medical Researcher",
+"Meteorologist",
+"Midwife",
+"Military Officer",
+"Music Producer",
+"Musician",
+"Nurse",
+"Occupational Therapist",
+"Optician",
+"Optometrist",
+"Paralegal",
+"Paramedic",
+"Patent Attorney",
+"Pediatrician",
+"Personal Trainer",
+"Petroleum Engineer",
+"Pharmacist",
+"Photographer",
+"Physical Therapist",
+"Physician Assistant",
+"Pilot",
+"Plumber",
+"Police Officer",
+"Political Scientist",
+"Preschool Teacher",
+"Private Investigator",
+"Product Manager",
+"Professor",
+"Lecturer",
+"Programmer",
+"Psychiatrist",
+"Psychologist",
+"Public Relations Specialist",
+"Public School Teacher",
+"Real Estate Agent",
+"Broker",
+"Receptionist",
+"Registered Nurse",
+"Reporter",
+"Restaurant Manager",
+"Sales Representative",
+"School Counselor",
+"Scientist",
+"Screenwriter",
+"Social Media Manager",
+"Social Worker",
+"Software Developer",
+"Speech-Language Pathologist",
+"Sports Coach",
+"Statistician"]
+}

utils/read_config.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import json
+def read_config_file():
+    """Parse ``utils/config.json`` and return its content.
+    The path is relative, so it is resolved against the current working
+    directory of the process.
+    """
+    with open("utils/config.json", "r") as config_handle:
+        return json.load(config_handle)
+def get_args(args):
+    """Return the value stored under key *args* in the config file.
+    Args:
+        args: name of the config entry to look up.
+    Returns:
+        The value of that entry.
+    Raises:
+        RuntimeError: the config file could not be opened or parsed; the
+            underlying error is attached as ``__cause__``.
+        KeyError: the config file has no entry named *args*.
+    """
+    try:
+        data = read_config_file()
+    except (OSError, ValueError) as err:
+        # OSError covers a missing or unreadable file; ValueError covers
+        # invalid JSON. Raising a bare string is itself a TypeError in
+        # Python 3, so raise a real exception carrying the message.
+        raise RuntimeError("Could not read config file.") from err
+    return data[args]