Merge remote-tracking branch 'origin/dev-sudipta' into dev-frey
Browse files- .gitignore +3 -0
- app.py +42 -12
- data/z_animal.csv +11 -0
- data/z_employee.csv +26 -0
- data/z_house.csv +7 -0
- requirements.txt +4 -2
- scripts/.keep +0 -0
- scripts/genbit_metrics.py +48 -0
- scripts/gender_profession_tagging.py +129 -0
- scripts/gender_tagging.py +93 -0
- utils/config.json +160 -0
- utils/load_csv.py +23 -0
- utils/read_config.py +13 -0
.gitignore
CHANGED
@@ -1,5 +1,8 @@
|
|
1 |
# Byte-compiled / optimized / DLL files
|
2 |
__pycache__/
|
|
|
|
|
|
|
3 |
*.py[cod]
|
4 |
*$py.class
|
5 |
|
|
|
1 |
# Byte-compiled / optimized / DLL files
|
2 |
__pycache__/
|
3 |
+
testing/
|
4 |
+
flagged/
|
5 |
+
check_gender_tagging.py
|
6 |
*.py[cod]
|
7 |
*$py.class
|
8 |
|
app.py
CHANGED
@@ -1,6 +1,11 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import os
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
dummy_data = [
|
6 |
["Category", "Value", "Percentage"],
|
@@ -42,14 +47,32 @@ def display_methodology(methodology):
|
|
42 |
)
|
43 |
|
44 |
|
45 |
-
def run_evaluation(dataset_file, dataset_scope, dataset_scope_n,
|
46 |
-
status = {
|
47 |
-
"dataset": dataset_file.name,
|
48 |
-
"methodology": methodology,
|
49 |
-
"scope": dataset_scope + " " + str(dataset_scope_n),
|
50 |
-
"column": dataset_corpus.columns[0]
|
51 |
|
52 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
return gr.JSON.update(status, visible=True)
|
55 |
|
@@ -62,7 +85,11 @@ def run_evaluation(dataset_file, dataset_scope, dataset_scope_n, dataset_corpus,
|
|
62 |
|
63 |
|
64 |
def process_dataset(dataset):
|
|
|
65 |
data = pd.read_csv(dataset.name)
|
|
|
|
|
|
|
66 |
|
67 |
columns = data.columns.tolist()
|
68 |
columns = [x for x in columns if data[x].dtype == "object"]
|
@@ -98,7 +125,7 @@ def process_dataset(dataset):
|
|
98 |
|
99 |
def process_column(dataset, column):
|
100 |
data = pd.read_csv(dataset.name)
|
101 |
-
corpus = data[column].to_list()
|
102 |
|
103 |
return gr.Dataframe.update(value=pd.DataFrame({"Data Corpus": corpus}), max_rows=5, visible=True)
|
104 |
|
@@ -118,9 +145,12 @@ with BiasAware:
|
|
118 |
dataset_file = gr.File()
|
119 |
dataset_examples = gr.Examples(
|
120 |
[
|
121 |
-
os.path.join(os.path.dirname(__file__),
|
122 |
-
|
123 |
-
os.path.join(os.path.dirname(__file__),
|
|
|
|
|
|
|
124 |
|
125 |
],
|
126 |
inputs=dataset_file,
|
@@ -182,7 +212,7 @@ with BiasAware:
|
|
182 |
evalButton.click(
|
183 |
fn=run_evaluation,
|
184 |
inputs=[dataset_file, dataset_scope,
|
185 |
-
dataset_scope_n,
|
186 |
outputs=[result_status]
|
187 |
)
|
188 |
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import os
|
4 |
+
from scripts.genbit_metrics import *
|
5 |
+
from scripts.gender_profession_tagging import *
|
6 |
+
from scripts.gender_tagging import *
|
7 |
+
from utils.load_csv import *
|
8 |
+
from utils.read_config import get_args
|
9 |
|
10 |
dummy_data = [
|
11 |
["Category", "Value", "Percentage"],
|
|
|
47 |
)
|
48 |
|
49 |
|
50 |
+
def run_evaluation(dataset_file, dataset_scope, dataset_scope_n, dataset_columns, methodology):
    """Dispatch the selected bias-evaluation methodology over the uploaded CSV.

    Returns a gr.JSON update carrying the methodology's statistics dict
    (an empty dict when the methodology label matches none of the known ones).
    """
    # Parse the uploaded file into a DataFrame.
    dataset = check_csv(dataset_file.name)

    # Map each UI methodology label to its analysis entry point.
    dispatch = {
        "Term Identity Diversity Analysis": load_dataset_and_analyze_gender_tag,
        "Gender Label Evaluation": load_dataset_and_analyze_gender_profession,
        "Microsoft Genbit": load_dataset_and_get_genbit_metrics,
    }

    analyze = dispatch.get(methodology)
    if analyze is not None:
        status = analyze(dataset, dataset_scope, dataset_columns, dataset_scope_n)
    else:
        status = {}

    return gr.JSON.update(status, visible=True)
|
78 |
|
|
|
85 |
|
86 |
|
87 |
def process_dataset(dataset):
|
88 |
+
|
89 |
data = pd.read_csv(dataset.name)
|
90 |
+
# maximum_records = get_args("first_records")
|
91 |
+
# input_records = data.shape(0)
|
92 |
+
# num_sample_records = input_records if input_records < maximum_records else maximum_records
|
93 |
|
94 |
columns = data.columns.tolist()
|
95 |
columns = [x for x in columns if data[x].dtype == "object"]
|
|
|
125 |
|
126 |
def process_column(dataset, column):
    """Preview the first 10 values of *column* from the uploaded CSV as a Dataframe update."""
    preview = pd.read_csv(dataset.name)[column].to_list()[:10]
    frame = pd.DataFrame({"Data Corpus": preview})
    return gr.Dataframe.update(value=frame, max_rows=5, visible=True)
|
131 |
|
|
|
145 |
dataset_file = gr.File()
|
146 |
dataset_examples = gr.Examples(
|
147 |
[
|
148 |
+
os.path.join(os.path.dirname(__file__),
|
149 |
+
"data/z_animal.csv"),
|
150 |
+
os.path.join(os.path.dirname(__file__),
|
151 |
+
"data/z_employee.csv"),
|
152 |
+
os.path.join(os.path.dirname(
|
153 |
+
__file__), "data/z_house.csv"),
|
154 |
|
155 |
],
|
156 |
inputs=dataset_file,
|
|
|
212 |
evalButton.click(
|
213 |
fn=run_evaluation,
|
214 |
inputs=[dataset_file, dataset_scope,
|
215 |
+
dataset_scope_n, dataset_columns, methodology],
|
216 |
outputs=[result_status]
|
217 |
)
|
218 |
|
data/z_animal.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AnimalID,CommonName,ScientificName,Class,Order,Family,Habitat,ConservationStatus
|
2 |
+
1,Lion,Panthera leo,Mammalia,Carnivora,Felidae,Savanna,Vulnerable
|
3 |
+
2,Eagle,Aquila chrysaetos,Aves,Accipitriformes,Accipitridae,Mountains,Least Concern
|
4 |
+
3,Dolphin,Tursiops truncatus,Mammalia,Cetacea,Delphinidae,Ocean,Least Concern
|
5 |
+
4,Elephant,Loxodonta africana,Mammalia,Proboscidea,Elephantidae,Grassland,Vulnerable
|
6 |
+
5,Tiger,Panthera tigris,Mammalia,Carnivora,Felidae,Forest,Endangered
|
7 |
+
6,Penguin,Spheniscidae,Aves,Sphenisciformes,Spheniscidae,Antarctica,Least Concern
|
8 |
+
7,Giraffe,Giraffa camelopardalis,Mammalia,Artiodactyla,Giraffidae,Savanna,Vulnerable
|
9 |
+
8,Cheetah,Acinonyx jubatus,Mammalia,Carnivora,Felidae,Grassland,Vulnerable
|
10 |
+
9,Panda,Ailuropoda melanoleuca,Mammalia,Carnivora,Ursidae,Forest,Endangered
|
11 |
+
10,Kangaroo,Macropus rufus,Mammalia,Diprotodontia,Macropodidae,Grassland,Least Concern
|
data/z_employee.csv
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
EmployeeID,FirstName,LastName,Email,Department,Salary
|
2 |
+
101,John,Smith,[email protected],Finance,60000
|
3 |
+
102,Emily,Johnson,[email protected],Marketing,55000
|
4 |
+
103,Michael,Williams,[email protected],HR,50000
|
5 |
+
104,Susan,Anderson,[email protected],IT,65000
|
6 |
+
105,David,Martin,[email protected],Sales,58000
|
7 |
+
106,Linda,Davis,[email protected],Finance,62000
|
8 |
+
107,William,Miller,[email protected],Marketing,56000
|
9 |
+
108,Sarah,Anderson,[email protected],HR,51000
|
10 |
+
109,Robert,Clark,[email protected],IT,67000
|
11 |
+
110,Karen,Wilson,[email protected],Sales,59000
|
12 |
+
111,James,Brown,[email protected],Finance,61000
|
13 |
+
112,Anna,Johnson,[email protected],Marketing,57000
|
14 |
+
113,Christopher,Moore,[email protected],HR,52000
|
15 |
+
114,Laura,White,[email protected],IT,68000
|
16 |
+
115,Mark,Davis,[email protected],Sales,60000
|
17 |
+
116,Patricia,Jones,[email protected],Finance,63000
|
18 |
+
117,Matthew,Taylor,[email protected],Marketing,58000
|
19 |
+
118,Jennifer,Young,[email protected],HR,53000
|
20 |
+
119,Steven,Anderson,[email protected],IT,69000
|
21 |
+
120,Elizabeth,Thomas,[email protected],Sales,61000
|
22 |
+
121,Kevin,Harris,[email protected],Finance,64000
|
23 |
+
122,Deborah,Smith,[email protected],Marketing,59000
|
24 |
+
123,Joseph,Walker,[email protected],HR,54000
|
25 |
+
124,Cynthia,Jackson,[email protected],IT,70000
|
26 |
+
125,Daniel,Hall,[email protected],Sales,62000
|
data/z_house.csv
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
PropertyID,StreetAddress,City,State,ZipCode,NumberOfBedrooms,NumberOfBathrooms,SquareFootage,Price
|
2 |
+
1,123 Main St,Los Angeles,CA,90001,3,2,1800,550000
|
3 |
+
2,456 Elm St,New York,NY,10001,2,1,1200,750000
|
4 |
+
3,789 Oak St,San Francisco,CA,94101,4,3,2500,950000
|
5 |
+
4,101 Maple St,Boston,MA,02101,3,2.5,2000,680000
|
6 |
+
5,202 Pine St,Miami,FL,33101,4,3.5,2700,820000
|
7 |
+
6,303 Cedar St,Chicago,IL,60601,2,1,1100,450000
|
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
-
gradio==3.
|
2 |
gradio_client==0.5.0
|
3 |
numpy==1.25.2
|
4 |
-
pandas==2.0.3
|
|
|
|
|
|
1 |
+
gradio==3.40.1
|
2 |
gradio_client==0.5.0
|
3 |
numpy==1.25.2
|
4 |
+
pandas==2.0.3
|
5 |
+
spacy
|
6 |
+
genbit
|
scripts/.keep
DELETED
File without changes
|
scripts/genbit_metrics.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from genbit.genbit_metrics import GenBitMetrics
|
2 |
+
import pandas as pd
|
3 |
+
from utils.read_config import get_args
|
4 |
+
from utils.load_csv import load_sample
|
5 |
+
|
6 |
+
|
7 |
+
def cal_metrics(dataset):
    """Run GenBit over *dataset* (a list of raw text strings) and return its metrics dict."""
    # Configure the GenBit accumulator.
    gb = GenBitMetrics(language_code="en", context_window=5,
                       distance_weight=0.95, percentile_cutoff=80)

    # Feed the raw (untokenized) text in.
    gb.add_data(dataset, tokenized=False)

    # Full statistics and word lists are requested in the output.
    return gb.get_metrics(output_statistics=True, output_word_list=True)
|
22 |
+
|
23 |
+
|
24 |
+
# Extract the headline GenBit numbers from the full metrics payload.
def extract_genbit_metris(stats):
    """Return the key GenBit scores from *stats*, stringified for JSON display."""
    top_level_keys = (
        "genbit_score",
        "percentage_of_female_gender_definition_words",
        "percentage_of_male_gender_definition_words",
        "percentage_of_non_binary_gender_definition_words",
        "percentage_of_trans_gender_definition_words",
        "percentage_of_cis_gender_definition_words",
    )
    metrics = {key: str(stats[key]) for key in top_level_keys}
    # The word count lives one level down, under "statistics".
    metrics["num_words_considered"] = str(stats["statistics"]["num_words_considered"])
    return metrics
|
36 |
+
|
37 |
+
def load_dataset_and_get_genbit_metrics(df, sample_method, col_name, num_sample_records):
    """Sample *df*, run GenBit on column *col_name*, and return the extracted metrics."""
    sample_df = load_sample(num_sample_records, sample_method, df, col_name)

    # GenBit consumes a plain list of strings.
    corpus = sample_df[col_name].tolist()

    full_stats = cal_metrics(corpus)
    return extract_genbit_metris(full_stats)
|
scripts/gender_profession_tagging.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import re
|
3 |
+
import spacy
|
4 |
+
from spacy.lang.en import English
|
5 |
+
import time
|
6 |
+
from tqdm import tqdm
|
7 |
+
import multiprocessing.pool
|
8 |
+
|
9 |
+
import warnings
|
10 |
+
warnings.filterwarnings("ignore")
|
11 |
+
from utils.read_config import get_args
|
12 |
+
from utils.load_csv import load_sample
|
13 |
+
|
14 |
+
|
15 |
+
# For sentence split
|
16 |
+
nlp = English()
|
17 |
+
nlp.add_pipe("sentencizer")
|
18 |
+
|
19 |
+
# Sentence-split a text with the spaCy sentencizer pipeline.
def get_split_text(text):
    """Return the list of sentence spans produced by the module-level `nlp` pipeline."""
    return [sent for sent in nlp(text).sents]
|
25 |
+
|
26 |
+
def get_gender_prof_match_details(df_text):
    """Analyze each sentence of *df_text* for pronoun and profession mentions.

    Returns a list of tuples:
    (sentence, male-pronoun matches, female-pronoun matches,
     profession matches, both-match flag "Yes"/"No").
    """
    # Term lists come from the shared config file.
    male_terms = get_args("male_pronoun")
    female_terms = get_args("female_pronoun")
    profession_terms = get_args("professions")

    male_pat, female_pat, prof_pat = get_regex_pattern(
        male_terms, female_terms, profession_terms)

    rows = []
    for sentence in get_split_text(df_text):
        sent_str = str(sentence)

        male_hits = re.findall(male_pat, sent_str)
        female_hits = re.findall(female_pat, sent_str)
        prof_hits = re.findall(prof_pat, sent_str)

        # "Yes" when a profession co-occurs with either kind of pronoun.
        both = "Yes" if prof_hits and (male_hits or female_hits) else "No"

        rows.append((
            sent_str,
            ",".join(male_hits),
            ",".join(female_hits),
            ",".join(prof_hits),
            both,
        ))

    return rows
|
64 |
+
|
65 |
+
# Fan the per-text sentence analysis out over a thread pool.
def call_multiprocessing_pool(df_text):
    """Analyze every text in *df_text* concurrently; return one DataFrame of rows."""
    worker_count = 2000  # pool size kept as originally configured
    pool = multiprocessing.pool.ThreadPool(processes=worker_count)
    per_text_results = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
    pool.close()

    # Each map result is a list of row tuples -- flatten before framing.
    rows = [row for result in per_text_results for row in result]

    headers = ["Split_Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(rows, columns=headers)
|
80 |
+
|
81 |
+
# Summarize the per-sentence match table as count statistics.
def get_statistics(results_df):
    """Return pronoun/profession match counts from *results_df* as a dict of strings."""
    has_male = results_df["Male Pronoun"] != ""
    has_female = results_df["Female Pronoun"] != ""
    has_prof = results_df["Profession"] != ""
    both_yes = results_df["Both Match"] == "Yes"

    return {
        "total_sentence": str(results_df.shape[0]),
        "both_gender_prof_match": str(results_df[both_yes]["Both Match"].count()),
        "count_male_pronoun": str(results_df[has_male]["Male Pronoun"].count()),
        "count_female_pronoun": str(results_df[has_female]["Female Pronoun"].count()),
        "count_male_pronoun_profession": str(
            results_df[has_male & has_prof]["Male Pronoun"].count()),
        "count_female_pronoun_profession": str(
            results_df[has_female & has_prof]["Female Pronoun"].count()),
    }
|
99 |
+
|
100 |
+
# Build the three word-boundary alternation patterns from term lists.
def get_regex_pattern(male_pronoun, female_pronoun, professions):
    """Return (male, female, profession) regex patterns.

    Professions are lowercased to match the lowercased input text.
    """
    def alternation(terms):
        # \b(term1|term2|...)\b -- whole-word alternation
        return r'\b({})\b'.format("|".join(terms))

    male_pronoun_pat = alternation(male_pronoun)
    female_pronoun_pat = alternation(female_pronoun)
    professions_pat = alternation([prof.lower() for prof in professions])

    return male_pronoun_pat, female_pronoun_pat, professions_pat
|
112 |
+
|
113 |
+
|
114 |
+
def load_dataset_and_analyze_gender_profession(df, sample_method, col_name, num_sample_records):
    """Sample *df*, normalize the text column, and return pronoun/profession statistics."""
    sample_df = load_sample(num_sample_records, sample_method, df, col_name)

    # Matching uses lowercase terms, so normalize the text first.
    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()

    # Run the per-sentence analysis across the thread pool.
    results_df = call_multiprocessing_pool(sample_df[col_name])

    return get_statistics(results_df)
|
scripts/gender_tagging.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Import required libraries
|
2 |
+
import pandas as pd
|
3 |
+
import re
|
4 |
+
from utils.read_config import get_args
|
5 |
+
from utils.load_csv import load_sample
|
6 |
+
|
7 |
+
# Count whole-word occurrences of any configured male term.
def count_male_terms(text, male_terms):
    """Return how many word-boundary matches of *male_terms* appear in *text*."""
    alternation = "|".join(male_terms)
    return len(re.findall(r"\b({})\b".format(alternation), str(text)))
|
13 |
+
|
14 |
+
# Count whole-word occurrences of any configured female term.
def count_female_terms(text, female_terms):
    """Return how many word-boundary matches of *female_terms* appear in *text*."""
    alternation = "|".join(female_terms)
    return len(re.findall(r"\b({})\b".format(alternation), str(text)))
|
20 |
+
|
21 |
+
# Classify a sentence by the balance of male vs. female term counts.
def get_gender_tag(count_m_term, count_f_term):
    """Return the gender-balance category for the given term counts.

    Categories: "No Gender", "Equal Gender", "<side> Positive Gender"
    (50-75% dominance), "<side> Strongly Positive Gender" (>= 75%).
    """
    total = count_m_term + count_f_term
    if total == 0:
        return "No Gender"
    if count_m_term == count_f_term:
        return "Equal Gender"

    if count_m_term > count_f_term:
        share = (count_m_term / total) * 100
        if share >= 75:
            return "Male Strongly Positive Gender"
        if share >= 50:
            return "Male Positive Gender"
    else:
        share = (count_f_term / total) * 100
        if share >= 75:
            return "Female Strongly Positive Gender"
        if share >= 50:
            return "Female Positive Gender"

    return ''
|
45 |
+
|
46 |
+
|
47 |
+
# Aggregate per-row gender tags into PG/SPG count statistics.
def get_pg_spg(sample_df):
    """Return counts of each gender_cat value in *sample_df* as a dict of strings.

    Bug fix: the female SPG bucket previously matched the misspelled tag
    "Female Stronly Positive Gender", which get_gender_tag never emits,
    so "female spg" was always "0". It now matches the real tag.
    """
    cat = sample_df["gender_cat"]

    return {
        "gender": str(cat[cat != "No Gender"].count()),
        "no gender": str(cat[cat == "No Gender"].count()),
        "equal gender": str(cat[cat == "Equal Gender"].count()),
        "female pg": str(cat[cat == "Female Positive Gender"].count()),
        "male pg": str(cat[cat == "Male Positive Gender"].count()),
        "female spg": str(cat[cat == "Female Strongly Positive Gender"].count()),
        "male spg": str(cat[cat == "Male Strongly Positive Gender"].count()),
    }
|
69 |
+
|
70 |
+
# End-to-end driver: sample, tag each row's text, and summarize.
def load_dataset_and_analyze_gender_tag(df, sample_method, col_name, num_sample_records):
    """Sample *df*, tag each row by gender-term balance, and return PG/SPG stats.

    Bug fix: the female-term count previously applied count_female_terms to the
    whole row (``x[:]``) instead of the text column, so female counts were
    computed over the row's string representation (index labels included),
    skewing every tag. Both counts now run on the text column only.
    """
    # Term lists from the shared config file.
    male_terms = get_args("male_terms")
    female_terms = get_args("female_terms")

    sample_df = load_sample(num_sample_records, sample_method, df, col_name)

    # Term matching is lowercase, so normalize the text first.
    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()

    # Per-row male/female term counts over the text column.
    sample_df['count_male_term'] = sample_df[col_name].apply(
        lambda text: count_male_terms(text, male_terms))
    sample_df['count_female_term'] = sample_df[col_name].apply(
        lambda text: count_female_terms(text, female_terms))

    # Map the counts to a gender-balance category per row.
    sample_df['gender_cat'] = sample_df.apply(
        lambda row: get_gender_tag(row['count_male_term'], row['count_female_term']),
        axis=1)

    # Summarize the tags.
    return get_pg_spg(sample_df)
|
91 |
+
|
92 |
+
|
93 |
+
|
utils/config.json
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"first_records" : 2000,
|
3 |
+
"random_seed" : 42,
|
4 |
+
"male_terms" : ["man", "boy", "male", "he", "son", "his", "himself", "guy", "father", "john"],
|
5 |
+
"female_terms" : ["woman", "girl", "female", "she", "daughter", "her", "herself", "gal", "mother", "mary"],
|
6 |
+
"male_pronoun" : ["he", "him", "his"],
|
7 |
+
"female_pronoun" : ["she", "her", "hers"],
|
8 |
+
"professions" : ["Accountant",
|
9 |
+
"Actor",
|
10 |
+
"Actress",
|
11 |
+
"Aerospace Engineer",
|
12 |
+
"Agricultural Scientist",
|
13 |
+
"Air Traffic Controller",
|
14 |
+
"Aircraft Mechanic",
|
15 |
+
"Animator",
|
16 |
+
"Architect",
|
17 |
+
"Art Director",
|
18 |
+
"Attorney",
|
19 |
+
"Lawyer",
|
20 |
+
"Audiologist",
|
21 |
+
"Author",
|
22 |
+
"Writer",
|
23 |
+
"Baker",
|
24 |
+
"Barber",
|
25 |
+
"Hairdresser",
|
26 |
+
"Bartender",
|
27 |
+
"Biomedical Engineer",
|
28 |
+
"Botanist",
|
29 |
+
"Broadcast Journalist",
|
30 |
+
"Business Analyst",
|
31 |
+
"Carpenter",
|
32 |
+
"Chef",
|
33 |
+
"Cook",
|
34 |
+
"Chemist",
|
35 |
+
"Civil Engineer",
|
36 |
+
"Clinical Psychologist",
|
37 |
+
"Commercial Diver",
|
38 |
+
"Computer Programmer",
|
39 |
+
"Construction Worker",
|
40 |
+
"Corporate Trainer",
|
41 |
+
"Cosmetologist",
|
42 |
+
"Counselor",
|
43 |
+
"Therapist",
|
44 |
+
"Court Reporter",
|
45 |
+
"Creative Director",
|
46 |
+
"Criminologist",
|
47 |
+
"Customer Service Representative",
|
48 |
+
"Data Analyst",
|
49 |
+
"Dental Assistant",
|
50 |
+
"Dentist",
|
51 |
+
"Dermatologist",
|
52 |
+
"Dietician",
|
53 |
+
"Nutritionist",
|
54 |
+
"Doctor",
|
55 |
+
"Physician",
|
56 |
+
"Economist",
|
57 |
+
"Electrician",
|
58 |
+
"Elementary School Teacher",
|
59 |
+
"Emergency Medical Technician",
|
60 |
+
"Engineer",
|
61 |
+
"Environmental Scientist",
|
62 |
+
"Event Planner",
|
63 |
+
"Fashion Designer",
|
64 |
+
"Film Director",
|
65 |
+
"Financial Analyst",
|
66 |
+
"Firefighter",
|
67 |
+
"Fisherman",
|
68 |
+
"Fitness Trainer",
|
69 |
+
"Flight Attendant",
|
70 |
+
"Florist",
|
71 |
+
"Food Scientist",
|
72 |
+
"Forensic Scientist",
|
73 |
+
"Furniture Maker",
|
74 |
+
"Game Developer",
|
75 |
+
"Gardener",
|
76 |
+
"Landscaper",
|
77 |
+
"Geologist",
|
78 |
+
"Graphic Designer",
|
79 |
+
"Hair Stylist",
|
80 |
+
"Historian",
|
81 |
+
"Home Health Aide",
|
82 |
+
"Hotel Manager",
|
83 |
+
"Human Resources Manager",
|
84 |
+
"Immigration Lawyer",
|
85 |
+
"Industrial Designer",
|
86 |
+
"Insurance Agent",
|
87 |
+
"Interior Designer",
|
88 |
+
"Interpreter",
|
89 |
+
"Translator",
|
90 |
+
"Investment Banker",
|
91 |
+
"IT Specialist",
|
92 |
+
"Journalist",
|
93 |
+
"Judge",
|
94 |
+
"Kindergarten Teacher",
|
95 |
+
"Land Surveyor",
|
96 |
+
"Landscape Architect",
|
97 |
+
"Lawyer",
|
98 |
+
"Attorney",
|
99 |
+
"Librarian",
|
100 |
+
"Life Coach",
|
101 |
+
"Linguist",
|
102 |
+
"Makeup Artist",
|
103 |
+
"Management Consultant",
|
104 |
+
"Manufacturing Engineer",
|
105 |
+
"Marine Biologist",
|
106 |
+
"Marketing Manager",
|
107 |
+
"Massage Therapist",
|
108 |
+
"Mechanical Engineer",
|
109 |
+
"Medical Assistant",
|
110 |
+
"Medical Researcher",
|
111 |
+
"Meteorologist",
|
112 |
+
"Midwife",
|
113 |
+
"Military Officer",
|
114 |
+
"Music Producer",
|
115 |
+
"Musician",
|
116 |
+
"Nurse",
|
117 |
+
"Occupational Therapist",
|
118 |
+
"Optician",
|
119 |
+
"Optometrist",
|
120 |
+
"Paralegal",
|
121 |
+
"Paramedic",
|
122 |
+
"Patent Attorney",
|
123 |
+
"Pediatrician",
|
124 |
+
"Personal Trainer",
|
125 |
+
"Petroleum Engineer",
|
126 |
+
"Pharmacist",
|
127 |
+
"Photographer",
|
128 |
+
"Physical Therapist",
|
129 |
+
"Physician Assistant",
|
130 |
+
"Pilot",
|
131 |
+
"Plumber",
|
132 |
+
"Police Officer",
|
133 |
+
"Political Scientist",
|
134 |
+
"Preschool Teacher",
|
135 |
+
"Private Investigator",
|
136 |
+
"Product Manager",
|
137 |
+
"Professor",
|
138 |
+
"Lecturer",
|
139 |
+
"Programmer",
|
140 |
+
"Psychiatrist",
|
141 |
+
"Psychologist",
|
142 |
+
"Public Relations Specialist",
|
143 |
+
"Public School Teacher",
|
144 |
+
"Real Estate Agent",
|
145 |
+
"Broker",
|
146 |
+
"Receptionist",
|
147 |
+
"Registered Nurse",
|
148 |
+
"Reporter",
|
149 |
+
"Restaurant Manager",
|
150 |
+
"Sales Representative",
|
151 |
+
"School Counselor",
|
152 |
+
"Scientist",
|
153 |
+
"Screenwriter",
|
154 |
+
"Social Media Manager",
|
155 |
+
"Social Worker",
|
156 |
+
"Software Developer",
|
157 |
+
"Speech-Language Pathologist",
|
158 |
+
"Sports Coach",
|
159 |
+
"Statistician"]
|
160 |
+
}
|
utils/load_csv.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from utils.read_config import get_args
|
3 |
+
def check_csv(upload_file):
    """Parse *upload_file* (path or file-like object) into a DataFrame."""
    return pd.read_csv(upload_file)
|
6 |
+
|
7 |
+
# Take a capped sample of one column of the dataset.
def load_sample(num_sample_records, sample_method, df, col_name):
    """Return a sample of *df* restricted to *col_name*.

    The requested size is capped by the configured "first_records" limit.
    *sample_method* selects the strategy: "First", "Last", or "Random"
    (seeded from the configured "random_seed").
    """
    cap = get_args("first_records")
    seed = get_args("random_seed")

    # Never sample more than the configured cap.
    n = min(num_sample_records, cap)

    # Keep only the column under analysis.
    df = df[[col_name]]
    if sample_method == "First":
        df = df.iloc[:n].copy().reset_index()
    if sample_method == "Last":
        df = df.iloc[-n:].copy().reset_index()
    if sample_method == "Random":
        df = df.sample(n, random_state=seed).copy().reset_index()
    return df
|
utils/read_config.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
def read_config_file():
    """Load and return the analysis configuration from utils/config.json."""
    # NOTE(review): path is relative to the process working directory --
    # presumably the app is always launched from the repo root; confirm.
    with open("utils/config.json", "r") as jsonfile:
        return json.load(jsonfile)
|
7 |
+
|
8 |
+
def get_args(args):
    """Return the config value stored under key *args*.

    Raises RuntimeError when the config file cannot be read or parsed,
    and KeyError when the key is missing.

    Bug fix: the original ``raise "Could not read config file."`` raised a
    string literal, which is itself a TypeError in Python 3 (exceptions must
    derive from BaseException); the bare ``except:`` also swallowed the real
    cause. A real exception is raised now, chained to the underlying error.
    """
    try:
        data = read_config_file()
    except (OSError, json.JSONDecodeError) as err:
        raise RuntimeError("Could not read config file.") from err
    return data[args]
|