freyam committed on
Commit 1dc0d8b · 2 Parent(s): 38ba037 b3ce2b2

Merge remote-tracking branch 'origin/dev-sudipta' into dev-frey
.gitignore CHANGED
@@ -1,5 +1,8 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
+testing/
+flagged/
+check_gender_tagging.py
 *.py[cod]
 *$py.class
 
app.py CHANGED
@@ -1,6 +1,11 @@
 import gradio as gr
 import pandas as pd
 import os
+from scripts.genbit_metrics import *
+from scripts.gender_profession_tagging import *
+from scripts.gender_tagging import *
+from utils.load_csv import *
+from utils.read_config import get_args
 
 dummy_data = [
     ["Category", "Value", "Percentage"],
@@ -42,14 +47,32 @@ def display_methodology(methodology):
     )
 
 
-def run_evaluation(dataset_file, dataset_scope, dataset_scope_n, dataset_corpus, methodology):
-    status = {
-        "dataset": dataset_file.name,
-        "methodology": methodology,
-        "scope": dataset_scope + " " + str(dataset_scope_n),
-        "column": dataset_corpus.columns[0]
-    }
+def run_evaluation(dataset_file, dataset_scope, dataset_scope_n, dataset_columns, methodology):
+    status = {}
+    # Read CSV file
+    dataset = check_csv(dataset_file.name)
+    sample_method = dataset_scope
+    col_name = dataset_columns
+    num_sample_records = dataset_scope_n
+
+    # Check selected methodology
+    if methodology == "Term Identity Diversity Analysis":
+        status = load_dataset_and_analyze_gender_tag(
+            dataset, sample_method, col_name, num_sample_records)
+    if methodology == "Gender Label Evaluation":
+        status = load_dataset_and_analyze_gender_profession(
+            dataset, sample_method, col_name, num_sample_records)
+    if methodology == "Microsoft Genbit":
+        status = load_dataset_and_get_genbit_metrics(
+            dataset, sample_method, col_name, num_sample_records)
+
+    # status = {
+    #     "dataset": dataset_file.name,
+    #     "methodology": methodology,
+    #     "scope": dataset_scope + " " + str(dataset_scope_n),
+    #     "column": dataset_columns
+    # }
 
     return gr.JSON.update(status, visible=True)
@@ -62,7 +85,11 @@ def run_evaluation(dataset_file, dataset_scope, dataset_scope_n, dataset_corpus,
 
 
 def process_dataset(dataset):
+
     data = pd.read_csv(dataset.name)
+    # maximum_records = get_args("first_records")
+    # input_records = data.shape[0]
+    # num_sample_records = input_records if input_records < maximum_records else maximum_records
 
     columns = data.columns.tolist()
     columns = [x for x in columns if data[x].dtype == "object"]
@@ -98,7 +125,7 @@ def process_dataset(dataset):
 
 def process_column(dataset, column):
     data = pd.read_csv(dataset.name)
-    corpus = data[column].to_list()
+    corpus = data[column].to_list()[:10]
 
     return gr.Dataframe.update(value=pd.DataFrame({"Data Corpus": corpus}), max_rows=5, visible=True)
@@ -118,9 +145,12 @@ with BiasAware:
             dataset_file = gr.File()
             dataset_examples = gr.Examples(
                 [
-                    os.path.join(os.path.dirname(__file__), "z_animal.csv"),
-                    os.path.join(os.path.dirname(__file__), "z_employee.csv"),
-                    os.path.join(os.path.dirname(__file__), "z_house.csv"),
+                    os.path.join(os.path.dirname(__file__),
+                                 "data/z_animal.csv"),
+                    os.path.join(os.path.dirname(__file__),
+                                 "data/z_employee.csv"),
+                    os.path.join(os.path.dirname(
+                        __file__), "data/z_house.csv"),
                 ],
                 inputs=dataset_file,
@@ -182,7 +212,7 @@ with BiasAware:
     evalButton.click(
        fn=run_evaluation,
        inputs=[dataset_file, dataset_scope,
-              dataset_scope_n, dataset_corpus, methodology],
+              dataset_scope_n, dataset_columns, methodology],
        outputs=[result_status]
    )
 
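Design note: the new run_evaluation selects an analysis with three independent if checks on the methodology string. The same control flow can be written as a dispatch table; this is a hypothetical refactor sketch, not code from this commit, and it assumes the app's modules resolve from the repository root:

import gradio as gr
from scripts.genbit_metrics import load_dataset_and_get_genbit_metrics
from scripts.gender_profession_tagging import load_dataset_and_analyze_gender_profession
from scripts.gender_tagging import load_dataset_and_analyze_gender_tag
from utils.load_csv import check_csv

# Maps each UI methodology name to its analysis function;
# all three share the signature (df, sample_method, col_name, num_sample_records).
METHODOLOGIES = {
    "Term Identity Diversity Analysis": load_dataset_and_analyze_gender_tag,
    "Gender Label Evaluation": load_dataset_and_analyze_gender_profession,
    "Microsoft Genbit": load_dataset_and_get_genbit_metrics,
}

def run_evaluation(dataset_file, dataset_scope, dataset_scope_n, dataset_columns, methodology):
    dataset = check_csv(dataset_file.name)
    analyze = METHODOLOGIES.get(methodology)
    status = analyze(dataset, dataset_scope, dataset_columns, dataset_scope_n) if analyze else {}
    return gr.JSON.update(status, visible=True)

An unknown methodology falls through to an empty status dict, matching the committed behavior.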
data/z_animal.csv ADDED
@@ -0,0 +1,11 @@
+AnimalID,CommonName,ScientificName,Class,Order,Family,Habitat,ConservationStatus
+1,Lion,Panthera leo,Mammalia,Carnivora,Felidae,Savanna,Vulnerable
+2,Eagle,Aquila chrysaetos,Aves,Accipitriformes,Accipitridae,Mountains,Least Concern
+3,Dolphin,Tursiops truncatus,Mammalia,Cetacea,Delphinidae,Ocean,Least Concern
+4,Elephant,Loxodonta africana,Mammalia,Proboscidea,Elephantidae,Grassland,Vulnerable
+5,Tiger,Panthera tigris,Mammalia,Carnivora,Felidae,Forest,Endangered
+6,Penguin,Spheniscidae,Aves,Sphenisciformes,Spheniscidae,Antarctica,Least Concern
+7,Giraffe,Giraffa camelopardalis,Mammalia,Artiodactyla,Giraffidae,Savanna,Vulnerable
+8,Cheetah,Acinonyx jubatus,Mammalia,Carnivora,Felidae,Grassland,Vulnerable
+9,Panda,Ailuropoda melanoleuca,Mammalia,Carnivora,Ursidae,Forest,Endangered
+10,Kangaroo,Macropus rufus,Mammalia,Diprotodontia,Macropodidae,Grassland,Least Concern
data/z_employee.csv ADDED
@@ -0,0 +1,26 @@
+EmployeeID,FirstName,LastName,Email,Department,Salary
+101,John,Smith,[email protected],Finance,60000
+102,Emily,Johnson,[email protected],Marketing,55000
+103,Michael,Williams,[email protected],HR,50000
+104,Susan,Anderson,[email protected],IT,65000
+105,David,Martin,[email protected],Sales,58000
+106,Linda,Davis,[email protected],Finance,62000
+107,William,Miller,[email protected],Marketing,56000
+108,Sarah,Anderson,[email protected],HR,51000
+109,Robert,Clark,[email protected],IT,67000
+110,Karen,Wilson,[email protected],Sales,59000
+111,James,Brown,[email protected],Finance,61000
+112,Anna,Johnson,[email protected],Marketing,57000
+113,Christopher,Moore,[email protected],HR,52000
+114,Laura,White,[email protected],IT,68000
+115,Mark,Davis,[email protected],Sales,60000
+116,Patricia,Jones,[email protected],Finance,63000
+117,Matthew,Taylor,[email protected],Marketing,58000
+118,Jennifer,Young,[email protected],HR,53000
+119,Steven,Anderson,[email protected],IT,69000
+120,Elizabeth,Thomas,[email protected],Sales,61000
+121,Kevin,Harris,[email protected],Finance,64000
+122,Deborah,Smith,[email protected],Marketing,59000
+123,Joseph,Walker,[email protected],HR,54000
+124,Cynthia,Jackson,[email protected],IT,70000
+125,Daniel,Hall,[email protected],Sales,62000
data/z_house.csv ADDED
@@ -0,0 +1,7 @@
+PropertyID,StreetAddress,City,State,ZipCode,NumberOfBedrooms,NumberOfBathrooms,SquareFootage,Price
+1,123 Main St,Los Angeles,CA,90001,3,2,1800,550000
+2,456 Elm St,New York,NY,10001,2,1,1200,750000
+3,789 Oak St,San Francisco,CA,94101,4,3,2500,950000
+4,101 Maple St,Boston,MA,02101,3,2.5,2000,680000
+5,202 Pine St,Miami,FL,33101,4,3.5,2700,820000
+6,303 Cedar St,Chicago,IL,60601,2,1,1100,450000
requirements.txt CHANGED
@@ -1,4 +1,6 @@
-gradio==3.41.2
+gradio==3.40.1
 gradio_client==0.5.0
 numpy==1.25.2
-pandas==2.0.3
+pandas==2.0.3
+spacy
+genbit
scripts/.keep DELETED
File without changes
scripts/genbit_metrics.py ADDED
@@ -0,0 +1,48 @@
+from genbit.genbit_metrics import GenBitMetrics
+import pandas as pd
+from utils.read_config import get_args
+from utils.load_csv import load_sample
+
+
+def cal_metrics(dataset):
+    # Create a GenBit object with the desired settings:
+    genbit_metrics_object = GenBitMetrics(language_code="en", context_window=5, distance_weight=0.95, percentile_cutoff=80)
+
+    # To try GenBit on a test sentence, add the sentence to GenBit:
+    # dataset = ["I think she does not like cats. I think he does not like cats.", "He is a dog person."]
+    genbit_metrics_object.add_data(dataset, tokenized=False)
+
+    # Generate the gender bias metrics. Setting `output_statistics` and
+    # `output_word_list` to False would reduce the number of metrics returned;
+    # both are kept enabled here.
+    metrics = genbit_metrics_object.get_metrics(output_statistics=True, output_word_list=True)
+
+    return metrics
+
+
+# Function to extract GenBit metrics
+def extract_genbit_metrics(stats):
+    metrics = {}
+    metrics["genbit_score"] = str(stats["genbit_score"])
+    metrics["percentage_of_female_gender_definition_words"] = str(stats["percentage_of_female_gender_definition_words"])
+    metrics["percentage_of_male_gender_definition_words"] = str(stats["percentage_of_male_gender_definition_words"])
+    metrics["percentage_of_non_binary_gender_definition_words"] = str(stats["percentage_of_non_binary_gender_definition_words"])
+    metrics["percentage_of_trans_gender_definition_words"] = str(stats["percentage_of_trans_gender_definition_words"])
+    metrics["percentage_of_cis_gender_definition_words"] = str(stats["percentage_of_cis_gender_definition_words"])
+    metrics["num_words_considered"] = str(stats["statistics"]["num_words_considered"])
+
+    return metrics
+
+
+def load_dataset_and_get_genbit_metrics(df, sample_method, col_name, num_sample_records):
+    sample_df = load_sample(num_sample_records, sample_method, df, col_name)
+
+    # Turn into a list of text.
+    sample_text = sample_df[col_name].tolist()
+
+    # Compute and extract the metrics
+    stats = cal_metrics(sample_text)
+    metrics = extract_genbit_metrics(stats)
+    return metrics
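Since the comments above walk through the GenBit workflow (construct, add_data, get_metrics), here is a minimal usage sketch of cal_metrics on the toy corpus from the inline comment. It assumes the genbit package from requirements.txt is installed and the script runs from the repository root:

from scripts.genbit_metrics import cal_metrics

sample_text = [
    "I think she does not like cats. I think he does not like cats.",
    "He is a dog person.",
]
stats = cal_metrics(sample_text)
print(stats["genbit_score"])  # the aggregate score that extract_genbit_metrics reads out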
scripts/gender_profession_tagging.py ADDED
@@ -0,0 +1,129 @@
+import pandas as pd
+import re
+import spacy
+from spacy.lang.en import English
+import time
+from tqdm import tqdm
+import multiprocessing.pool
+
+import warnings
+warnings.filterwarnings("ignore")
+from utils.read_config import get_args
+from utils.load_csv import load_sample
+
+
+# For sentence splitting
+nlp = English()
+nlp.add_pipe("sentencizer")
+
+# Function to split text into sentences
+def get_split_text(text):
+    doc = nlp(text)
+    sentences = [sent for sent in doc.sents]
+    return sentences
+
+
+def get_gender_prof_match_details(df_text):
+    # Get args from config file
+    male_pronoun = get_args("male_pronoun")
+    female_pronoun = get_args("female_pronoun")
+    professions = get_args("professions")
+
+    # Get regex patterns
+    male_pronoun_pat, female_pronoun_pat, professions_pat = get_regex_pattern(male_pronoun, female_pronoun, professions)
+
+    split_text = get_split_text(df_text)
+
+    results = []
+
+    for text in split_text:
+        male_pronoun_match = re.findall(male_pronoun_pat, str(text))
+        female_pronoun_match = re.findall(female_pronoun_pat, str(text))
+
+        prof_match = re.findall(professions_pat, str(text))
+
+        both_match = "No"
+
+        if len(male_pronoun_match) != 0 and len(prof_match) != 0:
+            both_match = "Yes"
+
+        if len(female_pronoun_match) != 0 and len(prof_match) != 0:
+            both_match = "Yes"
+
+        # Unpack from list
+        male_pronoun_match = ",".join(male_pronoun_match)
+        female_pronoun_match = ",".join(female_pronoun_match)
+
+        prof_match = ",".join(prof_match)
+
+        results.append((str(text), male_pronoun_match, female_pronoun_match, prof_match, both_match))
+
+    return results
+
+
+# Function to run the matcher on a multiprocessing thread pool
+def call_multiprocessing_pool(df_text):
+    concurrent = 2000
+    pool = multiprocessing.pool.ThreadPool(processes=concurrent)
+    result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
+    pool.close()
+
+    # result_list is nested -- we need to flatten it
+    flat_return_list = [item for sublist in result_list for item in sublist]
+
+    # Add column names
+    cols = ["Split_Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
+    return_df = pd.DataFrame(flat_return_list, columns=cols)
+
+    return return_df
+
+
+# Function to get statistics
+def get_statistics(results_df):
+    count_total_sentence = results_df.shape[0]
+    count_both_match = results_df[results_df["Both Match"] == "Yes"]["Both Match"].count()
+    count_male_pronoun = results_df[results_df["Male Pronoun"] != ""]["Male Pronoun"].count()
+    count_female_pronoun = results_df[results_df["Female Pronoun"] != ""]["Female Pronoun"].count()
+
+    count_male_pronoun_profession = results_df[(results_df["Male Pronoun"] != "") & (results_df["Profession"] != "")]["Male Pronoun"].count()
+    count_female_pronoun_profession = results_df[(results_df["Female Pronoun"] != "") & (results_df["Profession"] != "")]["Female Pronoun"].count()
+
+    return {
+        "total_sentence": str(count_total_sentence),
+        "both_gender_prof_match": str(count_both_match),
+        "count_male_pronoun": str(count_male_pronoun),
+        "count_female_pronoun": str(count_female_pronoun),
+        "count_male_pronoun_profession": str(count_male_pronoun_profession),
+        "count_female_pronoun_profession": str(count_female_pronoun_profession)
+    }
+
+
+# Function to return regular expression patterns
+def get_regex_pattern(male_pronoun, female_pronoun, professions):
+    male_pronoun_pat = r"\b({})\b".format("|".join(male_pronoun))
+    female_pronoun_pat = r"\b({})\b".format("|".join(female_pronoun))
+
+    # Lowercase professions to match the lowercased text
+    professions = [prof.lower() for prof in professions]
+    professions_pat = r"\b({})\b".format("|".join(professions))
+
+    return male_pronoun_pat, female_pronoun_pat, professions_pat
+
+
+def load_dataset_and_analyze_gender_profession(df, sample_method, col_name, num_sample_records):
+    # Load sample
+    sample_df = load_sample(num_sample_records, sample_method, df, col_name)
+
+    # Lowercase and strip the text
+    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()
+
+    # Run the thread pool over the sampled column
+    results_df = call_multiprocessing_pool(sample_df[col_name])
+
+    # Get statistics
+    stats = get_statistics(results_df)
+
+    return stats
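To make the word-boundary matching in get_regex_pattern concrete, here is a small standalone sketch using the pronoun list from utils/config.json and one profession (values inlined for illustration):

import re

# Same pattern construction as get_regex_pattern, with config values inlined.
male_pronoun_pat = r"\b({})\b".format("|".join(["he", "him", "his"]))
professions_pat = r"\b({})\b".format("|".join(["doctor"]))

sentence = "she said he is a doctor"
print(re.findall(male_pronoun_pat, sentence))  # ['he'] -- \b keeps "she" from matching
print(re.findall(professions_pat, sentence))   # ['doctor']
# Both lists are non-empty, so get_gender_prof_match_details would set "Both Match" to "Yes".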
scripts/gender_tagging.py ADDED
@@ -0,0 +1,93 @@
+# Import required libraries
+import pandas as pd
+import re
+from utils.read_config import get_args
+from utils.load_csv import load_sample
+
+# Function to get count of male terms in text
+def count_male_terms(text, male_terms):
+    # Get pattern
+    pattern = r"\b({})\b".format("|".join(male_terms))
+    match = re.findall(pattern, str(text))
+    return len(match)
+
+# Function to get count of female terms in text
+def count_female_terms(text, female_terms):
+    # Get pattern
+    pattern = r"\b({})\b".format("|".join(female_terms))
+    match = re.findall(pattern, str(text))
+    return len(match)
+
+# Function to get gender tag categories
+def get_gender_tag(count_m_term, count_f_term):
+    tag = ''
+    if count_m_term == 0 and count_f_term == 0:
+        tag = "No Gender"
+
+    elif count_m_term == count_f_term:
+        tag = "Equal Gender"
+
+    elif count_m_term > count_f_term:
+        m_proportion = (count_m_term / (count_m_term + count_f_term)) * 100
+        if m_proportion >= 50 and m_proportion < 75:
+            tag = "Male Positive Gender"
+        elif m_proportion >= 75:
+            tag = "Male Strongly Positive Gender"
+
+    elif count_m_term < count_f_term:
+        f_proportion = (count_f_term / (count_m_term + count_f_term)) * 100
+        if f_proportion >= 50 and f_proportion < 75:
+            tag = "Female Positive Gender"
+        elif f_proportion >= 75:
+            tag = "Female Strongly Positive Gender"
+
+    return tag
+
+
+# Function to calculate PG and SPG
+def get_pg_spg(sample_df):
+    count_no_gender_sentences = sample_df[sample_df["gender_cat"] == "No Gender"]['gender_cat'].count()
+
+    count_gender_sentences = sample_df[sample_df["gender_cat"] != "No Gender"]['gender_cat'].count()
+    count_equal_gender = sample_df[sample_df["gender_cat"] == "Equal Gender"]['gender_cat'].count()
+
+    count_male_pg = sample_df[sample_df['gender_cat'] == "Male Positive Gender"]['gender_cat'].count()
+    count_male_spg = sample_df[sample_df['gender_cat'] == "Male Strongly Positive Gender"]['gender_cat'].count()
+
+    count_female_pg = sample_df[sample_df['gender_cat'] == "Female Positive Gender"]['gender_cat'].count()
+    count_female_spg = sample_df[sample_df['gender_cat'] == "Female Strongly Positive Gender"]['gender_cat'].count()
+
+    return {
+        "gender": str(count_gender_sentences),
+        "no gender": str(count_no_gender_sentences),
+        "equal gender": str(count_equal_gender),
+        "female pg": str(count_female_pg),
+        "male pg": str(count_male_pg),
+        "female spg": str(count_female_spg),
+        "male spg": str(count_male_spg)
+    }
+
+# Function to load dataset and get the analysis done
+def load_dataset_and_analyze_gender_tag(df, sample_method, col_name, num_sample_records):
+    # Read config file
+    male_terms = get_args("male_terms")
+    female_terms = get_args("female_terms")
+    # Load sample
+    sample_df = load_sample(num_sample_records, sample_method, df, col_name)
+
+    # Lowercase and strip the text
+    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()
+
+    # Get new count columns - male terms and female terms
+    sample_df['count_male_term'] = sample_df.apply(lambda x: count_male_terms(x[col_name], male_terms), axis=1)
+    sample_df['count_female_term'] = sample_df.apply(lambda x: count_female_terms(x[col_name], female_terms), axis=1)
+
+    # Get tag categories
+    sample_df['gender_cat'] = sample_df.apply(lambda row: get_gender_tag(row['count_male_term'], row['count_female_term']), axis=1)
+
+    # Get statistics
+    collection = get_pg_spg(sample_df)
+    return collection
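A worked example of get_gender_tag's thresholds (a quick sketch, assuming the repository root is on sys.path): three male terms against one female term gives a 75% male proportion, which lands exactly on the "Strongly Positive" cutoff.

from scripts.gender_tagging import get_gender_tag

print(get_gender_tag(3, 1))  # Male Strongly Positive Gender (3 / 4 = 75%)
print(get_gender_tag(2, 1))  # Male Positive Gender (2 / 3 is about 67%)
print(get_gender_tag(2, 2))  # Equal Gender
print(get_gender_tag(0, 0))  # No Gender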
utils/config.json ADDED
@@ -0,0 +1,160 @@
+{
+    "first_records" : 2000,
+    "random_seed" : 42,
+    "male_terms" : ["man", "boy", "male", "he", "son", "his", "himself", "guy", "father", "john"],
+    "female_terms" : ["woman", "girl", "female", "she", "daughter", "her", "herself", "gal", "mother", "mary"],
+    "male_pronoun" : ["he", "him", "his"],
+    "female_pronoun" : ["she", "her", "hers"],
+    "professions" : ["Accountant",
+        "Actor",
+        "Actress",
+        "Aerospace Engineer",
+        "Agricultural Scientist",
+        "Air Traffic Controller",
+        "Aircraft Mechanic",
+        "Animator",
+        "Architect",
+        "Art Director",
+        "Attorney",
+        "Lawyer",
+        "Audiologist",
+        "Author",
+        "Writer",
+        "Baker",
+        "Barber",
+        "Hairdresser",
+        "Bartender",
+        "Biomedical Engineer",
+        "Botanist",
+        "Broadcast Journalist",
+        "Business Analyst",
+        "Carpenter",
+        "Chef",
+        "Cook",
+        "Chemist",
+        "Civil Engineer",
+        "Clinical Psychologist",
+        "Commercial Diver",
+        "Computer Programmer",
+        "Construction Worker",
+        "Corporate Trainer",
+        "Cosmetologist",
+        "Counselor",
+        "Therapist",
+        "Court Reporter",
+        "Creative Director",
+        "Criminologist",
+        "Customer Service Representative",
+        "Data Analyst",
+        "Dental Assistant",
+        "Dentist",
+        "Dermatologist",
+        "Dietician",
+        "Nutritionist",
+        "Doctor",
+        "Physician",
+        "Economist",
+        "Electrician",
+        "Elementary School Teacher",
+        "Emergency Medical Technician",
+        "Engineer",
+        "Environmental Scientist",
+        "Event Planner",
+        "Fashion Designer",
+        "Film Director",
+        "Financial Analyst",
+        "Firefighter",
+        "Fisherman",
+        "Fitness Trainer",
+        "Flight Attendant",
+        "Florist",
+        "Food Scientist",
+        "Forensic Scientist",
+        "Furniture Maker",
+        "Game Developer",
+        "Gardener",
+        "Landscaper",
+        "Geologist",
+        "Graphic Designer",
+        "Hair Stylist",
+        "Historian",
+        "Home Health Aide",
+        "Hotel Manager",
+        "Human Resources Manager",
+        "Immigration Lawyer",
+        "Industrial Designer",
+        "Insurance Agent",
+        "Interior Designer",
+        "Interpreter",
+        "Translator",
+        "Investment Banker",
+        "IT Specialist",
+        "Journalist",
+        "Judge",
+        "Kindergarten Teacher",
+        "Land Surveyor",
+        "Landscape Architect",
+        "Lawyer",
+        "Attorney",
+        "Librarian",
+        "Life Coach",
+        "Linguist",
+        "Makeup Artist",
+        "Management Consultant",
+        "Manufacturing Engineer",
+        "Marine Biologist",
+        "Marketing Manager",
+        "Massage Therapist",
+        "Mechanical Engineer",
+        "Medical Assistant",
+        "Medical Researcher",
+        "Meteorologist",
+        "Midwife",
+        "Military Officer",
+        "Music Producer",
+        "Musician",
+        "Nurse",
+        "Occupational Therapist",
+        "Optician",
+        "Optometrist",
+        "Paralegal",
+        "Paramedic",
+        "Patent Attorney",
+        "Pediatrician",
+        "Personal Trainer",
+        "Petroleum Engineer",
+        "Pharmacist",
+        "Photographer",
+        "Physical Therapist",
+        "Physician Assistant",
+        "Pilot",
+        "Plumber",
+        "Police Officer",
+        "Political Scientist",
+        "Preschool Teacher",
+        "Private Investigator",
+        "Product Manager",
+        "Professor",
+        "Lecturer",
+        "Programmer",
+        "Psychiatrist",
+        "Psychologist",
+        "Public Relations Specialist",
+        "Public School Teacher",
+        "Real Estate Agent",
+        "Broker",
+        "Receptionist",
+        "Registered Nurse",
+        "Reporter",
+        "Restaurant Manager",
+        "Sales Representative",
+        "School Counselor",
+        "Scientist",
+        "Screenwriter",
+        "Social Media Manager",
+        "Social Worker",
+        "Software Developer",
+        "Speech-Language Pathologist",
+        "Sports Coach",
+        "Statistician"]
+}
utils/load_csv.py ADDED
@@ -0,0 +1,23 @@
+import pandas as pd
+from utils.read_config import get_args
+
+# Function to read the uploaded CSV into a DataFrame
+def check_csv(upload_file):
+    df = pd.read_csv(upload_file)
+    return df
+
+# Function to load sample of dataset
+def load_sample(num_sample_records, sample_method, df, col_name):
+    sample_first_records = get_args("first_records")
+    sample_random_seed = get_args("random_seed")
+
+    # Cap the sample size at the configured maximum
+    num_sample_records = num_sample_records if num_sample_records <= sample_first_records else sample_first_records
+
+    # Keep only the required column
+    df = df[[col_name]]
+    if sample_method == "First":
+        df = df.iloc[:num_sample_records].copy().reset_index()
+    if sample_method == "Last":
+        df = df.iloc[-num_sample_records:].copy().reset_index()
+    if sample_method == "Random":
+        df = df.sample(num_sample_records, random_state=sample_random_seed).copy().reset_index()
+    return df
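A short usage sketch of the three sampling modes, assuming it runs from the repository root against data/z_employee.csv added above:

from utils.load_csv import check_csv, load_sample

df = check_csv("data/z_employee.csv")
first_5 = load_sample(5, "First", df, "Email")    # first 5 rows of the Email column
last_5 = load_sample(5, "Last", df, "Email")      # last 5 rows
random_5 = load_sample(5, "Random", df, "Email")  # seeded by "random_seed" in utils/config.json
print(first_5.columns.tolist())  # ['index', 'Email'] -- reset_index() keeps the old row index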
utils/read_config.py ADDED
@@ -0,0 +1,13 @@
+import json
+
+def read_config_file():
+    with open("utils/config.json", "r") as jsonfile:
+        data = json.load(jsonfile)
+    return data
+
+def get_args(args):
+    try:
+        data = read_config_file()
+    except Exception:
+        raise RuntimeError("Could not read config file.")
+    return data[args]
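Because read_config_file opens "utils/config.json" with a relative path, get_args only works when the process starts in the repository root. A minimal sketch against the config added above:

from utils.read_config import get_args

print(get_args("first_records"))  # 2000
print(get_args("male_pronoun"))   # ['he', 'him', 'his']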