freyam committed on
Commit
e0db39e
·
1 Parent(s): 0321f34

Restructure Logic and Data Flow

Browse files
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🦀
4
  colorFrom: indigo
5
  colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 3.40.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
4
  colorFrom: indigo
5
  colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 3.43.2
8
  app_file: app.py
9
  pinned: false
10
  license: mit
app.py CHANGED
@@ -3,78 +3,105 @@ import gradio as gr
3
  import pandas as pd
4
  import os
5
 
6
- from scripts.genbit_metrics import *
7
- from scripts.gender_profession_tagging import *
8
- from scripts.gender_tagging import *
9
- from utils.load_csv import *
10
- from utils.read_config import get_args
11
 
12
- methodologies = json.load(open("methodologies.json", "r"))
13
 
 
14
 
15
- def get_methodology_metadata(methodology):
16
- title = "## " + methodology
17
- description = methodologies.get(methodology).get("description")
18
 
19
- metadata = f"{title}\n\n{description}"
 
 
 
 
 
20
 
21
- return gr.Markdown.update(metadata, visible=True)
 
 
 
 
 
22
 
 
23
 
24
- def evaluate(dataset_file, dataset_scope, dataset_scope_n, dataset_column, methodology):
25
- status = {}
26
- dataset = pd.read_csv(dataset_file.name)
27
- sample_method = dataset_scope
28
- col_name = dataset_column
29
- num_sample_records = dataset_scope_n
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- status = globals()[methodologies.get(methodology).get("fx")](
32
- dataset, sample_method, col_name, num_sample_records
33
  )
34
 
35
- return gr.JSON.update(status, visible=True)
36
-
37
 
38
- def process_dataset(dataset):
39
- data = pd.read_csv(dataset.name)
 
40
 
41
- columns = data.select_dtypes(include=["object"]).columns.tolist()
42
 
43
  return (
44
- gr.Radio.update(
45
- label="Scope",
46
- info="Determines the scope of the dataset to be analyzed",
47
- choices=["First", "Last", "Random"],
48
- value="First",
49
- visible=True,
50
- interactive=True,
51
- ),
52
- gr.Slider.update(
53
- label=f"Number of Entries",
54
- info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {get_args('first_records')}.",
55
- minimum=1,
56
- maximum=min(data.shape[0], get_args("first_records")),
57
- value=min(data.shape[0], get_args("first_records")) // 2,
58
- visible=True,
59
- interactive=True,
60
- ),
61
- gr.Radio.update(
62
- label="Column",
63
- info="Determines the column to be analyzed. These are the columns with text data.",
64
- choices=columns,
65
- value=columns[0],
66
- visible=True,
67
- interactive=True,
68
- ),
69
- )
70
-
71
-
72
- def get_column_metadata(dataset, column):
73
- data = pd.read_csv(dataset.name)
74
- corpus = data[column].head(10).tolist()
75
-
76
- return gr.Dataframe.update(
77
- value=pd.DataFrame({f"Data Corpus: {column}": corpus}), visible=True
78
  )
79
 
80
 
@@ -89,19 +116,19 @@ with BiasAware:
89
  with gr.Column(scale=2):
90
  gr.Markdown("## Dataset")
91
 
92
- dataset_file = gr.File(label="Dataset")
93
  dataset_examples = gr.Examples(
94
  [
95
  os.path.join(os.path.dirname(__file__), "data/z_animal.csv"),
96
  os.path.join(os.path.dirname(__file__), "data/z_employee.csv"),
97
- os.path.join(os.path.dirname(__file__), "data/z_house.csv"),
98
  ],
99
  inputs=dataset_file,
100
  label="Example Datasets",
101
  )
102
 
103
- dataset_scope = gr.Radio(visible=False)
104
- dataset_scope_n = gr.Slider(visible=False)
105
  dataset_column = gr.Radio(visible=False)
106
 
107
  dataset_corpus = gr.Dataframe(
@@ -114,14 +141,10 @@ with BiasAware:
114
  methodology = gr.Radio(
115
  label="Methodology",
116
  info="Determines the methodology to be used for bias detection",
117
- choices=[
118
- "Gender Divide (Term Identity Diversity)",
119
- "Gender Profession Bias (Lexical Evaluation)",
120
- "GenBiT (Microsoft Responsible AI Gender Bias Tool)",
121
- ],
122
  )
123
 
124
- evalButton = gr.Button("Run Evaluation")
125
 
126
  methodology_metadata = gr.Markdown(visible=False)
127
 
@@ -134,13 +157,18 @@ with BiasAware:
134
  )
135
 
136
  dataset_file.change(
137
- fn=process_dataset,
138
  inputs=[dataset_file],
139
- outputs=[dataset_scope, dataset_scope_n, dataset_column],
 
 
 
 
 
140
  )
141
 
142
  dataset_column.change(
143
- fn=get_column_metadata,
144
  inputs=[dataset_file, dataset_column],
145
  outputs=[dataset_corpus],
146
  )
@@ -148,15 +176,15 @@ with BiasAware:
148
  methodology.change(
149
  fn=get_methodology_metadata,
150
  inputs=[methodology],
151
- outputs=[methodology_metadata],
152
  )
153
 
154
  evalButton.click(
155
  fn=evaluate,
156
  inputs=[
157
  dataset_file,
158
- dataset_scope,
159
- dataset_scope_n,
160
  dataset_column,
161
  methodology,
162
  ],
 
3
  import pandas as pd
4
  import os
5
 
6
+ from scripts.genbit import *
7
+ from scripts.gender_profession_bias import *
8
+ from scripts.gender_divide import *
 
 
9
 
10
# Methodology registry: display name -> {"description", "fx"}, where "fx" names
# the evaluator function to call. A context manager closes the file handle,
# which the bare open() call left to the garbage collector.
with open("config/methodologies.json", "r", encoding="utf-8") as _methodologies_file:
    methodologies = json.load(_methodologies_file)
11
 
12
+ MAX_THRESHOLD = 1000
13
 
 
 
 
14
 
15
def evaluate(dataset, sampling_method, sampling_size, column, methodology):
    """Run the selected bias methodology on a sample of one CSV column.

    Args:
        dataset: Uploaded file object; only its ``name`` (a path) is read.
        sampling_method: "First", "Last" or "Random". Any other value leaves
            the column unsampled.
        sampling_size: Number of rows to keep. Coerced to ``int`` and capped
            at the number of rows actually present.
        column: Name of the CSV column to analyse.
        methodology: Key into ``methodologies``; its "fx" entry names the
            module-level evaluator function to call.

    Returns:
        A ``gr.JSON`` update holding the evaluator's result, or an
        ``{"error": ...}`` payload when anything goes wrong.
    """
    try:
        print(
            f"[{os.path.basename(dataset.name)}::{column}] - {sampling_method} {sampling_size} entries"
        )
        data = pd.read_csv(dataset.name, usecols=[column])

        # head/tail/sample need an integer, and sample() raises when asked for
        # more rows than exist, so normalise the requested size first.
        sample_size = min(int(sampling_size), len(data))

        if sampling_method == "First":
            data = data.head(sample_size)
        elif sampling_method == "Last":
            data = data.tail(sample_size)
        elif sampling_method == "Random":
            data = data.sample(n=sample_size, random_state=42)

        config = methodologies.get(methodology)
        if config is None:
            raise ValueError(f"Unknown methodology: {methodology!r}")

        # Evaluators are star-imported into this module and looked up by name.
        evaluator = globals().get(config.get("fx"))
        if not callable(evaluator):
            raise ValueError(
                f"No evaluator function is available for methodology: {methodology!r}"
            )

        result = evaluator(data)

        return gr.JSON.update(result, visible=True)
    except Exception as e:
        # UI boundary: report the failure in the output panel instead of crashing.
        return gr.JSON.update(
            {
                "error": f"An error occurred while processing the dataset. Please check the dataset and try again. Error: {e}"
            },
            visible=True,
        )
39
+
40
+
41
def display_dataset_config(dataset):
    """Reveal and populate the dataset controls for a newly selected CSV file.

    Args:
        dataset: Uploaded file object; only its ``name`` (a path) is read.

    Returns:
        A 4-tuple of component updates: sampling-method radio, sample-size
        slider, column radio and corpus preview table. All four are hidden
        when the file cannot be read, has no rows, or has no text columns.
    """
    hidden = (
        gr.Radio.update(visible=False),
        gr.Slider.update(visible=False),
        gr.Radio.update(visible=False),
        gr.Dataframe.update(visible=False),
    )

    try:
        data = pd.read_csv(dataset.name)
        columns = data.select_dtypes(include=["object"]).columns.tolist()
    except Exception:
        # Covers a cleared upload (dataset is None) as well as unreadable files.
        return hidden

    max_entries = min(data.shape[0], MAX_THRESHOLD)
    if not columns or max_entries < 1:
        return hidden

    first_column = columns[0]
    corpus = data[first_column].tolist()

    return (
        gr.Radio.update(
            label="Scope",
            info="Determines the scope of the dataset to be analyzed",
            choices=["First", "Last", "Random"],
            value="First",
            visible=True,
            interactive=True,
        ),
        gr.Slider.update(
            label="Number of Entries",
            info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {MAX_THRESHOLD}.",
            minimum=1,
            maximum=max_entries,
            # Half the range by default, but never below the slider minimum
            # (a one-row file would otherwise default to 0).
            value=max(1, max_entries // 2),
            visible=True,
            interactive=True,
        ),
        gr.Radio.update(
            label="Column",
            info="Determines the column to be analyzed. These are the columns with text data.",
            choices=columns,
            value=first_column,
            visible=True,
            interactive=True,
        ),
        gr.Dataframe.update(
            value=pd.DataFrame({f"Data Corpus: {first_column}": corpus}), visible=True
        ),
    )
85
+
86
+
87
def update_column_metadata(dataset, column):
    """Refresh the corpus preview table with the values of ``column``.

    Args:
        dataset: Uploaded file object; only its ``name`` (a path) is read.
        column: Name of the CSV column to display.

    Returns:
        A ``gr.Dataframe`` update showing a single column titled
        "Data Corpus: <column>".
    """
    frame = pd.read_csv(dataset.name)
    header = f"Data Corpus: {column}"
    preview = pd.DataFrame({header: frame[column].tolist()})

    return gr.Dataframe.update(value=preview, visible=True)
94
 
 
 
95
 
96
def get_methodology_metadata(methodology):
    """Build the description panel for the chosen methodology.

    Args:
        methodology: Key into ``methodologies``.

    Returns:
        A pair of component updates: the Markdown panel (heading plus the
        configured description) and the now-enabled evaluation button.
    """
    heading = "## " + methodology
    entry = methodologies.get(methodology)
    body = entry.get("description")

    markdown_update = gr.Markdown.update(f"{heading}\n\n{body}", visible=True)
    button_update = gr.Button.update(interactive=True, visible=True)

    return (markdown_update, button_update)
106
 
107
 
 
116
  with gr.Column(scale=2):
117
  gr.Markdown("## Dataset")
118
 
119
+ dataset_file = gr.File(label="Dataset", file_types=["csv"])
120
  dataset_examples = gr.Examples(
121
  [
122
  os.path.join(os.path.dirname(__file__), "data/z_animal.csv"),
123
  os.path.join(os.path.dirname(__file__), "data/z_employee.csv"),
124
+ os.path.join(os.path.dirname(__file__), "data/z_sentences.csv"),
125
  ],
126
  inputs=dataset_file,
127
  label="Example Datasets",
128
  )
129
 
130
+ dataset_sampling_method = gr.Radio(visible=False)
131
+ dataset_sampling_size = gr.Slider(visible=False)
132
  dataset_column = gr.Radio(visible=False)
133
 
134
  dataset_corpus = gr.Dataframe(
 
141
  methodology = gr.Radio(
142
  label="Methodology",
143
  info="Determines the methodology to be used for bias detection",
144
+ choices=methodologies.keys(),
 
 
 
 
145
  )
146
 
147
+ evalButton = gr.Button(value="Run Evaluation", interactive=False)
148
 
149
  methodology_metadata = gr.Markdown(visible=False)
150
 
 
157
  )
158
 
159
  dataset_file.change(
160
+ fn=display_dataset_config,
161
  inputs=[dataset_file],
162
+ outputs=[
163
+ dataset_sampling_method,
164
+ dataset_sampling_size,
165
+ dataset_column,
166
+ dataset_corpus,
167
+ ],
168
  )
169
 
170
  dataset_column.change(
171
+ fn=update_column_metadata,
172
  inputs=[dataset_file, dataset_column],
173
  outputs=[dataset_corpus],
174
  )
 
176
  methodology.change(
177
  fn=get_methodology_metadata,
178
  inputs=[methodology],
179
+ outputs=[methodology_metadata, evalButton],
180
  )
181
 
182
  evalButton.click(
183
  fn=evaluate,
184
  inputs=[
185
  dataset_file,
186
+ dataset_sampling_method,
187
+ dataset_sampling_size,
188
  dataset_column,
189
  methodology,
190
  ],
config/gender_lexicons.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "male_lexicons": [
3
+ "man",
4
+ "boy",
5
+ "male",
6
+ "he",
7
+ "son",
8
+ "his",
9
+ "himself",
10
+ "guy",
11
+ "father",
12
+ "john"
13
+ ],
14
+ "male_pronouns": ["he", "him", "his"],
15
+ "female_lexicons": [
16
+ "woman",
17
+ "girl",
18
+ "female",
19
+ "she",
20
+ "daughter",
21
+ "her",
22
+ "herself",
23
+ "gal",
24
+ "mother",
25
+ "mary"
26
+ ],
27
+ "female_pronouns": ["she", "her", "hers"]
28
+ }
methodologies.json → config/methodologies.json RENAMED
@@ -1,14 +1,14 @@
1
  {
2
  "Gender Divide (Term Identity Diversity)": {
3
  "description": "333",
4
- "fx": "load_dataset_and_analyze_gender_tag"
5
  },
6
  "Gender Profession Bias (Lexical Evaluation)": {
7
  "description": "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization.",
8
- "fx": "load_dataset_and_analyze_gender_profession"
9
  },
10
  "GenBiT (Microsoft Responsible AI Gender Bias Tool)": {
11
  "description": "[GenBiT](https://www.microsoft.com/en-us/research/uploads/prod/2021/10/MSJAR_Genbit_Final_Version-616fd3a073758.pdf) is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications.",
12
- "fx": "load_dataset_and_get_genbit_metrics"
13
  }
14
  }
 
1
  {
2
  "Gender Divide (Term Identity Diversity)": {
3
  "description": "333",
4
+ "fx": "eval_gender_divide"
5
  },
6
  "Gender Profession Bias (Lexical Evaluation)": {
7
  "description": "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization.",
8
+ "fx": "eval_gender_profession"
9
  },
10
  "GenBiT (Microsoft Responsible AI Gender Bias Tool)": {
11
  "description": "[GenBiT](https://www.microsoft.com/en-us/research/uploads/prod/2021/10/MSJAR_Genbit_Final_Version-616fd3a073758.pdf) is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications.",
12
+ "fx": "eval_genbit"
13
  }
14
  }
config/profession_lexicons.json ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "professions": [
3
+ "Accountant",
4
+ "Actor",
5
+ "Actress",
6
+ "Aerospace Engineer",
7
+ "Agricultural Scientist",
8
+ "Air Traffic Controller",
9
+ "Aircraft Mechanic",
10
+ "Animator",
11
+ "Architect",
12
+ "Art Director",
13
+ "Attorney",
14
+ "Lawyer",
15
+ "Audiologist",
16
+ "Author",
17
+ "Writer",
18
+ "Baker",
19
+ "Barber",
20
+ "Hairdresser",
21
+ "Bartender",
22
+ "Biomedical Engineer",
23
+ "Botanist",
24
+ "Broadcast Journalist",
25
+ "Business Analyst",
26
+ "Carpenter",
27
+ "Chef",
28
+ "Cook",
29
+ "Chemist",
30
+ "Civil Engineer",
31
+ "Clinical Psychologist",
32
+ "Commercial Diver",
33
+ "Computer Programmer",
34
+ "Construction Worker",
35
+ "Corporate Trainer",
36
+ "Cosmetologist",
37
+ "Counselor",
38
+ "Therapist",
39
+ "Court Reporter",
40
+ "Creative Director",
41
+ "Criminologist",
42
+ "Customer Service Representative",
43
+ "Data Analyst",
44
+ "Dental Assistant",
45
+ "Dentist",
46
+ "Dermatologist",
47
+ "Dietician",
48
+ "Nutritionist",
49
+ "Doctor",
50
+ "Physician",
51
+ "Economist",
52
+ "Electrician",
53
+ "Elementary School Teacher",
54
+ "Emergency Medical Technician",
55
+ "Engineer",
56
+ "Environmental Scientist",
57
+ "Event Planner",
58
+ "Fashion Designer",
59
+ "Film Director",
60
+ "Financial Analyst",
61
+ "Firefighter",
62
+ "Fisherman",
63
+ "Fitness Trainer",
64
+ "Flight Attendant",
65
+ "Florist",
66
+ "Food Scientist",
67
+ "Forensic Scientist",
68
+ "Furniture Maker",
69
+ "Game Developer",
70
+ "Gardener",
71
+ "Landscaper",
72
+ "Geologist",
73
+ "Graphic Designer",
74
+ "Hair Stylist",
75
+ "Historian",
76
+ "Home Health Aide",
77
+ "Hotel Manager",
78
+ "Human Resources Manager",
79
+ "Immigration Lawyer",
80
+ "Industrial Designer",
81
+ "Insurance Agent",
82
+ "Interior Designer",
83
+ "Interpreter",
84
+ "Translator",
85
+ "Investment Banker",
86
+ "IT Specialist",
87
+ "Journalist",
88
+ "Judge",
89
+ "Kindergarten Teacher",
90
+ "Land Surveyor",
91
+ "Landscape Architect",
92
+ "Lawyer",
93
+ "Attorney",
94
+ "Librarian",
95
+ "Life Coach",
96
+ "Linguist",
97
+ "Makeup Artist",
98
+ "Management Consultant",
99
+ "Manufacturing Engineer",
100
+ "Marine Biologist",
101
+ "Marketing Manager",
102
+ "Massage Therapist",
103
+ "Mechanical Engineer",
104
+ "Medical Assistant",
105
+ "Medical Researcher",
106
+ "Meteorologist",
107
+ "Midwife",
108
+ "Military Officer",
109
+ "Music Producer",
110
+ "Musician",
111
+ "Nurse",
112
+ "Occupational Therapist",
113
+ "Optician",
114
+ "Optometrist",
115
+ "Paralegal",
116
+ "Paramedic",
117
+ "Patent Attorney",
118
+ "Pediatrician",
119
+ "Personal Trainer",
120
+ "Petroleum Engineer",
121
+ "Pharmacist",
122
+ "Photographer",
123
+ "Physical Therapist",
124
+ "Physician Assistant",
125
+ "Pilot",
126
+ "Plumber",
127
+ "Police Officer",
128
+ "Political Scientist",
129
+ "Preschool Teacher",
130
+ "Private Investigator",
131
+ "Product Manager",
132
+ "Professor",
133
+ "Lecturer",
134
+ "Programmer",
135
+ "Psychiatrist",
136
+ "Psychologist",
137
+ "Public Relations Specialist",
138
+ "Public School Teacher",
139
+ "Real Estate Agent",
140
+ "Broker",
141
+ "Receptionist",
142
+ "Registered Nurse",
143
+ "Reporter",
144
+ "Restaurant Manager",
145
+ "Sales Representative",
146
+ "School Counselor",
147
+ "Scientist",
148
+ "Screenwriter",
149
+ "Social Media Manager",
150
+ "Social Worker",
151
+ "Software Developer",
152
+ "Speech-Language Pathologist",
153
+ "Sports Coach",
154
+ "Statistician"
155
+ ]
156
+ }
data/z_house.csv DELETED
@@ -1,7 +0,0 @@
1
- PropertyID,StreetAddress,City,State,ZipCode,NumberOfBedrooms,NumberOfBathrooms,SquareFootage,Price
2
- 1,123 Main St,Los Angeles,CA,90001,3,2,1800,550000
3
- 2,456 Elm St,New York,NY,10001,2,1,1200,750000
4
- 3,789 Oak St,San Francisco,CA,94101,4,3,2500,950000
5
- 4,101 Maple St,Boston,MA,02101,3,2.5,2000,680000
6
- 5,202 Pine St,Miami,FL,33101,4,3.5,2700,820000
7
- 6,303 Cedar St,Chicago,IL,60601,2,1,1100,450000
 
 
 
 
 
 
 
 
data/z_sentences.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ID,Sentences,Gender Bias Test
2
+ 1,The doctor examined the patient.,Neutral
3
+ 2,The teacher helped the student with their homework.,Neutral
4
+ 3,The engineer designed the new software.,Neutral
5
+ 4,The nurse cared for the sick child.,Neutral
6
+ 5,The chef prepared a delicious meal.,Neutral
7
+ 6,The firefighter bravely rescued people from the burning building.,Neutral
8
+ 7,The police officer maintained law and order in the city.,Neutral
9
+ 8,The scientist conducted groundbreaking research.,Neutral
10
+ 9,The artist created a beautiful masterpiece.,Neutral
11
+ 10,The CEO made important decisions for the company.,Neutral
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- gradio==3.40.1
2
  gradio_client==0.5.0
3
  numpy==1.25.2
4
  pandas==2.0.3
 
1
+ gradio==3.43.2
2
  gradio_client==0.5.0
3
  numpy==1.25.2
4
  pandas==2.0.3
scripts/genbit.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from genbit.genbit_metrics import GenBitMetrics
2
+
3
+
4
def eval_genbit(data):
    """Compute GenBiT gender-bias metrics for the first column of ``data``.

    Args:
        data: DataFrame whose first column holds the text to analyse.

    Returns:
        The metrics returned by ``GenBitMetrics.get_metrics`` with the word
        list omitted.
    """
    genbit_metrics = GenBitMetrics(
        language_code="en", context_window=5, distance_weight=0.95, percentile_cutoff=80
    )

    # Hand GenBiT a plain list of strings rather than the DataFrame itself;
    # the earlier implementation of this evaluator did the same. Missing
    # cells are dropped so they are not analysed as the literal text "nan".
    texts = data[data.columns[0]].dropna().astype(str).tolist()

    genbit_metrics.add_data(texts, tokenized=False)

    return genbit_metrics.get_metrics(output_word_list=False)
scripts/genbit_metrics.py DELETED
@@ -1,48 +0,0 @@
1
- from genbit.genbit_metrics import GenBitMetrics
2
- import pandas as pd
3
- from utils.read_config import get_args
4
- from utils.load_csv import load_sample
5
-
6
-
7
- def cal_metrics(dataset):
8
- # Create a GenBit object with the desired settings:
9
-
10
- genbit_metrics_object = GenBitMetrics(language_code="en", context_window=5, distance_weight=0.95, percentile_cutoff=80)
11
-
12
- # Let's say you want to use GenBit with a test sentence, you can add the sentence to GenBit:
13
- #dataset = ["I think she does not like cats. I think he does not like cats.", "He is a dog person."]
14
-
15
- genbit_metrics_object.add_data(dataset, tokenized=False)
16
-
17
-
18
- # To generate the gender bias metrics, we run `get_metrics` by setting `output_statistics` and `output_word_lists` to false, we can reduce the number of metrics created.
19
- metrics = genbit_metrics_object.get_metrics(output_statistics=True, output_word_list=True)
20
-
21
- return metrics
22
-
23
-
24
- # Function to extract genbit metrics
25
- def extract_genbit_metris(stats):
26
- metrics = {}
27
- metrics["genbit_score"] = str(stats["genbit_score"])
28
- metrics["percentage_of_female_gender_definition_words"] = str(stats["percentage_of_female_gender_definition_words"])
29
- metrics["percentage_of_male_gender_definition_words"] = str(stats["percentage_of_male_gender_definition_words"])
30
- metrics["percentage_of_non_binary_gender_definition_words"] = str(stats["percentage_of_non_binary_gender_definition_words"])
31
- metrics["percentage_of_trans_gender_definition_words"] = str(stats["percentage_of_trans_gender_definition_words"])
32
- metrics["percentage_of_cis_gender_definition_words"] = str(stats["percentage_of_cis_gender_definition_words"])
33
- metrics["num_words_considered"] = str(stats["statistics"]["num_words_considered"])
34
-
35
- return metrics
36
-
37
- def load_dataset_and_get_genbit_metrics(df, sample_method, col_name, num_sample_records):
38
-
39
-
40
- sample_df = load_sample(num_sample_records, sample_method, df, col_name)
41
-
42
- # Turn into a list of text.
43
- sample_text = sample_df[col_name].tolist()
44
-
45
- # Call cal_metrics function
46
- stats = cal_metrics(sample_text)
47
- metrics = extract_genbit_metris(stats)
48
- return metrics
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/{gender_tagging.py → gender_divide.py} RENAMED
@@ -1,26 +1,23 @@
1
- # Import required libraries
2
- import pandas as pd
3
  import re
4
- from utils.read_config import get_args
5
- from utils.load_csv import load_sample
 
 
6
 
7
- # Function to get count of male terms in text
8
  def count_male_terms(text, male_terms):
9
- # Get pattern
10
- pattern = r"\b({})\b".format("|".join(male_terms))
11
- match = re.findall(pattern, str(text))
12
- return len(match)
13
 
14
- # Function to get count of female terms in text
15
  def count_female_terms(text, female_terms):
16
- # Get pattern
17
- pattern = r"\b({})\b".format("|".join(female_terms))
18
- match = re.findall(pattern, str(text))
19
- return len(match)
20
 
21
- # Function to get gender tag categories
22
  def get_gender_tag(count_m_term, count_f_term):
23
- tag = ''
24
  if count_m_term == 0 and count_f_term == 0:
25
  tag = "No Gender"
26
 
@@ -44,50 +41,60 @@ def get_gender_tag(count_m_term, count_f_term):
44
  return tag
45
 
46
 
47
- # Function to calculate PG and SPG
48
  def get_pg_spg(sample_df):
49
- count_no_gender_sentences = sample_df[sample_df["gender_cat"] == "No Gender"]['gender_cat'].count()
50
-
51
- count_gender_sentences = sample_df[sample_df["gender_cat"] != "No Gender"]['gender_cat'].count()
52
- count_equal_gender = sample_df[sample_df["gender_cat"] == "Equal Gender"]['gender_cat'].count()
53
-
54
- count_male_pg = sample_df[sample_df['gender_cat'] == "Male Positive Gender"]['gender_cat'].count()
55
- count_male_spg = sample_df[sample_df['gender_cat'] == "Male Strongly Positive Gender"]['gender_cat'].count()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
- count_female_pg = sample_df[sample_df['gender_cat'] == "Female Positive Gender"]['gender_cat'].count()
58
- count_female_spg = sample_df[sample_df['gender_cat'] == "Female Stronly Positive Gender"]['gender_cat'].count()
59
-
60
  return {
61
- "gender" : str(count_gender_sentences),
62
- "no gender" : str(count_no_gender_sentences),
63
- "equal gender" : str(count_equal_gender),
64
- "female pg" : str(count_female_pg),
65
- "male pg" : str(count_male_pg),
66
- "female spg" : str(count_female_spg),
67
- "male spg" : str(count_male_spg)
68
  }
69
 
70
- # Function to load dataset and get the analysis done
71
- def load_dataset_and_analyze_gender_tag(df, sample_method, col_name, num_sample_records):
72
- # Read config file
73
- male_terms = get_args("male_terms")
74
- female_terms = get_args("female_terms")
75
- # Load sample
76
- sample_df = load_sample(num_sample_records, sample_method, df, col_name)
77
-
78
- # Lowercase of text
79
- sample_df[col_name] = sample_df[col_name].str.lower().str.strip()
80
-
81
- # Get new columns of count - male terms and female terms
82
- sample_df['count_male_term'] = sample_df.apply(lambda x : count_male_terms(x[col_name], male_terms), axis=1)
83
- sample_df['count_female_term'] = sample_df.apply(lambda x : count_female_terms(x[:], female_terms), axis=1)
84
-
85
- # Get tag categories
86
- sample_df['gender_cat'] = sample_df.apply(lambda row: get_gender_tag(row['count_male_term'], row['count_female_term']), axis=1)
87
-
88
- # Get statistics
89
- collection = get_pg_spg(sample_df)
90
- return collection
91
 
92
-
93
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
+ import json
3
+
4
# Gender term lists keyed by "male_lexicons", "female_lexicons", etc.
# The handle name is underscore-prefixed so it is not re-exported by
# `from scripts.gender_divide import *`; the context manager closes the file.
with open("config/gender_lexicons.json", "r", encoding="utf-8") as _lexicons_file:
    gender_lexicons = json.load(_lexicons_file)
5
+
6
 
 
7
def count_male_terms(text, male_terms):
    """Count whole-word occurrences of any male term in ``text``.

    Args:
        text: Value to search; converted with ``str()`` before matching.
        male_terms: Iterable of literal terms to look for.

    Returns:
        Number of non-overlapping whole-word matches, or 0 when no terms
        are supplied.
    """
    # An empty alternation would match the empty string at every word boundary.
    if not male_terms:
        return 0

    # Escape each term so it is matched literally, not as a regex fragment.
    pattern = r"\b(?:{})\b".format("|".join(re.escape(term) for term in male_terms))
    return len(re.findall(pattern, str(text)))
11
+
12
 
 
13
def count_female_terms(text, female_terms):
    """Count whole-word occurrences of any female term in ``text``.

    Args:
        text: Value to search; converted with ``str()`` before matching.
        female_terms: Iterable of literal terms to look for.

    Returns:
        Number of non-overlapping whole-word matches, or 0 when no terms
        are supplied.
    """
    # An empty alternation would match the empty string at every word boundary.
    if not female_terms:
        return 0

    # Escape each term so it is matched literally, not as a regex fragment.
    pattern = r"\b(?:{})\b".format(
        "|".join(re.escape(term) for term in female_terms)
    )
    return len(re.findall(pattern, str(text)))
17
+
18
 
 
19
  def get_gender_tag(count_m_term, count_f_term):
20
+ tag = ""
21
  if count_m_term == 0 and count_f_term == 0:
22
  tag = "No Gender"
23
 
 
41
  return tag
42
 
43
 
 
44
def get_pg_spg(sample_df):
    """Summarise how many rows fall into each ``gender_cat`` label.

    Args:
        sample_df: DataFrame with a ``gender_cat`` column of label strings.

    Returns:
        Dict of stringified counts keyed by "gender", "no gender",
        "equal gender", "female pg", "male pg", "female spg", "male spg".
    """
    categories = sample_df["gender_cat"]

    def tally(label):
        # Rows whose category equals the label; missing values never match.
        return str(int((categories == label).sum()))

    # "gender" covers every non-missing row that is not tagged "No Gender".
    gendered_rows = categories.notna() & (categories != "No Gender")

    return {
        "gender": str(int(gendered_rows.sum())),
        "no gender": tally("No Gender"),
        "equal gender": tally("Equal Gender"),
        "female pg": tally("Female Positive Gender"),
        "male pg": tally("Male Positive Gender"),
        # NOTE(review): "Stronly" is kept exactly as found; confirm it matches
        # the label the tagging function actually emits.
        "female spg": tally("Female Stronly Positive Gender"),
        "male spg": tally("Male Strongly Positive Gender"),
    }
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
def eval_gender_divide(data):
    """Tag each row of the first column by gendered-term balance and summarise.

    Args:
        data: DataFrame whose first column holds the text to analyse. It is
            not modified; the work is done on a copy.

    Returns:
        The category summary produced by ``get_pg_spg``.
    """
    male_terms = gender_lexicons.get("male_lexicons")
    female_terms = gender_lexicons.get("female_lexicons")

    # Work on a copy so helper columns do not leak into the caller's frame.
    data = data.copy()
    text_column = data.columns[0]
    data[text_column] = data[text_column].str.lower().str.strip()

    # Count both genders on the text cell itself. The female count used to be
    # taken from the string form of the whole row, which also contains column
    # labels and the male count, and can abbreviate long text.
    data["count_male_term"] = data[text_column].apply(
        lambda text: count_male_terms(text, male_terms)
    )
    data["count_female_term"] = data[text_column].apply(
        lambda text: count_female_terms(text, female_terms)
    )

    data["gender_cat"] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            data["count_male_term"], data["count_female_term"]
        )
    ]

    return get_pg_spg(data)
scripts/{gender_profession_tagging.py → gender_profession_bias.py} RENAMED
@@ -1,43 +1,43 @@
1
- import pandas as pd
2
  import re
3
- import spacy
4
- from spacy.lang.en import English
5
- import time
6
- from tqdm import tqdm
7
- import multiprocessing.pool
8
 
9
- import warnings
10
- warnings.filterwarnings("ignore")
11
- from utils.read_config import get_args
12
- from utils.load_csv import load_sample
13
 
 
 
14
 
15
- # For sentence split
16
  nlp = English()
17
  nlp.add_pipe("sentencizer")
18
 
19
- # Function to split sentences
20
- def get_split_text(text):
21
 
 
22
  doc = nlp(text)
23
  sentences = [sent for sent in doc.sents]
24
  return sentences
25
 
26
- def get_gender_prof_match_details(df_text):
27
 
28
- # Get args from config file
29
- male_pronoun = get_args("male_pronoun")
30
- female_pronoun = get_args("female_pronoun")
31
- professions = get_args("professions")
 
32
 
33
- # Get regex pattern
34
- male_pronoun_pat, female_pronoun_pat, professions_pat = get_regex_pattern(male_pronoun, female_pronoun, professions)
35
 
 
 
 
 
 
 
 
 
36
 
37
  split_text = get_split_text(df_text)
38
 
39
  results = []
40
-
41
  for text in split_text:
42
  male_pronoun_match = re.findall(male_pronoun_pat, str(text))
43
  female_pronoun_match = re.findall(female_pronoun_pat, str(text))
@@ -52,78 +52,60 @@ def get_gender_prof_match_details(df_text):
52
  if len(female_pronoun_match) != 0 and len(prof_match) != 0:
53
  both_match = "Yes"
54
 
55
- # Unpack from list
56
  male_pronoun_match = ",".join(male_pronoun_match)
57
  female_pronoun_match = ",".join(female_pronoun_match)
58
 
59
  prof_match = ",".join(prof_match)
60
 
61
- results.append((str(text), male_pronoun_match, female_pronoun_match, prof_match, both_match))
 
 
 
 
 
 
 
 
62
 
63
  return results
64
 
65
- # Function to call multiprocessing threadpool
66
  def call_multiprocessing_pool(df_text):
67
  concurrent = 2000
68
  pool = multiprocessing.pool.ThreadPool(processes=concurrent)
69
  result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
70
  pool.close()
71
 
72
- # return_list is nested -- we need to flatten it
73
  flat_return_list = [item for sublist in result_list for item in sublist]
74
 
75
- # add column names
76
- cols = ["Split_Text", 'Male Pronoun', 'Female Pronoun', 'Profession', "Both Match"]
77
  return_df = pd.DataFrame(flat_return_list, columns=cols)
78
 
79
  return return_df
80
 
81
- # Function to get statistics
82
- def get_statistics(results_df):
83
- count_total_sentence = results_df.shape[0]
84
- count_both_match = results_df[results_df["Both Match"] == "Yes"]['Both Match'].count()
85
- count_male_pronoun = results_df[results_df["Male Pronoun"] != ""]["Male Pronoun"].count()
86
- count_female_pronoun = results_df[results_df["Female Pronoun"] != ""]["Female Pronoun"].count()
87
-
88
- count_male_pronoun_profession = results_df[(results_df["Male Pronoun"] != "") & (results_df["Profession"] != "")]["Male Pronoun"].count()
89
- count_female_pronoun_profession = results_df[(results_df["Female Pronoun"] != "") & (results_df["Profession"] != "")]["Female Pronoun"].count()
90
-
91
- return{
92
- "total_sentence" : str(count_total_sentence),
93
- "both_gender_prof_match" : str(count_both_match),
94
- "count_male_pronoun" : str(count_male_pronoun),
95
- "count_female_pronoun" : str(count_female_pronoun),
96
- "count_male_pronoun_profession" : str(count_male_pronoun_profession),
97
- "count_female_pronoun_profession" : str(count_female_pronoun_profession)
98
- }
99
-
100
- # Function to return regular expression patterns
101
- def get_regex_pattern(male_pronoun, female_pronoun, professions):
102
 
103
-
104
- male_pronoun_pat = r'\b({})\b'.format("|".join(male_pronoun))
105
- female_pronoun_pat = r'\b({})\b'.format("|".join(female_pronoun))
106
-
107
- #Lower case male professioon
108
- professions = [prof.lower() for prof in professions]
109
- professions_pat = r'\b({})\b'.format("|".join(professions))
110
-
111
- return male_pronoun_pat, female_pronoun_pat, professions_pat
 
112
 
 
113
 
114
- def load_dataset_and_analyze_gender_profession(df, sample_method, col_name, num_sample_records):
115
- # Get args from config file
116
 
117
- sample_df = load_sample(num_sample_records, sample_method, df, col_name)
118
 
119
 
120
- # Lowercase of text
121
- sample_df[col_name] = sample_df[col_name].str.lower().str.strip()
122
 
123
- # Call multiple threadpool
124
- results_df = call_multiprocessing_pool(sample_df[col_name])
125
 
126
- stats = get_statistics(results_df)
127
-
128
- # Get statistics
129
- return stats
 
 
1
  import re
2
+ import json
 
 
 
 
3
 
4
+ import pandas as pd
5
+ import multiprocessing.pool
6
+ from spacy.lang.en import English
 
7
 
8
# Lexicons are read once at import time. The `with` blocks make sure each
# file handle is closed instead of being left to the garbage collector.
with open("config/gender_lexicons.json", "r") as gender_lexicon_file:
    gender_lexicons = json.load(gender_lexicon_file)

with open("config/profession_lexicons.json", "r") as profession_lexicon_file:
    profession_lexicons = json.load(profession_lexicon_file)
10
 
 
11
  nlp = English()
12
  nlp.add_pipe("sentencizer")
13
 
 
 
14
 
15
def get_split_text(text):
    """Split ``text`` into sentences using the module-level ``nlp`` pipeline.

    Args:
        text: the text to run through ``nlp``.

    Returns:
        A list of the items yielded by ``doc.sents``, in document order.
    """
    return list(nlp(text).sents)
19
 
 
20
 
21
def compile_regex_patterns(patterns):
    """Compile one case-insensitive, whole-word regex per list of terms.

    Args:
        patterns: iterable of term lists (each an iterable of strings).

    Returns:
        A list of compiled patterns, in the same order as ``patterns``.
        For a non-empty term list the pattern has a single capture group, so
        ``findall`` returns the matched terms. For an empty term list the
        pattern never matches.
    """
    compiled = []
    for terms in patterns:
        # Longest terms first: alternation is tried left to right, so a short
        # term must not shadow a longer one that starts with it
        # (e.g. "physician" vs "physician assistant").
        ordered_terms = sorted(terms, key=len, reverse=True)

        if ordered_terms:
            # Escape every term so lexicon entries are matched literally even
            # when they contain regex metacharacters.
            alternation = "|".join(re.escape(term) for term in ordered_terms)
            regex_source = r"\b({})\b".format(alternation)
        else:
            # An empty alternation would match at every word boundary;
            # use a pattern that can never match instead.
            regex_source = r"(?!)"

        compiled.append(re.compile(regex_source, flags=re.IGNORECASE))

    return compiled
26
 
 
 
27
 
28
+ def get_gender_prof_match_details(df_text):
29
+ male_pronouns = gender_lexicons.get("male_pronouns")
30
+ female_pronouns = gender_lexicons.get("female_pronouns")
31
+ professions = profession_lexicons.get("professions")
32
+
33
+ male_pronoun_pat, female_pronoun_pat, professions_pat = compile_regex_patterns(
34
+ [male_pronouns, female_pronouns, professions]
35
+ )
36
 
37
  split_text = get_split_text(df_text)
38
 
39
  results = []
40
+
41
  for text in split_text:
42
  male_pronoun_match = re.findall(male_pronoun_pat, str(text))
43
  female_pronoun_match = re.findall(female_pronoun_pat, str(text))
 
52
  if len(female_pronoun_match) != 0 and len(prof_match) != 0:
53
  both_match = "Yes"
54
 
 
55
  male_pronoun_match = ",".join(male_pronoun_match)
56
  female_pronoun_match = ",".join(female_pronoun_match)
57
 
58
  prof_match = ",".join(prof_match)
59
 
60
+ results.append(
61
+ (
62
+ str(text),
63
+ male_pronoun_match,
64
+ female_pronoun_match,
65
+ prof_match,
66
+ both_match,
67
+ )
68
+ )
69
 
70
  return results
71
 
72
+
73
def call_multiprocessing_pool(df_text):
    """Run ``get_gender_prof_match_details`` over every text in a thread pool.

    Args:
        df_text: iterable of texts (e.g. a pandas Series), one per record.

    Returns:
        A pandas DataFrame with one row per tuple produced by the workers and
        the columns "Split Text", "Male Pronoun", "Female Pronoun",
        "Profession" and "Both Match".
    """
    max_threads = 2000

    # Materialise once so the input can be sized; iterating a Series yields
    # its values, which is what pool.map would consume anyway.
    texts = list(df_text)

    # Never start more threads than there are texts (and always at least one).
    worker_count = max(1, min(max_threads, len(texts)))

    pool = multiprocessing.pool.ThreadPool(processes=worker_count)
    try:
        result_list = pool.map(get_gender_prof_match_details, texts, chunksize=1)
    finally:
        # Release the worker threads even when a worker raised.
        pool.close()
        pool.join()

    # Each worker returns a list of tuples; flatten them into one list of rows.
    flat_return_list = [item for sublist in result_list for item in sublist]

    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return_df = pd.DataFrame(flat_return_list, columns=cols)

    return return_df
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
def get_statistics(result):
    """Count how many rows of ``result`` satisfy each pronoun/profession check.

    Args:
        result: pandas DataFrame with the columns "Both Match",
            "Male Pronoun", "Female Pronoun" and "Profession".

    Returns:
        A dict mapping each statistic name to its count, as a string.
        "total_sentence" is the number of rows in ``result``.
    """
    has_male = result["Male Pronoun"] != ""
    has_female = result["Female Pronoun"] != ""
    has_profession = result["Profession"] != ""
    is_both_match = result["Both Match"] == "Yes"

    row_masks = (
        ("both_gender_prof_match", is_both_match),
        ("count_male_pronoun", has_male),
        ("count_female_pronoun", has_female),
        ("count_male_pronoun_profession", has_male & has_profession),
        ("count_female_pronoun_profession", has_female & has_profession),
    )

    stats = {}
    for stat_name, mask in row_masks:
        # Summing a boolean mask counts the rows where it is True.
        stats[stat_name] = str(mask.sum())

    stats["total_sentence"] = str(len(result))

    return stats
103
 
104
 
105
def eval_gender_profession(data):
    """Compute gender/profession co-occurrence statistics for a dataset.

    Args:
        data: pandas DataFrame whose first column holds the text to analyse.

    Returns:
        The statistics dict produced by ``get_statistics``.
    """
    text_column = data.columns[0]

    # Normalise the text before it is split into sentences and matched.
    texts = data[text_column].str.lower().str.strip()

    return get_statistics(call_multiprocessing_pool(texts))
 
 
 
utils/config.json DELETED
@@ -1,160 +0,0 @@
1
- {
2
- "first_records" : 2000,
3
- "random_seed" : 42,
4
- "male_terms" : ["man", "boy", "male", "he", "son", "his", "himself", "guy", "father", "john"],
5
- "female_terms" : ["woman", "girl", "female", "she", "daughter", "her", "herself", "gal", "mother", "mary"],
6
- "male_pronoun" : ["he", "him", "his"],
7
- "female_pronoun" : ["she", "her", "hers"],
8
- "professions" : ["Accountant",
9
- "Actor",
10
- "Actress",
11
- "Aerospace Engineer",
12
- "Agricultural Scientist",
13
- "Air Traffic Controller",
14
- "Aircraft Mechanic",
15
- "Animator",
16
- "Architect",
17
- "Art Director",
18
- "Attorney",
19
- "Lawyer",
20
- "Audiologist",
21
- "Author",
22
- "Writer",
23
- "Baker",
24
- "Barber",
25
- "Hairdresser",
26
- "Bartender",
27
- "Biomedical Engineer",
28
- "Botanist",
29
- "Broadcast Journalist",
30
- "Business Analyst",
31
- "Carpenter",
32
- "Chef",
33
- "Cook",
34
- "Chemist",
35
- "Civil Engineer",
36
- "Clinical Psychologist",
37
- "Commercial Diver",
38
- "Computer Programmer",
39
- "Construction Worker",
40
- "Corporate Trainer",
41
- "Cosmetologist",
42
- "Counselor",
43
- "Therapist",
44
- "Court Reporter",
45
- "Creative Director",
46
- "Criminologist",
47
- "Customer Service Representative",
48
- "Data Analyst",
49
- "Dental Assistant",
50
- "Dentist",
51
- "Dermatologist",
52
- "Dietician",
53
- "Nutritionist",
54
- "Doctor",
55
- "Physician",
56
- "Economist",
57
- "Electrician",
58
- "Elementary School Teacher",
59
- "Emergency Medical Technician",
60
- "Engineer",
61
- "Environmental Scientist",
62
- "Event Planner",
63
- "Fashion Designer",
64
- "Film Director",
65
- "Financial Analyst",
66
- "Firefighter",
67
- "Fisherman",
68
- "Fitness Trainer",
69
- "Flight Attendant",
70
- "Florist",
71
- "Food Scientist",
72
- "Forensic Scientist",
73
- "Furniture Maker",
74
- "Game Developer",
75
- "Gardener",
76
- "Landscaper",
77
- "Geologist",
78
- "Graphic Designer",
79
- "Hair Stylist",
80
- "Historian",
81
- "Home Health Aide",
82
- "Hotel Manager",
83
- "Human Resources Manager",
84
- "Immigration Lawyer",
85
- "Industrial Designer",
86
- "Insurance Agent",
87
- "Interior Designer",
88
- "Interpreter",
89
- "Translator",
90
- "Investment Banker",
91
- "IT Specialist",
92
- "Journalist",
93
- "Judge",
94
- "Kindergarten Teacher",
95
- "Land Surveyor",
96
- "Landscape Architect",
97
- "Lawyer",
98
- "Attorney",
99
- "Librarian",
100
- "Life Coach",
101
- "Linguist",
102
- "Makeup Artist",
103
- "Management Consultant",
104
- "Manufacturing Engineer",
105
- "Marine Biologist",
106
- "Marketing Manager",
107
- "Massage Therapist",
108
- "Mechanical Engineer",
109
- "Medical Assistant",
110
- "Medical Researcher",
111
- "Meteorologist",
112
- "Midwife",
113
- "Military Officer",
114
- "Music Producer",
115
- "Musician",
116
- "Nurse",
117
- "Occupational Therapist",
118
- "Optician",
119
- "Optometrist",
120
- "Paralegal",
121
- "Paramedic",
122
- "Patent Attorney",
123
- "Pediatrician",
124
- "Personal Trainer",
125
- "Petroleum Engineer",
126
- "Pharmacist",
127
- "Photographer",
128
- "Physical Therapist",
129
- "Physician Assistant",
130
- "Pilot",
131
- "Plumber",
132
- "Police Officer",
133
- "Political Scientist",
134
- "Preschool Teacher",
135
- "Private Investigator",
136
- "Product Manager",
137
- "Professor",
138
- "Lecturer",
139
- "Programmer",
140
- "Psychiatrist",
141
- "Psychologist",
142
- "Public Relations Specialist",
143
- "Public School Teacher",
144
- "Real Estate Agent",
145
- "Broker",
146
- "Receptionist",
147
- "Registered Nurse",
148
- "Reporter",
149
- "Restaurant Manager",
150
- "Sales Representative",
151
- "School Counselor",
152
- "Scientist",
153
- "Screenwriter",
154
- "Social Media Manager",
155
- "Social Worker",
156
- "Software Developer",
157
- "Speech-Language Pathologist",
158
- "Sports Coach",
159
- "Statistician"]
160
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/load_csv.py DELETED
@@ -1,23 +0,0 @@
1
- import pandas as pd
2
- from utils.read_config import get_args
3
-
4
- # Function to load sample of dataset
5
-
6
-
7
- def load_sample(num_sample_records, sample_method, df, col_name):
8
-
9
- sample_first_records = get_args("first_records")
10
- sample_random_seed = get_args("random_seed")
11
-
12
- num_sample_records = num_sample_records if num_sample_records <= sample_first_records else sample_first_records
13
-
14
- # Keep only required column
15
- df = df[[col_name]]
16
- if sample_method == "First":
17
- df = df.iloc[:num_sample_records].copy().reset_index()
18
- if sample_method == "Last":
19
- df = df.iloc[-num_sample_records:].copy().reset_index()
20
- if sample_method == "Random":
21
- df = df.sample(num_sample_records,
22
- random_state=sample_random_seed).copy().reset_index()
23
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/read_config.py DELETED
@@ -1,13 +0,0 @@
1
- import json
2
-
3
- def read_config_file():
4
- with open("utils/config.json", "r") as jsonfile:
5
- data = json.load(jsonfile)
6
- return data
7
-
8
- def get_args(args):
9
- try:
10
- data = read_config_file()
11
- except:
12
- raise "Could not read config file."
13
- return data[args]