freyam committed on
Commit
0321f34
·
1 Parent(s): 1dc0d8b

Update the UI and Modularise the methodologies

Browse files
Files changed (7) hide show
  1. app.py +62 -115
  2. methodologies.json +14 -0
  3. plot.ipynb +0 -0
  4. utils/load_csv.py +6 -6
  5. z_animal.csv +0 -11
  6. z_employee.csv +0 -26
  7. z_house.csv +0 -7
app.py CHANGED
@@ -1,98 +1,44 @@
 
1
  import gradio as gr
2
  import pandas as pd
3
  import os
 
4
  from scripts.genbit_metrics import *
5
  from scripts.gender_profession_tagging import *
6
  from scripts.gender_tagging import *
7
  from utils.load_csv import *
8
  from utils.read_config import get_args
9
 
10
- dummy_data = [
11
- ["Category", "Value", "Percentage"],
12
- ["Total Reviews", 50000, None],
13
- ["Total Sentences", 621647, None],
14
- ["Pronouns in Sentences", None, None],
15
- ["Male Pronouns", 85615, None],
16
- ["Female Pronouns", 39372, None],
17
- ["Both Male and Female Pronouns", 7765, None],
18
- ["Exclusive Usage of Pronouns", None, None],
19
- ["Only Male Pronouns", 77860, 13.77],
20
- ["Only Female Pronouns", 31617, 6.33],
21
- ["Pronouns and Professions in Sentences", None, None],
22
- ["Male Pronouns with Professions", 5580, 0.9],
23
- ["Female Pronouns with Professions", 2618, 0.42],
24
- ["Exclusive Usage of Pronouns with Professions", None, None],
25
- ["Only Male Pronouns with Professions", 5011, 0.81],
26
- ["Only Female Pronouns with Professions", 2049, 0.33],
27
- ["Pronouns and Professions in Combination", None, None],
28
- ["Male or Female Pronouns with Professions", 7629, 1.23],
29
- ["Male and Female Pronouns with Professions", 569, 0.09]
30
- ]
31
-
32
-
33
- def display_methodology(methodology):
34
- title = "### " + methodology
35
- description = ""
36
-
37
- if methodology == "Term Identity Diversity Analysis":
38
- description = "333"
39
- elif methodology == "Gender Label Evaluation":
40
- description = "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization."
41
- elif methodology == "Microsoft Genbit":
42
- description = "GenBiT is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications."
43
 
44
- return (
45
- gr.Markdown.update(title, visible=True),
46
- gr.Markdown.update(description, visible=True)
47
- )
48
 
 
 
 
 
 
 
 
49
 
50
- def run_evaluation(dataset_file, dataset_scope, dataset_scope_n, dataset_columns, methodology):
51
 
 
52
  status = {}
53
- # Read CSV file
54
- dataset = check_csv(dataset_file.name)
55
  sample_method = dataset_scope
56
- col_name = dataset_columns
57
  num_sample_records = dataset_scope_n
58
 
59
- # Check selected methodology
60
- if methodology == "Term Identity Diversity Analysis":
61
- status = load_dataset_and_analyze_gender_tag(
62
- dataset, sample_method, col_name, num_sample_records)
63
- if methodology == "Gender Label Evaluation":
64
- status = load_dataset_and_analyze_gender_profession(
65
- dataset, sample_method, col_name, num_sample_records)
66
- if methodology == "Microsoft Genbit":
67
- status = load_dataset_and_get_genbit_metrics(
68
- dataset, sample_method, col_name, num_sample_records)
69
-
70
- # status = {
71
- # "dataset": dataset_file.name,
72
- # "methodology": methodology,
73
- # "scope": dataset_scope + " " + str(dataset_scope_n),
74
- # "column": dataset_columns
75
- # }
76
 
77
  return gr.JSON.update(status, visible=True)
78
 
79
- if methodology == "A":
80
- run_a(dataset)
81
- elif methodology == "B":
82
- run_b(dataset)
83
- elif methodology == "C":
84
- run_c(dataset)
85
-
86
 
87
  def process_dataset(dataset):
88
-
89
  data = pd.read_csv(dataset.name)
90
- # maximum_records = get_args("first_records")
91
- # input_records = data.shape(0)
92
- # num_sample_records = input_records if input_records < maximum_records else maximum_records
93
 
94
- columns = data.columns.tolist()
95
- columns = [x for x in columns if data[x].dtype == "object"]
96
 
97
  return (
98
  gr.Radio.update(
@@ -104,13 +50,13 @@ def process_dataset(dataset):
104
  interactive=True,
105
  ),
106
  gr.Slider.update(
107
- label="Number of Entries",
108
- info=f"Determines the number of entries to be analyzed. The dataset has {data.shape[0]} entries.",
109
  minimum=1,
110
- maximum=data.shape[0],
111
- value=data.shape[0] // 2,
112
  visible=True,
113
- interactive=True
114
  ),
115
  gr.Radio.update(
116
  label="Column",
@@ -119,101 +65,102 @@ def process_dataset(dataset):
119
  value=columns[0],
120
  visible=True,
121
  interactive=True,
122
- )
123
  )
124
 
125
 
126
- def process_column(dataset, column):
127
  data = pd.read_csv(dataset.name)
128
- corpus = data[column].to_list()[:10]
129
 
130
- return gr.Dataframe.update(value=pd.DataFrame({"Data Corpus": corpus}), max_rows=5, visible=True)
 
 
131
 
132
 
133
  BiasAware = gr.Blocks(title="BiasAware: Dataset Bias Detection")
134
 
135
  with BiasAware:
136
- gr.Markdown("# BiasAware: Dataset Bias Detection")
137
  gr.Markdown(
138
- "Natural Language Processing (NLP) training datasets often reflect the biases present in the data sources they are compiled from, leading to the **perpetuation of stereotypes, underrepresentation, and skewed perspectives in AI models**. BiasAware is designed to **identify and quantify biases present in text data**, making it an invaluable resource for data scientists, machine learning practitioners, and organizations committed to **mitigating bias in AI systems**."
139
  )
140
 
141
  with gr.Row():
142
- with gr.Column(scale=1):
143
  gr.Markdown("## Dataset")
144
 
145
- dataset_file = gr.File()
146
  dataset_examples = gr.Examples(
147
  [
148
- os.path.join(os.path.dirname(__file__),
149
- "data/z_animal.csv"),
150
- os.path.join(os.path.dirname(__file__),
151
- "data/z_employee.csv"),
152
- os.path.join(os.path.dirname(
153
- __file__), "data/z_house.csv"),
154
-
155
  ],
156
  inputs=dataset_file,
 
157
  )
158
 
159
  dataset_scope = gr.Radio(visible=False)
160
  dataset_scope_n = gr.Slider(visible=False)
161
- dataset_columns = gr.Radio(visible=False)
162
 
163
- dataset_corpus = gr.Dataframe(visible=False)
 
 
164
 
165
- with gr.Column(scale=1):
166
  gr.Markdown("## Methodology")
167
 
168
  methodology = gr.Radio(
169
  label="Methodology",
170
  info="Determines the methodology to be used for bias detection",
171
  choices=[
172
- "Term Identity Diversity Analysis",
173
- "Gender Label Evaluation",
174
- "Microsoft Genbit",
175
  ],
176
- value="Term Identity Diversity Analysis",
177
  )
178
 
179
  evalButton = gr.Button("Run Evaluation")
180
 
181
- methodology_title = gr.Markdown(visible=False)
182
- methodology_description = gr.Markdown(visible=False)
183
 
184
  with gr.Column(scale=4):
185
  gr.Markdown("## Result")
186
 
187
  result_status = gr.JSON(visible=False)
188
- result = gr.DataFrame()
 
 
189
 
190
  dataset_file.change(
191
  fn=process_dataset,
192
  inputs=[dataset_file],
193
- outputs=[
194
- dataset_scope,
195
- dataset_scope_n,
196
- dataset_columns
197
- ]
198
  )
199
 
200
- dataset_columns.change(
201
- fn=process_column,
202
- inputs=[dataset_file, dataset_columns],
203
  outputs=[dataset_corpus],
204
  )
205
 
206
  methodology.change(
207
- fn=display_methodology,
208
  inputs=[methodology],
209
- outputs=[methodology_title, methodology_description],
210
  )
211
 
212
  evalButton.click(
213
- fn=run_evaluation,
214
- inputs=[dataset_file, dataset_scope,
215
- dataset_scope_n, dataset_columns, methodology],
216
- outputs=[result_status]
 
 
 
 
 
217
  )
218
 
219
  BiasAware.launch()
 
1
+ import json
2
  import gradio as gr
3
  import pandas as pd
4
  import os
5
+
6
  from scripts.genbit_metrics import *
7
  from scripts.gender_profession_tagging import *
8
  from scripts.gender_tagging import *
9
  from utils.load_csv import *
10
  from utils.read_config import get_args
11
 
12
+ methodologies = json.load(open("methodologies.json", "r"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
 
 
 
 
14
 
15
+ def get_methodology_metadata(methodology):
16
+ title = "## " + methodology
17
+ description = methodologies.get(methodology).get("description")
18
+
19
+ metadata = f"{title}\n\n{description}"
20
+
21
+ return gr.Markdown.update(metadata, visible=True)
22
 
 
23
 
24
+ def evaluate(dataset_file, dataset_scope, dataset_scope_n, dataset_column, methodology):
25
  status = {}
26
+ dataset = pd.read_csv(dataset_file.name)
 
27
  sample_method = dataset_scope
28
+ col_name = dataset_column
29
  num_sample_records = dataset_scope_n
30
 
31
+ status = globals()[methodologies.get(methodology).get("fx")](
32
+ dataset, sample_method, col_name, num_sample_records
33
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  return gr.JSON.update(status, visible=True)
36
 
 
 
 
 
 
 
 
37
 
38
  def process_dataset(dataset):
 
39
  data = pd.read_csv(dataset.name)
 
 
 
40
 
41
+ columns = data.select_dtypes(include=["object"]).columns.tolist()
 
42
 
43
  return (
44
  gr.Radio.update(
 
50
  interactive=True,
51
  ),
52
  gr.Slider.update(
53
+ label=f"Number of Entries",
54
+ info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {get_args('first_records')}.",
55
  minimum=1,
56
+ maximum=min(data.shape[0], get_args("first_records")),
57
+ value=min(data.shape[0], get_args("first_records")) // 2,
58
  visible=True,
59
+ interactive=True,
60
  ),
61
  gr.Radio.update(
62
  label="Column",
 
65
  value=columns[0],
66
  visible=True,
67
  interactive=True,
68
+ ),
69
  )
70
 
71
 
72
+ def get_column_metadata(dataset, column):
73
  data = pd.read_csv(dataset.name)
74
+ corpus = data[column].head(10).tolist()
75
 
76
+ return gr.Dataframe.update(
77
+ value=pd.DataFrame({f"Data Corpus: {column}": corpus}), visible=True
78
+ )
79
 
80
 
81
  BiasAware = gr.Blocks(title="BiasAware: Dataset Bias Detection")
82
 
83
  with BiasAware:
 
84
  gr.Markdown(
85
+ "# BiasAware: Dataset Bias Detection\n\nBiasAware is a specialized tool for detecting and quantifying biases within datasets used for Natural Language Processing (NLP) tasks. NLP training datasets frequently mirror the inherent biases of their source materials, resulting in AI models that unintentionally perpetuate stereotypes, exhibit underrepresentation, and showcase skewed perspectives."
86
  )
87
 
88
  with gr.Row():
89
+ with gr.Column(scale=2):
90
  gr.Markdown("## Dataset")
91
 
92
+ dataset_file = gr.File(label="Dataset")
93
  dataset_examples = gr.Examples(
94
  [
95
+ os.path.join(os.path.dirname(__file__), "data/z_animal.csv"),
96
+ os.path.join(os.path.dirname(__file__), "data/z_employee.csv"),
97
+ os.path.join(os.path.dirname(__file__), "data/z_house.csv"),
 
 
 
 
98
  ],
99
  inputs=dataset_file,
100
+ label="Example Datasets",
101
  )
102
 
103
  dataset_scope = gr.Radio(visible=False)
104
  dataset_scope_n = gr.Slider(visible=False)
105
+ dataset_column = gr.Radio(visible=False)
106
 
107
+ dataset_corpus = gr.Dataframe(
108
+ row_count=(5, "fixed"), col_count=(1, "fixed"), visible=False
109
+ )
110
 
111
+ with gr.Column(scale=2):
112
  gr.Markdown("## Methodology")
113
 
114
  methodology = gr.Radio(
115
  label="Methodology",
116
  info="Determines the methodology to be used for bias detection",
117
  choices=[
118
+ "Gender Divide (Term Identity Diversity)",
119
+ "Gender Profession Bias (Lexical Evaluation)",
120
+ "GenBiT (Microsoft Responsible AI Gender Bias Tool)",
121
  ],
 
122
  )
123
 
124
  evalButton = gr.Button("Run Evaluation")
125
 
126
+ methodology_metadata = gr.Markdown(visible=False)
 
127
 
128
  with gr.Column(scale=4):
129
  gr.Markdown("## Result")
130
 
131
  result_status = gr.JSON(visible=False)
132
+ result = gr.DataFrame(
133
+ row_count=(5, "fixed"), col_count=(3, "fixed"), visible=False
134
+ )
135
 
136
  dataset_file.change(
137
  fn=process_dataset,
138
  inputs=[dataset_file],
139
+ outputs=[dataset_scope, dataset_scope_n, dataset_column],
 
 
 
 
140
  )
141
 
142
+ dataset_column.change(
143
+ fn=get_column_metadata,
144
+ inputs=[dataset_file, dataset_column],
145
  outputs=[dataset_corpus],
146
  )
147
 
148
  methodology.change(
149
+ fn=get_methodology_metadata,
150
  inputs=[methodology],
151
+ outputs=[methodology_metadata],
152
  )
153
 
154
  evalButton.click(
155
+ fn=evaluate,
156
+ inputs=[
157
+ dataset_file,
158
+ dataset_scope,
159
+ dataset_scope_n,
160
+ dataset_column,
161
+ methodology,
162
+ ],
163
+ outputs=[result_status],
164
  )
165
 
166
  BiasAware.launch()
methodologies.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Gender Divide (Term Identity Diversity)": {
3
+ "description": "333",
4
+ "fx": "load_dataset_and_analyze_gender_tag"
5
+ },
6
+ "Gender Profession Bias (Lexical Evaluation)": {
7
+ "description": "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization.",
8
+ "fx": "load_dataset_and_analyze_gender_profession"
9
+ },
10
+ "GenBiT (Microsoft Responsible AI Gender Bias Tool)": {
11
+ "description": "[GenBiT](https://www.microsoft.com/en-us/research/uploads/prod/2021/10/MSJAR_Genbit_Final_Version-616fd3a073758.pdf) is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications.",
12
+ "fx": "load_dataset_and_get_genbit_metrics"
13
+ }
14
+ }
plot.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
utils/load_csv.py CHANGED
@@ -1,17 +1,16 @@
1
  import pandas as pd
2
  from utils.read_config import get_args
3
- def check_csv(upload_file):
4
- df = pd.read_csv(upload_file)
5
- return df
6
 
7
  # Function to load sample of dataset
 
 
8
  def load_sample(num_sample_records, sample_method, df, col_name):
9
 
10
  sample_first_records = get_args("first_records")
11
  sample_random_seed = get_args("random_seed")
12
 
13
  num_sample_records = num_sample_records if num_sample_records <= sample_first_records else sample_first_records
14
-
15
  # Keep only required column
16
  df = df[[col_name]]
17
  if sample_method == "First":
@@ -19,5 +18,6 @@ def load_sample(num_sample_records, sample_method, df, col_name):
19
  if sample_method == "Last":
20
  df = df.iloc[-num_sample_records:].copy().reset_index()
21
  if sample_method == "Random":
22
- df = df.sample(num_sample_records, random_state=sample_random_seed).copy().reset_index()
23
- return df
 
 
1
  import pandas as pd
2
  from utils.read_config import get_args
 
 
 
3
 
4
  # Function to load sample of dataset
5
+
6
+
7
  def load_sample(num_sample_records, sample_method, df, col_name):
8
 
9
  sample_first_records = get_args("first_records")
10
  sample_random_seed = get_args("random_seed")
11
 
12
  num_sample_records = num_sample_records if num_sample_records <= sample_first_records else sample_first_records
13
+
14
  # Keep only required column
15
  df = df[[col_name]]
16
  if sample_method == "First":
 
18
  if sample_method == "Last":
19
  df = df.iloc[-num_sample_records:].copy().reset_index()
20
  if sample_method == "Random":
21
+ df = df.sample(num_sample_records,
22
+ random_state=sample_random_seed).copy().reset_index()
23
+ return df
z_animal.csv DELETED
@@ -1,11 +0,0 @@
1
- AnimalID,CommonName,ScientificName,Class,Order,Family,Habitat,ConservationStatus
2
- 1,Lion,Panthera leo,Mammalia,Carnivora,Felidae,Savanna,Vulnerable
3
- 2,Eagle,Aquila chrysaetos,Aves,Accipitriformes,Accipitridae,Mountains,Least Concern
4
- 3,Dolphin,Tursiops truncatus,Mammalia,Cetacea,Delphinidae,Ocean,Least Concern
5
- 4,Elephant,Loxodonta africana,Mammalia,Proboscidea,Elephantidae,Grassland,Vulnerable
6
- 5,Tiger,Panthera tigris,Mammalia,Carnivora,Felidae,Forest,Endangered
7
- 6,Penguin,Spheniscidae,Aves,Sphenisciformes,Spheniscidae,Antarctica,Least Concern
8
- 7,Giraffe,Giraffa camelopardalis,Mammalia,Artiodactyla,Giraffidae,Savanna,Vulnerable
9
- 8,Cheetah,Acinonyx jubatus,Mammalia,Carnivora,Felidae,Grassland,Vulnerable
10
- 9,Panda,Ailuropoda melanoleuca,Mammalia,Carnivora,Ursidae,Forest,Endangered
11
- 10,Kangaroo,Macropus rufus,Mammalia,Diprotodontia,Macropodidae,Grassland,Least Concern
 
 
 
 
 
 
 
 
 
 
 
 
z_employee.csv DELETED
@@ -1,26 +0,0 @@
1
- EmployeeID,FirstName,LastName,Email,Department,Salary
2
- 101,John,Smith,[email protected],Finance,60000
3
- 102,Emily,Johnson,[email protected],Marketing,55000
4
- 103,Michael,Williams,[email protected],HR,50000
5
- 104,Susan,Anderson,[email protected],IT,65000
6
- 105,David,Martin,[email protected],Sales,58000
7
- 106,Linda,Davis,[email protected],Finance,62000
8
- 107,William,Miller,[email protected],Marketing,56000
9
- 108,Sarah,Anderson,[email protected],HR,51000
10
- 109,Robert,Clark,[email protected],IT,67000
11
- 110,Karen,Wilson,[email protected],Sales,59000
12
- 111,James,Brown,[email protected],Finance,61000
13
- 112,Anna,Johnson,[email protected],Marketing,57000
14
- 113,Christopher,Moore,[email protected],HR,52000
15
- 114,Laura,White,[email protected],IT,68000
16
- 115,Mark,Davis,[email protected],Sales,60000
17
- 116,Patricia,Jones,[email protected],Finance,63000
18
- 117,Matthew,Taylor,[email protected],Marketing,58000
19
- 118,Jennifer,Young,[email protected],HR,53000
20
- 119,Steven,Anderson,[email protected],IT,69000
21
- 120,Elizabeth,Thomas,[email protected],Sales,61000
22
- 121,Kevin,Harris,[email protected],Finance,64000
23
- 122,Deborah,Smith,[email protected],Marketing,59000
24
- 123,Joseph,Walker,[email protected],HR,54000
25
- 124,Cynthia,Jackson,[email protected],IT,70000
26
- 125,Daniel,Hall,[email protected],Sales,62000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
z_house.csv DELETED
@@ -1,7 +0,0 @@
1
- PropertyID,StreetAddress,City,State,ZipCode,NumberOfBedrooms,NumberOfBathrooms,SquareFootage,Price
2
- 1,123 Main St,Los Angeles,CA,90001,3,2,1800,550000
3
- 2,456 Elm St,New York,NY,10001,2,1,1200,750000
4
- 3,789 Oak St,San Francisco,CA,94101,4,3,2500,950000
5
- 4,101 Maple St,Boston,MA,02101,3,2.5,2000,680000
6
- 5,202 Pine St,Miami,FL,33101,4,3.5,2700,820000
7
- 6,303 Cedar St,Chicago,IL,60601,2,1,1100,450000