freyam committed on
Commit
38ba037
·
1 Parent(s): 6fc6046

Integrate the IO and Setup pipeline

Browse files
Files changed (7) hide show
  1. app.py +103 -48
  2. sample1.csv +0 -21
  3. sample2.csv +0 -21
  4. sample3.csv +0 -21
  5. z_animal.csv +11 -0
  6. z_employee.csv +26 -0
  7. z_house.csv +7 -0
app.py CHANGED
@@ -25,11 +25,6 @@ dummy_data = [
25
  ]
26
 
27
 
28
- def display_dataset(dataset):
29
- data = pd.read_csv(dataset.name)
30
- return data
31
-
32
-
33
  def display_methodology(methodology):
34
  title = "### " + methodology
35
  description = ""
@@ -38,14 +33,25 @@ def display_methodology(methodology):
38
  description = "333"
39
  elif methodology == "Gender Label Evaluation":
40
  description = "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization."
41
- elif methodology == "Microsoft Genbit: Word Co-occurrence and Contextual Mitigation":
42
  description = "GenBiT is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications."
43
 
44
- return title, description
 
 
 
 
 
 
 
 
 
 
 
45
 
 
46
 
47
- def run_evaluation(dataset, methodology):
48
- return f"Running **{methodology}** on **{dataset.name.split('/')[-1].split('.')[0]}**", pd.DataFrame(dummy_data)
49
 
50
  if methodology == "A":
51
  run_a(dataset)
@@ -55,8 +61,49 @@ def run_evaluation(dataset, methodology):
55
  run_c(dataset)
56
 
57
 
58
- BiasAware = gr.Blocks(title="BiasAware: Dataset Bias Detection",
59
- theme="freddyaboulton/dracula_revamped")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  with BiasAware:
62
  gr.Markdown("# BiasAware: Dataset Bias Detection")
@@ -65,70 +112,78 @@ with BiasAware:
65
  )
66
 
67
  with gr.Row():
68
- with gr.Column(scale=2):
69
  gr.Markdown("## Dataset")
70
 
71
- dataset = gr.File()
72
- gr.Examples(
73
  [
74
- os.path.join(os.path.dirname(__file__), "sample1.csv"),
75
- os.path.join(os.path.dirname(__file__), "sample2.csv"),
76
- os.path.join(os.path.dirname(__file__), "sample3.csv"),
77
 
78
  ],
79
- inputs=dataset,
80
  )
81
 
82
- dataset_entries = gr.Dataframe()
 
 
 
 
 
 
 
83
 
84
  methodology = gr.Radio(
85
- [
 
 
86
  "Term Identity Diversity Analysis",
87
  "Gender Label Evaluation",
88
- "Microsoft Genbit: Word Co-occurrence and Contextual Mitigation",
89
  ],
90
- label="Methodology",
91
  )
92
 
93
- button = gr.Button("Run Evaluation")
94
 
95
- methodology_title = gr.Markdown("")
96
- methodology_description = gr.Markdown("")
97
 
98
  with gr.Column(scale=4):
99
  gr.Markdown("## Result")
100
 
101
- results_status = gr.Markdown()
102
- results = gr.DataFrame()
103
 
104
- with gr.Column(scale=1):
105
- gr.Markdown("## Leaderboard")
106
- gr.DataFrame(
107
- headers=["Dataset", "Score"],
108
- value=[
109
- ["imdb", 0.9],
110
- ["amazon_reviews_multi", 0.8],
111
- ["tweet_eval", 0.7],
112
- ],
113
- interactive=False,
114
- )
115
 
116
- dataset.change(
117
- fn=display_dataset,
118
- inputs=[dataset],
119
- outputs=[dataset_entries],
120
  )
121
 
122
  methodology.change(
123
  fn=display_methodology,
124
  inputs=[methodology],
125
- outputs=[
126
- methodology_title,
127
- methodology_description,
128
- ],
129
  )
130
 
131
- button.click(fn=run_evaluation, inputs=[
132
- dataset, methodology], outputs=[results_status, results])
 
 
 
 
133
 
134
  BiasAware.launch()
 
25
  ]
26
 
27
 
 
 
 
 
 
28
  def display_methodology(methodology):
29
  title = "### " + methodology
30
  description = ""
 
33
  description = "333"
34
  elif methodology == "Gender Label Evaluation":
35
  description = "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization."
36
+ elif methodology == "Microsoft Genbit":
37
  description = "GenBiT is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications."
38
 
39
+ return (
40
+ gr.Markdown.update(title, visible=True),
41
+ gr.Markdown.update(description, visible=True)
42
+ )
43
+
44
+
45
+ def run_evaluation(dataset_file, dataset_scope, dataset_scope_n, dataset_corpus, methodology):
46
+ status = {
47
+ "dataset": dataset_file.name,
48
+ "methodology": methodology,
49
+ "scope": dataset_scope + " " + str(dataset_scope_n),
50
+ "column": dataset_corpus.columns[0]
51
 
52
+ }
53
 
54
+ return gr.JSON.update(status, visible=True)
 
55
 
56
  if methodology == "A":
57
  run_a(dataset)
 
61
  run_c(dataset)
62
 
63
 
64
+ def process_dataset(dataset):
65
+ data = pd.read_csv(dataset.name)
66
+
67
+ columns = data.columns.tolist()
68
+ columns = [x for x in columns if data[x].dtype == "object"]
69
+
70
+ return (
71
+ gr.Radio.update(
72
+ label="Scope",
73
+ info="Determines the scope of the dataset to be analyzed",
74
+ choices=["First", "Last", "Random"],
75
+ value="First",
76
+ visible=True,
77
+ interactive=True,
78
+ ),
79
+ gr.Slider.update(
80
+ label="Number of Entries",
81
+ info=f"Determines the number of entries to be analyzed. The dataset has {data.shape[0]} entries.",
82
+ minimum=1,
83
+ maximum=data.shape[0],
84
+ value=data.shape[0] // 2,
85
+ visible=True,
86
+ interactive=True
87
+ ),
88
+ gr.Radio.update(
89
+ label="Column",
90
+ info="Determines the column to be analyzed. These are the columns with text data.",
91
+ choices=columns,
92
+ value=columns[0],
93
+ visible=True,
94
+ interactive=True,
95
+ )
96
+ )
97
+
98
+
99
+ def process_column(dataset, column):
100
+ data = pd.read_csv(dataset.name)
101
+ corpus = data[column].to_list()
102
+
103
+ return gr.Dataframe.update(value=pd.DataFrame({"Data Corpus": corpus}), max_rows=5, visible=True)
104
+
105
+
106
+ BiasAware = gr.Blocks(title="BiasAware: Dataset Bias Detection")
107
 
108
  with BiasAware:
109
  gr.Markdown("# BiasAware: Dataset Bias Detection")
 
112
  )
113
 
114
  with gr.Row():
115
+ with gr.Column(scale=1):
116
  gr.Markdown("## Dataset")
117
 
118
+ dataset_file = gr.File()
119
+ dataset_examples = gr.Examples(
120
  [
121
+ os.path.join(os.path.dirname(__file__), "z_animal.csv"),
122
+ os.path.join(os.path.dirname(__file__), "z_employee.csv"),
123
+ os.path.join(os.path.dirname(__file__), "z_house.csv"),
124
 
125
  ],
126
+ inputs=dataset_file,
127
  )
128
 
129
+ dataset_scope = gr.Radio(visible=False)
130
+ dataset_scope_n = gr.Slider(visible=False)
131
+ dataset_columns = gr.Radio(visible=False)
132
+
133
+ dataset_corpus = gr.Dataframe(visible=False)
134
+
135
+ with gr.Column(scale=1):
136
+ gr.Markdown("## Methodology")
137
 
138
  methodology = gr.Radio(
139
+ label="Methodology",
140
+ info="Determines the methodology to be used for bias detection",
141
+ choices=[
142
  "Term Identity Diversity Analysis",
143
  "Gender Label Evaluation",
144
+ "Microsoft Genbit",
145
  ],
146
+ value="Term Identity Diversity Analysis",
147
  )
148
 
149
+ evalButton = gr.Button("Run Evaluation")
150
 
151
+ methodology_title = gr.Markdown(visible=False)
152
+ methodology_description = gr.Markdown(visible=False)
153
 
154
  with gr.Column(scale=4):
155
  gr.Markdown("## Result")
156
 
157
+ result_status = gr.JSON(visible=False)
158
+ result = gr.DataFrame()
159
 
160
+ dataset_file.change(
161
+ fn=process_dataset,
162
+ inputs=[dataset_file],
163
+ outputs=[
164
+ dataset_scope,
165
+ dataset_scope_n,
166
+ dataset_columns
167
+ ]
168
+ )
 
 
169
 
170
+ dataset_columns.change(
171
+ fn=process_column,
172
+ inputs=[dataset_file, dataset_columns],
173
+ outputs=[dataset_corpus],
174
  )
175
 
176
  methodology.change(
177
  fn=display_methodology,
178
  inputs=[methodology],
179
+ outputs=[methodology_title, methodology_description],
 
 
 
180
  )
181
 
182
+ evalButton.click(
183
+ fn=run_evaluation,
184
+ inputs=[dataset_file, dataset_scope,
185
+ dataset_scope_n, dataset_corpus, methodology],
186
+ outputs=[result_status]
187
+ )
188
 
189
  BiasAware.launch()
sample1.csv DELETED
@@ -1,21 +0,0 @@
1
- id,sentence,name,number
2
- 1,"This is the first entry","John Doe",12345
3
- 2,"A sample sentence here","Jane Smith",67890
4
- 3,"Another example sentence","Robert Johnson",54321
5
- 4,"CSV data entry number four","Emily Brown",98765
6
- 5,"Fifth CSV entry","Michael Davis",13579
7
- 6,"Just a test sentence","Sarah Wilson",24680
8
- 7,"Seventh data point","David Lee",86420
9
- 8,"Eighth entry for CSV","Jessica Turner",97531
10
- 9,"Ninth item in the list","Christopher White",31415
11
- 10,"Tenth CSV record","Laura Hall",27183
12
- 11,"Eleventh entry here","Matthew Taylor",98712
13
- 12,"This is the twelfth one","Olivia Harris",12309
14
- 13,"Lucky thirteen","William Martin",56789
15
- 14,"Fourteenth CSV line","Sophia Anderson",98765
16
- 15,"Fifteenth data row","Daniel Thomas",54321
17
- 16,"Sixteenth entry in CSV","Ava Clark",13579
18
- 17,"Seventeenth CSV record","Ethan Lewis",24680
19
- 18,"Eighteenth data point","Mia Turner",86420
20
- 19,"Nineteenth CSV entry","James Brown",97531
21
- 20,"Twentieth and final entry","Grace Walker",31415
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sample2.csv DELETED
@@ -1,21 +0,0 @@
1
- id,sentence,name,number
2
- 1,"Fresh CSV data entry","Alice Johnson",12345
3
- 2,"Brand new CSV sentence","Kevin Smith",67890
4
- 3,"Another unique sentence","Linda Davis",54321
5
- 4,"CSV record number twenty-four","Brian Miller",98765
6
- 5,"A different CSV entry","Catherine Wilson",13579
7
- 6,"Random CSV data point","George Brown",24680
8
- 7,"Unique CSV entry","Susan Lee",86420
9
- 8,"New CSV data here","Richard Turner",97531
10
- 9,"CSV record twenty-nine","Patricia White",31415
11
- 10,"Another brand new entry","Michael Hall",27183
12
- 11,"Yet another new CSV entry","Elizabeth Taylor",98712
13
- 12,"Fresh CSV data point","Daniel Harris",12309
14
- 13,"A distinct CSV entry","Jennifer Martin",56789
15
- 14,"CSV line thirty-four","Robert Anderson",98765
16
- 15,"Thirty-fifth CSV record","Karen Thomas",54321
17
- 16,"New CSV data line","Anthony Clark",13579
18
- 17,"Unique seventeenth CSV record","Nancy Lewis",24680
19
- 18,"New eighteenth CSV point","Paul Turner",86420
20
- 19,"Nineteenth brand new CSV entry","Jessica Brown",97531
21
- 20,"A completely new twentieth entry","Christopher Walker",31415
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sample3.csv DELETED
@@ -1,21 +0,0 @@
1
- id,sentence,name,number
2
- 1,"New CSV entry one","Alice Johnson",12345
3
- 2,"Fresh CSV data point","Kevin Smith",67890
4
- 3,"Another unique record","Linda Davis",54321
5
- 4,"CSV record number four","Brian Miller",98765
6
- 5,"A different CSV line","Catherine Wilson",13579
7
- 6,"Random CSV entry","George Brown",24680
8
- 7,"Unique CSV record seven","Susan Lee",86420
9
- 8,"New CSV data eight","Richard Turner",97531
10
- 9,"CSV entry number nine","Patricia White",31415
11
- 10,"Another brand new record","Michael Hall",27183
12
- 11,"Unique CSV record eleven","Elizabeth Taylor",98712
13
- 12,"Fresh CSV entry twelve","Daniel Harris",12309
14
- 13,"A distinct CSV line","Jennifer Martin",56789
15
- 14,"CSV record fourteen","Robert Anderson",98765
16
- 15,"Fifteenth CSV data point","Karen Thomas",54321
17
- 16,"New CSV entry sixteen","Anthony Clark",13579
18
- 17,"Seventeenth unique CSV record","Nancy Lewis",24680
19
- 18,"New eighteenth CSV point","Paul Turner",86420
20
- 19,"Nineteenth brand new CSV record","Jessica Brown",97531
21
- 20,"A completely new twentieth CSV entry","Christopher Walker",31415
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
z_animal.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ AnimalID,CommonName,ScientificName,Class,Order,Family,Habitat,ConservationStatus
2
+ 1,Lion,Panthera leo,Mammalia,Carnivora,Felidae,Savanna,Vulnerable
3
+ 2,Eagle,Aquila chrysaetos,Aves,Accipitriformes,Accipitridae,Mountains,Least Concern
4
+ 3,Dolphin,Tursiops truncatus,Mammalia,Cetacea,Delphinidae,Ocean,Least Concern
5
+ 4,Elephant,Loxodonta africana,Mammalia,Proboscidea,Elephantidae,Grassland,Vulnerable
6
+ 5,Tiger,Panthera tigris,Mammalia,Carnivora,Felidae,Forest,Endangered
7
+ 6,Penguin,Spheniscidae,Aves,Sphenisciformes,Spheniscidae,Antarctica,Least Concern
8
+ 7,Giraffe,Giraffa camelopardalis,Mammalia,Artiodactyla,Giraffidae,Savanna,Vulnerable
9
+ 8,Cheetah,Acinonyx jubatus,Mammalia,Carnivora,Felidae,Grassland,Vulnerable
10
+ 9,Panda,Ailuropoda melanoleuca,Mammalia,Carnivora,Ursidae,Forest,Endangered
11
+ 10,Kangaroo,Macropus rufus,Mammalia,Diprotodontia,Macropodidae,Grassland,Least Concern
z_employee.csv ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ EmployeeID,FirstName,LastName,Email,Department,Salary
2
+ 101,John,Smith,[email protected],Finance,60000
3
+ 102,Emily,Johnson,[email protected],Marketing,55000
4
+ 103,Michael,Williams,[email protected],HR,50000
5
+ 104,Susan,Anderson,[email protected],IT,65000
6
+ 105,David,Martin,[email protected],Sales,58000
7
+ 106,Linda,Davis,[email protected],Finance,62000
8
+ 107,William,Miller,[email protected],Marketing,56000
9
+ 108,Sarah,Anderson,[email protected],HR,51000
10
+ 109,Robert,Clark,[email protected],IT,67000
11
+ 110,Karen,Wilson,[email protected],Sales,59000
12
+ 111,James,Brown,[email protected],Finance,61000
13
+ 112,Anna,Johnson,[email protected],Marketing,57000
14
+ 113,Christopher,Moore,[email protected],HR,52000
15
+ 114,Laura,White,[email protected],IT,68000
16
+ 115,Mark,Davis,[email protected],Sales,60000
17
+ 116,Patricia,Jones,[email protected],Finance,63000
18
+ 117,Matthew,Taylor,[email protected],Marketing,58000
19
+ 118,Jennifer,Young,[email protected],HR,53000
20
+ 119,Steven,Anderson,[email protected],IT,69000
21
+ 120,Elizabeth,Thomas,[email protected],Sales,61000
22
+ 121,Kevin,Harris,[email protected],Finance,64000
23
+ 122,Deborah,Smith,[email protected],Marketing,59000
24
+ 123,Joseph,Walker,[email protected],HR,54000
25
+ 124,Cynthia,Jackson,[email protected],IT,70000
26
+ 125,Daniel,Hall,[email protected],Sales,62000
z_house.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ PropertyID,StreetAddress,City,State,ZipCode,NumberOfBedrooms,NumberOfBathrooms,SquareFootage,Price
2
+ 1,123 Main St,Los Angeles,CA,90001,3,2,1800,550000
3
+ 2,456 Elm St,New York,NY,10001,2,1,1200,750000
4
+ 3,789 Oak St,San Francisco,CA,94101,4,3,2500,950000
5
+ 4,101 Maple St,Boston,MA,02101,3,2.5,2000,680000
6
+ 5,202 Pine St,Miami,FL,33101,4,3.5,2700,820000
7
+ 6,303 Cedar St,Chicago,IL,60601,2,1,1100,450000