Integrate the IO and Setup pipeline
Browse files- app.py +103 -48
- sample1.csv +0 -21
- sample2.csv +0 -21
- sample3.csv +0 -21
- z_animal.csv +11 -0
- z_employee.csv +26 -0
- z_house.csv +7 -0
app.py
CHANGED
@@ -25,11 +25,6 @@ dummy_data = [
|
|
25 |
]
|
26 |
|
27 |
|
28 |
-
def display_dataset(dataset):
|
29 |
-
data = pd.read_csv(dataset.name)
|
30 |
-
return data
|
31 |
-
|
32 |
-
|
33 |
def display_methodology(methodology):
|
34 |
title = "### " + methodology
|
35 |
description = ""
|
@@ -38,14 +33,25 @@ def display_methodology(methodology):
|
|
38 |
description = "333"
|
39 |
elif methodology == "Gender Label Evaluation":
|
40 |
description = "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization."
|
41 |
-
elif methodology == "Microsoft Genbit
|
42 |
description = "GenBiT is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications."
|
43 |
|
44 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
|
|
46 |
|
47 |
-
|
48 |
-
return f"Running **{methodology}** on **{dataset.name.split('/')[-1].split('.')[0]}**", pd.DataFrame(dummy_data)
|
49 |
|
50 |
if methodology == "A":
|
51 |
run_a(dataset)
|
@@ -55,8 +61,49 @@ def run_evaluation(dataset, methodology):
|
|
55 |
run_c(dataset)
|
56 |
|
57 |
|
58 |
-
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
with BiasAware:
|
62 |
gr.Markdown("# BiasAware: Dataset Bias Detection")
|
@@ -65,70 +112,78 @@ with BiasAware:
|
|
65 |
)
|
66 |
|
67 |
with gr.Row():
|
68 |
-
with gr.Column(scale=
|
69 |
gr.Markdown("## Dataset")
|
70 |
|
71 |
-
|
72 |
-
gr.Examples(
|
73 |
[
|
74 |
-
os.path.join(os.path.dirname(__file__), "
|
75 |
-
os.path.join(os.path.dirname(__file__), "
|
76 |
-
os.path.join(os.path.dirname(__file__), "
|
77 |
|
78 |
],
|
79 |
-
inputs=
|
80 |
)
|
81 |
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
|
84 |
methodology = gr.Radio(
|
85 |
-
|
|
|
|
|
86 |
"Term Identity Diversity Analysis",
|
87 |
"Gender Label Evaluation",
|
88 |
-
"Microsoft Genbit
|
89 |
],
|
90 |
-
|
91 |
)
|
92 |
|
93 |
-
|
94 |
|
95 |
-
methodology_title = gr.Markdown(
|
96 |
-
methodology_description = gr.Markdown(
|
97 |
|
98 |
with gr.Column(scale=4):
|
99 |
gr.Markdown("## Result")
|
100 |
|
101 |
-
|
102 |
-
|
103 |
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
interactive=False,
|
114 |
-
)
|
115 |
|
116 |
-
|
117 |
-
fn=
|
118 |
-
inputs=[
|
119 |
-
outputs=[
|
120 |
)
|
121 |
|
122 |
methodology.change(
|
123 |
fn=display_methodology,
|
124 |
inputs=[methodology],
|
125 |
-
outputs=[
|
126 |
-
methodology_title,
|
127 |
-
methodology_description,
|
128 |
-
],
|
129 |
)
|
130 |
|
131 |
-
|
132 |
-
|
|
|
|
|
|
|
|
|
133 |
|
134 |
BiasAware.launch()
|
|
|
25 |
]
|
26 |
|
27 |
|
|
|
|
|
|
|
|
|
|
|
28 |
def display_methodology(methodology):
|
29 |
title = "### " + methodology
|
30 |
description = ""
|
|
|
33 |
description = "333"
|
34 |
elif methodology == "Gender Label Evaluation":
|
35 |
description = "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization."
|
36 |
+
elif methodology == "Microsoft Genbit":
|
37 |
description = "GenBiT is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications."
|
38 |
|
39 |
+
return (
|
40 |
+
gr.Markdown.update(title, visible=True),
|
41 |
+
gr.Markdown.update(description, visible=True)
|
42 |
+
)
|
43 |
+
|
44 |
+
|
45 |
+
def run_evaluation(dataset_file, dataset_scope, dataset_scope_n, dataset_corpus, methodology):
|
46 |
+
status = {
|
47 |
+
"dataset": dataset_file.name,
|
48 |
+
"methodology": methodology,
|
49 |
+
"scope": dataset_scope + " " + str(dataset_scope_n),
|
50 |
+
"column": dataset_corpus.columns[0]
|
51 |
|
52 |
+
}
|
53 |
|
54 |
+
return gr.JSON.update(status, visible=True)
|
|
|
55 |
|
56 |
if methodology == "A":
|
57 |
run_a(dataset)
|
|
|
61 |
run_c(dataset)
|
62 |
|
63 |
|
64 |
+
def process_dataset(dataset):
    """Load the uploaded CSV and reveal the dataset configuration controls.

    Args:
        dataset: Uploaded file object whose ``name`` attribute is the path
            of a CSV file. ``None`` is tolerated (presumably what the file
            component sends when the upload is cleared -- confirm against
            the installed Gradio version).

    Returns:
        A 3-tuple of Gradio component updates, in this order: the scope
        ``Radio``, the number-of-entries ``Slider`` and the column ``Radio``.
    """
    if dataset is None:
        # No file to read: hide the controls instead of failing on
        # ``dataset.name``.
        return (
            gr.Radio.update(visible=False),
            gr.Slider.update(visible=False),
            gr.Radio.update(visible=False),
        )

    data = pd.read_csv(dataset.name)
    n_rows = data.shape[0]

    # Only columns that pandas parsed with dtype "object" are offered.
    text_columns = [
        name for name in data.columns.tolist() if data[name].dtype == "object"
    ]

    return (
        gr.Radio.update(
            label="Scope",
            info="Determines the scope of the dataset to be analyzed",
            choices=["First", "Last", "Random"],
            value="First",
            visible=True,
            interactive=True,
        ),
        gr.Slider.update(
            label="Number of Entries",
            info=f"Determines the number of entries to be analyzed. The dataset has {n_rows} entries.",
            minimum=1,
            # Keep maximum and default inside the slider range even for
            # datasets with zero or one row (n_rows // 2 would be 0).
            maximum=max(n_rows, 1),
            value=max(n_rows // 2, 1),
            visible=True,
            interactive=True,
        ),
        gr.Radio.update(
            label="Column",
            info="Determines the column to be analyzed. These are the columns with text data.",
            choices=text_columns,
            # A CSV without any text column leaves the selection empty
            # instead of raising IndexError on text_columns[0].
            value=text_columns[0] if text_columns else None,
            visible=True,
            interactive=True,
        ),
    )
|
97 |
+
|
98 |
+
|
99 |
+
def process_column(dataset, column):
    """Build the preview table for the selected column of the uploaded CSV.

    Args:
        dataset: Uploaded file object whose ``name`` attribute is the path
            of a CSV file, or ``None`` when no file is available.
        column: Name of the column to extract, or ``None`` when nothing is
            selected.

    Returns:
        A ``gr.Dataframe`` update. With a file and a column it shows a
        one-column table headed "Data Corpus"; otherwise it hides the table.
    """
    if dataset is None or column is None:
        # Without both a file and a selected column there is nothing to
        # preview; hide the table rather than raising on ``dataset.name``
        # or on ``data[None]``.
        return gr.Dataframe.update(visible=False)

    data = pd.read_csv(dataset.name)
    corpus = data[column].to_list()

    return gr.Dataframe.update(
        value=pd.DataFrame({"Data Corpus": corpus}),
        max_rows=5,
        visible=True,
    )
|
104 |
+
|
105 |
+
|
106 |
+
BiasAware = gr.Blocks(title="BiasAware: Dataset Bias Detection")
|
107 |
|
108 |
with BiasAware:
|
109 |
gr.Markdown("# BiasAware: Dataset Bias Detection")
|
|
|
112 |
)
|
113 |
|
114 |
with gr.Row():
|
115 |
+
with gr.Column(scale=1):
|
116 |
gr.Markdown("## Dataset")
|
117 |
|
118 |
+
dataset_file = gr.File()
|
119 |
+
dataset_examples = gr.Examples(
|
120 |
[
|
121 |
+
os.path.join(os.path.dirname(__file__), "z_animal.csv"),
|
122 |
+
os.path.join(os.path.dirname(__file__), "z_employee.csv"),
|
123 |
+
os.path.join(os.path.dirname(__file__), "z_house.csv"),
|
124 |
|
125 |
],
|
126 |
+
inputs=dataset_file,
|
127 |
)
|
128 |
|
129 |
+
dataset_scope = gr.Radio(visible=False)
|
130 |
+
dataset_scope_n = gr.Slider(visible=False)
|
131 |
+
dataset_columns = gr.Radio(visible=False)
|
132 |
+
|
133 |
+
dataset_corpus = gr.Dataframe(visible=False)
|
134 |
+
|
135 |
+
with gr.Column(scale=1):
|
136 |
+
gr.Markdown("## Methodology")
|
137 |
|
138 |
methodology = gr.Radio(
|
139 |
+
label="Methodology",
|
140 |
+
info="Determines the methodology to be used for bias detection",
|
141 |
+
choices=[
|
142 |
"Term Identity Diversity Analysis",
|
143 |
"Gender Label Evaluation",
|
144 |
+
"Microsoft Genbit",
|
145 |
],
|
146 |
+
value="Term Identity Diversity Analysis",
|
147 |
)
|
148 |
|
149 |
+
evalButton = gr.Button("Run Evaluation")
|
150 |
|
151 |
+
methodology_title = gr.Markdown(visible=False)
|
152 |
+
methodology_description = gr.Markdown(visible=False)
|
153 |
|
154 |
with gr.Column(scale=4):
|
155 |
gr.Markdown("## Result")
|
156 |
|
157 |
+
result_status = gr.JSON(visible=False)
|
158 |
+
result = gr.DataFrame()
|
159 |
|
160 |
+
dataset_file.change(
|
161 |
+
fn=process_dataset,
|
162 |
+
inputs=[dataset_file],
|
163 |
+
outputs=[
|
164 |
+
dataset_scope,
|
165 |
+
dataset_scope_n,
|
166 |
+
dataset_columns
|
167 |
+
]
|
168 |
+
)
|
|
|
|
|
169 |
|
170 |
+
dataset_columns.change(
|
171 |
+
fn=process_column,
|
172 |
+
inputs=[dataset_file, dataset_columns],
|
173 |
+
outputs=[dataset_corpus],
|
174 |
)
|
175 |
|
176 |
methodology.change(
|
177 |
fn=display_methodology,
|
178 |
inputs=[methodology],
|
179 |
+
outputs=[methodology_title, methodology_description],
|
|
|
|
|
|
|
180 |
)
|
181 |
|
182 |
+
evalButton.click(
|
183 |
+
fn=run_evaluation,
|
184 |
+
inputs=[dataset_file, dataset_scope,
|
185 |
+
dataset_scope_n, dataset_corpus, methodology],
|
186 |
+
outputs=[result_status]
|
187 |
+
)
|
188 |
|
189 |
BiasAware.launch()
|
sample1.csv
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
id,sentence,name,number
|
2 |
-
1,"This is the first entry","John Doe",12345
|
3 |
-
2,"A sample sentence here","Jane Smith",67890
|
4 |
-
3,"Another example sentence","Robert Johnson",54321
|
5 |
-
4,"CSV data entry number four","Emily Brown",98765
|
6 |
-
5,"Fifth CSV entry","Michael Davis",13579
|
7 |
-
6,"Just a test sentence","Sarah Wilson",24680
|
8 |
-
7,"Seventh data point","David Lee",86420
|
9 |
-
8,"Eighth entry for CSV","Jessica Turner",97531
|
10 |
-
9,"Ninth item in the list","Christopher White",31415
|
11 |
-
10,"Tenth CSV record","Laura Hall",27183
|
12 |
-
11,"Eleventh entry here","Matthew Taylor",98712
|
13 |
-
12,"This is the twelfth one","Olivia Harris",12309
|
14 |
-
13,"Lucky thirteen","William Martin",56789
|
15 |
-
14,"Fourteenth CSV line","Sophia Anderson",98765
|
16 |
-
15,"Fifteenth data row","Daniel Thomas",54321
|
17 |
-
16,"Sixteenth entry in CSV","Ava Clark",13579
|
18 |
-
17,"Seventeenth CSV record","Ethan Lewis",24680
|
19 |
-
18,"Eighteenth data point","Mia Turner",86420
|
20 |
-
19,"Nineteenth CSV entry","James Brown",97531
|
21 |
-
20,"Twentieth and final entry","Grace Walker",31415
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sample2.csv
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
id,sentence,name,number
|
2 |
-
1,"Fresh CSV data entry","Alice Johnson",12345
|
3 |
-
2,"Brand new CSV sentence","Kevin Smith",67890
|
4 |
-
3,"Another unique sentence","Linda Davis",54321
|
5 |
-
4,"CSV record number twenty-four","Brian Miller",98765
|
6 |
-
5,"A different CSV entry","Catherine Wilson",13579
|
7 |
-
6,"Random CSV data point","George Brown",24680
|
8 |
-
7,"Unique CSV entry","Susan Lee",86420
|
9 |
-
8,"New CSV data here","Richard Turner",97531
|
10 |
-
9,"CSV record twenty-nine","Patricia White",31415
|
11 |
-
10,"Another brand new entry","Michael Hall",27183
|
12 |
-
11,"Yet another new CSV entry","Elizabeth Taylor",98712
|
13 |
-
12,"Fresh CSV data point","Daniel Harris",12309
|
14 |
-
13,"A distinct CSV entry","Jennifer Martin",56789
|
15 |
-
14,"CSV line thirty-four","Robert Anderson",98765
|
16 |
-
15,"Thirty-fifth CSV record","Karen Thomas",54321
|
17 |
-
16,"New CSV data line","Anthony Clark",13579
|
18 |
-
17,"Unique seventeenth CSV record","Nancy Lewis",24680
|
19 |
-
18,"New eighteenth CSV point","Paul Turner",86420
|
20 |
-
19,"Nineteenth brand new CSV entry","Jessica Brown",97531
|
21 |
-
20,"A completely new twentieth entry","Christopher Walker",31415
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sample3.csv
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
id,sentence,name,number
|
2 |
-
1,"New CSV entry one","Alice Johnson",12345
|
3 |
-
2,"Fresh CSV data point","Kevin Smith",67890
|
4 |
-
3,"Another unique record","Linda Davis",54321
|
5 |
-
4,"CSV record number four","Brian Miller",98765
|
6 |
-
5,"A different CSV line","Catherine Wilson",13579
|
7 |
-
6,"Random CSV entry","George Brown",24680
|
8 |
-
7,"Unique CSV record seven","Susan Lee",86420
|
9 |
-
8,"New CSV data eight","Richard Turner",97531
|
10 |
-
9,"CSV entry number nine","Patricia White",31415
|
11 |
-
10,"Another brand new record","Michael Hall",27183
|
12 |
-
11,"Unique CSV record eleven","Elizabeth Taylor",98712
|
13 |
-
12,"Fresh CSV entry twelve","Daniel Harris",12309
|
14 |
-
13,"A distinct CSV line","Jennifer Martin",56789
|
15 |
-
14,"CSV record fourteen","Robert Anderson",98765
|
16 |
-
15,"Fifteenth CSV data point","Karen Thomas",54321
|
17 |
-
16,"New CSV entry sixteen","Anthony Clark",13579
|
18 |
-
17,"Seventeenth unique CSV record","Nancy Lewis",24680
|
19 |
-
18,"New eighteenth CSV point","Paul Turner",86420
|
20 |
-
19,"Nineteenth brand new CSV record","Jessica Brown",97531
|
21 |
-
20,"A completely new twentieth CSV entry","Christopher Walker",31415
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
z_animal.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AnimalID,CommonName,ScientificName,Class,Order,Family,Habitat,ConservationStatus
|
2 |
+
1,Lion,Panthera leo,Mammalia,Carnivora,Felidae,Savanna,Vulnerable
|
3 |
+
2,Eagle,Aquila chrysaetos,Aves,Accipitriformes,Accipitridae,Mountains,Least Concern
|
4 |
+
3,Dolphin,Tursiops truncatus,Mammalia,Cetacea,Delphinidae,Ocean,Least Concern
|
5 |
+
4,Elephant,Loxodonta africana,Mammalia,Proboscidea,Elephantidae,Grassland,Vulnerable
|
6 |
+
5,Tiger,Panthera tigris,Mammalia,Carnivora,Felidae,Forest,Endangered
|
7 |
+
6,Penguin,Spheniscidae,Aves,Sphenisciformes,Spheniscidae,Antarctica,Least Concern
|
8 |
+
7,Giraffe,Giraffa camelopardalis,Mammalia,Artiodactyla,Giraffidae,Savanna,Vulnerable
|
9 |
+
8,Cheetah,Acinonyx jubatus,Mammalia,Carnivora,Felidae,Grassland,Vulnerable
|
10 |
+
9,Panda,Ailuropoda melanoleuca,Mammalia,Carnivora,Ursidae,Forest,Endangered
|
11 |
+
10,Kangaroo,Macropus rufus,Mammalia,Diprotodontia,Macropodidae,Grassland,Least Concern
|
z_employee.csv
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
EmployeeID,FirstName,LastName,Email,Department,Salary
|
2 |
+
101,John,Smith,[email protected],Finance,60000
|
3 |
+
102,Emily,Johnson,[email protected],Marketing,55000
|
4 |
+
103,Michael,Williams,[email protected],HR,50000
|
5 |
+
104,Susan,Anderson,[email protected],IT,65000
|
6 |
+
105,David,Martin,[email protected],Sales,58000
|
7 |
+
106,Linda,Davis,[email protected],Finance,62000
|
8 |
+
107,William,Miller,[email protected],Marketing,56000
|
9 |
+
108,Sarah,Anderson,[email protected],HR,51000
|
10 |
+
109,Robert,Clark,[email protected],IT,67000
|
11 |
+
110,Karen,Wilson,[email protected],Sales,59000
|
12 |
+
111,James,Brown,[email protected],Finance,61000
|
13 |
+
112,Anna,Johnson,[email protected],Marketing,57000
|
14 |
+
113,Christopher,Moore,[email protected],HR,52000
|
15 |
+
114,Laura,White,[email protected],IT,68000
|
16 |
+
115,Mark,Davis,[email protected],Sales,60000
|
17 |
+
116,Patricia,Jones,[email protected],Finance,63000
|
18 |
+
117,Matthew,Taylor,[email protected],Marketing,58000
|
19 |
+
118,Jennifer,Young,[email protected],HR,53000
|
20 |
+
119,Steven,Anderson,[email protected],IT,69000
|
21 |
+
120,Elizabeth,Thomas,[email protected],Sales,61000
|
22 |
+
121,Kevin,Harris,[email protected],Finance,64000
|
23 |
+
122,Deborah,Smith,[email protected],Marketing,59000
|
24 |
+
123,Joseph,Walker,[email protected],HR,54000
|
25 |
+
124,Cynthia,Jackson,[email protected],IT,70000
|
26 |
+
125,Daniel,Hall,[email protected],Sales,62000
|
z_house.csv
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
PropertyID,StreetAddress,City,State,ZipCode,NumberOfBedrooms,NumberOfBathrooms,SquareFootage,Price
|
2 |
+
1,123 Main St,Los Angeles,CA,90001,3,2,1800,550000
|
3 |
+
2,456 Elm St,New York,NY,10001,2,1,1200,750000
|
4 |
+
3,789 Oak St,San Francisco,CA,94101,4,3,2500,950000
|
5 |
+
4,101 Maple St,Boston,MA,02101,3,2.5,2000,680000
|
6 |
+
5,202 Pine St,Miami,FL,33101,4,3.5,2700,820000
|
7 |
+
6,303 Cedar St,Chicago,IL,60601,2,1,1100,450000
|