Add CSV Workflow
Browse files
- app.py +47 -38
- sample1.csv +21 -0
- sample2.csv +21 -0
- sample3.csv +21 -0
app.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
|
|
3 |
|
4 |
-
|
5 |
["Category", "Value", "Percentage"],
|
6 |
["Total Reviews", 50000, None],
|
7 |
["Total Sentences", 621647, None],
|
@@ -24,25 +25,27 @@ data = [
|
|
24 |
]
|
25 |
|
26 |
|
|
|
|
|
|
|
|
|
|
|
27 |
def display_methodology(methodology):
|
28 |
-
title = methodology
|
29 |
description = ""
|
30 |
-
|
31 |
if methodology == "Term Identity Diversity Analysis":
|
32 |
-
description = "111"
|
33 |
-
details = "222"
|
34 |
-
elif methodology == "Textual Gender Label Evaluation":
|
35 |
description = "333"
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
|
41 |
-
return title, description
|
42 |
|
43 |
|
44 |
def run_evaluation(dataset, methodology):
|
45 |
-
return f"Running
|
46 |
|
47 |
if methodology == "A":
|
48 |
run_a(dataset)
|
@@ -52,53 +55,54 @@ def run_evaluation(dataset, methodology):
|
|
52 |
run_c(dataset)
|
53 |
|
54 |
|
55 |
-
|
56 |
-
|
57 |
|
58 |
-
with
|
59 |
gr.Markdown("# BiasAware: Dataset Bias Detection")
|
60 |
gr.Markdown(
|
61 |
"Natural Language Processing (NLP) training datasets often reflect the biases present in the data sources they are compiled from, leading to the **perpetuation of stereotypes, underrepresentation, and skewed perspectives in AI models**. BiasAware is designed to **identify and quantify biases present in text data**, making it an invaluable resource for data scientists, machine learning practitioners, and organizations committed to **mitigating bias in AI systems**."
|
62 |
)
|
63 |
|
64 |
with gr.Row():
|
65 |
-
with gr.Column(scale=
|
66 |
-
gr.Markdown("
|
67 |
|
68 |
-
dataset = gr.
|
69 |
gr.Examples(
|
70 |
-
|
71 |
-
|
72 |
-
|
|
|
|
|
|
|
|
|
73 |
)
|
74 |
|
|
|
|
|
75 |
methodology = gr.Radio(
|
76 |
[
|
77 |
"Term Identity Diversity Analysis",
|
78 |
-
"
|
79 |
-
"
|
80 |
],
|
81 |
label="Methodology",
|
82 |
)
|
83 |
|
84 |
button = gr.Button("Run Evaluation")
|
85 |
|
86 |
-
|
87 |
-
gr.Markdown("
|
88 |
-
|
89 |
-
with gr.Box():
|
90 |
-
methodology_title = gr.Markdown("### Title")
|
91 |
-
methodology_description = gr.Markdown("lorem ipsum")
|
92 |
|
93 |
-
|
94 |
-
|
95 |
-
outputs = gr.DataFrame(pd.DataFrame(data), headers=[
|
96 |
-
"", "Count", "Percentage"])
|
97 |
|
98 |
-
gr.
|
|
|
99 |
|
100 |
with gr.Column(scale=1):
|
101 |
-
gr.Markdown("
|
102 |
gr.DataFrame(
|
103 |
headers=["Dataset", "Score"],
|
104 |
value=[
|
@@ -109,17 +113,22 @@ with demo:
|
|
109 |
interactive=False,
|
110 |
)
|
111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
methodology.change(
|
113 |
fn=display_methodology,
|
114 |
inputs=[methodology],
|
115 |
outputs=[
|
116 |
methodology_title,
|
117 |
methodology_description,
|
118 |
-
methodology_details,
|
119 |
],
|
120 |
)
|
121 |
|
122 |
button.click(fn=run_evaluation, inputs=[
|
123 |
-
dataset, methodology], outputs=[
|
124 |
|
125 |
-
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
+
import os
|
4 |
|
5 |
+
dummy_data = [
|
6 |
["Category", "Value", "Percentage"],
|
7 |
["Total Reviews", 50000, None],
|
8 |
["Total Sentences", 621647, None],
|
|
|
25 |
]
|
26 |
|
27 |
|
28 |
+
def display_dataset(dataset):
    """Load the selected CSV file so its rows can be shown in a table.

    Args:
        dataset: The value delivered by the file component. Either an object
            exposing the file path as ``.name``, a plain path string, or
            ``None`` when no file is selected.

    Returns:
        A ``pandas.DataFrame`` with the parsed CSV contents, or an empty
        ``DataFrame`` when ``dataset`` is ``None``.
    """
    # The component value may be None (e.g. after the upload is cleared);
    # return an empty table instead of failing on ``None.name``.
    if dataset is None:
        return pd.DataFrame()

    # Accept both file wrappers (path stored in ``.name``) and bare paths.
    path = getattr(dataset, "name", dataset)
    return pd.read_csv(path)
|
31 |
+
|
32 |
+
|
33 |
def display_methodology(methodology):
    """Build the Markdown title and description for a methodology label.

    Args:
        methodology: Label of the selected methodology, or ``None`` when
            nothing is selected.

    Returns:
        A ``(title, description)`` tuple of Markdown strings. The title is
        the label prefixed with ``"### "``; the description is empty for
        labels without an entry in the lookup table. Both are empty strings
        when ``methodology`` is ``None``.
    """
    # With no selection there is nothing to render, and concatenating
    # "### " with None would raise TypeError.
    if methodology is None:
        return "", ""

    descriptions = {
        # NOTE(review): "333" looks like placeholder text — confirm.
        "Term Identity Diversity Analysis": "333",
        "Gender Label Evaluation": "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization.",
        "Microsoft Genbit: Word Co-occurrence and Contextual Mitigation": "GenBiT is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications.",
    }

    title = "### " + methodology
    return title, descriptions.get(methodology, "")
|
45 |
|
46 |
|
47 |
def run_evaluation(dataset, methodology):
|
48 |
+
return f"Running **{methodology}** on **{dataset.name.split('/')[-1].split('.')[0]}**", pd.DataFrame(dummy_data)
|
49 |
|
50 |
if methodology == "A":
|
51 |
run_a(dataset)
|
|
|
55 |
run_c(dataset)
|
56 |
|
57 |
|
58 |
+
BiasAware = gr.Blocks(title="BiasAware: Dataset Bias Detection",
|
59 |
+
theme="freddyaboulton/dracula_revamped")
|
60 |
|
61 |
+
with BiasAware:
|
62 |
gr.Markdown("# BiasAware: Dataset Bias Detection")
|
63 |
gr.Markdown(
|
64 |
"Natural Language Processing (NLP) training datasets often reflect the biases present in the data sources they are compiled from, leading to the **perpetuation of stereotypes, underrepresentation, and skewed perspectives in AI models**. BiasAware is designed to **identify and quantify biases present in text data**, making it an invaluable resource for data scientists, machine learning practitioners, and organizations committed to **mitigating bias in AI systems**."
|
65 |
)
|
66 |
|
67 |
with gr.Row():
|
68 |
+
with gr.Column(scale=2):
|
69 |
+
gr.Markdown("## Dataset")
|
70 |
|
71 |
+
dataset = gr.File()
|
72 |
gr.Examples(
|
73 |
+
[
|
74 |
+
os.path.join(os.path.dirname(__file__), "sample1.csv"),
|
75 |
+
os.path.join(os.path.dirname(__file__), "sample2.csv"),
|
76 |
+
os.path.join(os.path.dirname(__file__), "sample3.csv"),
|
77 |
+
|
78 |
+
],
|
79 |
+
inputs=dataset,
|
80 |
)
|
81 |
|
82 |
+
dataset_entries = gr.Dataframe()
|
83 |
+
|
84 |
methodology = gr.Radio(
|
85 |
[
|
86 |
"Term Identity Diversity Analysis",
|
87 |
+
"Gender Label Evaluation",
|
88 |
+
"Microsoft Genbit: Word Co-occurrence and Contextual Mitigation",
|
89 |
],
|
90 |
label="Methodology",
|
91 |
)
|
92 |
|
93 |
button = gr.Button("Run Evaluation")
|
94 |
|
95 |
+
methodology_title = gr.Markdown("")
|
96 |
+
methodology_description = gr.Markdown("")
|
|
|
|
|
|
|
|
|
97 |
|
98 |
+
with gr.Column(scale=4):
|
99 |
+
gr.Markdown("## Result")
|
|
|
|
|
100 |
|
101 |
+
results_status = gr.Markdown()
|
102 |
+
results = gr.DataFrame()
|
103 |
|
104 |
with gr.Column(scale=1):
|
105 |
+
gr.Markdown("## Leaderboard")
|
106 |
gr.DataFrame(
|
107 |
headers=["Dataset", "Score"],
|
108 |
value=[
|
|
|
113 |
interactive=False,
|
114 |
)
|
115 |
|
116 |
+
dataset.change(
|
117 |
+
fn=display_dataset,
|
118 |
+
inputs=[dataset],
|
119 |
+
outputs=[dataset_entries],
|
120 |
+
)
|
121 |
+
|
122 |
methodology.change(
|
123 |
fn=display_methodology,
|
124 |
inputs=[methodology],
|
125 |
outputs=[
|
126 |
methodology_title,
|
127 |
methodology_description,
|
|
|
128 |
],
|
129 |
)
|
130 |
|
131 |
button.click(fn=run_evaluation, inputs=[
|
132 |
+
dataset, methodology], outputs=[results_status, results])
|
133 |
|
134 |
+
BiasAware.launch()
|
sample1.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
id,sentence,name,number
|
2 |
+
1,"This is the first entry","John Doe",12345
|
3 |
+
2,"A sample sentence here","Jane Smith",67890
|
4 |
+
3,"Another example sentence","Robert Johnson",54321
|
5 |
+
4,"CSV data entry number four","Emily Brown",98765
|
6 |
+
5,"Fifth CSV entry","Michael Davis",13579
|
7 |
+
6,"Just a test sentence","Sarah Wilson",24680
|
8 |
+
7,"Seventh data point","David Lee",86420
|
9 |
+
8,"Eighth entry for CSV","Jessica Turner",97531
|
10 |
+
9,"Ninth item in the list","Christopher White",31415
|
11 |
+
10,"Tenth CSV record","Laura Hall",27183
|
12 |
+
11,"Eleventh entry here","Matthew Taylor",98712
|
13 |
+
12,"This is the twelfth one","Olivia Harris",12309
|
14 |
+
13,"Lucky thirteen","William Martin",56789
|
15 |
+
14,"Fourteenth CSV line","Sophia Anderson",98765
|
16 |
+
15,"Fifteenth data row","Daniel Thomas",54321
|
17 |
+
16,"Sixteenth entry in CSV","Ava Clark",13579
|
18 |
+
17,"Seventeenth CSV record","Ethan Lewis",24680
|
19 |
+
18,"Eighteenth data point","Mia Turner",86420
|
20 |
+
19,"Nineteenth CSV entry","James Brown",97531
|
21 |
+
20,"Twentieth and final entry","Grace Walker",31415
|
sample2.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
id,sentence,name,number
|
2 |
+
1,"Fresh CSV data entry","Alice Johnson",12345
|
3 |
+
2,"Brand new CSV sentence","Kevin Smith",67890
|
4 |
+
3,"Another unique sentence","Linda Davis",54321
|
5 |
+
4,"CSV record number twenty-four","Brian Miller",98765
|
6 |
+
5,"A different CSV entry","Catherine Wilson",13579
|
7 |
+
6,"Random CSV data point","George Brown",24680
|
8 |
+
7,"Unique CSV entry","Susan Lee",86420
|
9 |
+
8,"New CSV data here","Richard Turner",97531
|
10 |
+
9,"CSV record twenty-nine","Patricia White",31415
|
11 |
+
10,"Another brand new entry","Michael Hall",27183
|
12 |
+
11,"Yet another new CSV entry","Elizabeth Taylor",98712
|
13 |
+
12,"Fresh CSV data point","Daniel Harris",12309
|
14 |
+
13,"A distinct CSV entry","Jennifer Martin",56789
|
15 |
+
14,"CSV line thirty-four","Robert Anderson",98765
|
16 |
+
15,"Thirty-fifth CSV record","Karen Thomas",54321
|
17 |
+
16,"New CSV data line","Anthony Clark",13579
|
18 |
+
17,"Unique seventeenth CSV record","Nancy Lewis",24680
|
19 |
+
18,"New eighteenth CSV point","Paul Turner",86420
|
20 |
+
19,"Nineteenth brand new CSV entry","Jessica Brown",97531
|
21 |
+
20,"A completely new twentieth entry","Christopher Walker",31415
|
sample3.csv
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
id,sentence,name,number
|
2 |
+
1,"New CSV entry one","Alice Johnson",12345
|
3 |
+
2,"Fresh CSV data point","Kevin Smith",67890
|
4 |
+
3,"Another unique record","Linda Davis",54321
|
5 |
+
4,"CSV record number four","Brian Miller",98765
|
6 |
+
5,"A different CSV line","Catherine Wilson",13579
|
7 |
+
6,"Random CSV entry","George Brown",24680
|
8 |
+
7,"Unique CSV record seven","Susan Lee",86420
|
9 |
+
8,"New CSV data eight","Richard Turner",97531
|
10 |
+
9,"CSV entry number nine","Patricia White",31415
|
11 |
+
10,"Another brand new record","Michael Hall",27183
|
12 |
+
11,"Unique CSV record eleven","Elizabeth Taylor",98712
|
13 |
+
12,"Fresh CSV entry twelve","Daniel Harris",12309
|
14 |
+
13,"A distinct CSV line","Jennifer Martin",56789
|
15 |
+
14,"CSV record fourteen","Robert Anderson",98765
|
16 |
+
15,"Fifteenth CSV data point","Karen Thomas",54321
|
17 |
+
16,"New CSV entry sixteen","Anthony Clark",13579
|
18 |
+
17,"Seventeenth unique CSV record","Nancy Lewis",24680
|
19 |
+
18,"New eighteenth CSV point","Paul Turner",86420
|
20 |
+
19,"Nineteenth brand new CSV record","Jessica Brown",97531
|
21 |
+
20,"A completely new twentieth CSV entry","Christopher Walker",31415
|