freyam committed on
Commit
6fc6046
·
1 Parent(s): 5da6a2b

Add CSV Workflow

Browse files
Files changed (4) hide show
  1. app.py +47 -38
  2. sample1.csv +21 -0
  3. sample2.csv +21 -0
  4. sample3.csv +21 -0
app.py CHANGED
@@ -1,7 +1,8 @@
1
  import gradio as gr
2
  import pandas as pd
 
3
 
4
- data = [
5
  ["Category", "Value", "Percentage"],
6
  ["Total Reviews", 50000, None],
7
  ["Total Sentences", 621647, None],
@@ -24,25 +25,27 @@ data = [
24
  ]
25
 
26
 
 
 
 
 
 
27
  def display_methodology(methodology):
28
- title = methodology
29
  description = ""
30
- details = ""
31
  if methodology == "Term Identity Diversity Analysis":
32
- description = "111"
33
- details = "222"
34
- elif methodology == "Textual Gender Label Evaluation":
35
  description = "333"
36
- details = "444"
37
- elif methodology == "GenBit":
38
- description = "555"
39
- details = "666"
40
 
41
- return title, description, details
42
 
43
 
44
  def run_evaluation(dataset, methodology):
45
- return f"Running evaluation for {dataset} with {methodology}"
46
 
47
  if methodology == "A":
48
  run_a(dataset)
@@ -52,53 +55,54 @@ def run_evaluation(dataset, methodology):
52
  run_c(dataset)
53
 
54
 
55
- demo = gr.Blocks(title="BiasAware: Dataset Bias Detection",
56
- theme=gr.themes.Soft())
57
 
58
- with demo:
59
  gr.Markdown("# BiasAware: Dataset Bias Detection")
60
  gr.Markdown(
61
  "Natural Language Processing (NLP) training datasets often reflect the biases present in the data sources they are compiled from, leading to the **perpetuation of stereotypes, underrepresentation, and skewed perspectives in AI models**. BiasAware is designed to **identify and quantify biases present in text data**, making it an invaluable resource for data scientists, machine learning practitioners, and organizations committed to **mitigating bias in AI systems**."
62
  )
63
 
64
  with gr.Row():
65
- with gr.Column(scale=1):
66
- gr.Markdown("Select a dataset to analyze")
67
 
68
- dataset = gr.Text(label="Dataset")
69
  gr.Examples(
70
- examples=["imdb", "amazon_reviews_multi", "tweet_eval"],
71
- fn=run_evaluation,
72
- inputs=[dataset],
 
 
 
 
73
  )
74
 
 
 
75
  methodology = gr.Radio(
76
  [
77
  "Term Identity Diversity Analysis",
78
- "Textual Gender Label Evaluation",
79
- "GenBit",
80
  ],
81
  label="Methodology",
82
  )
83
 
84
  button = gr.Button("Run Evaluation")
85
 
86
- with gr.Column(scale=4):
87
- gr.Markdown("### Results")
88
-
89
- with gr.Box():
90
- methodology_title = gr.Markdown("### Title")
91
- methodology_description = gr.Markdown("lorem ipsum")
92
 
93
- methodology_details = gr.Markdown("lorem ipsum")
94
- # outputs = gr.Markdown()
95
- outputs = gr.DataFrame(pd.DataFrame(data), headers=[
96
- "", "Count", "Percentage"])
97
 
98
- gr.Error("No results to display")
 
99
 
100
  with gr.Column(scale=1):
101
- gr.Markdown("### Leaderboard")
102
  gr.DataFrame(
103
  headers=["Dataset", "Score"],
104
  value=[
@@ -109,17 +113,22 @@ with demo:
109
  interactive=False,
110
  )
111
 
 
 
 
 
 
 
112
  methodology.change(
113
  fn=display_methodology,
114
  inputs=[methodology],
115
  outputs=[
116
  methodology_title,
117
  methodology_description,
118
- methodology_details,
119
  ],
120
  )
121
 
122
  button.click(fn=run_evaluation, inputs=[
123
- dataset, methodology], outputs=[outputs])
124
 
125
- demo.launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import os
4
 
5
+ dummy_data = [
6
  ["Category", "Value", "Percentage"],
7
  ["Total Reviews", 50000, None],
8
  ["Total Sentences", 621647, None],
 
25
  ]
26
 
27
 
28
def display_dataset(dataset):
    """Load the uploaded CSV file into a pandas DataFrame for display.

    `dataset` is a gradio file object; its `.name` attribute holds the
    path of the uploaded temporary file on disk.
    """
    return pd.read_csv(dataset.name)
31
+
32
+
33
def display_methodology(methodology):
    """Return a markdown heading and a description for the chosen methodology.

    Unknown methodology names yield the heading plus an empty description,
    matching the fall-through of the original branch chain.
    """
    descriptions = {
        "Term Identity Diversity Analysis": "333",
        "Gender Label Evaluation": "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization.",
        "Microsoft Genbit: Word Co-occurrence and Contextual Mitigation": "GenBiT is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications.",
    }
    heading = "### " + methodology
    return heading, descriptions.get(methodology, "")
45
 
46
 
47
  def run_evaluation(dataset, methodology):
48
+ return f"Running **{methodology}** on **{dataset.name.split('/')[-1].split('.')[0]}**", pd.DataFrame(dummy_data)
49
 
50
  if methodology == "A":
51
  run_a(dataset)
 
55
  run_c(dataset)
56
 
57
 
58
+ BiasAware = gr.Blocks(title="BiasAware: Dataset Bias Detection",
59
+ theme="freddyaboulton/dracula_revamped")
60
 
61
+ with BiasAware:
62
  gr.Markdown("# BiasAware: Dataset Bias Detection")
63
  gr.Markdown(
64
  "Natural Language Processing (NLP) training datasets often reflect the biases present in the data sources they are compiled from, leading to the **perpetuation of stereotypes, underrepresentation, and skewed perspectives in AI models**. BiasAware is designed to **identify and quantify biases present in text data**, making it an invaluable resource for data scientists, machine learning practitioners, and organizations committed to **mitigating bias in AI systems**."
65
  )
66
 
67
  with gr.Row():
68
+ with gr.Column(scale=2):
69
+ gr.Markdown("## Dataset")
70
 
71
+ dataset = gr.File()
72
  gr.Examples(
73
+ [
74
+ os.path.join(os.path.dirname(__file__), "sample1.csv"),
75
+ os.path.join(os.path.dirname(__file__), "sample2.csv"),
76
+ os.path.join(os.path.dirname(__file__), "sample3.csv"),
77
+
78
+ ],
79
+ inputs=dataset,
80
  )
81
 
82
+ dataset_entries = gr.Dataframe()
83
+
84
  methodology = gr.Radio(
85
  [
86
  "Term Identity Diversity Analysis",
87
+ "Gender Label Evaluation",
88
+ "Microsoft Genbit: Word Co-occurrence and Contextual Mitigation",
89
  ],
90
  label="Methodology",
91
  )
92
 
93
  button = gr.Button("Run Evaluation")
94
 
95
+ methodology_title = gr.Markdown("")
96
+ methodology_description = gr.Markdown("")
 
 
 
 
97
 
98
+ with gr.Column(scale=4):
99
+ gr.Markdown("## Result")
 
 
100
 
101
+ results_status = gr.Markdown()
102
+ results = gr.DataFrame()
103
 
104
  with gr.Column(scale=1):
105
+ gr.Markdown("## Leaderboard")
106
  gr.DataFrame(
107
  headers=["Dataset", "Score"],
108
  value=[
 
113
  interactive=False,
114
  )
115
 
116
+ dataset.change(
117
+ fn=display_dataset,
118
+ inputs=[dataset],
119
+ outputs=[dataset_entries],
120
+ )
121
+
122
  methodology.change(
123
  fn=display_methodology,
124
  inputs=[methodology],
125
  outputs=[
126
  methodology_title,
127
  methodology_description,
 
128
  ],
129
  )
130
 
131
  button.click(fn=run_evaluation, inputs=[
132
+ dataset, methodology], outputs=[results_status, results])
133
 
134
+ BiasAware.launch()
sample1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id,sentence,name,number
2
+ 1,"This is the first entry","John Doe",12345
3
+ 2,"A sample sentence here","Jane Smith",67890
4
+ 3,"Another example sentence","Robert Johnson",54321
5
+ 4,"CSV data entry number four","Emily Brown",98765
6
+ 5,"Fifth CSV entry","Michael Davis",13579
7
+ 6,"Just a test sentence","Sarah Wilson",24680
8
+ 7,"Seventh data point","David Lee",86420
9
+ 8,"Eighth entry for CSV","Jessica Turner",97531
10
+ 9,"Ninth item in the list","Christopher White",31415
11
+ 10,"Tenth CSV record","Laura Hall",27183
12
+ 11,"Eleventh entry here","Matthew Taylor",98712
13
+ 12,"This is the twelfth one","Olivia Harris",12309
14
+ 13,"Lucky thirteen","William Martin",56789
15
+ 14,"Fourteenth CSV line","Sophia Anderson",98765
16
+ 15,"Fifteenth data row","Daniel Thomas",54321
17
+ 16,"Sixteenth entry in CSV","Ava Clark",13579
18
+ 17,"Seventeenth CSV record","Ethan Lewis",24680
19
+ 18,"Eighteenth data point","Mia Turner",86420
20
+ 19,"Nineteenth CSV entry","James Brown",97531
21
+ 20,"Twentieth and final entry","Grace Walker",31415
sample2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id,sentence,name,number
2
+ 1,"Fresh CSV data entry","Alice Johnson",12345
3
+ 2,"Brand new CSV sentence","Kevin Smith",67890
4
+ 3,"Another unique sentence","Linda Davis",54321
5
+ 4,"CSV record number twenty-four","Brian Miller",98765
6
+ 5,"A different CSV entry","Catherine Wilson",13579
7
+ 6,"Random CSV data point","George Brown",24680
8
+ 7,"Unique CSV entry","Susan Lee",86420
9
+ 8,"New CSV data here","Richard Turner",97531
10
+ 9,"CSV record twenty-nine","Patricia White",31415
11
+ 10,"Another brand new entry","Michael Hall",27183
12
+ 11,"Yet another new CSV entry","Elizabeth Taylor",98712
13
+ 12,"Fresh CSV data point","Daniel Harris",12309
14
+ 13,"A distinct CSV entry","Jennifer Martin",56789
15
+ 14,"CSV line thirty-four","Robert Anderson",98765
16
+ 15,"Thirty-fifth CSV record","Karen Thomas",54321
17
+ 16,"New CSV data line","Anthony Clark",13579
18
+ 17,"Unique seventeenth CSV record","Nancy Lewis",24680
19
+ 18,"New eighteenth CSV point","Paul Turner",86420
20
+ 19,"Nineteenth brand new CSV entry","Jessica Brown",97531
21
+ 20,"A completely new twentieth entry","Christopher Walker",31415
sample3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id,sentence,name,number
2
+ 1,"New CSV entry one","Alice Johnson",12345
3
+ 2,"Fresh CSV data point","Kevin Smith",67890
4
+ 3,"Another unique record","Linda Davis",54321
5
+ 4,"CSV record number four","Brian Miller",98765
6
+ 5,"A different CSV line","Catherine Wilson",13579
7
+ 6,"Random CSV entry","George Brown",24680
8
+ 7,"Unique CSV record seven","Susan Lee",86420
9
+ 8,"New CSV data eight","Richard Turner",97531
10
+ 9,"CSV entry number nine","Patricia White",31415
11
+ 10,"Another brand new record","Michael Hall",27183
12
+ 11,"Unique CSV record eleven","Elizabeth Taylor",98712
13
+ 12,"Fresh CSV entry twelve","Daniel Harris",12309
14
+ 13,"A distinct CSV line","Jennifer Martin",56789
15
+ 14,"CSV record fourteen","Robert Anderson",98765
16
+ 15,"Fifteenth CSV data point","Karen Thomas",54321
17
+ 16,"New CSV entry sixteen","Anthony Clark",13579
18
+ 17,"Seventeenth unique CSV record","Nancy Lewis",24680
19
+ 18,"New eighteenth CSV point","Paul Turner",86420
20
+ 19,"Nineteenth brand new CSV record","Jessica Brown",97531
21
+ 20,"A completely new twentieth CSV entry","Christopher Walker",31415