freyam committed on
Commit
ce366f7
·
1 Parent(s): 1869ec4

Add HuggingFace support for dataset import

Browse files
Files changed (2) hide show
  1. app.py +246 -121
  2. requirements.txt +6 -5
app.py CHANGED
@@ -1,121 +1,154 @@
1
  import json
2
  import gradio as gr
3
  import pandas as pd
4
- import os
5
 
6
  from scripts.genbit import *
7
  from scripts.gender_profession_bias import *
8
  from scripts.gender_distribution import *
9
 
10
- methodologies = json.load(open("config/methodologies.json", "r"))
11
-
12
- datasets = [
13
- os.path.join(os.path.dirname(__file__), "data", f)
14
- for f in os.listdir(os.path.join(os.path.dirname(__file__), "data"))
15
- if f.endswith(".csv")
16
- ]
17
-
18
- MAX_THRESHOLD = 5000
19
- DATASET_CACHE = {}
20
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- def evaluate(dataset, sampling_method, sampling_size, column, methodology):
23
  try:
24
- print(
25
- f"[{dataset.name.split('/')[-1]}::{column}] - {sampling_method} {sampling_size} entries using {methodology}"
26
- )
27
- data = DATASET_CACHE.setdefault(dataset.name, pd.read_csv(dataset.name))[
28
- [column]
29
- ]
30
-
31
- if sampling_method == "First":
32
- data = data.head(sampling_size)
33
- elif sampling_method == "Last":
34
- data = data.tail(sampling_size)
35
- elif sampling_method == "Random":
36
- data = data.sample(n=sampling_size, random_state=42)
37
-
38
- result_df, result_plot, result_conclusion = globals()[
39
- methodologies.get(methodology).get("fx")
40
  ](data)
41
 
42
  return (
43
- gr.Markdown.update(
44
- f"## {methodology} Results\nResult Summary", visible=True
45
- ),
46
- gr.Plot.update(result_plot, visible=True),
47
- gr.Dataframe.update(result_df, visible=True),
48
  )
49
  except Exception as e:
 
50
  return (
51
- gr.Markdown.update(visible=False),
52
- gr.Plot.update(visible=False),
53
- gr.Dataframe.update(visible=False),
54
  )
55
 
56
 
57
- def display_dataset_config(dataset):
58
- try:
59
- data = DATASET_CACHE.setdefault(dataset.name, pd.read_csv(dataset.name))
 
 
 
 
 
 
 
60
 
61
- columns = data.select_dtypes(include=["object"]).columns.tolist()
62
- corpus = data[columns[0]].tolist()[0:5]
63
 
64
- return (
65
- gr.Radio.update(
66
- label="Scope",
67
- info="Determines the scope of the dataset to be analyzed",
68
- choices=["First", "Last", "Random"],
69
- value="First",
70
- visible=True,
71
- interactive=True,
72
- ),
73
- gr.Slider.update(
74
- label=f"Number of Entries",
75
- info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {MAX_THRESHOLD}.",
76
- minimum=1,
77
- maximum=min(data.shape[0], MAX_THRESHOLD),
78
- value=min(data.shape[0], MAX_THRESHOLD),
79
- visible=True,
80
- interactive=True,
81
- ),
82
- gr.Radio.update(
83
- label="Column",
84
- info="Determines the column to be analyzed. These are the columns with text data.",
85
- choices=columns,
86
- value=columns[0],
87
- visible=True,
88
- interactive=True,
89
- ),
90
- gr.DataFrame.update(
91
- value=pd.DataFrame({f"{columns[0]}": corpus}), visible=True
92
- ),
93
- )
94
- except:
95
- return (
96
- gr.Radio.update(visible=False),
97
- gr.Slider.update(visible=False),
98
- gr.Radio.update(visible=False),
99
- gr.DataFrame.update(visible=False),
100
- )
 
101
 
 
 
 
 
 
 
 
102
 
103
- def update_column_metadata(dataset, column):
104
- data = DATASET_CACHE.setdefault(dataset.name, pd.read_csv(dataset.name))
105
- corpus = data[column].tolist()[0:5]
106
 
107
- return gr.Dataframe.update(value=pd.DataFrame({f"{column}": corpus}), visible=True)
 
 
 
108
 
 
109
 
110
- def get_methodology_metadata(methodology):
111
- title = "## " + methodology
112
- description = methodologies.get(methodology).get("description")
113
 
114
- metadata = f"{title}\n\n{description}"
 
115
 
116
  return (
117
- gr.Markdown.update(metadata, visible=True),
118
- gr.Button.update(interactive=True, visible=True),
 
 
 
 
 
 
 
 
 
 
 
 
119
  )
120
 
121
 
@@ -132,75 +165,167 @@ with BiasAware:
132
 
133
  with gr.Row():
134
  with gr.Column(scale=1):
135
- gr.Markdown("## Dataset")
 
 
 
 
 
 
 
136
 
137
- dataset_file = gr.File(label="Dataset", file_types=["csv"])
138
- dataset_examples = gr.Examples(
139
- examples=datasets,
140
- inputs=dataset_file,
141
- label="Example Datasets",
142
  )
 
 
 
143
 
144
  dataset_sampling_method = gr.Radio(visible=False)
145
  dataset_sampling_size = gr.Slider(visible=False)
146
  dataset_column = gr.Radio(visible=False)
 
147
 
148
- dataset_corpus = gr.Dataframe(
149
- row_count=(5, "fixed"), col_count=(1, "fixed"), visible=False
150
- )
151
 
152
- with gr.Column(scale=1):
153
- gr.Markdown("## Methodology")
154
 
155
  methodology = gr.Radio(
156
  label="Methodology",
157
  info="Determines the methodology to be used for bias detection",
158
- choices=methodologies.keys(),
159
  )
160
 
161
- evalButton = gr.Button(value="Run Evaluation", interactive=False)
162
 
163
- methodology_metadata = gr.Markdown(visible=False)
 
 
 
 
 
164
 
165
  with gr.Column(scale=2):
166
- result = gr.Markdown("## Result")
167
 
 
168
  result_plot = gr.Plot(show_label=False, container=False, visible=False)
169
  result_df = gr.DataFrame(visible=False)
170
 
171
- dataset_file.change(
172
- fn=display_dataset_config,
173
- inputs=[dataset_file],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  outputs=[
 
 
 
175
  dataset_sampling_method,
176
  dataset_sampling_size,
177
  dataset_column,
178
- dataset_corpus,
 
179
  ],
180
  )
181
 
182
- dataset_column.change(
183
- fn=update_column_metadata,
184
- inputs=[dataset_file, dataset_column],
185
- outputs=[dataset_corpus],
 
 
 
 
 
186
  )
187
 
188
- methodology.change(
189
- fn=get_methodology_metadata,
190
- inputs=[methodology],
191
- outputs=[methodology_metadata, evalButton],
 
 
 
 
 
192
  )
193
 
194
- evalButton.click(
195
- fn=evaluate,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  inputs=[
197
- dataset_file,
198
  dataset_sampling_method,
199
  dataset_sampling_size,
200
  dataset_column,
201
- methodology,
202
  ],
203
- outputs=[result, result_plot, result_df],
204
  )
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  BiasAware.launch()
 
1
  import json
2
  import gradio as gr
3
  import pandas as pd
 
4
 
5
  from scripts.genbit import *
6
  from scripts.gender_profession_bias import *
7
  from scripts.gender_distribution import *
8
 
9
+ from datasets import load_dataset as hf_load_dataset
10
+
11
+ MAX_THRESHOLD = 1000
12
+ METHODOLOGIES = json.load(open("config/methodologies.json", "r"))
13
+
14
+ DATASET = {
15
+ "name": None,
16
+ "source": None,
17
+ "df": None,
18
+ "sampling_method": None,
19
+ "sampling_size": None,
20
+ "column": None,
21
+ "methodology": None,
22
+ }
23
+
24
+
25
+ def evaluate():
26
+ print(
27
+ f"Dataset : {DATASET['name']}\n"
28
+ f"Source : {DATASET['source']}\n"
29
+ f"Sampling Method : {DATASET['sampling_method']}\n"
30
+ f"Sampling Size : {DATASET['sampling_size']}\n"
31
+ f"Column : {DATASET['column']}\n"
32
+ f"Methodology : {DATASET['methodology']}\n"
33
+ )
34
 
 
35
  try:
36
+ data = DATASET["df"].copy()
37
+ data = data[[DATASET["column"]]]
38
+
39
+ if DATASET["sampling_method"] == "First":
40
+ data = data.head(DATASET["sampling_size"])
41
+ elif DATASET["sampling_method"] == "Last":
42
+ data = data.tail(DATASET["sampling_size"])
43
+ elif DATASET["sampling_method"] == "Random":
44
+ data = data.sample(n=DATASET["sampling_size"], random_state=42)
45
+
46
+ result_df, result_plot, result_description = globals()[
47
+ METHODOLOGIES.get(DATASET["methodology"]).get("fx")
 
 
 
 
48
  ](data)
49
 
50
  return (
51
+ gr.Markdown(f"### Result Summary\n\nlorem ipsum", visible=True),
52
+ gr.Plot(result_plot, visible=True),
53
+ gr.Dataframe(result_df, visible=True),
 
 
54
  )
55
  except Exception as e:
56
+ print(e)
57
  return (
58
+ gr.Markdown(visible=False),
59
+ gr.Plot(visible=False),
60
+ gr.Dataframe(visible=False),
61
  )
62
 
63
 
64
+ def load_dataset(local_dataset, hf_dataset):
65
+ DATASET["name"] = (
66
+ local_dataset.name.split("/")[-1].split(".")[0] if local_dataset else hf_dataset
67
+ )
68
+ DATASET["source"] = "Local Dataset" if local_dataset else "HuggingFace Hub"
69
+ DATASET["df"] = (
70
+ pd.read_csv(local_dataset.name)
71
+ if local_dataset
72
+ else hf_load_dataset(hf_dataset, split="train[0:100]").to_pandas()
73
+ )
74
 
75
+ columns = DATASET["df"].select_dtypes(include=["object"]).columns.tolist()
76
+ column_corpus = DATASET["df"][columns[0]].tolist()[:5]
77
 
78
+ dataset_sampling_method = gr.Radio(
79
+ label="Scope",
80
+ info="Determines the scope of the dataset to be analyzed",
81
+ choices=["First", "Last", "Random"],
82
+ value="First",
83
+ visible=True,
84
+ interactive=True,
85
+ )
86
+
87
+ dataset_sampling_size = gr.Slider(
88
+ label=f"Number of Entries",
89
+ info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {MAX_THRESHOLD}.",
90
+ minimum=1,
91
+ maximum=min(DATASET["df"].shape[0], MAX_THRESHOLD),
92
+ value=min(DATASET["df"].shape[0], MAX_THRESHOLD),
93
+ visible=True,
94
+ interactive=True,
95
+ )
96
+
97
+ dataset_column = gr.Radio(
98
+ label="Column",
99
+ info="Determines the column to be analyzed. These are the columns with text data.",
100
+ choices=columns,
101
+ value=columns[0],
102
+ visible=True,
103
+ interactive=True,
104
+ )
105
+
106
+ dataset_column_corpus = gr.Dataframe(
107
+ value=pd.DataFrame({f"{columns[0]}": column_corpus}), visible=True
108
+ )
109
+
110
+ dataset_import_btn = gr.Button(
111
+ value="Import",
112
+ interactive=True,
113
+ variant="primary",
114
+ visible=True,
115
+ )
116
 
117
+ return (
118
+ dataset_sampling_method,
119
+ dataset_sampling_size,
120
+ dataset_column,
121
+ dataset_column_corpus,
122
+ dataset_import_btn,
123
+ )
124
 
 
 
 
125
 
126
+ def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
127
+ DATASET["sampling_method"] = dataset_sampling_method
128
+ DATASET["sampling_size"] = dataset_sampling_size
129
+ DATASET["column"] = dataset_column
130
 
131
+ return gr.Markdown(f"## Dataset (`{DATASET['name']}`)")
132
 
 
 
 
133
 
134
+ def import_methodology(methodology):
135
+ DATASET["methodology"] = methodology
136
 
137
  return (
138
+ gr.Markdown(
139
+ f"## Methodology (`{methodology}`)",
140
+ visible=True,
141
+ ),
142
+ gr.Markdown(
143
+ METHODOLOGIES[methodology]["description"],
144
+ visible=True,
145
+ ),
146
+ gr.Button(
147
+ value="Evaluate",
148
+ interactive=True,
149
+ variant="primary",
150
+ visible=True,
151
+ ),
152
  )
153
 
154
 
 
165
 
166
  with gr.Row():
167
  with gr.Column(scale=1):
168
+ dataset_title = gr.Markdown("## Dataset")
169
+
170
+ dataset_import_type = gr.Radio(
171
+ label="Import Type",
172
+ info="Determines the mode of importing the dataset",
173
+ choices=["Local Dataset", "HuggingFace Hub"],
174
+ value="Local Dataset",
175
+ )
176
 
177
+ local_dataset = gr.File(
178
+ label="Dataset", file_types=["csv"], value=None, visible=True
 
 
 
179
  )
180
+ hf_dataset = gr.Textbox(visible=False)
181
+
182
+ dataset_load_btn = gr.Button(visible=False)
183
 
184
  dataset_sampling_method = gr.Radio(visible=False)
185
  dataset_sampling_size = gr.Slider(visible=False)
186
  dataset_column = gr.Radio(visible=False)
187
+ dataset_column_corpus = gr.Dataframe(visible=False)
188
 
189
+ dataset_import_btn = gr.Button(visible=False)
 
 
190
 
191
+ with gr.Column(scale=2):
192
+ methodology_title = gr.Markdown("## Methodology")
193
 
194
  methodology = gr.Radio(
195
  label="Methodology",
196
  info="Determines the methodology to be used for bias detection",
197
+ choices=METHODOLOGIES.keys(),
198
  )
199
 
200
+ methodology_description = gr.Markdown(visible=False)
201
 
202
+ evaluation_btn = gr.Button(
203
+ value="Evaluate",
204
+ interactive=False,
205
+ variant="primary",
206
+ visible=True,
207
+ )
208
 
209
  with gr.Column(scale=2):
210
+ result_title = gr.Markdown("## Results")
211
 
212
+ result_description = gr.Markdown(visible=False)
213
  result_plot = gr.Plot(show_label=False, container=False, visible=False)
214
  result_df = gr.DataFrame(visible=False)
215
 
216
+ submit_to_avid_btn = gr.Button(
217
+ value="Submit to AVID",
218
+ interactive=False,
219
+ variant="primary",
220
+ )
221
+
222
+ #
223
+ # Event Handlers
224
+ #
225
+ dataset_import_type.input(
226
+ fn=lambda import_type: (
227
+ gr.File(label="Dataset", file_types=["csv"], value=None, visible=True)
228
+ if import_type == "Local Dataset"
229
+ else gr.Textbox(visible=False),
230
+ gr.Textbox(
231
+ label="HuggingFace Hub",
232
+ placeholder="Search for a dataset",
233
+ value=None,
234
+ interactive=True,
235
+ visible=True,
236
+ )
237
+ if import_type == "HuggingFace Hub"
238
+ else gr.File(value=None, visible=False),
239
+ gr.Button(visible=False),
240
+ gr.Radio(visible=False),
241
+ gr.Slider(visible=False),
242
+ gr.Radio(visible=False),
243
+ gr.Dataframe(visible=False),
244
+ gr.Button(visible=False),
245
+ ),
246
+ inputs=[dataset_import_type],
247
  outputs=[
248
+ local_dataset,
249
+ hf_dataset,
250
+ dataset_load_btn,
251
  dataset_sampling_method,
252
  dataset_sampling_size,
253
  dataset_column,
254
+ dataset_column_corpus,
255
+ dataset_import_btn,
256
  ],
257
  )
258
 
259
+ local_dataset.upload(
260
+ fn=lambda _: gr.Button(
261
+ value=f"Load",
262
+ interactive=True,
263
+ variant="secondary",
264
+ visible=True,
265
+ ),
266
+ inputs=[local_dataset],
267
+ outputs=[dataset_load_btn],
268
  )
269
 
270
+ hf_dataset.submit(
271
+ fn=lambda _: gr.Button(
272
+ value=f"Load",
273
+ interactive=True,
274
+ variant="secondary",
275
+ visible=True,
276
+ ),
277
+ inputs=[hf_dataset],
278
+ outputs=[dataset_load_btn],
279
  )
280
 
281
+ dataset_load_btn.click(
282
+ fn=load_dataset,
283
+ inputs=[local_dataset, hf_dataset],
284
+ outputs=[
285
+ dataset_sampling_method,
286
+ dataset_sampling_size,
287
+ dataset_column,
288
+ dataset_column_corpus,
289
+ dataset_import_btn,
290
+ ],
291
+ )
292
+
293
+ dataset_column.input(
294
+ fn=lambda column: gr.Dataframe(
295
+ value=pd.DataFrame(
296
+ {f"{column}": DATASET["df"][column].tolist()[:5]},
297
+ ),
298
+ visible=True,
299
+ ),
300
+ inputs=[dataset_column],
301
+ outputs=[dataset_column_corpus],
302
+ )
303
+
304
+ dataset_import_btn.click(
305
+ fn=import_dataset,
306
  inputs=[
 
307
  dataset_sampling_method,
308
  dataset_sampling_size,
309
  dataset_column,
 
310
  ],
311
+ outputs=[dataset_title],
312
  )
313
 
314
+ methodology.input(
315
+ fn=import_methodology,
316
+ inputs=[methodology],
317
+ outputs=[methodology_title, methodology_description, evaluation_btn],
318
+ )
319
+
320
+ evaluation_btn.click(
321
+ fn=evaluate, inputs=None, outputs=[result_description, result_plot, result_df]
322
+ )
323
+
324
+ submit_to_avid_btn.click(
325
+ fn=None,
326
+ inputs=None,
327
+ outputs=None,
328
+ )
329
+
330
+
331
  BiasAware.launch()
requirements.txt CHANGED
@@ -1,7 +1,8 @@
1
- gradio==3.43.2
2
- gradio_client==0.5.0
3
- numpy==1.25.2
4
- pandas==2.0.3
5
  spacy
6
  genbit
7
- plotly
 
 
1
+ gradio
2
+ gradio_client
3
+ numpy
4
+ pandas
5
  spacy
6
  genbit
7
+ plotly
8
+ datasets