Add HuggingFace support for dataset import
Browse files- app.py +246 -121
- requirements.txt +6 -5
app.py
CHANGED
@@ -1,121 +1,154 @@
|
|
1 |
import json
|
2 |
import gradio as gr
|
3 |
import pandas as pd
|
4 |
-
import os
|
5 |
|
6 |
from scripts.genbit import *
|
7 |
from scripts.gender_profession_bias import *
|
8 |
from scripts.gender_distribution import *
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
-
def evaluate(dataset, sampling_method, sampling_size, column, methodology):
|
23 |
try:
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
[
|
29 |
-
]
|
30 |
-
|
31 |
-
|
32 |
-
data = data.
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
data = data.sample(n=sampling_size, random_state=42)
|
37 |
-
|
38 |
-
result_df, result_plot, result_conclusion = globals()[
|
39 |
-
methodologies.get(methodology).get("fx")
|
40 |
](data)
|
41 |
|
42 |
return (
|
43 |
-
gr.Markdown
|
44 |
-
|
45 |
-
),
|
46 |
-
gr.Plot.update(result_plot, visible=True),
|
47 |
-
gr.Dataframe.update(result_df, visible=True),
|
48 |
)
|
49 |
except Exception as e:
|
|
|
50 |
return (
|
51 |
-
gr.Markdown
|
52 |
-
gr.Plot
|
53 |
-
gr.Dataframe
|
54 |
)
|
55 |
|
56 |
|
57 |
-
def
|
58 |
-
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
-
|
62 |
-
|
63 |
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
)
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
-
def update_column_metadata(dataset, column):
|
104 |
-
data = DATASET_CACHE.setdefault(dataset.name, pd.read_csv(dataset.name))
|
105 |
-
corpus = data[column].tolist()[0:5]
|
106 |
|
107 |
-
|
|
|
|
|
|
|
108 |
|
|
|
109 |
|
110 |
-
def get_methodology_metadata(methodology):
|
111 |
-
title = "## " + methodology
|
112 |
-
description = methodologies.get(methodology).get("description")
|
113 |
|
114 |
-
|
|
|
115 |
|
116 |
return (
|
117 |
-
gr.Markdown
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
)
|
120 |
|
121 |
|
@@ -132,75 +165,167 @@ with BiasAware:
|
|
132 |
|
133 |
with gr.Row():
|
134 |
with gr.Column(scale=1):
|
135 |
-
gr.Markdown("## Dataset")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
-
|
138 |
-
|
139 |
-
examples=datasets,
|
140 |
-
inputs=dataset_file,
|
141 |
-
label="Example Datasets",
|
142 |
)
|
|
|
|
|
|
|
143 |
|
144 |
dataset_sampling_method = gr.Radio(visible=False)
|
145 |
dataset_sampling_size = gr.Slider(visible=False)
|
146 |
dataset_column = gr.Radio(visible=False)
|
|
|
147 |
|
148 |
-
|
149 |
-
row_count=(5, "fixed"), col_count=(1, "fixed"), visible=False
|
150 |
-
)
|
151 |
|
152 |
-
with gr.Column(scale=
|
153 |
-
gr.Markdown("## Methodology")
|
154 |
|
155 |
methodology = gr.Radio(
|
156 |
label="Methodology",
|
157 |
info="Determines the methodology to be used for bias detection",
|
158 |
-
choices=
|
159 |
)
|
160 |
|
161 |
-
|
162 |
|
163 |
-
|
|
|
|
|
|
|
|
|
|
|
164 |
|
165 |
with gr.Column(scale=2):
|
166 |
-
|
167 |
|
|
|
168 |
result_plot = gr.Plot(show_label=False, container=False, visible=False)
|
169 |
result_df = gr.DataFrame(visible=False)
|
170 |
|
171 |
-
|
172 |
-
|
173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
outputs=[
|
|
|
|
|
|
|
175 |
dataset_sampling_method,
|
176 |
dataset_sampling_size,
|
177 |
dataset_column,
|
178 |
-
|
|
|
179 |
],
|
180 |
)
|
181 |
|
182 |
-
|
183 |
-
fn=
|
184 |
-
|
185 |
-
|
|
|
|
|
|
|
|
|
|
|
186 |
)
|
187 |
|
188 |
-
|
189 |
-
fn=
|
190 |
-
|
191 |
-
|
|
|
|
|
|
|
|
|
|
|
192 |
)
|
193 |
|
194 |
-
|
195 |
-
fn=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
inputs=[
|
197 |
-
dataset_file,
|
198 |
dataset_sampling_method,
|
199 |
dataset_sampling_size,
|
200 |
dataset_column,
|
201 |
-
methodology,
|
202 |
],
|
203 |
-
outputs=[
|
204 |
)
|
205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
BiasAware.launch()
|
|
|
1 |
import json
|
2 |
import gradio as gr
|
3 |
import pandas as pd
|
|
|
4 |
|
5 |
from scripts.genbit import *
|
6 |
from scripts.gender_profession_bias import *
|
7 |
from scripts.gender_distribution import *
|
8 |
|
9 |
+
from datasets import load_dataset as hf_load_dataset
|
10 |
+
|
11 |
+
# Upper bound on the number of rows offered for analysis; used to cap the
# sampling-size slider built in load_dataset().
MAX_THRESHOLD = 1000

# Methodology registry, loaded once at import time. Other code in this file
# reads the "fx" and "description" keys of each entry. The file is opened in a
# context manager so the handle is closed deterministically, and decoded as
# UTF-8 explicitly rather than with the platform default encoding.
with open("config/methodologies.json", "r", encoding="utf-8") as _methodologies_file:
    METHODOLOGIES = json.load(_methodologies_file)

# Module-level record of the current dataset selection. It is filled in
# incrementally by load_dataset(), import_dataset() and import_methodology(),
# and consumed by evaluate().
# NOTE(review): being module-level, this state is shared by everything running
# in the same process - confirm that is acceptable for concurrent users.
DATASET = {
    "name": None,
    "source": None,
    "df": None,
    "sampling_method": None,
    "sampling_size": None,
    "column": None,
    "methodology": None,
}
|
23 |
+
|
24 |
+
|
25 |
+
def evaluate():
    """Run the selected methodology on the imported dataset column.

    All inputs are read from the module-level ``DATASET`` record (data frame,
    column, sampling method/size and methodology name). The methodology
    function is looked up by the name stored under the ``"fx"`` key of its
    ``METHODOLOGIES`` entry and called with the sampled single-column frame.

    Returns:
        A 3-tuple of Gradio components ``(Markdown, Plot, Dataframe)``.
        On success they are visible and carry the methodology's description,
        plot and data frame; on any failure the error is printed and all
        three are returned hidden.
    """
    print(
        f"Dataset : {DATASET['name']}\n"
        f"Source : {DATASET['source']}\n"
        f"Sampling Method : {DATASET['sampling_method']}\n"
        f"Sampling Size : {DATASET['sampling_size']}\n"
        f"Column : {DATASET['column']}\n"
        f"Methodology : {DATASET['methodology']}\n"
    )

    try:
        # Select the column first so only that column is copied, not the
        # whole data frame.
        data = DATASET["df"][[DATASET["column"]]].copy()

        sampling_method = DATASET["sampling_method"]
        sampling_size = DATASET["sampling_size"]
        if sampling_method == "First":
            data = data.head(sampling_size)
        elif sampling_method == "Last":
            data = data.tail(sampling_size)
        elif sampling_method == "Random":
            # Fixed seed keeps repeated evaluations of the same data stable.
            data = data.sample(n=sampling_size, random_state=42)

        # The methodology functions are resolved by name from this module's
        # globals (presumably provided by the `from scripts.* import *` lines).
        methodology_fn = globals()[
            METHODOLOGIES.get(DATASET["methodology"]).get("fx")
        ]
        result_df, result_plot, result_description = methodology_fn(data)

        return (
            # Show the description produced by the methodology instead of a
            # hard-coded placeholder.
            gr.Markdown(
                f"### Result Summary\n\n{result_description}", visible=True
            ),
            gr.Plot(result_plot, visible=True),
            gr.Dataframe(result_df, visible=True),
        )
    except Exception as e:
        # Top-level UI boundary: report the problem and hide the result
        # widgets rather than propagating the exception.
        print(e)
        return (
            gr.Markdown(visible=False),
            gr.Plot(visible=False),
            gr.Dataframe(visible=False),
        )
|
62 |
|
63 |
|
64 |
+
def load_dataset(local_dataset, hf_dataset):
    """Load a dataset and build the controls used to configure its import.

    A truthy ``local_dataset`` takes precedence and is read as CSV from its
    ``.name`` path; otherwise ``hf_dataset`` is passed to ``hf_load_dataset``
    and the ``train[0:100]`` slice is converted to a pandas data frame.
    The module-level ``DATASET`` record (``name``, ``source``, ``df``) is
    updated only after the data has been loaded and validated, so a failed
    load does not leave it half-updated.

    Args:
        local_dataset: Uploaded file object exposing a ``.name`` path, or a
            falsy value when no local file is selected.
        hf_dataset: Dataset identifier passed to ``hf_load_dataset``.

    Returns:
        A 5-tuple of Gradio components: sampling-method radio, sampling-size
        slider, column radio, column preview data frame and import button.

    Raises:
        gr.Error: If the loaded data has no object-dtype (text) columns.
    """
    from pathlib import Path

    if local_dataset:
        # File name without directory or final extension, independent of the
        # path separator in use.
        name = Path(local_dataset.name).stem
        source = "Local Dataset"
        df = pd.read_csv(local_dataset.name)
    else:
        name = hf_dataset
        source = "HuggingFace Hub"
        df = hf_load_dataset(hf_dataset, split="train[0:100]").to_pandas()

    # Only object-dtype columns are offered for analysis.
    columns = df.select_dtypes(include=["object"]).columns.tolist()
    if not columns:
        # Previously this surfaced as a bare IndexError on columns[0].
        raise gr.Error("The dataset has no text columns to analyze.")

    # Commit the shared state only once the load is known to be usable.
    DATASET["name"] = name
    DATASET["source"] = source
    DATASET["df"] = df

    first_column = columns[0]
    column_corpus = df[first_column].tolist()[:5]
    max_entries = min(df.shape[0], MAX_THRESHOLD)

    dataset_sampling_method = gr.Radio(
        label="Scope",
        info="Determines the scope of the dataset to be analyzed",
        choices=["First", "Last", "Random"],
        value="First",
        visible=True,
        interactive=True,
    )

    dataset_sampling_size = gr.Slider(
        label="Number of Entries",
        info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {MAX_THRESHOLD}.",
        minimum=1,
        maximum=max_entries,
        value=max_entries,
        visible=True,
        interactive=True,
    )

    dataset_column = gr.Radio(
        label="Column",
        info="Determines the column to be analyzed. These are the columns with text data.",
        choices=columns,
        value=first_column,
        visible=True,
        interactive=True,
    )

    # Preview of the first five values of the default column.
    dataset_column_corpus = gr.Dataframe(
        value=pd.DataFrame({str(first_column): column_corpus}), visible=True
    )

    dataset_import_btn = gr.Button(
        value="Import",
        interactive=True,
        variant="primary",
        visible=True,
    )

    return (
        dataset_sampling_method,
        dataset_sampling_size,
        dataset_column,
        dataset_column_corpus,
        dataset_import_btn,
    )
|
124 |
|
|
|
|
|
|
|
125 |
|
126 |
+
def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
    """Store the chosen sampling options and return the dataset heading.

    Writes the three arguments into the module-level ``DATASET`` record under
    ``"sampling_method"``, ``"sampling_size"`` and ``"column"``.

    Returns:
        A ``gr.Markdown`` heading that includes ``DATASET['name']``.
    """
    DATASET.update(
        sampling_method=dataset_sampling_method,
        sampling_size=dataset_sampling_size,
        column=dataset_column,
    )

    heading = f"## Dataset (`{DATASET['name']}`)"
    return gr.Markdown(heading)
|
132 |
|
|
|
|
|
|
|
133 |
|
134 |
+
def import_methodology(methodology):
    """Record the selected methodology and build the widgets describing it.

    Stores ``methodology`` in the module-level ``DATASET`` record, then
    returns a title, the description read from ``METHODOLOGIES`` and an
    enabled "Evaluate" button.

    Args:
        methodology: Key into ``METHODOLOGIES``.

    Returns:
        A 3-tuple ``(gr.Markdown, gr.Markdown, gr.Button)``.
    """
    DATASET["methodology"] = methodology

    title_md = gr.Markdown(f"## Methodology (`{methodology}`)", visible=True)
    description_md = gr.Markdown(
        METHODOLOGIES[methodology]["description"], visible=True
    )
    evaluate_btn = gr.Button(
        value="Evaluate", interactive=True, variant="primary", visible=True
    )

    return title_md, description_md, evaluate_btn
|
153 |
|
154 |
|
|
|
165 |
|
166 |
with gr.Row():
|
167 |
with gr.Column(scale=1):
|
168 |
+
dataset_title = gr.Markdown("## Dataset")
|
169 |
+
|
170 |
+
dataset_import_type = gr.Radio(
|
171 |
+
label="Import Type",
|
172 |
+
info="Determines the mode of importing the dataset",
|
173 |
+
choices=["Local Dataset", "HuggingFace Hub"],
|
174 |
+
value="Local Dataset",
|
175 |
+
)
|
176 |
|
177 |
+
local_dataset = gr.File(
|
178 |
+
label="Dataset", file_types=["csv"], value=None, visible=True
|
|
|
|
|
|
|
179 |
)
|
180 |
+
hf_dataset = gr.Textbox(visible=False)
|
181 |
+
|
182 |
+
dataset_load_btn = gr.Button(visible=False)
|
183 |
|
184 |
dataset_sampling_method = gr.Radio(visible=False)
|
185 |
dataset_sampling_size = gr.Slider(visible=False)
|
186 |
dataset_column = gr.Radio(visible=False)
|
187 |
+
dataset_column_corpus = gr.Dataframe(visible=False)
|
188 |
|
189 |
+
dataset_import_btn = gr.Button(visible=False)
|
|
|
|
|
190 |
|
191 |
+
with gr.Column(scale=2):
|
192 |
+
methodology_title = gr.Markdown("## Methodology")
|
193 |
|
194 |
methodology = gr.Radio(
|
195 |
label="Methodology",
|
196 |
info="Determines the methodology to be used for bias detection",
|
197 |
+
choices=METHODOLOGIES.keys(),
|
198 |
)
|
199 |
|
200 |
+
methodology_description = gr.Markdown(visible=False)
|
201 |
|
202 |
+
evaluation_btn = gr.Button(
|
203 |
+
value="Evaluate",
|
204 |
+
interactive=False,
|
205 |
+
variant="primary",
|
206 |
+
visible=True,
|
207 |
+
)
|
208 |
|
209 |
with gr.Column(scale=2):
|
210 |
+
result_title = gr.Markdown("## Results")
|
211 |
|
212 |
+
result_description = gr.Markdown(visible=False)
|
213 |
result_plot = gr.Plot(show_label=False, container=False, visible=False)
|
214 |
result_df = gr.DataFrame(visible=False)
|
215 |
|
216 |
+
submit_to_avid_btn = gr.Button(
|
217 |
+
value="Submit to AVID",
|
218 |
+
interactive=False,
|
219 |
+
variant="primary",
|
220 |
+
)
|
221 |
+
|
222 |
+
#
|
223 |
+
# Event Handlers
|
224 |
+
#
|
225 |
+
# Switch between the two import widgets and hide every downstream control.
# Each output receives an update of its own component type, and the widget
# that is being hidden is also cleared (value=None). Clearing the hidden file
# matters because load_dataset() prefers `local_dataset` whenever it is set:
# without it, a previously uploaded CSV would still be loaded after the user
# switches to "HuggingFace Hub".
dataset_import_type.input(
    fn=lambda import_type: (
        # local_dataset
        gr.File(label="Dataset", file_types=["csv"], value=None, visible=True)
        if import_type == "Local Dataset"
        else gr.File(value=None, visible=False),
        # hf_dataset
        gr.Textbox(
            label="HuggingFace Hub",
            placeholder="Search for a dataset",
            value=None,
            interactive=True,
            visible=True,
        )
        if import_type == "HuggingFace Hub"
        else gr.Textbox(value=None, visible=False),
        # dataset_load_btn
        gr.Button(visible=False),
        # dataset_sampling_method
        gr.Radio(visible=False),
        # dataset_sampling_size
        gr.Slider(visible=False),
        # dataset_column
        gr.Radio(visible=False),
        # dataset_column_corpus
        gr.Dataframe(visible=False),
        # dataset_import_btn
        gr.Button(visible=False),
    ),
    inputs=[dataset_import_type],
    outputs=[
        local_dataset,
        hf_dataset,
        dataset_load_btn,
        dataset_sampling_method,
        dataset_sampling_size,
        dataset_column,
        dataset_column_corpus,
        dataset_import_btn,
    ],
)
|
258 |
|
259 |
+
local_dataset.upload(
|
260 |
+
fn=lambda _: gr.Button(
|
261 |
+
value=f"Load",
|
262 |
+
interactive=True,
|
263 |
+
variant="secondary",
|
264 |
+
visible=True,
|
265 |
+
),
|
266 |
+
inputs=[local_dataset],
|
267 |
+
outputs=[dataset_load_btn],
|
268 |
)
|
269 |
|
270 |
+
hf_dataset.submit(
|
271 |
+
fn=lambda _: gr.Button(
|
272 |
+
value=f"Load",
|
273 |
+
interactive=True,
|
274 |
+
variant="secondary",
|
275 |
+
visible=True,
|
276 |
+
),
|
277 |
+
inputs=[hf_dataset],
|
278 |
+
outputs=[dataset_load_btn],
|
279 |
)
|
280 |
|
281 |
+
dataset_load_btn.click(
|
282 |
+
fn=load_dataset,
|
283 |
+
inputs=[local_dataset, hf_dataset],
|
284 |
+
outputs=[
|
285 |
+
dataset_sampling_method,
|
286 |
+
dataset_sampling_size,
|
287 |
+
dataset_column,
|
288 |
+
dataset_column_corpus,
|
289 |
+
dataset_import_btn,
|
290 |
+
],
|
291 |
+
)
|
292 |
+
|
293 |
+
dataset_column.input(
|
294 |
+
fn=lambda column: gr.Dataframe(
|
295 |
+
value=pd.DataFrame(
|
296 |
+
{f"{column}": DATASET["df"][column].tolist()[:5]},
|
297 |
+
),
|
298 |
+
visible=True,
|
299 |
+
),
|
300 |
+
inputs=[dataset_column],
|
301 |
+
outputs=[dataset_column_corpus],
|
302 |
+
)
|
303 |
+
|
304 |
+
dataset_import_btn.click(
|
305 |
+
fn=import_dataset,
|
306 |
inputs=[
|
|
|
307 |
dataset_sampling_method,
|
308 |
dataset_sampling_size,
|
309 |
dataset_column,
|
|
|
310 |
],
|
311 |
+
outputs=[dataset_title],
|
312 |
)
|
313 |
|
314 |
+
methodology.input(
|
315 |
+
fn=import_methodology,
|
316 |
+
inputs=[methodology],
|
317 |
+
outputs=[methodology_title, methodology_description, evaluation_btn],
|
318 |
+
)
|
319 |
+
|
320 |
+
evaluation_btn.click(
|
321 |
+
fn=evaluate, inputs=None, outputs=[result_description, result_plot, result_df]
|
322 |
+
)
|
323 |
+
|
324 |
+
submit_to_avid_btn.click(
|
325 |
+
fn=None,
|
326 |
+
inputs=None,
|
327 |
+
outputs=None,
|
328 |
+
)
|
329 |
+
|
330 |
+
|
331 |
BiasAware.launch()
|
requirements.txt
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
-
gradio
|
2 |
-
gradio_client
|
3 |
-
numpy
|
4 |
-
pandas
|
5 |
spacy
|
6 |
genbit
|
7 |
-
plotly
|
|
|
|
1 |
+
gradio
|
2 |
+
gradio_client
|
3 |
+
numpy
|
4 |
+
pandas
|
5 |
spacy
|
6 |
genbit
|
7 |
+
plotly
|
8 |
+
datasets
|