Add Gender-Profession and GenBit plots
Browse files
- app.py +35 -13
- scripts/genbit.py +54 -1
- scripts/gender_profession_bias.py +48 -16
app.py
CHANGED
@@ -7,6 +7,7 @@ from scripts.gender_profession_bias import *
|
|
7 |
from scripts.gender_distribution import *
|
8 |
|
9 |
from datasets import load_dataset as hf_load_dataset
|
|
|
10 |
|
11 |
MAX_THRESHOLD = 1000
|
12 |
METHODOLOGIES = json.load(open("config/methodologies.json", "r"))
|
@@ -123,6 +124,32 @@ def load_dataset(local_dataset, hf_dataset):
|
|
123 |
)
|
124 |
|
125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
|
127 |
DATASET["sampling_method"] = dataset_sampling_method
|
128 |
DATASET["sampling_size"] = dataset_sampling_size
|
@@ -178,16 +205,16 @@ with BiasAware:
|
|
178 |
label="Dataset", file_types=["csv"], value=None, visible=True
|
179 |
)
|
180 |
hf_dataset = gr.Textbox(visible=False)
|
|
|
181 |
|
182 |
dataset_load_btn = gr.Button(visible=False)
|
|
|
183 |
|
184 |
dataset_sampling_method = gr.Radio(visible=False)
|
185 |
dataset_sampling_size = gr.Slider(visible=False)
|
186 |
dataset_column = gr.Radio(visible=False)
|
187 |
dataset_column_corpus = gr.Dataframe(visible=False)
|
188 |
|
189 |
-
dataset_import_btn = gr.Button(visible=False)
|
190 |
-
|
191 |
with gr.Column(scale=2):
|
192 |
methodology_title = gr.Markdown("## Methodology")
|
193 |
|
@@ -197,8 +224,6 @@ with BiasAware:
|
|
197 |
choices=METHODOLOGIES.keys(),
|
198 |
)
|
199 |
|
200 |
-
methodology_description = gr.Markdown(visible=False)
|
201 |
-
|
202 |
evaluation_btn = gr.Button(
|
203 |
value="Evaluate",
|
204 |
interactive=False,
|
@@ -206,6 +231,8 @@ with BiasAware:
|
|
206 |
visible=True,
|
207 |
)
|
208 |
|
|
|
|
|
209 |
with gr.Column(scale=2):
|
210 |
result_title = gr.Markdown("## Results")
|
211 |
|
@@ -230,7 +257,7 @@ with BiasAware:
|
|
230 |
gr.Textbox(
|
231 |
label="HuggingFace Hub",
|
232 |
placeholder="Search for a dataset",
|
233 |
-
value=
|
234 |
interactive=True,
|
235 |
visible=True,
|
236 |
)
|
@@ -268,19 +295,14 @@ with BiasAware:
|
|
268 |
)
|
269 |
|
270 |
hf_dataset.submit(
|
271 |
-
fn=
|
272 |
-
value=f"Load",
|
273 |
-
interactive=True,
|
274 |
-
variant="secondary",
|
275 |
-
visible=True,
|
276 |
-
),
|
277 |
inputs=[hf_dataset],
|
278 |
-
outputs=[dataset_load_btn],
|
279 |
)
|
280 |
|
281 |
dataset_load_btn.click(
|
282 |
fn=load_dataset,
|
283 |
-
inputs=[local_dataset,
|
284 |
outputs=[
|
285 |
dataset_sampling_method,
|
286 |
dataset_sampling_size,
|
|
|
7 |
from scripts.gender_distribution import *
|
8 |
|
9 |
from datasets import load_dataset as hf_load_dataset
|
10 |
+
from huggingface_hub import DatasetFilter, list_datasets
|
11 |
|
12 |
MAX_THRESHOLD = 1000
|
13 |
# Load the methodology catalogue once at import time. The file is read inside
# a `with` block so the handle is closed deterministically instead of being
# left for the garbage collector; JSON is UTF-8 by specification, so the
# encoding is stated explicitly rather than taken from the platform default.
with open("config/methodologies.json", "r", encoding="utf-8") as methodologies_file:
    METHODOLOGIES = json.load(methodologies_file)
|
|
|
124 |
)
|
125 |
|
126 |
|
127 |
+
def show_hf_dataset_search_results(hf_dataset):
    """Search the HuggingFace Hub and build the widgets that show the matches.

    Args:
        hf_dataset: Search text, passed to ``DatasetFilter`` as
            ``dataset_name``.

    Returns:
        A ``(gr.Button, gr.Radio)`` tuple: a visible, interactive "Load"
        button and a radio group listing the ids of at most 10 datasets
        matched with ``language="en"``. The first match is pre-selected;
        when the search returns nothing, the radio group has no choices and
        no selected value.
    """
    matches = list_datasets(
        filter=DatasetFilter(dataset_name=hf_dataset, language="en"), limit=10
    )
    choices = [dataset.id for dataset in matches]

    return (
        gr.Button(
            value="Load",
            interactive=True,
            variant="secondary",
            visible=True,
        ),
        gr.Radio(
            label="HuggingFace Hub Search Results",
            info="Select the dataset to be imported",
            choices=choices,
            # A search with no hits yields an empty list; indexing it
            # unconditionally would raise IndexError inside the event handler.
            value=choices[0] if choices else None,
            interactive=True,
            visible=True,
        ),
    )
|
151 |
+
|
152 |
+
|
153 |
def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
|
154 |
DATASET["sampling_method"] = dataset_sampling_method
|
155 |
DATASET["sampling_size"] = dataset_sampling_size
|
|
|
205 |
label="Dataset", file_types=["csv"], value=None, visible=True
|
206 |
)
|
207 |
hf_dataset = gr.Textbox(visible=False)
|
208 |
+
hf_dataset_search_results = gr.Radio(visible=False)
|
209 |
|
210 |
dataset_load_btn = gr.Button(visible=False)
|
211 |
+
dataset_import_btn = gr.Button(visible=False)
|
212 |
|
213 |
dataset_sampling_method = gr.Radio(visible=False)
|
214 |
dataset_sampling_size = gr.Slider(visible=False)
|
215 |
dataset_column = gr.Radio(visible=False)
|
216 |
dataset_column_corpus = gr.Dataframe(visible=False)
|
217 |
|
|
|
|
|
218 |
with gr.Column(scale=2):
|
219 |
methodology_title = gr.Markdown("## Methodology")
|
220 |
|
|
|
224 |
choices=METHODOLOGIES.keys(),
|
225 |
)
|
226 |
|
|
|
|
|
227 |
evaluation_btn = gr.Button(
|
228 |
value="Evaluate",
|
229 |
interactive=False,
|
|
|
231 |
visible=True,
|
232 |
)
|
233 |
|
234 |
+
methodology_description = gr.Markdown(visible=False)
|
235 |
+
|
236 |
with gr.Column(scale=2):
|
237 |
result_title = gr.Markdown("## Results")
|
238 |
|
|
|
257 |
gr.Textbox(
|
258 |
label="HuggingFace Hub",
|
259 |
placeholder="Search for a dataset",
|
260 |
+
value="amazon_multi",
|
261 |
interactive=True,
|
262 |
visible=True,
|
263 |
)
|
|
|
295 |
)
|
296 |
|
297 |
hf_dataset.submit(
|
298 |
+
fn=show_hf_dataset_search_results,
|
|
|
|
|
|
|
|
|
|
|
299 |
inputs=[hf_dataset],
|
300 |
+
outputs=[dataset_load_btn, hf_dataset_search_results],
|
301 |
)
|
302 |
|
303 |
dataset_load_btn.click(
|
304 |
fn=load_dataset,
|
305 |
+
inputs=[local_dataset, hf_dataset_search_results],
|
306 |
outputs=[
|
307 |
dataset_sampling_method,
|
308 |
dataset_sampling_size,
|
scripts/genbit.py
CHANGED
@@ -1,5 +1,58 @@
|
|
1 |
from genbit.genbit_metrics import GenBitMetrics
|
2 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
|
5 |
def eval_genbit(data):
|
@@ -18,7 +71,7 @@ def eval_genbit(data):
|
|
18 |
.rename(columns={"index": "Metric", 0: "Value"})
|
19 |
)
|
20 |
|
21 |
-
result_plot =
|
22 |
result_conclusion = ""
|
23 |
|
24 |
return result_df, result_plot, result_conclusion
|
|
|
1 |
from genbit.genbit_metrics import GenBitMetrics
|
2 |
import pandas as pd
|
3 |
+
import plotly.express as px
|
4 |
+
from plotly.subplots import make_subplots
|
5 |
+
|
6 |
+
|
7 |
+
def plot_genbit(result_json):
    """Build a pie chart of the gender-definition-word percentages.

    Only the female / male / non-binary percentages are plotted; the
    trans / cis percentages are not read by this function.

    Args:
        result_json: Mapping that provides the keys
            ``percentage_of_female_gender_definition_words``,
            ``percentage_of_male_gender_definition_words`` and
            ``percentage_of_non_binary_gender_definition_words``.

    Returns:
        The figure returned by ``px.pie``, with percentage and label text
        placed inside the slices.

    Raises:
        KeyError: If ``result_json`` is a dict that lacks one of the three
            keys listed above.
    """
    distribution = pd.DataFrame(
        {
            "Metric": [
                "Female Gender",
                "Male Gender",
                "Non-Binary Gender",
            ],
            "Value": [
                result_json["percentage_of_female_gender_definition_words"],
                result_json["percentage_of_male_gender_definition_words"],
                result_json["percentage_of_non_binary_gender_definition_words"],
            ],
        }
    )

    fig = px.pie(
        distribution,
        names="Metric",
        values="Value",
        title="Combined Gender Definition Words Distribution",
    )
    fig.update_traces(textposition="inside", textinfo="percent+label")

    return fig
|
56 |
|
57 |
|
58 |
def eval_genbit(data):
|
|
|
71 |
.rename(columns={"index": "Metric", 0: "Value"})
|
72 |
)
|
73 |
|
74 |
+
result_plot = plot_genbit(result_json)
|
75 |
result_conclusion = ""
|
76 |
|
77 |
return result_df, result_plot, result_conclusion
|
scripts/gender_profession_bias.py
CHANGED
@@ -13,6 +13,20 @@ nlp = English()
|
|
13 |
nlp.add_pipe("sentencizer")
|
14 |
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
def get_split_text(text):
|
17 |
doc = nlp(text)
|
18 |
sentences = [sent for sent in doc.sents]
|
@@ -71,20 +85,6 @@ def get_gender_prof_match_details(df_text):
|
|
71 |
return results
|
72 |
|
73 |
|
74 |
-
def call_multiprocessing_pool(df_text):
|
75 |
-
concurrent = 2000
|
76 |
-
pool = multiprocessing.pool.ThreadPool(processes=concurrent)
|
77 |
-
result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
|
78 |
-
pool.close()
|
79 |
-
|
80 |
-
flat_return_list = [item for sublist in result_list for item in sublist]
|
81 |
-
|
82 |
-
cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
|
83 |
-
return_df = pd.DataFrame(flat_return_list, columns=cols)
|
84 |
-
|
85 |
-
return return_df
|
86 |
-
|
87 |
-
|
88 |
def get_statistics(result):
|
89 |
stats = {
|
90 |
"both_gender_prof_match": str((result["Both Match"] == "Yes").sum()),
|
@@ -102,8 +102,40 @@ def get_statistics(result):
|
|
102 |
return stats
|
103 |
|
104 |
|
105 |
-
def get_plot(
|
106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
|
109 |
def eval_gender_profession(data):
|
|
|
13 |
nlp.add_pipe("sentencizer")
|
14 |
|
15 |
|
16 |
+
def call_multiprocessing_pool(df_text):
    """Run ``get_gender_prof_match_details`` over every text on a thread pool.

    Args:
        df_text: Iterable of inputs; each element is passed on its own to
            ``get_gender_prof_match_details``.

    Returns:
        A ``pandas.DataFrame`` with the columns "Split Text", "Male Pronoun",
        "Female Pronoun", "Profession" and "Both Match", holding the
        flattened per-text result lists in input order. An empty input
        produces an empty frame with those columns.
    """
    max_threads = 2000
    texts = list(df_text)

    # Never start more threads than there are texts to process, and never
    # zero: ThreadPool rejects a worker count below one.
    n_threads = max(1, min(max_threads, len(texts)))

    # The context manager tears the pool down even when a worker raises;
    # a bare close() after map() would be skipped in that case and leave
    # the worker threads alive.
    with multiprocessing.pool.ThreadPool(processes=n_threads) as pool:
        result_list = pool.map(get_gender_prof_match_details, texts, chunksize=1)

    flat_return_list = [item for sublist in result_list for item in sublist]

    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(flat_return_list, columns=cols)
|
28 |
+
|
29 |
+
|
30 |
def get_split_text(text):
|
31 |
doc = nlp(text)
|
32 |
sentences = [sent for sent in doc.sents]
|
|
|
85 |
return results
|
86 |
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
def get_statistics(result):
|
89 |
stats = {
|
90 |
"both_gender_prof_match": str((result["Both Match"] == "Yes").sum()),
|
|
|
102 |
return stats
|
103 |
|
104 |
|
105 |
+
def get_plot(result_json):
    """Draw the gender/profession match statistics as a pie chart.

    Args:
        result_json: Mapping whose values for the keys
            ``both_gender_prof_match``, ``count_male_pronoun``,
            ``count_female_pronoun``, ``count_male_pronoun_profession`` and
            ``count_female_pronoun_profession`` can be converted with
            ``int()``.

    Returns:
        The figure returned by ``px.pie``, one slice per statistic.
    """
    # (slice label, key in result_json) in the order the slices are listed.
    slices = (
        ("Both Gender & Profession Match", "both_gender_prof_match"),
        ("Male Pronoun", "count_male_pronoun"),
        ("Female Pronoun", "count_female_pronoun"),
        ("Male Pronoun & Profession", "count_male_pronoun_profession"),
        ("Female Pronoun & Profession", "count_female_pronoun_profession"),
    )

    # Values are converted first, in slice order, before any label is built.
    counts = [int(result_json[key]) for _, key in slices]
    data = {
        "Labels": [label for label, _ in slices],
        "Values": counts,
    }

    return px.pie(
        data,
        names="Labels",
        values="Values",
        title="Gender & Profession Match Statistics",
    )
|
139 |
|
140 |
|
141 |
def eval_gender_profession(data):
|