freyam committed on
Commit
d1a2df2
·
1 Parent(s): 8ed4d84

Add Gender-Profession and GenBit plots

Browse files
Files changed (3) hide show
  1. app.py +35 -13
  2. scripts/genbit.py +54 -1
  3. scripts/gender_profession_bias.py +48 -16
app.py CHANGED
@@ -7,6 +7,7 @@ from scripts.gender_profession_bias import *
7
  from scripts.gender_distribution import *
8
 
9
  from datasets import load_dataset as hf_load_dataset
 
10
 
11
  MAX_THRESHOLD = 1000
12
  METHODOLOGIES = json.load(open("config/methodologies.json", "r"))
@@ -123,6 +124,32 @@ def load_dataset(local_dataset, hf_dataset):
123
  )
124
 
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
127
  DATASET["sampling_method"] = dataset_sampling_method
128
  DATASET["sampling_size"] = dataset_sampling_size
@@ -178,16 +205,16 @@ with BiasAware:
178
  label="Dataset", file_types=["csv"], value=None, visible=True
179
  )
180
  hf_dataset = gr.Textbox(visible=False)
 
181
 
182
  dataset_load_btn = gr.Button(visible=False)
 
183
 
184
  dataset_sampling_method = gr.Radio(visible=False)
185
  dataset_sampling_size = gr.Slider(visible=False)
186
  dataset_column = gr.Radio(visible=False)
187
  dataset_column_corpus = gr.Dataframe(visible=False)
188
 
189
- dataset_import_btn = gr.Button(visible=False)
190
-
191
  with gr.Column(scale=2):
192
  methodology_title = gr.Markdown("## Methodology")
193
 
@@ -197,8 +224,6 @@ with BiasAware:
197
  choices=METHODOLOGIES.keys(),
198
  )
199
 
200
- methodology_description = gr.Markdown(visible=False)
201
-
202
  evaluation_btn = gr.Button(
203
  value="Evaluate",
204
  interactive=False,
@@ -206,6 +231,8 @@ with BiasAware:
206
  visible=True,
207
  )
208
 
 
 
209
  with gr.Column(scale=2):
210
  result_title = gr.Markdown("## Results")
211
 
@@ -230,7 +257,7 @@ with BiasAware:
230
  gr.Textbox(
231
  label="HuggingFace Hub",
232
  placeholder="Search for a dataset",
233
- value=None,
234
  interactive=True,
235
  visible=True,
236
  )
@@ -268,19 +295,14 @@ with BiasAware:
268
  )
269
 
270
  hf_dataset.submit(
271
- fn=lambda _: gr.Button(
272
- value=f"Load",
273
- interactive=True,
274
- variant="secondary",
275
- visible=True,
276
- ),
277
  inputs=[hf_dataset],
278
- outputs=[dataset_load_btn],
279
  )
280
 
281
  dataset_load_btn.click(
282
  fn=load_dataset,
283
- inputs=[local_dataset, hf_dataset],
284
  outputs=[
285
  dataset_sampling_method,
286
  dataset_sampling_size,
 
7
  from scripts.gender_distribution import *
8
 
9
  from datasets import load_dataset as hf_load_dataset
10
+ from huggingface_hub import DatasetFilter, list_datasets
11
 
12
  MAX_THRESHOLD = 1000
13
  METHODOLOGIES = json.load(open("config/methodologies.json", "r"))
 
124
  )
125
 
126
 
127
def show_hf_dataset_search_results(hf_dataset):
    """Search the HuggingFace Hub and build the widgets that list the matches.

    Args:
        hf_dataset: Search text, passed as ``dataset_name`` to the Hub filter.

    Returns:
        A ``(gr.Button, gr.Radio)`` tuple: the visible "Load" button and a
        radio group whose choices are the ids of at most 10 English-language
        datasets returned by the search. The first match is preselected; when
        the search returns nothing, no value is preselected.
    """
    choices = [
        dataset.id
        for dataset in list_datasets(
            filter=DatasetFilter(dataset_name=hf_dataset, language="en"), limit=10
        )
    ]

    # A search may return no datasets at all; indexing choices[0] would then
    # raise IndexError, so fall back to "nothing selected".
    default_choice = choices[0] if choices else None

    return (
        gr.Button(
            value="Load",
            interactive=True,
            variant="secondary",
            visible=True,
        ),
        gr.Radio(
            label="HuggingFace Hub Search Results",
            info="Select the dataset to be imported",
            choices=choices,
            value=default_choice,
            interactive=True,
            visible=True,
        ),
    )
151
+
152
+
153
  def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
154
  DATASET["sampling_method"] = dataset_sampling_method
155
  DATASET["sampling_size"] = dataset_sampling_size
 
205
  label="Dataset", file_types=["csv"], value=None, visible=True
206
  )
207
  hf_dataset = gr.Textbox(visible=False)
208
+ hf_dataset_search_results = gr.Radio(visible=False)
209
 
210
  dataset_load_btn = gr.Button(visible=False)
211
+ dataset_import_btn = gr.Button(visible=False)
212
 
213
  dataset_sampling_method = gr.Radio(visible=False)
214
  dataset_sampling_size = gr.Slider(visible=False)
215
  dataset_column = gr.Radio(visible=False)
216
  dataset_column_corpus = gr.Dataframe(visible=False)
217
 
 
 
218
  with gr.Column(scale=2):
219
  methodology_title = gr.Markdown("## Methodology")
220
 
 
224
  choices=METHODOLOGIES.keys(),
225
  )
226
 
 
 
227
  evaluation_btn = gr.Button(
228
  value="Evaluate",
229
  interactive=False,
 
231
  visible=True,
232
  )
233
 
234
+ methodology_description = gr.Markdown(visible=False)
235
+
236
  with gr.Column(scale=2):
237
  result_title = gr.Markdown("## Results")
238
 
 
257
  gr.Textbox(
258
  label="HuggingFace Hub",
259
  placeholder="Search for a dataset",
260
+ value="amazon_multi",
261
  interactive=True,
262
  visible=True,
263
  )
 
295
  )
296
 
297
  hf_dataset.submit(
298
+ fn=show_hf_dataset_search_results,
 
 
 
 
 
299
  inputs=[hf_dataset],
300
+ outputs=[dataset_load_btn, hf_dataset_search_results],
301
  )
302
 
303
  dataset_load_btn.click(
304
  fn=load_dataset,
305
+ inputs=[local_dataset, hf_dataset_search_results],
306
  outputs=[
307
  dataset_sampling_method,
308
  dataset_sampling_size,
scripts/genbit.py CHANGED
@@ -1,5 +1,58 @@
1
  from genbit.genbit_metrics import GenBitMetrics
2
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  def eval_genbit(data):
@@ -18,7 +71,7 @@ def eval_genbit(data):
18
  .rename(columns={"index": "Metric", 0: "Value"})
19
  )
20
 
21
- result_plot = None
22
  result_conclusion = ""
23
 
24
  return result_df, result_plot, result_conclusion
 
1
  from genbit.genbit_metrics import GenBitMetrics
2
  import pandas as pd
3
+ import plotly.express as px
4
+ from plotly.subplots import make_subplots
5
+
6
+
7
def plot_genbit(result_json):
    """Build a pie chart of the gender-definition-word percentages.

    Args:
        result_json: Mapping of GenBit metrics. The keys read here are
            ``percentage_of_female_gender_definition_words``,
            ``percentage_of_male_gender_definition_words`` and
            ``percentage_of_non_binary_gender_definition_words``.

    Returns:
        The Plotly Express pie figure with one slice per gender category,
        labelled inside each slice with its name and percentage.

    Raises:
        KeyError: If one of the three metrics above is missing.
    """
    # TODO: the trans/cis gender definition percentages available in
    # result_json are not plotted yet; a second pie was sketched but never
    # wired into the returned figure.
    distribution = pd.DataFrame(
        {
            "Metric": [
                "Female Gender",
                "Male Gender",
                "Non-Binary Gender",
            ],
            "Value": [
                result_json["percentage_of_female_gender_definition_words"],
                result_json["percentage_of_male_gender_definition_words"],
                result_json["percentage_of_non_binary_gender_definition_words"],
            ],
        }
    )

    fig = px.pie(
        distribution,
        names="Metric",
        values="Value",
        title="Combined Gender Definition Words Distribution",
    )
    fig.update_traces(textposition="inside", textinfo="percent+label")

    return fig
56
 
57
 
58
  def eval_genbit(data):
 
71
  .rename(columns={"index": "Metric", 0: "Value"})
72
  )
73
 
74
+ result_plot = plot_genbit(result_json)
75
  result_conclusion = ""
76
 
77
  return result_df, result_plot, result_conclusion
scripts/gender_profession_bias.py CHANGED
@@ -13,6 +13,20 @@ nlp = English()
13
  nlp.add_pipe("sentencizer")
14
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def get_split_text(text):
17
  doc = nlp(text)
18
  sentences = [sent for sent in doc.sents]
@@ -71,20 +85,6 @@ def get_gender_prof_match_details(df_text):
71
  return results
72
 
73
 
74
- def call_multiprocessing_pool(df_text):
75
- concurrent = 2000
76
- pool = multiprocessing.pool.ThreadPool(processes=concurrent)
77
- result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
78
- pool.close()
79
-
80
- flat_return_list = [item for sublist in result_list for item in sublist]
81
-
82
- cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
83
- return_df = pd.DataFrame(flat_return_list, columns=cols)
84
-
85
- return return_df
86
-
87
-
88
  def get_statistics(result):
89
  stats = {
90
  "both_gender_prof_match": str((result["Both Match"] == "Yes").sum()),
@@ -102,8 +102,40 @@ def get_statistics(result):
102
  return stats
103
 
104
 
105
- def get_plot(result_df):
106
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
 
109
  def eval_gender_profession(data):
 
13
  nlp.add_pipe("sentencizer")
14
 
15
 
16
def call_multiprocessing_pool(df_text):
    """Run ``get_gender_prof_match_details`` over every text using a thread pool.

    Args:
        df_text: Iterable of texts; each item is handed to
            ``get_gender_prof_match_details`` on its own.

    Returns:
        A DataFrame with the columns "Split Text", "Male Pronoun",
        "Female Pronoun", "Profession" and "Both Match", holding the
        flattened per-text results in input order.
    """
    max_workers = 2000
    texts = list(df_text)

    # Never start more threads than there are items to process (and at least
    # one, since the pool rejects a worker count below 1).
    workers = max(1, min(max_workers, len(texts)))

    # The context manager tears the pool down even if a worker raises;
    # map() has already collected every result by the time it returns.
    with multiprocessing.pool.ThreadPool(processes=workers) as pool:
        result_list = pool.map(get_gender_prof_match_details, texts, chunksize=1)

    flat_return_list = [item for sublist in result_list for item in sublist]

    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(flat_return_list, columns=cols)
28
+
29
+
30
  def get_split_text(text):
31
  doc = nlp(text)
32
  sentences = [sent for sent in doc.sents]
 
85
  return results
86
 
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  def get_statistics(result):
89
  stats = {
90
  "both_gender_prof_match": str((result["Both Match"] == "Yes").sum()),
 
102
  return stats
103
 
104
 
105
def get_plot(result_json):
    """Render the gender/profession match statistics as a pie chart.

    Args:
        result_json: Mapping of statistic name to a count that ``int()`` can
            parse. The five keys listed in ``slices`` below are read.

    Returns:
        The Plotly Express pie figure with one slice per statistic.
    """
    # (slice label, key in result_json), in the order the slices are plotted.
    slices = [
        ("Both Gender & Profession Match", "both_gender_prof_match"),
        ("Male Pronoun", "count_male_pronoun"),
        ("Female Pronoun", "count_female_pronoun"),
        ("Male Pronoun & Profession", "count_male_pronoun_profession"),
        ("Female Pronoun & Profession", "count_female_pronoun_profession"),
    ]

    counts = [int(result_json[key]) for _, key in slices]
    labels = [label for label, _ in slices]

    chart_data = {"Labels": labels, "Values": counts}

    return px.pie(
        chart_data,
        names="Labels",
        values="Values",
        title="Gender & Profession Match Statistics",
    )
139
 
140
 
141
  def eval_gender_profession(data):