freyam committed
Commit 0946447 · Parent: 9eb4764

Add Plot for Gender Divide

app.py CHANGED
@@ -5,7 +5,7 @@ import os
 
 from scripts.genbit import *
 from scripts.gender_profession_bias import *
-from scripts.gender_divide import *
+from scripts.gender_distribution import *
 
 methodologies = json.load(open("config/methodologies.json", "r"))
 
@@ -29,16 +29,22 @@ def evaluate(dataset, sampling_method, sampling_size, column, methodology):
         elif sampling_method == "Random":
             data = data.sample(n=sampling_size, random_state=42)
 
-        result_json = globals()[methodologies.get(methodology).get("fx")](data)
+        result_df, result_plot, result_conclusion = globals()[
+            methodologies.get(methodology).get("fx")
+        ](data)
 
-        result_df = pd.DataFrame.from_dict(result_json, orient="index").reset_index()
-        result_df.columns = ["Metric", "Value"]
-
-        return gr.Dataframe.update(result_df, visible=True)
+        return (
+            gr.Markdown.update(
+                f"## {methodology} Results\nResult Summary", visible=True
+            ),
+            gr.Plot.update(result_plot, visible=True),
+            gr.Dataframe.update(result_df, visible=True),
+        )
     except Exception as e:
-        return gr.JSON.update(
-            {"error": f"An error occurred while processing the dataset. {e}"},
-            visible=True,
+        return (
+            gr.Markdown.update(visible=False),
+            gr.Plot.update(visible=False),
+            gr.Dataframe.update(visible=False),
         )
 
 
@@ -155,9 +161,10 @@ with BiasAware:
         methodology_metadata = gr.Markdown(visible=False)
 
     with gr.Column(scale=4):
-        gr.Markdown("## Result")
+        result = gr.Markdown("## Result")
 
-        result = gr.DataFrame(visible=False)
+        result_plot = gr.Plot(show_label=False, container=False, visible=False)
+        result_df = gr.DataFrame(visible=False)
 
     dataset_file.change(
         fn=display_dataset_config,
@@ -191,7 +198,7 @@ with BiasAware:
            dataset_column,
            methodology,
        ],
-       outputs=[result],
+       outputs=[result, result_plot, result_df],
    )
 
 BiasAware.launch()
config/methodologies.json CHANGED
@@ -1,13 +1,13 @@
 {
-  "Gender Divide (Term Identity Diversity)": {
-    "description": "333",
-    "fx": "eval_gender_divide"
+  "Gender Distribution (Term Identity Diversity)": {
+    "description": "Gender distribution is an essential aspect of identity diversity, representing the presence and balance of different gender identities within a given population or dataset. Understanding gender distribution is crucial for promoting inclusivity and equity in various contexts, such as workplaces, educational institutions, and social settings.\nIn this analysis, we examine gender distribution using a set of predefined gender identity categories. Each category represents a specific gender-related attribute or expression. Let's define the terms used in the analysis:\n- No Gender: This category likely refers to individuals who identify as non-binary, genderqueer, or gender-neutral, indicating that they do not align with traditional binary gender categories (male or female).\n- Equal Gender: This category may represent a balance between male and female genders, suggesting an equal representation of both in the dataset or population.\n- Female Positive Gender: This category likely includes individuals who identify as female or have a strong affiliation with femininity.\n- Male Positive Gender: Similarly, this category includes individuals who identify as male or have a strong affiliation with masculinity.\n- Female Strongly Positive Gender: This subcategory represents a more emphatic identification with female gender attributes, possibly indicating a stronger female gender identity.\n- Male Strongly Positive Gender: This subcategory mirrors the previous one but for male gender attributes, indicating a stronger male gender identity.",
+    "fx": "eval_gender_distribution"
   },
   "Gender Profession Bias (Lexical Evaluation)": {
     "description": "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization.",
     "fx": "eval_gender_profession"
   },
-  "GenBiT (Microsoft Responsible AI Gender Bias Tool)": {
+  "GenBiT (Microsoft Gender Bias Tool)": {
     "description": "[GenBiT](https://www.microsoft.com/en-us/research/uploads/prod/2021/10/MSJAR_Genbit_Final_Version-616fd3a073758.pdf) is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications.",
     "fx": "eval_genbit"
   }
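For reference, app.py resolves each entry's "fx" string to a function of the same name via globals() and, after this commit, expects that function to return a (result_df, result_plot, result_conclusion) triple, where result_conclusion is still an empty placeholder. A minimal sketch of that dispatch, with a hypothetical stub standing in for the real scripts/ implementation:

```python
import json

import pandas as pd


def eval_gender_distribution(data):
    # Hypothetical stand-in for scripts.gender_distribution.eval_gender_distribution,
    # kept only to illustrate the (DataFrame, plot, conclusion) return contract.
    result_df = pd.DataFrame({"Metric": ["No Gender"], "Value": [str(len(data))]})
    return result_df, None, ""


methodologies = json.load(open("config/methodologies.json", "r"))
entry = methodologies["Gender Distribution (Term Identity Diversity)"]

sample = pd.DataFrame({"text": ["she is a doctor", "the sky is blue"]})  # hypothetical data
result_df, result_plot, result_conclusion = globals()[entry["fx"]](sample)
print(result_df)
```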
scripts/genbit.py CHANGED
@@ -1,4 +1,5 @@
 from genbit.genbit_metrics import GenBitMetrics
+import pandas as pd
 
 
 def eval_genbit(data):
@@ -9,6 +10,15 @@ def eval_genbit(data):
     data = data[data.columns[0]].to_list()
 
     genbit_metrics.add_data(data, tokenized=False)
-    genbit_metrics_dict = genbit_metrics.get_metrics(output_word_list=False)
+    result_json = genbit_metrics.get_metrics(output_word_list=False)
 
-    return genbit_metrics_dict
+    result_df = (
+        pd.DataFrame.from_dict(result_json, orient="index")
+        .reset_index()
+        .rename(columns={"index": "Metric", 0: "Value"})
+    )
+
+    result_plot = None
+    result_conclusion = ""
+
+    return result_df, result_plot, result_conclusion
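The chained pandas calls above flatten GenBiT's metrics dictionary into the two-column table shown in the UI. A short sketch of that reshaping, using a hypothetical metrics dict in place of genbit_metrics.get_metrics(output_word_list=False):

```python
import pandas as pd

# Hypothetical metrics dict; the real one comes from GenBiT.
result_json = {"genbit_score": 0.42, "percentage_of_female_gender_definition_words": 0.18}

result_df = (
    pd.DataFrame.from_dict(result_json, orient="index")  # keys become the index
    .reset_index()                                        # index -> a regular column named "index"
    .rename(columns={"index": "Metric", 0: "Value"})      # label the two columns
)
print(result_df)
#                                          Metric  Value
# 0                                  genbit_score   0.42
# 1  percentage_of_female_gender_definition_words   0.18
```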
scripts/gender_distribution.py ADDED
@@ -0,0 +1,116 @@
+import re
+import json
+import plotly.express as px
+import pandas as pd
+
+with open("config/gender_lexicons.json", "r") as lexicon_file:
+    gender_lexicons = json.load(lexicon_file)
+
+male_lexicon = set(gender_lexicons.get("male_lexicons"))
+female_lexicon = set(gender_lexicons.get("female_lexicons"))
+
+male_pattern = re.compile(r"\b({})\b".format("|".join(map(re.escape, male_lexicon))))
+female_pattern = re.compile(
+    r"\b({})\b".format("|".join(map(re.escape, female_lexicon)))
+)
+
+
+def count_gender_terms(text, gender_pattern):
+    matches = re.findall(gender_pattern, text)
+    return len(matches)
+
+
+def get_gender_tag(count_male_terms, count_female_terms):
+    total_terms = count_male_terms + count_female_terms
+
+    if total_terms == 0:
+        return "No Gender"
+
+    male_proportion = (count_male_terms / total_terms) * 100
+    if male_proportion >= 75:
+        return "Male Strongly Positive Gender"
+    elif male_proportion >= 50:
+        return "Male Positive Gender"
+
+    female_proportion = (count_female_terms / total_terms) * 100
+    if female_proportion >= 75:
+        return "Female Strongly Positive Gender"
+    elif female_proportion >= 50:
+        return "Female Positive Gender"
+
+    return "Equal Gender"
+
+
+def get_gender_category_counts(sample_df):
+    gender_labels = [
+        "No Gender",
+        "Equal Gender",
+        "Male Positive Gender",
+        "Male Strongly Positive Gender",
+        "Female Positive Gender",
+        "Female Strongly Positive Gender",
+    ]
+
+    gender_counts = sample_df["gender_category"].value_counts()
+    result = {label: str(gender_counts.get(label, 0)) for label in gender_labels}
+
+    return result
+
+
+def plot_gender_category_counts(gender_labels):
+    labels = [
+        "No Gender",
+        "Equal Gender",
+        "Male Positive Gender",
+        "Male Strongly Positive Gender",
+        "Female Positive Gender",
+        "Female Strongly Positive Gender",
+    ]
+
+    values = [gender_labels[label] for label in labels]
+
+    fig = px.pie(
+        values=values,
+        names=labels,
+        title="Gender Distribution",
+        category_orders={"names": labels},
+    )
+
+    fig.update_traces(
+        pull=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
+        textinfo="percent+label",
+        marker=dict(line=dict(color="#000000", width=1)),
+    )
+
+    fig.update_layout(showlegend=False)
+
+    return fig
+
+
+def eval_gender_distribution(data):
+    data[data.columns[0]] = data[data.columns[0]].str.lower().str.strip()
+
+    data["count_male_terms"] = data[data.columns[0]].apply(
+        lambda x: count_gender_terms(x, male_pattern)
+    )
+    data["count_female_terms"] = data[data.columns[0]].apply(
+        lambda x: count_gender_terms(x, female_pattern)
+    )
+
+    data["gender_category"] = data.apply(
+        lambda row: get_gender_tag(row["count_male_terms"], row["count_female_terms"]),
+        axis=1,
+    )
+
+    result_json = get_gender_category_counts(data)
+    result_plot = plot_gender_category_counts(result_json)
+
+    result_df = (
+        pd.DataFrame.from_dict(result_json, orient="index")
+        .reset_index()
+        .rename(columns={"index": "Metric", 0: "Value"})
+    )
+
+    result_conclusion = ""
+
+    return result_df, result_plot, result_conclusion
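A quick way to exercise the new module end to end, assuming config/gender_lexicons.json (not shown in this diff) contains terms such as "he" and "she", is to pass a one-column DataFrame of text, which is how app.py calls it:

```python
import pandas as pd

from scripts.gender_distribution import eval_gender_distribution

# Hypothetical sample; the first column is lower-cased, stripped, counted, and tagged.
sample = pd.DataFrame({"text": ["He is a doctor.", "She sings.", "The sky is blue."]})

result_df, result_plot, result_conclusion = eval_gender_distribution(sample)

print(result_df)    # per-category counts as "Metric" / "Value" rows
result_plot.show()  # Plotly pie chart titled "Gender Distribution"
```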
scripts/gender_divide.py DELETED
@@ -1,69 +0,0 @@
-import re
-import json
-
-gender_lexicons = json.load(open("config/gender_lexicons.json", "r"))
-
-
-def count_gender_terms(text, gender_terms):
-    pattern = r"\b({})\b".format("|".join(gender_terms))
-    matches = re.findall(pattern, str(text))
-    return len(matches)
-
-
-def get_gender_tag(count_m_term, count_f_term):
-    total_terms = count_m_term + count_f_term
-    if total_terms == 0:
-        return "No Gender"
-
-    m_proportion = (count_m_term / total_terms) * 100
-    if m_proportion >= 75:
-        return "Male Strongly Positive Gender"
-    elif m_proportion >= 50:
-        return "Male Positive Gender"
-
-    f_proportion = (count_f_term / total_terms) * 100
-    if f_proportion >= 75:
-        return "Female Strongly Positive Gender"
-    elif f_proportion >= 50:
-        return "Female Positive Gender"
-
-    return "Equal Gender"
-
-
-def get_pg_spg(sample_df):
-    gender_labels = [
-        "Gender",
-        "No Gender",
-        "Equal Gender",
-        "Female Positive Gender",
-        "Male Positive Gender",
-        "Female Strongly Positive Gender",
-        "Male Strongly Positive Gender",
-    ]
-
-    gender_counts = sample_df["gender_cat"].value_counts()
-    result = {label: str(gender_counts.get(label, 0)) for label in gender_labels}
-
-    return result
-
-
-def eval_gender_divide(data):
-    male_terms = gender_lexicons.get("male_lexicons")
-    female_terms = gender_lexicons.get("female_lexicons")
-
-    data[data.columns[0]] = data[data.columns[0]].str.lower().str.strip()
-
-    data["count_male_term"] = data.apply(
-        lambda x: count_gender_terms(x[data.columns[0]], male_terms), axis=1
-    )
-    data["count_female_term"] = data.apply(
-        lambda x: count_gender_terms(x[:], female_terms), axis=1
-    )
-
-    data["gender_cat"] = data.apply(
-        lambda row: get_gender_tag(row["count_male_term"], row["count_female_term"]),
-        axis=1,
-    )
-
-    collection = get_pg_spg(data)
-    return collection
scripts/gender_profession_bias.py CHANGED
@@ -2,6 +2,7 @@ import re
 import json
 
 import pandas as pd
+import plotly.express as px
 import multiprocessing.pool
 from spacy.lang.en import English
 
@@ -101,10 +102,24 @@ def get_statistics(result):
     return stats
 
 
+def get_plot(result_df):
+    return
+
+
 def eval_gender_profession(data):
     data = data[data.columns[0]].str.lower().str.strip()
 
     result = call_multiprocessing_pool(data)
-    stats = get_statistics(result)
 
-    return stats
+    result_json = get_statistics(result)
+    result_plot = get_plot(result_json)
+
+    result_df = (
+        pd.DataFrame.from_dict(result_json, orient="index")
+        .reset_index()
+        .rename(columns={"index": "Metric", 0: "Value"})
+    )
+
+    result_conclusion = ""
+
+    return result_df, result_plot, result_conclusion