import gradio as gr
import pandas as pd
import os

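# Sample summary table (pronoun / profession counts for an example review corpus).
# Note: this data is not yet referenced by the UI below.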
dummy_data = [
    ["Category", "Value", "Percentage"],
    ["Total Reviews", 50000, None],
    ["Total Sentences", 621647, None],
    ["Pronouns in Sentences", None, None],
    ["Male Pronouns", 85615, None],
    ["Female Pronouns", 39372, None],
    ["Both Male and Female Pronouns", 7765, None],
    ["Exclusive Usage of Pronouns", None, None],
    ["Only Male Pronouns", 77860, 13.77],
    ["Only Female Pronouns", 31617, 6.33],
    ["Pronouns and Professions in Sentences", None, None],
    ["Male Pronouns with Professions", 5580, 0.9],
    ["Female Pronouns with Professions", 2618, 0.42],
    ["Exclusive Usage of Pronouns with Professions", None, None],
    ["Only Male Pronouns with Professions", 5011, 0.81],
    ["Only Female Pronouns with Professions", 2049, 0.33],
    ["Pronouns and Professions in Combination", None, None],
    ["Male or Female Pronouns with Professions", 7629, 1.23],
    ["Male and Female Pronouns with Professions", 569, 0.09]
]


def display_methodology(methodology):
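    """Return Markdown title and description updates for the selected methodology."""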
    title = "### " + methodology
    description = ""

    if methodology == "Term Identity Diversity Analysis":
        description = "333"  # placeholder text; description not yet written
    elif methodology == "Gender Label Evaluation":
        description = "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization."
    elif methodology == "Microsoft Genbit":
        description = "GenBiT is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications."

    return (
        gr.Markdown.update(title, visible=True),
        gr.Markdown.update(description, visible=True)
    )


def run_evaluation(dataset_file, dataset_scope, dataset_scope_n, dataset_corpus, methodology):
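    """Summarize the selected dataset, scope, corpus column, and methodology as a status JSON."""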
    status = {
        "dataset": dataset_file.name,
        "methodology": methodology,
        "scope": dataset_scope + " " + str(dataset_scope_n),
        "column": dataset_corpus.columns[0]

    }

    # TODO: dispatch the actual evaluation for the selected methodology
    # (e.g. run_a / run_b / run_c) once those functions are implemented;
    # for now only the status summary is returned.
    return gr.JSON.update(status, visible=True)


def process_dataset(dataset):
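    """Read the uploaded CSV and reveal the scope, entry-count, and text-column controls based on its contents."""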
    data = pd.read_csv(dataset.name)

    columns = data.columns.tolist()
    columns = [x for x in columns if data[x].dtype == "object"]

    return (
        gr.Radio.update(
            label="Scope",
            info="Determines the scope of the dataset to be analyzed",
            choices=["First", "Last", "Random"],
            value="First",
            visible=True,
            interactive=True,
        ),
        gr.Slider.update(
            label="Number of Entries",
            info=f"Determines the number of entries to be analyzed. The dataset has {data.shape[0]} entries.",
            minimum=1,
            maximum=data.shape[0],
            value=data.shape[0] // 2,
            visible=True,
            interactive=True
        ),
        gr.Radio.update(
            label="Column",
            info="Determines the column to be analyzed. These are the columns with text data.",
            choices=columns,
            value=columns[0],
            visible=True,
            interactive=True,
        )
    )


def process_column(dataset, column):
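    """Preview the selected text column of the uploaded CSV as a small dataframe."""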
    data = pd.read_csv(dataset.name)
    corpus = data[column].to_list()

    return gr.Dataframe.update(value=pd.DataFrame({"Data Corpus": corpus}), max_rows=5, visible=True)


BiasAware = gr.Blocks(title="BiasAware: Dataset Bias Detection")
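# UI layout: dataset upload and preview, methodology selection, and a results panel,
# wired together by the event handlers at the bottom of the file.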

with BiasAware:
    gr.Markdown("# BiasAware: Dataset Bias Detection")
    gr.Markdown(
        "Natural Language Processing (NLP) training datasets often reflect the biases present in the data sources they are compiled from, leading to the **perpetuation of stereotypes, underrepresentation, and skewed perspectives in AI models**. BiasAware is designed to **identify and quantify biases present in text data**, making it an invaluable resource for data scientists, machine learning practitioners, and organizations committed to **mitigating bias in AI systems**."
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Dataset")

            dataset_file = gr.File()
            dataset_examples = gr.Examples(
                [
                    os.path.join(os.path.dirname(__file__), "z_animal.csv"),
                    os.path.join(os.path.dirname(__file__), "z_employee.csv"),
                    os.path.join(os.path.dirname(__file__), "z_house.csv"),

                ],
                inputs=dataset_file,
            )

            dataset_scope = gr.Radio(visible=False)
            dataset_scope_n = gr.Slider(visible=False)
            dataset_columns = gr.Radio(visible=False)

            dataset_corpus = gr.Dataframe(visible=False)

        with gr.Column(scale=1):
            gr.Markdown("## Methodology")

            methodology = gr.Radio(
                label="Methodology",
                info="Determines the methodology to be used for bias detection",
                choices=[
                    "Term Identity Diversity Analysis",
                    "Gender Label Evaluation",
                    "Microsoft Genbit",
                ],
                value="Term Identity Diversity Analysis",
            )

            evalButton = gr.Button("Run Evaluation")

            methodology_title = gr.Markdown(visible=False)
            methodology_description = gr.Markdown(visible=False)

        with gr.Column(scale=4):
            gr.Markdown("## Result")

            result_status = gr.JSON(visible=False)
            result = gr.DataFrame()

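    # Event wiring: uploading a dataset populates the scope and column controls,
    # selecting a column previews the corpus, changing the methodology shows its
    # description, and "Run Evaluation" produces the status JSON.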
    dataset_file.change(
        fn=process_dataset,
        inputs=[dataset_file],
        outputs=[
            dataset_scope,
            dataset_scope_n,
            dataset_columns
        ]
    )

    dataset_columns.change(
        fn=process_column,
        inputs=[dataset_file, dataset_columns],
        outputs=[dataset_corpus],
    )

    methodology.change(
        fn=display_methodology,
        inputs=[methodology],
        outputs=[methodology_title, methodology_description],
    )

    evalButton.click(
        fn=run_evaluation,
        inputs=[dataset_file, dataset_scope,
                dataset_scope_n, dataset_corpus, methodology],
        outputs=[result_status]
    )

BiasAware.launch()