sudipta002
committed on
Commit
·
17fc1de
1
Parent(s):
da3e4bd
Backend code
Browse files- .gitignore +1 -0
- app.py +8 -0
- requirements.txt +2 -1
- scripts/genbit_metrics.py +60 -0
.gitignore
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
# Byte-compiled / optimized / DLL files
|
2 |
__pycache__/
|
3 |
testing/
|
|
|
4 |
check_gender_tagging.py
|
5 |
*.py[cod]
|
6 |
*$py.class
|
|
|
1 |
# Byte-compiled / optimized / DLL files
|
2 |
__pycache__/
|
3 |
testing/
|
4 |
+
flagged/
|
5 |
check_gender_tagging.py
|
6 |
*.py[cod]
|
7 |
*$py.class
|
app.py
CHANGED
@@ -10,6 +10,9 @@ def run_evaluation(dataset_id, methodology):
|
|
10 |
elif methodology == 'C':
|
11 |
run_c(dataset_id)
|
12 |
|
|
|
|
|
|
|
13 |
|
14 |
demo = gr.Blocks(theme=gr.themes.Soft())
|
15 |
|
@@ -42,6 +45,7 @@ with demo:
|
|
42 |
outputs = gr.Markdown()
|
43 |
gr.Error("No results to display")
|
44 |
|
|
|
45 |
methodology.change(
|
46 |
fn=lambda x: (f'### {x}', "lorem ipseum", "lorem ipsum"),
|
47 |
inputs=[methodology],
|
@@ -54,4 +58,8 @@ with demo:
|
|
54 |
outputs=[outputs]
|
55 |
)
|
56 |
|
|
|
|
|
|
|
|
|
57 |
demo.launch()
|
|
|
10 |
elif methodology == 'C':
|
11 |
run_c(dataset_id)
|
12 |
|
13 |
+
def schema_uploaded_file(file):
    """Return the ``name`` of every uploaded file object.

    Args:
        file: A single uploaded file object, a list/tuple of such objects,
            or ``None`` when nothing was uploaded. Each object must expose
            a ``name`` attribute (presumably the temp-file path that Gradio
            assigns to the upload -- confirm against the Gradio version in
            use).

    Returns:
        A list containing the ``name`` of each uploaded file; empty when
        ``file`` is ``None``.
    """
    if file is None:
        return []
    # A single-file upload is one object rather than a list; wrap it so it
    # is not iterated directly (which would walk the file's contents).
    uploads = file if isinstance(file, (list, tuple)) else [file]
    return [upload.name for upload in uploads]
|
16 |
|
17 |
demo = gr.Blocks(theme=gr.themes.Soft())
|
18 |
|
|
|
45 |
outputs = gr.Markdown()
|
46 |
gr.Error("No results to display")
|
47 |
|
48 |
+
|
49 |
methodology.change(
|
50 |
fn=lambda x: (f'### {x}', "lorem ipseum", "lorem ipsum"),
|
51 |
inputs=[methodology],
|
|
|
58 |
outputs=[outputs]
|
59 |
)
|
60 |
|
61 |
+
# For user dataset upload
|
62 |
+
gr.Interface(schema_uploaded_file, "file", "text")
|
63 |
+
|
64 |
+
|
65 |
demo.launch()
|
requirements.txt
CHANGED
@@ -2,4 +2,5 @@ gradio==3.40.1
|
|
2 |
gradio_client==0.5.0
|
3 |
numpy==1.25.2
|
4 |
pandas==2.0.3
|
5 |
-
spacy
|
|
|
|
2 |
gradio_client==0.5.0
|
3 |
numpy==1.25.2
|
4 |
pandas==2.0.3
|
5 |
+
spacy
|
6 |
+
genbit
|
scripts/genbit_metrics.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from genbit.genbit_metrics import GenBitMetrics
|
2 |
+
import pandas as pd
|
3 |
+
from utils.read_config import get_args
|
4 |
+
|
5 |
+
|
6 |
+
def cal_metrics(dataset, *, language_code="en", context_window=5,
                distance_weight=0.95, percentile_cutoff=80):
    """Run GenBit over ``dataset`` and return the resulting metrics.

    Args:
        dataset: Untokenized text handed to ``GenBitMetrics.add_data``,
            e.g. ``["I think she does not like cats.", "He is a dog person."]``.
        language_code: Forwarded to ``GenBitMetrics`` (default ``"en"``).
        context_window: Forwarded to ``GenBitMetrics`` (default ``5``).
        distance_weight: Forwarded to ``GenBitMetrics`` (default ``0.95``).
        percentile_cutoff: Forwarded to ``GenBitMetrics`` (default ``80``).

    Returns:
        Whatever ``GenBitMetrics.get_metrics`` returns when both
        ``output_statistics`` and ``output_word_list`` are enabled.
    """
    genbit_metrics_object = GenBitMetrics(
        language_code=language_code,
        context_window=context_window,
        distance_weight=distance_weight,
        percentile_cutoff=percentile_cutoff,
    )

    # The input is raw text, so GenBit is told it is not pre-tokenized.
    genbit_metrics_object.add_data(dataset, tokenized=False)

    # Both flags are left on so the full result (including the statistics
    # section) is produced; setting them to False would reduce the number
    # of metrics created.
    metrics = genbit_metrics_object.get_metrics(
        output_statistics=True,
        output_word_list=True,
    )

    return metrics
|
21 |
+
|
22 |
+
# Function to load sample of dataset
|
23 |
+
def load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name):
    """Return a single-column sample of ``df``.

    Args:
        sample_first_records: Maximum number of rows to keep.
        sample_random_seed: Seed passed to ``DataFrame.sample`` as
            ``random_state`` when ``sample_method`` is ``"random_pick"``.
        sample_method: ``"first_record"`` keeps the leading rows,
            ``"random_pick"`` draws rows at random; any other value leaves
            the rows untouched.
        df: Source DataFrame.
        col_name: Name of the column to keep.

    Returns:
        A DataFrame holding only ``col_name``. When the frame has more rows
        than ``sample_first_records`` and a known method is given, it is
        cut down to that many rows and re-indexed from 0.
    """
    # Keep only the required column.
    df = df[[col_name]]

    # Nothing to trim when the frame already fits within the sample size.
    if df.shape[0] <= sample_first_records:
        return df

    # drop=True discards the old index instead of inserting it as an extra
    # "index" column, so the result always has exactly one column.
    if sample_method == "first_record":
        return df.iloc[:sample_first_records].reset_index(drop=True)
    if sample_method == "random_pick":
        picked = df.sample(sample_first_records, random_state=sample_random_seed)
        return picked.reset_index(drop=True)
    return df
|
32 |
+
|
33 |
+
|
34 |
+
# Function to extract genbit metrics
|
35 |
+
def extract_genbit_metris(stats):
    """Pick the reported GenBit values out of a full ``stats`` mapping.

    Args:
        stats: Mapping that contains the top-level keys listed below plus a
            nested ``"statistics"`` mapping with ``"num_words_considered"``.

    Returns:
        A new dict with the selected top-level values and
        ``"num_words_considered"`` lifted out of ``stats["statistics"]``.

    Raises:
        KeyError: If any of the expected keys is missing from ``stats``.
    """
    top_level_keys = (
        "genbit_score",
        "percentage_of_female_gender_definition_words",
        "percentage_of_male_gender_definition_words",
        "percentage_of_non_binary_gender_definition_words",
        "percentage_of_trans_gender_definition_words",
        "percentage_of_cis_gender_definition_words",
    )
    selected = {key: stats[key] for key in top_level_keys}
    # This value lives one level down, under the "statistics" section.
    selected["num_words_considered"] = stats["statistics"]["num_words_considered"]
    return selected
|
46 |
+
|
47 |
+
def load_dataset_and_get_genbit_metrics(df, sample_method, col_name):
    """Sample ``df``, score the sampled text with GenBit, and return the summary.

    Args:
        df: DataFrame holding the dataset.
        sample_method: Sampling strategy forwarded to ``load_sample``.
        col_name: Name of the column whose values are scored.

    Returns:
        The dict produced by ``extract_genbit_metris`` for the sampled text.
    """
    # Sample size and seed are read from the config via get_args.
    sample_df = load_sample(
        get_args("first_records"),
        get_args("random_seed"),
        sample_method,
        df,
        col_name,
    )

    # cal_metrics is given a plain list of the column's values.
    texts = sample_df[col_name].tolist()

    return extract_genbit_metris(cal_metrics(texts))
|