Spaces:

avid-ml
/

biasaware

Sleeping

App Files Files Community

sudipta002 committed on Sep 4, 2023

Commit

17fc1de

1 Parent(s): da3e4bd

Backend code

Browse files

Files changed (4) hide show

.gitignore +1 -0
app.py +8 -0
requirements.txt +2 -1
scripts/genbit_metrics.py +60 -0

.gitignore CHANGED Viewed

@@ -1,6 +1,7 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 testing/
 check_gender_tagging.py
 *.py[cod]
 *$py.class

 # Byte-compiled / optimized / DLL files
 __pycache__/
 testing/
+flagged/
 check_gender_tagging.py
 *.py[cod]
 *$py.class

app.py CHANGED Viewed

@@ -10,6 +10,9 @@ def run_evaluation(dataset_id, methodology):
     elif methodology == 'C':
         run_c(dataset_id)
 demo = gr.Blocks(theme=gr.themes.Soft())
@@ -42,6 +45,7 @@ with demo:
             outputs = gr.Markdown()
             gr.Error("No results to display")
     methodology.change(
         fn=lambda x: (f'### {x}', "lorem ipseum", "lorem ipsum"),
         inputs=[methodology],
@@ -54,4 +58,8 @@ with demo:
         outputs=[outputs]
     )
 demo.launch()

     elif methodology == 'C':
         run_c(dataset_id)
def schema_uploaded_file(file):
    """Return the path(s) of the uploaded file(s) as a list.

    Args:
        file: One uploaded file object exposing a ``name`` attribute, a
            list/tuple of such objects, or ``None`` when nothing was uploaded.

    Returns:
        A list holding the ``name`` of every uploaded file; an empty list
        when ``file`` is ``None`` or an empty sequence.
    """
    if file is None:
        return []
    # An upload input in single-file mode hands over one object rather than a
    # list; iterating that object would walk its contents, not the uploads.
    uploads = file if isinstance(file, (list, tuple)) else [file]
    return [upload.name for upload in uploads]
 demo = gr.Blocks(theme=gr.themes.Soft())
             outputs = gr.Markdown()
             gr.Error("No results to display")
     methodology.change(
         fn=lambda x: (f'### {x}', "lorem ipseum", "lorem ipsum"),
         inputs=[methodology],
         outputs=[outputs]
     )
+     # For user dataset upload
+    gr.Interface(schema_uploaded_file, "file", "text")
 demo.launch()

requirements.txt CHANGED Viewed

@@ -2,4 +2,5 @@ gradio==3.40.1
 gradio_client==0.5.0
 numpy==1.25.2
 pandas==2.0.3
-spacy

 gradio_client==0.5.0
 numpy==1.25.2
 pandas==2.0.3
+spacy
+genbit

scripts/genbit_metrics.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from genbit.genbit_metrics import GenBitMetrics
+import pandas as pd
+from utils.read_config import get_args
def cal_metrics(dataset):
    """Compute GenBit gender-bias metrics for a collection of raw texts.

    Args:
        dataset: Texts handed to GenBit untokenized, for example
            ``["I think she does not like cats.", "He is a dog person."]``.

    Returns:
        The result of ``GenBitMetrics.get_metrics`` with both the statistics
        and the word-list outputs requested.
    """
    # English-language scorer with the window/weight/cutoff settings below.
    scorer = GenBitMetrics(
        language_code="en",
        context_window=5,
        distance_weight=0.95,
        percentile_cutoff=80,
    )
    scorer.add_data(dataset, tokenized=False)
    # Both optional outputs are switched on here; turning them off would
    # reduce the number of metrics that get created.
    return scorer.get_metrics(output_statistics=True, output_word_list=True)
# Function to load sample of dataset
def load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name):
    """Return a single-column sample of ``df``.

    Args:
        sample_first_records: Maximum number of rows to keep (used for both
            sampling methods).
        sample_random_seed: Seed passed to ``DataFrame.sample`` for
            ``"random_pick"``.
        sample_method: ``"first_record"`` keeps the leading rows,
            ``"random_pick"`` draws rows at random; any other value leaves
            the rows untouched.
        df: Source DataFrame.
        col_name: Name of the column to keep.

    Returns:
        A DataFrame containing only ``col_name``. When sampling is applied
        the index is renumbered from 0; the previous index is discarded
        rather than being inserted as an extra column, so the result has the
        same columns whether or not sampling took place.
    """
    # Keep only the required column
    subset = df[[col_name]]

    # Nothing to trim when the data already fits within the sample size.
    if subset.shape[0] <= sample_first_records:
        return subset

    if sample_method == "first_record":
        return subset.iloc[:sample_first_records].reset_index(drop=True)
    if sample_method == "random_pick":
        picked = subset.sample(sample_first_records, random_state=sample_random_seed)
        return picked.reset_index(drop=True)
    return subset
# Function to extract genbit metrics
def extract_genbit_metris(stats):
    """Pick the reported subset of values out of a GenBit result mapping.

    Args:
        stats: Mapping holding the top-level score/percentage entries listed
            below plus a nested ``"statistics"`` mapping with
            ``"num_words_considered"``.

    Returns:
        A new dict with the selected top-level entries followed by
        ``"num_words_considered"``.

    Raises:
        KeyError: If any expected key is absent from ``stats``.
    """
    top_level_keys = (
        "genbit_score",
        "percentage_of_female_gender_definition_words",
        "percentage_of_male_gender_definition_words",
        "percentage_of_non_binary_gender_definition_words",
        "percentage_of_trans_gender_definition_words",
        "percentage_of_cis_gender_definition_words",
    )
    selected = {key: stats[key] for key in top_level_keys}
    # The word count lives one level down, under the statistics section.
    selected["num_words_considered"] = stats["statistics"]["num_words_considered"]
    return selected
def load_dataset_and_get_genbit_metrics(df, sample_method, col_name):
    """Sample one column of ``df`` and return its extracted GenBit metrics.

    Args:
        df: DataFrame holding the texts to evaluate.
        sample_method: Sampling strategy forwarded to ``load_sample``.
        col_name: Name of the text column in ``df``.

    Returns:
        The dict produced by ``extract_genbit_metris`` for the sampled texts.
    """
    # Sampling parameters are read from the config file.
    record_limit = get_args("first_records")
    seed = get_args("random_seed")

    sampled = load_sample(record_limit, seed, sample_method, df, col_name)
    # GenBit is fed a plain list of texts.
    texts = sampled[col_name].tolist()

    return extract_genbit_metris(cal_metrics(texts))