sudipta002 committed on
Commit
17fc1de
·
1 Parent(s): da3e4bd

Backend code

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. app.py +8 -0
  3. requirements.txt +2 -1
  4. scripts/genbit_metrics.py +60 -0
.gitignore CHANGED
@@ -1,6 +1,7 @@
1
  # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
  testing/
 
4
  check_gender_tagging.py
5
  *.py[cod]
6
  *$py.class
 
1
  # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
  testing/
4
+ flagged/
5
  check_gender_tagging.py
6
  *.py[cod]
7
  *$py.class
app.py CHANGED
@@ -10,6 +10,9 @@ def run_evaluation(dataset_id, methodology):
10
  elif methodology == 'C':
11
  run_c(dataset_id)
12
 
 
 
 
13
 
14
  demo = gr.Blocks(theme=gr.themes.Soft())
15
 
@@ -42,6 +45,7 @@ with demo:
42
  outputs = gr.Markdown()
43
  gr.Error("No results to display")
44
 
 
45
  methodology.change(
46
  fn=lambda x: (f'### {x}', "lorem ipseum", "lorem ipsum"),
47
  inputs=[methodology],
@@ -54,4 +58,8 @@ with demo:
54
  outputs=[outputs]
55
  )
56
 
 
 
 
 
57
  demo.launch()
 
10
  elif methodology == 'C':
11
  run_c(dataset_id)
12
 
13
+ def schema_uploaded_file(file):
14
+ file_paths = [file.name for file in file]
15
+ return file_paths
16
 
17
  demo = gr.Blocks(theme=gr.themes.Soft())
18
 
 
45
  outputs = gr.Markdown()
46
  gr.Error("No results to display")
47
 
48
+
49
  methodology.change(
50
  fn=lambda x: (f'### {x}', "lorem ipseum", "lorem ipsum"),
51
  inputs=[methodology],
 
58
  outputs=[outputs]
59
  )
60
 
61
+ # For user dataset upload
62
+ gr.Interface(schema_uploaded_file, "file", "text")
63
+
64
+
65
  demo.launch()
requirements.txt CHANGED
@@ -2,4 +2,5 @@ gradio==3.40.1
2
  gradio_client==0.5.0
3
  numpy==1.25.2
4
  pandas==2.0.3
5
- spacy
 
 
2
  gradio_client==0.5.0
3
  numpy==1.25.2
4
  pandas==2.0.3
5
+ spacy
6
+ genbit
scripts/genbit_metrics.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from genbit.genbit_metrics import GenBitMetrics
2
+ import pandas as pd
3
+ from utils.read_config import get_args
4
+
5
+
6
+ def cal_metrics(dataset):
7
+ # Create a GenBit object with the desired settings:
8
+
9
+ genbit_metrics_object = GenBitMetrics(language_code="en", context_window=5, distance_weight=0.95, percentile_cutoff=80)
10
+
11
+ # To run GenBit on your own text, add the sentences to the GenBit object:
12
+ #dataset = ["I think she does not like cats. I think he does not like cats.", "He is a dog person."]
13
+
14
+ genbit_metrics_object.add_data(dataset, tokenized=False)
15
+
16
+
17
+ # Generate the gender bias metrics with `get_metrics`. Setting `output_statistics` and `output_word_list` to False would reduce the number of metrics created; both are enabled here.
18
+ metrics = genbit_metrics_object.get_metrics(output_statistics=True, output_word_list=True)
19
+
20
+ return metrics
21
+
22
+ # Function to load sample of dataset
23
+ def load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name):
24
+
25
+ # Keep only the required column
26
+ df = df[[col_name]]
27
+ if sample_method == "first_record" and df.shape[0] > sample_first_records:
28
+ df = df.iloc[:sample_first_records].copy().reset_index()
29
+ if sample_method == "random_pick" and df.shape[0] > sample_first_records:
30
+ df = df.sample(sample_first_records, random_state=sample_random_seed).copy().reset_index()
31
+ return df
32
+
33
+
34
+ # Function to extract genbit metrics
35
+ def extract_genbit_metris(stats):
36
+ metrics = {}
37
+ metrics["genbit_score"] = stats["genbit_score"]
38
+ metrics["percentage_of_female_gender_definition_words"] = stats["percentage_of_female_gender_definition_words"]
39
+ metrics["percentage_of_male_gender_definition_words"] = stats["percentage_of_male_gender_definition_words"]
40
+ metrics["percentage_of_non_binary_gender_definition_words"] = stats["percentage_of_non_binary_gender_definition_words"]
41
+ metrics["percentage_of_trans_gender_definition_words"] = stats["percentage_of_trans_gender_definition_words"]
42
+ metrics["percentage_of_cis_gender_definition_words"] = stats["percentage_of_cis_gender_definition_words"]
43
+ metrics["num_words_considered"] = stats["statistics"]["num_words_considered"]
44
+
45
+ return metrics
46
+
47
+ def load_dataset_and_get_genbit_metrics(df, sample_method, col_name):
48
+ # Get args from config file
49
+ sample_first_records = get_args("first_records")
50
+ sample_random_seed = get_args("random_seed")
51
+
52
+ sample_df = load_sample(sample_first_records, sample_random_seed, sample_method, df, col_name)
53
+
54
+ # Turn into a list of text.
55
+ sample_text = sample_df[col_name].tolist()
56
+
57
+ # Call cal_metrics function
58
+ stats = cal_metrics(sample_text)
59
+ metrics = extract_genbit_metris(stats)
60
+ return metrics