File size: 2,133 Bytes
17fc1de
 
 
a365da6
17fc1de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a365da6
 
 
 
 
 
 
17fc1de
 
 
a365da6
 
17fc1de
a365da6
17fc1de
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from genbit.genbit_metrics import GenBitMetrics
import pandas as pd
from utils.read_config import get_args
from utils.load_csv import load_sample


def cal_metrics(
    dataset,
    *,
    language_code="en",
    context_window=5,
    distance_weight=0.95,
    percentile_cutoff=80,
):
    """Run GenBit over a collection of raw texts and return its metrics.

    Args:
        dataset: Untokenized text samples, e.g.
            ``["I think she does not like cats.", "He is a dog person."]``.
            Handed to ``GenBitMetrics.add_data`` with ``tokenized=False``.
        language_code: Forwarded to ``GenBitMetrics``. Defaults to ``"en"``.
        context_window: Forwarded to ``GenBitMetrics``. Defaults to ``5``.
        distance_weight: Forwarded to ``GenBitMetrics``. Defaults to ``0.95``.
        percentile_cutoff: Forwarded to ``GenBitMetrics``. Defaults to ``80``.

    Returns:
        The object returned by ``GenBitMetrics.get_metrics`` when both
        ``output_statistics`` and ``output_word_list`` are enabled.
    """
    # A fresh GenBit object per call, so data from earlier calls is never
    # mixed into this result.
    genbit_metrics_object = GenBitMetrics(
        language_code=language_code,
        context_window=context_window,
        distance_weight=distance_weight,
        percentile_cutoff=percentile_cutoff,
    )

    genbit_metrics_object.add_data(dataset, tokenized=False)

    # Both optional outputs are requested here; setting `output_statistics`
    # and `output_word_list` to False would reduce the number of metrics
    # created.
    metrics = genbit_metrics_object.get_metrics(
        output_statistics=True,
        output_word_list=True,
    )

    return metrics


# Function to extract genbit metrics
def extract_genbit_metris(stats):
    """Pick the reported GenBit fields out of ``stats`` as strings.

    Args:
        stats: Mapping that holds the top-level GenBit fields listed below
            plus a nested ``"statistics"`` mapping containing
            ``"num_words_considered"``.

    Returns:
        A new dict whose values are the ``str()`` form of each selected
        field, keyed by the same names (``"num_words_considered"`` is
        lifted out of ``stats["statistics"]`` to the top level).

    Raises:
        KeyError: If any of the expected keys is absent from ``stats``.
    """
    top_level_fields = (
        "genbit_score",
        "percentage_of_female_gender_definition_words",
        "percentage_of_male_gender_definition_words",
        "percentage_of_non_binary_gender_definition_words",
        "percentage_of_trans_gender_definition_words",
        "percentage_of_cis_gender_definition_words",
    )

    # Values are stringified one by one, in the order listed above.
    extracted = {field: str(stats[field]) for field in top_level_fields}

    # This count lives one level down, inside the "statistics" sub-mapping.
    word_stats = stats["statistics"]
    extracted["num_words_considered"] = str(word_stats["num_words_considered"])

    return extracted

def load_dataset_and_get_genbit_metrics(df, sample_method, col_name, num_sample_records):
    """Sample ``df``, score one text column with GenBit, and summarise it.

    Args:
        df: Source data handed to ``load_sample``; presumably a pandas
            DataFrame (TODO confirm against ``utils.load_csv``).
        sample_method: Sampling strategy, forwarded to ``load_sample``.
        col_name: Column whose values are analysed; also forwarded to
            ``load_sample``.
        num_sample_records: Sample size, forwarded to ``load_sample``.

    Returns:
        The dict of stringified metrics built by ``extract_genbit_metris``.
    """
    sampled_rows = load_sample(num_sample_records, sample_method, df, col_name)

    # GenBit is fed a plain Python list, so unpack the selected column.
    texts = sampled_rows[col_name].tolist()

    # Compute the full GenBit output, then keep only the reported fields.
    return extract_genbit_metris(cal_metrics(texts))