|
from genbit.genbit_metrics import GenBitMetrics |
|
import pandas as pd |
|
from utils.read_config import get_args |
|
from utils.load_csv import load_sample |
|
|
|
|
|
def cal_metrics(
    dataset,
    *,
    language_code="en",
    context_window=5,
    distance_weight=0.95,
    percentile_cutoff=80,
):
    """Run GenBit over ``dataset`` and return the raw metrics it reports.

    Args:
        dataset: Text handed to ``GenBitMetrics.add_data`` with
            ``tokenized=False``. The in-file caller passes a list built from
            a DataFrame column.
        language_code: Forwarded to ``GenBitMetrics``; defaults to ``"en"``.
        context_window: Forwarded to ``GenBitMetrics``; defaults to ``5``.
        distance_weight: Forwarded to ``GenBitMetrics``; defaults to ``0.95``.
        percentile_cutoff: Forwarded to ``GenBitMetrics``; defaults to ``80``.

    Returns:
        Whatever ``GenBitMetrics.get_metrics`` returns when called with
        ``output_statistics=True`` and ``output_word_list=True``.
    """
    # The tuning values are keyword-only so existing ``cal_metrics(dataset)``
    # calls keep their behaviour while other callers can override them.
    genbit_metrics_object = GenBitMetrics(
        language_code=language_code,
        context_window=context_window,
        distance_weight=distance_weight,
        percentile_cutoff=percentile_cutoff,
    )

    genbit_metrics_object.add_data(dataset, tokenized=False)

    return genbit_metrics_object.get_metrics(
        output_statistics=True,
        output_word_list=True,
    )
|
|
|
|
|
|
|
def extract_genbit_metris(stats):
    """Pick the reported GenBit values out of ``stats`` as strings.

    Args:
        stats: Mapping that holds the six top-level keys listed below and a
            nested ``"statistics"`` mapping containing
            ``"num_words_considered"``.

    Returns:
        A new dict with seven entries, each value converted with ``str``.
        A missing key propagates the lookup error from ``stats``.
    """
    top_level_keys = (
        "genbit_score",
        "percentage_of_female_gender_definition_words",
        "percentage_of_male_gender_definition_words",
        "percentage_of_non_binary_gender_definition_words",
        "percentage_of_trans_gender_definition_words",
        "percentage_of_cis_gender_definition_words",
    )

    extracted = {key: str(stats[key]) for key in top_level_keys}

    # The word count is the only value read from the nested statistics block.
    extracted["num_words_considered"] = str(stats["statistics"]["num_words_considered"])

    return extracted
|
|
|
def load_dataset_and_get_genbit_metrics(df, sample_method, col_name, num_sample_records):
    """Sample ``df``, run GenBit on one column, and return the string metrics.

    Args:
        df: Data source forwarded to ``load_sample``.
        sample_method: Sampling strategy forwarded to ``load_sample``.
        col_name: Column whose values are turned into a list and analysed;
            also forwarded to ``load_sample``.
        num_sample_records: Sample size forwarded to ``load_sample``.

    Returns:
        The dict produced by ``extract_genbit_metris`` for the sampled text.
    """
    sampled_rows = load_sample(num_sample_records, sample_method, df, col_name)

    # GenBit is fed a plain list built from the selected column.
    texts = sampled_rows[col_name].tolist()

    return extract_genbit_metris(cal_metrics(texts))
|
|