Merge remote-tracking branch 'origin/dev-sudipta' into dev-frey
Browse files- .gitignore +3 -0
- app.py +42 -12
- data/z_animal.csv +11 -0
- data/z_employee.csv +26 -0
- data/z_house.csv +7 -0
- requirements.txt +4 -2
- scripts/.keep +0 -0
- scripts/genbit_metrics.py +48 -0
- scripts/gender_profession_tagging.py +129 -0
- scripts/gender_tagging.py +93 -0
- utils/config.json +160 -0
- utils/load_csv.py +23 -0
- utils/read_config.py +13 -0
.gitignore
CHANGED
@@ -1,5 +1,8 @@
|
|
1 |
# Byte-compiled / optimized / DLL files
|
2 |
__pycache__/
|
|
|
|
|
|
|
3 |
*.py[cod]
|
4 |
*$py.class
|
5 |
|
|
|
1 |
# Byte-compiled / optimized / DLL files
|
2 |
__pycache__/
|
3 |
+
testing/
|
4 |
+
flagged/
|
5 |
+
check_gender_tagging.py
|
6 |
*.py[cod]
|
7 |
*$py.class
|
8 |
|
app.py
CHANGED
@@ -1,6 +1,11 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import os
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
dummy_data = [
|
6 |
["Category", "Value", "Percentage"],
|
@@ -42,14 +47,32 @@ def display_methodology(methodology):
|
|
42 |
)
|
43 |
|
44 |
|
45 |
-
def run_evaluation(dataset_file, dataset_scope, dataset_scope_n,
|
46 |
-
status = {
|
47 |
-
"dataset": dataset_file.name,
|
48 |
-
"methodology": methodology,
|
49 |
-
"scope": dataset_scope + " " + str(dataset_scope_n),
|
50 |
-
"column": dataset_corpus.columns[0]
|
51 |
|
52 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
return gr.JSON.update(status, visible=True)
|
55 |
|
@@ -62,7 +85,11 @@ def run_evaluation(dataset_file, dataset_scope, dataset_scope_n, dataset_corpus,
|
|
62 |
|
63 |
|
64 |
def process_dataset(dataset):
|
|
|
65 |
data = pd.read_csv(dataset.name)
|
|
|
|
|
|
|
66 |
|
67 |
columns = data.columns.tolist()
|
68 |
columns = [x for x in columns if data[x].dtype == "object"]
|
@@ -98,7 +125,7 @@ def process_dataset(dataset):
|
|
98 |
|
99 |
def process_column(dataset, column):
|
100 |
data = pd.read_csv(dataset.name)
|
101 |
-
corpus = data[column].to_list()
|
102 |
|
103 |
return gr.Dataframe.update(value=pd.DataFrame({"Data Corpus": corpus}), max_rows=5, visible=True)
|
104 |
|
@@ -118,9 +145,12 @@ with BiasAware:
|
|
118 |
dataset_file = gr.File()
|
119 |
dataset_examples = gr.Examples(
|
120 |
[
|
121 |
-
os.path.join(os.path.dirname(__file__),
|
122 |
-
|
123 |
-
os.path.join(os.path.dirname(__file__),
|
|
|
|
|
|
|
124 |
|
125 |
],
|
126 |
inputs=dataset_file,
|
@@ -182,7 +212,7 @@ with BiasAware:
|
|
182 |
evalButton.click(
|
183 |
fn=run_evaluation,
|
184 |
inputs=[dataset_file, dataset_scope,
|
185 |
-
dataset_scope_n,
|
186 |
outputs=[result_status]
|
187 |
)
|
188 |
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import os
|
4 |
+
from scripts.genbit_metrics import *
|
5 |
+
from scripts.gender_profession_tagging import *
|
6 |
+
from scripts.gender_tagging import *
|
7 |
+
from utils.load_csv import *
|
8 |
+
from utils.read_config import get_args
|
9 |
|
10 |
dummy_data = [
|
11 |
["Category", "Value", "Percentage"],
|
|
|
47 |
)
|
48 |
|
49 |
|
50 |
+
def run_evaluation(dataset_file, dataset_scope, dataset_scope_n, dataset_columns, methodology):
    """Dispatch the selected bias-evaluation methodology over the uploaded CSV.

    Returns a gr.JSON update carrying the methodology's statistics dict
    (an empty dict when the methodology label matches none of the known ones).
    """
    # Parse the uploaded file into a DataFrame.
    dataset = check_csv(dataset_file.name)

    # Map each UI methodology label to its analysis entry point.
    dispatch = {
        "Term Identity Diversity Analysis": load_dataset_and_analyze_gender_tag,
        "Gender Label Evaluation": load_dataset_and_analyze_gender_profession,
        "Microsoft Genbit": load_dataset_and_get_genbit_metrics,
    }

    analyze = dispatch.get(methodology)
    if analyze is not None:
        status = analyze(dataset, dataset_scope, dataset_columns, dataset_scope_n)
    else:
        status = {}

    return gr.JSON.update(status, visible=True)
|
78 |
|
|
|
85 |
|
86 |
|
87 |
def process_dataset(dataset):
|
88 |
+
|
89 |
data = pd.read_csv(dataset.name)
|
90 |
+
# maximum_records = get_args("first_records")
|
91 |
+
# input_records = data.shape(0)
|
92 |
+
# num_sample_records = input_records if input_records < maximum_records else maximum_records
|
93 |
|
94 |
columns = data.columns.tolist()
|
95 |
columns = [x for x in columns if data[x].dtype == "object"]
|
|
|
125 |
|
126 |
def process_column(dataset, column):
    """Preview the first 10 values of *column* from the uploaded CSV as a Dataframe update."""
    preview = pd.read_csv(dataset.name)[column].to_list()[:10]
    frame = pd.DataFrame({"Data Corpus": preview})
    return gr.Dataframe.update(value=frame, max_rows=5, visible=True)
|
131 |
|
|
|
145 |
dataset_file = gr.File()
|
146 |
dataset_examples = gr.Examples(
|
147 |
[
|
148 |
+
os.path.join(os.path.dirname(__file__),
|
149 |
+
"data/z_animal.csv"),
|
150 |
+
os.path.join(os.path.dirname(__file__),
|
151 |
+
"data/z_employee.csv"),
|
152 |
+
os.path.join(os.path.dirname(
|
153 |
+
__file__), "data/z_house.csv"),
|
154 |
|
155 |
],
|
156 |
inputs=dataset_file,
|
|
|
212 |
evalButton.click(
|
213 |
fn=run_evaluation,
|
214 |
inputs=[dataset_file, dataset_scope,
|
215 |
+
dataset_scope_n, dataset_columns, methodology],
|
216 |
outputs=[result_status]
|
217 |
)
|
218 |
|
data/z_animal.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AnimalID,CommonName,ScientificName,Class,Order,Family,Habitat,ConservationStatus
|
2 |
+
1,Lion,Panthera leo,Mammalia,Carnivora,Felidae,Savanna,Vulnerable
|
3 |
+
2,Eagle,Aquila chrysaetos,Aves,Accipitriformes,Accipitridae,Mountains,Least Concern
|
4 |
+
3,Dolphin,Tursiops truncatus,Mammalia,Cetacea,Delphinidae,Ocean,Least Concern
|
5 |
+
4,Elephant,Loxodonta africana,Mammalia,Proboscidea,Elephantidae,Grassland,Vulnerable
|
6 |
+
5,Tiger,Panthera tigris,Mammalia,Carnivora,Felidae,Forest,Endangered
|
7 |
+
6,Penguin,Spheniscidae,Aves,Sphenisciformes,Spheniscidae,Antarctica,Least Concern
|
8 |
+
7,Giraffe,Giraffa camelopardalis,Mammalia,Artiodactyla,Giraffidae,Savanna,Vulnerable
|
9 |
+
8,Cheetah,Acinonyx jubatus,Mammalia,Carnivora,Felidae,Grassland,Vulnerable
|
10 |
+
9,Panda,Ailuropoda melanoleuca,Mammalia,Carnivora,Ursidae,Forest,Endangered
|
11 |
+
10,Kangaroo,Macropus rufus,Mammalia,Diprotodontia,Macropodidae,Grassland,Least Concern
|
data/z_employee.csv
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
EmployeeID,FirstName,LastName,Email,Department,Salary
|
2 |
+
101,John,Smith,[email protected],Finance,60000
|
3 |
+
102,Emily,Johnson,[email protected],Marketing,55000
|
4 |
+
103,Michael,Williams,[email protected],HR,50000
|
5 |
+
104,Susan,Anderson,[email protected],IT,65000
|
6 |
+
105,David,Martin,[email protected],Sales,58000
|
7 |
+
106,Linda,Davis,[email protected],Finance,62000
|
8 |
+
107,William,Miller,[email protected],Marketing,56000
|
9 |
+
108,Sarah,Anderson,[email protected],HR,51000
|
10 |
+
109,Robert,Clark,[email protected],IT,67000
|
11 |
+
110,Karen,Wilson,[email protected],Sales,59000
|
12 |
+
111,James,Brown,[email protected],Finance,61000
|
13 |
+
112,Anna,Johnson,[email protected],Marketing,57000
|
14 |
+
113,Christopher,Moore,[email protected],HR,52000
|
15 |
+
114,Laura,White,[email protected],IT,68000
|
16 |
+
115,Mark,Davis,[email protected],Sales,60000
|
17 |
+
116,Patricia,Jones,[email protected],Finance,63000
|
18 |
+
117,Matthew,Taylor,[email protected],Marketing,58000
|
19 |
+
118,Jennifer,Young,[email protected],HR,53000
|
20 |
+
119,Steven,Anderson,[email protected],IT,69000
|
21 |
+
120,Elizabeth,Thomas,[email protected],Sales,61000
|
22 |
+
121,Kevin,Harris,[email protected],Finance,64000
|
23 |
+
122,Deborah,Smith,[email protected],Marketing,59000
|
24 |
+
123,Joseph,Walker,[email protected],HR,54000
|
25 |
+
124,Cynthia,Jackson,[email protected],IT,70000
|
26 |
+
125,Daniel,Hall,[email protected],Sales,62000
|
data/z_house.csv
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
PropertyID,StreetAddress,City,State,ZipCode,NumberOfBedrooms,NumberOfBathrooms,SquareFootage,Price
|
2 |
+
1,123 Main St,Los Angeles,CA,90001,3,2,1800,550000
|
3 |
+
2,456 Elm St,New York,NY,10001,2,1,1200,750000
|
4 |
+
3,789 Oak St,San Francisco,CA,94101,4,3,2500,950000
|
5 |
+
4,101 Maple St,Boston,MA,02101,3,2.5,2000,680000
|
6 |
+
5,202 Pine St,Miami,FL,33101,4,3.5,2700,820000
|
7 |
+
6,303 Cedar St,Chicago,IL,60601,2,1,1100,450000
|
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
-
gradio==3.
|
2 |
gradio_client==0.5.0
|
3 |
numpy==1.25.2
|
4 |
-
pandas==2.0.3
|
|
|
|
|
|
1 |
+
gradio==3.40.1
|
2 |
gradio_client==0.5.0
|
3 |
numpy==1.25.2
|
4 |
+
pandas==2.0.3
|
5 |
+
spacy
|
6 |
+
genbit
|
scripts/.keep
DELETED
File without changes
|
scripts/genbit_metrics.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from genbit.genbit_metrics import GenBitMetrics
|
2 |
+
import pandas as pd
|
3 |
+
from utils.read_config import get_args
|
4 |
+
from utils.load_csv import load_sample
|
5 |
+
|
6 |
+
|
7 |
+
def cal_metrics(dataset):
    """Run GenBit over *dataset* (a list of raw text strings) and return its metrics dict."""
    # Configure the GenBit accumulator.
    gb = GenBitMetrics(language_code="en", context_window=5,
                       distance_weight=0.95, percentile_cutoff=80)

    # Feed the raw (untokenized) text in.
    gb.add_data(dataset, tokenized=False)

    # Full statistics and word lists are requested in the output.
    return gb.get_metrics(output_statistics=True, output_word_list=True)
|
22 |
+
|
23 |
+
|
24 |
+
# Extract the headline GenBit numbers from the full metrics payload.
def extract_genbit_metris(stats):
    """Return the key GenBit scores from *stats*, stringified for JSON display."""
    top_level_keys = (
        "genbit_score",
        "percentage_of_female_gender_definition_words",
        "percentage_of_male_gender_definition_words",
        "percentage_of_non_binary_gender_definition_words",
        "percentage_of_trans_gender_definition_words",
        "percentage_of_cis_gender_definition_words",
    )
    metrics = {key: str(stats[key]) for key in top_level_keys}
    # The word count lives one level down, under "statistics".
    metrics["num_words_considered"] = str(stats["statistics"]["num_words_considered"])
    return metrics
|
36 |
+
|
37 |
+
def load_dataset_and_get_genbit_metrics(df, sample_method, col_name, num_sample_records):
    """Sample *df*, run GenBit on column *col_name*, and return the extracted metrics."""
    sample_df = load_sample(num_sample_records, sample_method, df, col_name)

    # GenBit consumes a plain list of strings.
    corpus = sample_df[col_name].tolist()

    full_stats = cal_metrics(corpus)
    return extract_genbit_metris(full_stats)
|
scripts/gender_profession_tagging.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import re
|
3 |
+
import spacy
|
4 |
+
from spacy.lang.en import English
|
5 |
+
import time
|
6 |
+
from tqdm import tqdm
|
7 |
+
import multiprocessing.pool
|
8 |
+
|
9 |
+
import warnings
|
10 |
+
warnings.filterwarnings("ignore")
|
11 |
+
from utils.read_config import get_args
|
12 |
+
from utils.load_csv import load_sample
|
13 |
+
|
14 |
+
|
15 |
+
# For sentence split
|
16 |
+
nlp = English()
|
17 |
+
nlp.add_pipe("sentencizer")
|
18 |
+
|
19 |
+
# Sentence-split a text with the spaCy sentencizer pipeline.
def get_split_text(text):
    """Return the list of sentence spans produced by the module-level `nlp` pipeline."""
    return [sent for sent in nlp(text).sents]
|
25 |
+
|
26 |
+
def get_gender_prof_match_details(df_text):
    """Analyze each sentence of *df_text* for pronoun and profession mentions.

    Returns a list of tuples:
    (sentence, male-pronoun matches, female-pronoun matches,
     profession matches, both-match flag "Yes"/"No").
    """
    # Term lists come from the shared config file.
    male_terms = get_args("male_pronoun")
    female_terms = get_args("female_pronoun")
    profession_terms = get_args("professions")

    male_pat, female_pat, prof_pat = get_regex_pattern(
        male_terms, female_terms, profession_terms)

    rows = []
    for sentence in get_split_text(df_text):
        sent_str = str(sentence)

        male_hits = re.findall(male_pat, sent_str)
        female_hits = re.findall(female_pat, sent_str)
        prof_hits = re.findall(prof_pat, sent_str)

        # "Yes" when a profession co-occurs with either kind of pronoun.
        both = "Yes" if prof_hits and (male_hits or female_hits) else "No"

        rows.append((
            sent_str,
            ",".join(male_hits),
            ",".join(female_hits),
            ",".join(prof_hits),
            both,
        ))

    return rows
|
64 |
+
|
65 |
+
# Fan the per-text sentence analysis out over a thread pool.
def call_multiprocessing_pool(df_text):
    """Analyze every text in *df_text* concurrently; return one DataFrame of rows."""
    worker_count = 2000  # pool size kept as originally configured
    pool = multiprocessing.pool.ThreadPool(processes=worker_count)
    per_text_results = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
    pool.close()

    # Each map result is a list of row tuples -- flatten before framing.
    rows = [row for result in per_text_results for row in result]

    headers = ["Split_Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(rows, columns=headers)
|
80 |
+
|
81 |
+
# Summarize the per-sentence match table as count statistics.
def get_statistics(results_df):
    """Return pronoun/profession match counts from *results_df* as a dict of strings."""
    has_male = results_df["Male Pronoun"] != ""
    has_female = results_df["Female Pronoun"] != ""
    has_prof = results_df["Profession"] != ""
    both_yes = results_df["Both Match"] == "Yes"

    return {
        "total_sentence": str(results_df.shape[0]),
        "both_gender_prof_match": str(results_df[both_yes]["Both Match"].count()),
        "count_male_pronoun": str(results_df[has_male]["Male Pronoun"].count()),
        "count_female_pronoun": str(results_df[has_female]["Female Pronoun"].count()),
        "count_male_pronoun_profession": str(
            results_df[has_male & has_prof]["Male Pronoun"].count()),
        "count_female_pronoun_profession": str(
            results_df[has_female & has_prof]["Female Pronoun"].count()),
    }
|
99 |
+
|
100 |
+
# Build the three word-boundary alternation patterns from term lists.
def get_regex_pattern(male_pronoun, female_pronoun, professions):
    """Return (male, female, profession) regex patterns.

    Professions are lowercased to match the lowercased input text.
    """
    def alternation(terms):
        # \b(term1|term2|...)\b -- whole-word alternation
        return r'\b({})\b'.format("|".join(terms))

    male_pronoun_pat = alternation(male_pronoun)
    female_pronoun_pat = alternation(female_pronoun)
    professions_pat = alternation([prof.lower() for prof in professions])

    return male_pronoun_pat, female_pronoun_pat, professions_pat
|
112 |
+
|
113 |
+
|
114 |
+
def load_dataset_and_analyze_gender_profession(df, sample_method, col_name, num_sample_records):
    """Sample *df*, normalize the text column, and return pronoun/profession statistics."""
    sample_df = load_sample(num_sample_records, sample_method, df, col_name)

    # Matching uses lowercase terms, so normalize the text first.
    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()

    # Run the per-sentence analysis across the thread pool.
    results_df = call_multiprocessing_pool(sample_df[col_name])

    return get_statistics(results_df)
|
scripts/gender_tagging.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Import required libraries
|
2 |
+
import pandas as pd
|
3 |
+
import re
|
4 |
+
from utils.read_config import get_args
|
5 |
+
from utils.load_csv import load_sample
|
6 |
+
|
7 |
+
# Count whole-word occurrences of any configured male term.
def count_male_terms(text, male_terms):
    """Return how many word-boundary matches of *male_terms* appear in *text*."""
    alternation = "|".join(male_terms)
    return len(re.findall(r"\b({})\b".format(alternation), str(text)))
|
13 |
+
|
14 |
+
# Count whole-word occurrences of any configured female term.
def count_female_terms(text, female_terms):
    """Return how many word-boundary matches of *female_terms* appear in *text*."""
    alternation = "|".join(female_terms)
    return len(re.findall(r"\b({})\b".format(alternation), str(text)))
|
20 |
+
|
21 |
+
# Classify a sentence by the balance of male vs. female term counts.
def get_gender_tag(count_m_term, count_f_term):
    """Return the gender-balance category for the given term counts.

    Categories: "No Gender", "Equal Gender", "<side> Positive Gender"
    (50-75% dominance), "<side> Strongly Positive Gender" (>= 75%).
    """
    total = count_m_term + count_f_term
    if total == 0:
        return "No Gender"
    if count_m_term == count_f_term:
        return "Equal Gender"

    if count_m_term > count_f_term:
        share = (count_m_term / total) * 100
        if share >= 75:
            return "Male Strongly Positive Gender"
        if share >= 50:
            return "Male Positive Gender"
    else:
        share = (count_f_term / total) * 100
        if share >= 75:
            return "Female Strongly Positive Gender"
        if share >= 50:
            return "Female Positive Gender"

    return ''
|
45 |
+
|
46 |
+
|
47 |
+
# Aggregate per-row gender tags into PG/SPG count statistics.
def get_pg_spg(sample_df):
    """Return counts of each gender_cat value in *sample_df* as a dict of strings.

    Bug fix: the female SPG bucket previously matched the misspelled tag
    "Female Stronly Positive Gender", which get_gender_tag never emits,
    so "female spg" was always "0". It now matches the real tag.
    """
    cat = sample_df["gender_cat"]

    return {
        "gender": str(cat[cat != "No Gender"].count()),
        "no gender": str(cat[cat == "No Gender"].count()),
        "equal gender": str(cat[cat == "Equal Gender"].count()),
        "female pg": str(cat[cat == "Female Positive Gender"].count()),
        "male pg": str(cat[cat == "Male Positive Gender"].count()),
        "female spg": str(cat[cat == "Female Strongly Positive Gender"].count()),
        "male spg": str(cat[cat == "Male Strongly Positive Gender"].count()),
    }
|
69 |
+
|
70 |
+
# End-to-end driver: sample, tag each row's text, and summarize.
def load_dataset_and_analyze_gender_tag(df, sample_method, col_name, num_sample_records):
    """Sample *df*, tag each row by gender-term balance, and return PG/SPG stats.

    Bug fix: the female-term count previously applied count_female_terms to the
    whole row (``x[:]``) instead of the text column, so female counts were
    computed over the row's string representation (index labels included),
    skewing every tag. Both counts now run on the text column only.
    """
    # Term lists from the shared config file.
    male_terms = get_args("male_terms")
    female_terms = get_args("female_terms")

    sample_df = load_sample(num_sample_records, sample_method, df, col_name)

    # Term matching is lowercase, so normalize the text first.
    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()

    # Per-row male/female term counts over the text column.
    sample_df['count_male_term'] = sample_df[col_name].apply(
        lambda text: count_male_terms(text, male_terms))
    sample_df['count_female_term'] = sample_df[col_name].apply(
        lambda text: count_female_terms(text, female_terms))

    # Map the counts to a gender-balance category per row.
    sample_df['gender_cat'] = sample_df.apply(
        lambda row: get_gender_tag(row['count_male_term'], row['count_female_term']),
        axis=1)

    # Summarize the tags.
    return get_pg_spg(sample_df)
|
91 |
+
|
92 |
+
|
93 |
+
|
utils/config.json
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"first_records" : 2000,
|
3 |
+
"random_seed" : 42,
|
4 |
+
"male_terms" : ["man", "boy", "male", "he", "son", "his", "himself", "guy", "father", "john"],
|
5 |
+
"female_terms" : ["woman", "girl", "female", "she", "daughter", "her", "herself", "gal", "mother", "mary"],
|
6 |
+
"male_pronoun" : ["he", "him", "his"],
|
7 |
+
"female_pronoun" : ["she", "her", "hers"],
|
8 |
+
"professions" : ["Accountant",
|
9 |
+
"Actor",
|
10 |
+
"Actress",
|
11 |
+
"Aerospace Engineer",
|
12 |
+
"Agricultural Scientist",
|
13 |
+
"Air Traffic Controller",
|
14 |
+
"Aircraft Mechanic",
|
15 |
+
"Animator",
|
16 |
+
"Architect",
|
17 |
+
"Art Director",
|
18 |
+
"Attorney",
|
19 |
+
"Lawyer",
|
20 |
+
"Audiologist",
|
21 |
+
"Author",
|
22 |
+
"Writer",
|
23 |
+
"Baker",
|
24 |
+
"Barber",
|
25 |
+
"Hairdresser",
|
26 |
+
"Bartender",
|
27 |
+
"Biomedical Engineer",
|
28 |
+
"Botanist",
|
29 |
+
"Broadcast Journalist",
|
30 |
+
"Business Analyst",
|
31 |
+
"Carpenter",
|
32 |
+
"Chef",
|
33 |
+
"Cook",
|
34 |
+
"Chemist",
|
35 |
+
"Civil Engineer",
|
36 |
+
"Clinical Psychologist",
|
37 |
+
"Commercial Diver",
|
38 |
+
"Computer Programmer",
|
39 |
+
"Construction Worker",
|
40 |
+
"Corporate Trainer",
|
41 |
+
"Cosmetologist",
|
42 |
+
"Counselor",
|
43 |
+
"Therapist",
|
44 |
+
"Court Reporter",
|
45 |
+
"Creative Director",
|
46 |
+
"Criminologist",
|
47 |
+
"Customer Service Representative",
|
48 |
+
"Data Analyst",
|
49 |
+
"Dental Assistant",
|
50 |
+
"Dentist",
|
51 |
+
"Dermatologist",
|
52 |
+
"Dietician",
|
53 |
+
"Nutritionist",
|
54 |
+
"Doctor",
|
55 |
+
"Physician",
|
56 |
+
"Economist",
|
57 |
+
"Electrician",
|
58 |
+
"Elementary School Teacher",
|
59 |
+
"Emergency Medical Technician",
|
60 |
+
"Engineer",
|
61 |
+
"Environmental Scientist",
|
62 |
+
"Event Planner",
|
63 |
+
"Fashion Designer",
|
64 |
+
"Film Director",
|
65 |
+
"Financial Analyst",
|
66 |
+
"Firefighter",
|
67 |
+
"Fisherman",
|
68 |
+
"Fitness Trainer",
|
69 |
+
"Flight Attendant",
|
70 |
+
"Florist",
|
71 |
+
"Food Scientist",
|
72 |
+
"Forensic Scientist",
|
73 |
+
"Furniture Maker",
|
74 |
+
"Game Developer",
|
75 |
+
"Gardener",
|
76 |
+
"Landscaper",
|
77 |
+
"Geologist",
|
78 |
+
"Graphic Designer",
|
79 |
+
"Hair Stylist",
|
80 |
+
"Historian",
|
81 |
+
"Home Health Aide",
|
82 |
+
"Hotel Manager",
|
83 |
+
"Human Resources Manager",
|
84 |
+
"Immigration Lawyer",
|
85 |
+
"Industrial Designer",
|
86 |
+
"Insurance Agent",
|
87 |
+
"Interior Designer",
|
88 |
+
"Interpreter",
|
89 |
+
"Translator",
|
90 |
+
"Investment Banker",
|
91 |
+
"IT Specialist",
|
92 |
+
"Journalist",
|
93 |
+
"Judge",
|
94 |
+
"Kindergarten Teacher",
|
95 |
+
"Land Surveyor",
|
96 |
+
"Landscape Architect",
|
97 |
+
"Lawyer",
|
98 |
+
"Attorney",
|
99 |
+
"Librarian",
|
100 |
+
"Life Coach",
|
101 |
+
"Linguist",
|
102 |
+
"Makeup Artist",
|
103 |
+
"Management Consultant",
|
104 |
+
"Manufacturing Engineer",
|
105 |
+
"Marine Biologist",
|
106 |
+
"Marketing Manager",
|
107 |
+
"Massage Therapist",
|
108 |
+
"Mechanical Engineer",
|
109 |
+
"Medical Assistant",
|
110 |
+
"Medical Researcher",
|
111 |
+
"Meteorologist",
|
112 |
+
"Midwife",
|
113 |
+
"Military Officer",
|
114 |
+
"Music Producer",
|
115 |
+
"Musician",
|
116 |
+
"Nurse",
|
117 |
+
"Occupational Therapist",
|
118 |
+
"Optician",
|
119 |
+
"Optometrist",
|
120 |
+
"Paralegal",
|
121 |
+
"Paramedic",
|
122 |
+
"Patent Attorney",
|
123 |
+
"Pediatrician",
|
124 |
+
"Personal Trainer",
|
125 |
+
"Petroleum Engineer",
|
126 |
+
"Pharmacist",
|
127 |
+
"Photographer",
|
128 |
+
"Physical Therapist",
|
129 |
+
"Physician Assistant",
|
130 |
+
"Pilot",
|
131 |
+
"Plumber",
|
132 |
+
"Police Officer",
|
133 |
+
"Political Scientist",
|
134 |
+
"Preschool Teacher",
|
135 |
+
"Private Investigator",
|
136 |
+
"Product Manager",
|
137 |
+
"Professor",
|
138 |
+
"Lecturer",
|
139 |
+
"Programmer",
|
140 |
+
"Psychiatrist",
|
141 |
+
"Psychologist",
|
142 |
+
"Public Relations Specialist",
|
143 |
+
"Public School Teacher",
|
144 |
+
"Real Estate Agent",
|
145 |
+
"Broker",
|
146 |
+
"Receptionist",
|
147 |
+
"Registered Nurse",
|
148 |
+
"Reporter",
|
149 |
+
"Restaurant Manager",
|
150 |
+
"Sales Representative",
|
151 |
+
"School Counselor",
|
152 |
+
"Scientist",
|
153 |
+
"Screenwriter",
|
154 |
+
"Social Media Manager",
|
155 |
+
"Social Worker",
|
156 |
+
"Software Developer",
|
157 |
+
"Speech-Language Pathologist",
|
158 |
+
"Sports Coach",
|
159 |
+
"Statistician"]
|
160 |
+
}
|
utils/load_csv.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from utils.read_config import get_args
|
3 |
+
def check_csv(upload_file):
    """Parse *upload_file* (path or file-like object) into a DataFrame."""
    return pd.read_csv(upload_file)
|
6 |
+
|
7 |
+
# Take a capped sample of one column of the dataset.
def load_sample(num_sample_records, sample_method, df, col_name):
    """Return a sample of *df* restricted to *col_name*.

    The requested size is capped by the configured "first_records" limit.
    *sample_method* selects the strategy: "First", "Last", or "Random"
    (seeded from the configured "random_seed").
    """
    cap = get_args("first_records")
    seed = get_args("random_seed")

    # Never sample more than the configured cap.
    n = min(num_sample_records, cap)

    # Keep only the column under analysis.
    df = df[[col_name]]
    if sample_method == "First":
        df = df.iloc[:n].copy().reset_index()
    if sample_method == "Last":
        df = df.iloc[-n:].copy().reset_index()
    if sample_method == "Random":
        df = df.sample(n, random_state=seed).copy().reset_index()
    return df
|
utils/read_config.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
def read_config_file():
    """Load and return the analysis configuration from utils/config.json."""
    # NOTE(review): path is relative to the process working directory --
    # presumably the app is always launched from the repo root; confirm.
    with open("utils/config.json", "r") as jsonfile:
        return json.load(jsonfile)
|
7 |
+
|
8 |
+
def get_args(args):
    """Return the config value stored under key *args*.

    Raises RuntimeError when the config file cannot be read or parsed,
    and KeyError when the key is missing.

    Bug fix: the original ``raise "Could not read config file."`` raised a
    string literal, which is itself a TypeError in Python 3 (exceptions must
    derive from BaseException); the bare ``except:`` also swallowed the real
    cause. A real exception is raised now, chained to the underlying error.
    """
    try:
        data = read_config_file()
    except (OSError, json.JSONDecodeError) as err:
        raise RuntimeError("Could not read config file.") from err
    return data[args]
|