lyangas committed on
Commit
6931ba0
1 Parent(s): 4c9b947

missed files

helpers/__init__.py ADDED
File without changes
helpers/data_processor.py ADDED
@@ -0,0 +1,180 @@
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from random import choices


def log(*args):
    print(*args, flush=True)

def create_group(code):
    """
    Creates the ICD-10 group name from a code by keeping the part before the dot.
    Parameters:
        code (str): string with ICD-10 code name
    Returns:
        group (str): string with ICD-10 group name
    """

    group = code.split('.')[0]
    return group

def df_creation(texts, labels,
                all_classes, prompt_column_name,
                code_column_name, group_column_name):
    """
    Creates a DataFrame from medical reports, their corresponding ICD-10 codes, and class information.

    Parameters:
        texts (List[str]): A list of strings, where each string is a medical report.
        labels (List[int]): A list of label indices into 'all_classes', one for each
                            entry in 'texts'.
        all_classes (List[str]): A list of all ICD-10 code names from the initial dataset.
        prompt_column_name (str): The column name in the DataFrame for the prompts.
        code_column_name (str): The column name in the DataFrame for the codes.
        group_column_name (str): The column name in the DataFrame for the groups.

    Returns:
        pandas.DataFrame: A DataFrame where each row contains the text of the report,
                          its corresponding ICD-10 code, and the group category derived
                          from the code.
    """

    df = pd.DataFrame()
    df[prompt_column_name] = texts
    df[code_column_name] = [all_classes[c] for c in labels]
    df[group_column_name] = [all_classes[c].split('.')[0] for c in labels]
    return df

def select_random_rows(df_test, balance_column, random_n):
    """
    Selects a random, balanced subset of rows from a DataFrame based on a specified column.

    This function aims to create a balanced DataFrame by randomly selecting (with replacement)
    a specified number of rows for each unique value in the balance column. It is particularly
    useful when you need a balanced sample from a dataset for testing or validation purposes.

    Parameters:
        df_test (pandas.DataFrame): The DataFrame to select rows from.
        balance_column (str): The name of the column used to balance the data. The function will
                              select rows such that each unique value in this column is equally represented.
        random_n (int): The number of rows to select for each unique value in the balance column.

    Returns:
        pandas.DataFrame: A new DataFrame containing a balanced, random subset of rows.
    """

    classes = df_test[balance_column].unique()
    balanced_data = []
    for class_name in classes:
        balanced_data += choices(df_test[df_test[balance_column]==class_name].to_dict('records'), k=random_n)

    df = pd.DataFrame(balanced_data)
    return df

def extract_valuable_data(path_to_raw_csv, prompt_column_name,
                          code_column_name, path_to_processed_csv, min_text_len, min_samples_per_cls):
    """
    Extracts and processes valuable data from a raw CSV file based on specified criteria.

    This function loads data from a CSV file, filters out rows with missing values in the specified
    columns, removes codes with a low number of associated prompts, filters by prompt length, creates
    a new 'group' column, and saves the processed data to a new CSV file.

    Parameters:
        path_to_raw_csv (str): The file path to the raw CSV data file.
        prompt_column_name (str): The column name in the CSV file for prompts.
        code_column_name (str): The column name in the CSV file for codes.
        path_to_processed_csv (str): The file path where the processed CSV data will be saved.
        min_text_len (int): The minimum number of characters a prompt must have to be kept.
        min_samples_per_cls (int): The minimum number of prompts a code must have to be kept.

    Returns:
        pandas.DataFrame: A DataFrame containing the processed dataset.
    """

    df = pd.read_csv(path_to_raw_csv)
    log(path_to_raw_csv, prompt_column_name, code_column_name, path_to_processed_csv, min_text_len, min_samples_per_cls)

    df = df[df[prompt_column_name].notna() & df[code_column_name].notna()]
    log(f"New data is loaded. New data has {len(df)} reports.")
    log(f"New data contains {len(df['code'].unique())} unique codes.")

    # Keep only codes that have more than min_samples_per_cls prompts.
    unique_values = df['code'].value_counts()
    values_to_remove = unique_values[unique_values <= min_samples_per_cls].index
    df = df[~df['code'].isin(values_to_remove)]

    # Keep only prompts that are longer than min_text_len characters.
    df = df[df[prompt_column_name].str.len() >= min_text_len]

    # Creating GROUP column in dataset
    df['group'] = df['code'].apply(create_group)

    log(f"New data is processed. Processed data has {len(df)} reports.")
    log(f"Processed dataset contains {len(df['code'].unique())} codes.")
    log(f"Processed dataset contains {len(df['group'].unique())} groups.")

    # Saving processed dataset
    df.to_csv(path_to_processed_csv, index=False)
    log(f"Processed dataset is saved to {path_to_processed_csv}.")
    return df


def balance_data(df, prompt_column_name, code_column_name,
                 group_column_name, random_n, test_size, path_to_train_csv,
                 path_to_csv_test_codes, path_to_csv_test_groups):
    """
    Balances and splits a dataset into training and test sets, then saves these sets to CSV files.

    This function takes a DataFrame and performs stratified splitting based on the specified 'code_column_name'
    to create balanced training and test datasets. It then saves the training dataset and two versions of
    the test dataset (one for codes and one for groups) to separate CSV files.

    Parameters:
        df (pandas.DataFrame): The DataFrame to be processed and split.
        prompt_column_name (str): The column name in the DataFrame for the prompts.
        code_column_name (str): The column name in the DataFrame for the codes.
        group_column_name (str): The column name in the DataFrame for the groups.
        random_n (int): The number of rows to be randomly selected in test datasets for each unique code or group.
        test_size (float): The proportion of the dataset to include in the test split.
        path_to_train_csv (str): The file path where the training dataset CSV will be saved.
        path_to_csv_test_codes (str): The file path where the test dataset for codes CSV will be saved.
        path_to_csv_test_groups (str): The file path where the test dataset for groups CSV will be saved.

    Returns:
        None
    """

    texts = np.array(df[prompt_column_name])
    labels = np.array(df[code_column_name])
    groups = np.array(df[group_column_name])

    all_classes = np.unique(labels).tolist()
    labels = [all_classes.index(l) for l in labels]
    log('='*50)
    log(f"texts={len(texts)} labels={len(labels)} uniq_labels={len(np.unique(labels))} test_size={test_size}")
    log('='*50)
    texts_train, texts_test, labels_train, labels_test = train_test_split(
        texts, labels, test_size=test_size, random_state=42, stratify=labels
    )

    log(f"Train dataset len={len(texts_train)}")
    log(f"Test dataset len={len(texts_test)}")
    log(f"Count of classes={len(np.unique(labels))}")

    # Creating TRAIN and TEST dataset
    df_train = df_creation(texts_train, labels_train, all_classes,
                           prompt_column_name, code_column_name, group_column_name)
    df_train.to_csv(path_to_train_csv, index=False)
    log(f"TRAIN dataset is saved to {path_to_train_csv}")

    # Creating test datasets for codes and groups
    df_test = df_creation(texts_test, labels_test, all_classes,
                          prompt_column_name, code_column_name, group_column_name)

    df_test_codes = df_test  # select_random_rows(df_test, code_column_name, random_n)
    df_test_codes.to_csv(path_to_csv_test_codes, index=False)
    log(f"TEST dataset for codes is saved to {path_to_csv_test_codes}")

    df_test_groups = df_test  # select_random_rows(df_test, group_column_name, random_n)
    df_test_groups.to_csv(path_to_csv_test_groups, index=False)
    log(f"TEST dataset for groups is saved to {path_to_csv_test_groups}")
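
A minimal usage sketch of the pipeline above; the file paths, the 'report_text' column name, and the thresholds are hypothetical values chosen only to illustrate the call order (the code column currently must be named 'code', since extract_valuable_data reads df['code'] directly):

from helpers.data_processor import extract_valuable_data, balance_data

# Hypothetical paths and parameters, for illustration only.
df = extract_valuable_data(
    path_to_raw_csv="data/raw_reports.csv",
    prompt_column_name="report_text",
    code_column_name="code",
    path_to_processed_csv="data/processed.csv",
    min_text_len=100,          # drop very short reports
    min_samples_per_cls=10,    # drop rare ICD-10 codes
)

balance_data(
    df,
    prompt_column_name="report_text",
    code_column_name="code",
    group_column_name="group",
    random_n=50,               # kept for API compatibility; subsampling is currently commented out
    test_size=0.2,
    path_to_train_csv="data/train.csv",
    path_to_csv_test_codes="data/test_codes.csv",
    path_to_csv_test_groups="data/test_groups.csv",
)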
helpers/firebase.py ADDED
@@ -0,0 +1,148 @@
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore


class FirebaseClient:
    def __init__(self, path_to_certificate):
        # Initialize Firebase Admin SDK
        cred = credentials.Certificate(path_to_certificate)  # Path to your service account key JSON file
        firebase_admin.initialize_app(cred)

        # Initialize Firestore database
        self.db = firestore.client()

    def add_task(self, task_data):
        """
        Add a new task to Firestore.

        Args:
            task_data (dict): Dictionary containing task data.
                Example: {'title': 'Task Title', 'description': 'Task Description', 'status': 'pending'}
        """
        # Add task data to Firestore
        doc_ref = self.db.collection('tasks').document()
        doc_ref.set(task_data)
        return doc_ref.id

    def get_task_by_status(self, status):
        # Reference to the tasks collection
        tasks_ref = self.db.collection('tasks')

        # Query tasks with the given status
        query = tasks_ref.where('status', '==', status)

        # Get documents that match the query
        pending_tasks = query.stream()

        # Convert documents to dictionaries
        pending_tasks_data = []
        for doc in pending_tasks:
            task_data = doc.to_dict()
            task_data['id'] = doc.id
            pending_tasks_data.append(task_data)

        return pending_tasks_data

    def get_all_tasks(self):
        """
        Retrieve all tasks from Firestore.

        Returns:
            list: A list containing dictionaries, each representing a task.
        """
        # Reference to the 'tasks' collection
        tasks_ref = self.db.collection('tasks')

        # Get all documents in the collection
        docs = tasks_ref.stream()

        # Initialize an empty list to store tasks
        tasks = []

        # Iterate over each document and add it to the tasks list
        for doc in docs:
            doc_dict = doc.to_dict()
            doc_dict['id'] = doc.id
            tasks.append(doc_dict)

        return tasks

    def update(self, task_id, data):
        """
        Update fields of a task document.

        Args:
            task_id (str): ID of the task to be updated.
            data (dict): Dictionary of fields to update on the task document.
        """
        # Reference to the task document
        task_ref = self.db.collection('tasks').document(task_id)

        # Update the task document with the provided data
        task_ref.update(data)

    def delete_task(self, task_id):
        """
        Delete a task from Firestore by its ID.

        Args:
            task_id (str): ID of the task to be deleted.
        """
        # Reference to the task document
        task_ref = self.db.collection('tasks').document(task_id)

        # Delete the task document
        task_ref.delete()

    def get_task_by_id(self, task_id):
        """
        Retrieve a task from Firestore by its ID.

        Args:
            task_id (str): ID of the task to be retrieved.

        Returns:
            dict or None: Dictionary containing the task data if found, None otherwise.
        """
        # Reference to the task document
        task_ref = self.db.collection('tasks').document(task_id)

        # Retrieve the task document
        task_doc = task_ref.get()

        # Check if the task document exists
        if task_doc.exists:
            return task_doc.to_dict()
        else:
            return None

    def find_tasks_by_status(self, status):
        """
        Find all tasks in Firestore with the specified status.

        Args:
            status (str): Status value to filter tasks by.

        Returns:
            list: List of dictionaries containing task data.
        """
        # Reference to the 'tasks' collection
        tasks_ref = self.db.collection('tasks')

        # Query tasks with the specified status
        query = tasks_ref.where('status', '==', status)

        # Get documents that match the query
        docs = query.stream()

        # Initialize an empty list to store tasks
        tasks = []

        # Iterate over each document and add it to the tasks list
        for doc in docs:
            task = doc.to_dict()
            task['id'] = doc.id
            tasks.append(task)

        return tasks
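
A short usage sketch for FirebaseClient; the credential path and task fields below are hypothetical and only illustrate the intended call sequence:

from helpers.firebase import FirebaseClient

# Hypothetical service-account key path.
client = FirebaseClient("secrets/service_account.json")

# Create a task and remember its document ID.
task_id = client.add_task({"title": "Train embedder", "status": "pending"})

# Move it to 'in_progress', then inspect it.
client.update(task_id, {"status": "in_progress"})
print(client.get_task_by_id(task_id))

# List everything still pending, then clean up.
print(client.find_tasks_by_status("pending"))
client.delete_task(task_id)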
helpers/gcloud.py ADDED
@@ -0,0 +1,98 @@
import os
from google.cloud import storage
from tqdm import tqdm
from googleapiclient import discovery
import requests


service = discovery.build('compute', 'v1')
storage_client = storage.Client()

def download_csv_from_gcloud(bucket_name, object_name, destination_file_path):
    """Download a file from Google Cloud Storage."""

    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(object_name)

    # Download the file to a local path
    blob.download_to_filename(destination_file_path)
    print(f"File {object_name} downloaded to {destination_file_path}")

def upload_folder_to_gcloud(bucket_name, source_folder_path, destination_folder_name):
    """Uploads all files in a folder to the Google Cloud Storage bucket."""
    # Instantiates a client
    # storage_client = storage.Client()

    # Gets the bucket
    print(f"bucket_name={bucket_name}, source_folder_path={source_folder_path}, destination_folder_name={destination_folder_name}", flush=True)
    bucket = storage_client.bucket(bucket_name)

    # Walk through the folder and upload each file
    for root, _, files in os.walk(source_folder_path):
        for file_name in files:
            # Construct the local file path
            local_file_path = os.path.join(root, file_name)

            # Construct the destination blob name
            destination_blob_name = os.path.join(destination_folder_name, os.path.relpath(local_file_path, source_folder_path))
            print(f"destination_blob_name={destination_blob_name}")
            # Upload the file
            blob = bucket.blob(destination_blob_name)
            blob.upload_from_filename(local_file_path)

            print(f"File {local_file_path} uploaded to {destination_blob_name}.")


def download_folder(bucket_name, folder_name, destination_directory):
    """
    Download the contents of a folder from a Google Cloud Storage bucket to a local directory.

    Args:
        bucket_name (str): Name of the Google Cloud Storage bucket.
        folder_name (str): Name of the folder in the bucket to download.
        destination_directory (str): Local directory to save the downloaded files.
    """

    # Get the bucket
    bucket = storage_client.get_bucket(bucket_name)

    # List objects in the folder
    blobs = bucket.list_blobs(prefix=folder_name)

    # Ensure destination directory exists
    os.makedirs(destination_directory, exist_ok=True)

    # Iterate over each object in the folder
    for blob in tqdm(blobs, desc=f'Downloading {folder_name}'):
        # Determine local file path
        local_file_path = os.path.join(destination_directory, os.path.relpath(blob.name, folder_name))

        # Ensure local directory exists
        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

        # Download the object to a local file
        blob.download_to_filename(local_file_path)


def start_vm(project, zone, instance):
    request = service.instances().start(project=project, zone=zone, instance=instance)
    response = request.execute()
    return response

def stop_vm(project, zone, instance):
    request = service.instances().stop(project=project, zone=zone, instance=instance)
    response = request.execute()
    return response

def get_current_instance_name():
    # URL for the metadata server
    METADATA_URL = "http://metadata.google.internal/computeMetadata/v1/instance/name"
    HEADERS = {"Metadata-Flavor": "Google"}
    try:
        response = requests.get(METADATA_URL, headers=HEADERS)
        response.raise_for_status()  # Raise an error for bad status codes
        instance_name = response.text
        return instance_name
    except requests.exceptions.RequestException as e:
        print(f"Error fetching instance name: {e}")
        return None
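
A brief usage sketch for these helpers; the bucket, object, project, zone, and folder names are placeholders, not values taken from this repository:

from helpers import gcloud

# Pull the raw dataset and a saved model folder from a (hypothetical) bucket.
gcloud.download_csv_from_gcloud("my-bucket", "datasets/raw_reports.csv", "data/raw_reports.csv")
gcloud.download_folder("my-bucket", "models/bert_embedder", "models/bert_embedder")

# Push training outputs back, then stop the worker VM from inside itself.
gcloud.upload_folder_to_gcloud("my-bucket", "output/", "experiments/run_001")
instance = gcloud.get_current_instance_name()
if instance is not None:
    gcloud.stop_vm(project="my-project", zone="us-central1-a", instance=instance)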
helpers/required_classes.py ADDED
@@ -0,0 +1,177 @@
import numpy as np
from typing import List
import pandas as pd
import torch
import xgboost as xgb
from transformers import AutoTokenizer, BertForSequenceClassification
from tqdm import tqdm


class BertEmbedder:
    def __init__(self, tokenizer_path: str, model_path: str, cut_head: bool = False):
        """
        cut_head = True if the model has a classifier head
        """
        self.embedder = BertForSequenceClassification.from_pretrained(model_path)
        self.max_length = self.embedder.config.max_position_embeddings
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, max_length=self.max_length)

        if cut_head:
            self.embedder = self.embedder.bert

        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        print(f"Used device for BERT: {self.device}", flush=True)
        self.embedder.to(self.device)

    def __call__(self, text: str):
        encoded_input = self.tokenizer(text,
                                       return_tensors='pt',
                                       max_length=self.max_length,
                                       padding=True,
                                       truncation=True).to(self.device)
        model_output = self.embedder(**encoded_input)
        text_embed = model_output.pooler_output[0].cpu()
        return text_embed

    def batch_predict(self, texts: List[str]):
        encoded_input = self.tokenizer(texts,
                                       return_tensors='pt',
                                       max_length=self.max_length,
                                       padding=True,
                                       truncation=True).to(self.device)
        model_output = self.embedder(**encoded_input)
        texts_embeds = model_output.pooler_output.cpu()
        return texts_embeds


class PredictModel:
    def __init__(self, embedder, classifier_code, classifier_group, batch_size=8):
        self.batch_size = batch_size
        self.embedder = embedder
        self.classifier_code = classifier_code
        self.classifier_group = classifier_group

    def _texts2vecs(self, texts, logging=False):
        embeds = []
        batches_texts = np.array_split(texts, len(texts) // self.batch_size)
        if logging:
            iterator = tqdm(batches_texts)
        else:
            iterator = batches_texts
        for batch_texts in iterator:
            batch_texts = batch_texts.tolist()
            embeds += self.embedder.batch_predict(batch_texts).tolist()
        embeds = np.array(embeds)
        return embeds

    def fit(self, texts: List[str], labels: List[str], logging: bool = False):
        if logging:
            print('Start text2vec transform')
        embeds = self._texts2vecs(texts, logging)
        if logging:
            print('Start codes-classifier fitting')
        self.classifier_code.fit(embeds, labels)
        labels = [l.split('.')[0] for l in labels]
        if logging:
            print('Start groups-classifier fitting')
        self.classifier_group.fit(embeds, labels)

    def predict_code(self, texts: List[str], log: bool = False):
        if log:
            print('Start text2vec transform')
        embeds = self._texts2vecs(texts, log)
        if log:
            print('Start classifier prediction')
        prediction = self.classifier_code.predict(embeds)
        return prediction

    def predict_group(self, texts: List[str], logging: bool = False):
        if logging:
            print('Start text2vec transform')
        embeds = self._texts2vecs(texts, logging)
        if logging:
            print('Start classifier prediction')
        prediction = self.classifier_group.predict(embeds)
        return prediction

class CustomXGBoost:
    def __init__(self, use_gpu):
        if use_gpu:
            self.model = xgb.XGBClassifier(tree_method="gpu_hist")
        else:
            self.model = xgb.XGBClassifier()
        self.classes_ = None

    def fit(self, X, y, **kwargs):
        self.classes_ = np.unique(y).tolist()
        y = [self.classes_.index(l) for l in y]
        self.model.fit(X, y, **kwargs)

    def predict_proba(self, X):
        pred = self.model.predict_proba(X)
        return pred

    def predict(self, X):
        preds = self.model.predict_proba(X)
        return np.array([self.classes_[p] for p in np.argmax(preds, axis=1)])

class SimpleModel:
    def __init__(self):
        self.classes_ = None

    def fit(self, X, y):
        print(y[0])
        self.classes_ = [y[0]]

    def predict_proba(self, X):
        return np.array([[1.0]] * len(X))

def balance_dataset(labels_train_for_group, vecs_train_for_group, balance=None, logging=True):
    if balance == 'remove':
        # Undersample: cut every class down to the size of the smallest class.
        min_len = -1
        for code_l in np.unique(labels_train_for_group):
            cur_len = sum(labels_train_for_group == code_l)
            if logging:
                print(code_l, cur_len)
            if min_len > cur_len or min_len == -1:
                min_len = cur_len
        if logging:
            print('min_len is', min_len)
        df_train_group = pd.DataFrame()
        df_train_group['labels'] = labels_train_for_group
        df_train_group['vecs'] = vecs_train_for_group.tolist()
        df_train_group = df_train_group.groupby('labels', as_index=False).apply(lambda array: array.loc[np.random.choice(array.index, min_len, False), :])
        labels_train_for_group = df_train_group['labels'].values
        vecs_train_for_group = [np.array(v) for v in df_train_group['vecs'].values]

    elif balance == 'duplicate':
        # Oversample: repeat every class until it matches the size of the largest class.
        df_train_group = pd.DataFrame()
        df_train_group['labels'] = labels_train_for_group
        df_train_group['vecs'] = vecs_train_for_group.tolist()
        max_len = 0
        for code_data in df_train_group.groupby('labels'):
            cur_len = len(code_data[1])
            if logging:
                print(code_data[0], cur_len)
            if max_len < cur_len:
                max_len = cur_len
        if logging:
            print('max_len is ', max_len)
        labels_train_for_group = []
        vecs_train_for_group = []
        for code_data in df_train_group.groupby('labels'):
            cur_len = len(code_data[1])
            cur_labels = code_data[1]['labels'].values.tolist()
            cur_vecs = code_data[1]['vecs'].values.tolist()
            while cur_len < max_len:
                cur_len *= 2
                cur_labels += cur_labels
                cur_vecs += cur_vecs
            cur_labels = cur_labels[:max_len]
            cur_vecs = cur_vecs[:max_len]
            labels_train_for_group += cur_labels
            vecs_train_for_group += cur_vecs

    labels_train_for_group = np.array(labels_train_for_group)
    vecs_train_for_group = np.array(vecs_train_for_group)
    return labels_train_for_group, vecs_train_for_group
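
A minimal sketch of how these classes compose, assuming a locally saved BERT checkpoint (the path is hypothetical) and toy inputs; cut_head=True is needed so pooler_output is available, and inputs must contain at least batch_size texts because _texts2vecs splits by batch count:

from helpers.required_classes import BertEmbedder, CustomXGBoost, PredictModel

# Hypothetical checkpoint path; cut_head=True keeps only the BERT encoder.
embedder = BertEmbedder("models/bert_embedder", "models/bert_embedder", cut_head=True)

model = PredictModel(
    embedder=embedder,
    classifier_code=CustomXGBoost(use_gpu=False),
    classifier_group=CustomXGBoost(use_gpu=False),
    batch_size=8,
)

# Toy training data: texts paired with ICD-10 codes.
texts = ["chest pain at rest"] * 8 + ["wheezing and cough"] * 8
labels = ["I20.0"] * 8 + ["J45.9"] * 8
model.fit(texts, labels, logging=True)

queries = ["wheezing after exercise"] * 8
print(model.predict_code(queries)[0])   # e.g. 'J45.9'
print(model.predict_group(queries)[0])  # e.g. 'J45'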
helpers/trainer_classifiers.py ADDED
@@ -0,0 +1,240 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, fbeta_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_sample_weight
import pickle as pkl
from tqdm import tqdm
import time
import os
import shutil
import json
from copy import deepcopy

from helpers.required_classes import *


def log(*args):
    print(*args, flush=True)

def train_code_classifier(vecs_train_codes, vecs_test_for_groups,
                          labels_train_codes, labels_test_groups_codes, labels_test_groups_groups,
                          labels_train_groups,
                          models_folder, group_name, balance=None, logging=True, use_gpu=True):
    """
    balance - the dataset balancing strategy:
        remove - remove items per class until every class has as many texts as the smallest class
        duplicate - duplicate items per class until every class has as many texts as the largest class
        weight - train the model with balanced sample weights
        None - no balancing
    """

    log(f"training model for codes classifiers in group {group_name}")

    # create / remove folder
    experiment_path = f"{models_folder}/{group_name}"
    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path, exist_ok=True)
    else:
        shutil.rmtree(experiment_path)
        os.makedirs(experiment_path, exist_ok=True)

    labels_train_for_group = labels_train_codes[labels_train_groups == group_name]
    if logging:
        log(f"e.g. labels in the group: {labels_train_for_group[:3]} cnt of codes: {len(np.unique(labels_train_for_group))} cnt of texts: {len(labels_train_for_group)}")

    # prepare train labels
    if len(np.unique(labels_train_for_group)) < 2:
        # the group has only one code inside
        code_name = labels_train_for_group[0]
        if logging:
            log(f'group {group_name} has only one code inside: {code_name}')
        simple_clf = SimpleModel()
        simple_clf.fit([], [code_name])
        pkl.dump(simple_clf, open(f"{experiment_path}/{group_name}_code_clf.pkl", 'wb'))
        return {"f1_score": 'one_cls', "accuracy": 'one_cls'}

    sample_weights = compute_sample_weight(
        class_weight='balanced',
        y=labels_train_for_group
    )

    # prepare other data
    vecs_train_for_group = vecs_train_codes[labels_train_groups == group_name]
    vecs_test_for_group = vecs_test_for_groups[labels_test_groups_groups == group_name]
    labels_test_for_group = labels_test_groups_codes[labels_test_groups_groups == group_name]

    labels_train_for_group, vecs_train_for_group = balance_dataset(
        labels_train_for_group, vecs_train_for_group, balance=balance
    )

    fit_start_time = time.time()
    model = CustomXGBoost(use_gpu)

    if balance == 'weight':
        model.fit(vecs_train_for_group, labels_train_for_group, sample_weight=sample_weights)
    else:
        model.fit(vecs_train_for_group, labels_train_for_group)

    pkl.dump(model, open(f"{experiment_path}/{group_name}_code_clf.pkl", 'wb'))
    if logging:
        log(f'Trained in {time.time() - fit_start_time}s')

    pred_start_time = time.time()
    predictions_group = model.predict(vecs_test_for_group)
    scores = {
        "f1_score": fbeta_score(labels_test_for_group, predictions_group, beta=1, average='macro'),
        "accuracy": accuracy_score(labels_test_for_group, predictions_group)
    }
    if logging:
        log(scores, f'Predicted in {time.time() - pred_start_time}s')
    with open(f"{experiment_path}/{group_name}_scores.json", 'w') as f:
        f.write(json.dumps(scores))

    conf_matrix = confusion_matrix(labels_test_for_group, predictions_group)
    disp_code = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
                                       display_labels=model.classes_, )
    fig, ax = plt.subplots(figsize=(5, 5))
    disp_code.plot(ax=ax)
    plt.xticks(rotation=90)
    plt.savefig(f"{experiment_path}/{group_name}_matrix.png")

    return scores

def train_codes_for_groups(vecs_train_codes, vecs_test_groups,
                           labels_train_codes, labels_test_groups_codes, labels_test_groups_groups,
                           labels_train_groups,
                           output_path, logging, use_gpu=True):
    all_scores = []
    for group_name in tqdm(np.unique(labels_train_groups)):
        row = {'group': group_name}
        for balanced_method in ['weight']:  # [None, 'remove', 'weight', 'duplicate']:
            if logging:
                log('\n', '-' * 50)
            scores = train_code_classifier(vecs_train_codes, vecs_test_groups,
                                           labels_train_codes, labels_test_groups_codes, labels_test_groups_groups,
                                           labels_train_groups,
                                           output_path, group_name, balanced_method, logging, use_gpu)
            scores = {f"{balanced_method}_{k}": v for k, v in scores.items()}
            row.update(scores)
        all_scores.append(row)

    df = pd.DataFrame(all_scores)
    columns = df.columns.tolist()
    columns.remove('group')
    mean_scores = {'group': 'MEAN'}
    for score_name in columns:
        mean_score = df[df[score_name] != 'one_cls'][score_name].mean()
        mean_scores.update({score_name: float(mean_score)})
    df = pd.concat([df, pd.DataFrame([mean_scores])], ignore_index=True)
    return df

def make_experiment_classifier(vecs_train_codes, vecs_test_codes, vecs_test_group,
                               labels_train_codes, labels_test_codes,
                               labels_test_groups, labels_train_groups,
                               sample_weights_codes, sample_weights_groups,
                               texts_test_codes, texts_test_groups,
                               experiment_name, classifier_model_code, classifier_model_group, experiment_path, balance=None):
    # train different models as base model for group and codes

    log(f'Model: {experiment_name}')
    # create / remove experiment folder
    experiment_path = f"{experiment_path}/{experiment_name}"
    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path, exist_ok=True)
    else:
        shutil.rmtree(experiment_path)
        os.makedirs(experiment_path, exist_ok=True)

    # fit the models
    cls_codes = deepcopy(classifier_model_code)
    cls_groups = deepcopy(classifier_model_group)

    labels_train_codes_balanced, vecs_train_codes_balanced = balance_dataset(
        labels_train_codes, vecs_train_codes, balance=balance
    )
    labels_train_groups_balanced, vecs_train_codes_balanced = balance_dataset(
        labels_train_groups, vecs_train_codes, balance=balance
    )

    log('start training base model')
    if balance == 'weight':
        try:
            start_time = time.time()
            cls_codes.fit(vecs_train_codes_balanced, labels_train_codes_balanced, sample_weight=sample_weights_codes)
            log(f'codes classifier trained in {(time.time() - start_time) / 60}m')
            start_time = time.time()
            cls_groups.fit(vecs_train_codes_balanced, labels_train_groups_balanced, sample_weight=sample_weights_groups)
            log(f'groups classifier trained in {(time.time() - start_time) / 60}m')
        except Exception as e:
            log(str(e))
            start_time = time.time()
            cls_codes.fit(vecs_train_codes_balanced, labels_train_codes_balanced)
            log(f'codes classifier trained in {(time.time() - start_time) / 60}m')
            start_time = time.time()
            cls_groups.fit(vecs_train_codes_balanced, labels_train_groups_balanced)
            log(f'groups classifier trained in {(time.time() - start_time) / 60}m')
    else:
        start_time = time.time()
        cls_codes.fit(vecs_train_codes_balanced, labels_train_codes_balanced)
        log(f'codes classifier trained in {(time.time() - start_time) / 60}m')
        start_time = time.time()
        cls_groups.fit(vecs_train_codes_balanced, labels_train_groups_balanced)
        log(f'groups classifier trained in {(time.time() - start_time) / 60}m')

    pkl.dump(cls_codes, open(f"{experiment_path}/{experiment_name}_codes.pkl", 'wb'))
    pkl.dump(cls_groups, open(f"{experiment_path}/{experiment_name}_groups.pkl", 'wb'))

    # inference the model
    predictions_code = cls_codes.predict(vecs_test_codes)
    predictions_group = cls_groups.predict(vecs_test_group)
    scores = {
        "f1_score_code": fbeta_score(labels_test_codes, predictions_code, beta=1, average='macro'),
        "f1_score_group": fbeta_score(labels_test_groups, predictions_group, beta=1, average='macro'),
        "accuracy_code": accuracy_score(labels_test_codes, predictions_code),
        "accuracy_group": accuracy_score(labels_test_groups, predictions_group)
    }
    with open(f"{experiment_path}/{experiment_name}_scores.json", 'w') as f:
        f.write(json.dumps(scores))

    conf_matrix = confusion_matrix(labels_test_codes, predictions_code)
    disp_code = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
                                       display_labels=cls_codes.classes_, )
    fig, ax = plt.subplots(figsize=(20, 20))
    disp_code.plot(ax=ax)
    plt.xticks(rotation=90)
    plt.savefig(f"{experiment_path}/{experiment_name}_codes_matrix.png")

    conf_matrix = confusion_matrix(labels_test_groups, predictions_group)
    disp_group = ConfusionMatrixDisplay(confusion_matrix=conf_matrix,
                                        display_labels=cls_groups.classes_, )

    fig, ax = plt.subplots(figsize=(20, 20))
    disp_group.plot(ax=ax)
    plt.xticks(rotation=90)
    plt.savefig(f"{experiment_path}/{experiment_name}_groups_matrix.png")

    pd.DataFrame({'codes': predictions_code, 'truth': labels_test_codes, 'text': texts_test_codes}).to_csv(f"{experiment_path}/{experiment_name}_pred_codes.csv")
    pd.DataFrame({'groups': predictions_group, 'truth': labels_test_groups, 'text': texts_test_groups}).to_csv(f"{experiment_path}/{experiment_name}_pred_groups.csv")
    return predictions_code, predictions_group, scores

def train_base_clfs(classifiers, vecs_train_codes, vecs_test_codes, vecs_test_group,
                    labels_train_codes, labels_test_codes,
                    labels_test_groups_codes, labels_test_groups_groups, labels_train_groups,
                    sample_weights_codes, sample_weights_groups,
                    texts_test_codes, texts_test_groups, output_path):
    results = ''
    for experiment_data in classifiers:
        for balanced_method in ['weight']:
            exp_name = experiment_data['name']
            cls_model = experiment_data['model']
            _, _, scores = make_experiment_classifier(vecs_train_codes, vecs_test_codes, vecs_test_group,
                                                      labels_train_codes, labels_test_codes,
                                                      labels_test_groups_groups, labels_train_groups,
                                                      sample_weights_codes, sample_weights_groups,
                                                      texts_test_codes, texts_test_groups,
                                                      exp_name, cls_model, cls_model, output_path, balance=None)
            res = f"\n\n{exp_name} balanced by: {balanced_method} scores: {scores}"
            results += res
            log(res)
    return results
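
A hedged sketch of how train_codes_for_groups might be driven; the embedding files, CSV paths, and column names are assumptions, presuming the train/test embeddings were precomputed with BertEmbedder and saved alongside the CSVs produced by helpers/data_processor.py:

import numpy as np
import pandas as pd
from helpers.trainer_classifiers import train_codes_for_groups

# Hypothetical precomputed inputs: BERT embeddings plus their code/group labels.
vecs_train = np.load("output/vecs_train.npy")
vecs_test = np.load("output/vecs_test_groups.npy")
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test_groups.csv")

scores_df = train_codes_for_groups(
    vecs_train_codes=vecs_train,
    vecs_test_groups=vecs_test,
    labels_train_codes=train_df["code"].values,
    labels_test_groups_codes=test_df["code"].values,
    labels_test_groups_groups=test_df["group"].values,
    labels_train_groups=train_df["group"].values,
    output_path="output/group_classifiers",
    logging=True,
    use_gpu=False,
)
print(scores_df.tail())  # the last row holds the MEAN scores across groups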
helpers/trainer_embedder.py ADDED
@@ -0,0 +1,58 @@
import numpy as np
import torch
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
import pickle as pkl
from datetime import datetime


class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='macro', zero_division=0)
    precision = precision_score(y_true=labels, y_pred=pred, average='macro', zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)

    return {"eval_accuracy": accuracy, "eval_precision": precision, "eval_recall": recall, "eval_f1": f1}

def train(model, train_dataset, val_dataset, output_dir, save_steps, num_train_epochs=10):
    args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        evaluation_strategy="steps",
        eval_steps=save_steps,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_train_epochs,
        seed=0,
        save_steps=save_steps,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model='eval_f1'
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    res = trainer.train()
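
A hedged end-to-end sketch of fine-tuning an embedder with these helpers; the base checkpoint ('bert-base-uncased'), CSV path, and column names are assumptions, not values defined in this commit:

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForSequenceClassification
from helpers.trainer_embedder import Dataset, train

# Hypothetical training CSV with 'text' and 'code' columns.
df = pd.read_csv("data/train.csv")
classes = sorted(df["code"].unique().tolist())
labels = [classes.index(c) for c in df["code"]]

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(classes))

texts_train, texts_val, y_train, y_val = train_test_split(
    df["text"].tolist(), labels, test_size=0.1, random_state=42, stratify=labels
)

# Tokenize once up front; Dataset wraps the encodings for the Trainer.
enc_train = tokenizer(texts_train, truncation=True, padding=True, max_length=512)
enc_val = tokenizer(texts_val, truncation=True, padding=True, max_length=512)

train(
    model=model,
    train_dataset=Dataset(enc_train, y_train),
    val_dataset=Dataset(enc_val, y_val),
    output_dir="output/bert_embedder",
    save_steps=500,
)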