import numpy as np
import pandas as pd
from random import choices


def log(*args):
    """Print *args* immediately (flush=True) so progress shows up in pipeline logs."""
    print(*args, flush=True)


def create_group(code):
    """
    Creating group column, transforming an input string.

    The ICD-10 group is the part of the code before the first dot,
    e.g. "A01.5" -> "A01"; a code without a dot is returned unchanged.

    Parameters:
        code (str): string with ICD-10 code name

    Returns:
        group (str): string with ICD-10 group name
    """
    group = code.split('.')[0]
    return group


def df_creation(texts, labels, all_classes,
                prompt_column_name, code_column_name, group_column_name):
    """
    Creates a DataFrame from medical reports, their corresponding ICD-10 codes,
    and class information.

    Parameters:
        texts (List[str]): A list of strings, where each string is a medical report.
        labels (List[int]): Integer class indices; each entry is an index into
            'all_classes' identifying the ICD-10 code of the corresponding text
            in 'texts' (NOT the code string itself).
        all_classes (List[str]): A list of all ICD-10 code names from the initial dataset.
        prompt_column_name (str): The column name in the DataFrame for the prompts.
        code_column_name (str): The column name in the DataFrame for the codes.
        group_column_name (str): The column name in the DataFrame for the groups.

    Returns:
        pandas.DataFrame: A DataFrame where each row contains the text of the report,
        its corresponding ICD-10 code, and the group category derived from the code.
    """
    df = pd.DataFrame()
    df[prompt_column_name] = texts
    codes = [all_classes[label] for label in labels]
    df[code_column_name] = codes
    # Derive groups through create_group so the group definition lives in one place.
    df[group_column_name] = [create_group(code) for code in codes]
    return df


def select_random_rows(df_test, balance_column, random_n):
    """
    Selects a random, balanced subset of rows from a DataFrame based on a specified column.

    For every unique value of 'balance_column', 'random_n' rows are drawn WITH
    replacement (random.choices), so the result has exactly
    n_unique_values * random_n rows and may contain duplicate rows when a class
    has fewer than 'random_n' members. Useful for building a balanced sample
    for testing or validation purposes.

    Parameters:
        df_test (pandas.DataFrame): The DataFrame to select rows from.
        balance_column (str): The name of the column used to balance the data. The function
            will select rows such that each unique value in this column is equally represented.
        random_n (int): The number of rows to select for each unique value in the balance column.

    Returns:
        pandas.DataFrame: A new DataFrame containing a balanced, random subset of rows.
    """
    balanced_rows = []
    for class_name in df_test[balance_column].unique():
        class_records = df_test[df_test[balance_column] == class_name].to_dict('records')
        balanced_rows += choices(class_records, k=random_n)
    return pd.DataFrame(balanced_rows)


def extract_valuable_data(path_to_raw_csv, prompt_column_name, code_column_name,
                          path_to_processed_csv, min_text_len, min_samples_per_cls):
    """
    Extracts and processes valuable data from a raw CSV file based on specified criteria.

    This function loads data from a CSV file, filters out rows based on non-null
    values in specified columns, removes codes with a low number of associated
    prompts, filters for prompt length, creates a new 'group' column, and saves
    the processed data to a new CSV file.

    Parameters:
        path_to_raw_csv (str): The file path to the raw CSV data file.
        prompt_column_name (str): The column name in the CSV file for prompts.
        code_column_name (str): The column name in the CSV file for codes.
        path_to_processed_csv (str): The file path where the processed CSV data will be saved.
        min_text_len (int): Minimum prompt length (in characters); shorter prompts are dropped.
        min_samples_per_cls (int): A code is kept only if it has strictly more than
            this many prompts.

    Returns:
        pandas.DataFrame: A DataFrame containing the processed dataset.
    """
    df = pd.read_csv(path_to_raw_csv)
    log(path_to_raw_csv, prompt_column_name, code_column_name,
        path_to_processed_csv, min_text_len, min_samples_per_cls)

    # Drop rows with a missing prompt or a missing code.
    df = df[df[prompt_column_name].notna() & df[code_column_name].notna()]
    log(f"New data is loaded. New data has {len(df)} reports.")
    log(f"New data contains {len(df[code_column_name].unique())} unique codes.")

    # Leave data for codes where more than min_samples_per_cls prompts.
    # (Fixed: previously the column name 'code' was hard-coded here, silently
    # ignoring the code_column_name parameter.)
    unique_values = df[code_column_name].value_counts()
    values_to_remove = unique_values[unique_values <= min_samples_per_cls].index
    df = df[~df[code_column_name].isin(values_to_remove)]

    # Leave prompts that are longer than min_text_len characters.
    df = df[df[prompt_column_name].str.len() >= min_text_len]

    # Creating GROUP column in dataset (the output column name is fixed to 'group').
    df['group'] = df[code_column_name].apply(create_group)

    log(f"New data is processed. Processed data has {len(df)} reports.")
    log(f"Processed dataset contains {len(df[code_column_name].unique())} codes.")
    log(f"Processed dataset contains {len(df['group'].unique())} groups.")

    # Saving processed dataset
    df.to_csv(path_to_processed_csv, index=False)
    log(f"Processed dataset is saved to {path_to_processed_csv}.")
    return df


def balance_data(df, prompt_column_name, code_column_name, group_column_name,
                 random_n, test_size, path_to_train_csv,
                 path_to_csv_test_codes, path_to_csv_test_groups):
    """
    Balances and splits a dataset into training and test sets, then saves these sets to CSV files.

    This function takes a DataFrame and performs stratified splitting based on the
    specified 'code_column_name' to create balanced training and test datasets.
    It then saves the training dataset and two versions of the test dataset
    (one for codes and one for groups) to separate CSV files.

    Parameters:
        df (pandas.DataFrame): The DataFrame to be processed and split.
        prompt_column_name (str): The column name in the DataFrame for the prompts.
        code_column_name (str): The column name in the DataFrame for the codes.
        group_column_name (str): The column name in the DataFrame for the groups.
        random_n (int): The number of rows to be randomly selected in test datasets
            for each unique code or group. NOTE: currently unused because
            per-class subsampling is disabled (see below).
        test_size (float): The proportion of the dataset to include in the test split.
        path_to_train_csv (str): The file path where the training dataset CSV will be saved.
        path_to_csv_test_codes (str): The file path where the test dataset for codes CSV will be saved.
        path_to_csv_test_groups (str): The file path where the test dataset for groups CSV will be saved.

    Returns:
        None
    """
    # Imported here (not at module top) so the lighter preprocessing helpers
    # above can be used without scikit-learn installed.
    from sklearn.model_selection import train_test_split

    texts = np.array(df[prompt_column_name])
    labels = np.array(df[code_column_name])

    # Encode each code string as its index in the sorted list of unique codes;
    # df_creation decodes the indices back through all_classes.
    all_classes = np.unique(labels).tolist()
    labels = [all_classes.index(l) for l in labels]

    log('=' * 50)
    log(f"texts={len(texts)} labels={len(labels)} uniq_labels={len(np.unique(labels))} test_size={test_size}")
    log('=' * 50)

    # Stratify on the encoded labels so every code keeps its proportion in both splits.
    texts_train, texts_test, labels_train, labels_test = train_test_split(
        texts, labels, test_size=test_size, random_state=42, stratify=labels
    )
    log(f"Train dataset len={len(texts_train)}")
    log(f"Test dataset len={len(texts_test)}")
    log(f"Count of classes={len(np.unique(labels))}")

    # Creating TRAIN and TEST dataset
    df_train = df_creation(texts_train, labels_train, all_classes,
                           prompt_column_name, code_column_name, group_column_name)
    df_train.to_csv(path_to_train_csv, index=False)
    log(f"TRAIN dataset is saved to {path_to_train_csv}")

    # Creating test datasets for codes and groups.
    # NOTE(review): per-class subsampling via
    # select_random_rows(df_test, <column>, random_n) is currently disabled,
    # so both test files contain the full test split.
    df_test = df_creation(texts_test, labels_test, all_classes,
                          prompt_column_name, code_column_name, group_column_name)

    df_test_codes = df_test
    df_test_codes.to_csv(path_to_csv_test_codes, index=False)
    log(f"TEST dataset for codes is saved to {path_to_csv_test_codes}")

    df_test_groups = df_test
    df_test_groups.to_csv(path_to_csv_test_groups, index=False)
    log(f"TEST dataset for groups is saved to {path_to_csv_test_groups}")