|
import numpy as np |
|
import pandas as pd |
|
from sklearn.model_selection import train_test_split |
|
from random import choices |
|
|
|
|
|
def log(*args):
    """
    Print the given values on one line and flush stdout right away.

    Parameters:
        *args: Values to print; they are forwarded to `print` unchanged.

    Returns:
        None
    """
    # flush=True pushes the message out immediately instead of leaving it
    # in the stdout buffer.
    print(*args, sep=" ", end="\n", flush=True)
|
|
|
def create_group(code):
    """
    Derive the ICD-10 group name from an ICD-10 code string.

    Parameters:
        code (str): string with ICD-10 code name

    Returns:
        str: the part of `code` that precedes its first '.', or the whole
             string when it contains no '.'
    """
    prefix, _, _ = code.partition('.')
    return prefix
|
|
|
def df_creation(texts, labels,
                all_classes, prompt_column_name,
                code_column_name, group_column_name):
    """
    Build a DataFrame of report texts with their ICD-10 codes and groups.

    Parameters:
        texts (sequence of str): The report texts, one per row.
        labels (sequence of int): For each text, the position of its code
                                  inside `all_classes`.
        all_classes (List[str]): The code names that `labels` index into.
        prompt_column_name (str): Name of the output column holding the texts.
        code_column_name (str): Name of the output column holding the codes.
        group_column_name (str): Name of the output column holding the groups.

    Returns:
        pandas.DataFrame: One row per text with three columns: the text, the
                          code looked up in `all_classes`, and the group
                          (the part of the code before its first '.').
    """
    # Resolve every label to its code once, then derive the group from it.
    codes = [all_classes[position] for position in labels]
    group_names = [code_name.split('.')[0] for code_name in codes]

    return pd.DataFrame({
        prompt_column_name: texts,
        code_column_name: codes,
        group_column_name: group_names,
    })
|
|
|
def select_random_rows(df_test, balance_column, random_n):
    """
    Draw the same number of random rows for every value of a column.

    For each distinct value found in `balance_column` (in order of first
    appearance), `random_n` rows having that value are drawn with
    `random.choices`, i.e. with replacement, so the same row can be picked
    more than once. The draws for all values are concatenated into a new
    DataFrame.

    Parameters:
        df_test (pandas.DataFrame): The DataFrame to select rows from.
        balance_column (str): The column whose distinct values are each
                              represented by the same number of rows.
        random_n (int): The number of rows to draw per distinct value.

    Returns:
        pandas.DataFrame: A new DataFrame with `random_n` rows per distinct
                          value of `balance_column`.
    """
    key_values = df_test[balance_column]

    picked_rows = []
    for value in key_values.unique():
        candidates = df_test.loc[key_values == value].to_dict('records')
        picked_rows.extend(choices(candidates, k=random_n))

    return pd.DataFrame(picked_rows)
|
|
|
def extract_valuable_data(path_to_raw_csv, prompt_column_name,
                          code_column_name, path_to_processed_csv,
                          min_text_len, min_samples_per_cls,
                          group_column_name='group'):
    """
    Load a raw CSV, filter it, add a group column and save the result.

    Processing steps, in order:
      1. Drop rows where the prompt or the code is missing.
      2. Drop every code whose row count is less than or equal to
         `min_samples_per_cls`.
      3. Drop rows whose prompt is shorter than `min_text_len` characters.
      4. Add the group column, derived from the code with `create_group`.
      5. Write the processed data to `path_to_processed_csv` (no index).

    Parameters:
        path_to_raw_csv (str): The file path to the raw CSV data file.
        prompt_column_name (str): The column name in the CSV file for prompts.
        code_column_name (str): The column name in the CSV file for codes.
        path_to_processed_csv (str): The file path where the processed CSV
                                     data will be saved.
        min_text_len (int): Minimum prompt length (in characters) for a row
                            to be kept.
        min_samples_per_cls (int): A code is kept only if it has strictly
                                   more rows than this value.
        group_column_name (str): Name of the group column that is added.
                                 Defaults to 'group'.

    Returns:
        pandas.DataFrame: A DataFrame containing the processed dataset.
    """
    df = pd.read_csv(path_to_raw_csv)
    log(path_to_raw_csv, prompt_column_name, code_column_name,
        path_to_processed_csv, min_text_len, min_samples_per_cls)

    has_prompt_and_code = df[prompt_column_name].notna() & df[code_column_name].notna()
    df = df[has_prompt_and_code]
    log(f"New data is loaded. New data has {len(df)} reports.")
    log(f"New data contains {df[code_column_name].nunique()} unique codes.")

    # Remove codes that do not have enough reports. Use the configured code
    # column rather than a hard-coded 'code' so any column name works.
    code_counts = df[code_column_name].value_counts()
    rare_codes = code_counts[code_counts <= min_samples_per_cls].index
    df = df[~df[code_column_name].isin(rare_codes)]

    # NOTE: this length filter runs after the frequency filter above, so a
    # surviving code may end up with min_samples_per_cls rows or fewer.
    # Take an explicit copy so the column assignment below writes to an
    # independent frame instead of a slice of the loaded data.
    df = df[df[prompt_column_name].str.len() >= min_text_len].copy()

    df[group_column_name] = df[code_column_name].apply(create_group)

    log(f"New data is processed. Processed data has {len(df)} reports.")
    log(f"Processed dataset contains {df[code_column_name].nunique()} codes.")
    log(f"Processed dataset contains {df[group_column_name].nunique()} groups.")

    df.to_csv(path_to_processed_csv, index=False)
    log(f"Processed dataset is saved to {path_to_processed_csv}.")
    return df
|
|
|
|
|
def balance_data(df, prompt_column_name, code_column_name,
                 group_column_name, random_n, test_size, path_to_train_csv,
                 path_to_csv_test_codes, path_to_csv_test_groups):
    """
    Split a dataset into stratified train/test sets and save them as CSV files.

    The rows are split with `train_test_split` (fixed `random_state=42`),
    stratified on the values of `code_column_name`. The training split is
    written to `path_to_train_csv`; the test split is written to both
    `path_to_csv_test_codes` and `path_to_csv_test_groups`. In the saved
    files the group column is rebuilt from the code by `df_creation`.

    NOTE(review): `random_n` is accepted but not used here, so both test
    files currently contain the same, complete test split. If a per-code /
    per-group balanced sample (e.g. via `select_random_rows`) was intended,
    confirm with the author before changing the output.

    Parameters:
        df (pandas.DataFrame): The DataFrame to be processed and split.
        prompt_column_name (str): The column name in the DataFrame for the prompts.
        code_column_name (str): The column name in the DataFrame for the codes.
        group_column_name (str): The column name used for the groups in the
                                 saved files.
        random_n (int): Currently unused (see the note above).
        test_size (float): The proportion of the dataset to include in the
                           test split.
        path_to_train_csv (str): The file path where the training dataset CSV
                                 will be saved.
        path_to_csv_test_codes (str): The file path where the test dataset for
                                      codes CSV will be saved.
        path_to_csv_test_groups (str): The file path where the test dataset for
                                       groups CSV will be saved.

    Returns:
        None
    """
    texts = np.array(df[prompt_column_name])
    codes = np.array(df[code_column_name])

    # Encode every code as its position in the sorted list of unique codes.
    # A dict lookup keeps this linear instead of calling list.index per row.
    all_classes = np.unique(codes).tolist()
    class_position = {cls: position for position, cls in enumerate(all_classes)}
    labels = [class_position[code] for code in codes]
    n_classes = len(all_classes)

    log('=' * 50)
    log(f"texts={len(texts)} labels={len(labels)} uniq_labels={n_classes} test_size={test_size}")
    log('=' * 50)

    texts_train, texts_test, labels_train, labels_test = train_test_split(
        texts, labels, test_size=test_size, random_state=42, stratify=labels
    )

    log(f"Train dataset len={len(texts_train)}")
    log(f"Test dataset len={len(texts_test)}")
    log(f"Count of classes={n_classes}")

    df_train = df_creation(texts_train, labels_train, all_classes,
                           prompt_column_name, code_column_name, group_column_name)
    df_train.to_csv(path_to_train_csv, index=False)
    log(f"TRAIN dataset is saved to {path_to_train_csv}")

    df_test = df_creation(texts_test, labels_test, all_classes,
                          prompt_column_name, code_column_name, group_column_name)

    # The same full test split is written to both test paths.
    df_test.to_csv(path_to_csv_test_codes, index=False)
    log(f"TEST dataset for codes is saved to {path_to_csv_test_codes}")

    df_test.to_csv(path_to_csv_test_groups, index=False)
    log(f"TEST dataset for groups is saved to {path_to_csv_test_groups}")
|
|