# File size: 8,282 Bytes | 6931ba0
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from random import choices
def log(*args):
    """
    Print the given values and flush the output stream right away.

    Parameters:
        *args: Values forwarded unchanged to the built-in ``print``.

    Returns:
        None
    """
    # flush=True makes each message visible immediately instead of
    # waiting for the output buffer to fill.
    print(*args, flush=True)
def create_group(code):
    """
    Derive the ICD-10 group name from an ICD-10 code name.

    The group is the part of the code before the first '.'; a code
    without a '.' is returned unchanged.

    Parameters:
        code (str): string with ICD-10 code name

    Returns:
        group (str): string with ICD-10 group name
    """
    prefix, _separator, _rest = code.partition('.')
    return prefix
def df_creation(texts, labels,
                all_classes, prompt_column_name,
                code_column_name, group_column_name):
    """
    Build a DataFrame of reports, their ICD-10 codes and the derived groups.

    Parameters:
        texts (Sequence[str]): Medical report texts, one per row.
        labels (Iterable[int]): For each text, the integer index of its
            ICD-10 code inside 'all_classes' (NOT the code string itself).
        all_classes (List[str]): All ICD-10 code names; 'labels' index into
            this list.
        prompt_column_name (str): The column name in the DataFrame for the prompts.
        code_column_name (str): The column name in the DataFrame for the codes.
        group_column_name (str): The column name in the DataFrame for the groups.

    Returns:
        pandas.DataFrame: One row per report with the report text, its
            ICD-10 code name, and the group name (the part of the code
            before the first '.').
    """
    # Resolve every label index to its code name once; the group column
    # is derived from these resolved names.
    codes = [all_classes[index] for index in labels]
    groups = [code.split('.')[0] for code in codes]
    return pd.DataFrame({
        prompt_column_name: texts,
        code_column_name: codes,
        group_column_name: groups,
    })
def select_random_rows(df_test, balance_column, random_n):
    """
    Draw a balanced random sample of rows, 'random_n' per class.

    For every distinct non-missing value in 'balance_column', exactly
    'random_n' rows are drawn at random from the rows holding that value.
    Rows are drawn WITH replacement (random.choices), so a row can appear
    more than once, and a class with fewer than 'random_n' rows still
    contributes 'random_n' rows.

    Parameters:
        df_test (pandas.DataFrame): The DataFrame to select rows from.
        balance_column (str): The name of the column used to balance the data.
            Rows whose value in this column is missing (NaN/None) are ignored.
        random_n (int): The number of rows to draw for each unique value in
            the balance column.

    Returns:
        pandas.DataFrame: A new DataFrame with the sampled rows and the same
            columns as 'df_test' (also when the result is empty).
    """
    # Missing values never compare equal to themselves, so they would give
    # an empty population and make choices() raise; skip them up front.
    classes = df_test[balance_column].dropna().unique()
    balanced_data = []
    for class_name in classes:
        class_records = df_test[df_test[balance_column] == class_name].to_dict('records')
        balanced_data.extend(choices(class_records, k=random_n))
    # Passing the columns keeps the schema stable even if nothing was drawn.
    return pd.DataFrame(balanced_data, columns=df_test.columns)
def extract_valuable_data(path_to_raw_csv, prompt_column_name,
                          code_column_name, path_to_processed_csv, min_text_len, min_samples_per_cls,
                          group_column_name='group'):
    """
    Load a raw CSV, filter it, add a group column and save the result.

    Steps, in order:
      1. Read 'path_to_raw_csv'.
      2. Drop rows with a missing prompt or a missing code.
      3. Drop every code that has 'min_samples_per_cls' rows or fewer.
      4. Drop rows whose prompt is shorter than 'min_text_len' characters.
      5. Add the group column (code prefix before the first '.').
      6. Write the result to 'path_to_processed_csv' without the index.

    Parameters:
        path_to_raw_csv (str): The file path to the raw CSV data file.
        prompt_column_name (str): The column name in the CSV file for prompts.
        code_column_name (str): The column name in the CSV file for codes.
        path_to_processed_csv (str): The file path where the processed CSV data will be saved.
        min_text_len (int): Minimum prompt length, in characters, for a row to be kept.
        min_samples_per_cls (int): A code is kept only if it has strictly more
            rows than this value.
        group_column_name (str): Name of the group column that is added.
            Defaults to 'group'.

    Returns:
        pandas.DataFrame: A DataFrame containing the processed dataset.
    """
    df = pd.read_csv(path_to_raw_csv)
    log(path_to_raw_csv, prompt_column_name, code_column_name, path_to_processed_csv, min_text_len, min_samples_per_cls)

    df = df[df[prompt_column_name].notna() & df[code_column_name].notna()]
    log(f"New data is loaded. New data has {len(df)} reports.")
    log(f"New data contains {len(df[code_column_name].unique())} unique codes.")

    # Keep only codes with more than min_samples_per_cls prompts.
    code_counts = df[code_column_name].value_counts()
    rare_codes = code_counts[code_counts <= min_samples_per_cls].index
    df = df[~df[code_column_name].isin(rare_codes)]

    # Keep prompts that are at least min_text_len characters long.
    # NOTE(review): this runs after the per-code count filter, so a code can
    # end up with fewer than min_samples_per_cls rows in the final data.
    # Confirm whether the order is intentional.
    df = df[df[prompt_column_name].str.len() >= min_text_len]

    # Work on an explicit copy so adding a column does not write into a
    # filtered view of the loaded frame (avoids SettingWithCopyWarning).
    df = df.copy()
    df[group_column_name] = df[code_column_name].apply(create_group)

    log(f"New data is processed. Processed data has {len(df)} reports.")
    log(f"Processed dataset contains {len(df[code_column_name].unique())} codes.")
    log(f"Processed dataset contains {len(df[group_column_name].unique())} groups.")

    # Saving processed dataset
    df.to_csv(path_to_processed_csv, index=False)
    log(f"Processed dataset is saved to {path_to_processed_csv}.")
    return df
def balance_data(df, prompt_column_name, code_column_name,
                 group_column_name, random_n, test_size, path_to_train_csv,
                 path_to_csv_test_codes, path_to_csv_test_groups):
    """
    Split a dataset into stratified train and test sets and save them as CSV.

    The split is stratified on 'code_column_name' with a fixed random seed
    (random_state=42). The train set is written to 'path_to_train_csv'; the
    test set is written unchanged to both 'path_to_csv_test_codes' and
    'path_to_csv_test_groups'.

    Parameters:
        df (pandas.DataFrame): The DataFrame to be processed and split.
        prompt_column_name (str): The column name in the DataFrame for the prompts.
        code_column_name (str): The column name in the DataFrame for the codes.
        group_column_name (str): The column name used for the groups in the
            written files. The group values are re-derived from the codes by
            'df_creation'; the existing values in 'df' are not read.
        random_n (int): Currently unused. It was the per-class sample size for
            'select_random_rows', whose calls are disabled in this function.
        test_size (float): The proportion of the dataset to include in the test split.
        path_to_train_csv (str): The file path where the training dataset CSV will be saved.
        path_to_csv_test_codes (str): The file path where the test dataset for codes CSV will be saved.
        path_to_csv_test_groups (str): The file path where the test dataset for groups CSV will be saved.

    Returns:
        None
    """
    texts = np.array(df[prompt_column_name])
    codes = np.array(df[code_column_name])

    # Encode every code as its position in the sorted list of unique codes.
    # A dict lookup replaces a per-row linear list.index() scan.
    all_classes = np.unique(codes).tolist()
    class_to_index = {cls: idx for idx, cls in enumerate(all_classes)}
    labels = [class_to_index[code] for code in codes]

    log('='*50)
    log(f"texts={len(texts)} labels={len(labels)} uniq_labels={len(np.unique(labels))} test_size={test_size}")
    log('='*50)

    texts_train, texts_test, labels_train, labels_test = train_test_split(
        texts, labels, test_size=test_size, random_state=42, stratify=labels
    )
    log(f"Train dataset len={len(texts_train)}")
    log(f"Test dataset len={len(texts_test)}")
    log(f"Count of classes={len(np.unique(labels))}")

    # Creating TRAIN and TEST dataset
    df_train = df_creation(texts_train, labels_train, all_classes,
                           prompt_column_name, code_column_name, group_column_name)
    df_train.to_csv(path_to_train_csv, index=False)
    log(f"TRAIN dataset is saved to {path_to_train_csv}")

    # Creating test datasets for codes and groups
    df_test = df_creation(texts_test, labels_test, all_classes,
                          prompt_column_name, code_column_name, group_column_name)

    # NOTE: per-class subsampling with select_random_rows(df_test, <column>,
    # random_n) is disabled, so both files receive the full test split.
    df_test_codes = df_test
    df_test_codes.to_csv(path_to_csv_test_codes, index=False)
    log(f"TEST dataset for codes is saved to {path_to_csv_test_codes}")

    df_test_groups = df_test
    df_test_groups.to_csv(path_to_csv_test_groups, index=False)
    log(f"TEST dataset for groups is saved to {path_to_csv_test_groups}")
|