# opdx/helpers/data_processor.py
# Repository page metadata: lyangas, "missed files", 6931ba0, raw, history blame, 8.28 kB
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from random import choices
def log(*args):
    """
    Print the given values to standard output and flush the stream right away.
    Parameters:
    *args: Values forwarded unchanged to the built-in print function.
    Returns:
    None
    """
    # flush=True makes the message appear immediately instead of waiting in the buffer.
    print(*args, flush=True)
def create_group(code):
    """
    Derive the group name from an ICD-10 code string.
    The group is the part of the code that precedes the first '.' character;
    a code without a '.' is returned unchanged.
    Parameters:
    code (str): string with ICD-10 code name
    Returns:
    group(str): string with ICD-10 group name
    """
    prefix, _, _ = code.partition('.')
    return prefix
def df_creation(texts, labels,
                all_classes, prompt_column_name,
                code_column_name, group_column_name):
    """
    Build a three-column DataFrame of report texts, their codes and their groups.
    Each entry of 'labels' is used as an index into 'all_classes' to recover the
    code name; the group is the part of that code name before the first '.'.
    Parameters:
    texts (sequence of str): Report texts, one per row.
    labels (sequence of int): For each text, the position of its code in 'all_classes'.
    all_classes (List[str]): Code names that the entries of 'labels' index into.
    prompt_column_name (str): The column name in the DataFrame for the prompts.
    code_column_name (str): The column name in the DataFrame for the codes.
    group_column_name (str): The column name in the DataFrame for the groups.
    Returns:
    pandas.DataFrame: One row per text with the prompt, code and group columns,
    in that order.
    """
    frame = pd.DataFrame()
    frame[prompt_column_name] = texts
    # Decode the integer labels once and reuse the result for both derived columns.
    decoded_codes = [all_classes[label] for label in labels]
    frame[code_column_name] = decoded_codes
    frame[group_column_name] = [code.split('.')[0] for code in decoded_codes]
    return frame
def select_random_rows(df_test, balance_column, random_n):
    """
    Draws a random, balanced sample of rows from a DataFrame based on a specified column.
    For every distinct non-missing value in the balance column, 'random_n' rows are drawn
    from the rows carrying that value. Rows are drawn WITH replacement (random.choices),
    so the same row can appear more than once, and a value with fewer than 'random_n'
    rows is oversampled rather than rejected.
    Rows whose balance value is missing (NaN/None) are skipped, because they cannot be
    matched back to any class.
    Parameters:
    df_test (pandas.DataFrame): The DataFrame to select rows from.
    balance_column (str): The name of the column used to balance the data. Each distinct
    non-missing value in this column contributes exactly 'random_n' rows.
    random_n (int): The number of rows to draw for each distinct value in the balance column.
    Returns:
    pandas.DataFrame: A new DataFrame with 'random_n' rows per class, grouped by class in
    order of first appearance. If nothing is selected, an empty DataFrame with the
    same columns as 'df_test' is returned.
    """
    balance_values = df_test[balance_column]
    balanced_rows = []
    # dropna(): a NaN key never compares equal to anything, so its row set would be
    # empty and random.choices would raise IndexError on it.
    for class_name in balance_values.dropna().unique():
        class_records = df_test[balance_values == class_name].to_dict('records')
        balanced_rows.extend(choices(class_records, k=random_n))
    if not balanced_rows:
        # Keep the column schema even when there is nothing to sample.
        return df_test.iloc[0:0].copy()
    return pd.DataFrame(balanced_rows)
def extract_valuable_data(path_to_raw_csv, prompt_column_name,
                          code_column_name, path_to_processed_csv, min_text_len, min_samples_per_cls):
    """
    Extracts and processes valuable data from a raw CSV file based on specified criteria.
    Processing steps, in order:
    1. Load the CSV and drop rows where the prompt or the code is missing.
    2. Drop every code that has 'min_samples_per_cls' rows or fewer.
    3. Drop rows whose prompt is shorter than 'min_text_len' characters.
    4. Add a 'group' column derived from the code (see create_group).
    5. Save the result to 'path_to_processed_csv' without the index.
    Parameters:
    path_to_raw_csv (str): The file path to the raw CSV data file.
    prompt_column_name (str): The column name in the CSV file for prompts.
    code_column_name (str): The column name in the CSV file for codes.
    path_to_processed_csv (str): The file path where the processed CSV data will be saved.
    min_text_len (int): Minimum prompt length in characters; shorter prompts are dropped.
    min_samples_per_cls (int): A code is kept only if it has strictly more rows than this.
    Returns:
    pandas.DataFrame: A DataFrame containing the processed dataset, including the
    added 'group' column.
    """
    df = pd.read_csv(path_to_raw_csv)
    log(path_to_raw_csv, prompt_column_name, code_column_name, path_to_processed_csv, min_text_len, min_samples_per_cls)
    df = df[df[prompt_column_name].notna() & df[code_column_name].notna()]
    log(f"New data is loaded. New data has {len(df)} reports.")
    log(f"New data contains {len(df[code_column_name].unique())} unique codes.")

    # Keep only codes that have more than min_samples_per_cls prompts.
    samples_per_code = df[code_column_name].value_counts()
    rare_codes = samples_per_code[samples_per_code <= min_samples_per_cls].index
    df = df[~df[code_column_name].isin(rare_codes)]

    # Keep only prompts that are at least min_text_len characters long.
    # NOTE(review): this runs after the per-code count filter, so a code can end up
    # with fewer than min_samples_per_cls rows in the output -- confirm this is intended.
    # .copy() makes the result an independent frame, so adding a column below does not
    # write into a slice of the previous frame.
    df = df[df[prompt_column_name].str.len() >= min_text_len].copy()

    # Creating GROUP column in dataset (the column name 'group' is fixed here).
    df['group'] = df[code_column_name].apply(create_group)
    log(f"New data is processed. Processed data has {len(df)} reports.")
    log(f"Processed dataset contains {len(df[code_column_name].unique())} codes.")
    log(f"Processed dataset contains {len(df['group'].unique())} groups.")

    # Saving processed dataset
    df.to_csv(path_to_processed_csv, index=False)
    log(f"Processed dataset is saved to {path_to_processed_csv}.")
    return df
def balance_data(df, prompt_column_name, code_column_name,
                 group_column_name,random_n, test_size, path_to_train_csv,
                 path_to_csv_test_codes, path_to_csv_test_groups):
    """
    Splits a dataset into stratified training and test sets, then saves these sets to CSV files.
    The split is stratified on the values of 'code_column_name' with a fixed random_state of 42.
    The training set is written to 'path_to_train_csv'. The full test set is written, unchanged,
    to both 'path_to_csv_test_codes' and 'path_to_csv_test_groups'; per-code / per-group
    balanced sampling of the test set is currently disabled.
    The group column of the outputs is rebuilt from the code (the part before the first '.'),
    not copied from 'df'.
    Parameters:
    df (pandas.DataFrame): The DataFrame to be processed and split.
    prompt_column_name (str): The column name in the DataFrame for the prompts.
    code_column_name (str): The column name in the DataFrame for the codes.
    group_column_name (str): The column name used for the groups in the saved CSV files.
    random_n (int): Currently unused; kept so existing callers keep working. Intended as the
    number of rows per code or group in balanced test datasets.
    test_size (float): The proportion of the dataset to include in the test split.
    path_to_train_csv (str): The file path where the training dataset CSV will be saved.
    path_to_csv_test_codes (str): The file path where the test dataset for codes CSV will be saved.
    path_to_csv_test_groups (str): The file path where the test dataset for groups CSV will be saved.
    Returns:
    None
    """
    texts = np.array(df[prompt_column_name])
    code_values = np.array(df[code_column_name])

    # Encode each code as its position in the sorted list of distinct codes.
    # A dict lookup replaces list.index, which rescanned the class list for every row.
    all_classes = np.unique(code_values).tolist()
    class_to_index = {class_name: position for position, class_name in enumerate(all_classes)}
    labels = [class_to_index[code] for code in code_values]
    # Every class occurs at least once in labels, so this equals len(np.unique(labels)).
    n_classes = len(all_classes)

    log('='*50)
    log(f"texts={len(texts)} labels={len(labels)} uniq_labels={n_classes} test_size={test_size}")
    log('='*50)

    texts_train, texts_test, labels_train, labels_test = train_test_split(
        texts, labels, test_size=test_size, random_state=42, stratify=labels
    )
    log(f"Train dataset len={len(texts_train)}")
    log(f"Test dataset len={len(texts_test)}")
    log(f"Count of classes={n_classes}")

    # Creating TRAIN and TEST dataset
    df_train = df_creation(texts_train, labels_train, all_classes,
                           prompt_column_name, code_column_name, group_column_name)
    df_train.to_csv(path_to_train_csv, index=False)
    log(f"TRAIN dataset is saved to {path_to_train_csv}")

    # Creating test datasets for codes and groups.
    # Both files receive the complete test split; balanced sampling is not applied.
    df_test = df_creation(texts_test, labels_test, all_classes,
                          prompt_column_name, code_column_name, group_column_name)
    df_test.to_csv(path_to_csv_test_codes, index=False)
    log(f"TEST dataset for codes is saved to {path_to_csv_test_codes}")
    df_test.to_csv(path_to_csv_test_groups, index=False)
    log(f"TEST dataset for groups is saved to {path_to_csv_test_groups}")