# Spaces:
# Runtime error
# Runtime error
from random import choices

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
def log(*args):
    """
    Print the given values on one line and flush stdout immediately.

    Parameters:
        *args: Values forwarded unchanged to `print`.
    """
    # flush=True pushes the message out right away instead of leaving it
    # in the stdout buffer.
    print(*args, flush=True)
def create_group(code):
    """
    Derive the ICD-10 group name from a full ICD-10 code.

    The group is everything before the first '.' in the code; a code
    without a '.' is returned unchanged.

    Parameters:
        code (str): string with ICD-10 code name
    Returns:
        group (str): string with ICD-10 group name
    """
    group, _, _ = code.partition('.')
    return group
def df_creation(texts, labels,
                all_classes, prompt_column_name,
                code_column_name, group_column_name):
    """
    Creates a DataFrame from medical reports, their class indices, and the list of class names.

    Parameters:
        texts (Sequence[str]): Medical report texts, one per row.
        labels (Sequence[int]): For each text, the index into 'all_classes' of its
                                ICD-10 code.
        all_classes (List[str]): All ICD-10 code names; 'labels' index into this list.
        prompt_column_name (str): The column name in the DataFrame for the prompts.
        code_column_name (str): The column name in the DataFrame for the codes.
        group_column_name (str): The column name in the DataFrame for the groups.
    Returns:
        pandas.DataFrame: A DataFrame where each row contains the text of the report,
                          its corresponding ICD-10 code, and the group derived from the
                          code (the part of the code before the first '.').
    """
    # Resolve every label to its code name once, then derive groups from the codes.
    codes = [all_classes[label] for label in labels]
    groups = [code.partition('.')[0] for code in codes]
    return pd.DataFrame({
        prompt_column_name: texts,
        code_column_name: codes,
        group_column_name: groups,
    })
def select_random_rows(df_test, balance_column, random_n):
    """
    Draws a random, balanced sample of rows from a DataFrame based on a specified column.

    For every distinct value in the balance column, 'random_n' rows are drawn with
    `random.choices`, i.e. WITH replacement: the same row can appear more than once,
    and a class with fewer than 'random_n' rows still contributes 'random_n' rows.
    Rows whose balance value is missing (NaN) are skipped.

    Parameters:
        df_test (pandas.DataFrame): The DataFrame to select rows from.
        balance_column (str): The name of the column used to balance the data. Each
                              distinct value in this column is equally represented.
        random_n (int): The number of rows to draw for each distinct value.
    Returns:
        pandas.DataFrame: A new DataFrame with 'random_n' rows per class, classes in
                          order of first appearance, with the same columns as 'df_test'
                          (also when the result is empty).
    """
    sampled_rows = []
    # One pass over the frame: sort=False keeps classes in order of first appearance,
    # observed=True avoids empty groups for unused categorical levels.
    for _, class_rows in df_test.groupby(balance_column, sort=False, observed=True):
        sampled_rows.extend(choices(class_rows.to_dict('records'), k=random_n))
    return pd.DataFrame(sampled_rows, columns=df_test.columns)
def extract_valuable_data(path_to_raw_csv, prompt_column_name,
                          code_column_name, path_to_processed_csv, min_text_len,
                          min_samples_per_cls, group_column_name='group'):
    """
    Extracts and processes valuable data from a raw CSV file based on specified criteria.

    This function loads data from a CSV file, drops rows with a missing prompt or code,
    removes codes with a low number of associated prompts, filters for prompt length,
    creates a group column, and saves the processed data to a new CSV file.

    Parameters:
        path_to_raw_csv (str): The file path to the raw CSV data file.
        prompt_column_name (str): The column name in the CSV file for prompts.
        code_column_name (str): The column name in the CSV file for codes.
        path_to_processed_csv (str): The file path where the processed CSV data will be saved.
        min_text_len (int): Prompts shorter than this many characters are dropped.
        min_samples_per_cls (int): Codes with this many rows or fewer are dropped, so
                                   only codes with more rows than this survive.
        group_column_name (str): Name of the group column to create. Defaults to 'group'.
    Returns:
        pandas.DataFrame: A DataFrame containing the processed dataset.
    """
    df = pd.read_csv(path_to_raw_csv)
    log(path_to_raw_csv, prompt_column_name, code_column_name, path_to_processed_csv, min_text_len, min_samples_per_cls)

    df = df[df[prompt_column_name].notna() & df[code_column_name].notna()]
    log(f"New data is loaded. New data has {len(df)} reports.")
    log(f"New data contains {df[code_column_name].nunique()} unique codes.")

    # Keep only codes that have more than min_samples_per_cls prompts.
    samples_per_code = df[code_column_name].value_counts()
    rare_codes = samples_per_code[samples_per_code <= min_samples_per_cls].index
    df = df[~df[code_column_name].isin(rare_codes)]

    # Keep prompts that are at least min_text_len characters long.
    # NOTE(review): this runs after the per-code count filter, so a code can end up
    # with min_samples_per_cls rows or fewer — confirm this order is intended.
    # .copy() makes the result an independent frame, so adding a column below does
    # not write into a slice of the previous frame.
    df = df[df[prompt_column_name].str.len() >= min_text_len].copy()

    # Creating GROUP column in dataset
    df[group_column_name] = df[code_column_name].apply(create_group)

    log(f"New data is processed. Processed data has {len(df)} reports.")
    log(f"Processed dataset contains {df[code_column_name].nunique()} codes.")
    log(f"Processed dataset contains {df[group_column_name].nunique()} groups.")

    # Saving processed dataset
    df.to_csv(path_to_processed_csv, index=False)
    log(f"Processed dataset is saved to {path_to_processed_csv}.")
    return df
def balance_data(df, prompt_column_name, code_column_name,
                 group_column_name, random_n, test_size, path_to_train_csv,
                 path_to_csv_test_codes, path_to_csv_test_groups):
    """
    Splits a dataset into stratified training and test sets, then saves these sets to CSV files.

    The split is stratified on the values of 'code_column_name' with a fixed random
    state (42). The training set is written to one CSV file and the test set is written
    to two CSV files (one for codes and one for groups). Per-class random sampling of
    the test set is currently disabled, so both test files receive the same full test
    split and 'random_n' is not used.

    Parameters:
        df (pandas.DataFrame): The DataFrame to be processed and split.
        prompt_column_name (str): The column name in the DataFrame for the prompts.
        code_column_name (str): The column name in the DataFrame for the codes.
        group_column_name (str): The name of the group column in the output files; its
                                 values are re-derived from the codes by `df_creation`.
        random_n (int): Intended number of rows per unique code or group in the test
                        datasets. Currently unused.
        test_size (float): The proportion of the dataset to include in the test split.
        path_to_train_csv (str): The file path where the training dataset CSV will be saved.
        path_to_csv_test_codes (str): The file path where the test dataset for codes CSV will be saved.
        path_to_csv_test_groups (str): The file path where the test dataset for groups CSV will be saved.
    Returns:
        None
    """
    texts = np.array(df[prompt_column_name])
    codes = np.array(df[code_column_name])

    # Encode every code as its position in the sorted list of unique codes.
    # A dict lookup replaces a linear list.index() scan per row.
    all_classes = np.unique(codes).tolist()
    class_index = {class_name: idx for idx, class_name in enumerate(all_classes)}
    labels = [class_index[code] for code in codes]

    log('='*50)
    log(f"texts={len(texts)} labels={len(labels)} uniq_labels={len(all_classes)} test_size={test_size}")
    log('='*50)

    texts_train, texts_test, labels_train, labels_test = train_test_split(
        texts, labels, test_size=test_size, random_state=42, stratify=labels
    )
    log(f"Train dataset len={len(texts_train)}")
    log(f"Test dataset len={len(texts_test)}")
    log(f"Count of classes={len(all_classes)}")

    # Creating TRAIN and TEST dataset
    df_train = df_creation(texts_train, labels_train, all_classes,
                           prompt_column_name, code_column_name, group_column_name)
    df_train.to_csv(path_to_train_csv, index=False)
    log(f"TRAIN dataset is saved to {path_to_train_csv}")

    # Creating test datasets for codes and groups
    df_test = df_creation(texts_test, labels_test, all_classes,
                          prompt_column_name, code_column_name, group_column_name)

    # Per-code balancing via select_random_rows(df_test, code_column_name, random_n)
    # is disabled; the full test split is written instead.
    df_test_codes = df_test
    df_test_codes.to_csv(path_to_csv_test_codes, index=False)
    log(f"TEST dataset for codes is saved to {path_to_csv_test_codes}")

    # Per-group balancing via select_random_rows(df_test, group_column_name, random_n)
    # is disabled as well; the same full test split is written.
    df_test_groups = df_test
    df_test_groups.to_csv(path_to_csv_test_groups, index=False)
    log(f"TEST dataset for groups is saved to {path_to_csv_test_groups}")