File size: 8,282 Bytes
6931ba0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from random import choices


def log(*args):
    """
    Print the given values to standard output and flush immediately.

    Parameters:
        *args: values to print; they are forwarded to print() unchanged,
               so they are separated by single spaces and followed by a newline.
    """
    # flush=True forces the stream to be flushed on every call.
    print(*args, flush=True)

def create_group(code):
    """
    Derive the ICD-10 group name from a full ICD-10 code name.

    The group is everything that precedes the first '.' in the code.
    A code without a '.' is returned unchanged.

    Parameters:
        code (str): string with ICD-10 code name
    Returns:
        str: string with ICD-10 group name
    """

    prefix, _separator, _suffix = code.partition('.')
    return prefix

def df_creation(texts, labels,
                    all_classes, prompt_column_name,
                    code_column_name, group_column_name):
    """
    Build a three-column DataFrame of report text, ICD-10 code and ICD-10 group.

    Parameters:
        texts (Sequence[str]): Medical report texts, one per row.
        labels (Sequence[int]): For each text, the position of its ICD-10 code
                                inside 'all_classes' (labels are indices, not code strings).
        all_classes (List[str]): ICD-10 code names; 'labels' index into this list.
        prompt_column_name (str): Name of the output column holding the texts.
        code_column_name (str): Name of the output column holding the codes.
        group_column_name (str): Name of the output column holding the groups.

    Returns:
        pandas.DataFrame: One row per text with the text, its decoded ICD-10 code,
                          and the group, i.e. the part of the code before the first '.'.
    """

    frame = pd.DataFrame()
    frame[prompt_column_name] = texts

    # Decode the integer labels back into code names once, then reuse them
    # to derive the groups.
    decoded_codes = [all_classes[class_idx] for class_idx in labels]
    frame[code_column_name] = decoded_codes
    frame[group_column_name] = [code_name.split('.')[0] for code_name in decoded_codes]
    return frame

def select_random_rows(df_test, balance_column, random_n):
    """
    Draw the same number of random rows for every distinct value of a column.

    For each unique value found in 'balance_column' (in order of first appearance),
    'random_n' rows carrying that value are drawn with random.choices, i.e. WITH
    replacement: the same row may be drawn more than once, and a value with fewer
    than 'random_n' rows still contributes exactly 'random_n' rows.

    Parameters:
        df_test (pandas.DataFrame): The DataFrame to select rows from.
        balance_column (str): The name of the column whose unique values must be
                              equally represented in the result.
        random_n (int): The number of rows to draw for each unique value.

    Returns:
        pandas.DataFrame: A new DataFrame built from the drawn rows, grouped by
                          value in order of first appearance, with a fresh index.
    """

    sampled_records = [
        record
        for value in df_test[balance_column].unique()
        for record in choices(
            df_test[df_test[balance_column] == value].to_dict('records'),
            k=random_n,
        )
    ]
    return pd.DataFrame(sampled_records)

def extract_valuable_data(path_to_raw_csv, prompt_column_name,
                            code_column_name, path_to_processed_csv, min_text_len, min_samples_per_cls,
                            group_column_name='group'):
    """
    Extracts and processes valuable data from a raw CSV file based on specified criteria.

    Steps, in order:
      1. load the raw CSV;
      2. drop rows where the prompt or the code is missing;
      3. drop every code that has 'min_samples_per_cls' or fewer rows;
      4. drop rows whose prompt is shorter than 'min_text_len' characters;
      5. add a group column derived from the code with create_group;
      6. save the result to 'path_to_processed_csv' (without the index).

    Note: the per-code count (step 3) is taken before the length filter (step 4),
    so a code may end up with 'min_samples_per_cls' or fewer rows in the output.

    Parameters:
        path_to_raw_csv (str): The file path to the raw CSV data file.
        prompt_column_name (str): The column name in the CSV file for prompts.
        code_column_name (str): The column name in the CSV file for codes.
        path_to_processed_csv (str): The file path where the processed CSV data will be saved.
        min_text_len (int): Minimum prompt length in characters; shorter prompts are dropped.
        min_samples_per_cls (int): Codes must have strictly more rows than this to be kept.
        group_column_name (str): Name of the group column that is added. Defaults to 'group'.

    Returns:
        pandas.DataFrame: A DataFrame containing the processed dataset.
    """

    df = pd.read_csv(path_to_raw_csv)
    log(path_to_raw_csv, prompt_column_name, code_column_name, path_to_processed_csv, min_text_len, min_samples_per_cls)

    has_prompt_and_code = df[prompt_column_name].notna() & df[code_column_name].notna()
    df = df[has_prompt_and_code]
    log(f"New data is loaded. New data has {len(df)} reports.")
    log(f"New data contains {df[code_column_name].nunique()} unique codes.")

    # Keep only codes that have more than min_samples_per_cls prompts.
    code_counts = df[code_column_name].value_counts()
    rare_codes = code_counts[code_counts <= min_samples_per_cls].index
    df = df[~df[code_column_name].isin(rare_codes)]

    # Keep only prompts that are at least min_text_len characters long.
    # .copy() detaches the result from the loaded frame so that the column
    # assignment below does not raise SettingWithCopyWarning.
    df = df[df[prompt_column_name].str.len() >= min_text_len].copy()

    # Creating GROUP column in dataset
    df[group_column_name] = df[code_column_name].apply(create_group)

    log(f"New data is processed. Processed data has {len(df)} reports.")
    log(f"Processed dataset contains {df[code_column_name].nunique()} codes.")
    log(f"Processed dataset contains {df[group_column_name].nunique()} groups.")

    # Saving processed dataset
    df.to_csv(path_to_processed_csv, index=False)
    log(f"Processed dataset is saved to {path_to_processed_csv}.")
    return df


def balance_data(df, prompt_column_name, code_column_name,
                group_column_name,random_n, test_size, path_to_train_csv,
                path_to_csv_test_codes, path_to_csv_test_groups):
    """
    Splits a dataset into stratified training and test sets, then saves these sets to CSV files.

    The split is stratified on the values of 'code_column_name' with a fixed
    random_state of 42. The training set is written to 'path_to_train_csv'.
    The full test split is written unchanged to both 'path_to_csv_test_codes'
    and 'path_to_csv_test_groups'; per-code / per-group sub-sampling of the test
    set is currently disabled, so 'random_n' has no effect.

    Only the prompt and code columns of 'df' are read. The group column of the
    saved files is re-derived from the codes by df_creation.

    Parameters:
        df (pandas.DataFrame): The DataFrame to be processed and split.
        prompt_column_name (str): The column name in the DataFrame for the prompts.
        code_column_name (str): The column name in the DataFrame for the codes.
        group_column_name (str): The column name used for the groups in the saved datasets.
        random_n (int): Intended number of rows per unique code or group in the test
                        datasets. Currently unused; kept for interface compatibility.
        test_size (float): The proportion of the dataset to include in the test split.
        path_to_train_csv (str): The file path where the training dataset CSV will be saved.
        path_to_csv_test_codes (str): The file path where the test dataset for codes CSV will be saved.
        path_to_csv_test_groups (str): The file path where the test dataset for groups CSV will be saved.

    Returns:
        None
    """

    texts = np.array(df[prompt_column_name])
    codes = np.array(df[code_column_name])

    # Encode every code as its position in the sorted list of unique codes.
    # A dict gives one O(1) lookup per row instead of a list.index() scan.
    all_classes = np.unique(codes).tolist()
    class_to_index = {class_name: idx for idx, class_name in enumerate(all_classes)}
    labels = [class_to_index[code] for code in codes]

    log('='*50)
    log(f"texts={len(texts)} labels={len(labels)} uniq_labels={len(all_classes)} test_size={test_size}")
    log('='*50)
    texts_train, texts_test, labels_train, labels_test = train_test_split(
        texts, labels, test_size=test_size, random_state=42, stratify=labels
    )

    log(f"Train dataset len={len(texts_train)}")
    log(f"Test dataset len={len(texts_test)}")
    log(f"Count of classes={len(all_classes)}")

    # Creating TRAIN and TEST dataset
    df_train = df_creation(texts_train, labels_train, all_classes,
                            prompt_column_name, code_column_name, group_column_name)
    df_train.to_csv(path_to_train_csv, index=False)
    log(f"TRAIN dataset is saved to {path_to_train_csv}")

    # Creating test datasets for codes and groups
    df_test = df_creation(texts_test, labels_test, all_classes,
                            prompt_column_name, code_column_name, group_column_name)

    # NOTE: balanced sub-sampling via select_random_rows(df_test, <column>, random_n)
    # is disabled; both files receive the complete test split.
    df_test.to_csv(path_to_csv_test_codes, index=False)
    log(f"TEST dataset for codes is saved to {path_to_csv_test_codes}")

    df_test.to_csv(path_to_csv_test_groups, index=False)
    log(f"TEST dataset for groups is saved to {path_to_csv_test_groups}")