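"""Bequest-propensity modeling in Snowflake via Snowpark.

Compares imputation strategies (mean, median, MICE) for BIRTH_YEAR, trains a
SMOTE-balanced random forest on deceased constituents with stratified 3-fold
cross-validation, scores living constituents with a full-data refit of the
model, and writes predictions and evaluation metrics back to Snowflake tables.
"""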
import snowflake.snowpark as snowpark
# Importing enable_iterative_imputer activates scikit-learn's experimental
# IterativeImputer API; it must precede the IterativeImputer import.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import SimpleImputer, IterativeImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_recall_curve, auc
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
import json

def main(session: snowpark.Session):
    # Load data from table
    df = session.table("PUBLIC.BEQUESTS_CLEAN").to_pandas()

    # Define the imputation strategies to compare (mean, median, and MICE)
    imputers = {
        'mean': SimpleImputer(strategy='mean'),
        'median': SimpleImputer(strategy='median'),
        'mice': IterativeImputer(random_state=42)
    }
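    # Each strategy is evaluated independently. MICE (IterativeImputer) models
    # incomplete features as functions of the other features over several
    # rounds, whereas SimpleImputer fills with a single summary statistic.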

    # Store results
    results_dead = []
    results_alive = []
    results_modeling = []

    # Function to evaluate imputation method
    def evaluate_imputation(df, imputer_name, imputer):
        # Impute missing BIRTH_YEAR values (fit_transform returns a 2-D array,
        # so flatten it before assigning back to the column)
        df['BIRTH_YEAR'] = imputer.fit_transform(df[['BIRTH_YEAR']]).ravel()
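        # Caveat: with only a single column to impute, IterativeImputer has no
        # other features to condition on and effectively falls back to its
        # initial strategy (mean), so 'mice' will closely track 'mean' here.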
        
        # Encode categorical variables
        df = pd.get_dummies(df, columns=['REGION_CODE'], drop_first=True)
        
        # Define features after one-hot encoding
        feature_columns = [
            'TOTAL_TRANSACTIONS',
            'TOTAL_AMOUNT',
            'FIRST_GIFT_AMOUNT',
            'MRC_AMOUNT',
            'HPC_AMOUNT',
            'YEARS_SINCE_FIRST_GIFT',
            'YEARS_SINCE_MRC_GIFT',
            'YEARS_SINCE_HPC_GIFT',
            'BIRTH_YEAR'
        ] + [col for col in df.columns if col.startswith('REGION_CODE_')]

        # Separate dead and alive individuals
        df_dead = df[df['DEATH_FLAG'] == 1]
        df_alive = df[df['DEATH_FLAG'] == 0]

        # Train model on dead individuals
        if len(df_dead) > 0:
            X_dead = df_dead[feature_columns]
            y_dead = df_dead['BEQUEST_RECEIVED']
            ROI_FAMILY_ID_dead = df_dead['ROI_FAMILY_ID']
            
            # Cross-validation setup
            skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
            smote = SMOTE(random_state=42)
            model = RandomForestClassifier(random_state=42, n_jobs=-1)  # Use all available cores

            # Cross-validated predictions
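            # SMOTE is fit within each training fold so that synthetic
            # minority samples never leak into the held-out evaluation fold.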
            y_pred_dead = np.zeros(len(y_dead))
            y_pred_proba_dead = np.zeros(len(y_dead))
            for train_index, test_index in skf.split(X_dead, y_dead):
                X_train, X_test = X_dead.iloc[train_index], X_dead.iloc[test_index]
                y_train, y_test = y_dead.iloc[train_index], y_dead.iloc[test_index]

                X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
                model.fit(X_train_res, y_train_res)
                y_pred_dead[test_index] = model.predict(X_test)
                y_pred_proba_dead[test_index] = model.predict_proba(X_test)[:, 1]  # Probability for class 1

            # Evaluation for dead individuals
            accuracy_dead = accuracy_score(y_dead, y_pred_dead)
            precision_dead, recall_dead, _ = precision_recall_curve(y_dead, y_pred_proba_dead)
            auc_pr_dead = auc(recall_dead, precision_dead)
            report_dead = classification_report(y_dead, y_pred_dead, output_dict=True)
            # Refit on all dead records (without SMOTE) for feature importances;
            # this full-data model is also reused below to score living donors.
            model.fit(X_dead, y_dead)
            feature_importance_dead = pd.DataFrame({
                'Feature': X_dead.columns,
                'Importance': model.feature_importances_
            }).sort_values(by='Importance', ascending=False)

            results_dead.append({
                'imputer': imputer_name,
                'accuracy': accuracy_dead,
                'auc_pr': auc_pr_dead,
                'report': pd.DataFrame(report_dead).transpose(),
                'feature_importance': feature_importance_dead,
                'ROI_FAMILY_ID': ROI_FAMILY_ID_dead,
                'y_true': y_dead,
                'y_pred': y_pred_dead
            })

            results_modeling.append({
                'imputer': imputer_name,
                'accuracy': accuracy_dead,
                'auc_pr': auc_pr_dead,
                'classification_report': json.dumps(report_dead),
                # Serialize to JSON so the column is writable via write_pandas
                'feature_importance': json.dumps(feature_importance_dead.to_dict(orient='list'))
            })

        # Predict on alive individuals with the model refit on all dead records
        # (skipped if there were no dead records to train on, since `model`
        # would otherwise be undefined here)
        if len(df_alive) > 0 and len(df_dead) > 0:
            X_alive = df_alive[feature_columns]
            y_pred_alive = model.predict(X_alive)
            ROI_FAMILY_ID_alive = df_alive['ROI_FAMILY_ID']

            results_alive.append({
                'imputer': imputer_name,
                'ROI_FAMILY_ID': ROI_FAMILY_ID_alive,
                'y_pred': y_pred_alive
            })

    # Evaluate each imputation method
    for imputer_name, imputer in imputers.items():
        evaluate_imputation(df.copy(), imputer_name, imputer)

    # Print the modeling results for dead individuals
    for result in results_dead:
        print(f"Imputer: {result['imputer']} (Dead)")
        print("Accuracy:", result['accuracy'])
        print("AUC-PR:", result['auc_pr'])
        print("Classification Report:")
        print(result['report'])
        print("Feature Importance:")
        print(result['feature_importance'])
        print("\n" + "-"*50 + "\n")

    # Combine all dead predictions into a single DataFrame
    predictions_dead_df = pd.concat([
        pd.DataFrame({
            'ROI_FAMILY_ID': result['ROI_FAMILY_ID'],
            'imputer': result['imputer'],
            'y_true': result['y_true'],
            'y_pred': result['y_pred'],
            'status': 'dead'
        }) for result in results_dead
    ], ignore_index=True)

    # Combine all alive predictions into a single DataFrame
    predictions_alive_df = pd.concat([
        pd.DataFrame({
            'ROI_FAMILY_ID': result['ROI_FAMILY_ID'],
            'imputer': result['imputer'],
            'y_pred': result['y_pred'],
            'status': 'alive'
        }) for result in results_alive
    ], ignore_index=True)

    # Write the dead predictions DataFrame to a new table (auto_create_table
    # creates it if missing; reruns append unless overwrite=True is passed)
    session.write_pandas(predictions_dead_df, 'BEQUEST_PREDICTIONS_DEAD', auto_create_table=True)

    # Write the alive predictions DataFrame to a new table
    session.write_pandas(predictions_alive_df, 'BEQUEST_PREDICTIONS_ALIVE', auto_create_table=True)

    # Write the modeling results to a new table
    modeling_results_df = pd.DataFrame(results_modeling)
    session.write_pandas(modeling_results_df, 'BEQUEST_MODELING_RESULTS', auto_create_table=True)

    # Status message returned to the caller (surfaced as the procedure result)
    return "Data processing, prediction, and table creation completed successfully."