File size: 1,958 Bytes
61f924c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from scripts.download_data import download_data
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import QuantileTransformer
import pandas as pd

def calculate_metric(model):
    """Return the F1 score of *model* on the test split.

    The test split is the second item returned by ``download_data()``.
    Its 'cardio' column is the target; every other column is passed to
    ``model.predict`` as the feature frame.

    Args:
        model: Any fitted object exposing ``predict(X)``.

    Returns:
        The value of ``sklearn.metrics.f1_score`` computed with
        ``pos_label='positive'``.
    """
    _unused_train, test_frame = download_data()

    features = test_frame.drop(columns=['cardio'])
    target = test_frame['cardio']

    predictions = model.predict(features)

    # NOTE(review): pos_label='positive' presumes the 'cardio' labels are
    # strings that include 'positive' -- confirm against download_data().
    return f1_score(target, predictions, pos_label='positive')


def model_training(random_state=None):
    """Fit the preprocessing + random-forest pipeline and score it.

    The training split is the first item returned by ``download_data()``.
    Its 'cardio' column is the target; the remaining columns are the
    features handed to the pipeline.

    Args:
        random_state: Seed forwarded to both the ``QuantileTransformer``
            and the ``RandomForestClassifier``. The default ``None`` keeps
            the original unseeded behaviour; pass an int to make repeated
            fits on the same data repeatable.

    Returns:
        tuple: ``(pipeline, score)`` where ``pipeline`` is the fitted
        ``Pipeline`` and ``score`` is whatever ``calculate_metric`` returns
        for it.
    """
    train_set, _ = download_data()
    X_train, y_train = train_set.drop(columns=['cardio']), train_set['cardio']

    num_columns = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
    cat_columns = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

    # Numeric features: quantile-map to a normal shape, standardise, then
    # apply a power transform. Step names are kept as-is because callers can
    # address them through the fitted pipeline.
    num_pipe = Pipeline([
        ('qt', QuantileTransformer(output_distribution="normal",
                                   random_state=random_state)),
        ('scaler', StandardScaler()),
        ('power', PowerTransformer()),
    ])

    # Categorical features: one-hot encode; handle_unknown='ignore' avoids an
    # error when predict() meets a category that was absent during fit.
    cat_pipe = Pipeline([
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Only the columns listed above are routed to a transformer; no
    # `remainder` is given, so ColumnTransformer's default applies to the rest.
    preprocessors_all = ColumnTransformer(transformers=[
        ('num_p', num_pipe, num_columns),
        ('cat_p', cat_pipe, cat_columns),
    ])

    # oob_score=True asks the forest for an out-of-bag estimate; this
    # function does not read it, but it stays on the returned estimator.
    forest = RandomForestClassifier(
        n_estimators=200,
        criterion="gini",
        min_samples_split=15,
        max_depth=15,
        oob_score=True,
        random_state=random_state,
    )

    pipe_all = Pipeline([
        ('preprocessors', preprocessors_all),
        ('model', forest),
    ])

    pipe_all.fit(X_train, y_train)

    return pipe_all, calculate_metric(pipe_all)