Vahe committed
Commit 30d7a06 · 1 Parent(s): 03c004c

initialized
app.py ADDED
@@ -0,0 +1,277 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+
+ import os
+ import sys
+
+ from company_bankruptcy.components.model_trainer import ModelTrainer
+ from company_bankruptcy.components.data_transformation import DataTransformation
+ from company_bankruptcy.utils.utils import load_object
+ from company_bankruptcy.logger.logger import logging
+ from company_bankruptcy.exception.exception import CustomException
+
+ def get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict):
+
+     # Resolve the winning model chosen at training time; it is stored under
+     # the reserved 'best_model_name' key of the trained models' dictionary.
+     best_model_name = trained_models_dict['best_model_name']
+
+     if best_model_name == 'Average Ensemble':
+
+         # Simple average of every base model's positive-class probability.
+         default_prob = 0
+         for model_name in trained_models_dict:
+             if model_name == 'best_model_name':
+                 continue
+             temp_features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
+             temp_prob = trained_models_dict[model_name].predict_proba(input_df[temp_features_list])[:, 1]
+             default_prob += temp_prob
+         default_prob /= (len(trained_models_dict) - 1)
+
+     elif best_model_name == 'Optimized Ensemble':
+
+         rfm_features_list = feature_selection_dict['RandomForestClassifier'][1]['selected_shap_feats']
+         xgbm_features_list = feature_selection_dict['XGBClassifier'][1]['selected_shap_feats']
+         lrm_features_list = feature_selection_dict['LogisticRegression'][1]['selected_shap_feats']
+         svcm_features_list = feature_selection_dict['SVC'][1]['selected_shap_feats']
+
+         preds_list = []
+
+         # Each entry of opt_dict holds the four base models of one CV fold
+         # plus the OptimizeAUC instance fitted on their stacked probabilities.
+         for idx in opt_dict:
+             opt = opt_dict[idx]['opt']
+             rfm = opt_dict[idx]['rfm']
+             xgbm = opt_dict[idx]['xgbm']
+             lrm = opt_dict[idx]['lrm']
+             svcm = opt_dict[idx]['svcm']
+
+             rfm_probs = rfm.predict_proba(input_df[rfm_features_list])[:, 1]
+             xgbm_probs = xgbm.predict_proba(input_df[xgbm_features_list])[:, 1]
+             lrm_probs = lrm.predict_proba(input_df[lrm_features_list])[:, 1]
+             svcm_probs = svcm.predict_proba(input_df[svcm_features_list])[:, 1]
+
+             model_preds = np.column_stack([
+                 rfm_probs,
+                 xgbm_probs,
+                 lrm_probs,
+                 svcm_probs
+             ])
+
+             preds_list.append(opt.predict(model_preds))
+
+         default_prob = np.mean(np.column_stack(preds_list), axis=1)
+
+     elif best_model_name == 'Rank Ensemble':
+
+         # Weight each model's probabilities by its CV rank (1 for the worst
+         # score up to n for the best) and normalize by 1 + 2 + ... + n.
+         rank_ensemble_list = []
+         prob_list = []
+         model_names_list = []
+
+         for model_name in trained_models_dict:
+             if model_name == 'best_model_name':
+                 continue
+             temp_features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
+             model_names_list.append(model_name)
+             rank_ensemble_list.append((model_name, trained_models_dict[model_name].best_score_))
+             prob_list.append(trained_models_dict[model_name].predict_proba(input_df[temp_features_list])[:, 1])
+
+         rank_ensemble_list = sorted(rank_ensemble_list, key=lambda x: x[1])
+
+         default_prob = 0
+         for i in range(len(rank_ensemble_list)):
+             default_prob += (i+1) * prob_list[model_names_list.index(rank_ensemble_list[i][0])]
+         default_prob /= (len(rank_ensemble_list) * (1 + len(rank_ensemble_list)) / 2)
+
+     else:
+         # A single best model: predict with its own SHAP-selected features.
+         model = trained_models_dict[best_model_name]
+         temp_features_list = feature_selection_dict[best_model_name][1]['selected_shap_feats']
+         default_prob = model.predict_proba(input_df[temp_features_list])[:, 1]
+
+     return default_prob
+
+ st.set_page_config(
+     page_title='Default Predictor',
+     layout='centered'
+ )
+
+ try:
+
+     st.title('Company Default Predictor')
+
+     logging.info('Initiating dictionaries')
+     if 'trained_models_dict' not in st.session_state:
+         model_trainer_obj = ModelTrainer()
+         trained_models_dict = load_object(
+             os.path.join(
+                 model_trainer_obj.model_trainer_config.trained_models_path,
+                 'trained_models_dict.pkl'
+             )
+         )
+         opt_dict = load_object(
+             os.path.join(
+                 model_trainer_obj.model_trainer_config.trained_models_path,
+                 'opt_dict.pkl'
+             )
+         )
+
+         data_transformation_obj = DataTransformation()
+         feature_selection_dict = load_object(
+             data_transformation_obj.data_transformation_config.feature_selection_dict_file_path
+         )
+
+         example_data = pd.read_excel('app_input_example.xlsx')
+
+         # Cache everything in the session so reruns skip the expensive loads.
+         st.session_state['trained_models_dict'] = trained_models_dict
+         st.session_state['opt_dict'] = opt_dict
+         st.session_state['feature_selection_dict'] = feature_selection_dict
+         st.session_state['example_data'] = example_data
+
+     else:
+
+         trained_models_dict = st.session_state['trained_models_dict']
+         opt_dict = st.session_state['opt_dict']
+         feature_selection_dict = st.session_state['feature_selection_dict']
+         example_data = st.session_state['example_data']
+     logging.info('Dictionaries initiated')
+
+     logging.info('Checking button clicked')
+     if 'clicked' not in st.session_state:
+         st.session_state.clicked = False
+     logging.info(f'Button check passed with value {st.session_state.clicked}')
+
+     st.subheader('Please fill in the input boxes or provide a csv/excel file, then click the Submit button to get the default probability(ies).')
+
+     best_model_name = trained_models_dict['best_model_name']
+
+     logging.info("Getting features' list")
+     if best_model_name in ['Average Ensemble', 'Optimized Ensemble', 'Rank Ensemble']:
+         # Ensembles need the union of every base model's selected features.
+         features_list = []
+         for model_name in feature_selection_dict:
+             features_list.extend(
+                 feature_selection_dict[model_name][1]['selected_shap_feats']
+             )
+         features_list = list(set(features_list))
+     else:
+         features_list = feature_selection_dict[best_model_name][1]['selected_shap_feats']
+     logging.info("Features' list found")
+
+     upload_container = st.container()
+     with upload_container:
+         upload_col1, upload_col2 = st.columns([0.6, 0.4])
+         uploaded_file = upload_col1.file_uploader(
+             'Upload a csv/excel file with data',
+             type=["csv", "xlsx"]
+         )
+
+         csv_data = example_data[features_list].to_csv(index=False).encode("utf-8")
+
+         upload_col2.write('An example of the data file')
+         upload_col2.download_button(
+             'Download',
+             data=csv_data,
+             file_name='input_example.csv',
+             mime="text/csv"
+         )
+
+     # Two inputs per row; an odd feature count leaves a single input in the
+     # last row.
+     n_cols = 2
+     n_rows = len(features_list) // n_cols
+     if len(features_list) % n_cols != 0:
+         n_rows += 1
+
+     logging.info('Constructing the app input structure')
+     input_dict = {}
+     feature_idx = 0
+     for i in range(n_rows):
+
+         temp_input_container = st.container()
+
+         with temp_input_container:
+             col1, col2 = st.columns(n_cols)
+             # Flag features are binary, hence the integer display format.
+             input_dict[features_list[feature_idx]] = [
+                 col1.number_input(
+                     features_list[feature_idx],
+                     format='%.6f' if features_list[feature_idx].split(' ')[-1] != 'Flag' else '%.0f'
+                 )
+             ]
+             if feature_idx + 1 < len(features_list):
+                 input_dict[features_list[feature_idx+1]] = [
+                     col2.number_input(
+                         features_list[feature_idx+1],
+                         format='%.6f' if features_list[feature_idx+1].split(' ')[-1] != 'Flag' else '%.0f'
+                     )
+                 ]
+
+         feature_idx += 2
+
+     logging.info('Input structure constructed')
+
+     def set_button_click():
+         st.session_state.clicked = True
+
+     st.button('Submit', on_click=set_button_click)
+
+     if st.session_state.clicked and uploaded_file is None:
+
+         st.session_state.clicked = False
+
+         logging.info(f'Calculating prob for {best_model_name}')
+
+         input_df = pd.DataFrame(input_dict)
+
+         default_prob = get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict)
+
+         st.write(f"Default probability: {default_prob[0]:.4f}")
+
+         logging.info(f'Default prob: {default_prob[0]:.4f}')
+
+     elif st.session_state.clicked and uploaded_file is not None:
+         st.session_state.clicked = False
+         logging.info('Loading uploaded data')
+         file_extension = uploaded_file.name.split('.')[-1]
+         if file_extension == 'csv':
+             input_df = pd.read_csv(uploaded_file)
+         else:
+             input_df = pd.read_excel(uploaded_file)
+         logging.info('Uploaded data loaded')
+
+         with st.spinner('Please wait...'):
+             logging.info(f'Calculating probabilities for {best_model_name}')
+             default_prob = get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict)
+             logging.info('Probabilities calculated')
+
+         result_df = pd.DataFrame()
+         result_df['default_probability'] = default_prob
+
+         result_data = result_df.to_csv(index=False).encode("utf-8")
+
+         st.success('Done!')
+
+         st.download_button(
+             'Download the predicted probabilities',
+             data=result_data,
+             file_name='default_probabilities.csv',
+             mime='text/csv'
+         )
+
+ except Exception as e:
+     logging.info('Error occurred while creating the streamlit app')
+     raise CustomException(e, sys)
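A quick numeric illustration of the rank-ensemble weighting in get_prob above, with made-up scores and probabilities for three models (the names and numbers are illustrative only, not values from this project):

import numpy as np

scores = {'A': 0.91, 'B': 0.94, 'C': 0.89}            # hypothetical CV scores
probs = {'A': np.array([0.20]), 'B': np.array([0.60]), 'C': np.array([0.40])}

ranked = sorted(scores, key=scores.get)                # ['C', 'A', 'B'], worst to best
weights = {name: i + 1 for i, name in enumerate(ranked)}  # C=1, A=2, B=3
total = len(ranked) * (len(ranked) + 1) / 2            # 1 + 2 + 3 = 6

blended = sum(weights[n] * probs[n] for n in ranked) / total
print(blended)  # (1*0.40 + 2*0.20 + 3*0.60) / 6 = 0.4333...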
app_input_example.xlsx ADDED
Binary file (11.1 kB).
 
artifacts/feature_selection_dict.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ffff597549a2c76e13872a5f2048d4b83da2a0f25eeccbade02a575871d84bf9
+ size 1930217
artifacts/models/opt_dict.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f79cee91a25f02eb551b29c882f6afc778d175c4d755497234c1b2f49f3bbde
+ size 15200636
artifacts/models/trained_models_dict.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9695d244584ea79581682da0530cbdfb3dd02c76598114626a09e5f3bac3b520
+ size 1983143
company_bankruptcy/__init__.py ADDED
File without changes
company_bankruptcy/components/__init__.py ADDED
File without changes
company_bankruptcy/components/data_ingestion.py ADDED
@@ -0,0 +1,61 @@
+ import pandas as pd
+ import numpy as np
+
+ from company_bankruptcy.logger.logger import logging
+ from company_bankruptcy.exception.exception import CustomException
+ from company_bankruptcy.data_access.mongo_db_connection import MongoOps
+ from company_bankruptcy.constants.constants import DATABASE_NAME, COLLECTION_NAME, MONGODB_COLLECTION_STR
+
+ import os
+ import sys
+ from pathlib import Path
+ from dataclasses import dataclass
+
+ from sklearn.model_selection import train_test_split
+
+ # The connection string is read from the environment variable whose name is
+ # given by MONGODB_COLLECTION_STR (see constants.py); credentials must not be
+ # hardcoded in source control.
+ MONGODB_COLLECTION_STR = os.getenv(MONGODB_COLLECTION_STR)
+
+ @dataclass
+ class DataIngestionConfig:
+     raw_data_path:str = os.path.join('artifacts', 'data.csv')
+     train_data_path:str = os.path.join('artifacts', 'train_data.csv')
+     test_data_path:str = os.path.join('artifacts', 'test_data.csv')
+
+ class DataIngestion:
+
+     def __init__(self):
+         self.ingestion_config = DataIngestionConfig()
+
+     def initiate_data_ingestion(self):
+         logging.info('Data ingestion started')
+         try:
+             logging.info('Reading the raw data')
+             mongo_instance = MongoOps(
+                 client_url=MONGODB_COLLECTION_STR
+             )
+             data = mongo_instance.get_records(coll_name=COLLECTION_NAME, db_name=DATABASE_NAME)
+             logging.info('Data loaded')
+             os.makedirs(os.path.dirname(self.ingestion_config.raw_data_path), exist_ok=True)
+             logging.info('Saving the data')
+             data.to_csv(self.ingestion_config.raw_data_path, index=False)
+             logging.info('Data saved')
+             logging.info('Splitting the data into train and test sets')
+             # Stratify on the target so both splits keep the rare bankruptcy rate.
+             train_df, test_df = train_test_split(
+                 data,
+                 test_size=0.1,
+                 random_state=13,
+                 stratify=data['Bankrupt?']
+             )
+             logging.info('Saving train and test sets')
+             train_df.to_csv(self.ingestion_config.train_data_path, index=False)
+             test_df.to_csv(self.ingestion_config.test_data_path, index=False)
+             logging.info('Sets are saved')
+             logging.info('Data ingestion completed')
+             return (self.ingestion_config.train_data_path, self.ingestion_config.test_data_path)
+         except Exception as e:
+             logging.info('Error occurred during data ingestion')
+             raise CustomException(e, sys)
+
+ if __name__ == '__main__':
+     data_ingestion_obj = DataIngestion()
+     train_path, test_path = data_ingestion_obj.initiate_data_ingestion()
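For completeness, a minimal sketch of how the connection string is expected to reach the ingestion step through the environment (the URI below is a placeholder, not a real credential):

# shell: export MONGODB_COLLECTION_STR='mongodb+srv://<user>:<password>@<cluster>/...'
import os
from company_bankruptcy.constants.constants import MONGODB_COLLECTION_STR

uri = os.getenv(MONGODB_COLLECTION_STR)  # constants.py stores the variable's *name*
assert uri is not None, 'Set the MONGODB_COLLECTION_STR environment variable first'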
company_bankruptcy/components/data_transformation.py ADDED
@@ -0,0 +1,85 @@
+ import pandas as pd
+ from company_bankruptcy.logger.logger import logging
+ from company_bankruptcy.exception.exception import CustomException
+
+ import os
+ import sys
+ from dataclasses import dataclass
+
+ from sklearn.model_selection import StratifiedKFold
+
+ from company_bankruptcy.utils.utils import save_object, create_feature_selection_dict
+
+ @dataclass
+ class DataTransformationConfig:
+     feature_selection_dict_file_path = os.path.join('artifacts', 'feature_selection_dict.pkl')
+
+ class DataTransformation:
+
+     def __init__(self):
+         self.data_transformation_config = DataTransformationConfig()
+
+     def initiate_data_transformation(self, train_path, test_path, n_cv_folds=10):
+
+         try:
+             logging.info('Loading training data')
+             train_df = pd.read_csv(train_path)
+             logging.info('Training data loaded')
+
+             logging.info('Loading testing data')
+             test_df = pd.read_csv(test_path)
+             logging.info('Testing data loaded')
+
+             # Note the leading space in the original column name.
+             logging.info('Removing Net Income Flag')
+             train_df.drop(columns=' Net Income Flag', inplace=True)
+             test_df.drop(columns=' Net Income Flag', inplace=True)
+             logging.info('Net Income Flag removed')
+
+             logging.info('Specifying nominal and numerical features as lists')
+             nominal_features = [' Liability-Assets Flag']
+             numerical_features = [col for col in train_df.columns if col not in nominal_features and col != 'Bankrupt?']
+             logging.info('Nominal and numerical features specified')
+
+             logging.info(f'Creating {n_cv_folds} CV folds for train data')
+             skfold = StratifiedKFold(n_splits=n_cv_folds, random_state=42, shuffle=True)
+             skfold_list = []
+             for train_idxs, valid_idxs in skfold.split(train_df, y=train_df['Bankrupt?']):
+                 skfold_list.append((train_idxs, valid_idxs))
+             logging.info('CV folds created')
+
+             logging.info('Starting feature selection')
+             selected_features_dict = create_feature_selection_dict(
+                 data=train_df,
+                 cv_fold_list=skfold_list,
+                 numerical_features=numerical_features,
+                 nominal_features=nominal_features
+             )
+             logging.info('Feature selection completed')
+
+             logging.info('Saving feature selection dictionary as a pkl file')
+             save_object(
+                 file_path=self.data_transformation_config.feature_selection_dict_file_path,
+                 obj=selected_features_dict
+             )
+             logging.info('Dictionary saved')
+
+             return (train_df, test_df, skfold_list, numerical_features)
+
+         except Exception as e:
+             logging.info('Error occurred during data transformation')
+             raise CustomException(e, sys)
+
+ if __name__ == '__main__':
+
+     data_transformation_obj = DataTransformation()
+     train_df, test_df, cv_fold_list, numerical_features = data_transformation_obj.initiate_data_transformation(
+         train_path=os.path.join('artifacts', 'train_data.csv'),
+         test_path=os.path.join('artifacts', 'test_data.csv')
+     )
company_bankruptcy/components/model_evaluation.py ADDED
@@ -0,0 +1,164 @@
+ import pandas as pd
+ import numpy as np
+
+ from company_bankruptcy.logger.logger import logging
+ from company_bankruptcy.exception.exception import CustomException
+ from company_bankruptcy.utils.utils import load_object
+ from company_bankruptcy.components.model_trainer import ModelTrainer
+ from company_bankruptcy.components.data_transformation import DataTransformation
+
+ import os
+ import sys
+
+ import mlflow
+ import mlflow.sklearn
+ import mlflow.xgboost
+
+ from sklearn.metrics import roc_auc_score
+
+ from urllib.parse import urlparse
+
+
+ class ModelEvaluation:
+
+     def __init__(self):
+         logging.info('Model evaluation started')
+
+     def initiate_model_evaluation(self, test_df):
+
+         try:
+
+             logging.info('Setting target variable')
+             y_test = test_df['Bankrupt?'].to_frame()
+             logging.info('Target variable set')
+
+             logging.info('Loading the trained models')
+             model_trainer_obj = ModelTrainer()
+             models_main_path = model_trainer_obj.model_trainer_config.trained_models_path
+             trained_models_dict = load_object(
+                 os.path.join(models_main_path, 'trained_models_dict.pkl')
+             )
+             opt_dict = load_object(
+                 os.path.join(models_main_path, 'opt_dict.pkl')
+             )
+             logging.info('Trained models loaded')
+
+             logging.info("Loading the features' dictionary")
+             data_transformation_obj = DataTransformation()
+             features_selection_dict_path = data_transformation_obj.data_transformation_config.feature_selection_dict_file_path
+             feature_selection_dict = load_object(features_selection_dict_path)
+             logging.info("Features' selection dictionary loaded")
+
+             test_score_dict = {}
+
+             # Average Ensemble: unweighted mean of the base models' probabilities.
+             logging.info('Finding test score for Average Ensemble')
+             y_test_pred_prob = 0
+             for model_name in trained_models_dict:
+                 if model_name == 'best_model_name':
+                     continue
+                 features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
+                 temp_prob = trained_models_dict[model_name].predict_proba(test_df[features_list])[:, 1]
+                 y_test_pred_prob += temp_prob
+             y_test_pred_prob /= (len(trained_models_dict) - 1)
+             avg_ens_score = roc_auc_score(y_test, y_test_pred_prob)
+             test_score_dict['AverageEnsemble'] = avg_ens_score
+             logging.info('Average Ensemble score calculated')
+
+             # Optimized Ensemble: per-fold OptimizeAUC blends of the stacked
+             # base-model probabilities, averaged across folds.
+             logging.info('Finding test score for Optimized Ensemble')
+             rfm_features_list = feature_selection_dict['RandomForestClassifier'][1]['selected_shap_feats']
+             xgbm_features_list = feature_selection_dict['XGBClassifier'][1]['selected_shap_feats']
+             lrm_features_list = feature_selection_dict['LogisticRegression'][1]['selected_shap_feats']
+             svcm_features_list = feature_selection_dict['SVC'][1]['selected_shap_feats']
+
+             preds_list = []
+
+             for idx in opt_dict:
+                 opt = opt_dict[idx]['opt']
+                 rfm = opt_dict[idx]['rfm']
+                 xgbm = opt_dict[idx]['xgbm']
+                 lrm = opt_dict[idx]['lrm']
+                 svcm = opt_dict[idx]['svcm']
+
+                 rfm_probs = rfm.predict_proba(test_df[rfm_features_list])[:, 1]
+                 xgbm_probs = xgbm.predict_proba(test_df[xgbm_features_list])[:, 1]
+                 lrm_probs = lrm.predict_proba(test_df[lrm_features_list])[:, 1]
+                 svcm_probs = svcm.predict_proba(test_df[svcm_features_list])[:, 1]
+
+                 model_preds = np.column_stack([
+                     rfm_probs,
+                     xgbm_probs,
+                     lrm_probs,
+                     svcm_probs
+                 ])
+
+                 preds_list.append(opt.predict(model_preds))
+
+             y_test_pred_prob = np.mean(np.column_stack(preds_list), axis=1)
+             optimized_ens_score = roc_auc_score(y_test, y_test_pred_prob)
+             test_score_dict['OptimizedEnsemble'] = optimized_ens_score
+             logging.info('Optimized Ensemble score calculated')
+
+             # Rank Ensemble: rank-weighted mean (weight 1 for the worst CV
+             # score up to n for the best, normalized by 1 + 2 + ... + n).
+             logging.info('Finding test score for Rank Ensemble')
+             rank_ensemble_list = []
+             prob_list = []
+             model_names_list = []
+
+             for model_name in trained_models_dict:
+                 if model_name == 'best_model_name':
+                     continue
+                 features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
+                 model_names_list.append(model_name)
+                 rank_ensemble_list.append((model_name, trained_models_dict[model_name].best_score_))
+                 prob_list.append(trained_models_dict[model_name].predict_proba(test_df[features_list])[:, 1])
+
+             rank_ensemble_list = sorted(rank_ensemble_list, key=lambda x: x[1])
+
+             y_test_pred_prob = 0
+             for i in range(len(rank_ensemble_list)):
+                 y_test_pred_prob += (i+1) * prob_list[model_names_list.index(rank_ensemble_list[i][0])]
+             y_test_pred_prob /= (len(rank_ensemble_list) * (1 + len(rank_ensemble_list)) / 2)
+             rank_ens_score = roc_auc_score(y_test, y_test_pred_prob)
+             test_score_dict['RankEnsemble'] = rank_ens_score
+             logging.info('Rank Ensemble score calculated')
+
+             # Individual base models.
+             for model_name in trained_models_dict:
+                 if model_name == 'best_model_name':
+                     continue
+                 logging.info(f'Finding test score for {model_name}')
+                 features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
+                 model = trained_models_dict[model_name]
+                 y_test_pred_prob = model.predict_proba(test_df[features_list])[:, 1]
+                 temp_score = roc_auc_score(y_test, y_test_pred_prob)
+                 test_score_dict[model_name] = temp_score
+                 logging.info(f'{model_name} score calculated')
+
+             logging.info('Getting mlflow tracking URI type')
+             tracking_uri_type_store = urlparse(mlflow.get_tracking_uri()).scheme
+             logging.info('Tracking URI retrieved')
+
+             logging.info('Starting mlflow')
+             with mlflow.start_run():
+                 for model_name in test_score_dict:
+                     mlflow.log_metric(f'{model_name} ROC-AUC', test_score_dict[model_name])
+                     if model_name in trained_models_dict:
+                         model = trained_models_dict[model_name]
+                         # Remote tracking stores support the model registry;
+                         # a local file store does not.
+                         if tracking_uri_type_store != 'file':
+                             mlflow.sklearn.log_model(model, f'{model_name}', registered_model_name=f'{model_name}_model')
+                         else:
+                             mlflow.sklearn.log_model(model, f'{model_name}')
+
+             logging.info('mlflow succeeded')
+
+         except Exception as e:
+
+             logging.info('Error occurred during model evaluation')
+             raise CustomException(e, sys)
+
company_bankruptcy/components/model_trainer.py ADDED
@@ -0,0 +1,68 @@
+ import pandas as pd
+ import numpy as np
+ from company_bankruptcy.logger.logger import logging
+ from company_bankruptcy.exception.exception import CustomException
+ from company_bankruptcy.utils.utils import save_object, find_optimal_model
+
+ import os
+ import sys
+ from pathlib import Path
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class ModelTrainerConfig:
+     trained_models_path = os.path.join('artifacts', 'models')
+
+
+ class ModelTrainer:
+
+     def __init__(self):
+         self.model_trainer_config = ModelTrainerConfig()
+
+     def initiate_model_training(self, train_df, test_df, features_dict_path, cv_fold_list, numerical_features):
+
+         try:
+
+             logging.info('Creating a directory to save trained models')
+             os.makedirs(self.model_trainer_config.trained_models_path, exist_ok=True)
+             logging.info("Models' directory created")
+
+             logging.info('Finding the best model')
+             trained_models_dict, opt_dict = find_optimal_model(
+                 train_df,
+                 test_df,
+                 features_dict_path,
+                 cv_fold_list,
+                 numerical_features
+             )
+
+             logging.info("Saving trained models' and ensemble optimized weights' dictionaries")
+             save_object(
+                 file_path=os.path.join(self.model_trainer_config.trained_models_path, 'trained_models_dict.pkl'),
+                 obj=trained_models_dict
+             )
+
+             save_object(
+                 file_path=os.path.join(self.model_trainer_config.trained_models_path, 'opt_dict.pkl'),
+                 obj=opt_dict
+             )
+             logging.info('Saving completed')
+
+         except Exception as e:
+             logging.info('Error occurred during model training')
+             raise CustomException(e, sys)
+
+ # if __name__ == '__main__':
+ #     model_training_obj = ModelTrainer()
+ #     model_training_obj.initiate_model_training(
+ #         train_df,
+ #         test_df,
+ #         features_dict_path,
+ #         cv_fold_list,
+ #         numerical_features
+ #     )
company_bankruptcy/constants/__init__.py ADDED
File without changes
company_bankruptcy/constants/constants.py ADDED
@@ -0,0 +1,5 @@
+ DATABASE_NAME = "bankruptcy"
+
+ COLLECTION_NAME = "data"
+
+ # Name of the environment variable expected to hold the MongoDB connection string.
+ MONGODB_COLLECTION_STR = "MONGODB_COLLECTION_STR"
company_bankruptcy/data_access/__init__.py ADDED
File without changes
company_bankruptcy/data_access/mongo_db_connection.py ADDED
@@ -0,0 +1,104 @@
+ import pandas as pd
+ import pymongo
+ import json
+
+ from company_bankruptcy.exception.exception import CustomException
+ from company_bankruptcy.logger.logger import logging
+ from company_bankruptcy.constants.constants import DATABASE_NAME, COLLECTION_NAME, MONGODB_COLLECTION_STR
+
+ import os
+ import sys
+
+
+ class MongoOps:
+
+     def __init__(self, client_url:str, database_name:str=None, collection_name:str=None):
+         self.client_url = client_url
+         self.database_name = database_name
+         self.collection_name = collection_name
+
+     def create_client(self):
+         logging.info('Initiating MongoClient')
+         client = pymongo.MongoClient(self.client_url)
+         logging.info('MongoClient initiated')
+         return client
+
+     def create_database(self):
+         logging.info('Creating Mongo database')
+         client = self.create_client()
+         database = client[self.database_name]
+         logging.info(f'Mongo database {self.database_name} created')
+         return database
+
+     def create_collection(self):
+         logging.info('Creating Mongo collection')
+         database = self.create_database()
+         collection = database[self.collection_name]
+         logging.info(f'Mongo collection {self.collection_name} created')
+         return collection
+
+     def get_database(self, db_name:str):
+         logging.info(f'Accessing {db_name} database')
+         client = self.create_client()
+         database = client[db_name]
+         logging.info(f'{db_name} database accessed')
+         return database
+
+     def get_collection(self, coll_name:str, db_name:str):
+         logging.info(f'Accessing {coll_name} collection')
+         database = self.get_database(db_name)
+         collection = database[coll_name]
+         logging.info(f'{coll_name} collection accessed')
+         return collection
+
+     def insert_record(self, record, coll_name:str, db_name:str):
+         collection = self.get_collection(coll_name, db_name)
+         logging.info(f'Starting record insertion into {coll_name} collection of {db_name} database')
+         if isinstance(record, list):
+             for data in record:
+                 if not isinstance(data, dict):
+                     logging.info("Records' list should have elements as dict")
+                     raise TypeError("Records' list should have elements as dict")
+             collection.insert_many(record)
+         elif isinstance(record, dict):
+             collection.insert_one(record)
+         logging.info(f'Insertion into {coll_name} collection of {db_name} database completed')
+
+     def insert_from_file(self, datafile:str, coll_name:str, db_name:str):
+         logging.info(f'Starting record insertion into {coll_name} collection of {db_name} database from {datafile}')
+         self.path = datafile
+
+         if self.path.endswith('.csv'):
+             df = pd.read_csv(self.path, encoding='utf-8')
+         elif self.path.endswith('.xlsx'):
+             # read_excel does not take an encoding argument.
+             df = pd.read_excel(self.path)
+         logging.info('Data is loaded as a pandas dataframe')
+
+         logging.info('Converting the data into json')
+         # orient must be 'records' to get one dict per row.
+         datajson = json.loads(df.to_json(orient='records'))
+         logging.info('Conversion to json completed')
+
+         collection = self.get_collection(coll_name, db_name)
+
+         logging.info('Inserting json data')
+         collection.insert_many(datajson)
+         logging.info('Insertion completed')
+
+     def get_records(self, coll_name:str, db_name:str):
+         collection = self.get_collection(coll_name, db_name)
+         retrieved_data = pd.DataFrame(list(collection.find()))
+         try:
+             # Drop MongoDB's internal _id column before handing the data on.
+             retrieved_data.drop(columns='_id', inplace=True)
+             logging.info('Loading the data from the database completed')
+         except Exception as e:
+             retrieved_data = pd.DataFrame()
+             logging.info('Loading the data from the database failed')
+             raise CustomException(e, sys)
+         return retrieved_data
+
+ if __name__ == '__main__':
+
+     # MONGODB_COLLECTION_STR names the environment variable holding the URI.
+     mongo_instance = MongoOps(
+         client_url=os.getenv(MONGODB_COLLECTION_STR)
+     )
+
+     retrieved_data = mongo_instance.get_records(coll_name=COLLECTION_NAME, db_name=DATABASE_NAME)
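A quick check of the JSON conversion used by insert_from_file above; the orient must be the plural 'records' to get one dict per row (the singular 'record' is not an accepted orient in pandas):

import json
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
datajson = json.loads(df.to_json(orient='records'))
print(datajson)  # [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]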
company_bankruptcy/exception/__init__.py ADDED
File without changes
company_bankruptcy/exception/exception.py ADDED
@@ -0,0 +1,20 @@
+ import sys
+
+
+ class CustomException(Exception):
+
+     def __init__(self, error_message, error_details):
+         # error_details is expected to be the sys module; sys.exc_info()
+         # exposes the traceback of the exception currently being handled.
+         self.error_message = error_message
+         _, _, exc_tb = error_details.exc_info()
+         self.lineno = exc_tb.tb_lineno
+         self.file_name = exc_tb.tb_frame.f_code.co_filename
+
+     def __str__(self):
+         return "Error occurred in python script name [{0}] line number [{1}] error message [{2}]".format(
+             self.file_name, self.lineno, str(self.error_message))
+
+ if __name__ == '__main__':
+     try:
+         1 / 0
+     except Exception as e:
+         raise CustomException(e, sys)
company_bankruptcy/logger/__init__.py ADDED
File without changes
company_bankruptcy/logger/logger.py ADDED
@@ -0,0 +1,20 @@
+ import logging
+ import os
+ from datetime import datetime as dt
+
+ LOG_FILE = f"{dt.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"
+
+ log_path = os.path.join(os.getcwd(), "logs")
+
+ os.makedirs(log_path, exist_ok=True)
+
+ LOG_FILEPATH = os.path.join(log_path, LOG_FILE)
+
+ logging.basicConfig(
+     level=logging.INFO,
+     filename=LOG_FILEPATH,
+     format="[%(asctime)s] %(lineno)d %(name)s - %(levelname)s - %(message)s"
+ )
+
+ if __name__ == '__main__':
+     logging.info("Log testing executed!!!")
company_bankruptcy/pipeline/__init__.py ADDED
File without changes
company_bankruptcy/pipeline/prediction_pipeline.py ADDED
File without changes
company_bankruptcy/pipeline/training_pipeline.py ADDED
@@ -0,0 +1,27 @@
+ from company_bankruptcy.components.data_ingestion import DataIngestion
+ from company_bankruptcy.components.data_transformation import DataTransformation
+ from company_bankruptcy.components.model_trainer import ModelTrainer
+ from company_bankruptcy.components.model_evaluation import ModelEvaluation
+
+ def run_pipeline():
+
+     data_ingestion_obj = DataIngestion()
+     train_path, test_path = data_ingestion_obj.initiate_data_ingestion()
+
+     data_transformation_obj = DataTransformation()
+     train_df, test_df, cv_fold_list, numerical_features = data_transformation_obj.initiate_data_transformation(
+         train_path=train_path,
+         test_path=test_path
+     )
+
+     model_training_obj = ModelTrainer()
+     model_training_obj.initiate_model_training(
+         train_df=train_df,
+         test_df=test_df,
+         features_dict_path=data_transformation_obj.data_transformation_config.feature_selection_dict_file_path,
+         cv_fold_list=cv_fold_list,
+         numerical_features=numerical_features
+     )
+
+     model_evaluation_obj = ModelEvaluation()
+     model_evaluation_obj.initiate_model_evaluation(test_df)
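run_pipeline is not invoked anywhere in this commit; a minimal, assumed runner script would be:

# run.py (hypothetical entry point, not part of this commit)
from company_bankruptcy.pipeline.training_pipeline import run_pipeline

if __name__ == '__main__':
    run_pipeline()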
company_bankruptcy/utils/__init__.py ADDED
File without changes
company_bankruptcy/utils/utils.py ADDED
@@ -0,0 +1,974 @@
1
+ import os
2
+ import sys
3
+ import pickle
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ from company_bankruptcy.logger.logger import logging
8
+ from company_bankruptcy.exception.exception import CustomException
9
+
10
+ from sklearn.svm import SVC
11
+ from sklearn.feature_selection import RFE
12
+ from sklearn.feature_selection import r_regression, SelectKBest
13
+ from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
14
+ from sklearn.feature_selection import f_classif, chi2
15
+ from sklearn.ensemble import RandomForestClassifier
16
+ from sklearn.linear_model import LogisticRegression
17
+ from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
18
+ from sklearn.preprocessing import StandardScaler
19
+ from sklearn.model_selection import GridSearchCV
20
+ from sklearn.pipeline import Pipeline
21
+ from sklearn.compose import ColumnTransformer
22
+
23
+ from xgboost import XGBClassifier
24
+
25
+ from scipy import stats
26
+ from scipy.special import softmax
27
+ from scipy.optimize import fmin
28
+
29
+ from functools import partial
30
+
31
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
32
+
33
+ from boruta import BorutaPy
34
+
35
+ import shap
36
+
37
+ from collections import Counter
38
+
39
+ from tqdm.auto import tqdm
40
+ import gc
41
+
42
+ import warnings
43
+ warnings.filterwarnings('ignore')
44
+
45
+
46
+ def save_object(file_path, obj):
47
+ try:
48
+ dir_path = os.path.dirname(file_path)
49
+
50
+ os.makedirs(dir_path, exist_ok=True)
51
+
52
+ with open(file_path, "wb") as file_obj:
53
+ pickle.dump(obj, file_obj)
54
+
55
+ except Exception as e:
56
+ raise CustomException(e, sys)
57
+
58
+ def load_object(file_path):
59
+ try:
60
+ with open(file_path, 'rb') as file_obj:
61
+ return pickle.load(file_obj)
62
+ except Exception as e:
63
+ logging.info('Exception Occured in load_object function utils')
64
+ raise CustomException(e, sys)
65
+
66
+
67
+ def get_shap_features(shap_values, features, topk=10):
68
+ '''
69
+ Returns topk features selected using shap values
70
+
71
+ Args:
72
+ shap_values (object): shap explainer
73
+ features (list): list of features' name
74
+
75
+ Returns:
76
+ list: topk features derived from shap values
77
+ '''
78
+ # Calculates the feature importance (mean absolute shap value) for each feature
79
+ importances = []
80
+ for i in range(shap_values.values.shape[1]):
81
+ importances.append(np.mean(np.abs(shap_values.values[:, i])))
82
+ # Calculates the normalized version
83
+ importances_norm = softmax(importances)
84
+ # Organize the importances and columns in a dictionary
85
+ feature_importances = {fea: imp for imp, fea in zip(importances, features)}
86
+ feature_importances_norm = {fea: imp for imp,
87
+ fea in zip(importances_norm, features)}
88
+ # Sorts the dictionary
89
+ feature_importances = {k: v for k, v in sorted(
90
+ feature_importances.items(), key=lambda item: item[1], reverse=True)}
91
+ feature_importances_norm = {k: v for k, v in sorted(
92
+ feature_importances_norm.items(), key=lambda item: item[1], reverse=True)}
93
+ # Prints the feature importances
94
+ selected_topk_feats = []
95
+
96
+ for idx, (k, v) in enumerate(feature_importances.items()):
97
+ # print(f"{k} -> {v:.4f} (softmax = {feature_importances_norm[k]:.4f})")
98
+ if idx <= topk:
99
+ selected_topk_feats.append(k)
100
+
101
+ return selected_topk_feats
102
+
103
+
104
+ class FSelector():
105
+ '''
106
+ Helps to select features based on BorutaPy, RFE, and various statistics
107
+ '''
108
+
109
+ def __init__(self, X, y, num_feats, ordinal_feats, nominal_feats, model, is_target_cat=True, select_n_feats=15):
110
+ '''
111
+ Initializes some parameters
112
+
113
+ Args:
114
+ X (pd.DataFrame): contains features' values
115
+ y (pd.DataFrame): contains target values
116
+ num_feats (list): list of numerical features' names
117
+ ordinal_feats (list): list of ordinal features' names
118
+ nominal_feats (list): list of nominal features' names
119
+ model (model object): can be any type of model like RandomForest, LogisticRegression, etc.
120
+ is_target_cat (bool): indicates whether the target is categorical or not
121
+ select_n_feats (int): specifies the number of features to output
122
+ '''
123
+
124
+ self.X = X
125
+ self.y = y
126
+ self.num_feats = num_feats
127
+ self.ordinal_feats = ordinal_feats
128
+ self.nominal_feats = nominal_feats
129
+ self.model = model
130
+ self.is_target_cat = is_target_cat
131
+ self.select_n_feats = select_n_feats
132
+
133
+ def calculate_vif(self, X):
134
+
135
+ vif = pd.DataFrame()
136
+ vif["features"] = X.columns
137
+ vif["VIF"] = [variance_inflation_factor(
138
+ X.values, i) for i in range(X.shape[1])]
139
+
140
+ return vif
141
+
142
+ def select_feats_via_vif(self):
143
+
144
+ num_features = self.num_feats.copy()
145
+
146
+ vif_df = self.calculate_vif(self.X[num_features])
147
+
148
+ while vif_df[vif_df['VIF'] >= 10].shape[0] != 0:
149
+ vif_df.sort_values('VIF', ascending=False, inplace=True)
150
+ vif_df.reset_index(drop=True, inplace=True)
151
+ # print(vif_df)
152
+ elimination_candidate = vif_df.iloc[0]['features']
153
+ # print(elimination_candidate)
154
+ num_features = [i for i in num_features if i !=
155
+ elimination_candidate]
156
+ new_X = self.X[num_features]
157
+ vif_df = self.calculate_vif(new_X)
158
+
159
+ return list(vif_df['features'].values)
160
+
161
+ def get_spearmanr(self, X, y):
162
+ # return np.array([stats.spearmanr(X.values[:, f], y.values).correlation for f in range(X.shape[1])])
163
+ spearman_values = [stats.spearmanr(
164
+ X.values[:, f], y.values).correlation for f in range(X.shape[1])]
165
+ temp_sp_df = pd.DataFrame(
166
+ {'spearman': spearman_values, 'feats': list(X.columns)})
167
+ temp_sp_df['abs_spearman'] = np.abs(temp_sp_df['spearman'])
168
+ temp_sp_df.sort_values('abs_spearman', ascending=False, inplace=True)
169
+ temp_sp_df.reset_index(drop=True, inplace=True)
170
+ return temp_sp_df.iloc[:15]['feats'].to_list()
171
+
172
+ def get_kendalltau(self, X, y):
173
+ # return [stats.kendalltau(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
174
+ kendall_values = [stats.spearmanr(
175
+ X.values[:, f], y.values).correlation for f in range(X.shape[1])]
176
+ temp_ken_df = pd.DataFrame(
177
+ {'kendall': kendall_values, 'feats': list(X.columns)})
178
+ temp_ken_df['abs_kendall'] = np.abs(temp_ken_df['kendall'])
179
+ temp_ken_df.sort_values('abs_kendall', ascending=False, inplace=True)
180
+ temp_ken_df.reset_index(drop=True, inplace=True)
181
+ return temp_ken_df.iloc[:15]['feats'].to_list()
182
+
183
+ def get_pointbiserialr(self, X, y):
184
+ return [stats.pointbiserialr(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
185
+
186
+ def get_boruta_feats(self):
187
+ feat_selector = BorutaPy(
188
+ self.model, n_estimators='auto', verbose=2, random_state=1)
189
+ feat_selector.fit(np.array(self.X), np.array(self.y))
190
+ boruta_selected_features = list(
191
+ self.X.iloc[:, feat_selector.support_].columns)
192
+ return boruta_selected_features
193
+
194
+ def get_kbest(self, X, feats_list, metric):
195
+ selector = SelectKBest(metric, k=self.select_n_feats)
196
+ selector.fit_transform(X[feats_list], self.y)
197
+ selected_feats_idxs_list = list(selector.get_support(indices=True))
198
+ column_names = [feats_list[i] for i in selected_feats_idxs_list]
199
+ return column_names
200
+
201
+ def get_rfe_feats(self):
202
+ model_rfe = RFE(self.model, n_features_to_select=self.select_n_feats)
203
+ model_rfe.fit(self.X, self.y)
204
+ model_rfe_feats = list(
205
+ self.X.iloc[:, list(model_rfe.support_)].columns)
206
+ return model_rfe_feats
207
+
208
+ # def get_shap_feats(self, feats_list, topk=10):
209
+ # model = self.model
210
+ # X = self.X[feats_list]
211
+ # model.fit(self.X, self.y)
212
+ # explainer = shap.Explainer(model.predict, X, max_evals = int(2 * X.shape[1] + 1), verbose=0)
213
+ # shap_values = explainer(X)
214
+ # selected_shap_features = get_feature_importances_shap_values(
215
+ # shap_values, features=list(X.columns), topk=topk
216
+ # )
217
+ # return selected_shap_features
218
+
219
+ def get_features(self):
220
+
221
+ if self.num_feats is not None:
222
+
223
+ if self.is_target_cat:
224
+
225
+ temp_n_feats = self.select_n_feats
226
+ if len(self.num_feats) < self.select_n_feats:
227
+ self.select_n_feats = 'all'
228
+
229
+ # self.num_kendalltau_feats = self.get_kendalltau(self.X[self.num_feats], self.y)
230
+ self.num_f_feats = self.get_kbest(
231
+ X=self.X, feats_list=self.num_feats, metric=f_classif)
232
+ self.num_mi_feats = self.get_kbest(
233
+ X=self.X, feats_list=self.num_feats, metric=mutual_info_classif)
234
+
235
+ self.select_n_feats = temp_n_feats
236
+
237
+ self.selected_num_feats = []
238
+ # self.selected_num_feats.extend(self.num_kendalltau_feats)
239
+ self.selected_num_feats.extend(self.num_f_feats)
240
+ self.selected_num_feats.extend(self.num_mi_feats)
241
+
242
+ else:
243
+
244
+ self.vif_feats = self.select_feats_via_vif()
245
+
246
+ temp_n_feats = self.select_n_feats
247
+ if len(self.num_feats) < self.select_n_feats:
248
+ self.select_n_feats = 'all'
249
+
250
+ self.pearson_feats = self.get_kbest(
251
+ X=self.X, feats_list=self.num_feats, metric=r_regression, k=self.select_n_feats)
252
+
253
+ self.select_n_feats = temp_n_feats
254
+ # self.num_spearmanr_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=stats.spearmanr, k=self.select_n_feats)
255
+ # self.num_kendalltau_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=stats.kendalltau, k=self.select_n_feats)
256
+ self.num_spearmanr_feats = self.get_spearmanr(
257
+ self.X[self.num_feats], self.y)
258
+ self.num_kendalltau_feats = self.get_kendalltau(
259
+ self.X[self.num_feats], self.y)
260
+ # self.num_spearmanr_feats = SelectKBest(self.get_spearmanr, k=self.select_n_feats).fit_transform(self.X[self.num_feats], self.y)
261
+ # self.num_kendalltau_feats = SelectKBest(self.get_kendalltau, k=self.select_n_feats).fit_transform(self.X[self.num_feats], self.y)
262
+
263
+ self.selected_num_feats = []
264
+ self.selected_num_feats.extend(self.pearson_feats)
265
+ self.selected_num_feats.extend(self.num_spearmanr_feats)
266
+ self.selected_num_feats.extend(self.num_kendalltau_feats)
267
+ # self.selected_num_feats = list(set(self.selected_num_feats))
268
+
269
+ else:
270
+
271
+ self.selected_num_feats = []
272
+
273
+ if self.ordinal_feats is not None:
274
+
275
+ if self.is_target_cat:
276
+
277
+ temp_n_feats = self.select_n_feats
278
+ if len(self.ordinal_feats) < self.select_n_feats:
279
+ self.select_n_feats = 'all'
280
+
281
+ self.ordinal_mi_feats = self.get_kbest(
282
+ X=self.X, feats_list=self.ordinal_feats, metric=mutual_info_classif)
283
+ self.ordinal_chi2_feats = self.get_kbest(
284
+ X=self.X, feats_list=self.ordinal_feats, metric=chi2)
285
+
286
+ self.selected_ordinal_feats = []
287
+ self.selected_ordinal_feats.extend(self.ordinal_mi_feats)
288
+ self.selected_ordinal_feats.extend(self.ordinal_chi2_feats)
289
+
290
+ self.select_n_feats = temp_n_feats
291
+
292
+ else:
293
+
294
+ self.ordinal_spearmanr_feats = self.get_spearmanr(
295
+ self.X[self.ordinal_feats], self.y)
296
+ self.ordinal_kendalltau_feats = self.get_kendalltau(
297
+ self.X[self.ordinal_feats], self.y)
298
+
299
+ # self.ordinal_spearmanr_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=stats.spearmanr, k=self.select_n_feats)
300
+ # self.ordinal_kendalltau_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=stats.kendalltau, k=self.select_n_feats)
301
+
302
+ # self.ordinal_spearmanr_feats = SelectKBest(self.get_spearmanr, k=self.select_n_feats).fit_transform(self.X[self.ordinal_feats], self.y)
303
+ # self.ordinal_kendalltau_feats = SelectKBest(self.get_kendalltau, k=self.select_n_feats).fit_transform(self.X[self.ordinal_feats], self.y)
304
+
305
+ self.selected_ordinal_feats = []
306
+ self.selected_ordinal_feats.extend(
307
+ self.ordinal_spearmanr_feats)
308
+ self.selected_ordinal_feats.extend(
309
+ self.ordinal_kendalltau_feats)
310
+ # self.selected_ordinal_feats = list(set(self.selected_ordinal_feats))
311
+
312
+ else:
313
+ self.selected_ordinal_feats = []
314
+
315
+ if self.nominal_feats is not None:
316
+
317
+ if self.is_target_cat:
318
+
319
+ temp_n_feats = self.select_n_feats
320
+ if len(self.nominal_feats) < self.select_n_feats:
321
+ self.select_n_feats = 'all'
322
+
323
+ self.nominal_mi_feats = self.get_kbest(
324
+ X=self.X, feats_list=self.nominal_feats, metric=mutual_info_classif)
325
+ self.nominal_chi2_feats = self.get_kbest(
326
+ X=self.X, feats_list=self.nominal_feats, metric=chi2)
327
+
328
+ self.selected_nominal_feats = []
329
+ self.selected_nominal_feats.extend(self.nominal_mi_feats)
330
+ self.selected_nominal_feats.extend(self.nominal_chi2_feats)
331
+
332
+ self.select_n_feats = temp_n_feats
333
+
334
+ else:
335
+
336
+ temp_n_feats = self.select_n_feats
337
+ if len(self.nominal_feats) < self.select_n_feats:
338
+ self.select_n_feats = 'all'
339
+
340
+ self.f_feats = self.get_kbest(
341
+ X=self.X, feats_list=self.nominal_feats, metric=f_classif, k=self.select_n_feats)
342
+ self.mi_feats = self.get_kbest(
343
+ X=self.X, feats_list=self.nominal_feats, metric=mutual_info_regression, k=self.select_n_feats)
344
+
345
+ self.select_n_feats = temp_n_feats
346
+
347
+ # # self.f_feats = f_classif(self.X[self.nominal_feats], self.y)[0]
348
+ # self.f_feats = SelectKBest(f_classif, k=self.select_n_feats).fit_transform(self.X[self.nominal_feats], self.y).columns
349
+
350
+ # # self.mi_feats = mutual_info_regression(self.X[self.nominal_feats], self.y)
351
+ # self.mi_feats = SelectKBest(mutual_info_regression, k=self.select_n_feats).fit_transform(self.X[self.nominal_feats], self.y).columns
352
+
353
+ self.selected_nominal_feats = []
354
+ self.selected_nominal_feats.extend(self.f_feats)
355
+ self.selected_nominal_feats.extend(self.mi_feats)
356
+ # self.selected_nominal_feats = list(set(self.selected_nominal_feats))
357
+
358
+ else:
359
+
360
+ self.selected_nominal_feats = []
361
+
362
+ if self.model is not None:
363
+ # np.int = np.int32
364
+ # np.float = np.float64
365
+ # np.bool = np.bool_
366
+ if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
367
+ self.boruta_feats = self.get_boruta_feats()
368
+ if not isinstance(self.model, SVC):
369
+ self.rfe_feats = self.get_rfe_feats()
370
+ else:
371
+ self.boruta_feats = []
372
+ self.rfe_feats = []
373
+
374
+ if len(self.selected_num_feats) != 0:
375
+ if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
376
+ self.selected_num_feats.extend(self.boruta_feats)
377
+ if not isinstance(self.model, SVC):
378
+ self.selected_num_feats.extend(self.rfe_feats)
379
+ num_feats_dict = dict(Counter(self.selected_num_feats))
380
+ self.selected_num_feats = [
381
+ i for i in num_feats_dict if num_feats_dict[i] >= 2]
382
+
383
+ if len(self.selected_ordinal_feats) != 0:
384
+ if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
385
+ self.selected_ordinal_feats.extend(self.boruta_feats)
386
+ if not isinstance(self.model, SVC):
387
+ self.selected_ordinal_feats.extend(self.rfe_feats)
388
+ ordinal_feats_dict = dict(Counter(self.selected_ordinal_feats))
389
+ self.selected_ordinal_feats = [
390
+ i for i in ordinal_feats_dict if ordinal_feats_dict[i] >= 2]
391
+
392
+ if len(self.selected_nominal_feats) != 0:
393
+ if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
394
+ self.selected_nominal_feats.extend(self.boruta_feats)
395
+ if not isinstance(self.model, SVC):
396
+ self.selected_nominal_feats.extend(self.rfe_feats)
397
+ nominal_feats_dict = dict(Counter(self.selected_nominal_feats))
398
+ self.selected_nominal_feats = [
399
+ i for i in nominal_feats_dict if nominal_feats_dict[i] >= 2]
400
+
401
+ self.selected_feats = []
402
+ self.selected_feats.extend(self.selected_num_feats)
403
+ self.selected_feats.extend(self.selected_ordinal_feats)
404
+ self.selected_feats.extend(self.selected_nominal_feats)
405
+ if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
406
+ self.selected_feats.extend(self.boruta_feats)
407
+ self.selected_feats = list(set(self.selected_feats))
408
+
409
+ # self.selected_feats = self.get_shap_feats(self.selected_feats)
410
+
411
+ return self.selected_feats
412
+
413
+
414
+ def create_feature_selection_dict(data, cv_fold_list, numerical_features, nominal_features):
415
+ '''
416
+ Returns feature selection dictionary for 4 different models
417
+
418
+ Args:
419
+ data (pd.DataFrame): train data
420
+ cv_fold_list (list): contains tuples of indeces of train and validation data for each fold
421
+ numerical_features (list): contains the names of numerical features
422
+         nominal_features (list): contains the names of nominal features
+ 
+     Returns:
+         dict: contains selected features, train and validation scores, and the models and scalers used
+     '''
+ 
+     selected_features_dict = {}
+ 
+     # NOTE: only the first CV fold is used for feature selection
+     for idx in tqdm(range(1)):
+ 
+         X_train = data.iloc[cv_fold_list[idx][0]].reset_index(drop=True)
+         y_train = data.iloc[cv_fold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)
+ 
+         X_valid = data.iloc[cv_fold_list[idx][1]].reset_index(drop=True)
+         y_valid = data.iloc[cv_fold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)
+ 
+         # interaction features: each numerical feature multiplied by the
+         # binary ' Liability-Assets Flag' column
+         new_numerical_features = []
+         for i, feat in enumerate(numerical_features):
+             X_train[f"feat{i}"] = X_train[feat] * X_train[' Liability-Assets Flag']
+             X_valid[f"feat{i}"] = X_valid[feat] * X_valid[' Liability-Assets Flag']
+             new_numerical_features.append(f"feat{i}")
+ 
+         numerical_features.extend(new_numerical_features)
+ 
+         # getting categorical features
+         categorical_features = nominal_features.copy()
+ 
+         # getting all features
+         all_features = []
+         all_features.extend(categorical_features)
+         all_features.extend(numerical_features)
+ 
+         X_train = X_train[all_features]
+         X_valid = X_valid[all_features]
+ 
+         models_list = [RandomForestClassifier(), XGBClassifier(),
+                        LogisticRegression(), SVC(probability=True)]
+         model_names_list = ['RandomForestClassifier', 'XGBClassifier',
+                             'LogisticRegression', 'SVC']
+ 
+         for model_idx in tqdm(range(len(model_names_list))):
+ 
+             model_name = model_names_list[model_idx]
+ 
+             selected_features_dict[model_name] = {}
+ 
+             # feature selection
+             model = models_list[model_idx]
+ 
+             # LogisticRegression and SVC need scaled numerical inputs
+             if isinstance(model, (LogisticRegression, SVC)):
+ 
+                 scaler = StandardScaler()
+ 
+                 X_train2 = scaler.fit_transform(X_train[numerical_features])
+                 X_train2 = pd.DataFrame(X_train2, columns=numerical_features)
+                 X_train2 = pd.concat([X_train2, X_train[categorical_features]], axis=1)
+ 
+                 fselector = FSelector(
+                     X=X_train2,
+                     y=y_train,
+                     num_feats=numerical_features,
+                     ordinal_feats=None,
+                     nominal_feats=nominal_features,
+                     model=model
+                 )
+ 
+             else:
+ 
+                 fselector = FSelector(
+                     X=X_train,
+                     y=y_train,
+                     num_feats=numerical_features,
+                     ordinal_feats=None,
+                     nominal_feats=nominal_features,
+                     model=model
+                 )
+ 
+             selected_features = fselector.get_features()
+ 
+             if len(selected_features) == 0:
+                 continue
+ 
+             # selecting features using shap values
+             if isinstance(model, (LogisticRegression, SVC)):
+ 
+                 X_valid2 = scaler.transform(X_valid[numerical_features])
+                 X_valid2 = pd.DataFrame(X_valid2, columns=numerical_features)
+                 X_valid2 = pd.concat([X_valid2, X_valid[categorical_features]], axis=1)
+ 
+                 X_train_filtered = X_train2[selected_features]
+                 X_valid_filtered = X_valid2[selected_features]
+ 
+             else:
+ 
+                 X_train_filtered = X_train[selected_features]
+                 X_valid_filtered = X_valid[selected_features]
+ 
+             # model training using selected features
+             model.fit(X_train_filtered, y_train)
+ 
+             explainer = shap.Explainer(
+                 model.predict,
+                 X_train_filtered,
+                 # max_evals=int(2 * X_train_filtered.shape[1] + 1),
+                 # verbose=0
+             )
+             shap_values = explainer(X_train_filtered)
+             selected_shap_features = get_shap_features(
+                 shap_values,
+                 features=list(X_train_filtered.columns),
+                 topk=10
+             )
+ 
+             # model retraining using only the top SHAP features
+             model = models_list[model_idx]
+             model.fit(X_train_filtered[selected_shap_features], y_train)
+ 
+             # metric calculation
+             y_train_pred = model.predict(X_train_filtered[selected_shap_features])
+             y_train_pred_prob = model.predict_proba(X_train_filtered[selected_shap_features])[:, 1]
+ 
+             y_valid_pred = model.predict(X_valid_filtered[selected_shap_features])
+             y_valid_pred_prob = model.predict_proba(X_valid_filtered[selected_shap_features])[:, 1]
+ 
+             train_acc = accuracy_score(y_train, y_train_pred)
+             train_f1 = f1_score(y_train, y_train_pred)
+             train_roc_auc = roc_auc_score(y_train, y_train_pred_prob)
+ 
+             valid_acc = accuracy_score(y_valid, y_valid_pred)
+             valid_f1 = f1_score(y_valid, y_valid_pred)
+             valid_roc_auc = roc_auc_score(y_valid, y_valid_pred_prob)
+ 
+             selected_features_dict[model_name][idx+1] = {
+                 'selected_feats': selected_features,
+                 'selected_shap_feats': selected_shap_features,
+                 'train_acc': train_acc,
+                 'train_f1': train_f1,
+                 'train_roc_auc': train_roc_auc,
+                 'valid_acc': valid_acc,
+                 'valid_f1': valid_f1,
+                 'valid_roc_auc': valid_roc_auc,
+                 'model': model
+             }
+             if isinstance(model, (LogisticRegression, SVC)):
+                 selected_features_dict[model_name][idx+1]['scaler'] = scaler
+ 
+             logging.info(f"##### {model_name} #####")
+             logging.info(f"Selected features: {selected_features}")
+             logging.info('Train:')
+             logging.info(
+                 f"Accuracy: {train_acc:.5f}, F1: {train_f1:.5f}, ROC-AUC: {train_roc_auc:.5f}")
+             logging.info('Validation:')
+             logging.info(
+                 f"Accuracy: {valid_acc:.5f}, F1: {valid_f1:.5f}, ROC-AUC: {valid_roc_auc:.5f}")
+ 
+         del X_train, y_train, X_valid, y_valid, X_train_filtered, X_valid_filtered, model
+         gc.collect()
+ 
+     return selected_features_dict
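+ 
+ # Illustrative access sketch (variable name is hypothetical): the returned
+ # dictionary is keyed by model name, then by fold number (1-based), e.g.
+ #
+ #     svc_entry = selected_features_dict['SVC'][1]
+ #     svc_feats = svc_entry['selected_shap_feats']   # top-10 SHAP features
+ #     svc_model = svc_entry['model']                 # fitted estimator
+ #     svc_scaler = svc_entry.get('scaler')           # present only for LR/SVC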
+ 
+ 
+ def get_mean_ensemble_prediction(prob_list):
+     prob_array = np.vstack(prob_list).T
+     return np.mean(prob_array, axis=1)
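+ 
+ # Minimal usage sketch with illustrative values (not part of the pipeline):
+ #
+ #     p_rf = np.array([0.2, 0.9])
+ #     p_lr = np.array([0.4, 0.7])
+ #     get_mean_ensemble_prediction([p_rf, p_lr])  # -> array([0.3, 0.8])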
+ 
+ 
+ class OptimizeAUC:
+     def __init__(self):
+         self.coef_ = 0
+ 
+     def _auc(self, coef, X, y):
+         # negative ROC-AUC of the weighted sum, so that minimizing this
+         # loss maximizes AUC
+         X_coef = X * coef
+         preds = np.sum(X_coef, axis=1)
+         auc_score = roc_auc_score(y, preds)
+         return -1 * auc_score
+ 
+     def fit(self, X, y):
+         loss_partial = partial(self._auc, X=X, y=y)
+         # random Dirichlet draw: non-negative starting weights that sum to 1
+         initial_coef = np.random.dirichlet(np.ones(X.shape[1]), size=1)
+         self.coef_ = fmin(loss_partial, initial_coef, disp=True)
+ 
+     def predict(self, X):
+         X_coef = X * self.coef_
+         preds = np.sum(X_coef, axis=1)
+         return preds
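+ 
+ # Minimal usage sketch, assuming `partial` (functools) and `fmin`
+ # (scipy.optimize) are imported at the top of this module; the probability
+ # columns below are illustrative stand-ins:
+ #
+ #     valid_probs = np.column_stack([rf_p, xgb_p, lr_p, svc_p])  # shape (n, 4)
+ #     opt = OptimizeAUC()
+ #     opt.fit(valid_probs, y_valid)      # learns one weight per model column
+ #     blended = opt.predict(test_probs)  # weighted sums, not calibrated probabilities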
+ 
+ 
+ def get_optimized_ensemble(train_df, test_df, cv_fold_list, selected_features_dict, trained_models_dict, numerical_features):
+     '''
+     Finds the optimized ensembling weights using the train data and evaluates them on test data
+ 
+     Args:
+         train_df (pd.DataFrame): train data
+         test_df (pd.DataFrame): test data
+         cv_fold_list (list): contains tuples of indices of train and validation data for each fold
+         selected_features_dict (dict): selected features dictionary where keys are models' names
+         trained_models_dict (dict): trained models dictionary where keys are models' names
+         numerical_features (list): contains the names of numerical features
+ 
+     Returns:
+         dict: contains all optimized weights for each fold
+         float: ROC-AUC score on the test data
+     '''
+ 
+     opt_dict = {}
+ 
+     test_preds_list = []
+ 
+     X_test_rf = test_df[selected_features_dict['RandomForestClassifier'][1]['selected_shap_feats']]
+     X_test_xgb = test_df[selected_features_dict['XGBClassifier'][1]['selected_shap_feats']]
+     X_test_lr = test_df[selected_features_dict['LogisticRegression'][1]['selected_shap_feats']]
+     X_test_svc = test_df[selected_features_dict['SVC'][1]['selected_shap_feats']]
+ 
+     y_test = test_df['Bankrupt?'].to_frame()
+ 
+     for idx in range(len(cv_fold_list)):
+ 
+         logging.info(f'Starting calculations for Fold {idx+1}')
+ 
+         X_train = train_df.iloc[cv_fold_list[idx][0]].reset_index(drop=True)
+         y_train = train_df.iloc[cv_fold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)
+ 
+         X_valid = train_df.iloc[cv_fold_list[idx][1]].reset_index(drop=True)
+         y_valid = train_df.iloc[cv_fold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)
+ 
+         # RandomForest
+         logging.info('Starting RandomForest calculations')
+         rf_selected_features = selected_features_dict['RandomForestClassifier'][1]['selected_shap_feats']
+         X_train_rf = X_train[rf_selected_features]
+         X_valid_rf = X_valid[rf_selected_features]
+ 
+         rf_gscv = trained_models_dict['RandomForestClassifier']
+ 
+         # refit a fresh model with the tuned hyperparameters on this fold
+         rfm = RandomForestClassifier(**rf_gscv.best_params_)
+         rfm.fit(X_train_rf, y_train)
+         rfm_valid_probs = rfm.predict_proba(X_valid_rf)[:, 1]
+         rfm_test_probs = rfm.predict_proba(X_test_rf)[:, 1]
+         logging.info('RandomForest calculations completed')
+ 
+         # XGBoost
+         logging.info('Starting XGBoost calculations')
+         xgb_selected_features = selected_features_dict['XGBClassifier'][1]['selected_shap_feats']
+         X_train_xgb = X_train[xgb_selected_features]
+         X_valid_xgb = X_valid[xgb_selected_features]
+ 
+         xgb_gscv = trained_models_dict['XGBClassifier']
+ 
+         xgbm = XGBClassifier(**xgb_gscv.best_params_)
+         xgbm.fit(X_train_xgb, y_train)
+         xgbm_valid_probs = xgbm.predict_proba(X_valid_xgb)[:, 1]
+         xgbm_test_probs = xgbm.predict_proba(X_test_xgb)[:, 1]
+         logging.info('XGBoost calculations completed')
+ 
+         # LogisticRegression
+         logging.info('Starting LogisticRegression calculations')
+         lr_selected_features = selected_features_dict['LogisticRegression'][1]['selected_shap_feats']
+         X_train_lr = X_train[lr_selected_features]
+         X_valid_lr = X_valid[lr_selected_features]
+ 
+         lr_gscv = trained_models_dict['LogisticRegression']
+ 
+         # strip the 'model__' pipeline prefix from the tuned parameter names
+         lr_params = {k.replace('model__', ''): v for k, v in lr_gscv.best_params_.items()}
+         selected_shap_features = selected_features_dict['LogisticRegression'][1]['selected_shap_feats']
+         num_feat = [col for col in selected_shap_features if col in numerical_features]
+         num_trans = Pipeline([('scale', StandardScaler())])
+         preprocessor = ColumnTransformer(
+             transformers=[('num', num_trans, num_feat)], remainder='passthrough')
+         lrm = Pipeline(
+             [
+                 ('preproc', preprocessor),
+                 ('lr', LogisticRegression(**lr_params))
+             ]
+         )
+         lrm.fit(X_train_lr, y_train)
+         lrm_valid_probs = lrm.predict_proba(X_valid_lr)[:, 1]
+         lrm_test_probs = lrm.predict_proba(X_test_lr)[:, 1]
+         logging.info('LogisticRegression calculations completed')
+ 
+         # SVC
+         logging.info('Starting SVC calculations')
+         svc_selected_features = selected_features_dict['SVC'][1]['selected_shap_feats']
+         X_train_svc = X_train[svc_selected_features]
+         X_valid_svc = X_valid[svc_selected_features]
+ 
+         svc_gscv = trained_models_dict['SVC']
+ 
+         svc_params = {k.replace('model__', ''): v for k, v in svc_gscv.best_params_.items()}
+         selected_shap_features = selected_features_dict['SVC'][1]['selected_shap_feats']
+         num_feat = [col for col in selected_shap_features if col in numerical_features]
+         num_trans = Pipeline([('scale', StandardScaler())])
+         preprocessor = ColumnTransformer(
+             transformers=[('num', num_trans, num_feat)], remainder='passthrough')
+         svcm = Pipeline(
+             [
+                 ('preproc', preprocessor),
+                 ('svc', SVC(probability=True, **svc_params))
+             ]
+         )
+         svcm.fit(X_train_svc, y_train)
+         svcm_valid_probs = svcm.predict_proba(X_valid_svc)[:, 1]
+         svcm_test_probs = svcm.predict_proba(X_test_svc)[:, 1]
+         logging.info('SVC calculations completed')
+ 
+         logging.info('Optimizing Ensemble weights')
+         valid_preds = np.column_stack([
+             rfm_valid_probs,
+             xgbm_valid_probs,
+             lrm_valid_probs,
+             svcm_valid_probs
+         ])
+ 
+         opt = OptimizeAUC()
+         opt.fit(valid_preds, y_valid)
+         opt_dict[idx] = {}
+         opt_dict[idx]['opt'] = opt
+         opt_dict[idx]['rfm'] = rfm
+         opt_dict[idx]['xgbm'] = xgbm
+         opt_dict[idx]['lrm'] = lrm
+         opt_dict[idx]['svcm'] = svcm
+         logging.info('Optimization finished')
+ 
+         logging.info('Calculating predictions for test set')
+         test_preds = np.column_stack([
+             rfm_test_probs,
+             xgbm_test_probs,
+             lrm_test_probs,
+             svcm_test_probs
+         ])
+ 
+         test_preds_list.append(opt.predict(test_preds))
+         logging.info('Test set predictions calculated')
+ 
+     logging.info('Getting the score for test set')
+     # average the per-fold weighted predictions before scoring
+     opt_y_test_pred_prob = np.mean(np.column_stack(test_preds_list), axis=1)
+     opt_test_roc_auc = roc_auc_score(y_test, opt_y_test_pred_prob)
+     logging.info('Test score calculated')
+ 
+     return (opt_dict, opt_test_roc_auc)
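+ 
+ # Illustrative sketch (hypothetical inputs) of consuming the returned opt_dict;
+ # each fold entry holds the four fitted models plus the AUC-optimized weights:
+ #
+ #     opt_dict, test_auc = get_optimized_ensemble(
+ #         train_df, test_df, cv_fold_list,
+ #         selected_features_dict, trained_models_dict, numerical_features)
+ #     fold = opt_dict[0]
+ #     probs = np.column_stack([
+ #         fold['rfm'].predict_proba(X_new_rf)[:, 1],
+ #         fold['xgbm'].predict_proba(X_new_xgb)[:, 1],
+ #         fold['lrm'].predict_proba(X_new_lr)[:, 1],
+ #         fold['svcm'].predict_proba(X_new_svc)[:, 1],
+ #     ])  # X_new_* are hypothetical frames with each model's selected features
+ #     scores = fold['opt'].predict(probs)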
+ 
+ 
+ def find_optimal_model(train_df, test_df, features_dict_path, cv_fold_list, numerical_features):
+     '''
+     Finds the best model for the train data and evaluates the candidates on test data
+ 
+     Args:
+         train_df (pd.DataFrame): train data
+         test_df (pd.DataFrame): test data
+         features_dict_path (str): path to selected features dictionary
+         cv_fold_list (list): contains tuples of indices of train and validation data for each fold
+         numerical_features (list): contains the names of numerical features
+ 
+     Returns:
+         dict: contains all trained models and the name of the best model
+         dict: contains all optimized weights of ensembling for each fold
+     '''
+     logging.info('Loading selected features dictionary')
+     selected_features_dict = load_object(file_path=features_dict_path)
+     logging.info('Selected features dictionary loaded')
+ 
+     models_list = [RandomForestClassifier(), XGBClassifier(),
+                    LogisticRegression(), SVC(probability=True)]
+     model_names_list = ['RandomForestClassifier',
+                         'XGBClassifier', 'LogisticRegression', 'SVC']
+     model_params_list = [
+         {
+             'n_estimators': [5, 10, 15, 25, 50, 100, 120, 300, 500],
+             'max_depth': [2, 3, 5, 8, 15, 25, 30, None]
+         },
+         {
+             'eta': [0.01, 0.015, 0.025, 0.05, 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9],
+             'max_depth': [3, 5, 6, 7, 9, 12, 15, 17, 25],
+             'n_estimators': [50, 100, 150, 200, 500, 1000]
+         },
+         {'model__penalty': ['l1', 'l2'],
+          'model__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
+         {'model__C': [1, 10, 100, 1000],
+          'model__gamma': [1, 0.1, 0.001, 0.0001],
+          'model__kernel': ['linear', 'rbf']}
+     ]
+ 
+     trained_models_dict = {}
+ 
+     best_score = 0
+     best_model_name = None
+ 
+     y_train = train_df['Bankrupt?'].to_frame()
+     y_test = test_df['Bankrupt?'].to_frame()
+ 
+     y_train_pred_prob_list = []  # kept for potential validation-score ensembling (currently unused)
+     y_test_pred_prob_list = []
+     rank_ensemble_list = []
+ 
+     for model_idx in tqdm(range(len(model_names_list))):
+ 
+         model_name = model_names_list[model_idx]
+ 
+         selected_shap_features = selected_features_dict[model_name][1]['selected_shap_feats']
+ 
+         X_train = train_df[selected_shap_features]
+         X_test = test_df[selected_shap_features]
+ 
+         logging.info(f'Starting {model_name} training')
+         params_dict = model_params_list[model_idx]
+ 
+         model = models_list[model_idx]
+ 
+         if isinstance(model, (LogisticRegression, SVC)):
+             # scale numerical columns inside the pipeline so that GridSearchCV
+             # fits the scaler on each fold's training split only
+             num_feat = [col for col in selected_shap_features if col in numerical_features]
+             num_trans = Pipeline([('scale', StandardScaler())])
+             preprocessor = ColumnTransformer(
+                 transformers=[('num', num_trans, num_feat)], remainder='passthrough')
+             pipe = Pipeline(
+                 [
+                     ('preproc', preprocessor),
+                     ('model', model)
+                 ]
+             )
+ 
+             model_gscv = GridSearchCV(
+                 pipe,
+                 param_grid=params_dict,
+                 scoring='roc_auc',
+                 cv=cv_fold_list,
+                 n_jobs=-1,
+                 verbose=4
+             )
+         else:
+             model_gscv = GridSearchCV(
+                 model,
+                 param_grid=params_dict,
+                 scoring='roc_auc',
+                 cv=cv_fold_list,
+                 n_jobs=-1,
+                 verbose=4
+             )
+ 
+         model_gscv.fit(X_train, y_train)
+         logging.info(f'{model_name} training finished')
+ 
+         trained_models_dict[model_name] = model_gscv
+ 
+         rank_ensemble_list.append((model_name, model_gscv.best_score_))
+ 
+         logging.info('Getting ROC-AUC for test set')
+         y_test_pred_prob = model_gscv.predict_proba(X_test)[:, 1]
+         y_test_pred_prob_list.append(y_test_pred_prob)
+         test_roc_auc = roc_auc_score(y_test, y_test_pred_prob)
+         logging.info(
+             f'{model_name}: Validation score = {model_gscv.best_score_:.4f}, Test score = {test_roc_auc:.4f}')
+ 
+         # track the best test-set score seen so far
+         if test_roc_auc > best_score:
+             best_score = test_roc_auc
+             best_model_name = model_name
+ 
+     logging.info('Getting Average Ensemble score')
+     avg_ens_y_test_pred_prob = get_mean_ensemble_prediction(y_test_pred_prob_list)
+     avg_ens_test_roc_auc = roc_auc_score(y_test, avg_ens_y_test_pred_prob)
+     logging.info(f'Average Ensemble: Test score = {avg_ens_test_roc_auc:.4f}')
+ 
+     if avg_ens_test_roc_auc > best_score:
+         best_score = avg_ens_test_roc_auc
+         best_model_name = 'Average Ensemble'
+ 
+     logging.info('Getting Rank Ensemble score')
+     # sort ascending by validation score: the weakest model gets rank 1,
+     # the strongest gets rank n
+     rank_ensemble_list = sorted(rank_ensemble_list, key=lambda x: x[1])
+ 
+     rank_ens_y_test_pred_prob = 0
+     for i in range(len(rank_ensemble_list)):
+         rank_ens_y_test_pred_prob += (
+             i+1) * y_test_pred_prob_list[model_names_list.index(rank_ensemble_list[i][0])]
+     # normalize by the sum of ranks, 1 + 2 + ... + n = n * (n + 1) / 2
+     rank_ens_y_test_pred_prob /= len(rank_ensemble_list) * \
+         (1 + len(rank_ensemble_list)) / 2
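+     # Worked example (illustrative): with 4 models the divisor is
+     # 4 * (1 + 4) / 2 = 10, so the rank weights are 0.1, 0.2, 0.3 and 0.4
+     # from the worst to the best validation score.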
+     rank_ens_test_roc_auc = roc_auc_score(y_test, rank_ens_y_test_pred_prob)
+ 
+     logging.info(f'Rank Ensemble: Test score = {rank_ens_test_roc_auc:.4f}')
+ 
+     if rank_ens_test_roc_auc > best_score:
+         best_score = rank_ens_test_roc_auc
+         best_model_name = 'Rank Ensemble'
+ 
+     logging.info('Getting Optimized Ensemble score')
+     opt_dict, opt_test_roc_auc = get_optimized_ensemble(
+         train_df,
+         test_df,
+         cv_fold_list,
+         selected_features_dict,
+         trained_models_dict,
+         numerical_features
+     )
+ 
+     logging.info(f'Optimized Ensemble: Test score = {opt_test_roc_auc:.4f}')
+ 
+     if opt_test_roc_auc > best_score:
+         best_score = opt_test_roc_auc
+         best_model_name = 'Optimized Ensemble'
+ 
+     trained_models_dict['best_model_name'] = best_model_name
+ 
+     logging.info(f'{best_model_name} is the best model')
+ 
+     return (trained_models_dict, opt_dict)
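+ 
+ # Illustrative usage sketch (the .pkl path is hypothetical):
+ #
+ #     trained_models_dict, opt_dict = find_optimal_model(
+ #         train_df, test_df,
+ #         features_dict_path='artifacts/selected_features.pkl',
+ #         cv_fold_list=cv_fold_list,
+ #         numerical_features=numerical_features)
+ #     print(trained_models_dict['best_model_name'])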
requirements.txt ADDED
@@ -0,0 +1,160 @@
+ alembic==1.13.1
+ altair==5.3.0
+ aniso8601==9.0.1
+ annotated-types==0.6.0
+ anyio==4.3.0
+ appdirs==1.4.4
+ asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1698341106958/work
+ attrs==23.2.0
+ blinker==1.7.0
+ Boruta==0.3
+ BorutaShap==1.0.17
+ cachetools==5.3.3
+ certifi==2024.2.2
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpickle==3.0.0
+ colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1666700638685/work
+ comm @ file:///home/conda/feedstock_root/build_artifacts/comm_1710320294760/work
+ -e git+https://github.com/VaheC/CompanyBankruptcy.git@0c9aba9c454511775cdf83313b15ca93d56c3356#egg=CompanyBankruptcy
+ contourpy==1.2.1
+ cycler==0.12.1
+ debugpy @ file:///C:/b/abs_c0y1fjipt2/croot/debugpy_1690906864587/work
+ decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1641555617451/work
+ Deprecated==1.2.14
+ distro==1.9.0
+ dnspython==1.16.0
+ docker==7.1.0
+ dynaconf==3.2.5
+ ensure==1.0.2
+ entrypoints==0.4
+ et-xmlfile==1.1.0
+ evidently==0.4.22
+ exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1704921103267/work
+ executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1698579936712/work
+ Faker==25.2.0
+ filelock==3.14.0
+ Flask==3.0.3
+ fonttools==4.51.0
+ from-root==1.3.0
+ fsspec==2024.3.1
+ gitdb==4.0.11
+ GitPython==3.1.43
+ graphene==3.3
+ graphql-core==3.2.3
+ graphql-relay==3.2.0
+ greenlet==3.0.3
+ h11==0.14.0
+ httpcore==1.0.5
+ httptools==0.6.1
+ httpx==0.27.0
+ idna==3.6
+ imbalanced-learn==0.12.2
+ imblearn==0.0
+ importlib-metadata==6.11.0
+ ipykernel @ file:///D:/bld/ipykernel_1708996677248/work
+ ipython @ file:///D:/bld/ipython_1709559926914/work
+ iterative-telemetry==0.0.8
+ itsdangerous==2.2.0
+ jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1696326070614/work
+ Jinja2==3.1.3
+ joblib==1.4.0
+ jsonschema==4.21.1
+ jsonschema-specifications==2023.12.1
+ jupyter_client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1710255804825/work
+ jupyter_core @ file:///D:/bld/jupyter_core_1710257272359/work
+ kiwisolver==1.4.5
+ lightgbm==4.3.0
+ litestar==2.8.3
+ llvmlite==0.42.0
+ Mako==1.3.5
+ Markdown==3.6
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.8.4
+ matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1713250518406/work
+ mdurl==0.1.2
+ mlflow==2.13.0
+ msgspec==0.18.6
+ multidict==6.0.5
+ mypy-extensions==1.0.0
+ nest_asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1705850609492/work
+ nltk==3.8.1
+ numba==0.59.1
+ numpy==1.26.4
+ openpyxl==3.1.2
+ opentelemetry-api==1.24.0
+ opentelemetry-sdk==1.24.0
+ opentelemetry-semantic-conventions==0.45b0
+ packaging==23.2
+ pandas==2.2.1
+ parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1712320355065/work
+ patsy==0.5.6
+ pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work
+ pillow==10.3.0
+ platformdirs @ file:///home/conda/feedstock_root/build_artifacts/platformdirs_1706713388748/work
+ plotly==5.22.0
+ polyfactory==2.16.0
+ prompt-toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1702399386289/work
+ protobuf==4.25.3
+ psutil @ file:///C:/Windows/Temp/abs_b2c2fd7f-9fd5-4756-95ea-8aed74d0039flsd9qufz/croots/recipe/psutil_1656431277748/work
+ pure-eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1642875951954/work
+ pyarrow==15.0.2
+ pydantic==2.7.1
+ pydantic_core==2.18.2
+ pydeck==0.8.1b0
+ Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1700607939962/work
+ pymongo==4.7.2
+ pyparsing==3.1.2
+ python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1709299778482/work
+ python-dotenv==1.0.1
+ pytz==2024.1
+ pywin32==305.1
+ PyYAML==6.0.1
+ pyzmq @ file:///C:/b/abs_89aq69t0up/croot/pyzmq_1705605705281/work
+ querystring-parser==1.2.4
+ referencing==0.34.0
+ regex==2024.5.10
+ requests==2.31.0
+ rich==13.7.1
+ rich-click==1.8.1
+ rpds-py==0.18.0
+ scikit-learn==1.4.2
+ scipy==1.13.0
+ seaborn==0.13.2
+ shap==0.45.0
+ shellingham==1.5.4
+ six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work
+ slicer==0.0.7
+ smmap==5.0.1
+ sniffio==1.3.1
+ SQLAlchemy==2.0.30
+ sqlparse==0.5.0
+ stack-data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1669632077133/work
+ statsmodels==0.14.2
+ streamlit==1.28.0
+ tenacity==8.2.3
+ threadpoolctl==3.5.0
+ toml==0.10.2
+ toolz==0.12.1
+ tornado @ file:///D:/bld/tornado_1656937966227/work
+ tqdm==4.66.2
+ traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1713535121073/work
+ typer==0.12.3
+ typing-inspect==0.9.0
+ typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1712329955671/work
+ tzdata==2024.1
+ tzlocal==5.2
+ ujson==5.10.0
+ urllib3==2.2.1
+ uvicorn==0.29.0
+ validators==0.28.3
+ waitress==3.0.0
+ watchdog==4.0.0
+ watchfiles==0.21.0
+ wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1704731205417/work
+ websockets==12.0
+ Werkzeug==3.0.3
+ wrapt==1.16.0
+ xgboost==2.0.3
+ zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1695255097490/work