Commit: initialized

Files added:

- app.py +277 -0
- app_input_example.xlsx +0 -0
- artifacts/feature_selection_dict.pkl +3 -0
- artifacts/models/opt_dict.pkl +3 -0
- artifacts/models/trained_models_dict.pkl +3 -0
- company_bankruptcy/__init__.py +0 -0
- company_bankruptcy/components/__init__.py +0 -0
- company_bankruptcy/components/data_ingestion.py +61 -0
- company_bankruptcy/components/data_transformation.py +85 -0
- company_bankruptcy/components/model_evaluation.py +164 -0
- company_bankruptcy/components/model_trainer.py +68 -0
- company_bankruptcy/constants/__init__.py +0 -0
- company_bankruptcy/constants/constants.py +5 -0
- company_bankruptcy/data_access/__init__.py +0 -0
- company_bankruptcy/data_access/mongo_db_connection.py +104 -0
- company_bankruptcy/exception/__init__.py +0 -0
- company_bankruptcy/exception/exception.py +20 -0
- company_bankruptcy/logger/__init__.py +0 -0
- company_bankruptcy/logger/logger.py +20 -0
- company_bankruptcy/pipeline/__init__.py +0 -0
- company_bankruptcy/pipeline/prediction_pipeline.py +0 -0
- company_bankruptcy/pipeline/training_pipeline.py +27 -0
- company_bankruptcy/utils/__init__.py +0 -0
- company_bankruptcy/utils/utils.py +974 -0
- requirements.txt +160 -0
app.py
ADDED
@@ -0,0 +1,277 @@
import streamlit as st
import pandas as pd
import numpy as np

import os
import sys

from company_bankruptcy.components.model_trainer import ModelTrainer
from company_bankruptcy.components.data_transformation import DataTransformation
from company_bankruptcy.utils.utils import load_object
from company_bankruptcy.logger.logger import logging
from company_bankruptcy.exception.exception import CustomException

def get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict):

    # the dictionary stores the winning model's name under the 'best_model_name' key
    best_model_name = trained_models_dict['best_model_name']

    if best_model_name == 'Average Ensemble':

        default_prob = 0
        for model_name in trained_models_dict:
            if model_name == 'best_model_name':
                continue
            temp_features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
            temp_prob = trained_models_dict[model_name].predict_proba(input_df[temp_features_list])[:, 1]
            default_prob += temp_prob
        default_prob /= (len(trained_models_dict) - 1)

    elif best_model_name == 'Optimized Ensemble':

        rfm_features_list = feature_selection_dict['RandomForestClassifier'][1]['selected_shap_feats']
        xgbm_features_list = feature_selection_dict['XGBClassifier'][1]['selected_shap_feats']
        lrm_features_list = feature_selection_dict['LogisticRegression'][1]['selected_shap_feats']
        svcm_features_list = feature_selection_dict['SVC'][1]['selected_shap_feats']

        preds_list = []

        for idx in opt_dict:
            opt = opt_dict[idx]['opt']
            rfm = opt_dict[idx]['rfm']
            xgbm = opt_dict[idx]['xgbm']
            lrm = opt_dict[idx]['lrm']
            svcm = opt_dict[idx]['svcm']

            rfm_probs = rfm.predict_proba(input_df[rfm_features_list])[:, 1]
            xgbm_probs = xgbm.predict_proba(input_df[xgbm_features_list])[:, 1]
            lrm_probs = lrm.predict_proba(input_df[lrm_features_list])[:, 1]
            svcm_probs = svcm.predict_proba(input_df[svcm_features_list])[:, 1]

            model_preds = np.column_stack([
                rfm_probs,
                xgbm_probs,
                lrm_probs,
                svcm_probs
            ])

            preds_list.append(opt.predict(model_preds))

        default_prob = np.mean(np.column_stack(preds_list), axis=1)

    elif best_model_name == 'Rank Ensemble':

        rank_ensemble_list = []
        prob_list = []
        model_names_list = []

        for model_name in trained_models_dict:
            if model_name == 'best_model_name':
                continue
            temp_features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
            model_names_list.append(model_name)
            rank_ensemble_list.append((model_name, trained_models_dict[model_name].best_score_))
            prob_list.append(trained_models_dict[model_name].predict_proba(input_df[temp_features_list])[:, 1])

        rank_ensemble_list = sorted(rank_ensemble_list, key=lambda x: x[1])

        default_prob = 0
        for i in range(len(rank_ensemble_list)):
            default_prob += (i + 1) * prob_list[model_names_list.index(rank_ensemble_list[i][0])]
        default_prob /= (len(rank_ensemble_list) * (1 + len(rank_ensemble_list)) / 2)

    else:
        model = trained_models_dict[best_model_name]
        temp_features_list = feature_selection_dict[best_model_name][1]['selected_shap_feats']
        default_prob = model.predict_proba(input_df[temp_features_list])[:, 1]

    return default_prob

st.set_page_config(
    page_title='Default Predictor',
    layout='centered'
)

try:

    st.title('Company Default Predictor')

    logging.info('Initiating dictionaries')
    if 'trained_models_dict' not in st.session_state:
        model_trainer_obj = ModelTrainer()
        trained_models_dict = load_object(
            os.path.join(
                model_trainer_obj.model_trainer_config.trained_models_path,
                'trained_models_dict.pkl'
            )
        )
        opt_dict = load_object(
            os.path.join(
                model_trainer_obj.model_trainer_config.trained_models_path,
                'opt_dict.pkl'
            )
        )

        data_transformation_obj = DataTransformation()
        feature_selection_dict = load_object(
            data_transformation_obj.data_transformation_config.feature_selection_dict_file_path
        )

        example_data = pd.read_excel('app_input_example.xlsx')
        # example_data = pd.read_csv('app_input_example.csv')

        st.session_state['trained_models_dict'] = trained_models_dict
        st.session_state['opt_dict'] = opt_dict
        st.session_state['feature_selection_dict'] = feature_selection_dict
        st.session_state['example_data'] = example_data

    else:

        trained_models_dict = st.session_state['trained_models_dict']
        opt_dict = st.session_state['opt_dict']
        feature_selection_dict = st.session_state['feature_selection_dict']
        example_data = st.session_state['example_data']
    logging.info('Dictionaries initiated')

    logging.info('Checking button clicked')
    if 'clicked' not in st.session_state:
        st.session_state.clicked = False
    logging.info(f'Button check passed with value {st.session_state.clicked}')

    st.subheader('Please fill in the input boxes or upload a CSV/Excel file, then click the Submit button to get the default probability(ies).')

    best_model_name = trained_models_dict['best_model_name']

    logging.info("Getting features' list")
    if best_model_name in ['Average Ensemble', 'Optimized Ensemble', 'Rank Ensemble']:
        features_list = []
        for model_name in feature_selection_dict:
            features_list.extend(
                feature_selection_dict[model_name][1]['selected_shap_feats']
            )
        features_list = list(set(features_list))
    else:
        features_list = feature_selection_dict[best_model_name][1]['selected_shap_feats']
    logging.info("Features' list found")

    upload_container = st.container()
    with upload_container:
        upload_col1, upload_col2 = st.columns([0.6, 0.4])
        uploaded_file = upload_col1.file_uploader(
            'Upload a csv/excel file with data',
            type=["csv", "xlsx"]
        )

        # example_data = pd.read_csv('app_input_example.csv')
        # example_data = pd.read_csv('artifacts/data.csv')
        # example_data = pd.read_excel('app_input_example.xlsx')

        # @st.cache_data
        # def convert_df(df):
        #     return df.to_csv(index=False).encode("utf-8")
        #     # return df.to_excel(index=False).encode("utf-8")

        # csv_data = convert_df(df=example_data[features_list])

        csv_data = example_data[features_list].to_csv(index=False).encode("utf-8")

        upload_col2.write('An example of the data file')
        upload_col2.download_button(
            'Download',
            data=csv_data,
            file_name='input_example.csv',
            mime="text/csv"
        )

    n_cols = 2
    n_rows = int((len(features_list) - len(features_list) % n_cols) / n_cols)
    if len(features_list) % n_cols != 0:
        n_rows += 1

    logging.info('Constructing the app input structure')
    input_dict = {}
    feature_idx = 0
    for i in range(n_rows):

        temp_input_container = st.container()

        with temp_input_container:
            col1, col2 = st.columns(n_cols)
            # fill two inputs per row; the last row holds a single input when the feature count is odd
            if i < n_rows - 1 or len(features_list) % 2 == 0:
                input_dict[features_list[feature_idx]] = [
                    col1.number_input(
                        features_list[feature_idx],
                        format='%.6f' if features_list[feature_idx].split(' ')[-1] != 'Flag' else '%.0f'
                    )
                ]
                input_dict[features_list[feature_idx+1]] = [
                    col2.number_input(
                        features_list[feature_idx+1],
                        format='%.6f' if features_list[feature_idx+1].split(' ')[-1] != 'Flag' else '%.0f'
                    )
                ]
            else:
                input_dict[features_list[feature_idx]] = [
                    col1.number_input(
                        features_list[feature_idx],
                        format='%.6f' if features_list[feature_idx].split(' ')[-1] != 'Flag' else '%.0f'
                    )
                ]

            feature_idx += 2

    logging.info('Input structure constructed')

    def set_button_click():
        st.session_state.clicked = True

    st.button('Submit', on_click=set_button_click)

    if st.session_state.clicked and uploaded_file is None:

        st.session_state.clicked = False

        logging.info(f'Calculating prob for {best_model_name}')

        input_df = pd.DataFrame(input_dict)

        default_prob = get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict)

        st.write(f"Default probability: {default_prob[0]:.4f}")

        logging.info(f'Default prob: {default_prob[0]:.4f}')

    elif st.session_state.clicked and uploaded_file is not None:
        st.session_state.clicked = False
        # bytes_data = uploaded_file.getvalue()
        # stringio = StringIO(bytes_data.decode('utf-8'))
        # string_data = stringio.read()
        logging.info('Loading uploaded data')
        file_extension = uploaded_file.name.split('.')[-1]
        if file_extension == 'csv':
            input_df = pd.read_csv(uploaded_file)
        else:
            input_df = pd.read_excel(uploaded_file)
        logging.info('Uploaded data loaded')

        with st.spinner('Please wait...'):
            logging.info(f'Calculating probabilities for {best_model_name}')
            default_prob = get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict)
            logging.info('Probabilities calculated')

            result_df = pd.DataFrame()
            result_df['default_probability'] = default_prob

            result_data = result_df.to_csv(index=False).encode("utf-8")

        st.success('Done!')

        st.download_button(
            'Download the predicted probabilities',
            data=result_data,
            file_name='default_probabilities.csv',
            mime='text/csv'
        )

except Exception as e:
    logging.info('Error occurred while creating streamlit app')
    raise CustomException(e, sys)
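The app itself is served with `streamlit run app.py`. For quick testing outside Streamlit, a minimal sketch (assuming the training pipeline has already written the pickled artifacts, and with the get_prob function copied from app.py above):

import pandas as pd
from company_bankruptcy.utils.utils import load_object

# load the same artifacts the app keeps in session state
trained_models_dict = load_object('artifacts/models/trained_models_dict.pkl')
opt_dict = load_object('artifacts/models/opt_dict.pkl')
feature_selection_dict = load_object('artifacts/feature_selection_dict.pkl')

# score the bundled example rows
input_df = pd.read_excel('app_input_example.xlsx')
probs = get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict)
print(probs[:5])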
app_input_example.xlsx
ADDED
Binary file (11.1 kB)
artifacts/feature_selection_dict.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ffff597549a2c76e13872a5f2048d4b83da2a0f25eeccbade02a575871d84bf9
size 1930217
artifacts/models/opt_dict.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3f79cee91a25f02eb551b29c882f6afc778d175c4d755497234c1b2f49f3bbde
size 15200636
artifacts/models/trained_models_dict.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9695d244584ea79581682da0530cbdfb3dd02c76598114626a09e5f3bac3b520
size 1983143
company_bankruptcy/__init__.py
ADDED
File without changes

company_bankruptcy/components/__init__.py
ADDED
File without changes
company_bankruptcy/components/data_ingestion.py
ADDED
@@ -0,0 +1,61 @@
import pandas as pd
import numpy as np

from company_bankruptcy.logger.logger import logging
from company_bankruptcy.exception.exception import CustomException
from company_bankruptcy.data_access.mongo_db_connection import MongoOps
from company_bankruptcy.constants.constants import DATABASE_NAME, COLLECTION_NAME, MONGODB_COLLECTION_STR

import os
import sys
from pathlib import Path
from dataclasses import dataclass

from sklearn.model_selection import train_test_split

# note: this overrides the MONGODB_COLLECTION_STR imported above with a hard-coded URI
MONGODB_COLLECTION_STR = "mongodb+srv://vcharchian:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"

@dataclass
class DataIngestionConfig:
    raw_data_path: str = os.path.join('artifacts', 'data.csv')
    train_data_path: str = os.path.join('artifacts', 'train_data.csv')
    test_data_path: str = os.path.join('artifacts', 'test_data.csv')

class DataIngestion:

    def __init__(self):
        self.ingestion_config = DataIngestionConfig()

    def initiate_data_ingestion(self):
        logging.info('Data ingestion started')
        try:
            logging.info('Reading the raw data')
            mongo_instance = MongoOps(
                client_url=MONGODB_COLLECTION_STR
            )
            data = mongo_instance.get_records(coll_name=COLLECTION_NAME, db_name=DATABASE_NAME)
            logging.info('Data loaded')
            os.makedirs(os.path.dirname(self.ingestion_config.raw_data_path), exist_ok=True)
            logging.info('Saving the data')
            data.to_csv(self.ingestion_config.raw_data_path, index=False)
            logging.info('Data saved')
            logging.info('Splitting the data into train and test sets')
            train_df, test_df = train_test_split(
                data,
                test_size=0.1,
                random_state=13,
                stratify=data['Bankrupt?']
            )
            logging.info('Saving train and test sets')
            train_df.to_csv(self.ingestion_config.train_data_path, index=False)
            test_df.to_csv(self.ingestion_config.test_data_path, index=False)
            logging.info('Sets are saved')
            logging.info('Data ingestion completed')
            return (self.ingestion_config.train_data_path, self.ingestion_config.test_data_path)
        except Exception as e:
            logging.info('Error occurred during data ingestion')
            raise CustomException(e, sys)

if __name__ == '__main__':
    data_ingestion_obj = DataIngestion()
    train_path, test_path = data_ingestion_obj.initiate_data_ingestion()
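Since the split is stratified on 'Bankrupt?', the class ratio should be nearly identical across the raw, train, and test sets; a quick sanity-check sketch, assuming the three CSVs written above exist:

import pandas as pd

for name in ('data', 'train_data', 'test_data'):
    df = pd.read_csv(f'artifacts/{name}.csv')
    # the mean of a 0/1 target is the bankruptcy rate
    print(name, round(df['Bankrupt?'].mean(), 4))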
company_bankruptcy/components/data_transformation.py
ADDED
@@ -0,0 +1,85 @@
import pandas as pd
from company_bankruptcy.logger.logger import logging
from company_bankruptcy.exception.exception import CustomException

import os
import sys
from dataclasses import dataclass

from sklearn.model_selection import StratifiedKFold

from company_bankruptcy.utils.utils import save_object, create_feature_selection_dict

@dataclass
class DataTransformationConfig:
    feature_selection_dict_file_path = os.path.join('artifacts', 'feature_selection_dict.pkl')

class DataTransformation:

    def __init__(self):
        self.data_transformation_config = DataTransformationConfig()

    def initiate_data_transformation(self, train_path, test_path, n_cv_folds=10):

        try:
            logging.info('Loading training data')
            train_df = pd.read_csv(train_path)
            logging.info('Training data loaded')

            logging.info('Loading testing data')
            test_df = pd.read_csv(test_path)
            logging.info('Testing data loaded')

            logging.info('Removing Net Income Flag')
            train_df.drop(columns=' Net Income Flag', inplace=True)
            test_df.drop(columns=' Net Income Flag', inplace=True)
            logging.info('Net Income Flag removed')

            logging.info('Specifying nominal and numerical features as lists')
            nominal_features = [' Liability-Assets Flag']
            numerical_features = [col for col in train_df.columns if col not in nominal_features and col != 'Bankrupt?']
            logging.info('Nominal and numerical features specified')

            logging.info(f'Creating {n_cv_folds} CV folds for train data')
            skfold = StratifiedKFold(n_splits=n_cv_folds, random_state=42, shuffle=True)
            skfold_list = []
            for train_idxs, valid_idxs in skfold.split(train_df, y=train_df['Bankrupt?']):
                skfold_list.append((train_idxs, valid_idxs))
            logging.info('CV folds created')

            # logging.info('Creating new columns using categorical and numerical interactions')
            # for feat in numerical_features:
            #     train_df[f"feat{numerical_features.index(feat)}"] = train_df[feat] * train_df[' Liability-Assets Flag']
            #     test_df[f"feat{numerical_features.index(feat)}"] = test_df[feat] * test_df[' Liability-Assets Flag']
            #     numerical_features.append(f"feat{numerical_features.index(feat)}")
            # logging.info('New columns created')

            logging.info('Starting feature selection')
            selected_features_dict = create_feature_selection_dict(
                data=train_df,
                cv_fold_list=skfold_list,
                numerical_features=numerical_features,
                nominal_features=nominal_features
            )
            logging.info('Feature selection completed')

            logging.info('Saving feature selection dictionary as a pkl file')
            save_object(
                file_path=self.data_transformation_config.feature_selection_dict_file_path,
                obj=selected_features_dict
            )
            logging.info('Dictionary saved')

            return (train_df, test_df, skfold_list, numerical_features)

        except Exception as e:
            logging.info('Error occurred during data transformation')
            raise CustomException(e, sys)

if __name__ == '__main__':

    data_transformation_obj = DataTransformation()
    train_df, test_df, cv_fold_list, numerical_features = data_transformation_obj.initiate_data_transformation(
        train_path=os.path.join('artifacts', 'train_data.csv'),
        test_path=os.path.join('artifacts', 'test_data.csv')
    )
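skfold_list holds one (train_idxs, valid_idxs) pair of positional index arrays per fold; a sketch of how a downstream consumer would iterate it, given the train_df and skfold_list produced above:

for fold_idx, (train_idxs, valid_idxs) in enumerate(skfold_list):
    fold_train = train_df.iloc[train_idxs]
    fold_valid = train_df.iloc[valid_idxs]
    # stratification keeps the bankruptcy rate stable across folds
    print(fold_idx, fold_train['Bankrupt?'].mean(), fold_valid['Bankrupt?'].mean())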
company_bankruptcy/components/model_evaluation.py
ADDED
@@ -0,0 +1,164 @@
import pandas as pd
import numpy as np

from company_bankruptcy.logger.logger import logging
from company_bankruptcy.exception.exception import CustomException
from company_bankruptcy.utils.utils import load_object
from company_bankruptcy.components.model_trainer import ModelTrainer
from company_bankruptcy.components.data_transformation import DataTransformation

import os
import sys

import mlflow
import mlflow.sklearn
import mlflow.xgboost

from sklearn.metrics import roc_auc_score

from urllib.parse import urlparse


class ModelEvaluation:

    def __init__(self):

        logging.info('Model evaluation started')

    def initiate_model_evaluation(self, test_df):

        try:

            logging.info('Setting target variable')
            y_test = test_df['Bankrupt?'].to_frame()
            logging.info('Target variable set')

            logging.info('Loading the trained models')
            model_trainer_obj = ModelTrainer()
            models_main_path = model_trainer_obj.model_trainer_config.trained_models_path
            trained_models_dict = load_object(
                os.path.join(models_main_path, 'trained_models_dict.pkl')
            )
            opt_dict = load_object(
                os.path.join(models_main_path, 'opt_dict.pkl')
            )
            logging.info('Trained models loaded')

            logging.info("Loading the features' dictionary")
            data_transformation_obj = DataTransformation()
            features_selection_dict_path = data_transformation_obj.data_transformation_config.feature_selection_dict_file_path
            feature_selection_dict = load_object(features_selection_dict_path)
            logging.info('Feature selection dictionary loaded')

            test_score_dict = {}

            logging.info('Finding test score for Average Ensemble')
            y_test_pred_prob = 0
            for model_name in trained_models_dict:
                if model_name == 'best_model_name':
                    continue
                features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
                temp_prob = trained_models_dict[model_name].predict_proba(test_df[features_list])[:, 1]
                y_test_pred_prob += temp_prob
            y_test_pred_prob /= (len(trained_models_dict) - 1)
            avg_ens_score = roc_auc_score(y_test, y_test_pred_prob)
            test_score_dict['AverageEnsemble'] = avg_ens_score
            logging.info('Average Ensemble score calculated')

            logging.info('Finding test score for Optimized Ensemble')
            rfm_features_list = feature_selection_dict['RandomForestClassifier'][1]['selected_shap_feats']
            xgbm_features_list = feature_selection_dict['XGBClassifier'][1]['selected_shap_feats']
            lrm_features_list = feature_selection_dict['LogisticRegression'][1]['selected_shap_feats']
            svcm_features_list = feature_selection_dict['SVC'][1]['selected_shap_feats']

            preds_list = []

            for idx in opt_dict:
                opt = opt_dict[idx]['opt']
                rfm = opt_dict[idx]['rfm']
                xgbm = opt_dict[idx]['xgbm']
                lrm = opt_dict[idx]['lrm']
                svcm = opt_dict[idx]['svcm']

                rfm_probs = rfm.predict_proba(test_df[rfm_features_list])[:, 1]
                xgbm_probs = xgbm.predict_proba(test_df[xgbm_features_list])[:, 1]
                lrm_probs = lrm.predict_proba(test_df[lrm_features_list])[:, 1]
                svcm_probs = svcm.predict_proba(test_df[svcm_features_list])[:, 1]

                model_preds = np.column_stack([
                    rfm_probs,
                    xgbm_probs,
                    lrm_probs,
                    svcm_probs
                ])

                preds_list.append(opt.predict(model_preds))

            y_test_pred_prob = np.mean(np.column_stack(preds_list), axis=1)
            optimized_ens_score = roc_auc_score(y_test, y_test_pred_prob)
            test_score_dict['OptimizedEnsemble'] = optimized_ens_score
            logging.info('Optimized Ensemble score calculated')

            logging.info('Finding test score for Rank Ensemble')
            rank_ensemble_list = []
            prob_list = []
            model_names_list = []

            for model_name in trained_models_dict:
                if model_name == 'best_model_name':
                    continue
                features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
                model_names_list.append(model_name)
                rank_ensemble_list.append((model_name, trained_models_dict[model_name].best_score_))
                prob_list.append(trained_models_dict[model_name].predict_proba(test_df[features_list])[:, 1])

            rank_ensemble_list = sorted(rank_ensemble_list, key=lambda x: x[1])

            y_test_pred_prob = 0
            for i in range(len(rank_ensemble_list)):
                y_test_pred_prob += (i + 1) * prob_list[model_names_list.index(rank_ensemble_list[i][0])]
            y_test_pred_prob /= (len(rank_ensemble_list) * (1 + len(rank_ensemble_list)) / 2)
            rank_ens_score = roc_auc_score(y_test, y_test_pred_prob)
            test_score_dict['RankEnsemble'] = rank_ens_score
            logging.info('Rank Ensemble score calculated')

            for model_name in trained_models_dict:
                if model_name == 'best_model_name':
                    continue
                logging.info(f'Finding test score for {model_name}')
                features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
                model = trained_models_dict[model_name]
                y_test_pred_prob = model.predict_proba(test_df[features_list])[:, 1]
                temp_score = roc_auc_score(y_test, y_test_pred_prob)
                test_score_dict[model_name] = temp_score
                logging.info(f'{model_name} score calculated')

            logging.info('Getting mlflow tracking URI type')
            tracking_uri_type_store = urlparse(mlflow.get_tracking_uri()).scheme
            logging.info('Tracking URI retrieved')

            logging.info('Starting mlflow')
            with mlflow.start_run():
                for model_name in test_score_dict:
                    mlflow.log_metric(f'{model_name} ROC-AUC', test_score_dict[model_name])
                    if model_name in trained_models_dict.keys():
                        model = trained_models_dict[model_name]
                        if tracking_uri_type_store != 'file':
                            # if model_name == 'XGBClassifier':
                            #     mlflow.xgboost.log_model(model, f'{model_name}', registered_model_name=f'{model_name}_model')
                            # else:
                            mlflow.sklearn.log_model(model, f'{model_name}', registered_model_name=f'{model_name}_model')
                        else:
                            # if model_name == 'XGBClassifier':
                            #     mlflow.xgboost.log_model(model, f'{model_name}')
                            # else:
                            mlflow.sklearn.log_model(model, f'{model_name}')

            logging.info('mlflow succeeded')

        except Exception as e:

            logging.info('Error occurred during model evaluation')
            raise CustomException(e, sys)
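In the rank ensemble, models sorted by best_score_ get weights 1..n, and the divisor n(1+n)/2 is exactly the sum of those weights, so the blended output stays a valid probability. A numeric sketch with four models:

import numpy as np

probs = np.array([0.10, 0.20, 0.30, 0.40])     # per-model probabilities, worst CV score first
ranks = np.arange(1, len(probs) + 1)           # weight 1 for the worst model, 4 for the best
blended = (ranks * probs).sum() / ranks.sum()  # ranks.sum() == 4 * 5 / 2 == 10
print(blended)  # 0.3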
company_bankruptcy/components/model_trainer.py
ADDED
@@ -0,0 +1,68 @@
import pandas as pd
import numpy as np
from company_bankruptcy.logger.logger import logging
from company_bankruptcy.exception.exception import CustomException
from company_bankruptcy.utils.utils import save_object, find_optimal_model

import os
import sys
from pathlib import Path
from dataclasses import dataclass


@dataclass
class ModelTrainerConfig:
    trained_models_path = os.path.join('artifacts', 'models')


class ModelTrainer:

    def __init__(self):
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_training(self, train_df, test_df, features_dict_path, cv_fold_list, numerical_features):

        try:

            logging.info('Creating a directory to save trained models')
            os.makedirs(self.model_trainer_config.trained_models_path, exist_ok=True)
            logging.info("Models' directory created")

            logging.info('Finding the best model')
            trained_models_dict, opt_dict = find_optimal_model(
                train_df,
                test_df,
                features_dict_path,
                cv_fold_list,
                numerical_features
            )

            logging.info("Saving trained models' and ensemble optimized weights' dictionaries")
            save_object(
                file_path=os.path.join(self.model_trainer_config.trained_models_path, 'trained_models_dict.pkl'),
                obj=trained_models_dict
            )

            save_object(
                file_path=os.path.join(self.model_trainer_config.trained_models_path, 'opt_dict.pkl'),
                obj=opt_dict
            )
            logging.info('Saving completed')

        except Exception as e:
            logging.info('Error occurred during model training')
            raise CustomException(e, sys)

# if __name__ == '__main__':
#     model_training_obj = ModelTrainer()
#     model_training_obj.initiate_model_training(
#         train_df,
#         test_df,
#         features_dict_path,
#         cv_fold_list,
#         numerical_features
#     )
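A sketch of loading the saved dictionaries back, assuming training has already run; 'best_model_name' is the extra key that the rest of the codebase skips when iterating over the models:

import os
from company_bankruptcy.utils.utils import load_object

models_dir = os.path.join('artifacts', 'models')
trained_models_dict = load_object(os.path.join(models_dir, 'trained_models_dict.pkl'))
print(trained_models_dict['best_model_name'])
print([k for k in trained_models_dict if k != 'best_model_name'])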
company_bankruptcy/constants/__init__.py
ADDED
File without changes
company_bankruptcy/constants/constants.py
ADDED
@@ -0,0 +1,5 @@
DATABASE_NAME = "bankruptcy"

COLLECTION_NAME = "data"

MONGODB_COLLECTION_STR = "MONGODB_COLLECTION_STR"
company_bankruptcy/data_access/__init__.py
ADDED
File without changes
company_bankruptcy/data_access/mongo_db_connection.py
ADDED
@@ -0,0 +1,104 @@
import pandas as pd
import pymongo
import json

from company_bankruptcy.exception.exception import CustomException
from company_bankruptcy.logger.logger import logging
from company_bankruptcy.constants.constants import DATABASE_NAME, COLLECTION_NAME, MONGODB_COLLECTION_STR

import sys


class MongoOps:

    def __init__(self, client_url: str, database_name: str = None, collection_name: str = None):
        self.client_url = client_url
        self.database_name = database_name
        self.collection_name = collection_name

    def create_client(self):
        logging.info('Initiating MongoClient')
        client = pymongo.MongoClient(self.client_url)
        logging.info('MongoClient initiated')
        return client

    def create_database(self):
        logging.info('Creating Mongo database')
        client = self.create_client()
        database = client[self.database_name]
        logging.info(f'Mongo database {self.database_name} created')
        return database

    def create_collection(self):
        logging.info('Creating Mongo collection')
        database = self.create_database()
        collection = database[self.collection_name]
        logging.info(f'Mongo collection {self.collection_name} created')
        return collection

    def get_database(self, db_name: str):
        logging.info(f'Accessing {db_name} database')
        client = self.create_client()
        database = client[db_name]
        logging.info(f'{db_name} database accessed')
        return database

    def get_collection(self, coll_name: str, db_name: str):
        logging.info(f'Accessing {coll_name} collection')
        database = self.get_database(db_name)
        collection = database[coll_name]
        logging.info(f'{coll_name} collection accessed')
        return collection

    def insert_record(self, record: dict, coll_name: str, db_name: str):
        collection = self.get_collection(coll_name, db_name)
        logging.info(f'Starting record insertion into {coll_name} collection of {db_name} database')
        if isinstance(record, list):
            for data in record:
                if not isinstance(data, dict):
                    logging.info("Records' list should have elements as dict")
                    raise TypeError("Records' list should have elements as dict")
            collection.insert_many(record)
        elif isinstance(record, dict):
            collection.insert_one(record)
        logging.info(f'Insertion into {coll_name} collection of {db_name} database completed')

    def insert_from_file(self, datafile: str, coll_name: str, db_name: str):
        logging.info(f'Starting record insertion into {coll_name} collection of {db_name} database from {datafile}')
        self.path = datafile

        if self.path.endswith('.csv'):
            df = pd.read_csv(self.path, encoding='utf-8')
        elif self.path.endswith('.xlsx'):
            # pd.read_excel does not accept an encoding argument in current pandas
            df = pd.read_excel(self.path)
        logging.info('Data is loaded as a pandas dataframe')

        logging.info('Converting the data into json')
        # the valid orient value is 'records' (plural)
        datajson = json.loads(df.to_json(orient='records'))
        logging.info('Conversion to json completed')

        collection = self.get_collection(coll_name, db_name)

        logging.info('Inserting json data')
        collection.insert_many(datajson)
        logging.info('Insertion completed')

    def get_records(self, coll_name: str, db_name: str):
        collection = self.get_collection(coll_name, db_name)
        retrieved_data = pd.DataFrame(list(collection.find()))
        try:
            retrieved_data.drop(columns='_id', inplace=True)
            logging.info('Loading the data from the database completed')
        except Exception as e:
            retrieved_data = pd.DataFrame()
            logging.info('Loading the data from the database failed')
            raise CustomException(e, sys)
        return retrieved_data

if __name__ == '__main__':

    mongo_instance = MongoOps(
        client_url=MONGODB_COLLECTION_STR
    )

    retrieved_data = mongo_instance.get_records(coll_name=COLLECTION_NAME, db_name=DATABASE_NAME)
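A minimal usage sketch, assuming a reachable MongoDB instance (the localhost URI below is a placeholder, not the project's connection string):

from company_bankruptcy.data_access.mongo_db_connection import MongoOps

mongo = MongoOps(client_url='mongodb://localhost:27017')  # hypothetical local instance
mongo.insert_record({'Bankrupt?': 0, ' Liability-Assets Flag': 1}, coll_name='data', db_name='bankruptcy')
df = mongo.get_records(coll_name='data', db_name='bankruptcy')
print(df.shape)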
company_bankruptcy/exception/__init__.py
ADDED
File without changes
company_bankruptcy/exception/exception.py
ADDED
@@ -0,0 +1,20 @@
import sys


class CustomException(Exception):

    def __init__(self, error_message, error_details: sys):
        self.error_message = error_message
        _, _, exc_tb = error_details.exc_info()
        self.lineno = exc_tb.tb_lineno
        self.file_name = exc_tb.tb_frame.f_code.co_filename

    def __str__(self):
        return "Error occurred in python script name [{0}] line number [{1}] error message [{2}]".format(
            self.file_name, self.lineno, str(self.error_message))

if __name__ == '__main__':
    try:
        1 / 0
    except Exception as e:
        raise CustomException(e, sys)
company_bankruptcy/logger/__init__.py
ADDED
File without changes
company_bankruptcy/logger/logger.py
ADDED
@@ -0,0 +1,20 @@
import logging
import os
from datetime import datetime as dt

LOG_FILE = f"{dt.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"

log_path = os.path.join(os.getcwd(), "logs")

os.makedirs(log_path, exist_ok=True)

LOG_FILEPATH = os.path.join(log_path, LOG_FILE)

logging.basicConfig(
    level=logging.INFO,
    filename=LOG_FILEPATH,
    format="[%(asctime)s] %(lineno)d %(name)s - %(levelname)s - %(message)s"
)

if __name__ == '__main__':
    logging.info("Log testing executed!!!")
company_bankruptcy/pipeline/__init__.py
ADDED
File without changes
company_bankruptcy/pipeline/prediction_pipeline.py
ADDED
File without changes
company_bankruptcy/pipeline/training_pipeline.py
ADDED
@@ -0,0 +1,27 @@
from company_bankruptcy.components.data_ingestion import DataIngestion
from company_bankruptcy.components.data_transformation import DataTransformation
from company_bankruptcy.components.model_trainer import ModelTrainer
from company_bankruptcy.components.model_evaluation import ModelEvaluation

def run_pipeline():

    data_ingestion_obj = DataIngestion()
    train_path, test_path = data_ingestion_obj.initiate_data_ingestion()

    data_transformation_obj = DataTransformation()
    train_df, test_df, cv_fold_list, numerical_features = data_transformation_obj.initiate_data_transformation(
        train_path=train_path,
        test_path=test_path
    )

    model_training_obj = ModelTrainer()
    model_training_obj.initiate_model_training(
        train_df=train_df,
        test_df=test_df,
        features_dict_path=data_transformation_obj.data_transformation_config.feature_selection_dict_file_path,
        cv_fold_list=cv_fold_list,
        numerical_features=numerical_features
    )

    model_evaluation_obj = ModelEvaluation()
    model_evaluation_obj.initiate_model_evaluation(test_df)
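Nothing in the repo calls run_pipeline yet (prediction_pipeline.py is empty); presumably it is driven as a script, roughly as sketched here:

from company_bankruptcy.pipeline.training_pipeline import run_pipeline

if __name__ == '__main__':
    run_pipeline()  # the ingestion step requires MongoDB access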
company_bankruptcy/utils/__init__.py
ADDED
File without changes
company_bankruptcy/utils/utils.py
ADDED
@@ -0,0 +1,974 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import pickle
|
4 |
+
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
from company_bankruptcy.logger.logger import logging
|
8 |
+
from company_bankruptcy.exception.exception import CustomException
|
9 |
+
|
10 |
+
from sklearn.svm import SVC
|
11 |
+
from sklearn.feature_selection import RFE
|
12 |
+
from sklearn.feature_selection import r_regression, SelectKBest
|
13 |
+
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
|
14 |
+
from sklearn.feature_selection import f_classif, chi2
|
15 |
+
from sklearn.ensemble import RandomForestClassifier
|
16 |
+
from sklearn.linear_model import LogisticRegression
|
17 |
+
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
|
18 |
+
from sklearn.preprocessing import StandardScaler
|
19 |
+
from sklearn.model_selection import GridSearchCV
|
20 |
+
from sklearn.pipeline import Pipeline
|
21 |
+
from sklearn.compose import ColumnTransformer
|
22 |
+
|
23 |
+
from xgboost import XGBClassifier
|
24 |
+
|
25 |
+
from scipy import stats
|
26 |
+
from scipy.special import softmax
|
27 |
+
from scipy.optimize import fmin
|
28 |
+
|
29 |
+
from functools import partial
|
30 |
+
|
31 |
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
32 |
+
|
33 |
+
from boruta import BorutaPy
|
34 |
+
|
35 |
+
import shap
|
36 |
+
|
37 |
+
from collections import Counter
|
38 |
+
|
39 |
+
from tqdm.auto import tqdm
|
40 |
+
import gc
|
41 |
+
|
42 |
+
import warnings
|
43 |
+
warnings.filterwarnings('ignore')
|
44 |
+
|
45 |
+
|
46 |
+
def save_object(file_path, obj):
|
47 |
+
try:
|
48 |
+
dir_path = os.path.dirname(file_path)
|
49 |
+
|
50 |
+
os.makedirs(dir_path, exist_ok=True)
|
51 |
+
|
52 |
+
with open(file_path, "wb") as file_obj:
|
53 |
+
pickle.dump(obj, file_obj)
|
54 |
+
|
55 |
+
except Exception as e:
|
56 |
+
raise CustomException(e, sys)
|
57 |
+
|
58 |
+
def load_object(file_path):
|
59 |
+
try:
|
60 |
+
with open(file_path, 'rb') as file_obj:
|
61 |
+
return pickle.load(file_obj)
|
62 |
+
except Exception as e:
|
63 |
+
logging.info('Exception Occured in load_object function utils')
|
64 |
+
raise CustomException(e, sys)
|
65 |
+
|
66 |
+
|
67 |
+
def get_shap_features(shap_values, features, topk=10):
|
68 |
+
'''
|
69 |
+
Returns topk features selected using shap values
|
70 |
+
|
71 |
+
Args:
|
72 |
+
shap_values (object): shap explainer
|
73 |
+
features (list): list of features' name
|
74 |
+
|
75 |
+
Returns:
|
76 |
+
list: topk features derived from shap values
|
77 |
+
'''
|
78 |
+
# Calculates the feature importance (mean absolute shap value) for each feature
|
79 |
+
importances = []
|
80 |
+
for i in range(shap_values.values.shape[1]):
|
81 |
+
importances.append(np.mean(np.abs(shap_values.values[:, i])))
|
82 |
+
# Calculates the normalized version
|
83 |
+
importances_norm = softmax(importances)
|
84 |
+
# Organize the importances and columns in a dictionary
|
85 |
+
feature_importances = {fea: imp for imp, fea in zip(importances, features)}
|
86 |
+
feature_importances_norm = {fea: imp for imp,
|
87 |
+
fea in zip(importances_norm, features)}
|
88 |
+
# Sorts the dictionary
|
89 |
+
feature_importances = {k: v for k, v in sorted(
|
90 |
+
feature_importances.items(), key=lambda item: item[1], reverse=True)}
|
91 |
+
feature_importances_norm = {k: v for k, v in sorted(
|
92 |
+
feature_importances_norm.items(), key=lambda item: item[1], reverse=True)}
|
93 |
+
# Prints the feature importances
|
94 |
+
selected_topk_feats = []
|
95 |
+
|
96 |
+
for idx, (k, v) in enumerate(feature_importances.items()):
|
97 |
+
# print(f"{k} -> {v:.4f} (softmax = {feature_importances_norm[k]:.4f})")
|
98 |
+
if idx <= topk:
|
99 |
+
selected_topk_feats.append(k)
|
100 |
+
|
101 |
+
return selected_topk_feats
|
102 |
+
|
103 |
+
|
104 |
+
class FSelector():
|
105 |
+
'''
|
106 |
+
Helps to select features based on BorutaPy, RFE, and various statistics
|
107 |
+
'''
|
108 |
+
|
109 |
+
def __init__(self, X, y, num_feats, ordinal_feats, nominal_feats, model, is_target_cat=True, select_n_feats=15):
|
110 |
+
'''
|
111 |
+
Initializes some parameters
|
112 |
+
|
113 |
+
Args:
|
114 |
+
X (pd.DataFrame): contains features' values
|
115 |
+
y (pd.DataFrame): contains target values
|
116 |
+
num_feats (list): list of numerical features' names
|
117 |
+
ordinal_feats (list): list of ordinal features' names
|
118 |
+
nominal_feats (list): list of nominal features' names
|
119 |
+
model (model object): can be any type of model like RandomForest, LogisticRegression, etc.
|
120 |
+
is_target_cat (bool): indicates whether the target is categorical or not
|
121 |
+
select_n_feats (int): specifies the number of features to output
|
122 |
+
'''
|
123 |
+
|
124 |
+
self.X = X
|
125 |
+
self.y = y
|
126 |
+
self.num_feats = num_feats
|
127 |
+
self.ordinal_feats = ordinal_feats
|
128 |
+
self.nominal_feats = nominal_feats
|
129 |
+
self.model = model
|
130 |
+
self.is_target_cat = is_target_cat
|
131 |
+
self.select_n_feats = select_n_feats
|
132 |
+
|
133 |
+
def calculate_vif(self, X):
|
134 |
+
|
135 |
+
vif = pd.DataFrame()
|
136 |
+
vif["features"] = X.columns
|
137 |
+
vif["VIF"] = [variance_inflation_factor(
|
138 |
+
X.values, i) for i in range(X.shape[1])]
|
139 |
+
|
140 |
+
return vif
|
141 |
+
|
142 |
+
def select_feats_via_vif(self):
|
143 |
+
|
144 |
+
num_features = self.num_feats.copy()
|
145 |
+
|
146 |
+
vif_df = self.calculate_vif(self.X[num_features])
|
147 |
+
|
148 |
+
while vif_df[vif_df['VIF'] >= 10].shape[0] != 0:
|
149 |
+
vif_df.sort_values('VIF', ascending=False, inplace=True)
|
150 |
+
vif_df.reset_index(drop=True, inplace=True)
|
151 |
+
# print(vif_df)
|
152 |
+
elimination_candidate = vif_df.iloc[0]['features']
|
153 |
+
# print(elimination_candidate)
|
154 |
+
num_features = [i for i in num_features if i !=
|
155 |
+
elimination_candidate]
|
156 |
+
new_X = self.X[num_features]
|
157 |
+
vif_df = self.calculate_vif(new_X)
|
158 |
+
|
159 |
+
return list(vif_df['features'].values)
|
160 |
+
|
161 |
+
def get_spearmanr(self, X, y):
|
162 |
+
# return np.array([stats.spearmanr(X.values[:, f], y.values).correlation for f in range(X.shape[1])])
|
163 |
+
spearman_values = [stats.spearmanr(
|
164 |
+
X.values[:, f], y.values).correlation for f in range(X.shape[1])]
|
165 |
+
temp_sp_df = pd.DataFrame(
|
166 |
+
{'spearman': spearman_values, 'feats': list(X.columns)})
|
167 |
+
temp_sp_df['abs_spearman'] = np.abs(temp_sp_df['spearman'])
|
168 |
+
temp_sp_df.sort_values('abs_spearman', ascending=False, inplace=True)
|
169 |
+
temp_sp_df.reset_index(drop=True, inplace=True)
|
170 |
+
return temp_sp_df.iloc[:15]['feats'].to_list()
|
171 |
+
|
172 |
+
def get_kendalltau(self, X, y):
|
173 |
+
# return [stats.kendalltau(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
|
174 |
+
kendall_values = [stats.spearmanr(
|
175 |
+
X.values[:, f], y.values).correlation for f in range(X.shape[1])]
|
176 |
+
temp_ken_df = pd.DataFrame(
|
177 |
+
{'kendall': kendall_values, 'feats': list(X.columns)})
|
178 |
+
temp_ken_df['abs_kendall'] = np.abs(temp_ken_df['kendall'])
|
179 |
+
temp_ken_df.sort_values('abs_kendall', ascending=False, inplace=True)
|
180 |
+
temp_ken_df.reset_index(drop=True, inplace=True)
|
181 |
+
return temp_ken_df.iloc[:15]['feats'].to_list()
|
182 |
+
|
183 |
+
def get_pointbiserialr(self, X, y):
|
184 |
+
return [stats.pointbiserialr(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
|
185 |
+
|
186 |
+
def get_boruta_feats(self):
|
187 |
+
feat_selector = BorutaPy(
|
188 |
+
self.model, n_estimators='auto', verbose=2, random_state=1)
|
189 |
+
feat_selector.fit(np.array(self.X), np.array(self.y))
|
190 |
+
boruta_selected_features = list(
|
191 |
+
self.X.iloc[:, feat_selector.support_].columns)
|
192 |
+
return boruta_selected_features
|
193 |
+
|
194 |
+
def get_kbest(self, X, feats_list, metric):
|
195 |
+
selector = SelectKBest(metric, k=self.select_n_feats)
|
196 |
+
selector.fit_transform(X[feats_list], self.y)
|
197 |
+
selected_feats_idxs_list = list(selector.get_support(indices=True))
|
198 |
+
column_names = [feats_list[i] for i in selected_feats_idxs_list]
|
199 |
+
return column_names
|
200 |
+
|
201 |
+
def get_rfe_feats(self):
|
202 |
+
model_rfe = RFE(self.model, n_features_to_select=self.select_n_feats)
|
203 |
+
model_rfe.fit(self.X, self.y)
|
204 |
+
model_rfe_feats = list(
|
205 |
+
self.X.iloc[:, list(model_rfe.support_)].columns)
|
206 |
+
return model_rfe_feats
|
207 |
+
|
208 |
+
# def get_shap_feats(self, feats_list, topk=10):
|
209 |
+
# model = self.model
|
210 |
+
# X = self.X[feats_list]
|
211 |
+
# model.fit(self.X, self.y)
|
212 |
+
# explainer = shap.Explainer(model.predict, X, max_evals = int(2 * X.shape[1] + 1), verbose=0)
|
213 |
+
# shap_values = explainer(X)
|
214 |
+
# selected_shap_features = get_feature_importances_shap_values(
|
215 |
+
# shap_values, features=list(X.columns), topk=topk
|
216 |
+
# )
|
217 |
+
# return selected_shap_features
|
218 |
+
|
219 |
+
    def get_features(self):

        if self.num_feats is not None:

            if self.is_target_cat:

                temp_n_feats = self.select_n_feats
                if len(self.num_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                # self.num_kendalltau_feats = self.get_kendalltau(self.X[self.num_feats], self.y)
                self.num_f_feats = self.get_kbest(
                    X=self.X, feats_list=self.num_feats, metric=f_classif)
                self.num_mi_feats = self.get_kbest(
                    X=self.X, feats_list=self.num_feats, metric=mutual_info_classif)

                self.select_n_feats = temp_n_feats

                self.selected_num_feats = []
                # self.selected_num_feats.extend(self.num_kendalltau_feats)
                self.selected_num_feats.extend(self.num_f_feats)
                self.selected_num_feats.extend(self.num_mi_feats)

            else:

                self.vif_feats = self.select_feats_via_vif()

                temp_n_feats = self.select_n_feats
                if len(self.num_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                self.pearson_feats = self.get_kbest(
                    X=self.X, feats_list=self.num_feats, metric=r_regression, k=self.select_n_feats)

                self.select_n_feats = temp_n_feats
                # self.num_spearmanr_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=stats.spearmanr, k=self.select_n_feats)
                # self.num_kendalltau_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=stats.kendalltau, k=self.select_n_feats)
                self.num_spearmanr_feats = self.get_spearmanr(
                    self.X[self.num_feats], self.y)
                self.num_kendalltau_feats = self.get_kendalltau(
                    self.X[self.num_feats], self.y)
                # self.num_spearmanr_feats = SelectKBest(self.get_spearmanr, k=self.select_n_feats).fit_transform(self.X[self.num_feats], self.y)
                # self.num_kendalltau_feats = SelectKBest(self.get_kendalltau, k=self.select_n_feats).fit_transform(self.X[self.num_feats], self.y)

                self.selected_num_feats = []
                self.selected_num_feats.extend(self.pearson_feats)
                self.selected_num_feats.extend(self.num_spearmanr_feats)
                self.selected_num_feats.extend(self.num_kendalltau_feats)
                # self.selected_num_feats = list(set(self.selected_num_feats))

        else:

            self.selected_num_feats = []

        if self.ordinal_feats is not None:

            if self.is_target_cat:

                temp_n_feats = self.select_n_feats
                if len(self.ordinal_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                self.ordinal_mi_feats = self.get_kbest(
                    X=self.X, feats_list=self.ordinal_feats, metric=mutual_info_classif)
                self.ordinal_chi2_feats = self.get_kbest(
                    X=self.X, feats_list=self.ordinal_feats, metric=chi2)

                self.selected_ordinal_feats = []
                self.selected_ordinal_feats.extend(self.ordinal_mi_feats)
                self.selected_ordinal_feats.extend(self.ordinal_chi2_feats)

                self.select_n_feats = temp_n_feats

            else:

                self.ordinal_spearmanr_feats = self.get_spearmanr(
                    self.X[self.ordinal_feats], self.y)
                self.ordinal_kendalltau_feats = self.get_kendalltau(
                    self.X[self.ordinal_feats], self.y)

                # self.ordinal_spearmanr_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=stats.spearmanr, k=self.select_n_feats)
                # self.ordinal_kendalltau_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=stats.kendalltau, k=self.select_n_feats)

                # self.ordinal_spearmanr_feats = SelectKBest(self.get_spearmanr, k=self.select_n_feats).fit_transform(self.X[self.ordinal_feats], self.y)
                # self.ordinal_kendalltau_feats = SelectKBest(self.get_kendalltau, k=self.select_n_feats).fit_transform(self.X[self.ordinal_feats], self.y)

                self.selected_ordinal_feats = []
                self.selected_ordinal_feats.extend(self.ordinal_spearmanr_feats)
                self.selected_ordinal_feats.extend(self.ordinal_kendalltau_feats)
                # self.selected_ordinal_feats = list(set(self.selected_ordinal_feats))

        else:
            self.selected_ordinal_feats = []

        if self.nominal_feats is not None:

            if self.is_target_cat:

                temp_n_feats = self.select_n_feats
                if len(self.nominal_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                self.nominal_mi_feats = self.get_kbest(
                    X=self.X, feats_list=self.nominal_feats, metric=mutual_info_classif)
                self.nominal_chi2_feats = self.get_kbest(
                    X=self.X, feats_list=self.nominal_feats, metric=chi2)

                self.selected_nominal_feats = []
                self.selected_nominal_feats.extend(self.nominal_mi_feats)
                self.selected_nominal_feats.extend(self.nominal_chi2_feats)

                self.select_n_feats = temp_n_feats

            else:

                temp_n_feats = self.select_n_feats
                if len(self.nominal_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                self.f_feats = self.get_kbest(
                    X=self.X, feats_list=self.nominal_feats, metric=f_classif, k=self.select_n_feats)
                self.mi_feats = self.get_kbest(
                    X=self.X, feats_list=self.nominal_feats, metric=mutual_info_regression, k=self.select_n_feats)

                self.select_n_feats = temp_n_feats

                # # self.f_feats = f_classif(self.X[self.nominal_feats], self.y)[0]
                # self.f_feats = SelectKBest(f_classif, k=self.select_n_feats).fit_transform(self.X[self.nominal_feats], self.y).columns

                # # self.mi_feats = mutual_info_regression(self.X[self.nominal_feats], self.y)
                # self.mi_feats = SelectKBest(mutual_info_regression, k=self.select_n_feats).fit_transform(self.X[self.nominal_feats], self.y).columns

                self.selected_nominal_feats = []
                self.selected_nominal_feats.extend(self.f_feats)
                self.selected_nominal_feats.extend(self.mi_feats)
                # self.selected_nominal_feats = list(set(self.selected_nominal_feats))

        else:

            self.selected_nominal_feats = []

        if self.model is not None:
            # np.int = np.int32
            # np.float = np.float64
            # np.bool = np.bool_
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.boruta_feats = self.get_boruta_feats()
            if not isinstance(self.model, SVC):
                self.rfe_feats = self.get_rfe_feats()
        else:
            self.boruta_feats = []
            self.rfe_feats = []

        if len(self.selected_num_feats) != 0:
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.selected_num_feats.extend(self.boruta_feats)
            if not isinstance(self.model, SVC):
                self.selected_num_feats.extend(self.rfe_feats)
            num_feats_dict = dict(Counter(self.selected_num_feats))
            self.selected_num_feats = [
                i for i in num_feats_dict if num_feats_dict[i] >= 2]

        if len(self.selected_ordinal_feats) != 0:
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.selected_ordinal_feats.extend(self.boruta_feats)
            if not isinstance(self.model, SVC):
                self.selected_ordinal_feats.extend(self.rfe_feats)
            ordinal_feats_dict = dict(Counter(self.selected_ordinal_feats))
            self.selected_ordinal_feats = [
                i for i in ordinal_feats_dict if ordinal_feats_dict[i] >= 2]

        if len(self.selected_nominal_feats) != 0:
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.selected_nominal_feats.extend(self.boruta_feats)
            if not isinstance(self.model, SVC):
                self.selected_nominal_feats.extend(self.rfe_feats)
            nominal_feats_dict = dict(Counter(self.selected_nominal_feats))
            self.selected_nominal_feats = [
                i for i in nominal_feats_dict if nominal_feats_dict[i] >= 2]

        self.selected_feats = []
        self.selected_feats.extend(self.selected_num_feats)
        self.selected_feats.extend(self.selected_ordinal_feats)
        self.selected_feats.extend(self.selected_nominal_feats)
        if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
            self.selected_feats.extend(self.boruta_feats)
        self.selected_feats = list(set(self.selected_feats))

        # self.selected_feats = self.get_shap_feats(self.selected_feats)

        return self.selected_feats

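# A minimal sketch of the voting rule implemented in get_features above: every
# selection method contributes one "vote" per feature, and a feature survives
# only if at least two methods agree on it (the `>= 2` Counter filter).
# Illustrative values only:
#   votes = ['roa', 'roa', 'debt_ratio']            # hypothetical feature votes
#   counts = dict(Counter(votes))                   # {'roa': 2, 'debt_ratio': 1}
#   kept = [f for f in counts if counts[f] >= 2]    # ['roa']
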
def create_feature_selection_dict(data, cv_fold_list, numerical_features, nominal_features):
    '''
    Returns feature selection dictionary for 4 different models

    Args:
        data (pd.DataFrame): train data
        cv_fold_list (list): contains tuples of indices of train and validation data for each fold
        numerical_features (list): contains the names of numerical features
        nominal_features (list): contains the names of nominal features

    Returns:
        dict: contains selected features, train and validation scores, models and scalers used
    '''

    selected_features_dict = {}

    for idx in tqdm(range(1)):

        X_train = data.iloc[cv_fold_list[idx][0]].reset_index(drop=True)
        y_train = data.iloc[cv_fold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

        X_valid = data.iloc[cv_fold_list[idx][1]].reset_index(drop=True)
        y_valid = data.iloc[cv_fold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

        # creating interaction features: each numerical feature times the flag column
        new_numerical_features = []
        for feat in numerical_features:
            X_train[f"feat{numerical_features.index(feat)}"] = X_train[feat] * X_train[' Liability-Assets Flag']
            X_valid[f"feat{numerical_features.index(feat)}"] = X_valid[feat] * X_valid[' Liability-Assets Flag']
            new_numerical_features.append(f"feat{numerical_features.index(feat)}")

        numerical_features.extend(new_numerical_features)

        # getting categorical features
        categorical_features = nominal_features.copy()

        # getting all features
        all_features = []
        all_features.extend(categorical_features)
        all_features.extend(numerical_features)

        X_train = X_train[all_features]
        X_valid = X_valid[all_features]

        models_list = [RandomForestClassifier(), XGBClassifier(),
                       LogisticRegression(), SVC(probability=True)]
        model_names_list = ['RandomForestClassifier',
                            'XGBClassifier', 'LogisticRegression', 'SVC']

        for model_idx in tqdm(range(len(model_names_list))):

            model_name = model_names_list[model_idx]

            selected_features_dict[model_name] = {}

            # feature selection
            model = models_list[model_idx]

            if isinstance(model, LogisticRegression) or isinstance(model, SVC):

                scaler = StandardScaler()

                X_train2 = scaler.fit_transform(X_train[numerical_features])
                X_train2 = pd.DataFrame(X_train2, columns=numerical_features)
                X_train2 = pd.concat(
                    [X_train2, X_train[categorical_features]], axis=1)

                fselector = FSelector(
                    X=X_train2,
                    y=y_train,
                    num_feats=numerical_features,
                    ordinal_feats=None,
                    nominal_feats=nominal_features,
                    model=model
                )

            else:

                fselector = FSelector(
                    X=X_train,
                    y=y_train,
                    num_feats=numerical_features,
                    ordinal_feats=None,
                    nominal_feats=nominal_features,
                    model=model
                )

            selected_features = fselector.get_features()

            if len(selected_features) == 0:
                continue

            # selecting features using shap values
            if isinstance(model, LogisticRegression) or isinstance(model, SVC):

                X_valid2 = scaler.transform(X_valid[numerical_features])
                X_valid2 = pd.DataFrame(X_valid2, columns=numerical_features)
                X_valid2 = pd.concat(
                    [X_valid2, X_valid[categorical_features]], axis=1)

                X_train_filtered = X_train2[selected_features]
                X_valid_filtered = X_valid2[selected_features]

            else:

                X_train_filtered = X_train[selected_features]
                X_valid_filtered = X_valid[selected_features]

            # model training using selected features
            model.fit(X_train_filtered, y_train)

            explainer = shap.Explainer(
                model.predict,
                X_train_filtered,
                # max_evals=int(2 * X_train_filtered.shape[1] + 1),
                # verbose=0
            )
            shap_values = explainer(X_train_filtered)
            selected_shap_features = get_shap_features(
                shap_values,
                features=list(X_train_filtered.columns),
                topk=10
            )

            # model training using shap features
            model = models_list[model_idx]
            model.fit(X_train_filtered[selected_shap_features], y_train)

            # metric calculation
            y_train_pred = model.predict(X_train_filtered[selected_shap_features])
            y_train_pred_prob = model.predict_proba(X_train_filtered[selected_shap_features])[:, 1]

            y_valid_pred = model.predict(X_valid_filtered[selected_shap_features])
            y_valid_pred_prob = model.predict_proba(X_valid_filtered[selected_shap_features])[:, 1]

            train_acc = accuracy_score(y_train, y_train_pred)
            train_f1 = f1_score(y_train, y_train_pred)
            train_roc_auc = roc_auc_score(y_train, y_train_pred_prob)

            valid_acc = accuracy_score(y_valid, y_valid_pred)
            valid_f1 = f1_score(y_valid, y_valid_pred)
            valid_roc_auc = roc_auc_score(y_valid, y_valid_pred_prob)

            selected_features_dict[model_name][idx+1] = {}
            selected_features_dict[model_name][idx+1]['selected_feats'] = selected_features
            selected_features_dict[model_name][idx+1]['selected_shap_feats'] = selected_shap_features
            selected_features_dict[model_name][idx+1]['train_acc'] = train_acc
            selected_features_dict[model_name][idx+1]['train_f1'] = train_f1
            selected_features_dict[model_name][idx+1]['train_roc_auc'] = train_roc_auc
            selected_features_dict[model_name][idx+1]['valid_acc'] = valid_acc
            selected_features_dict[model_name][idx+1]['valid_f1'] = valid_f1
            selected_features_dict[model_name][idx+1]['valid_roc_auc'] = valid_roc_auc
            selected_features_dict[model_name][idx+1]['model'] = model
            if isinstance(model, LogisticRegression) or isinstance(model, SVC):
                selected_features_dict[model_name][idx+1]['scaler'] = scaler

            # print(f"##### {model_name} #####")
            # print(f"Selected features: {selected_features}")
            # print("Train:")
            # print(f"Accuracy: {train_acc:.5f}, F1: {train_f1:.5f}, ROC-AUC: {train_roc_auc:.5f}")
            # print("Validation:")
            # print(f"Accuracy: {valid_acc:.5f}, F1: {valid_f1:.5f}, ROC-AUC: {valid_roc_auc:.5f}")

            logging.info(f"##### {model_name} #####")
            logging.info(f"Selected features: {selected_features}")
            logging.info('Train:')
            logging.info(
                f"Accuracy: {train_acc:.5f}, F1: {train_f1:.5f}, ROC-AUC: {train_roc_auc:.5f}")
            logging.info('Validation:')
            logging.info(
                f"Accuracy: {valid_acc:.5f}, F1: {valid_f1:.5f}, ROC-AUC: {valid_roc_auc:.5f}")

        del X_train, y_train, X_valid, y_valid, X_train_filtered, X_valid_filtered, model
        gc.collect()

    return selected_features_dict

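# Usage sketch for create_feature_selection_dict (illustrative, not the
# project's canonical call site); the fold builder shown here is an assumption,
# any list of (train_idx, valid_idx) tuples works:
#   from sklearn.model_selection import StratifiedKFold
#   skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#   cv_fold_list = list(skf.split(train_df, train_df['Bankrupt?']))
#   feature_selection_dict = create_feature_selection_dict(
#       data=train_df,
#       cv_fold_list=cv_fold_list,
#       numerical_features=numerical_features,
#       nominal_features=nominal_features,
#   )
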
def get_mean_ensemble_prediction(prob_list):
    prob_array = np.vstack(prob_list).T
    return np.mean(prob_array, axis=1)

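# Worked example: for two models' probabilities [0.2, 0.8] and [0.4, 0.6],
# np.vstack(prob_list).T pairs them per sample and the row-wise mean yields
# array([0.3, 0.7]) -- the simple average ensemble used in find_optimal_model.
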
class OptimizeAUC:
    def __init__(self):
        self.coef_ = 0

    def _auc(self, coef, X, y):
        X_coef = X * coef
        preds = np.sum(X_coef, axis=1)
        auc_score = roc_auc_score(y, preds)
        return -1 * auc_score

    def fit(self, X, y):
        loss_partial = partial(self._auc, X=X, y=y)
        initial_coef = np.random.dirichlet(np.ones(X.shape[1]), size=1)
        self.coef_ = fmin(loss_partial, initial_coef, disp=True)

    def predict(self, X):
        X_coef = X * self.coef_
        preds = np.sum(X_coef, axis=1)
        return preds

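# OptimizeAUC (above) learns one weight per model column by minimizing negative
# ROC-AUC directly with Nelder-Mead; `fmin` and `partial` are assumed to be
# imported at the top of this module (scipy.optimize / functools). Because AUC
# depends only on the ranking of the weighted sum, the weights need no
# normalization. Sketch with hypothetical probability arrays:
#   opt = OptimizeAUC()
#   opt.fit(np.column_stack([probs_a, probs_b]), y_true)
#   blended = opt.predict(np.column_stack([probs_a, probs_b]))
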
def get_optimized_ensemble(train_df, test_df, cv_fold_list, selected_features_dict, trained_models_dict, numerical_features):
    '''
    Finds the optimized weights for ensembling using the train data and evaluates it on test data

    Args:
        train_df (pd.DataFrame): train data
        test_df (pd.DataFrame): test data
        cv_fold_list (list): contains tuples of indices of train and validation data for each fold
        selected_features_dict (dict): selected features dictionary where keys are models' names
        trained_models_dict (dict): trained models dictionary where keys are models' names
        numerical_features (list): contains the names of numerical features

    Returns:
        dict: contains all optimized weights for each fold
        float: ROC-AUC score
    '''

    opt_dict = {}

    test_preds_list = []
    # valid_preds_list = []

    X_test_rf = test_df[selected_features_dict['RandomForestClassifier'][1]['selected_shap_feats']]
    X_test_xgb = test_df[selected_features_dict['XGBClassifier'][1]['selected_shap_feats']]
    X_test_lr = test_df[selected_features_dict['LogisticRegression'][1]['selected_shap_feats']]
    X_test_svc = test_df[selected_features_dict['SVC'][1]['selected_shap_feats']]

    y_test = test_df['Bankrupt?'].to_frame()

    for idx in range(len(cv_fold_list)):

        logging.info(f'Starting calculations for Fold {idx+1}')

        X_train = train_df.iloc[cv_fold_list[idx][0]].reset_index(drop=True)
        y_train = train_df.iloc[cv_fold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

        X_valid = train_df.iloc[cv_fold_list[idx][1]].reset_index(drop=True)
        y_valid = train_df.iloc[cv_fold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

        # RandomForest
        logging.info('Starting RandomForest calculations')
        rf_selected_features = selected_features_dict['RandomForestClassifier'][1]['selected_shap_feats']
        X_train_rf = X_train[rf_selected_features]
        X_valid_rf = X_valid[rf_selected_features]

        rf_gscv = trained_models_dict['RandomForestClassifier']

        rfm = RandomForestClassifier(**rf_gscv.best_params_)
        rfm.fit(X_train_rf, y_train)
        rfm_valid_probs = rfm.predict_proba(X_valid_rf)[:, 1]
        rfm_test_probs = rfm.predict_proba(X_test_rf)[:, 1]
        logging.info('RandomForest calculations completed')

        # XGBoost
        logging.info('Starting XGBoost calculations')
        xgb_selected_features = selected_features_dict['XGBClassifier'][1]['selected_shap_feats']
        X_train_xgb = X_train[xgb_selected_features]
        X_valid_xgb = X_valid[xgb_selected_features]

        xgb_gscv = trained_models_dict['XGBClassifier']

        xgbm = XGBClassifier(**xgb_gscv.best_params_)
        xgbm.fit(X_train_xgb, y_train)
        xgbm_valid_probs = xgbm.predict_proba(X_valid_xgb)[:, 1]
        xgbm_test_probs = xgbm.predict_proba(X_test_xgb)[:, 1]
        logging.info('XGBoost calculations completed')

        # LogisticRegression
        logging.info('Starting LogisticRegression calculations')
        lr_selected_features = selected_features_dict['LogisticRegression'][1]['selected_shap_feats']
        X_train_lr = X_train[lr_selected_features]
        X_valid_lr = X_valid[lr_selected_features]

        lr_gscv = trained_models_dict['LogisticRegression']

        lr_params = {k.replace('model__', ''): v for k,
                     v in lr_gscv.best_params_.items()}
        selected_shap_features = selected_features_dict['LogisticRegression'][1]['selected_shap_feats']
        num_feat = [
            col for col in selected_shap_features if col in numerical_features]
        num_trans = Pipeline([('scale', StandardScaler())])
        preprocessor = ColumnTransformer(
            transformers=[('num', num_trans, num_feat)], remainder='passthrough')
        lrm = Pipeline(
            [
                ('preproc', preprocessor),
                ('lr', LogisticRegression(**lr_params))
            ]
        )
        lrm.fit(X_train_lr, y_train)
        lrm_valid_probs = lrm.predict_proba(X_valid_lr)[:, 1]
        lrm_test_probs = lrm.predict_proba(X_test_lr)[:, 1]
        logging.info('LogisticRegression calculations completed')

        # SVC
        logging.info('Starting SVC calculations')
        svc_selected_features = selected_features_dict['SVC'][1]['selected_shap_feats']
        X_train_svc = X_train[svc_selected_features]
        X_valid_svc = X_valid[svc_selected_features]

        svc_gscv = trained_models_dict['SVC']

        svc_params = {k.replace('model__', ''): v for k,
                      v in svc_gscv.best_params_.items()}
        selected_shap_features = selected_features_dict['SVC'][1]['selected_shap_feats']
        num_feat = [
            col for col in selected_shap_features if col in numerical_features]
        num_trans = Pipeline([('scale', StandardScaler())])
        preprocessor = ColumnTransformer(
            transformers=[('num', num_trans, num_feat)], remainder='passthrough')
        svcm = Pipeline(
            [
                ('preproc', preprocessor),
                ('svc', SVC(probability=True, **svc_params))
            ]
        )
        svcm.fit(X_train_svc, y_train)
        svcm_valid_probs = svcm.predict_proba(X_valid_svc)[:, 1]
        svcm_test_probs = svcm.predict_proba(X_test_svc)[:, 1]
        logging.info('SVC calculations completed')

        logging.info('Optimizing Ensemble weights')
        valid_preds = np.column_stack([
            rfm_valid_probs,
            xgbm_valid_probs,
            lrm_valid_probs,
            svcm_valid_probs
        ])

        opt = OptimizeAUC()
        opt.fit(valid_preds, y_valid)
        opt_dict[idx] = {}
        opt_dict[idx]['opt'] = opt
        opt_dict[idx]['rfm'] = rfm
        opt_dict[idx]['xgbm'] = xgbm
        opt_dict[idx]['lrm'] = lrm
        opt_dict[idx]['svcm'] = svcm
        logging.info('Optimization finished')

        # valid_preds_list.append(opt.predict(valid_preds))

        logging.info('Calculating predictions for test set')
        test_preds = np.column_stack([
            rfm_test_probs,
            xgbm_test_probs,
            lrm_test_probs,
            svcm_test_probs
        ])

        test_preds_list.append(opt.predict(test_preds))
        logging.info('Test set predictions calculated')

    logging.info('Getting the score for test set')
    opt_y_test_pred_prob = np.mean(np.column_stack(test_preds_list), axis=1)
    opt_test_roc_auc = roc_auc_score(y_test, opt_y_test_pred_prob)
    logging.info('Test score calculated')

    return (opt_dict, opt_test_roc_auc)

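# Note: each fold contributes its own weighted blend of the four models' test
# probabilities, and the final Optimized Ensemble prediction is the per-sample
# mean of those fold-level blends.
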
def find_optimal_model(train_df, test_df, features_dict_path, cv_fold_list, numerical_features):
    '''
    Finds the best model for the train data and evaluates it on test data

    Args:
        train_df (pd.DataFrame): train data
        test_df (pd.DataFrame): test data
        features_dict_path (str): path to selected features dictionary
        cv_fold_list (list): contains tuples of indices of train and validation data for each fold
        numerical_features (list): contains the names of numerical features

    Returns:
        dict: contains all trained models and the name of the best model
        dict: contains all optimized weights of ensembling for each fold
    '''
    logging.info('Loading selected features dictionary')
    selected_features_dict = load_object(file_path=features_dict_path)
    logging.info('Selected features dictionary loaded')

    models_list = [RandomForestClassifier(), XGBClassifier(),
                   LogisticRegression(), SVC(probability=True)]
    model_names_list = ['RandomForestClassifier',
                        'XGBClassifier', 'LogisticRegression', 'SVC']
    model_params_list = [
        {
            'n_estimators': [5, 10, 15, 25, 50, 100, 120, 300, 500],
            'max_depth': [2, 3, 5, 8, 15, 25, 30, None]
        },
        {
            'eta': [0.01, 0.015, 0.025, 0.05, 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9],
            'max_depth': [3, 5, 6, 7, 9, 12, 15, 17, 25],
            'n_estimators': [50, 100, 150, 200, 500, 1000]
        },
        {'model__penalty': ['l1', 'l2'], 'model__C': [
            0.001, 0.01, 0.1, 1, 10, 100, 1000]},
        {'model__C': [1, 10, 100, 1000], 'model__gamma': [
            1, 0.1, 0.001, 0.0001], 'model__kernel': ['linear', 'rbf']}
    ]

    trained_models_dict = {}

    best_score = 0
    best_model_name = None

    y_train = train_df['Bankrupt?'].to_frame()
    y_test = test_df['Bankrupt?'].to_frame()

    y_train_pred_prob_list = []
    y_test_pred_prob_list = []
    rank_ensemble_list = []

    for model_idx in tqdm(range(len(model_names_list))):

        # y_train_pred_prob = np.zeros(X_train.shape)

        model_name = model_names_list[model_idx]

        selected_shap_features = selected_features_dict[model_name][1]['selected_shap_feats']

        X_train = train_df[selected_shap_features]
        X_test = test_df[selected_shap_features]

        logging.info(f'Starting {model_name} training')
        params_dict = model_params_list[model_idx]

        model = models_list[model_idx]

        if isinstance(model, LogisticRegression) or isinstance(model, SVC):
            num_feat = [
                col for col in selected_shap_features if col in numerical_features]
            num_trans = Pipeline([('scale', StandardScaler())])
            preprocessor = ColumnTransformer(
                transformers=[('num', num_trans, num_feat)], remainder='passthrough')
            pipe = Pipeline(
                [
                    ('preproc', preprocessor),
                    ('model', model)
                ]
            )

            model_gscv = GridSearchCV(
                pipe,
                param_grid=params_dict,
                scoring='roc_auc',
                cv=cv_fold_list,
                n_jobs=-1,
                verbose=4
            )
        else:
            model_gscv = GridSearchCV(
                model,
                param_grid=params_dict,
                scoring='roc_auc',
                cv=cv_fold_list,
                n_jobs=-1,
                verbose=4
            )

        model_gscv.fit(X_train, y_train)
        logging.info(f'{model_name} training finished')

        trained_models_dict[model_name] = model_gscv

        rank_ensemble_list.append((model_name, model_gscv.best_score_))

        # for train_idxs, valid_idxs in cv_fold_list:
        #     temp_model = models_list[model_idx]
        #     y_train_pred_prob[valid_idxs, :] = model_gscv.predict_proba(X_train[valid_idxs, :])[:, 1]
        # y_train_pred_prob_list.append(y_train_pred_prob)

        logging.info('Getting ROC-AUC for test set')
        y_test_pred_prob = model_gscv.predict_proba(X_test)[:, 1]
        y_test_pred_prob_list.append(y_test_pred_prob)
        test_roc_auc = roc_auc_score(y_test, y_test_pred_prob)
        logging.info(
            f'{model_name}: Validation score = {model_gscv.best_score_:.4f}, Test score = {test_roc_auc:.4f}')

        if test_roc_auc > best_score:
            best_score = test_roc_auc
            best_model_name = model_name

    logging.info('Getting Average Ensemble score')
    # avg_ens_y_train_pred_prob = get_mean_ensemble_prediction(y_train_pred_prob_list)
    # avg_ens_train_roc_auc = roc_auc_score(y_test, avg_ens_y_train_pred_prob)

    avg_ens_y_test_pred_prob = get_mean_ensemble_prediction(
        y_test_pred_prob_list)
    avg_ens_test_roc_auc = roc_auc_score(y_test, avg_ens_y_test_pred_prob)
    logging.info(f'Average Ensemble: Test score = {avg_ens_test_roc_auc:.4f}')
    # logging.info(f'Average Ensemble: Validation score = {avg_ens_train_roc_auc:.4f}, Test score = {avg_ens_test_roc_auc:.4f}')

    if avg_ens_test_roc_auc > best_score:
        best_score = avg_ens_test_roc_auc
        best_model_name = 'Average Ensemble'

    logging.info('Getting Rank Ensemble score')
    # weight each model's test probabilities by its validation-score rank
    rank_ensemble_list = sorted(rank_ensemble_list, key=lambda x: x[1])

    # rank_ens_y_train_pred_prob = 0
    rank_ens_y_test_pred_prob = 0
    for i in range(len(rank_ensemble_list)):
        # rank_ens_y_train_pred_prob += (i+1) * y_train_pred_prob_list[model_names_list.index(rank_ensemble_list[i][0])]
        rank_ens_y_test_pred_prob += (
            i+1) * y_test_pred_prob_list[model_names_list.index(rank_ensemble_list[i][0])]
    # rank_ens_y_train_pred_prob /= len(rank_ensemble_list) * (1 + len(rank_ensemble_list)) / 2
    rank_ens_y_test_pred_prob /= len(rank_ensemble_list) * \
        (1 + len(rank_ensemble_list)) / 2
    rank_ens_test_roc_auc = roc_auc_score(y_test, rank_ens_y_test_pred_prob)

    logging.info(f'Rank Ensemble: Test score = {rank_ens_test_roc_auc:.4f}')
    # logging.info(f'Rank Ensemble: Validation score = {rank_ens_y_train_pred_prob:.4f}, Test score = {rank_ens_y_test_pred_prob:.4f}')

    if rank_ens_test_roc_auc > best_score:
        best_score = rank_ens_test_roc_auc
        best_model_name = 'Rank Ensemble'

    logging.info('Getting Optimized Ensemble score')
    opt_dict, opt_test_roc_auc = get_optimized_ensemble(
        train_df,
        test_df,
        cv_fold_list,
        selected_features_dict,
        trained_models_dict,
        numerical_features
    )

    logging.info(f'Optimized Ensemble: Test score = {opt_test_roc_auc:.4f}')

    if opt_test_roc_auc > best_score:
        best_score = opt_test_roc_auc
        best_model_name = 'Optimized Ensemble'

    trained_models_dict['best_model_name'] = best_model_name

    logging.info(f'{best_model_name} is the best model')

    return (trained_models_dict, opt_dict)
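# Usage sketch for find_optimal_model (illustrative; the pickle path is a
# hypothetical example, pass whatever path the training pipeline produced):
#   trained_models_dict, opt_dict = find_optimal_model(
#       train_df=train_df,
#       test_df=test_df,
#       features_dict_path='artifacts/feature_selection_dict.pkl',
#       cv_fold_list=cv_fold_list,
#       numerical_features=numerical_features,
#   )
#   trained_models_dict['best_model_name']  # e.g. 'Optimized Ensemble'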
requirements.txt
ADDED
@@ -0,0 +1,160 @@
alembic==1.13.1
altair==5.3.0
aniso8601==9.0.1
annotated-types==0.6.0
anyio==4.3.0
appdirs==1.4.4
asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1698341106958/work
attrs==23.2.0
blinker==1.7.0
Boruta==0.3
BorutaShap==1.0.17
cachetools==5.3.3
certifi==2024.2.2
charset-normalizer==3.3.2
click==8.1.7
cloudpickle==3.0.0
colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1666700638685/work
comm @ file:///home/conda/feedstock_root/build_artifacts/comm_1710320294760/work
-e git+https://github.com/VaheC/CompanyBankruptcy.git@0c9aba9c454511775cdf83313b15ca93d56c3356#egg=CompanyBankruptcy
contourpy==1.2.1
cycler==0.12.1
debugpy @ file:///C:/b/abs_c0y1fjipt2/croot/debugpy_1690906864587/work
decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1641555617451/work
Deprecated==1.2.14
distro==1.9.0
dnspython==1.16.0
docker==7.1.0
dynaconf==3.2.5
ensure==1.0.2
entrypoints==0.4
et-xmlfile==1.1.0
evidently==0.4.22
exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1704921103267/work
executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1698579936712/work
Faker==25.2.0
filelock==3.14.0
Flask==3.0.3
fonttools==4.51.0
from-root==1.3.0
fsspec==2024.3.1
gitdb==4.0.11
GitPython==3.1.43
graphene==3.3
graphql-core==3.2.3
graphql-relay==3.2.0
greenlet==3.0.3
h11==0.14.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
idna==3.6
imbalanced-learn==0.12.2
imblearn==0.0
importlib-metadata==6.11.0
ipykernel @ file:///D:/bld/ipykernel_1708996677248/work
ipython @ file:///D:/bld/ipython_1709559926914/work
iterative-telemetry==0.0.8
itsdangerous==2.2.0
jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1696326070614/work
Jinja2==3.1.3
joblib==1.4.0
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
jupyter_client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1710255804825/work
jupyter_core @ file:///D:/bld/jupyter_core_1710257272359/work
kiwisolver==1.4.5
lightgbm==4.3.0
litestar==2.8.3
llvmlite==0.42.0
Mako==1.3.5
Markdown==3.6
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.8.4
matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1713250518406/work
mdurl==0.1.2
mlflow==2.13.0
msgspec==0.18.6
multidict==6.0.5
mypy-extensions==1.0.0
nest_asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1705850609492/work
nltk==3.8.1
numba==0.59.1
numpy==1.26.4
openpyxl==3.1.2
opentelemetry-api==1.24.0
opentelemetry-sdk==1.24.0
opentelemetry-semantic-conventions==0.45b0
packaging==23.2
pandas==2.2.1
parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1712320355065/work
patsy==0.5.6
pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work
pillow==10.3.0
platformdirs @ file:///home/conda/feedstock_root/build_artifacts/platformdirs_1706713388748/work
plotly==5.22.0
polyfactory==2.16.0
prompt-toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1702399386289/work
protobuf==4.25.3
psutil @ file:///C:/Windows/Temp/abs_b2c2fd7f-9fd5-4756-95ea-8aed74d0039flsd9qufz/croots/recipe/psutil_1656431277748/work
pure-eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1642875951954/work
pyarrow==15.0.2
pydantic==2.7.1
pydantic_core==2.18.2
pydeck==0.8.1b0
Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1700607939962/work
pymongo==4.7.2
pyparsing==3.1.2
python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1709299778482/work
python-dotenv==1.0.1
pytz==2024.1
pywin32==305.1
PyYAML==6.0.1
pyzmq @ file:///C:/b/abs_89aq69t0up/croot/pyzmq_1705605705281/work
querystring-parser==1.2.4
referencing==0.34.0
regex==2024.5.10
requests==2.31.0
rich==13.7.1
rich-click==1.8.1
rpds-py==0.18.0
scikit-learn==1.4.2
scipy==1.13.0
seaborn==0.13.2
shap==0.45.0
shellingham==1.5.4
six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work
slicer==0.0.7
smmap==5.0.1
sniffio==1.3.1
SQLAlchemy==2.0.30
sqlparse==0.5.0
stack-data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1669632077133/work
statsmodels==0.14.2
streamlit==1.28.0
tenacity==8.2.3
threadpoolctl==3.5.0
toml==0.10.2
toolz==0.12.1
tornado @ file:///D:/bld/tornado_1656937966227/work
tqdm==4.66.2
traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1713535121073/work
typer==0.12.3
typing-inspect==0.9.0
typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1712329955671/work
tzdata==2024.1
tzlocal==5.2
ujson==5.10.0
urllib3==2.2.1
uvicorn==0.29.0
validators==0.28.3
waitress==3.0.0
watchdog==4.0.0
watchfiles==0.21.0
wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1704731205417/work
websockets==12.0
Werkzeug==3.0.3
wrapt==1.16.0
xgboost==2.0.3
zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1695255097490/work