matsammut committed on
Commit
79b1800
·
verified ·
1 Parent(s): 63ec3bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -7
app.py CHANGED
@@ -51,16 +51,29 @@ def cleaning_features(data):
51
  data = pca(data)
52
  return data
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def pca(data):
55
- encoder = OneHotEncoder(sparse_output=False)
56
- one_hot_encoded = encoder.fit_transform(data[['workclass', 'occupation']])
 
57
  encoded_columns_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())
58
- pca_net = PCA(n_components=10)
59
- pca_result_net = pca_net.fit_transform(encoded_columns_df)
60
- pca_columns = [f'pca_component_{i+1}' for i in range(10)]
61
  pca_df = pd.DataFrame(pca_result_net, columns=pca_columns)
62
- data = data.drop(columns=['workclass', 'occupation'], axis=1) #remove the original columns
63
- data = pd.concat([data, pca_df], axis=1)
64
  return data
65
 
66
  def hbdscan_tranform(df_transformed):
 
51
  data = pca(data)
52
  return data
53
 
54
+ # def pca(data):
55
+ # encoder = OneHotEncoder(sparse_output=False)
56
+ # one_hot_encoded = encoder.fit_transform(data[['workclass', 'occupation']])
57
+ # encoded_columns_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())
58
+ # pca_net = PCA(n_components=10)
59
+ # pca_result_net = pca_net.fit_transform(encoded_columns_df)
60
+ # pca_columns = [f'pca_component_{i+1}' for i in range(10)]
61
+ # pca_df = pd.DataFrame(pca_result_net, columns=pca_columns)
62
+ # data = data.drop(columns=['workclass', 'occupation'], axis=1) #remove the original columns
63
+ # data = pd.concat([data, pca_df], axis=1)
64
+ # return data
65
+
66
+
67
  def pca(data):
68
+ encoder = joblib.load('onehot_encoder.joblib')
69
+ pca_model = joblib.load('pca.joblib')
70
+ one_hot_encoded = encoder.transform(data[['workclass', 'occupation']])
71
  encoded_columns_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())
72
+ pca_result_net = pca_model.transform(encoded_columns_df)
73
+ pca_columns = [f'pca_component_{i+1}' for i in range(pca_model.n_components_)]
 
74
  pca_df = pd.DataFrame(pca_result_net, columns=pca_columns)
75
+ data = data.drop(columns=['workclass', 'occupation'], axis=1)
76
+ data = pd.concat([data, pca_df], axis=1)
77
  return data
78
 
79
  def hbdscan_tranform(df_transformed):