analist committed on
Commit
d0ec537
·
verified ·
1 Parent(s): 724de9b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +175 -183
app.py CHANGED
@@ -6,200 +6,191 @@ from sklearn.tree import plot_tree, export_text
6
  import seaborn as sns
7
  from sklearn.preprocessing import LabelEncoder
8
  from sklearn.ensemble import RandomForestClassifier
9
- from sklearn.tree import DecisionTreeClassifier, plot_tree
10
  from sklearn.ensemble import GradientBoostingClassifier
11
  from sklearn.linear_model import LogisticRegression
12
  from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
 
13
 
14
- data = pd.read_csv('exported_named_train_good.csv')
15
- data_test = pd.read_csv('exported_named_test_good.csv')
16
- X_train = data.drop("Target", axis=1).values
17
- y_train = data['Target'].values
18
-
19
- X_test = data_test.drop('Target', axis=1).values
20
- y_test = data_test['Target'].values
21
-
22
- models={
23
- "Logisitic Regression":LogisticRegression(),
24
- "Decision Tree":DecisionTreeClassifier(),
25
- "Random Forest":RandomForestClassifier(),
26
- "Gradient Boost":GradientBoostingClassifier()
27
- }
28
-
29
- for name, model in models.items():
30
-
31
- model.fit(X_train, y_train)
32
-
33
- # Make predictions
34
- y_train_pred = model.predict(X_train)
35
- y_test_pred = model.predict(X_test)
36
-
37
- # Training set performance
38
- model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
39
- model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
40
- model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
41
- model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
42
- model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)
43
-
44
- # Test set performance
45
- model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
46
- model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
47
- model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
48
- model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
49
- model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc
50
-
51
- print(name)
52
-
53
- print('Model performance for Training set')
54
- print("- Accuracy: {:.4f}".format(model_train_accuracy))
55
- print('- F1 score: {:.4f}'.format(model_train_f1))
56
 
57
- print('- Precision: {:.4f}'.format(model_train_precision))
58
- print('- Recall: {:.4f}'.format(model_train_recall))
59
- print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
 
 
 
61
 
 
 
 
 
 
 
62
 
63
- print('----------------------------------')
 
 
 
 
 
64
 
65
- print('Model performance for Test set')
66
- print('- Accuracy: {:.4f}'.format(model_test_accuracy))
67
- print('- F1 score: {:.4f}'.format(model_test_f1))
68
- print('- Precision: {:.4f}'.format(model_test_precision))
69
- print('- Recall: {:.4f}'.format(model_test_recall))
70
- print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
71
 
 
 
72
 
73
- print('='*35)
74
- print('\n')
75
-
76
- def load_model_and_data():
77
-
78
- model = models['Decision Tree']
79
- data = pd.read_csv('exported_named_train_good.csv')
80
- X = data.drop("Target", axis=1)
81
- y = data['Target']
82
-
83
- return model, X, y, X.columns
84
 
85
- import streamlit as st
86
- import pandas as pd
87
- import numpy as np
88
- import matplotlib.pyplot as plt
89
- from sklearn.tree import plot_tree, export_text
90
- import seaborn as sns
91
- from sklearn.preprocessing import LabelEncoder
92
- from dtreeviz import trees
93
-
94
 
95
  def app():
96
- st.title("Interpréteur d'Arbre de Décision")
97
 
98
- # Chargement du modèle et des données
99
- model, X, y, feature_names = load_model_and_data()
100
 
101
- if model is None:
102
- st.warning("Veuillez charger un modèle pour commencer.")
103
- return
 
104
 
105
- # Sidebar avec les sections
106
  st.sidebar.title("Navigation")
 
 
 
 
 
107
  page = st.sidebar.radio(
108
  "Sélectionnez une section",
109
- ["Vue globale du modèle",
110
- "Explorateur de règles",
111
- "Analyse de cohortes",
112
  "Simulateur de prédictions"]
113
  )
114
 
115
- # Vue globale du modèle
116
- if page == "Vue globale du modèle":
117
- st.header("Vue globale du modèle")
 
 
 
 
 
 
 
 
 
 
118
  col1, col2 = st.columns(2)
119
 
120
  with col1:
121
- st.subheader("Importance des caractéristiques")
122
- importance_plot = plt.figure(figsize=(10, 6))
123
- feature_importance = pd.DataFrame({
124
- 'feature': feature_names,
125
- 'importance': model.feature_importances_
126
- }).sort_values('importance', ascending=True)
127
- plt.barh(feature_importance['feature'], feature_importance['importance'])
128
- st.pyplot(importance_plot)
129
 
130
  with col2:
131
- st.subheader("Statistiques du modèle")
132
- st.write(f"Profondeur de l'arbre: {model.get_depth()}")
133
- st.write(f"Nombre de feuilles: {model.get_n_leaves()}")
134
 
135
- # Explorateur de règles
136
- elif page == "Explorateur de règles":
137
- st.header("Explorateur de règles de décision")
138
 
139
- viz_type = st.radio(
140
- "Type de visualisation",
141
- ["Texte", "Graphique interactif"]
142
- )
143
-
144
- max_depth = st.slider("Profondeur maximale à afficher", 1, model.get_depth(), 3)
 
 
 
 
 
 
145
 
146
- if viz_type == "Texte":
147
- tree_text = export_text(model, feature_names=list(feature_names), max_depth=max_depth)
148
- st.text(tree_text)
149
- else:
150
- # Création de la visualisation dtreeviz
151
- viz = dtreeviz(
152
- model,
153
- X,
154
- y,
155
- target_name="target",
156
- feature_names=list(feature_names),
157
- class_names=list(map(str, model.classes_)),
158
- max_depth=max_depth
159
- )
160
 
161
- # Sauvegarde temporaire et affichage
162
- st.set_option('deprecation.showPyplotGlobalUse', False)
163
- fig = viz.view()
164
  st.pyplot(fig)
165
 
166
- # Analyse de cohortes
167
- elif page == "Analyse de cohortes":
168
- st.header("Analyse de cohortes")
169
 
170
- selected_features = st.multiselect(
171
- "Sélectionnez les caractéristiques pour définir les cohortes",
172
- feature_names,
173
- max_selections=2
174
- )
175
 
176
- if len(selected_features) > 0:
177
- def create_cohorts(X, features):
178
- cohort_def = X[features].copy()
179
- for feat in features:
180
- if X[feat].dtype == 'object' or len(X[feat].unique()) < 10:
181
- cohort_def[feat] = X[feat]
182
- else:
183
- cohort_def[feat] = pd.qcut(X[feat], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
184
- return cohort_def.apply(lambda x: ' & '.join(x.astype(str)), axis=1)
185
-
186
- cohorts = create_cohorts(X, selected_features)
187
-
188
- cohort_analysis = pd.DataFrame({
189
- 'Cohorte': cohorts,
190
- 'Prédiction': model.predict(X)
191
- })
192
-
193
- cohort_stats = cohort_analysis.groupby('Cohorte')['Prédiction'].agg(['count', 'mean'])
194
- cohort_stats.columns = ['Nombre d\'observations', 'Taux de prédiction positive']
195
-
196
- st.write("Statistiques par cohorte:")
197
- st.dataframe(cohort_stats)
198
-
199
- cohort_viz = plt.figure(figsize=(10, 6))
200
- sns.barplot(data=cohort_analysis, x='Cohorte', y='Prédiction')
201
- plt.xticks(rotation=45)
202
- st.pyplot(cohort_viz)
203
 
204
  # Simulateur de prédictions
205
  else:
@@ -207,45 +198,46 @@ def app():
207
 
208
  input_values = {}
209
  for feature in feature_names:
210
- if X[feature].dtype == 'object':
211
  input_values[feature] = st.selectbox(
212
  f"Sélectionnez {feature}",
213
- options=X[feature].unique()
214
  )
215
  else:
216
  input_values[feature] = st.slider(
217
  f"Valeur pour {feature}",
218
- float(X[feature].min()),
219
- float(X[feature].max()),
220
- float(X[feature].mean())
221
  )
222
 
223
  if st.button("Prédire"):
224
  input_df = pd.DataFrame([input_values])
225
 
226
- prediction = model.predict_proba(input_df)
227
 
228
  st.write("Probabilités prédites:")
229
  st.write({f"Classe {i}": f"{prob:.2%}" for i, prob in enumerate(prediction[0])})
230
 
231
- st.subheader("Chemin de décision")
232
- node_indicator = model.decision_path(input_df)
233
- leaf_id = model.apply(input_df)
234
-
235
- node_index = node_indicator.indices[node_indicator.indptr[0]:node_indicator.indptr[1]]
236
-
237
- rules = []
238
- for node_id in node_index:
239
- if node_id != leaf_id[0]:
240
- threshold = model.tree_.threshold[node_id]
241
- feature = feature_names[model.tree_.feature[node_id]]
242
- if input_df.iloc[0][feature] <= threshold:
243
- rules.append(f"{feature} {threshold:.2f}")
244
- else:
245
- rules.append(f"{feature} > {threshold:.2f}")
246
-
247
- for rule in rules:
248
- st.write(rule)
 
249
 
250
  if __name__ == "__main__":
251
- app()
 
6
  import seaborn as sns
7
  from sklearn.preprocessing import LabelEncoder
8
  from sklearn.ensemble import RandomForestClassifier
9
+ from sklearn.tree import DecisionTreeClassifier
10
  from sklearn.ensemble import GradientBoostingClassifier
11
  from sklearn.linear_model import LogisticRegression
12
  from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
13
+ import shap
14
 
15
def load_data(train_path='exported_named_train_good.csv',
              test_path='exported_named_test_good.csv',
              target='Target'):
    """Load the train and test CSV files and separate features from the label.

    Args:
        train_path: Path of the CSV file holding the training rows.
        test_path: Path of the CSV file holding the test rows.
        target: Name of the label column expected in both files.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test, feature_names)``. The
        ``X_*`` items are DataFrames with the target column removed, the
        ``y_*`` items are the target Series, and ``feature_names`` is
        ``X_train.columns``.

    Raises:
        KeyError: If ``target`` is not a column of one of the two files.
    """
    train_frame = pd.read_csv(train_path)
    test_frame = pd.read_csv(test_path)

    # Fail early with a message naming the offending file instead of the
    # generic "not found in axis" error that DataFrame.drop would raise.
    for label, frame in (('training', train_frame), ('test', test_frame)):
        if target not in frame.columns:
            raise KeyError(f"Column {target!r} not found in {label} data")

    X_train = train_frame.drop(columns=target)
    y_train = train_frame[target]
    X_test = test_frame.drop(columns=target)
    y_test = test_frame[target]
    return X_train, y_train, X_test, y_test, X_train.columns
23
+
24
def _score_binary_classifier(model, X, y):
    """Return accuracy, F1, precision, recall and ROC AUC of a fitted model on (X, y)."""
    y_pred = model.predict(X)
    # ROC AUC is a ranking metric: it must be fed the positive-class score.
    # Feeding hard 0/1 predictions evaluates a single threshold only and
    # under-reports the model's real discriminative power.
    y_score = model.predict_proba(X)[:, 1]
    return {
        'accuracy': accuracy_score(y, y_pred),
        'f1': f1_score(y, y_pred, average='weighted'),
        'precision': precision_score(y, y_pred),
        'recall': recall_score(y, y_pred),
        'roc_auc': roc_auc_score(y, y_score),
    }


def train_models(X_train, y_train, X_test, y_test):
    """Fit the four candidate classifiers and score each on train and test data.

    The precision, recall and ROC AUC computations use scikit-learn's binary
    defaults, so the target is assumed to have exactly two classes.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        A dict keyed by model display name. Each value is a dict with the
        fitted estimator under ``'model'`` and the metric dicts under
        ``'train_metrics'`` and ``'test_metrics'`` (keys ``accuracy``, ``f1``,
        ``precision``, ``recall``, ``roc_auc``).
    """
    models = {
        "Logistic Regression": LogisticRegression(random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Gradient Boost": GradientBoostingClassifier(random_state=42),
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        results[name] = {
            'model': model,
            'train_metrics': _score_binary_classifier(model, X_train, y_train),
            'test_metrics': _score_binary_classifier(model, X_test, y_test),
        }

    return results
60
 
61
def plot_model_performance(results):
    """Draw side-by-side bar charts comparing every model on train and test metrics.

    Args:
        results: Mapping of model name to a dict that holds ``'train_metrics'``
            and ``'test_metrics'`` dicts, each keyed by metric name.

    Returns:
        The matplotlib figure containing the two panels (training on the left,
        test on the right), both with the y-axis fixed to [0, 1].
    """
    metric_names = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # One (results key, panel title) pair per subplot, in left-to-right order.
    panels = (
        ('train_metrics', 'Training Performance'),
        ('test_metrics', 'Test Performance'),
    )
    for ax, (split_key, panel_title) in zip(axes, panels):
        scores = {}
        for model_name, entry in results.items():
            scores[model_name] = [entry[split_key][name] for name in metric_names]
        # Rows are metrics, columns are models: one bar group per metric.
        frame = pd.DataFrame(scores, index=metric_names)
        frame.plot(kind='bar', ax=ax, title=panel_title)
        ax.set_ylim(0, 1)

    plt.tight_layout()
    return fig
 
 
 
 
81
 
82
def plot_feature_importance(model, feature_names, model_type):
    """Plot a horizontal bar chart of per-feature importance for a fitted model.

    Args:
        model: Fitted estimator. Tree-based types must expose
            ``feature_importances_``; logistic regression must expose ``coef_``.
        feature_names: Feature labels, in the same order as the model's inputs.
        model_type: Display name of the model; one of "Decision Tree",
            "Random Forest", "Gradient Boost" or "Logistic Regression".

    Returns:
        The matplotlib figure holding the chart, with bars sorted so the most
        important feature is at the top.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Resolve the importance vector before allocating a figure, so that an
    # unsupported model type neither leaks a figure nor fails later with an
    # unbound local variable.
    if model_type in ("Decision Tree", "Random Forest", "Gradient Boost"):
        importance = model.feature_importances_
    elif model_type == "Logistic Regression":
        # Coefficient magnitude is used as the importance proxy; only the
        # first row of coef_ is read.
        importance = np.abs(model.coef_[0])
    else:
        raise ValueError(f"Unsupported model type: {model_type!r}")

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importance,
    }).sort_values('importance', ascending=True)

    # Draw on an explicit figure/axes pair instead of pyplot's implicit
    # "current figure" state.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(importance_df['feature'], importance_df['importance'])
    ax.set_title(f"Feature Importance - {model_type}")
    return fig
 
98
 
99
  def app():
100
+ st.title("Interpréteur de Modèles ML")
101
 
102
+ # Load data
103
+ X_train, y_train, X_test, y_test, feature_names = load_data()
104
 
105
+ # Train models if not in session state
106
+ if 'model_results' not in st.session_state:
107
+ with st.spinner("Entraînement des modèles en cours..."):
108
+ st.session_state.model_results = train_models(X_train, y_train, X_test, y_test)
109
 
110
+ # Sidebar
111
  st.sidebar.title("Navigation")
112
+ selected_model = st.sidebar.selectbox(
113
+ "Sélectionnez un modèle",
114
+ list(st.session_state.model_results.keys())
115
+ )
116
+
117
  page = st.sidebar.radio(
118
  "Sélectionnez une section",
119
+ ["Performance des modèles",
120
+ "Interprétation du modèle",
121
+ "Analyse des caractéristiques",
122
  "Simulateur de prédictions"]
123
  )
124
 
125
+ current_model = st.session_state.model_results[selected_model]['model']
126
+
127
+ # Performance des modèles
128
+ if page == "Performance des modèles":
129
+ st.header("Performance des modèles")
130
+
131
+ # Plot global performance comparison
132
+ st.subheader("Comparaison des performances")
133
+ performance_fig = plot_model_performance(st.session_state.model_results)
134
+ st.pyplot(performance_fig)
135
+
136
+ # Detailed metrics for selected model
137
+ st.subheader(f"Métriques détaillées - {selected_model}")
138
  col1, col2 = st.columns(2)
139
 
140
  with col1:
141
+ st.write("Métriques d'entraînement:")
142
+ for metric, value in st.session_state.model_results[selected_model]['train_metrics'].items():
143
+ st.write(f"{metric}: {value:.4f}")
 
 
 
 
 
144
 
145
  with col2:
146
+ st.write("Métriques de test:")
147
+ for metric, value in st.session_state.model_results[selected_model]['test_metrics'].items():
148
+ st.write(f"{metric}: {value:.4f}")
149
 
150
+ # Interprétation du modèle
151
+ elif page == "Interprétation du modèle":
152
+ st.header(f"Interprétation du modèle - {selected_model}")
153
 
154
+ if selected_model in ["Decision Tree", "Random Forest"]:
155
+ if selected_model == "Decision Tree":
156
+ st.subheader("Visualisation de l'arbre")
157
+ max_depth = st.slider("Profondeur maximale à afficher", 1, 5, 3)
158
+ fig, ax = plt.subplots(figsize=(20, 10))
159
+ plot_tree(current_model, feature_names=list(feature_names),
160
+ max_depth=max_depth, filled=True, rounded=True)
161
+ st.pyplot(fig)
162
+
163
+ st.subheader("Règles de décision importantes")
164
+ if selected_model == "Decision Tree":
165
+ st.text(export_text(current_model, feature_names=list(feature_names)))
166
 
167
+ # SHAP values for all models
168
+ st.subheader("SHAP Values")
169
+ with st.spinner("Calcul des valeurs SHAP en cours..."):
170
+ explainer = shap.TreeExplainer(current_model) if selected_model != "Logistic Regression" \
171
+ else shap.LinearExplainer(current_model, X_train)
172
+ shap_values = explainer.shap_values(X_train[:100]) # Using first 100 samples for speed
 
 
 
 
 
 
 
 
173
 
174
+ fig, ax = plt.subplots(figsize=(10, 6))
175
+ shap.summary_plot(shap_values, X_train[:100], feature_names=list(feature_names),
176
+ show=False)
177
  st.pyplot(fig)
178
 
179
+ # Analyse des caractéristiques
180
+ elif page == "Analyse des caractéristiques":
181
+ st.header("Analyse des caractéristiques")
182
 
183
+ # Feature importance
184
+ st.subheader("Importance des caractéristiques")
185
+ importance_fig = plot_feature_importance(current_model, feature_names, selected_model)
186
+ st.pyplot(importance_fig)
 
187
 
188
+ # Feature correlation
189
+ st.subheader("Matrice de corrélation")
190
+ corr_matrix = X_train.corr()
191
+ fig, ax = plt.subplots(figsize=(10, 8))
192
+ sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
193
+ st.pyplot(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
  # Simulateur de prédictions
196
  else:
 
198
 
199
  input_values = {}
200
  for feature in feature_names:
201
+ if X_train[feature].dtype == 'object':
202
  input_values[feature] = st.selectbox(
203
  f"Sélectionnez {feature}",
204
+ options=X_train[feature].unique()
205
  )
206
  else:
207
  input_values[feature] = st.slider(
208
  f"Valeur pour {feature}",
209
+ float(X_train[feature].min()),
210
+ float(X_train[feature].max()),
211
+ float(X_train[feature].mean())
212
  )
213
 
214
  if st.button("Prédire"):
215
  input_df = pd.DataFrame([input_values])
216
 
217
+ prediction = current_model.predict_proba(input_df)
218
 
219
  st.write("Probabilités prédites:")
220
  st.write({f"Classe {i}": f"{prob:.2%}" for i, prob in enumerate(prediction[0])})
221
 
222
+ if selected_model == "Decision Tree":
223
+ st.subheader("Chemin de décision")
224
+ node_indicator = current_model.decision_path(input_df)
225
+ leaf_id = current_model.apply(input_df)
226
+
227
+ node_index = node_indicator.indices[node_indicator.indptr[0]:node_indicator.indptr[1]]
228
+
229
+ rules = []
230
+ for node_id in node_index:
231
+ if node_id != leaf_id[0]:
232
+ threshold = current_model.tree_.threshold[node_id]
233
+ feature = feature_names[current_model.tree_.feature[node_id]]
234
+ if input_df.iloc[0][feature] <= threshold:
235
+ rules.append(f"{feature} ≤ {threshold:.2f}")
236
+ else:
237
+ rules.append(f"{feature} > {threshold:.2f}")
238
+
239
+ for rule in rules:
240
+ st.write(rule)
241
 
242
  if __name__ == "__main__":
243
+ app()