analist committed on
Commit
00994c4
·
verified ·
1 Parent(s): fc8d6e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +203 -227
app.py CHANGED
@@ -2,266 +2,242 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import matplotlib.pyplot as plt
 
5
  import seaborn as sns
6
  from sklearn.preprocessing import LabelEncoder
7
  from sklearn.ensemble import RandomForestClassifier
8
  from sklearn.tree import DecisionTreeClassifier
9
  from sklearn.ensemble import GradientBoostingClassifier
10
  from sklearn.linear_model import LogisticRegression
11
- from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
12
- import plotly.express as px
13
- import plotly.graph_objects as go
14
 
15
def load_data(train_path='exported_named_train_good.csv',
              test_path='exported_named_test_good.csv'):
    """Load the train/test CSV files and split them into features and target.

    The file paths were previously hard-coded; they are now parameters with
    the original names as defaults, so existing ``load_data()`` callers are
    unaffected.

    Args:
        train_path: CSV file with the training rows (must contain a
            "Target" column).
        test_path: CSV file with the test rows (same schema).

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, feature_names)`` where
        ``feature_names`` is the training feature column index.
    """
    data = pd.read_csv(train_path)
    data_test = pd.read_csv(test_path)
    X_train = data.drop("Target", axis=1)
    y_train = data['Target']
    X_test = data_test.drop('Target', axis=1)
    y_test = data_test['Target']
    return X_train, y_train, X_test, y_test, X_train.columns
23
 
24
def train_models(X_train, y_train, X_test, y_test):
    """Fit four baseline classifiers and collect their train/test metrics.

    Args:
        X_train, y_train: training features/labels.
        X_test, y_test: held-out features/labels.

    Returns:
        Dict keyed by model name; each value holds the fitted ``model`` plus
        ``train_metrics`` and ``test_metrics`` dicts (accuracy, f1, precision,
        recall, roc_auc).
    """
    models = {
        "Logistic Regression": LogisticRegression(random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Gradient Boost": GradientBoostingClassifier(random_state=42)
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)

        # Hard-label predictions for the threshold-based metrics.
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # BUG FIX: ROC-AUC must be computed from probability scores, not
        # hard 0/1 predictions. With hard labels it collapses to balanced
        # accuracy and systematically understates ranking quality. All four
        # model classes used here expose predict_proba.
        # NOTE(review): [:, 1] assumes a binary target — confirm.
        y_train_score = model.predict_proba(X_train)[:, 1]
        y_test_score = model.predict_proba(X_test)[:, 1]

        results[name] = {
            'model': model,
            'train_metrics': {
                'accuracy': accuracy_score(y_train, y_train_pred),
                'f1': f1_score(y_train, y_train_pred, average='weighted'),
                'precision': precision_score(y_train, y_train_pred),
                'recall': recall_score(y_train, y_train_pred),
                'roc_auc': roc_auc_score(y_train, y_train_score)
            },
            'test_metrics': {
                'accuracy': accuracy_score(y_test, y_test_pred),
                'f1': f1_score(y_test, y_test_pred, average='weighted'),
                'precision': precision_score(y_test, y_test_pred),
                'recall': recall_score(y_test, y_test_pred),
                'roc_auc': roc_auc_score(y_test, y_test_score)
            }
        }

    return results
60
 
61
def plot_model_performance(results):
    """Draw side-by-side bar charts of train and test metrics per model.

    Args:
        results: mapping of model name to a dict holding 'train_metrics'
            and 'test_metrics' sub-dicts, as produced by ``train_models``.

    Returns:
        The matplotlib Figure containing both charts.
    """
    metrics = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # One panel per split; same construction for both.
    panels = (
        (axes[0], 'train_metrics', 'Training Performance'),
        (axes[1], 'test_metrics', 'Test Performance'),
    )
    for ax, split, title in panels:
        frame = pd.DataFrame(
            {name: [scores[split][m] for m in metrics]
             for name, scores in results.items()},
            index=metrics,
        )
        frame.plot(kind='bar', ax=ax, title=title)
        ax.set_ylim(0, 1)

    plt.tight_layout()
    return fig
81
 
82
def plot_feature_importance(model, feature_names, model_type):
    """Horizontal bar chart of feature importances for one fitted model.

    Args:
        model: fitted estimator.
        feature_names: iterable of feature column names.
        model_type: one of "Decision Tree", "Random Forest",
            "Gradient Boost", "Logistic Regression".

    Returns:
        The current matplotlib Figure.

    Raises:
        ValueError: if ``model_type`` is not a supported name. Previously an
            unknown name fell through both branches and crashed later with
            an unrelated ``NameError`` on ``importance``.
    """
    plt.figure(figsize=(10, 6))

    if model_type in ["Decision Tree", "Random Forest", "Gradient Boost"]:
        importance = model.feature_importances_
    elif model_type == "Logistic Regression":
        # Coefficient magnitude as a linear-model importance proxy.
        importance = np.abs(model.coef_[0])
    else:
        raise ValueError(f"Unsupported model_type: {model_type!r}")

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    }).sort_values('importance', ascending=True)

    plt.barh(importance_df['feature'], importance_df['importance'])
    plt.title(f"Feature Importance - {model_type}")
    return plt.gcf()
98
-
99
-
100
# NOTE(review): the imports below duplicate imports already present at the
# top of the module (streamlit, pandas, numpy, matplotlib, sklearn, seaborn).
# They are harmless but redundant — per PEP 8 imports belong once, at the top.
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
import seaborn as sns

# Page configuration: wide layout and browser-tab title.
st.set_page_config(layout="wide", page_title="ML Dashboard")

# Custom CSS injected into the page.
# NOTE(review): the div.css-* selectors target auto-generated Streamlit class
# names that change between Streamlit releases — confirm they still match the
# deployed version.
st.markdown("""
<style>
    /* Cartes stylisées */
    div.css-1r6slb0.e1tzin5v2 {
        background-color: #FFFFFF;
        border: 1px solid #EEEEEE;
        padding: 1.5rem;
        border-radius: 10px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }

    /* Headers */
    .main-header {
        font-size: 2rem;
        font-weight: 700;
        color: #1E88E5;
        text-align: center;
        margin-bottom: 2rem;
    }

    /* Metric containers */
    div.css-12w0qpk.e1tzin5v2 {
        background-color: #F8F9FA;
        padding: 1rem;
        border-radius: 8px;
        text-align: center;
    }

    /* Metric values */
    div.css-1xarl3l.e16fv1kl1 {
        font-size: 1.8rem;
        font-weight: 700;
        color: #1E88E5;
    }
</style>
""", unsafe_allow_html=True)
150
-
151
def plot_performance_comparison(results, metric='test_metrics'):
    """Grouped bar chart comparing all models on one metric split.

    Args:
        results: dict of model name -> result dict from ``train_models``.
        metric: which split to plot, 'test_metrics' or 'train_metrics'.

    Returns:
        The matplotlib Figure.
    """
    metrics = ['accuracy', 'f1', 'recall', 'roc_auc']
    model_names = list(results.keys())

    # One distinct color per model.
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

    data = {model: [results[model][metric][m] for m in metrics]
            for model in model_names}

    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(len(metrics))
    width = 0.2

    for i, (model, values) in enumerate(data.items()):
        # FIX: cycle the palette with modulo so adding a fifth model no
        # longer raises IndexError on the hard-coded 4-color list.
        ax.bar(x + i*width, values, width, label=model,
               color=colors[i % len(colors)])

    ax.set_ylabel('Score')
    ax.set_title(f'Comparaison des performances ({metric.split("_")[0].title()})')
    ax.set_xticks(x + width * (len(model_names)-1)/2)
    ax.set_xticklabels(metrics)
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.ylim(0, 1)

    return fig
178
-
179
def create_metric_card(title, value):
    """Render a styled metric card: white box, centered, blue 3-decimal value.

    Args:
        title: label shown above the value.
        value: numeric value; formatted with ``:.3f`` so it must support
            float formatting.
    """
    # Raw HTML card; unsafe_allow_html is required for inline styling.
    st.markdown(f"""
    <div style="
        background-color: white;
        padding: 1rem;
        border-radius: 8px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        text-align: center;
        margin-bottom: 1rem;
    ">
        <h3 style="color: #666; font-size: 1rem; margin-bottom: 0.5rem;">{title}</h3>
        <p style="color: #1E88E5; font-size: 1.8rem; font-weight: bold; margin: 0;">{value:.3f}</p>
    </div>
    """, unsafe_allow_html=True)
194
 
195
def app():
    """Streamlit entry point: two-column ML dashboard.

    Left column: tabbed test/train performance comparison charts.
    Right column: metric cards for the sidebar-selected model.
    Bottom row: feature importance and a feature correlation heatmap.
    Models are trained once per session and cached in
    ``st.session_state.model_results``.
    """
    # Page header (styled by the .main-header CSS class injected above).
    st.markdown('<h1 class="main-header">Tableau de Bord ML</h1>', unsafe_allow_html=True)

    # Load and prepare the data.
    X_train, y_train, X_test, y_test, feature_names = load_data()

    # Sidebar: model selection.
    with st.sidebar:
        st.markdown('<h2 style="color: #1E88E5;">Configuration</h2>', unsafe_allow_html=True)
        selected_model = st.selectbox(
            "Sélectionner un modèle",
            ["Logistic Regression", "Decision Tree", "Random Forest", "Gradient Boost"]
        )

    # Train the models only once per session.
    if 'model_results' not in st.session_state:
        with st.spinner("Entraînement des modèles..."):
            st.session_state.model_results = train_models(X_train, y_train, X_test, y_test)

    # Main layout: wide chart column + narrow metrics column.
    col1, col2 = st.columns([2, 1])

    with col1:
        # Performance comparison charts.
        st.markdown("### 📊 Comparaison des Performances")

        tab1, tab2 = st.tabs(["🎯 Test", "📈 Entraînement"])

        with tab1:
            fig_test = plot_performance_comparison(st.session_state.model_results, 'test_metrics')
            st.pyplot(fig_test)

        with tab2:
            fig_train = plot_performance_comparison(st.session_state.model_results, 'train_metrics')
            st.pyplot(fig_train)

    with col2:
        # Detailed metrics for the selected model.
        st.markdown(f"### 📌 Métriques - {selected_model}")

        metrics = st.session_state.model_results[selected_model]['test_metrics']
        for metric, value in metrics.items():
            if metric != 'precision':  # precision deliberately excluded from cards
                create_metric_card(metric.upper(), value)

    # Lower section: detailed analysis.
    st.markdown("### 🔍 Analyse Détaillée")
    col3, col4 = st.columns(2)

    with col3:
        # Feature importance (tree importances, or |coef| for logistic).
        current_model = st.session_state.model_results[selected_model]['model']
        if hasattr(current_model, 'feature_importances_') or hasattr(current_model, 'coef_'):
            fig_importance = plt.figure(figsize=(10, 6))
            if hasattr(current_model, 'feature_importances_'):
                importances = current_model.feature_importances_
            else:
                importances = np.abs(current_model.coef_[0])

            plt.barh(feature_names, importances)
            plt.title("Importance des Caractéristiques")
            st.pyplot(fig_importance)

    with col4:
        # Feature correlation heatmap on the training features.
        fig_corr = plt.figure(figsize=(10, 8))
        sns.heatmap(X_train.corr(), annot=True, cmap='coolwarm', center=0)
        plt.title("Matrice de Corrélation")
        st.pyplot(fig_corr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
# Script entry point: launch the Streamlit dashboard.
if __name__ == "__main__":
    app()
 
2
  import pandas as pd
3
  import numpy as np
4
  import matplotlib.pyplot as plt
5
+ from sklearn.tree import plot_tree, export_text
6
  import seaborn as sns
7
  from sklearn.preprocessing import LabelEncoder
8
  from sklearn.ensemble import RandomForestClassifier
9
  from sklearn.tree import DecisionTreeClassifier
10
  from sklearn.ensemble import GradientBoostingClassifier
11
  from sklearn.linear_model import LogisticRegression
12
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
13
+ import shap
 
14
 
15
def load_data(train_path='exported_named_train_good.csv',
              test_path='exported_named_test_good.csv'):
    """Load the train/test CSV files and split them into features and target.

    The file paths were previously hard-coded; they are now parameters with
    the original names as defaults, so existing ``load_data()`` callers are
    unaffected.

    Args:
        train_path: CSV file with the training rows (must contain a
            "Target" column).
        test_path: CSV file with the test rows (same schema).

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, feature_names)`` where
        ``feature_names`` is the training feature column index.
    """
    data = pd.read_csv(train_path)
    data_test = pd.read_csv(test_path)
    X_train = data.drop("Target", axis=1)
    y_train = data['Target']
    X_test = data_test.drop('Target', axis=1)
    y_test = data_test['Target']
    return X_train, y_train, X_test, y_test, X_train.columns
23
 
24
def train_models(X_train, y_train, X_test, y_test):
    """Fit four baseline classifiers and collect their train/test metrics.

    Args:
        X_train, y_train: training features/labels.
        X_test, y_test: held-out features/labels.

    Returns:
        Dict keyed by model name; each value holds the fitted ``model`` plus
        ``train_metrics`` and ``test_metrics`` dicts (accuracy, f1, precision,
        recall, roc_auc).
    """
    models = {
        "Logistic Regression": LogisticRegression(random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Gradient Boost": GradientBoostingClassifier(random_state=42)
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)

        # Hard-label predictions for the threshold-based metrics.
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # BUG FIX: ROC-AUC must be computed from probability scores, not
        # hard 0/1 predictions. With hard labels it collapses to balanced
        # accuracy and systematically understates ranking quality. All four
        # model classes used here expose predict_proba.
        # NOTE(review): [:, 1] assumes a binary target — confirm.
        y_train_score = model.predict_proba(X_train)[:, 1]
        y_test_score = model.predict_proba(X_test)[:, 1]

        results[name] = {
            'model': model,
            'train_metrics': {
                'accuracy': accuracy_score(y_train, y_train_pred),
                'f1': f1_score(y_train, y_train_pred, average='weighted'),
                'precision': precision_score(y_train, y_train_pred),
                'recall': recall_score(y_train, y_train_pred),
                'roc_auc': roc_auc_score(y_train, y_train_score)
            },
            'test_metrics': {
                'accuracy': accuracy_score(y_test, y_test_pred),
                'f1': f1_score(y_test, y_test_pred, average='weighted'),
                'precision': precision_score(y_test, y_test_pred),
                'recall': recall_score(y_test, y_test_pred),
                'roc_auc': roc_auc_score(y_test, y_test_score)
            }
        }

    return results
60
 
61
def plot_model_performance(results):
    """Draw side-by-side bar charts of train and test metrics per model.

    Args:
        results: mapping of model name to a dict holding 'train_metrics'
            and 'test_metrics' sub-dicts, as produced by ``train_models``.

    Returns:
        The matplotlib Figure containing both charts.
    """
    metrics = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # One panel per split; same construction for both.
    panels = (
        (axes[0], 'train_metrics', 'Training Performance'),
        (axes[1], 'test_metrics', 'Test Performance'),
    )
    for ax, split, title in panels:
        frame = pd.DataFrame(
            {name: [scores[split][m] for m in metrics]
             for name, scores in results.items()},
            index=metrics,
        )
        frame.plot(kind='bar', ax=ax, title=title)
        ax.set_ylim(0, 1)

    plt.tight_layout()
    return fig
81
 
82
def plot_feature_importance(model, feature_names, model_type):
    """Horizontal bar chart of feature importances for one fitted model.

    Args:
        model: fitted estimator.
        feature_names: iterable of feature column names.
        model_type: one of "Decision Tree", "Random Forest",
            "Gradient Boost", "Logistic Regression".

    Returns:
        The current matplotlib Figure.

    Raises:
        ValueError: if ``model_type`` is not a supported name. Previously an
            unknown name fell through both branches and crashed later with
            an unrelated ``NameError`` on ``importance``.
    """
    plt.figure(figsize=(10, 6))

    if model_type in ["Decision Tree", "Random Forest", "Gradient Boost"]:
        importance = model.feature_importances_
    elif model_type == "Logistic Regression":
        # Coefficient magnitude as a linear-model importance proxy.
        importance = np.abs(model.coef_[0])
    else:
        raise ValueError(f"Unsupported model_type: {model_type!r}")

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    }).sort_values('importance', ascending=True)

    plt.barh(importance_df['feature'], importance_df['importance'])
    plt.title(f"Feature Importance - {model_type}")
    return plt.gcf()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
def app():
    """Streamlit entry point: model-interpretation dashboard.

    Four sections, chosen by a sidebar radio: model performance, model
    interpretation (tree plot / decision rules / SHAP), feature analysis
    (importance + correlation), and a prediction simulator. Trained models
    are cached in ``st.session_state.model_results`` so they are fit only
    once per session.
    """
    st.title("Interpréteur de Modèles ML")

    # Load data.
    X_train, y_train, X_test, y_test, feature_names = load_data()

    # Train models once per session (cached in session state).
    if 'model_results' not in st.session_state:
        with st.spinner("Entraînement des modèles en cours..."):
            st.session_state.model_results = train_models(X_train, y_train, X_test, y_test)

    # Sidebar: model and section selection.
    st.sidebar.title("Navigation")
    selected_model = st.sidebar.selectbox(
        "Sélectionnez un modèle",
        list(st.session_state.model_results.keys())
    )

    page = st.sidebar.radio(
        "Sélectionnez une section",
        ["Performance des modèles",
         "Interprétation du modèle",
         "Analyse des caractéristiques",
         "Simulateur de prédictions"]
    )

    current_model = st.session_state.model_results[selected_model]['model']

    # --- Section 1: model performance ---
    if page == "Performance des modèles":
        st.header("Performance des modèles")

        # Global comparison chart for all models.
        st.subheader("Comparaison des performances")
        performance_fig = plot_model_performance(st.session_state.model_results)
        st.pyplot(performance_fig)

        # Detailed train/test metrics for the selected model.
        st.subheader(f"Métriques détaillées - {selected_model}")
        col1, col2 = st.columns(2)

        with col1:
            st.write("Métriques d'entraînement:")
            for metric, value in st.session_state.model_results[selected_model]['train_metrics'].items():
                st.write(f"{metric}: {value:.4f}")

        with col2:
            st.write("Métriques de test:")
            for metric, value in st.session_state.model_results[selected_model]['test_metrics'].items():
                st.write(f"{metric}: {value:.4f}")

    # --- Section 2: model interpretation ---
    elif page == "Interprétation du modèle":
        st.header(f"Interprétation du modèle - {selected_model}")

        if selected_model in ["Decision Tree", "Random Forest"]:
            if selected_model == "Decision Tree":
                st.subheader("Visualisation de l'arbre")
                max_depth = st.slider("Profondeur maximale à afficher", 1, 5, 3)
                fig, ax = plt.subplots(figsize=(20, 10))
                plot_tree(current_model, feature_names=list(feature_names),
                          max_depth=max_depth, filled=True, rounded=True)
                st.pyplot(fig)

            st.subheader("Règles de décision importantes")
            if selected_model == "Decision Tree":
                st.text(export_text(current_model, feature_names=list(feature_names)))

        # SHAP values for all models.
        # NOTE(review): TreeExplainer is used for every non-logistic model —
        # confirm the installed shap version supports
        # GradientBoostingClassifier through this path.
        st.subheader("SHAP Values")
        with st.spinner("Calcul des valeurs SHAP en cours..."):
            explainer = shap.TreeExplainer(current_model) if selected_model != "Logistic Regression" \
                else shap.LinearExplainer(current_model, X_train)
            shap_values = explainer.shap_values(X_train[:100])  # first 100 rows, for speed

            fig, ax = plt.subplots(figsize=(10, 6))
            shap.summary_plot(shap_values, X_train[:100], feature_names=list(feature_names),
                              show=False)
            st.pyplot(fig)

    # --- Section 3: feature analysis ---
    elif page == "Analyse des caractéristiques":
        st.header("Analyse des caractéristiques")

        # Importance of each feature for the selected model.
        st.subheader("Importance des caractéristiques")
        importance_fig = plot_feature_importance(current_model, feature_names, selected_model)
        st.pyplot(importance_fig)

        # Pairwise correlation of the training features.
        st.subheader("Matrice de corrélation")
        corr_matrix = X_train.corr()
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
        st.pyplot(fig)

    # --- Section 4: prediction simulator ---
    else:
        st.header("Simulateur de prédictions")

        # One input widget per feature: a selectbox for object-dtype
        # (categorical) columns, otherwise a slider bounded by the
        # observed training range and defaulting to the mean.
        input_values = {}
        for feature in feature_names:
            if X_train[feature].dtype == 'object':
                input_values[feature] = st.selectbox(
                    f"Sélectionnez {feature}",
                    options=X_train[feature].unique()
                )
            else:
                input_values[feature] = st.slider(
                    f"Valeur pour {feature}",
                    float(X_train[feature].min()),
                    float(X_train[feature].max()),
                    float(X_train[feature].mean())
                )

        if st.button("Prédire"):
            input_df = pd.DataFrame([input_values])

            prediction = current_model.predict_proba(input_df)

            st.write("Probabilités prédites:")
            st.write({f"Classe {i}": f"{prob:.2%}" for i, prob in enumerate(prediction[0])})

            # For a decision tree, also display the path the sample takes.
            if selected_model == "Decision Tree":
                st.subheader("Chemin de décision")
                node_indicator = current_model.decision_path(input_df)
                leaf_id = current_model.apply(input_df)

                # Indices of the nodes visited by the (single) input row.
                node_index = node_indicator.indices[node_indicator.indptr[0]:node_indicator.indptr[1]]

                rules = []
                for node_id in node_index:
                    if node_id != leaf_id[0]:  # skip the terminal leaf
                        threshold = current_model.tree_.threshold[node_id]
                        feature = feature_names[current_model.tree_.feature[node_id]]
                        if input_df.iloc[0][feature] <= threshold:
                            rules.append(f"{feature} ≤ {threshold:.2f}")
                        else:
                            rules.append(f"{feature} > {threshold:.2f}")

                for rule in rules:
                    st.write(rule)
241
 
242
# Script entry point: launch the Streamlit dashboard.
if __name__ == "__main__":
    app()