import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, roc_curve)
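
# Train several classifiers on the exported CSV data, print their train/test
# metrics, then serve a Streamlit app for interpreting the Decision Tree model
# (feature importance, decision rules, cohort analysis, prediction simulator).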
# Load the training data (the exported CSV contains a 'Target' column)
data = pd.read_csv('exported_named_train_good.csv')
X_train = data.drop("Target", axis=1).values
y_train = data['Target'].values

# Load the test data; y_test is needed for the test metrics below.
# NOTE: this assumes 'exported_named_test.csv' also contains the 'Target' column.
test_data = pd.read_csv('exported_named_test.csv')
X_test = test_data.drop("Target", axis=1).values
y_test = test_data['Target'].values
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boost": GradientBoostingClassifier()
}
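
# Fit each model and print its train/test performance; after this loop the
# entries in `models` hold trained estimators that the app below reuses.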
for name, model in models.items():
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    model_train_precision = precision_score(y_train, y_train_pred)
    model_train_recall = recall_score(y_train, y_train_pred)
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)

    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    model_test_precision = precision_score(y_test, y_test_pred)
    model_test_recall = recall_score(y_test, y_test_pred)
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred)

    print(name)
    print('Model performance for Training set')
    print('- Accuracy: {:.4f}'.format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))
    print('----------------------------------')
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
    print('=' * 35)
    print('\n')
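
# Streamlit app for interpreting the trained Decision Tree model.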
def load_model_and_data():
    # Here you would load your model and data.
    # For this example, we assume they are available as:
    # model = loaded_model
    # X = loaded_X
    # y = loaded_y
    # feature_names = X.columns
    model = models['Decision Tree']
    data = pd.read_csv('exported_named_train.csv')
    X = data.drop("Target", axis=1)
    y = data['Target']
    return model, X, y
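
# Note: everything above runs at module import, so Streamlit re-trains the
# models on every rerun of the script. A minimal sketch of how the loading
# step could be cached instead (assumes Streamlit >= 1.18, which provides
# st.cache_resource):
#
#   @st.cache_resource
#   def load_model_and_data_cached():
#       return load_model_and_data()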
def app():
    st.title("Interpréteur d'Arbre de Décision")

    # Sidebar for the analysis controls
    st.sidebar.header("Paramètres d'analyse")

    # Section 1: Global view of the model
    st.header("Vue globale du modèle")

    col1, col2 = st.columns(2)

    with col1:
        model, X, y = load_model_and_data()
        feature_names = X.columns

        st.subheader("Importance des caractéristiques")
        importance_plot = plt.figure(figsize=(10, 6))
        # Replace with your own features and their importances
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=True)
        plt.barh(feature_importance['feature'], feature_importance['importance'])
        st.pyplot(importance_plot)

    with col2:
        st.subheader("Statistiques du modèle")
        st.write(f"Profondeur de l'arbre: {model.get_depth()}")
        st.write(f"Nombre de feuilles: {model.get_n_leaves()}")
    # Section 2: Decision-rule explorer
    st.header("2. Explorateur de règles de décision")

    max_depth = st.slider("Profondeur maximale à afficher", 1, model.get_depth(), 3)
    tree_text = export_text(model, feature_names=list(feature_names), max_depth=max_depth)
    st.text(tree_text)
    # Section 3: Cohort analysis
    st.header("3. Analyse de cohortes")

    # Feature selection used to define the cohorts
    selected_features = st.multiselect(
        "Sélectionnez les caractéristiques pour définir les cohortes",
        feature_names,
        max_selections=2
    )

    if len(selected_features) > 0:
        # Build cohorts from the selected features: categorical or
        # low-cardinality features are used as-is, numeric features
        # are binned into quartiles.
        def create_cohorts(X, features):
            cohort_def = X[features].copy()
            for feat in features:
                if X[feat].dtype == 'object' or len(X[feat].unique()) < 10:
                    cohort_def[feat] = X[feat]
                else:
                    cohort_def[feat] = pd.qcut(X[feat], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
            return cohort_def.apply(lambda x: ' & '.join(x.astype(str)), axis=1)

        cohorts = create_cohorts(X, selected_features)

        # Prediction analysis per cohort
        cohort_analysis = pd.DataFrame({
            'Cohorte': cohorts,
            'Prédiction': model.predict(X)
        })

        cohort_stats = cohort_analysis.groupby('Cohorte')['Prédiction'].agg(['count', 'mean'])
        cohort_stats.columns = ['Nombre d\'observations', 'Taux de prédiction positive']

        st.write("Statistiques par cohorte:")
        st.dataframe(cohort_stats)

        # Cohort visualization
        cohort_viz = plt.figure(figsize=(10, 6))
        sns.barplot(data=cohort_analysis, x='Cohorte', y='Prédiction')
        plt.xticks(rotation=45)
        st.pyplot(cohort_viz)
    # Section 4: Prediction simulator
    st.header("4. Simulateur de prédictions")

    # Input widgets to enter a value for each feature
    input_values = {}
    for feature in feature_names:
        if X[feature].dtype == 'object':
            input_values[feature] = st.selectbox(
                f"Sélectionnez {feature}",
                options=X[feature].unique()
            )
        else:
            input_values[feature] = st.slider(
                f"Valeur pour {feature}",
                float(X[feature].min()),
                float(X[feature].max()),
                float(X[feature].mean())
            )
    if st.button("Prédire"):
        # Build a single-row DataFrame for the prediction
        input_df = pd.DataFrame([input_values])

        # Prediction
        prediction = model.predict_proba(input_df)

        # Display the result
        st.write("Probabilités prédites:")
        st.write({f"Classe {i}": f"{prob:.2%}" for i, prob in enumerate(prediction[0])})

        # Decision path for this prediction
        st.subheader("Chemin de décision")
        node_indicator = model.decision_path(input_df)
        leaf_id = model.apply(input_df)

        feature_names = list(feature_names)
        # Node ids visited by this single sample along its path through the tree
        node_index = node_indicator.indices[node_indicator.indptr[0]:node_indicator.indptr[1]]

        rules = []
        for node_id in node_index:
            if node_id != leaf_id[0]:
                threshold = model.tree_.threshold[node_id]
                feature = feature_names[model.tree_.feature[node_id]]
                if input_df.iloc[0][feature] <= threshold:
                    rules.append(f"{feature} <= {threshold:.2f}")
                else:
                    rules.append(f"{feature} > {threshold:.2f}")

        for rule in rules:
            st.write(rule)
if __name__ == "__main__":
    app()