Canstralian committed on
Commit 7a3ab23 · verified · 1 Parent(s): 1aa27e7

Update app.py

Files changed (1)
  1. app.py +117 -357
app.py CHANGED
@@ -1,370 +1,130 @@
  import streamlit as st
  import pandas as pd
- import numpy as np
- from data_processing import DataProcessor
- from model_training import ModelTrainer
- from visualizations import Visualizer
- from utils import load_data, get_feature_names, save_model, load_saved_model, list_saved_models
- import warnings
- import re
- from typing import Optional
  from datasets import load_dataset
- from huggingface_hub import list_datasets
- import traceback
-
- warnings.filterwarnings('ignore')
-
- st.set_page_config(
-     page_title="ML Pipeline for Purple Teaming",
-     page_icon="🛡️",
-     layout="wide"
- )
-
- def validate_model_name(name: Optional[str]) -> str:
-     """Validate and sanitize model name"""
-     if not name:
-         return f"model_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}"
-     sanitized = re.sub(r'[^\w\-]', '_', name)
-     return sanitized
-
- def load_hf_dataset(dataset_name: str, config_name: Optional[str] = None) -> pd.DataFrame:
-     """Load a dataset from Hugging Face and convert to pandas DataFrame"""
-     try:
-         if config_name:
-             dataset = load_dataset(dataset_name, config_name)
          else:
-             dataset = load_dataset(dataset_name)
-
-         # Convert to pandas DataFrame (using first split, usually 'train')
-         split_name = list(dataset.keys())[0]
-         df = dataset[split_name].to_pandas()
-         return df
-     except Exception as e:
-         raise Exception(f"Error loading dataset from Hugging Face: {str(e)}\n{traceback.format_exc()}")
-
- def main():
-     st.title("🛡️ ML Pipeline for Cybersecurity Purple Teaming")
-
-     # Initialize default values for feature engineering
-     if 'poly_degree' not in st.session_state:
-         st.session_state.poly_degree = 2
-     if 'k_best_features' not in st.session_state:
-         st.session_state.k_best_features = 10
-     if 'n_components' not in st.session_state:
-         st.session_state.n_components = 0.95
-
-     # Sidebar
-     st.sidebar.header("Pipeline Configuration")
-
-     # Data Input Tabs
-     data_input_tab = st.radio(
-         "Choose Data Source",
-         ["Upload File", "Load from Hugging Face"]
-     )
-
-     df = None
-
-     if data_input_tab == "Upload File":
-         uploaded_file = st.file_uploader(
-             "Upload Dataset (CSV/JSON)",
-             type=['csv', 'json']
-         )
-         if uploaded_file is not None:
-             try:
-                 df = load_data(uploaded_file)
-             except Exception as e:
-                 st.error(f"Error loading file: {str(e)}")
-     else:
-         # Hugging Face Dataset Loading
-         st.markdown("### Load Dataset from Hugging Face")
-         dataset_name = st.text_input(
-             "Dataset Name",
-             help="Enter the Hugging Face dataset name (e.g., 'username/dataset-name')"
-         )
-         config_name = st.text_input(
-             "Configuration Name (Optional)",
-             help="Enter the specific configuration name if the dataset has multiple configurations"
-         )
-
-         if dataset_name:
-             try:
-                 with st.spinner("Loading dataset from Hugging Face..."):
-                     df = load_hf_dataset(
-                         dataset_name,
-                         config_name if config_name else None
-                     )
-                 st.success(f"Successfully loaded dataset: {dataset_name}")
-             except Exception as e:
-                 st.error(str(e))
-
-     if df is not None:
          try:
-             # Validate data
-             if df.empty:
-                 st.error("The dataset contains no data.")
-                 return
-
-             if df.shape[1] < 2:
-                 st.error("Dataset must contain at least two columns (features and target).")
-                 return
-
-             # Check for numeric columns
-             numeric_cols = df.select_dtypes(include=[np.number]).columns
-             if len(numeric_cols) == 0:
-                 st.error("Dataset must contain at least one numeric column for analysis.")
-                 return
-
-             # Initialize components
-             processor = DataProcessor()
-             trainer = ModelTrainer()
-             visualizer = Visualizer()
-
-             # Data Processing Section
-             st.header("1. Data Processing")
-             col1, col2 = st.columns(2)
-
-             with col1:
-                 st.subheader("Dataset Overview")
-                 st.write(f"Shape: {df.shape}")
-                 st.write("Sample Data:")
-                 st.dataframe(df.head())
-
-             with col2:
-                 st.subheader("Data Statistics")
-                 st.write(df.describe())
-
-             # Feature Engineering Configuration
-             st.header("2. Feature Engineering")
-             col3, col4 = st.columns(2)
-
-             with col3:
-                 # Basic preprocessing
-                 handling_strategy = st.selectbox(
-                     "Missing Values Strategy",
-                     ["mean", "median", "most_frequent", "constant"]
-                 )
-                 scaling_method = st.selectbox(
-                     "Scaling Method",
-                     ["standard", "minmax", "robust"]
-                 )
-
-                 # Advanced Feature Engineering
-                 st.subheader("Advanced Features")
-                 use_polynomial = st.checkbox("Use Polynomial Features")
-                 if use_polynomial:
-                     st.session_state.poly_degree = st.slider("Polynomial Degree", 2, 5, st.session_state.poly_degree)
-
-                 use_feature_selection = st.checkbox("Use Feature Selection")
-                 if use_feature_selection:
-                     max_features = min(50, df.shape[1])  # Limit k_best_features to number of columns
-                     st.session_state.k_best_features = st.slider(
-                         "Number of Best Features",
-                         2,  # Minimum 2 features required
-                         max_features,
-                         min(st.session_state.k_best_features, max_features),
-                         help="Select the number of most important features to use"
-                     )
-
-             with col4:
-                 use_pca = st.checkbox("Use PCA")
-                 if use_pca:
-                     st.session_state.n_components = st.slider(
-                         "PCA Components (%)",
-                         1, 100,
-                         int(st.session_state.n_components * 100),
-                         help="Percentage of variance to preserve"
-                     ) / 100.0
-
-                 add_cyber_features = st.checkbox("Add Cybersecurity Features")
-
-             numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
-             if not numeric_features:
-                 st.error("No numeric features found in the dataset.")
-                 return
-
-             feature_cols = st.multiselect(
-                 "Select Features",
-                 numeric_features,
-                 default=numeric_features,
-                 help="Select the features to use for training"
-             )
-
-             if not feature_cols:
-                 st.error("Please select at least one feature column")
-                 return
-
-             categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
-             target_col = st.selectbox(
-                 "Select Target Column",
-                 [col for col in categorical_cols if col not in feature_cols],
-                 help="Select the target variable to predict"
-             )
-
-             if target_col is None:
-                 st.error("No suitable target column found. Target should be categorical.")
-                 return
-
-             # Create feature engineering config
-             feature_engineering_config = {
-                 'use_polynomial': use_polynomial,
-                 'poly_degree': st.session_state.poly_degree if use_polynomial else None,
-                 'use_feature_selection': use_feature_selection,
-                 'k_best_features': st.session_state.k_best_features if use_feature_selection else None,
-                 'use_pca': use_pca,
-                 'n_components': st.session_state.n_components if use_pca else None,
-                 'add_cyber_features': add_cyber_features
-             }
-
-             # Model Configuration Section
-             st.header("3. Model Configuration")
-             col5, col6 = st.columns(2)
-
-             with col5:
-                 n_estimators = st.slider(
-                     "Number of Trees",
-                     min_value=10,
-                     max_value=500,
-                     value=100
-                 )
-                 max_depth = st.slider(
-                     "Max Depth",
-                     min_value=1,
-                     max_value=50,
-                     value=10
-                 )
-
-             with col6:
-                 min_samples_split = st.slider(
-                     "Min Samples Split",
-                     min_value=2,
-                     max_value=20,
-                     value=2
-                 )
-                 min_samples_leaf = st.slider(
-                     "Min Samples Leaf",
-                     min_value=1,
-                     max_value=10,
-                     value=1
-                 )
-
-             if st.button("Train Model"):
-                 with st.spinner("Processing data and training model..."):
-                     # Process data with feature engineering
-                     X_train, X_test, y_train, y_test = processor.process_data(
-                         df,
-                         feature_cols,
-                         target_col,
-                         handling_strategy,
-                         scaling_method,
-                         feature_engineering_config
-                     )
-
-                     # Train model
-                     model, metrics = trainer.train_model(
-                         X_train, X_test, y_train, y_test,
-                         n_estimators=n_estimators,
-                         max_depth=max_depth,
-                         min_samples_split=min_samples_split,
-                         min_samples_leaf=min_samples_leaf
-                     )
-
-                     # Results Section
-                     st.header("4. Results and Visualizations")
-                     col7, col8 = st.columns(2)
-
-                     with col7:
-                         st.subheader("Model Performance Metrics")
-                         for metric, value in metrics.items():
-                             st.metric(metric, f"{value:.4f}")
-
-                         # Add model export section with improved validation
-                         st.subheader("Export Model")
-                         model_name = st.text_input(
-                             "Model Name (optional)",
-                             help="Enter a name for your model (alphanumeric and underscores only)"
-                         )
-
-                         if st.button("Save Model"):
-                             try:
-                                 # Validate and sanitize model name
-                                 sanitized_name = validate_model_name(model_name)
-
-                                 if sanitized_name != model_name:
-                                     st.warning(f"Model name was sanitized to: {sanitized_name}")
-
-                                 # Save model and metadata
-                                 preprocessing_params = {
-                                     'feature_engineering_config': feature_engineering_config,
-                                     'handling_strategy': handling_strategy,
-                                     'scaling_method': scaling_method,
-                                     'feature_columns': feature_cols,
-                                     'target_column': target_col
-                                 }
-
-                                 model_path, metadata_path = save_model(
-                                     model,
-                                     feature_cols,
-                                     preprocessing_params,
-                                     metrics,
-                                     sanitized_name
-                                 )
-
-                                 st.success(f"Model saved successfully!\nFiles:\n- {model_path}\n- {metadata_path}")
-                             except Exception as e:
-                                 st.error(f"Error saving model: {str(e)}")
-                                 st.error("Please ensure you have proper permissions and sufficient disk space.")
-
-                     with col8:
-                         if not use_pca:  # Skip feature importance for PCA
-                             st.subheader("Feature Importance")
-                             fig_importance = visualizer.plot_feature_importance(
-                                 model,
-                                 feature_cols if not use_polynomial else [f"Feature_{i}" for i in range(X_train.shape[1])]
-                             )
-                             st.pyplot(fig_importance)
-
-                         # Confusion Matrix
-                         st.subheader("Confusion Matrix")
-                         fig_cm = visualizer.plot_confusion_matrix(
-                             y_test,
-                             model.predict(X_test)
-                         )
-                         st.pyplot(fig_cm)
-
-                         # ROC Curve
-                         st.subheader("ROC Curve")
-                         fig_roc = visualizer.plot_roc_curve(
-                             model,
-                             X_test,
-                             y_test
-                         )
-                         st.pyplot(fig_roc)
-
          except Exception as e:
-             st.error(f"An error occurred: {str(e)}")
-             st.error("Please check your input data and try again.")
      else:
-         if data_input_tab == "Upload File":
-             st.info("Please upload a dataset to begin.")
-         else:
-             st.info("Please enter a Hugging Face dataset name to begin.")
-
-     # Add Model Management Section
-     st.header("5. Saved Models")
-     try:
-         saved_models = list_saved_models()
-         if saved_models:
-             for model_info in saved_models:
-                 with st.expander(f"Model: {model_info['name']}"):
-                     st.write(f"Type: {model_info['type']}")
-                     st.write(f"Created: {model_info['created_at']}")
-                     st.write("Performance Metrics:")
-                     for metric, value in model_info['metrics'].items():
-                         st.metric(metric, f"{value:.4f}")
-         else:
-             st.info("No saved models found.")
-     except Exception as e:
-         st.error(f"Error loading saved models: {str(e)}")
-
- if __name__ == "__main__":
-     main()
  import streamlit as st
  import pandas as pd
+ import seaborn as sns
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.metrics import classification_report
+ from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
  from datasets import load_dataset

+ # 1. Load Dataset
+ st.header("1. Load Dataset")
+
+ data_source = st.radio("Choose data source:", ["Upload File", "Hugging Face", "Sample Dataset"])
+
+ if data_source == "Upload File":
+     uploaded_file = st.file_uploader("Upload your dataset (CSV, Excel, or Parquet)", type=["csv", "xlsx", "parquet"])
+     if uploaded_file:
+         if uploaded_file.name.endswith(".csv"):
+             df = pd.read_csv(uploaded_file)
          else:
+             df = pd.read_excel(uploaded_file)
+         st.success(f"Successfully loaded {uploaded_file.name}")
+
+ elif data_source == "Hugging Face":
+     hf_dataset_name = st.text_input("Enter Hugging Face dataset name:")
+     if hf_dataset_name:
          try:
+             dataset = load_dataset(hf_dataset_name)
+             df = dataset.to_pandas()
+             st.success(f"Loaded dataset: {hf_dataset_name}")
          except Exception as e:
+             st.error(f"Error loading dataset: {str(e)}")
+
+ else:  # Sample Dataset
+     sample_data = st.selectbox("Select a sample dataset:", ["Iris", "Wine", "Titanic"])
+     df = sns.load_dataset(sample_data.lower())
+     st.success(f"Loaded sample dataset: {sample_data}")
+
+ if 'df' in locals():
+     st.dataframe(df.head())
+
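Worth flagging in the new Hugging Face branch: `load_dataset(hf_dataset_name)` with no split argument returns a `DatasetDict` keyed by split, and a `DatasetDict` has no `to_pandas()` method, so `dataset.to_pandas()` will typically raise. A minimal sketch of the conversion follows, assuming a "train" split with a fallback to the first available one; the helper name `hf_to_dataframe` is illustrative and not part of this commit. The uploader also accepts `.parquet` files that the branch never reads; `pd.read_parquet(uploaded_file)` would cover that case (it requires pyarrow or fastparquet).

import pandas as pd
from datasets import load_dataset

def hf_to_dataframe(name: str, split: str = "train") -> pd.DataFrame:
    # load_dataset() without a split returns a DatasetDict, e.g. {"train": ..., "test": ...}
    dataset = load_dataset(name)
    if split not in dataset:
        split = list(dataset.keys())[0]  # fall back to the first available split
    # A single split (datasets.Dataset) does expose to_pandas()
    return dataset[split].to_pandas()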
+ # 2. Explore Dataset
+ st.header("2. Explore Dataset")
+
+ if 'df' in locals():
+     st.subheader("Dataset Overview")
+     st.write(f"Shape: {df.shape}")
+     st.write("Column Information:")
+     st.dataframe(df.dtypes)
+
+     if st.checkbox("Show Missing Values"):
+         missing = df.isnull().sum()
+         st.bar_chart(missing[missing > 0])
+
+     st.subheader("Summary Statistics")
+     st.write(df.describe())
+
+     if st.checkbox("Generate Correlation Matrix"):
+         corr_matrix = df.corr()
+         st.write(sns.heatmap(corr_matrix, annot=True, cmap="coolwarm"))
+         st.pyplot()
+ else:
+     st.warning("Load a dataset to explore.")
+
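A note on the correlation-matrix block: it passes the return value of `sns.heatmap()` (a Matplotlib Axes) to `st.write()` and then calls `st.pyplot()` with no figure, a pattern Streamlit has deprecated, and `df.corr()` on a frame with object columns raises in recent pandas. A minimal sketch of the same plot drawn on an explicit figure, assuming pandas 1.5+ for `numeric_only`:

import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st

def show_correlation_matrix(df):
    # Correlations only make sense for numeric columns
    corr_matrix = df.corr(numeric_only=True)
    fig, ax = plt.subplots()
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
    st.pyplot(fig)  # pass the figure explicitly rather than relying on the global figure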
+ # 3. Preprocess Dataset
+ st.header("3. Preprocess Dataset")
+
+ if 'df' in locals():
+     st.subheader("Handle Missing Values")
+     missing_option = st.radio("Choose missing value strategy:", ["None", "Fill with Mean", "Drop Rows"])
+     if missing_option == "Fill with Mean":
+         df = df.fillna(df.mean())
+     elif missing_option == "Drop Rows":
+         df = df.dropna()
+
+     st.subheader("Encode Categorical Variables")
+     encoding_method = st.radio("Encoding Method:", ["None", "One-Hot Encoding", "Label Encoding"])
+     if encoding_method == "One-Hot Encoding":
+         df = pd.get_dummies(df)
+     elif encoding_method == "Label Encoding":
+         le = LabelEncoder()
+         for col in df.select_dtypes(include="object").columns:
+             df[col] = le.fit_transform(df[col])
+
+     st.subheader("Feature Scaling")
+     scaling_method = st.radio("Scaling Method:", ["None", "Standardization", "Normalization"])
+     if scaling_method != "None":
+         scaler = StandardScaler() if scaling_method == "Standardization" else MinMaxScaler()
+         numeric_cols = df.select_dtypes(include="number").columns
+         df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
+
+     st.success("Preprocessing complete!")
+     st.dataframe(df.head())
+ else:
+     st.warning("Load a dataset to preprocess.")
+
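In the missing-value step, `df.fillna(df.mean())` raises on frames that contain non-numeric columns in recent pandas, because `DataFrame.mean()` no longer silently skips them. A small sketch, restricted to numeric columns so the rest of the frame is left untouched (the helper name is illustrative only):

import pandas as pd

def fill_numeric_means(df: pd.DataFrame) -> pd.DataFrame:
    # Compute means only for numeric columns and fill just those
    numeric_cols = df.select_dtypes(include="number").columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    return df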
+ # 4. Train Model
+ st.header("4. Train Model")
+
+ if 'df' in locals():
+     st.subheader("Select Target Column")
+     target_col = st.selectbox("Choose the target column:", df.columns)
+     features = [col for col in df.columns if col != target_col]
+
+     st.subheader("Train/Test Split")
+     test_size = st.slider("Test size (percentage):", 10, 50, 20) / 100
+     X_train, X_test, y_train, y_test = train_test_split(
+         df[features], df[target_col], test_size=test_size, random_state=42
+     )
+
+     st.subheader("Select and Train Model")
+     model_type = st.selectbox("Choose a model:", ["Logistic Regression", "Decision Tree", "Random Forest"])
+     if model_type == "Logistic Regression":
+         model = LogisticRegression()
+     elif model_type == "Decision Tree":
+         model = DecisionTreeClassifier()
      else:
+         model = RandomForestClassifier()
+
+     model.fit(X_train, y_train)
+     st.success("Model trained successfully!")
+
+     st.subheader("Model Performance")
+     y_pred = model.predict(X_test)
+     report = classification_report(y_test, y_pred, output_dict=True)
+     st.dataframe(pd.DataFrame(report).transpose())
+ else:
+     st.warning("Load and preprocess a dataset to train a model.")
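
A broader note on the `if 'df' in locals():` guards used throughout the new script: Streamlit re-executes the file from the top on every widget interaction, so `df` exists on a given rerun only if the loading branch runs again on that rerun. Stashing the frame in `st.session_state` is the usual way to make it survive reruns; a minimal sketch, where the `"df"` key is just an illustration and not something this commit defines:

import pandas as pd
import streamlit as st

uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
if uploaded_file:
    # Persist the loaded frame so later sections can use it on any rerun
    st.session_state["df"] = pd.read_csv(uploaded_file)

if "df" in st.session_state:
    st.dataframe(st.session_state["df"].head())
else:
    st.warning("Load a dataset to continue.")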