Canstralian committed on
Commit 7a3ab23 · verified · 1 Parent(s): 1aa27e7

Update app.py

Files changed (1)
  1. app.py +117 -357
app.py CHANGED
@@ -1,370 +1,130 @@
  import streamlit as st
  import pandas as pd
- import numpy as np
- from data_processing import DataProcessor
- from model_training import ModelTrainer
- from visualizations import Visualizer
- from utils import load_data, get_feature_names, save_model, load_saved_model, list_saved_models
- import warnings
- import re
- from typing import Optional
  from datasets import load_dataset
- from huggingface_hub import list_datasets
- import traceback
-
- warnings.filterwarnings('ignore')
-
- st.set_page_config(
-     page_title="ML Pipeline for Purple Teaming",
-     page_icon="🛡️",
-     layout="wide"
- )
-
- def validate_model_name(name: Optional[str]) -> str:
-     """Validate and sanitize model name"""
-     if not name:
-         return f"model_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}"
-     sanitized = re.sub(r'[^\w\-]', '_', name)
-     return sanitized
-
- def load_hf_dataset(dataset_name: str, config_name: Optional[str] = None) -> pd.DataFrame:
-     """Load a dataset from Hugging Face and convert to pandas DataFrame"""
-     try:
-         if config_name:
-             dataset = load_dataset(dataset_name, config_name)
          else:
-             dataset = load_dataset(dataset_name)
-
-         # Convert to pandas DataFrame (using first split, usually 'train')
-         split_name = list(dataset.keys())[0]
-         df = dataset[split_name].to_pandas()
-         return df
-     except Exception as e:
-         raise Exception(f"Error loading dataset from Hugging Face: {str(e)}\n{traceback.format_exc()}")
-
- def main():
-     st.title("🛡️ ML Pipeline for Cybersecurity Purple Teaming")
-
-     # Initialize default values for feature engineering
-     if 'poly_degree' not in st.session_state:
-         st.session_state.poly_degree = 2
-     if 'k_best_features' not in st.session_state:
-         st.session_state.k_best_features = 10
-     if 'n_components' not in st.session_state:
-         st.session_state.n_components = 0.95
-
-     # Sidebar
-     st.sidebar.header("Pipeline Configuration")
-
-     # Data Input Tabs
-     data_input_tab = st.radio(
-         "Choose Data Source",
-         ["Upload File", "Load from Hugging Face"]
-     )
-
-     df = None
-
-     if data_input_tab == "Upload File":
-         uploaded_file = st.file_uploader(
-             "Upload Dataset (CSV/JSON)",
-             type=['csv', 'json']
-         )
-         if uploaded_file is not None:
-             try:
-                 df = load_data(uploaded_file)
-             except Exception as e:
-                 st.error(f"Error loading file: {str(e)}")
-     else:
-         # Hugging Face Dataset Loading
-         st.markdown("### Load Dataset from Hugging Face")
-         dataset_name = st.text_input(
-             "Dataset Name",
-             help="Enter the Hugging Face dataset name (e.g., 'username/dataset-name')"
-         )
-         config_name = st.text_input(
-             "Configuration Name (Optional)",
-             help="Enter the specific configuration name if the dataset has multiple configurations"
-         )
-
-         if dataset_name:
-             try:
-                 with st.spinner("Loading dataset from Hugging Face..."):
-                     df = load_hf_dataset(
-                         dataset_name,
-                         config_name if config_name else None
-                     )
-                 st.success(f"Successfully loaded dataset: {dataset_name}")
-             except Exception as e:
-                 st.error(str(e))
-
-     if df is not None:
          try:
-             # Validate data
-             if df.empty:
-                 st.error("The dataset contains no data.")
-                 return
-
-             if df.shape[1] < 2:
-                 st.error("Dataset must contain at least two columns (features and target).")
-                 return
-
-             # Check for numeric columns
-             numeric_cols = df.select_dtypes(include=[np.number]).columns
-             if len(numeric_cols) == 0:
-                 st.error("Dataset must contain at least one numeric column for analysis.")
-                 return
-
-             # Initialize components
-             processor = DataProcessor()
-             trainer = ModelTrainer()
-             visualizer = Visualizer()
-
-             # Data Processing Section
-             st.header("1. Data Processing")
-             col1, col2 = st.columns(2)
-
-             with col1:
-                 st.subheader("Dataset Overview")
-                 st.write(f"Shape: {df.shape}")
-                 st.write("Sample Data:")
-                 st.dataframe(df.head())
-
-             with col2:
-                 st.subheader("Data Statistics")
-                 st.write(df.describe())
-
-             # Feature Engineering Configuration
-             st.header("2. Feature Engineering")
-             col3, col4 = st.columns(2)
-
-             with col3:
-                 # Basic preprocessing
-                 handling_strategy = st.selectbox(
-                     "Missing Values Strategy",
-                     ["mean", "median", "most_frequent", "constant"]
-                 )
-                 scaling_method = st.selectbox(
-                     "Scaling Method",
-                     ["standard", "minmax", "robust"]
-                 )
-
-                 # Advanced Feature Engineering
-                 st.subheader("Advanced Features")
-                 use_polynomial = st.checkbox("Use Polynomial Features")
-                 if use_polynomial:
-                     st.session_state.poly_degree = st.slider("Polynomial Degree", 2, 5, st.session_state.poly_degree)
-
-                 use_feature_selection = st.checkbox("Use Feature Selection")
-                 if use_feature_selection:
-                     max_features = min(50, df.shape[1])  # Limit k_best_features to number of columns
-                     st.session_state.k_best_features = st.slider(
-                         "Number of Best Features",
-                         2,  # Minimum 2 features required
-                         max_features,
-                         min(st.session_state.k_best_features, max_features),
-                         help="Select the number of most important features to use"
-                     )
-
-             with col4:
-                 use_pca = st.checkbox("Use PCA")
-                 if use_pca:
-                     st.session_state.n_components = st.slider(
-                         "PCA Components (%)",
-                         1, 100,
-                         int(st.session_state.n_components * 100),
-                         help="Percentage of variance to preserve"
-                     ) / 100.0
-
-                 add_cyber_features = st.checkbox("Add Cybersecurity Features")
-
-             numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
-             if not numeric_features:
-                 st.error("No numeric features found in the dataset.")
-                 return
-
-             feature_cols = st.multiselect(
-                 "Select Features",
-                 numeric_features,
-                 default=numeric_features,
-                 help="Select the features to use for training"
-             )
-
-             if not feature_cols:
-                 st.error("Please select at least one feature column")
-                 return
-
-             categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
-             target_col = st.selectbox(
-                 "Select Target Column",
-                 [col for col in categorical_cols if col not in feature_cols],
-                 help="Select the target variable to predict"
-             )
-
-             if target_col is None:
-                 st.error("No suitable target column found. Target should be categorical.")
-                 return
-
-             # Create feature engineering config
-             feature_engineering_config = {
-                 'use_polynomial': use_polynomial,
-                 'poly_degree': st.session_state.poly_degree if use_polynomial else None,
-                 'use_feature_selection': use_feature_selection,
-                 'k_best_features': st.session_state.k_best_features if use_feature_selection else None,
-                 'use_pca': use_pca,
-                 'n_components': st.session_state.n_components if use_pca else None,
-                 'add_cyber_features': add_cyber_features
-             }
-
-             # Model Configuration Section
-             st.header("3. Model Configuration")
-             col5, col6 = st.columns(2)
-
-             with col5:
-                 n_estimators = st.slider(
-                     "Number of Trees",
-                     min_value=10,
-                     max_value=500,
-                     value=100
-                 )
-                 max_depth = st.slider(
-                     "Max Depth",
-                     min_value=1,
-                     max_value=50,
-                     value=10
-                 )
-
-             with col6:
-                 min_samples_split = st.slider(
-                     "Min Samples Split",
-                     min_value=2,
-                     max_value=20,
-                     value=2
-                 )
-                 min_samples_leaf = st.slider(
-                     "Min Samples Leaf",
-                     min_value=1,
-                     max_value=10,
-                     value=1
-                 )
-
-             if st.button("Train Model"):
-                 with st.spinner("Processing data and training model..."):
-                     # Process data with feature engineering
-                     X_train, X_test, y_train, y_test = processor.process_data(
-                         df,
-                         feature_cols,
-                         target_col,
-                         handling_strategy,
-                         scaling_method,
-                         feature_engineering_config
-                     )
-
-                     # Train model
-                     model, metrics = trainer.train_model(
-                         X_train, X_test, y_train, y_test,
-                         n_estimators=n_estimators,
-                         max_depth=max_depth,
-                         min_samples_split=min_samples_split,
-                         min_samples_leaf=min_samples_leaf
-                     )
-
-                     # Results Section
-                     st.header("4. Results and Visualizations")
-                     col7, col8 = st.columns(2)
-
-                     with col7:
-                         st.subheader("Model Performance Metrics")
-                         for metric, value in metrics.items():
-                             st.metric(metric, f"{value:.4f}")
-
-                         # Add model export section with improved validation
-                         st.subheader("Export Model")
-                         model_name = st.text_input(
-                             "Model Name (optional)",
-                             help="Enter a name for your model (alphanumeric and underscores only)"
-                         )
-
-                         if st.button("Save Model"):
-                             try:
-                                 # Validate and sanitize model name
-                                 sanitized_name = validate_model_name(model_name)
-
-                                 if sanitized_name != model_name:
-                                     st.warning(f"Model name was sanitized to: {sanitized_name}")
-
-                                 # Save model and metadata
-                                 preprocessing_params = {
-                                     'feature_engineering_config': feature_engineering_config,
-                                     'handling_strategy': handling_strategy,
-                                     'scaling_method': scaling_method,
-                                     'feature_columns': feature_cols,
-                                     'target_column': target_col
-                                 }
-
-                                 model_path, metadata_path = save_model(
-                                     model,
-                                     feature_cols,
-                                     preprocessing_params,
-                                     metrics,
-                                     sanitized_name
-                                 )
-
-                                 st.success(f"Model saved successfully!\nFiles:\n- {model_path}\n- {metadata_path}")
-                             except Exception as e:
-                                 st.error(f"Error saving model: {str(e)}")
-                                 st.error("Please ensure you have proper permissions and sufficient disk space.")
-
-                     with col8:
-                         if not use_pca:  # Skip feature importance for PCA
-                             st.subheader("Feature Importance")
-                             fig_importance = visualizer.plot_feature_importance(
-                                 model,
-                                 feature_cols if not use_polynomial else [f"Feature_{i}" for i in range(X_train.shape[1])]
-                             )
-                             st.pyplot(fig_importance)
-
-                         # Confusion Matrix
-                         st.subheader("Confusion Matrix")
-                         fig_cm = visualizer.plot_confusion_matrix(
-                             y_test,
-                             model.predict(X_test)
-                         )
-                         st.pyplot(fig_cm)
-
-                         # ROC Curve
-                         st.subheader("ROC Curve")
-                         fig_roc = visualizer.plot_roc_curve(
-                             model,
-                             X_test,
-                             y_test
-                         )
-                         st.pyplot(fig_roc)
-
          except Exception as e:
-             st.error(f"An error occurred: {str(e)}")
-             st.error("Please check your input data and try again.")
      else:
-         if data_input_tab == "Upload File":
-             st.info("Please upload a dataset to begin.")
-         else:
-             st.info("Please enter a Hugging Face dataset name to begin.")
-
-     # Add Model Management Section
-     st.header("5. Saved Models")
-     try:
-         saved_models = list_saved_models()
-         if saved_models:
-             for model_info in saved_models:
-                 with st.expander(f"Model: {model_info['name']}"):
-                     st.write(f"Type: {model_info['type']}")
-                     st.write(f"Created: {model_info['created_at']}")
-                     st.write("Performance Metrics:")
-                     for metric, value in model_info['metrics'].items():
-                         st.metric(metric, f"{value:.4f}")
-         else:
-             st.info("No saved models found.")
-     except Exception as e:
-         st.error(f"Error loading saved models: {str(e)}")
-
- if __name__ == "__main__":
-     main()
  import streamlit as st
  import pandas as pd
+ import seaborn as sns
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.metrics import classification_report
+ from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
  from datasets import load_dataset

+ # 1. Load Dataset
+ st.header("1. Load Dataset")
+
+ data_source = st.radio("Choose data source:", ["Upload File", "Hugging Face", "Sample Dataset"])
+
+ if data_source == "Upload File":
+     uploaded_file = st.file_uploader("Upload your dataset (CSV, Excel, or Parquet)", type=["csv", "xlsx", "parquet"])
+     if uploaded_file:
+         if uploaded_file.name.endswith(".csv"):
+             df = pd.read_csv(uploaded_file)
          else:
+             df = pd.read_excel(uploaded_file)
+         st.success(f"Successfully loaded {uploaded_file.name}")
+
+ elif data_source == "Hugging Face":
+     hf_dataset_name = st.text_input("Enter Hugging Face dataset name:")
+     if hf_dataset_name:
          try:
+             dataset = load_dataset(hf_dataset_name)
+             df = dataset.to_pandas()
+             st.success(f"Loaded dataset: {hf_dataset_name}")
          except Exception as e:
+             st.error(f"Error loading dataset: {str(e)}")
+
+ else:  # Sample Dataset
+     sample_data = st.selectbox("Select a sample dataset:", ["Iris", "Wine", "Titanic"])
+     df = sns.load_dataset(sample_data.lower())
+     st.success(f"Loaded sample dataset: {sample_data}")
+
+ if 'df' in locals():
+     st.dataframe(df.head())
+
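Worth flagging in the new Hugging Face branch: `load_dataset(hf_dataset_name)` with no split argument returns a `DatasetDict` keyed by split, and a `DatasetDict` has no `to_pandas()` method, so `dataset.to_pandas()` will typically raise. A minimal sketch of the conversion follows, assuming a "train" split with a fallback to the first available one; the helper name `hf_to_dataframe` is illustrative and not part of this commit. The uploader also accepts `.parquet` files that the branch never reads; `pd.read_parquet(uploaded_file)` would cover that case (it requires pyarrow or fastparquet).

import pandas as pd
from datasets import load_dataset

def hf_to_dataframe(name: str, split: str = "train") -> pd.DataFrame:
    # load_dataset() without a split returns a DatasetDict, e.g. {"train": ..., "test": ...}
    dataset = load_dataset(name)
    if split not in dataset:
        split = list(dataset.keys())[0]  # fall back to the first available split
    # A single split (datasets.Dataset) does expose to_pandas()
    return dataset[split].to_pandas()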
+ # 2. Explore Dataset
+ st.header("2. Explore Dataset")
+
+ if 'df' in locals():
+     st.subheader("Dataset Overview")
+     st.write(f"Shape: {df.shape}")
+     st.write("Column Information:")
+     st.dataframe(df.dtypes)
+
+     if st.checkbox("Show Missing Values"):
+         missing = df.isnull().sum()
+         st.bar_chart(missing[missing > 0])
+
+     st.subheader("Summary Statistics")
+     st.write(df.describe())
+
+     if st.checkbox("Generate Correlation Matrix"):
+         corr_matrix = df.corr()
+         st.write(sns.heatmap(corr_matrix, annot=True, cmap="coolwarm"))
+         st.pyplot()
+ else:
+     st.warning("Load a dataset to explore.")
+
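A note on the correlation-matrix block: it passes the return value of `sns.heatmap()` (a Matplotlib Axes) to `st.write()` and then calls `st.pyplot()` with no figure, a pattern Streamlit has deprecated, and `df.corr()` on a frame with object columns raises in recent pandas. A minimal sketch of the same plot drawn on an explicit figure, assuming pandas 1.5+ for `numeric_only`:

import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st

def show_correlation_matrix(df):
    # Correlations only make sense for numeric columns
    corr_matrix = df.corr(numeric_only=True)
    fig, ax = plt.subplots()
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
    st.pyplot(fig)  # pass the figure explicitly rather than relying on the global figure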
+ # 3. Preprocess Dataset
+ st.header("3. Preprocess Dataset")
+
+ if 'df' in locals():
+     st.subheader("Handle Missing Values")
+     missing_option = st.radio("Choose missing value strategy:", ["None", "Fill with Mean", "Drop Rows"])
+     if missing_option == "Fill with Mean":
+         df = df.fillna(df.mean())
+     elif missing_option == "Drop Rows":
+         df = df.dropna()
+
+     st.subheader("Encode Categorical Variables")
+     encoding_method = st.radio("Encoding Method:", ["None", "One-Hot Encoding", "Label Encoding"])
+     if encoding_method == "One-Hot Encoding":
+         df = pd.get_dummies(df)
+     elif encoding_method == "Label Encoding":
+         le = LabelEncoder()
+         for col in df.select_dtypes(include="object").columns:
+             df[col] = le.fit_transform(df[col])
+
+     st.subheader("Feature Scaling")
+     scaling_method = st.radio("Scaling Method:", ["None", "Standardization", "Normalization"])
+     if scaling_method != "None":
+         scaler = StandardScaler() if scaling_method == "Standardization" else MinMaxScaler()
+         numeric_cols = df.select_dtypes(include="number").columns
+         df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
+
+     st.success("Preprocessing complete!")
+     st.dataframe(df.head())
+ else:
+     st.warning("Load a dataset to preprocess.")
+
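In the missing-value step, `df.fillna(df.mean())` raises on frames that contain non-numeric columns in recent pandas, because `DataFrame.mean()` no longer silently skips them. A small sketch, restricted to numeric columns so the rest of the frame is left untouched (the helper name is illustrative only):

import pandas as pd

def fill_numeric_means(df: pd.DataFrame) -> pd.DataFrame:
    # Compute means only for numeric columns and fill just those
    numeric_cols = df.select_dtypes(include="number").columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    return df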
+ # 4. Train Model
+ st.header("4. Train Model")
+
+ if 'df' in locals():
+     st.subheader("Select Target Column")
+     target_col = st.selectbox("Choose the target column:", df.columns)
+     features = [col for col in df.columns if col != target_col]
+
+     st.subheader("Train/Test Split")
+     test_size = st.slider("Test size (percentage):", 10, 50, 20) / 100
+     X_train, X_test, y_train, y_test = train_test_split(
+         df[features], df[target_col], test_size=test_size, random_state=42
+     )
+
+     st.subheader("Select and Train Model")
+     model_type = st.selectbox("Choose a model:", ["Logistic Regression", "Decision Tree", "Random Forest"])
+     if model_type == "Logistic Regression":
+         model = LogisticRegression()
+     elif model_type == "Decision Tree":
+         model = DecisionTreeClassifier()
      else:
+         model = RandomForestClassifier()
+
+     model.fit(X_train, y_train)
+     st.success("Model trained successfully!")
+
+     st.subheader("Model Performance")
+     y_pred = model.predict(X_test)
+     report = classification_report(y_test, y_pred, output_dict=True)
+     st.dataframe(pd.DataFrame(report).transpose())
+ else:
+     st.warning("Load and preprocess a dataset to train a model.")
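
A broader note on the `if 'df' in locals():` guards used throughout the new script: Streamlit re-executes the file from the top on every widget interaction, so `df` exists on a given rerun only if the loading branch runs again on that rerun. Stashing the frame in `st.session_state` is the usual way to make it survive reruns; a minimal sketch, where the `"df"` key is just an illustration and not something this commit defines:

import pandas as pd
import streamlit as st

uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
if uploaded_file:
    # Persist the loaded frame so later sections can use it on any rerun
    st.session_state["df"] = pd.read_csv(uploaded_file)

if "df" in st.session_state:
    st.dataframe(st.session_state["df"].head())
else:
    st.warning("Load a dataset to continue.")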