diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -7,12 +7,9 @@ from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVR, SVC
-from sklearn.decomposition import PCA #Import at top
-from sklearn.metrics import silhouette_score #Import at top
-from sklearn.cluster import DBSCAN #Import at top
-from sklearn.feature_selection import SelectKBest #Import at top
-import joblib #Import at top
-from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
+from sklearn.feature_selection import SelectKBest
+from sklearn.neural_network import MLPRegressor, MLPClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.naive_bayes import GaussianNB
+import joblib
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
@@ -20,847 +17,846 @@ from sklearn.pipeline import Pipeline
from ydata_profiling import ProfileReport
from streamlit_pandas_profiling import st_profile_report
from io import StringIO
-import joblib
import requests
import asyncio
from io import BytesIO
import base64
import seaborn as sns
-import time
-from sklearn.cluster import KMeans
import scipy.stats as stats
import mimetypes
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
-# Configurations
-st.set_page_config(page_title="Executive Insights Pro", layout="wide", page_icon="๐")
-
-# ----Load Image----
-@st.cache_data(ttl=3600)
-def load_image(image_url):
- """Loads an image from a URL and returns bytes."""
- try:
- response = requests.get(image_url, stream=True)
- response.raise_for_status()
- return response.content
- except requests.exceptions.RequestException as e:
- st.error(f"Error loading image: {e}")
- return None
-
-# ----Function to make and convert background to base 64 code-----
-def set_background():
- """Sets the background image using base64 encoding."""
- image_url = "https://www.nasa.gov/sites/default/files/thumbnails/image/web_first_images_release.png" # NASA Image
- image_data = load_image(image_url)
- if image_data:
- # Convert bytes to base64
- image_base64 = base64.b64encode(image_data).decode()
- st.markdown(
- f"""
-
- """,
- unsafe_allow_html=True,
- )
- return
-
-# Simplified CSS
-def apply_simplified_theme():
- """Injects simplified CSS to enhance Streamlit's default style."""
- st.markdown(
- """
-
- """,
- unsafe_allow_html=True,
- )
- return
-
-# Apply background and simplified theme
-set_background()
-apply_simplified_theme()
-
-def show_loader(message="Loading..."):
- """Displays an animated loader."""
- st.markdown(
- f"""
-
- """,
- unsafe_allow_html=True
- )
+# Enhanced configuration
+st.set_page_config(
+ page_title="Executive Insights Pro",
+ layout="wide",
+ page_icon="๐",
+ initial_sidebar_state="expanded"
+)
-@st.cache_data(ttl=3600) #Added allow_output_mutation
-def load_data(uploaded_file):
- """Load and cache dataset, with file type validation."""
- if uploaded_file is not None:
- file_extension = uploaded_file.name.split(".")[-1].lower()
- mime_type = mimetypes.guess_type(uploaded_file.name)[0]
-
- max_file_size_mb = 50 # Set a maximum file size (adjust as needed)
- file_size_mb = uploaded_file.size / (1024 * 1024)
- if file_size_mb > max_file_size_mb:
- st.error(f"File size exceeds the limit of {max_file_size_mb} MB.")
- return None
-
-
- try: # Wrap file reading in a try...except
- if file_extension == "csv" or mime_type == 'text/csv':
- df = pd.read_csv(uploaded_file)
- return df
- elif file_extension in ["xlsx", "xls"] or mime_type in ['application/vnd.ms-excel', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']:
- df = pd.read_excel(uploaded_file)
- return df
- else:
- st.error("Unsupported file type. Please upload a CSV or Excel file.")
- return None
- except FileNotFoundError:
- st.error("File not found. Please check the file path.")
- except pd.errors.ParserError: # Catch pandas-specific parsing errors
- st.error("Error parsing the file. Make sure it's a valid CSV or Excel file.")
- except Exception as e:
- st.error(f"An unexpected error occurred: {type(e).__name__} - {str(e)}")
- return None # Handle other potential exceptions
+# Security: Set allowed file types
+ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'parquet', 'feather'}
+MAX_FILE_SIZE_MB = 250 # 250MB limit
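+
+# validate_file() is called further down but its definition sits outside this hunk;
+# a minimal sketch, assuming it returns an (is_valid, message) tuple and enforces the limits above:
+def validate_file(uploaded_file):
+ """Check the upload against ALLOWED_EXTENSIONS and MAX_FILE_SIZE_MB."""
+ extension = uploaded_file.name.rsplit(".", 1)[-1].lower()
+ if extension not in ALLOWED_EXTENSIONS:
+ return False, f"Unsupported file type: .{extension}"
+ size_mb = uploaded_file.size / (1024 * 1024)
+ if size_mb > MAX_FILE_SIZE_MB:
+ return False, f"File is {size_mb:.1f} MB, above the {MAX_FILE_SIZE_MB} MB limit"
+ return True, "File is valid"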
- else:
- return None
-
-@st.cache_data(ttl=3600)
-def generate_profile(df):
- """Generate automated EDA report"""
- return ProfileReport(df, minimal=True)
-
-# Session State Management
-if 'raw_data' not in st.session_state:
- st.session_state.raw_data = None
-if 'cleaned_data' not in st.session_state:
- st.session_state.cleaned_data = None
-if 'train_test' not in st.session_state:
- st.session_state.train_test = {}
-if 'model' not in st.session_state:
- st.session_state.model = None
-if 'preprocessor' not in st.session_state:
- st.session_state.preprocessor = None # to store the column transformer
-
-# Sidebar Navigation
-st.sidebar.title("๐ฎ Data Wizard Pro")
-
-# Apply custom CSS to change text color in the sidebar
-st.markdown(
- """
-
- """,
- unsafe_allow_html=True,
-)
-# Replace the existing app_mode section with this:
-app_mode = st.sidebar.radio("Navigate", [
- "Data Upload",
- "Smart Cleaning",
- "Advanced EDA",
- "Model Training",
- "Predictions",
- "Visualization Lab",
- "Neural Network Studio" # New option
-])
-
-# --- Main App Logic ---
-if app_mode == "Data Upload":
- st.title("๐ค Data Upload & Initial Analysis")
-
- # File Upload Section with improved styling
- st.markdown(
- """
-
- """,
- unsafe_allow_html=True,
- )
+ # Numeric specific checks
+ if pd.api.types.is_numeric_dtype(df[col]):
+ col_report.update({
+ 'mean': df[col].mean(),
+ 'std': df[col].std(),
+ 'zeros': (df[col] == 0).sum(),
+ 'negatives': (df[col] < 0).sum() if df[col].dtype != 'uint' else 0,
+ 'outliers': detect_outliers(df[col])
+ })
+ report['data_health_score'] -= 2 # Deduct 2% per numeric column
+
+ # Categorical specific checks
+ if pd.api.types.is_string_dtype(df[col]):
+ col_report.update({
+ 'top_value': df[col].mode().iloc[0] if not df[col].mode().empty else None,
+ 'top_freq': df[col].value_counts().iloc[0] / len(df) if df[col].count() > 0 else 0
+ })
+ report['data_health_score'] -= 1 # Deduct 1% per string column
+
+ report['column_analysis'][col] = col_report
+ report['data_health_score'] = max(report['data_health_score'], 0)
+
+ return report
+
+def detect_outliers(series):
+ """Detect outliers using IQR method"""
+ q1 = series.quantile(0.25)
+ q3 = series.quantile(0.75)
+ iqr = q3 - q1
+ return ((series < (q1 - 1.5 * iqr)) | (series > (q3 + 1.5 * iqr))).sum()
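+
+# Worked example (illustrative): detect_outliers(pd.Series([1, 2, 3, 100])) -> 1
+# q1 = 1.75, q3 = 27.25, iqr = 25.5, so only 100 falls outside [q1 - 1.5*iqr, q3 + 1.5*iqr].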
+
+# --- Data Upload Page ---
+if app_mode == "Data Upload":
+ st.title("๐ค Smart Data Hub")
+ st.markdown("""
+ **Upload your dataset** (CSV, Excel, Parquet) for comprehensive analysis.
+ Get instant data health insights and quality assessment.
+ """)
+
+ # File upload with enhanced UI
uploaded_file = st.file_uploader(
- "Choose a CSV or Excel file", type=["csv", "xlsx"],
- help="Upload your dataset here. Supported formats: CSV, XLSX"
+ "Drag & drop or browse files",
+ type=list(ALLOWED_EXTENSIONS),
+ help=f"Max file size: {MAX_FILE_SIZE_MB}MB. Supported formats: {', '.join(ALLOWED_EXTENSIONS)}"
)
-
+
if uploaded_file:
- df = load_data(uploaded_file)
- if df is not None:
- # only proceed if load_data returned a valid dataframe
- st.session_state.raw_data = df
- st.session_state.cleaned_data = df.copy()
-
- st.subheader("Data Overview")
- # Data Overview Cards with more context
- col1, col2, col3 = st.columns(3)
- with col1:
- st.metric("Number of Rows", df.shape[0], help="Total number of entries in the dataset.")
- with col2:
- st.metric("Number of Columns", df.shape[1], help="Total number of features in the dataset.")
- with col3:
- num_missing = df.isna().sum().sum()
- st.metric("Total Missing Values", num_missing, help="Total number of missing entries across the entire dataset.")
-
- # Display Data Types
- st.write("Column Data Types:")
- dtype_counts = df.dtypes.value_counts().to_dict()
- for dtype, count in dtype_counts.items():
- st.write(f"- {dtype}: {count} column(s)")
+ # Validate file
+ is_valid, message = validate_file(uploaded_file)
+ if not is_valid:
+ st.error(f"Upload error: {message}")
+ st.stop()
+
+ # Load data with progress
+ with st.spinner(f"Loading {uploaded_file.name}..."):
+ try:
+ if uploaded_file.name.endswith('.csv'):
+ df = pd.read_csv(uploaded_file, low_memory=False)
+ elif uploaded_file.name.endswith(('.xlsx', '.xls')):
+ df = pd.read_excel(uploaded_file)
+ elif uploaded_file.name.endswith('.parquet'):
+ df = pd.read_parquet(uploaded_file)
+ elif uploaded_file.name.endswith('.feather'):
+ df = pd.read_feather(uploaded_file)
+
+ st.session_state.raw_data = df
+ st.success("Dataset loaded successfully!")
+
+ except Exception as e:
+ st.error(f"Error loading file: {str(e)}")
+ st.stop()
+
+ # Data Health Dashboard
+ st.subheader("๐ Data Health Dashboard")
+ report = enhanced_quality_report(df)
+
+ col1, col2, col3, col4 = st.columns(4)
+ col1.metric("Total Rows", report['basic_stats']['rows'])
+ col2.metric("Total Columns", report['basic_stats']['columns'])
+ col3.metric("Missing Values", report['basic_stats']['missing_values'])
+ col4.metric("Data Health Score", f"{report['data_health_score']}/100")
+
+ # Column Explorer
+ with st.expander("๐ Deep Column Analysis", expanded=True):
+ selected_col = st.selectbox("Select column to inspect", df.columns)
+ col_info = report['column_analysis'][selected_col]
- # Sample Data Table with improved display
- st.subheader("Sample Data")
- num_rows_preview = st.slider("Number of Rows to Preview", 5, 20, 10, help="Adjust the number of rows displayed in the sample data.")
- st.dataframe(df.head(num_rows_preview), use_container_width=True)
+ st.write(f"**Type:** {col_info['type']}")
+ st.write(f"**Unique Values:** {col_info['unique']}")
+ st.write(f"**Missing Values:** {col_info['missing']} ({col_info['missing']/len(df):.1%})")
- # Column Statistics
- with st.expander("๐ Column Statistics"):
- for col in df.columns:
- st.subheader(f"Column: {col}")
- st.write(f"Data type: {df[col].dtype}")
- if pd.api.types.is_numeric_dtype(df[col]):
- st.write("Summary Statistics:")
- st.write(df[col].describe())
- else:
- st.write("Value Counts:")
- st.write(df[col].value_counts())
+ if pd.api.types.is_numeric_dtype(df[selected_col]):
+ st.write("**Distribution:**")
+ st.line_chart(df[selected_col])
+ st.write(f"**Outliers Detected:** {col_info['outliers']}")
+ else:
+ st.write("**Most Common Values:**")
+ top_values = df[selected_col].value_counts().head(5)
+ st.bar_chart(top_values)
+
+ # Smart Recommendations
+ with st.expander("๐ก Cleaning Recommendations"):
+ recommendations = []
+ if report['basic_stats']['duplicates'] > 0:
+ recommendations.append(f"๐จ Remove {report['basic_stats']['duplicates']} duplicate rows")
+ if report['basic_stats']['missing_values'] > 0:
+ recommendations.append("๐ง Apply advanced imputation strategies")
+ for col, data in report['column_analysis'].items():
+ if data['missing'] > 0.5 * len(df):
+ recommendations.append(f"โ ๏ธ Consider dropping {col} (>{50}% missing)")
+ if data['unique'] == len(df):
+ recommendations.append(f"๐ Investigate {col} - potential unique identifier")
- # Automated EDA Report
- with st.expander("๐ Automated Data Report"):
- if st.button("Generate Smart Report"):
- show_loader("Generating EDA Report")
- pr = generate_profile(df)
- st_profile_report(pr)
+ if recommendations:
+ st.write("### Recommended Actions")
+ for rec in recommendations[:5]: # Show top 5
+ st.write(f"- {rec}")
+ else:
+ st.success("No critical issues detected - your data looks healthy!")
+ # Data Preview
+ with st.expander("๐ Data Preview", expanded=True):
+ preview_size = st.slider("Preview rows", 5, 100, 15)
+ st.dataframe(df.head(preview_size).style.highlight_null(color='#FF6666'))
+
+ # Advanced Profiling
+ if st.button("๐ Generate Full Data Profile"):
+ with st.spinner("Generating comprehensive report..."):
+ pr = ProfileReport(df, explorative=True)
+ st_profile_report(pr)
+
+# Smart Cleaning Section
elif app_mode == "Smart Cleaning":
st.title("๐งผ Intelligent Data Cleaning")
- if st.session_state.raw_data is not None:
- df = st.session_state.cleaned_data
-
- # Cleaning Toolkit
- col1, col2 = st.columns([1, 3])
- with col1:
- st.subheader("Cleaning Actions")
-
- clean_action = st.selectbox("Choose Operation", [
- "Handle Missing Values",
- "Clean Text",
- "Remove Columns", # New option
- # ... other cleaning operations ...
+ st.markdown("""
+ **Automated Data Cleaning** with smart suggestions and advanced transformations.
+ Clean your data with confidence using AI-powered recommendations.
+ """)
+
+ if 'raw_data' not in st.session_state or st.session_state.raw_data is None:
+ st.warning("Please upload your data in the Data Upload section first.")
+ st.stop()
+
+ df = st.session_state.raw_data.copy()
+ cleaning_actions = []
+
+ # Data Health Summary
+ st.subheader("๐ Data Health Summary")
+ col1, col2, col3 = st.columns(3)
+ with col1:
+ missing_pct = df.isna().mean().mean()
+ st.metric("Missing Values", f"{missing_pct:.1%}")
+ with col2:
+ duplicates = df.duplicated().sum()
+ st.metric("Duplicates", duplicates)
+ with col3:
+ data_types = df.dtypes.value_counts().to_dict()
+ st.metric("Data Types", str(data_types))
+
+ # Cleaning Operations
+ st.subheader("๐ง Cleaning Operations")
+
+ # 1. Missing Value Handling
+ with st.expander("๐ณ๏ธ Handle Missing Values", expanded=True):
+ missing_cols = df.columns[df.isna().any()].tolist()
+ if missing_cols:
+ st.write("Columns with missing values:")
+ cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols)
+
+ method = st.radio("Imputation Method", [
+ "Drop Missing",
+ "Mean/Median/Mode",
+ "KNN Imputation",
+ "Advanced Imputation"
+ ], horizontal=True)
+
+ if method == "Mean/Median/Mode":
+ strategy = st.selectbox("Strategy", ["mean", "median", "most_frequent"])
+ if st.button("Apply Imputation"):
+ # pandas has no 'most_frequent' aggregation, so fall back to the per-column mode
+ fill_values = df[cols].mode().iloc[0] if strategy == "most_frequent" else df[cols].agg(strategy)
+ df[cols] = df[cols].fillna(fill_values)
+ cleaning_actions.append(f"Filled missing values in {cols} using {strategy}")
+
+ elif method == "KNN Imputation":
+ n_neighbors = st.slider("Number of neighbors", 2, 15, 5)
+ if st.button("Apply KNN Imputation"):
+ from sklearn.impute import KNNImputer
+ imputer = KNNImputer(n_neighbors=n_neighbors)
+ df[cols] = imputer.fit_transform(df[cols])
+ cleaning_actions.append(f"Applied KNN imputation (k={n_neighbors}) on {cols}")
+
+ elif method == "Advanced Imputation":
+ st.write("Coming soon: MICE, Deep Learning imputation")
+ else:
+ st.success("No missing values found!")
+
+ # 2. Duplicate Handling
+ with st.expander("๐ Handle Duplicates", expanded=True):
+ if duplicates > 0:
+ st.write(f"Found {duplicates} duplicate rows")
+ dup_strategy = st.radio("Duplicate Strategy", [
+ "Remove All Duplicates",
+ "Keep First Occurrence",
+ "Keep Last Occurrence"
])
+
+ if st.button("Handle Duplicates"):
+ df = df.drop_duplicates(keep={
+ "Remove All Duplicates": False,
+ "Keep First Occurrence": 'first',
+ "Keep Last Occurrence": 'last'
+ }[dup_strategy])
+ cleaning_actions.append(f"Removed duplicates using strategy: {dup_strategy}")
+ else:
+ st.success("No duplicates found!")
+
+ # 3. Data Type Conversion
+ with st.expander("๐ Convert Data Types", expanded=True):
+ st.write("Current Data Types:")
+ st.dataframe(df.dtypes.reset_index().rename(columns={
+ 0: 'Type',
+ 'index': 'Column'
+ }))
+
+ col_to_convert = st.selectbox("Select column to convert", df.columns)
+ new_type = st.selectbox("New Data Type", [
+ "String", "Integer", "Float",
+ "Boolean", "Datetime", "Category"
+ ])
+
+ if st.button("Convert Data Type"):
+ try:
+ if new_type == "String":
+ df[col_to_convert] = df[col_to_convert].astype(str)
+ elif new_type == "Integer":
+ df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64')
+ elif new_type == "Float":
+ df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce')
+ elif new_type == "Boolean":
+ df[col_to_convert] = df[col_to_convert].astype(bool)
+ elif new_type == "Datetime":
+ df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce')
+ elif new_type == "Category":
+ df[col_to_convert] = df[col_to_convert].astype('category')
+
+ cleaning_actions.append(f"Converted {col_to_convert} to {new_type}")
+ st.success("Data type converted successfully!")
+ except Exception as e:
+ st.error(f"Conversion failed: {str(e)}")
+
+ # 4. Outlier Detection & Handling
+ with st.expander("๐ Handle Outliers", expanded=True):
+ numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
+ if numeric_cols:
+ outlier_col = st.selectbox("Select numeric column", numeric_cols)
+ threshold = st.slider("Outlier Threshold (Z-Score)", 1.0, 5.0, 3.0)
+
+ z_scores = (df[outlier_col] - df[outlier_col].mean()) / df[outlier_col].std()
+ outliers = df[abs(z_scores) > threshold]
+
+ st.write(f"Detected {len(outliers)} outliers")
+ st.dataframe(outliers)
+
+ if st.button("Handle Outliers"):
+ df = df[abs(z_scores) <= threshold]
+ cleaning_actions.append(f"Removed {len(outliers)} outliers from {outlier_col}")
+ else:
+ st.info("No numeric columns found for outlier detection")
+
+ # 5. Text Cleaning
+ with st.expander("๐ Clean Text Data", expanded=True):
+ text_cols = df.select_dtypes(include='object').columns.tolist()
+ if text_cols:
+ text_col = st.selectbox("Select text column", text_cols)
+ options = st.multiselect("Text Cleaning Options", [
+ "Lowercase",
+ "Remove Punctuation",
+ "Remove Extra Spaces",
+ "Remove Stopwords",
+ "Stemming"
+ ])
+
+ if st.button("Clean Text"):
+ if "Lowercase" in options:
+ df[text_col] = df[text_col].str.lower()
+ if "Remove Punctuation" in options:
+ df[text_col] = df[text_col].str.replace(r'[^\w\s]', '', regex=True)
+ if "Remove Extra Spaces" in options:
+ df[text_col] = df[text_col].str.strip().str.replace(r'\s+', ' ', regex=True)
+ if "Remove Stopwords" in options:
+ from nltk.corpus import stopwords
+ stop_words = set(stopwords.words('english'))
+ df[text_col] = df[text_col].apply(
+ lambda x: ' '.join([word for word in x.split() if word not in stop_words])
+ )
+ if "Stemming" in options:
+ from nltk.stem import PorterStemmer
+ stemmer = PorterStemmer()
+ df[text_col] = df[text_col].apply(
+ lambda x: ' '.join([stemmer.stem(word) for word in x.split()])
+ )
+
+ cleaning_actions.append(f"Cleaned text in {text_col}")
+ st.success("Text cleaned successfully!")
+ else:
+ st.info("No text columns found for cleaning")
- if clean_action == "Handle Missing Values":
- columns_with_missing = df.columns[df.isnull().any()].tolist()
- column_to_impute = st.selectbox("Column to Impute", ["All Columns"] + columns_with_missing)
-
- method = st.selectbox("Imputation Method", [
- "KNN Imputation",
- "Median Fill",
- "Mean Fill",
- "Drop Missing",
- "Constant Value Fill"
- ])
- if method == "KNN Imputation":
- knn_neighbors = st.slider("KNN Neighbors", 2, 10, 5)
- elif method == "Constant Value Fill":
- constant_value = st.text_input("Constant Value")
-
- elif clean_action == "Clean Text":
- text_column = st.selectbox("Text Column", df.select_dtypes(include='object').columns)
- cleaning_operation = st.selectbox("Cleaning Operation", ["Remove Special Characters", "Lowercase", "Uppercase", "Remove Extra Spaces"])
- if cleaning_operation == "Remove Special Characters":
- chars_to_remove = st.text_input("Characters to Remove", r'[^a-zA-Z0-9\s]')
-
- elif clean_action == "Remove Columns":
- remove_cols = st.multiselect("Columns to Remove", df.columns) # Multiselect for column removal
-
- with col2:
- st.subheader("Data Preview") # Added Data Preview Section
- st.dataframe(df.head(10), use_container_width=True) # Display sample data
-
- if st.button("Apply Transformation"):
- with st.spinner("Applying changes..."):
- current_df = df.copy()
- # ... (your data history logic) ...
-
- if clean_action == "Handle Missing Values":
- if method == "KNN Imputation":
- imputer = KNNImputer(n_neighbors=knn_neighbors)
- if column_to_impute == "All Columns":
- current_df = pd.DataFrame(imputer.fit_transform(current_df), columns=current_df.columns)
- else:
- current_df[[column_to_impute]] = pd.DataFrame(imputer.fit_transform(current_df[[column_to_impute]]), columns=[column_to_impute])
- elif method == "Median Fill":
- if column_to_impute == "All Columns":
- current_df = current_df.fillna(current_df.median())
- else:
- current_df[column_to_impute] = current_df[column_to_impute].fillna(current_df[column_to_impute].median())
- elif method == "Mean Fill":
- if column_to_impute == "All Columns":
- current_df = current_df.fillna(current_df.mean())
- else:
- current_df[column_to_impute] = current_df[column_to_impute].fillna(current_df[column_to_impute].mean())
- elif method == "Constant Value Fill":
- if column_to_impute == "All Columns":
- current_df = current_df.fillna(constant_value)
- else:
- current_df[column_to_impute] = current_df[column_to_impute].fillna(constant_value)
- else:
- current_df = current_df.dropna()
+ # Save Cleaned Data
+ if st.button("๐พ Save Cleaned Data"):
+ st.session_state.cleaned_data = df
+ st.success("Cleaned data saved successfully!")
+
+ # Show Cleaning Log
+ st.subheader("๐ Cleaning Log")
+ if cleaning_actions:
+ st.write("### Applied Transformations")
+ for action in cleaning_actions:
+ st.write(f"- {action}")
+ else:
+ st.info("No transformations applied yet")
+
+# Advanced EDA Section
+elif app_mode == "Advanced EDA":
+ st.title("๐ Advanced Exploratory Data Analysis")
+ st.markdown("""
+ **Interactive Data Exploration** with advanced statistical tools and visualizations.
+ Uncover hidden patterns and relationships in your data.
+ """)
- elif clean_action == "Clean Text":
- import re # moved here since its only used here to avoid library bloat
+ if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
+ st.warning("Please clean your data in the Smart Cleaning section first.")
+ st.stop()
- def clean_text(text, operation, chars_to_remove=r'[^a-zA-Z0-9\s]'):
- if operation == "Remove Special Characters":
- text = re.sub(chars_to_remove, '', str(text))
- elif operation == "Lowercase":
- text = str(text).lower()
- elif operation == "Uppercase":
- text = str(text).upper()
- elif operation == "Remove Extra Spaces":
- text = " ".join(str(text).split())
- return text
+ df = st.session_state.cleaned_data.copy()
+
+ # Initialize session state for EDA configuration
+ if 'eda_config' not in st.session_state:
+ st.session_state.eda_config = {
+ 'plot_type': "Histogram",
+ 'x_col': df.columns[0] if len(df.columns) > 0 else None,
+ 'y_col': df.columns[1] if len(df.columns) > 1 else None,
+ 'z_col': df.columns[2] if len(df.columns) > 2 else None,
+ 'color_col': None,
+ 'size_col': None,
+ 'time_col': None,
+ 'value_col': None,
+ 'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5],
+ 'color_palette': "Viridis",
+ 'hover_data_cols': [],
+ 'filter_col': None,
+ 'filter_options': []
+ }
- current_df[text_column] = current_df[text_column].astype(str).apply(lambda x: clean_text(x, cleaning_operation, chars_to_remove))
+ # Data Filtering Section
+ with st.expander("๐ Data Filtering", expanded=True):
+ st.session_state.eda_config['filter_col'] = st.selectbox(
+ "Filter Column",
+ [None] + list(df.columns),
+ help="Choose a column to filter the data."
+ )
- elif clean_action == "Remove Columns":
- if remove_cols: #Check that it is not empty
- current_df = current_df.drop(columns=remove_cols) # Drop selected columns
+ if st.session_state.eda_config['filter_col']:
+ unique_values = df[st.session_state.eda_config['filter_col']].unique()
+ st.session_state.eda_config['filter_options'] = st.multiselect(
+ "Filter Values",
+ unique_values,
+ default=unique_values,
+ help=f"Select values from '{st.session_state.eda_config['filter_col']}'"
+ )
+ df = df[df[st.session_state.eda_config['filter_col']].isin(
+ st.session_state.eda_config['filter_options']
+ )]
+
+ # Visualization Type Selection
+ st.sidebar.header("๐ Visualization Configuration")
+ plot_types = [
+ "Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
+ "Correlation Heatmap", "Parallel Coordinates", "Andrews Curves",
+ "Pair Plot", "Density Contour", "3D Scatter", "Time Series",
+ "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
+ ]
+ st.session_state.eda_config['plot_type'] = st.sidebar.selectbox(
+ "Choose Visualization",
+ plot_types,
+ index=0
+ )
- st.session_state.cleaned_data = current_df
- st.success("Transformation applied!")
+ # Dynamic Controls Based on Plot Type
+ if st.session_state.eda_config['plot_type'] != "Correlation Heatmap":
+ st.session_state.eda_config['x_col'] = st.sidebar.selectbox(
+ "X Axis",
+ df.columns,
+ index=df.columns.get_loc(st.session_state.eda_config['x_col'])
+ if st.session_state.eda_config['x_col'] in df.columns else 0
+ )
- if st.button("Refresh Data Preview"): # Button to refresh data preview
- st.rerun()
-
-elif app_mode == "Advanced EDA":
- st.title("๐ Advanced Exploratory Analysis")
-
- if st.session_state.cleaned_data is not None:
- df = st.session_state.cleaned_data.copy()
-
- # Initialize session state for plot configuration
- if 'plot_config' not in st.session_state:
- st.session_state.plot_config = {
- 'plot_type': "Histogram",
- 'x_col': df.columns[0] if len(df.columns) > 0 else None,
- 'y_col': df.columns[1] if len(df.columns) > 1 else None,
- 'z_col': df.columns[2] if len(df.columns) > 2 else None,
- 'color_col': None,
- 'size_col': None,
- 'time_col': None,
- 'value_col': None,
- 'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5],
- 'color_palette': "#00f7ff",
- 'color_continuous_scale': "Viridis",
- 'hover_data_cols': [],
- 'filter_col': None,
- 'filter_options': []
- }
+ if st.session_state.eda_config['plot_type'] in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram"]:
+ st.session_state.eda_config['y_col'] = st.sidebar.selectbox(
+ "Y Axis",
+ df.columns,
+ index=df.columns.get_loc(st.session_state.eda_config['y_col'])
+ if st.session_state.eda_config['y_col'] in df.columns else 0
+ )
- # Data Filtering Section
- with st.expander("๐ Data Filtering", expanded=False):
- # Use direct session state assignment for reactivity
- st.session_state.plot_config['filter_col'] = st.selectbox(
- "Filter Column",
- [None] + list(df.columns),
- help="Choose a column to filter the data."
- )
+ if st.session_state.eda_config['plot_type'] == "3D Scatter":
+ st.session_state.eda_config['z_col'] = st.sidebar.selectbox(
+ "Z Axis",
+ df.columns,
+ index=df.columns.get_loc(st.session_state.eda_config['z_col'])
+ if st.session_state.eda_config['z_col'] in df.columns else 0
+ )
+ st.session_state.eda_config['color_col'] = st.sidebar.selectbox(
+ "Color by",
+ [None] + list(df.columns)
+ )
- if st.session_state.plot_config['filter_col']:
- unique_values = df[st.session_state.plot_config['filter_col']].unique()
- st.session_state.plot_config['filter_options'] = st.multiselect(
- "Filter Values",
- unique_values,
- default=unique_values,
- help=f"Select values from '{st.session_state.plot_config['filter_col']}'"
- )
- df = df[df[st.session_state.plot_config['filter_col']].isin(
- st.session_state.plot_config['filter_options']
- )]
-
- # Visualization Configuration
- st.sidebar.header("๐ Plot Configuration")
-
- # Plot type selector
- st.session_state.plot_config['plot_type'] = st.sidebar.selectbox(
- "Choose Visualization",
- [
- "Histogram", "Scatter Plot", "Box Plot",
- "Correlation Heatmap", "3D Scatter",
- "Violin Plot", "Time Series", "Scatter Matrix"
- ],
- index=0 # Reset to first option when plot type changes
+ # Advanced Plot Customization
+ with st.expander("๐จ Advanced Customization", expanded=False):
+ st.session_state.eda_config['color_palette'] = st.selectbox(
+ "Color Palette",
+ ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"]
+ )
+ st.session_state.eda_config['hover_data_cols'] = st.multiselect(
+ "Hover Data",
+ df.columns
)
- # Dynamic controls based on plot type
- if st.session_state.plot_config['plot_type'] != "Correlation Heatmap":
- st.session_state.plot_config['x_col'] = st.sidebar.selectbox(
- "X Axis",
- df.columns,
- index=df.columns.get_loc(st.session_state.plot_config['x_col'])
- if st.session_state.plot_config['x_col'] in df.columns else 0
+ # Plot Generation
+ try:
+ fig = None
+ config = st.session_state.eda_config
+
+ if config['plot_type'] == "Histogram":
+ fig = px.histogram(
+ df, x=config['x_col'], y=config['y_col'],
+ nbins=30, template="plotly_dark",
+ # palette names like "Viridis" are colorscales, not single colors
+ color_discrete_sequence=getattr(px.colors.sequential, config['color_palette'], None)
+ )
- if st.session_state.plot_config['plot_type'] in ["Scatter Plot", "Box Plot",
- "Violin Plot", "Time Series",
- "3D Scatter", "Histogram"]:
- st.session_state.plot_config['y_col'] = st.sidebar.selectbox(
- "Y Axis",
- df.columns,
- index=df.columns.get_loc(st.session_state.plot_config['y_col'])
- if st.session_state.plot_config['y_col'] in df.columns else 0
+ elif config['plot_type'] == "Scatter Plot":
+ fig = px.scatter(
+ df, x=config['x_col'], y=config['y_col'],
+ color=config['color_col'],
+ size=config['size_col'],
+ hover_data=config['hover_data_cols']
)
- if st.session_state.plot_config['plot_type'] == "3D Scatter":
- st.session_state.plot_config['z_col'] = st.sidebar.selectbox(
- "Z Axis",
- df.columns,
- index=df.columns.get_loc(st.session_state.plot_config['z_col'])
- if st.session_state.plot_config['z_col'] in df.columns else 0
- )
- st.session_state.plot_config['color_col'] = st.sidebar.selectbox(
- "Color by",
- [None] + list(df.columns)
+ elif config['plot_type'] == "3D Scatter":
+ fig = px.scatter_3d(
+ df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
+ color=config['color_col'],
+ color_discrete_sequence=getattr(px.colors.sequential, config['color_palette'], None)
+ )
- # Color configuration
- if st.session_state.plot_config['plot_type'] == "Correlation Heatmap":
- st.session_state.plot_config['color_continuous_scale'] = st.sidebar.selectbox(
- "Color Scale",
- ['Viridis', 'Plasma', 'Magma', 'Cividis', 'RdBu']
- )
- else:
- st.session_state.plot_config['color_palette'] = st.sidebar.selectbox(
- "Color Palette",
- ['#00f7ff', '#ff00ff', '#f70000', '#0000f7']
- )
+ elif config['plot_type'] == "Correlation Heatmap":
+ numeric_df = df.select_dtypes(include=np.number)
+ if not numeric_df.empty:
+ corr = numeric_df.corr()
+ fig = px.imshow(
+ corr, text_auto=True,
+ color_continuous_scale=config['color_palette']
+ )
+ else:
+ st.warning("No numerical columns found for correlation heatmap.")
- # Additional configurations
- if st.session_state.plot_config['plot_type'] == "Scatter Plot":
- st.session_state.plot_config['size_col'] = st.sidebar.selectbox(
- "Size by",
- [None] + list(df.columns)
- )
- st.session_state.plot_config['hover_data_cols'] = st.sidebar.multiselect(
- "Hover Data",
- df.columns
+ elif config['plot_type'] == "Box Plot":
+ fig = px.box(
+ df, x=config['x_col'], y=config['y_col'],
+ color=config['color_col']
)
- if st.session_state.plot_config['plot_type'] == "Time Series":
- st.session_state.plot_config['time_col'] = st.sidebar.selectbox(
- "Time Column",
- df.columns
- )
- st.session_state.plot_config['value_col'] = st.sidebar.selectbox(
- "Value Column",
- df.columns
+ elif config['plot_type'] == "Violin Plot":
+ fig = px.violin(
+ df, x=config['x_col'], y=config['y_col'],
+ box=True, points="all",
+ color=config['color_col']
)
- if st.session_state.plot_config['plot_type'] == "Scatter Matrix":
- st.session_state.plot_config['scatter_matrix_cols'] = st.multiselect(
- "Columns for Scatter Matrix",
- df.select_dtypes(include=np.number).columns,
- default=st.session_state.plot_config['scatter_matrix_cols']
+ elif config['plot_type'] == "Time Series":
+ # no dedicated time/value selectors exist in this view, so fall back to the X/Y choices
+ time_col = config['time_col'] or config['x_col']
+ value_col = config['value_col'] or config['y_col']
+ df = df.sort_values(by=time_col)
+ fig = px.line(
+ df, x=time_col, y=value_col,
+ color=config['color_col']
+ )
- # Plot generation
- try:
- fig = None
- config = st.session_state.plot_config
-
- if config['plot_type'] == "Histogram":
- fig = px.histogram(
- df, x=config['x_col'], y=config['y_col'],
- nbins=30, template="plotly_dark",
- color_discrete_sequence=[config['color_palette']]
- )
-
- elif config['plot_type'] == "Scatter Plot":
- fig = px.scatter(
- df, x=config['x_col'], y=config['y_col'],
- color_discrete_sequence=[config['color_palette']],
- size=config['size_col'],
- hover_data=config['hover_data_cols']
- )
-
- elif config['plot_type'] == "3D Scatter":
- fig = px.scatter_3d(
- df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
- color=config['color_col'],
- color_discrete_sequence=[config['color_palette']]
- )
-
- elif config['plot_type'] == "Correlation Heatmap":
- numeric_df = df.select_dtypes(include=np.number)
- if not numeric_df.empty:
- corr = numeric_df.corr()
- fig = px.imshow(
- corr, text_auto=True,
- color_continuous_scale=config['color_continuous_scale']
- )
- else:
- st.warning("No numerical columns found for correlation heatmap.")
-
- elif config['plot_type'] == "Box Plot":
- fig = px.box(
- df, x=config['x_col'], y=config['y_col'],
- color_discrete_sequence=[config['color_palette']]
- )
-
- elif config['plot_type'] == "Violin Plot":
- fig = px.violin(
- df, x=config['x_col'], y=config['y_col'],
- box=True, points="all",
- color_discrete_sequence=[config['color_palette']]
- )
-
- elif config['plot_type'] == "Time Series":
- df = df.sort_values(by=config['time_col'])
- fig = px.line(
- df, x=config['time_col'], y=config['value_col'],
- color_discrete_sequence=[config['color_palette']]
- )
+ elif config['plot_type'] == "Scatter Matrix":
+ fig = px.scatter_matrix(
+ df, dimensions=config['scatter_matrix_cols'],
+ color=config['color_col']
+ )
- elif config['plot_type'] == "Scatter Matrix":
- fig = px.scatter_matrix(
- df, dimensions=config['scatter_matrix_cols'],
- color_discrete_sequence=[config['color_palette']]
- )
+ if fig:
+ st.plotly_chart(fig, use_container_width=True)
+ except Exception as e:
+ st.error(f"An error occurred while generating the plot: {e}")
+
+ # Statistical Analysis Section
+ with st.expander("๐ Statistical Analysis", expanded=True):
+ analysis_type = st.selectbox("Select Analysis Type", [
+ "Descriptive Statistics",
+ "Correlation Analysis",
+ "Hypothesis Testing",
+ "Distribution Fitting"
+ ])
- if fig:
- st.plotly_chart(fig, use_container_width=True)
- except Exception as e:
- st.error(f"An error occurred while generating the plot: {e}")
+ if analysis_type == "Descriptive Statistics":
+ st.write(df.describe(include='all'))
- with st.expander("๐งช Hypothesis Testing"):
- test_type = st.selectbox("Select Test Type", ["T-test", "Chi-Squared Test"])
+ elif analysis_type == "Correlation Analysis":
+ numeric_cols = df.select_dtypes(include=np.number).columns
+ if len(numeric_cols) >= 2:
+ corr_method = st.selectbox("Correlation Method", [
+ "Pearson", "Kendall", "Spearman"
+ ])
+ corr_matrix = df[numeric_cols].corr(method=corr_method.lower())
+ st.write(corr_matrix)
+ fig_corr = px.imshow(corr_matrix, text_auto=True, color_continuous_scale=st.session_state.eda_config['color_palette'])
+ st.plotly_chart(fig_corr, use_container_width=True)
+ else:
+ st.warning("Need at least 2 numeric columns for correlation analysis")
+ elif analysis_type == "Hypothesis Testing":
+ test_type = st.selectbox("Select Test Type", [
+ "T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U"
+ ])
if test_type == "T-test":
col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
if st.button("Run T-test"):
- # Example: Split data by category and perform t-test
- try:
- groups = df.groupby(col2)[col1].apply(list)
- if len(groups) == 2:
- t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
- st.write(f"T-statistic: {t_stat:.4f}")
- st.write(f"P-value: {p_value:.4f}")
- if p_value < 0.05:
- st.write("Reject the null hypothesis.")
- else:
- st.write("Fail to reject the null hypothesis.")
+ groups = df.groupby(col2)[col1].apply(list)
+ if len(groups) == 2:
+ t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
+ st.write(f"T-statistic: {t_stat:.4f}")
+ st.write(f"P-value: {p_value:.4f}")
+ if p_value < 0.05:
+ st.write("Reject the null hypothesis.")
else:
- st.write("Select a categorical column with exactly two categories.")
- except Exception as e:
- st.error(f"An error occurred during the T-test: {e}")
-
+ st.write("Fail to reject the null hypothesis.")
+ else:
+ st.write("Select a categorical column with exactly two categories.")
+
+ elif analysis_type == "Distribution Fitting":
+ numeric_col = st.selectbox("Select Numeric Column", df.select_dtypes(include=np.number).columns)
+ dist_types = ["Normal", "Log-Normal", "Exponential", "Gamma"]
+ selected_dist = st.selectbox("Select Distribution Type", dist_types)
+ if st.button("Fit Distribution"):
+ from scipy.stats import norm, lognorm, expon, gamma
+ dist_functions = {
+ "Normal": norm,
+ "Log-Normal": lognorm,
+ "Exponential": expon,
+ "Gamma": gamma
+ }
+ params = dist_functions[selected_dist].fit(df[numeric_col].dropna())
+ st.write(f"Fitted Parameters: {params}")
+
+ # Data Profiling Section
+ with st.expander("๐ Generate Full Data Profile", expanded=False):
+ if st.button("๐ Generate Comprehensive Report"):
+ with st.spinner("Generating report..."):
+ pr = ProfileReport(df, explorative=True)
+ st_profile_report(pr)
+
+# Model Training Section
elif app_mode == "Model Training":
- st.title("๐ Model Training")
-
- if st.session_state.cleaned_data is not None:
- df = st.session_state.cleaned_data.copy()
-
- # Initialize session state for train/test split
- if 'X_train_selected' not in st.session_state:
- st.session_state.X_train_selected = None
- st.session_state.X_test_selected = None
- st.session_state.y_train = None
- st.session_state.y_test = None
- st.session_state.model = None # Initialize model in session state
-
- # Target Variable Selection
- target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
-
- # Problem Type Selection
- problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of problem.")
-
- # Feature Selection
- feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose features for training.")
-
- # Model Selection - Dynamic based on Problem Type
- if problem_type == "Regression":
- model_options = ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM"]
- else: # Classification
- model_options = ["Logistic Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Naive Bayes", "KNN"]
-
- model_name = st.selectbox("Select Model", model_options, help="Choose a model.")
-
- feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
-
- # Hyperparameter Tuning - Dynamic based on Model Selection
- param_grid = {} # Initialize to empty dictionary
-
- #Define different paramter values for the model so it works. This is not an optimized number
- #The goal is to make sure that all visualizations and graphs work as is.
+ st.title("๐ Model Training Studio")
+ st.markdown("""
+ **Train and Evaluate Machine Learning Models** with advanced hyperparameter tuning and performance tracking.
+ Choose from a wide range of algorithms and configurations.
+ """)
+
+ if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
+ st.warning("Please clean your data in the Smart Cleaning section first.")
+ st.stop()
+
+ df = st.session_state.cleaned_data.copy()
+
+ # Target Variable Selection
+ st.subheader("๐ฏ Target Variable")
+ target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
+
+ # Problem Type Selection
+ st.subheader("๐ Problem Type")
+ problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of machine learning problem.")
+
+ # Feature Selection
+ st.subheader("๐ง Feature Selection")
+ use_all_features = st.checkbox("Use All Features", value=True, help="Select to use all features for training. Deselect to manually choose features.")
+ if use_all_features:
+ feature_columns = df.drop(columns=[target_column]).columns.tolist()
+ else:
+ feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose the features you want to use for prediction.")
+
+ # Model Selection
+ st.subheader("๐ค Model Selection")
+ if problem_type == "Regression":
+ model_options = ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network"]
+ else: # Classification
+ model_options = ["Logistic Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network", "KNN", "Naive Bayes"]
+ model_name = st.selectbox("Select Model", model_options, help="Choose a model.")
+
+ # Hyperparameter Tuning
+ st.subheader("๐๏ธ Hyperparameter Tuning")
+ with st.expander("Configure Hyperparameters", expanded=True):
if model_name == "Random Forest":
- st.subheader("Random Forest Hyperparameters")
- param_grid = {
- 'n_estimators': list(range(100, 101)), #Used 100 so model is trained and not empty and all visuals work
-
- 'max_depth': list(range(10,11)), #default value 10 so its in model
- 'min_samples_split': list(range(2,3)), #New hyperparameter default 2
- 'min_samples_leaf': list(range(1,2)), #New hyperparameter default 1
+ n_estimators = st.slider("Number of Estimators", 10, 200, 100)
+ max_depth = st.slider("Max Depth", 3, 20, 10)
+ min_samples_split = st.slider("Min Samples Split", 2, 10, 2)
+ min_samples_leaf = st.slider("Min Samples Leaf", 1, 10, 1)
+ hyperparams = {
+ 'n_estimators': n_estimators,
+ 'max_depth': max_depth,
+ 'min_samples_split': min_samples_split,
+ 'min_samples_leaf': min_samples_leaf
}
-
elif model_name == "Gradient Boosting":
- st.subheader("Gradient Boosting Hyperparameters")
- param_grid = {
- 'n_estimators': list(range(100, 101)),
- 'learning_rate': [0.1],
- 'max_depth': list(range(3,4))
-
+ learning_rate = st.slider("Learning Rate", 0.01, 1.0, 0.1)
+ n_estimators = st.slider("Number of Estimators", 10, 200, 100)
+ max_depth = st.slider("Max Depth", 3, 20, 10)
+ hyperparams = {
+ 'learning_rate': learning_rate,
+ 'n_estimators': n_estimators,
+ 'max_depth': max_depth
}
-
- elif model_name == "Decision Tree":
- st.subheader("Decision Tree Hyperparameters")
- param_grid = {
- 'criterion': ["gini"],
- 'max_depth': list(range(3,4)),
+ elif model_name == "Neural Network":
+ hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2)
+ neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50)
+ epochs = st.slider("Epochs", 10, 200, 50)
+ batch_size = st.slider("Batch Size", 16, 128, 32)
+ hyperparams = {
+ 'hidden_layers': hidden_layers,
+ 'neurons_per_layer': neurons_per_layer,
+ 'epochs': epochs,
+ 'batch_size': batch_size
}
+ else:
+ hyperparams = {}
- # Train-Test Split
- test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")
+ # Train-Test Split
+ st.subheader("โ๏ธ Train-Test Split")
+ test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")
- if st.button("Train Model"):
- with st.spinner("Training model..."):
- try:
- X = df[feature_columns]
- y = df[target_column]
+ # Model Training
+ if st.button("๐ Train Model"):
+ with st.spinner("Training model..."):
+ try:
+ X = df[feature_columns]
+ y = df[target_column]
- # Check if X is empty
- if X.empty:
- st.error("No features were selected. Please select feature columns.")
- st.stop()
+ # Check if X is empty
+ if X.empty:
+ st.error("No features were selected. Please select feature columns.")
+ st.stop()
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
+ # Train-Test Split
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
- # Preprocessing Pipeline
- numeric_features = X.select_dtypes(include=np.number).columns
- categorical_features = X.select_dtypes(exclude=np.number).columns
+ # Preprocessing Pipeline
+ numeric_features = X.select_dtypes(include=np.number).columns
+ categorical_features = X.select_dtypes(exclude=np.number).columns
- numeric_transformer = Pipeline(steps=[
- ('imputer', SimpleImputer(strategy='median')),
- ('scaler', StandardScaler())
- ])
+ numeric_transformer = Pipeline(steps=[
+ ('imputer', SimpleImputer(strategy='median')),
+ ('scaler', StandardScaler())
+ ])
- categorical_transformer = Pipeline(steps=[
- ('imputer', SimpleImputer(strategy='most_frequent')),
- ('onehot', OneHotEncoder(handle_unknown='ignore'))
+ categorical_transformer = Pipeline(steps=[
+ ('imputer', SimpleImputer(strategy='most_frequent')),
+ ('onehot', OneHotEncoder(handle_unknown='ignore'))
+ ])
+
+ preprocessor = ColumnTransformer(
+ transformers=[
+ ('num', numeric_transformer, numeric_features),
+ ('cat', categorical_transformer, categorical_features)
])
- preprocessor = ColumnTransformer(
- transformers=[
- ('num', numeric_transformer, numeric_features),
- ('cat', categorical_transformer, categorical_features)
- ])
-
- X_train_processed = preprocessor.fit_transform(X_train)
- X_test_processed = preprocessor.transform(X_test)
-
- #Feature Selection
- if feature_selection_method == "SelectKBest":
- k = st.slider("Number of Features to Select", 1, len(feature_columns), len(feature_columns), key = "featureselector")
- selector = SelectKBest(k=k)
- X_train_selected = selector.fit_transform(X_train_processed, y_train)
- X_test_selected = selector.transform(X_test_processed)
- else:
- X_train_selected = X_train_processed
- X_test_selected = X_test_processed
-
- # Model Training and Hyperparameter Tuning
- if model_name == "Linear Regression":
- model = LinearRegression()
- model.fit(X_train_selected, y_train)
-
- elif model_name == "Logistic Regression":
- model = LogisticRegression(max_iter=1000)
- model.fit(X_train_selected, y_train)
- elif model_name == "Decision Tree":
- if problem_type == "Regression":
- model = DecisionTreeRegressor()
- model.fit(X_train_selected, y_train)
- else:
- model = DecisionTreeClassifier()
- model.fit(X_train_selected, y_train)
- elif model_name == "Random Forest":
- if problem_type == "Regression":
- model = RandomForestRegressor(random_state=42)
- if 'param_grid' in locals() and param_grid: #added param_grid not empty condition
- grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error') # Example scoring
- grid_search.fit(X_train_selected, y_train)
- model = grid_search.best_estimator_
- st.write("Best Parameters:", grid_search.best_params_)
- else:
- model = RandomForestRegressor(random_state=42) #define if no param_grid
- model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined.
+ X_train_processed = preprocessor.fit_transform(X_train)
+ X_test_processed = preprocessor.transform(X_test)
- else:
- model = RandomForestClassifier(random_state=42)
- if 'param_grid' in locals()and param_grid: #added param_grid not empty condition
- grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
- grid_search.fit(X_train_selected, y_train)
- model = grid_search.best_estimator_
- st.write("Best Parameters:", grid_search.best_params_)
- else:
- model = RandomForestClassifier(random_state=42) #define if no param_grid
- model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined
- elif model_name == "Gradient Boosting":
- from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier #moved import here to avoid bloat
- model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
- model.fit(X_train_selected, y_train)
- elif model_name == "SVM":
- model = SVR() if problem_type == "Regression" else SVC()
- model.fit(X_train_selected, y_train)
- elif model_name == "Naive Bayes":
- from sklearn.naive_bayes import GaussianNB
- model = GaussianNB()
- model.fit(X_train_selected, y_train)
- elif model_name == "KNN":
- from sklearn.neighbors import KNeighborsClassifier
- model = KNeighborsClassifier()
- model.fit(X_train_selected, y_train)
-
- # Store model and preprocessor
- st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
- st.session_state.preprocessor = preprocessor
-
- #Store the test data
- st.session_state.X_train_selected = X_train_selected
- st.session_state.X_test_selected = X_test_selected
- st.session_state.y_train = y_train
- st.session_state.y_test = y_test
-
- # Model Evaluation
- y_pred = model.predict(X_test_selected)
+ # Model Training
+ if model_name == "Linear Regression":
+ model = LinearRegression()
+ elif model_name == "Logistic Regression":
+ model = LogisticRegression(max_iter=1000)
+ elif model_name == "Decision Tree":
if problem_type == "Regression":
- mse = mean_squared_error(y_test, y_pred)
- r2 = r2_score(y_test, y_pred)
- st.write(f"Mean Squared Error: {mse:.4f}")
- st.write(f"R-squared: {r2:.4f}")
+ model = DecisionTreeRegressor()
else:
- from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
- import seaborn as sns
- import matplotlib.pyplot as plt #Added import statement
- import numpy as np
- import pandas as pd
- from sklearn.model_selection import learning_curve, validation_curve
-
- #Weighted averaging for metrics for multiclass
- average_method = "weighted" #changed from None
-
- accuracy = accuracy_score(y_test, y_pred)
- precision = precision_score(y_test, y_pred, average = average_method, zero_division = 0)
- recall = recall_score(y_test, y_pred, average = average_method, zero_division = 0)
- f1 = f1_score(y_test, y_pred, average = average_method, zero_division = 0)
- st.write(f"Accuracy: {accuracy:.4f}")
- st.write(f"Precision: {precision:.4f}")
- st.write(f"Recall: {recall:.4f}")
- st.write(f"F1 Score: {f1:.4f}")
- st.write("Classification Report:")
- st.text(classification_report(y_test, y_pred, zero_division = 0))
-
-
- #Confusion Matrix
-
- conf_matrix = confusion_matrix(y_test, y_pred)
-
- #Heatmap
- fig_conf, ax_conf = plt.subplots()
- sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_conf)
- ax_conf.set_xlabel('Predicted Labels')
- ax_conf.set_ylabel('True Labels')
- ax_conf.set_title('Confusion Matrix')
- st.pyplot(fig_conf)
-
-
- #Added section for model visualization
- st.subheader("Model Visualization")
- #Use conditional to make sure that everything only executes when the data set is trained and not outside of it.
- if st.session_state.model is not None: #Make sure that everything only executes when data set is trained and not outside of it.
- try: #All the plotting code here.
- if model_name in ["Random Forest", "Gradient Boosting"]:#Used list to define models.
- #Make sure you use this inside of a conditional for classification, model, and tree based model.
-
- #Feature Importance (Tree-based Models)
-
- importances = model.feature_importances_ # Assumed tree-based model
- feat_importances = pd.Series(importances, index=X_train.columns)
- feat_importances = feat_importances.nlargest(20)
-
- fig_feat, ax_feat = plt.subplots()
- feat_importances.plot(kind='barh', ax=ax_feat)
- ax_feat.set_xlabel('Relative Importance')
- ax_feat.set_ylabel('Features')
- ax_feat.set_title('Feature Importances')
- st.pyplot(fig_feat)
-
- #Create data that determines the learning and validation curve and what we have to add
- train_sizes, train_scores, valid_scores = learning_curve(model, X_train_selected, y_train, cv=5, scoring='accuracy' if problem_type =="Classification" else 'neg_mean_squared_error', n_jobs=-1) #Define cross validation for run
-
- #Then add a plot for the learning curve and use st.pyplot
- train_mean = np.mean(train_scores, axis=1)
- train_std = np.std(train_scores, axis=1)
- valid_mean = np.mean(valid_scores, axis=1)
- valid_std = np.std(valid_scores, axis=1)
-
- #Plot each of the variables that has to be used.
-
- fig_lc, ax_lc = plt.subplots()
- ax_lc.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training ' + ('Accuracy' if problem_type == "Classification" else "Neg MSE"))
- ax_lc.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
- ax_lc.plot(train_sizes, valid_mean, color='green', linestyle='--', marker='s', markersize=5, label='Validation ' + ('Accuracy' if problem_type == "Classification" else "Neg MSE"))
- ax_lc.fill_between(train_sizes, valid_mean + valid_std, valid_mean - valid_std, alpha=0.15, color='green')
-
- ax_lc.set_title('Learning Curves')
- ax_lc.set_xlabel('Training Set Size')
- ax_lc.set_ylabel('Score')
- ax_lc.legend(loc='best')
- st.pyplot(fig_lc)
-
- except Exception as e: #Local error
- st.write(f"Visuals are only available for tree based models or if models are selected prior: {e}") #Write only if error
-
- except Exception as e:
- st.error(f"An error occurred: {e}")
-
- else:
- st.write("Please upload and clean data first.")
+ model = DecisionTreeClassifier()
+ elif model_name == "Random Forest":
+ if problem_type == "Regression":
+ model = RandomForestRegressor(**hyperparams)
+ else:
+ model = RandomForestClassifier(**hyperparams)
+ elif model_name == "Gradient Boosting":
+ if problem_type == "Regression":
+ model = GradientBoostingRegressor(**hyperparams)
+ else:
+ model = GradientBoostingClassifier(**hyperparams)
+ elif model_name == "SVM":
+ if problem_type == "Regression":
+ model = SVR()
+ else:
+ model = SVC()
+ elif model_name == "Neural Network":
+ if problem_type == "Regression":
+ model = MLPRegressor(
+ hidden_layer_sizes=[hyperparams['neurons_per_layer']] * hyperparams['hidden_layers'],
+ max_iter=hyperparams['epochs'],
+ batch_size=hyperparams['batch_size']
+ )
+ else:
+ model = MLPClassifier(
+ hidden_layer_sizes=[hyperparams['neurons_per_layer']] * hyperparams['hidden_layers'],
+ max_iter=hyperparams['epochs'],
+ batch_size=hyperparams['batch_size']
+ )
+ elif model_name == "KNN":
+ model = KNeighborsClassifier()
+ elif model_name == "Naive Bayes":
+ model = GaussianNB()
+
+ # Train the model
+ model.fit(X_train_processed, y_train)
+
+ # Store model and preprocessor
+ st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
+ st.session_state.preprocessor = preprocessor
+
+ # Store the test data
+ st.session_state.X_train_selected = X_train_processed
+ st.session_state.X_test_selected = X_test_processed
+ st.session_state.y_train = y_train
+ st.session_state.y_test = y_test
+
+ # Model Evaluation
+ y_pred = model.predict(X_test_processed)
+ if problem_type == "Regression":
+ mse = mean_squared_error(y_test, y_pred)
+ rmse = np.sqrt(mse)
+ mae = mean_absolute_error(y_test, y_pred)
+ r2 = r2_score(y_test, y_pred)
+ st.write(f"Mean Squared Error: {mse:.4f}")
+ st.write(f"Root Mean Squared Error: {rmse:.4f}")
+ st.write(f"Mean Absolute Error: {mae:.4f}")
+ st.write(f"R-squared: {r2:.4f}")
+ else:
+ accuracy = accuracy_score(y_test, y_pred)
+ precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
+ recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
+ f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
+ st.write(f"Accuracy: {accuracy:.4f}")
+ st.write(f"Precision: {precision:.4f}")
+ st.write(f"Recall: {recall:.4f}")
+ st.write(f"F1 Score: {f1:.4f}")
+ st.write("Classification Report:")
+ st.text(classification_report(y_test, y_pred))
+
+ # Visualization
+ st.subheader("๐ Model Performance Visualization")
+ if problem_type == "Regression":
+ fig, ax = plt.subplots()
+ ax.scatter(y_test, y_pred)
+ ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
+ ax.set_xlabel('Actual')
+ ax.set_ylabel('Predicted')
+ ax.set_title('Actual vs Predicted')
+ st.pyplot(fig)
+ else:
+ conf_matrix = confusion_matrix(y_test, y_pred)
+ fig, ax = plt.subplots()
+ sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
+ ax.set_xlabel('Predicted Labels')
+ ax.set_ylabel('True Labels')
+ ax.set_title('Confusion Matrix')
+ st.pyplot(fig)
+
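+ # Illustrative sketch: learning_curve (imported above) could be used here to
+ # inspect over/under-fitting, e.g. for the tree ensembles:
+ if model_name in ["Random Forest", "Gradient Boosting"]:
+ lc_scoring = 'accuracy' if problem_type == "Classification" else 'neg_mean_squared_error'
+ train_sizes, train_scores, valid_scores = learning_curve(model, X_train_processed, y_train, cv=5, scoring=lc_scoring, n_jobs=-1)
+ fig_lc, ax_lc = plt.subplots()
+ ax_lc.plot(train_sizes, train_scores.mean(axis=1), 'o-', label='Training score')
+ ax_lc.plot(train_sizes, valid_scores.mean(axis=1), 's--', label='Validation score')
+ ax_lc.set_xlabel('Training set size')
+ ax_lc.set_ylabel(lc_scoring)
+ ax_lc.legend(loc='best')
+ st.pyplot(fig_lc)
+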
+ st.success("Model trained successfully!")
+ except Exception as e:
+ st.error(f"An error occurred during training: {e}")
- # Model Saving
+ # Model Saving
+ if st.session_state.get("model") is not None:
+ st.subheader("๐พ Save Model")
model_filename = st.text_input("Enter Model Filename (without extension)", "trained_model")
if st.button("Save Model"):
try:
@@ -868,498 +864,579 @@ elif app_mode == "Model Training":
st.success(f"Model saved as {model_filename}.joblib")
except Exception as e:
st.error(f"Error saving model: {e}")
- # Model loading in a different section
- model_file = st.file_uploader("Upload Trained Model", type=["joblib"])
- if model_file is not None:
- try:
- st.session_state.model = joblib.load(model_file)
- st.success("Model loaded successfully!")
- except Exception as e:
- st.error(f"Error loading model: {e}")
- #Model Evaluation Section - run on the saved model
- if st.session_state.model is not None and st.session_state.X_test_selected is not None: # added check to make sure it is a loaded model
- try:
- y_pred = st.session_state.model.predict(st.session_state.X_test_selected) # load from stored
+# Visualization Lab Section
+elif app_mode == "Visualization Lab":
+ st.title("๐ฌ Visualization Lab")
+ st.markdown("""
+ **Explore and Visualize Your Data** with advanced plotting tools and interactive visualizations.
+ Uncover hidden patterns and relationships in your data.
+ """)
+
+ if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
+ st.warning("Please clean your data in the Smart Cleaning section first.")
+ st.stop()
+
+ df = st.session_state.cleaned_data.copy()
+
+ # Visualization Type Selection
+ st.subheader("๐ Choose Visualization Type")
+ plot_types = [
+ "Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
+ "Correlation Heatmap", "Parallel Coordinates", "Andrews Curves",
+ "Pair Plot", "Density Contour", "3D Scatter", "Time Series",
+ "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
+ ]
+ plot_type = st.selectbox("Select Visualization Type", plot_types)
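+ # Note: only a subset of the listed plot types is implemented in the plot-generation block below;
+ # unimplemented selections fall through to an informational message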
+
+ # Dynamic Controls Based on Plot Type
+ if plot_type != "Correlation Heatmap":
+ x_col = st.selectbox("X Axis", df.columns)
+
+ if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram"]:
+ y_col = st.selectbox("Y Axis", df.columns)
+
+ if plot_type == "3D Scatter":
+ z_col = st.selectbox("Z Axis", df.columns)
+ color_col = st.selectbox("Color by", [None] + list(df.columns))
+
+ # Advanced Plot Customization
+ with st.expander("๐จ Advanced Customization", expanded=False):
+ color_palette = st.selectbox("Color Palette", ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"])
+ hover_data_cols = st.multiselect("Hover Data", df.columns)
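+ # color_palette holds a Plotly continuous colorscale name; it is only applied where a continuous scale makes sense (e.g. the correlation heatmaps)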
+
+ # Plot Generation
+ try:
+ fig = None
- if problem_type == "Regression":
- mse = mean_squared_error(st.session_state.y_test, y_pred)
- r2 = r2_score(st.session_state.y_test, y_pred)
- st.write(f"Mean Squared Error: {mse:.4f}")
- st.write(f"R-squared: {r2:.4f}")
- else:
- from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
- accuracy = accuracy_score(st.session_state.y_test, y_pred)
- st.write(f"Accuracy: {accuracy:.4f}")
- except Exception as e: #local error
- st.error(f"An error occurred during model evaluation: {e}")
-
-elif app_mode == "Predictions":
- st.title("๐ฎ Make Predictions")
+ if plot_type == "Histogram":
+ fig = px.histogram(
+ df, x=x_col, y=y_col,
+ nbins=30, template="plotly_dark",
+ color_discrete_sequence=[color_palette]
+ )
- if st.session_state.model is not None and st.session_state.cleaned_data is not None:
- df = st.session_state.cleaned_data.copy()
- model = st.session_state.model.steps[-1][1] #Define model from the state
+ elif plot_type == "Scatter Plot":
+ fig = px.scatter(
+ df, x=x_col, y=y_col,
+ color=color_col,
+ size=hover_data_cols,
+ hover_data=hover_data_cols
+ )
- try:
- numeric_transformer_columns = st.session_state.model.steps[0][1].transformers_[0][2] if hasattr(st.session_state.model.steps[0][1].transformers_[0][2], '__len__') else []
- categorical_transformer_columns = st.session_state.model.steps[0][1].transformers_[1][2] if hasattr(st.session_state.model.steps[0][1].transformers_[1][2], '__len__') else []
- model_columns = numeric_transformer_columns + categorical_transformer_columns
- except AttributeError as e:
- st.error(f"Error accessing model transformers: {e}. Please ensure a valid model is trained and loaded.")
- st.stop()
+ elif plot_type == "3D Scatter":
+ fig = px.scatter_3d(
+ df, x=x_col, y=y_col, z=z_col,
+ color=color_col,
+ color_discrete_sequence=[color_palette]
+ )
- model_is_classification = hasattr(model, 'predict_proba') # Check for classification or other problem
- if not set(model_columns).issubset(set(df.columns)): #Fixed comparison
- st.error("The model was trained on a dataframe that contains different columns than the currently uploaded dataframe. Please upload the correct dataframe.")
- st.stop()
-
- input_data = {}
- st.subheader("Enter Data for Prediction")
- for col in model_columns:
- if pd.api.types.is_numeric_dtype(df[col]):
- input_data[col] = st.number_input(f"Enter {col}", value=df[col].mean())
+ elif plot_type == "Correlation Heatmap":
+ numeric_df = df.select_dtypes(include=np.number)
+ if not numeric_df.empty:
+ corr = numeric_df.corr()
+ fig = px.imshow(
+ corr, text_auto=True,
+ color_continuous_scale=color_palette
+ )
else:
- input_data[col] = st.selectbox(f"Select {col}", df[col].unique())
-
- # Prediction Button
- if st.button("Make Prediction"):
- try:
- input_df = pd.DataFrame([input_data])
- #Preprocess for model
- input_processed = st.session_state.preprocessor.transform(input_df)
- prediction = st.session_state.model.predict(input_processed)[0]
- st.subheader("Prediction Result")
- st.write(f"The predicted value is: {prediction}")
-
- # Show shap values chart
- show_shap_values = st.checkbox("View SHAP Explanation") #select model to show shap values
-
-
- if show_shap_values and model_is_classification and model_name not in ["Linear Regression","Logistic Regression","SVM","Naive Bayes", "KNN"]:#Show shap values if this can perform.
-
- try:
- import shap #Import lib
- explainer = shap.TreeExplainer(st.session_state.model.steps[-1][1]) #Used tree model because these are easily visualized
-
- shap_values = explainer.shap_values(input_processed) #Get output of each values, only used in tree models
-
- st.subheader("SHAP Values")
- #Plot for each of the different class labels.
-
- shap.initjs()
- fig_shap, ax_shap = plt.subplots(1, figsize = (10,10))
- shap.summary_plot(shap_values, features = input_processed, feature_names = model_columns, plot_type = "bar")#plot for multi class labels
- st.pyplot(fig_shap) #Show the figure
- except Exception as e:
- st.write(f"Can show shap values on tree based model: {e}") #Show error
- # Additional Feedback (Example for Classification)
- if hasattr(st.session_state.model.steps[-1][1], 'predict_proba'): #If the end variable has predict_proba and is therefore a predictor
- probabilities = st.session_state.model.predict_proba(input_processed)[0]
- st.write("Predicted Probabilities:")
- st.write(probabilities) #write here
- except Exception as e:
- st.error(f"An error occurred during prediction: {e}")
-
- #Add batch prediction section in prediction tab
- st.subheader("Batch Predictions")
- batch_file = st.file_uploader("Upload CSV for Batch Predictions", type=["csv"])
- if batch_file is not None:
- try:
- batch_df = pd.read_csv(batch_file)
- #Verify data types and if it matches the ones used during the columns
- for col in model_columns:
- if pd.api.types.is_numeric_dtype(df[col]):
- try:
- batch_df[col] = pd.to_numeric(batch_df[col], errors='raise')
- except ValueError:
- st.error(f"Column '{col}' must be numeric.")
- st.stop()
- else:
- #ensure columns are type string if that isnt the case
- batch_df[col] = batch_df[col].astype(str)
+ st.warning("No numerical columns found for correlation heatmap.")
- if not set(model_columns).issubset(set(batch_df.columns)): #Fixed comparison
- st.error("The batch dataframe that contains different columns than the currently used training dataframe. Please upload the correct dataframe.")
- st.stop()
-
- # Preprocess the batch data
- batch_processed = st.session_state.preprocessor.transform(batch_df[model_columns])
- # Make predictions
- batch_predictions = st.session_state.model.predict(batch_processed)
- batch_df['Prediction'] = batch_predictions
+ elif plot_type == "Box Plot":
+ fig = px.box(
+ df, x=x_col, y=y_col,
+ color=color_col
+ )
- #Add probability output if that function is available.
- if hasattr(st.session_state.model.steps[-1][1], 'predict_proba'):
- batch_probabilities = st.session_state.model.predict_proba(batch_processed)
- for i in range(batch_probabilities.shape[1]): #Loop through and give each probability
- batch_df[f'Probability_Class_{i}'] = batch_probabilities[:, i]
+ elif plot_type == "Violin Plot":
+ fig = px.violin(
+ df, x=x_col, y=y_col,
+ box=True, points="all",
+ color=color_col
+ )
+ elif plot_type == "Time Series":
+ df = df.sort_values(by=x_col)
+ fig = px.line(
+ df, x=x_col, y=y_col,
+ color=color_col
+ )
+ elif plot_type == "Scatter Matrix":
+ fig = px.scatter_matrix(
+ df, dimensions=[x_col, y_col],
+ color=color_col
+ )
- st.dataframe(batch_df)
+ if fig:
+ st.plotly_chart(fig, use_container_width=True)
+ else:
+ st.info(f"No figure was generated for '{plot_type}'. This plot type may not be implemented yet.")
+ except Exception as e:
+ st.error(f"An error occurred while generating the plot: {e}")
+
+ # Statistical Analysis Section
+ with st.expander("๐ Statistical Analysis", expanded=True):
+ analysis_type = st.selectbox("Select Analysis Type", [
+ "Descriptive Statistics",
+ "Correlation Analysis",
+ "Hypothesis Testing",
+ "Distribution Fitting"
+ ])
- # Download predictions
- csv = batch_df.to_csv(index=False)
- b64 = base64.b64encode(csv.encode()).decode() # some strings
- href = f'Download Predictions CSV'
- st.markdown(href, unsafe_allow_html=True)
+ if analysis_type == "Descriptive Statistics":
+ st.write(df.describe(include='all'))
- except Exception as e:
- st.error(f"Error processing batch file: {e}")
+ elif analysis_type == "Correlation Analysis":
+ numeric_cols = df.select_dtypes(include=np.number).columns
+ if len(numeric_cols) >= 2:
+ corr_method = st.selectbox("Correlation Method", [
+ "Pearson", "Kendall", "Spearman"
+ ])
+ corr_matrix = df[numeric_cols].corr(method=corr_method.lower())
+ st.write(corr_matrix)
+ fig_corr = px.imshow(corr_matrix, text_auto=True, color_continuous_scale=color_palette)
+ st.plotly_chart(fig_corr, use_container_width=True)
+ else:
+ st.warning("Need at least 2 numeric columns for correlation analysis")
+ elif analysis_type == "Hypothesis Testing":
+ test_type = st.selectbox("Select Test Type", [
+ "T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U"
+ ])
+ if test_type == "T-test":
+ col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
+ col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
+ if st.button("Run T-test"):
+ from scipy import stats  # imported locally; the module-level scipy.stats import was removed
+ groups = df.groupby(col2)[col1].apply(list)
+ if len(groups) == 2:
+ t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
+ st.write(f"T-statistic: {t_stat:.4f}")
+ st.write(f"P-value: {p_value:.4f}")
+ if p_value < 0.05:
+ st.write("Reject the null hypothesis.")
+ else:
+ st.write("Fail to reject the null hypothesis.")
+ else:
+ st.write("Select a categorical column with exactly two categories.")
+
+ elif analysis_type == "Distribution Fitting":
+ numeric_col = st.selectbox("Select Numeric Column", df.select_dtypes(include=np.number).columns)
+ dist_types = ["Normal", "Log-Normal", "Exponential", "Gamma"]
+ selected_dist = st.selectbox("Select Distribution Type", dist_types)
+ if st.button("Fit Distribution"):
+ from scipy.stats import norm, lognorm, expon, gamma
+ dist_functions = {
+ "Normal": norm,
+ "Log-Normal": lognorm,
+ "Exponential": expon,
+ "Gamma": gamma
+ }
+ params = dist_functions[selected_dist].fit(df[numeric_col].dropna())
+ st.write(f"Fitted Parameters: {params}")
+
+ # Data Profiling Section
+ with st.expander("๐ Generate Full Data Profile", expanded=False):
+ if st.button("๐ Generate Comprehensive Report"):
+ with st.spinner("Generating report..."):
+ pr = ProfileReport(df, explorative=True)
+ st_profile_report(pr)
+
+# Insights Section
+elif app_mode == "Insights":
+ st.title("๐ Model Insights & Explainability")
+ st.markdown("""
+ **Understand and Interpret Your Model** with advanced explainability tools and visualizations.
+ Gain deeper insights into model behavior and predictions.
+ """)
+
+ if 'model' not in st.session_state or st.session_state.model is None:
+ st.warning("Please train a model in the Model Training section first.")
+ st.stop()
+
+ model = st.session_state.model.steps[-1][1] # Get the trained model
+ preprocessor = st.session_state.model.steps[0][1] # Get the preprocessor
+
+ # Model Summary
+ st.subheader("๐ Model Summary")
+ st.write(f"**Model Type:** {type(model).__name__}")
+ st.write(f"**Problem Type:** {'Regression' if hasattr(model, 'predict') else 'Classification'}")
+ st.write(f"**Training Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+ # Feature Importance
+ st.subheader("๐ Feature Importance")
+ if hasattr(model, 'feature_importances_'):
+ importances = model.feature_importances_
+ feature_names = preprocessor.get_feature_names_out()
+ importance_df = pd.DataFrame({
+ 'Feature': feature_names,
+ 'Importance': importances
+ }).sort_values('Importance', ascending=False)
+
+ import seaborn as sns  # imported locally; the module-level seaborn import was removed
+ fig, ax = plt.subplots()
+ sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), ax=ax)
+ ax.set_title('Top 10 Feature Importances')
+ st.pyplot(fig)
else:
- st.write("Please train a model first in the 'Model Training' section.")
+ st.info("Feature importance not available for this model type.")
+
+ # SHAP Values
+ st.subheader("๐ SHAP Values")
+ if st.checkbox("Calculate SHAP Values (Warning: May be slow for large datasets)"):
+ try:
+ import shap
+ explainer = shap.TreeExplainer(model)
+ shap_values = explainer.shap_values(st.session_state.X_test_selected)
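+ # For multiclass classifiers, shap_values is a list with one array per class; summary_plot handles lists,
+ # but the force plot below assumes a single array (regression or binary output)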
+
+ # Summary Plot
+ st.write("### Summary Plot")
+ fig, ax = plt.subplots()
+ shap.summary_plot(shap_values, st.session_state.X_test_selected, feature_names=preprocessor.get_feature_names_out())
+ st.pyplot(fig)
+
+ # Force Plot for Individual Predictions
+ st.write("### Individual Prediction Explanation")
+ sample_idx = st.slider("Select Sample Index", 0, len(st.session_state.X_test_selected)-1, 0)
+ fig, ax = plt.subplots()
+ shap.force_plot(explainer.expected_value, shap_values[sample_idx], st.session_state.X_test_selected[sample_idx],
+ feature_names=preprocessor.get_feature_names_out(), matplotlib=True, show=False)
+ st.pyplot(fig)
+ except Exception as e:
+ st.error(f"SHAP calculation failed: {e}")
+
+ # Partial Dependence Plots
+ st.subheader("๐ Partial Dependence Plots")
+ if hasattr(model, 'predict'):
+ feature_to_plot = st.selectbox("Select Feature for PDP", preprocessor.get_feature_names_out())
+ if st.button("Generate PDP"):
+ from sklearn.inspection import PartialDependenceDisplay
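+ # Note: if the preprocessor produced a sparse matrix, from_estimator may need a dense array (e.g. X_test_selected.toarray())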
+ fig, ax = plt.subplots()
+ PartialDependenceDisplay.from_estimator(
+ model, st.session_state.X_test_selected,
+ features=[feature_to_plot],
+ feature_names=preprocessor.get_feature_names_out(),
+ ax=ax
+ )
+ st.pyplot(fig)
+
+ # Model Performance Over Time
+ st.subheader("โณ Model Performance Over Time")
+ if st.checkbox("Track Performance Over Time"):
+ performance_history = {
+ 'timestamp': [],
+ 'metric': [],
+ 'value': []
+ }
+
+ y_pred = model.predict(st.session_state.X_test_selected)
+ if hasattr(model, 'predict_proba'):
+ metric_name = 'Accuracy'
+ metric_value = accuracy_score(st.session_state.y_test, y_pred)
+ else:
+ metric_name = 'MSE'
+ metric_value = mean_squared_error(st.session_state.y_test, y_pred)
+ performance_history['timestamp'].append(datetime.now())
+ performance_history['metric'].append(metric_name)
+ performance_history['value'].append(metric_value)
+
+ performance_df = pd.DataFrame(performance_history)
+ st.line_chart(performance_df.set_index('timestamp')['value'])
+
+ # Model Debugging
+ st.subheader("๐ Model Debugging")
+ if st.checkbox("Enable Debug Mode"):
+ st.write("### Model Parameters")
+ st.json(model.get_params())
+
+ st.write("### Training Data Summary")
+ st.write(f"Number of Samples: {st.session_state.X_train_selected.shape[0]}")
+ st.write(f"Number of Features: {st.session_state.X_train_selected.shape[1]}")
+
+ # Export Insights
+ st.subheader("๐พ Export Insights")
+ if st.button("Export Insights as PDF"):
+ try:
+ from fpdf import FPDF
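+ # Requires the fpdf (or fpdf2) package to be installed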
+ pdf = FPDF()
+ pdf.add_page()
+ pdf.set_font("Arial", size=12)
+ pdf.cell(200, 10, txt="Model Insights Report", ln=True, align='C')
+ pdf.cell(200, 10, txt=f"Model Type: {type(model).__name__}", ln=True)
+ pdf.cell(200, 10, txt=f"Problem Type: {'Regression' if hasattr(model, 'predict') else 'Classification'}", ln=True)
+ pdf.output("model_insights.pdf")
+ st.success("Insights exported successfully!")
+ except Exception as e:
+ st.error(f"Export failed: {e}")
+# Predictions Section
+elif app_mode == "Predictions":
+ st.title("๐ฎ Prediction Studio")
+ st.markdown("""
+ **Make Predictions** with your trained model and explore prediction explanations.
+ Generate batch predictions and export results.
+ """)
+
+ if 'model' not in st.session_state or st.session_state.model is None:
+ st.warning("Please train a model in the Model Training section first.")
+ st.stop()
+
+ model = st.session_state.model.steps[-1][1] # Get the trained model
+ preprocessor = st.session_state.model.steps[0][1] # Get the preprocessor
+
+ # Single Prediction
+ st.subheader("๐ฏ Single Prediction")
+ input_data = {}
+ raw_features = preprocessor.feature_names_in_  # raw columns the ColumnTransformer was fitted on
+ feature_names = preprocessor.get_feature_names_out()  # transformed names, used for the SHAP plot below
+ for feature in raw_features:
+ if feature in st.session_state.cleaned_data.columns:
+ if pd.api.types.is_numeric_dtype(st.session_state.cleaned_data[feature]):
+ input_data[feature] = st.number_input(f"Enter {feature}", value=st.session_state.cleaned_data[feature].mean())
+ else:
+ input_data[feature] = st.selectbox(f"Select {feature}", st.session_state.cleaned_data[feature].unique())
-elif app_mode == "Visualization Lab":
- st.title("๐ฌ Advanced Data Visualization and Clustering Lab")
+ if st.button("Make Prediction"):
+ try:
+ input_df = pd.DataFrame([input_data])
+ input_processed = preprocessor.transform(input_df)
+ prediction = model.predict(input_processed)[0]
+
+ st.write(f"**Prediction:** {prediction}")
+
+ if hasattr(model, 'predict_proba'):
+ probabilities = model.predict_proba(input_processed)[0]
+ st.write("**Prediction Probabilities:**")
+ st.bar_chart(probabilities)
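+ # To label the bars with class names, the fitted classifier's classes_ attribute can be used,
+ # e.g. st.bar_chart(pd.Series(probabilities, index=model.classes_))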
- # Initialize session state for cleaned data
- if 'cleaned_data' not in st.session_state:
- st.session_state.cleaned_data = None
+ # SHAP Explanation
+ if st.checkbox("Show SHAP Explanation"):
+ try:
+ import shap
+ explainer = shap.TreeExplainer(model)
+ shap_values = explainer.shap_values(input_processed)
+
+ st.write("### SHAP Values")
+ fig, ax = plt.subplots()
+ shap.force_plot(explainer.expected_value, shap_values, input_processed,
+ feature_names=feature_names, matplotlib=True, show=False)
+ st.pyplot(fig)
+ except Exception as e:
+ st.error(f"SHAP calculation failed: {e}")
- # Sample data upload (replace with your data loading logic)
- uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
- if uploaded_file is not None:
- try:
- df = pd.read_csv(uploaded_file)
- st.session_state.cleaned_data = df
- st.success("Data loaded successfully!")
except Exception as e:
- st.error(f"Error loading data: {e}")
+ st.error(f"Prediction failed: {e}")
- if st.session_state.cleaned_data is not None:
- df = st.session_state.cleaned_data.copy()
-
- # Visualization Type Selection
- visualization_type = st.selectbox("Select Visualization Type", [
- "Pair Plot", "Parallel Coordinates Plot", "Andrews Curves", "Pie Chart",
- "Area Chart", "Density Contour", "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
- ])
+ # Batch Predictions
+ st.subheader("๐ Batch Predictions")
+ batch_file = st.file_uploader("Upload CSV for Batch Predictions", type=["csv"])
+ if batch_file is not None:
+ try:
+ batch_df = pd.read_csv(batch_file)
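+ # The uploaded CSV must contain the raw feature columns the preprocessor was fitted on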
+ batch_processed = preprocessor.transform(batch_df)
+ batch_predictions = model.predict(batch_processed)
+ batch_df['Prediction'] = batch_predictions
- if visualization_type == "Pair Plot":
- st.subheader("Pair Plot")
- cols_for_pairplot = st.multiselect("Select Columns for Pair Plot", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
- if cols_for_pairplot:
- fig = px.scatter_matrix(df, dimensions=cols_for_pairplot)
- st.plotly_chart(fig, use_container_width=True)
-
- elif visualization_type == "Parallel Coordinates Plot":
- st.subheader("Parallel Coordinates Plot")
- cols_for_parallel = st.multiselect("Select Columns for Parallel Coordinates", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
- if cols_for_parallel:
- fig = px.parallel_coordinates(df[cols_for_parallel], color=df[cols_for_parallel[0]] if cols_for_parallel else None)
- st.plotly_chart(fig, use_container_width=True)
-
- elif visualization_type == "Andrews Curves":
- st.subheader("Andrews Curves")
- cols_for_andrews = st.multiselect("Select Columns for Andrews Curves", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
- if cols_for_andrews:
- fig = px.andrews_curves(df[cols_for_andrews + [df.columns[0]]], class_column=df.columns[0])
- st.plotly_chart(fig, use_container_width=True)
-
- elif visualization_type == "Pie Chart":
- st.subheader("Pie Chart")
- col_for_pie = st.selectbox("Select Column for Pie Chart", df.columns)
- fig = px.pie(df, names=col_for_pie)
- st.plotly_chart(fig, use_container_width=True)
+ if hasattr(model, 'predict_proba'):
+ probabilities = model.predict_proba(batch_processed)
+ for i in range(probabilities.shape[1]):
+ batch_df[f'Probability_Class_{i}'] = probabilities[:, i]
- elif visualization_type == "Area Chart":
- st.subheader("Area Chart")
- cols_for_area = st.multiselect("Select Columns for Area Chart", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
- if cols_for_area:
- fig = px.area(df[cols_for_area])
- st.plotly_chart(fig, use_container_width=True)
-
- elif visualization_type == "Density Contour":
- st.subheader("Density Contour")
- x_col = st.selectbox("Select X Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
- y_col = st.selectbox("Select Y Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
- fig = px.density_contour(df, x=x_col, y=y_col)
- st.plotly_chart(fig, use_container_width=True)
+ st.write("### Predictions Preview")
+ st.dataframe(batch_df.head())
- elif visualization_type == "Sunburst Chart":
- st.subheader("Sunburst Chart")
- path_cols = st.multiselect("Select Path Columns for Sunburst Chart", df.columns)
- if path_cols:
- fig = px.sunburst(df, path=path_cols)
- st.plotly_chart(fig, use_container_width=True)
-
- elif visualization_type == "Funnel Chart":
- st.subheader("Funnel Chart")
- x_col = st.selectbox("Select X Column for Funnel Chart (Values)", df.select_dtypes(include=np.number).columns.tolist())
- y_col = st.selectbox("Select Y Column for Funnel Chart (Categories)", df.columns)
- fig = px.funnel(df, x=x_col, y=y_col)
- st.plotly_chart(fig, use_container_width=True)
+ # Download Predictions
+ csv = batch_df.to_csv(index=False)
+ b64 = base64.b64encode(csv.encode()).decode()
+ href = f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download Predictions CSV</a>'
+ st.markdown(href, unsafe_allow_html=True)
- elif visualization_type == "Clustering Analysis":
- st.subheader("Clustering Analysis")
- numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
+ except Exception as e:
+ st.error(f"Batch prediction failed: {e}")
- if not numerical_cols:
- st.warning("No numerical columns found for clustering.")
+ # Prediction Analysis
+ st.subheader("๐ Prediction Analysis")
+ if st.checkbox("Analyze Predictions"):
+ try:
+ y_pred = model.predict(st.session_state.X_test_selected)
+ y_test = st.session_state.y_test
+
+ if not hasattr(model, 'predict_proba'):
+ fig, ax = plt.subplots()
+ ax.scatter(y_test, y_pred)
+ ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
+ ax.set_xlabel('Actual')
+ ax.set_ylabel('Predicted')
+ ax.set_title('Actual vs Predicted')
+ st.pyplot(fig)
else:
- cluster_cols = st.multiselect("Select Columns for Clustering", numerical_cols, default=numerical_cols[:2] if len(numerical_cols) >= 2 else numerical_cols)
-
- if cluster_cols:
- try:
- scaler = StandardScaler()
- scaled_data = scaler.fit_transform(df[cluster_cols])
- n_clusters = st.slider("Number of Clusters", 2, 10, 3, help="Number of clusters to form.")
- kmeans = KMeans(n_clusters=n_clusters, random_state=42)
- clusters = kmeans.fit_predict(scaled_data)
- df['Cluster'] = clusters
-
- if len(cluster_cols) == 2:
- fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
- st.plotly_chart(fig, use_container_width=True)
- elif len(cluster_cols) == 3:
- fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
- st.plotly_chart(fig, use_container_width=True)
- else:
- st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
- st.success("Clustering applied successfully!")
-
- #Add clustering performance in clustering analysis
- if len(cluster_cols) >= 2: # Evaluate Silhouette Score
- try:
- silhouette_avg = silhouette_score(scaled_data, clusters)
- st.write(f"Silhouette Score: {silhouette_avg:.4f}")
- except:
- st.write("Could not compute silhouette score")
-
- #Add dimensionality reduction option and 2d/3d plots
-
- dimension_reduction = st.selectbox("Dimensionality Reduction", ["None", "PCA"])
- if dimension_reduction == "PCA":
- n_components = st.slider("Number of Components", 2, min(3, len(cluster_cols)), 2)
- pca = PCA(n_components=n_components)
- principal_components = pca.fit_transform(scaled_data)
- pca_df = pd.DataFrame(data=principal_components, columns=[f'PC{i + 1}' for i in range(n_components)])
- pca_df['Cluster'] = clusters # Add Cluster
-
- if len(cluster_cols) >= 2: #plotting section
- fig = None #Initialize fig
- if dimension_reduction == "None":
- if len(cluster_cols) == 2:
- fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
- st.plotly_chart(fig, use_container_width=True)
- elif len(cluster_cols) == 3:
- fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
- st.plotly_chart(fig, use_container_width=True)
- else:
- st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
-
- elif dimension_reduction == "PCA":
- if n_components == 2:
- fig = px.scatter(pca_df, x='PC1', y='PC2', color='Cluster', title="K-Means Clustering (PCA - 2D)")
- st.plotly_chart(fig, use_container_width=True)
- elif n_components == 3:
- fig = px.scatter_3d(pca_df, x='PC1', y='PC2', z='PC3', color='Cluster', title="K-Means Clustering (PCA - 3D)")
- st.plotly_chart(fig, use_container_width=True)
-
- else:
- st.write("PCA visualization is only supported for 2 or 3 components.")
-
- except Exception as e:
- st.error(f"An error occurred during clustering: {e}")
+ from sklearn.metrics import confusion_matrix  # local import; not in the top-level metrics import
+ import seaborn as sns  # imported locally; the module-level seaborn import was removed
+ conf_matrix = confusion_matrix(y_test, y_pred)
+ fig, ax = plt.subplots()
+ sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
+ ax.set_xlabel('Predicted Labels')
+ ax.set_ylabel('True Labels')
+ ax.set_title('Confusion Matrix')
+ st.pyplot(fig)
+ except Exception as e:
+ st.error(f"Prediction analysis failed: {e}")
+ # Prediction Export
+ st.subheader("๐พ Export Predictions")
+ if st.button("Export Predictions as PDF"):
+ try:
+ from fpdf import FPDF
+ pdf = FPDF()
+ pdf.add_page()
+ pdf.set_font("Arial", size=12)
+ pdf.cell(200, 10, txt="Predictions Report", ln=True, align='C')
+ pdf.cell(200, 10, txt=f"Model Type: {type(model).__name__}", ln=True)
+ pdf.cell(200, 10, txt=f"Problem Type: {'Regression' if hasattr(model, 'predict') else 'Classification'}", ln=True)
+ pdf.output("predictions_report.pdf")
+ st.success("Predictions exported successfully!")
+ except Exception as e:
+ st.error(f"Export failed: {e}")
+
+# Neural Network Studio Section
elif app_mode == "Neural Network Studio":
st.title("๐ง Neural Network Studio")
+ st.markdown("""
+ **Build and Train Neural Networks** with advanced configurations and visualizations.
+ Explore deep learning models with ease.
+ """)
+
+ if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
+ st.warning("Please clean your data in the Smart Cleaning section first.")
+ st.stop()
+
+ df = st.session_state.cleaned_data.copy()
+
+ # Target Variable Selection
+ st.subheader("๐ฏ Target Variable")
+ target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
+
+ # Problem Type Selection
+ st.subheader("๐ Problem Type")
+ problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of machine learning problem.")
+
+ # Feature Selection
+ st.subheader("๐ง Feature Selection")
+ use_all_features = st.checkbox("Use All Features", value=True, help="Select to use all features for training. Deselect to manually choose features.")
+ if use_all_features:
+ feature_columns = df.drop(columns=[target_column]).columns.tolist()
+ else:
+ feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose the features you want to use for prediction.")
+
+ # Neural Network Configuration
+ st.subheader("โ๏ธ Neural Network Configuration")
+ with st.expander("Configure Neural Network", expanded=True):
+ hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2)
+ neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50)
+ activation = st.selectbox("Activation Function", ["relu", "tanh", "sigmoid"])
+ learning_rate = st.slider("Learning Rate", 0.001, 0.1, 0.01)
+ epochs = st.slider("Epochs", 10, 200, 50)
+ batch_size = st.slider("Batch Size", 16, 128, 32)
+
+ # Train-Test Split
+ st.subheader("โ๏ธ Train-Test Split")
+ test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")
+
+ # Model Training
+ if st.button("๐ Train Neural Network"):
+ with st.spinner("Training neural network..."):
+ try:
+ X = df[feature_columns]
+ y = df[target_column]
- if st.session_state.cleaned_data is not None:
- df = st.session_state.cleaned_data.copy()
+ # Train-Test Split
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
- # Target Variable Selection
- target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column you want to predict.")
+ # Preprocessing Pipeline
+ numeric_features = X.select_dtypes(include=np.number).columns
+ categorical_features = X.select_dtypes(exclude=np.number).columns
- # Problem Type Selection
- problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of machine learning problem.")
+ numeric_transformer = Pipeline(steps=[
+ ('imputer', SimpleImputer(strategy='median')),
+ ('scaler', StandardScaler())
+ ])
- # Feature Selection (optional)
- use_all_features = st.checkbox("Use All Features", value=True, help="Select to use all features for training. Deselect to manually choose features.")
- if use_all_features:
- feature_columns = df.drop(columns=[target_column]).columns.tolist()
- else:
- feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose the features you want to use for prediction.")
-
- # Model Selection
- model_type = st.selectbox("Select Neural Network Model", [
- "Simple Neural Network", "Convolutional Neural Network (CNN)", "Recurrent Neural Network (RNN)"
- ], help="Choose the neural network model to use.")
-
- # Hyperparameter Tuning
- with st.expander("Hyperparameter Tuning", expanded=False):
- if model_type == "Simple Neural Network":
- hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2, help="Number of hidden layers in the network.")
- neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50, help="Number of neurons in each hidden layer.")
- epochs = st.slider("Epochs", 10, 200, 50, help="Number of epochs for training.")
- batch_size = st.slider("Batch Size", 16, 128, 32, help="Batch size for training.")
- elif model_type == "Convolutional Neural Network (CNN)":
- epochs_cnn = st.slider("Epochs", 10, 200, 50, help="Number of epochs for CNN training.")
- batch_size_cnn = st.slider("Batch Size", 16, 128, 32, help="Batch size for CNN training.")
- elif model_type == "Recurrent Neural Network (RNN)":
- epochs_rnn = st.slider("Epochs", 10, 200, 50, help="Number of epochs for RNN training.")
- batch_size_rnn = st.slider("Batch Size", 16, 128, 32, help="Batch size for RNN training.")
- sequence_length = st.slider("Sequence Length (for RNN)", 10, 100, 30, help="Length of the input sequences for RNN.")
- # Train-Test Split
- test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the data to use for testing.")
-
- # Model Training Button
- if st.button("Train Neural Network Model"):
- with st.spinner("Training neural network model..."):
- try:
- # Split data
- X = df[feature_columns]
- y = df[target_column]
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
-
- # Preprocessing
- numeric_transformer = Pipeline(steps=[
- ('imputer', SimpleImputer(strategy='median')),
- ('scaler', StandardScaler())
- ])
- categorical_transformer = Pipeline(steps=[
- ('imputer', SimpleImputer(strategy='most_frequent')),
- ('onehot', OneHotEncoder(handle_unknown='ignore'))
+ categorical_transformer = Pipeline(steps=[
+ ('imputer', SimpleImputer(strategy='most_frequent')),
+ ('onehot', OneHotEncoder(handle_unknown='ignore'))
+ ])
+
+ preprocessor = ColumnTransformer(
+ transformers=[
+ ('num', numeric_transformer, numeric_features),
+ ('cat', categorical_transformer, categorical_features)
])
- numeric_features = X_train.select_dtypes(include=np.number).columns
- categorical_features = X_train.select_dtypes(include='object').columns
-
- preprocessor = ColumnTransformer(
- transformers=[
- ('num', numeric_transformer, numeric_features),
- ('cat', categorical_transformer, categorical_features)
- ])
-
- X_train_processed = preprocessor.fit_transform(X_train)
- X_test_processed = preprocessor.transform(X_test)
-
- # Neural Network Model Selection and Training
- tf.random.set_seed(42) # for reproducibility
-
- # Callbacks (Early Stopping)
- early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
-
- if model_type == "Simple Neural Network":
- model = keras.Sequential()
- model.add(layers.Input(shape=(X_train_processed.shape[1],)))
- for _ in range(hidden_layers):
- model.add(layers.Dense(neurons_per_layer, activation=activation)) # Use the selected activation
- model.add(
- layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
- activation='linear' if problem_type == "Regression" else 'softmax'))
-
- optimizer = keras.optimizers.Adam(learning_rate=learning_rate) # Use the learning rate
-
- model.compile(optimizer=optimizer,
- loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
- metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
-
- history = model.fit(X_train_processed, y_train, epochs=epochs, batch_size=batch_size,
- validation_split=0.2, verbose=0,
- callbacks=[early_stopping]) # Added early stopping
-
- y_pred = model.predict(X_test_processed)
- if problem_type == "Classification":
- y_pred = np.argmax(y_pred, axis=1)
-
- elif model_type == "Convolutional Neural Network (CNN)":
- X_train_cnn = np.expand_dims(X_train_processed, axis=2)
- X_test_cnn = np.expand_dims(X_test_processed, axis=2)
-
- model = keras.Sequential()
- model.add(layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu',
- input_shape=(X_train_cnn.shape[1], 1)))
- model.add(layers.MaxPooling1D(pool_size=pooling_size))
- model.add(layers.Flatten())
- model.add(layers.Dense(50, activation='relu'))
- model.add(
- layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
- activation='linear' if problem_type == "Regression" else 'softmax'))
-
- optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
- model.compile(optimizer=optimizer,
- loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
- metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
-
- history = model.fit(X_train_cnn, y_train, epochs=epochs_cnn, batch_size=batch_size_cnn,
- validation_split=0.2, verbose=0,
- callbacks=[early_stopping])
-
- y_pred = model.predict(X_test_cnn)
- if problem_type == "Classification":
- y_pred = np.argmax(y_pred, axis=1)
-
- elif model_type == "Recurrent Neural Network (RNN)":
- try:
- X_train_rnn = np.reshape(X_train_processed, (
- X_train_processed.shape[0], sequence_length,
- X_train_processed.shape[1] // sequence_length))
- X_test_rnn = np.reshape(X_test_processed, (
- X_test_processed.shape[0], sequence_length, X_test_processed.shape[1] // sequence_length))
-
- model = keras.Sequential()
- model.add(layers.SimpleRNN(units, activation='relu', # Use the selected units
- dropout=dropout_rate,
- input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])))
- model.add(
- layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
- activation='linear' if problem_type == "Regression" else 'softmax'))
-
- optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
- model.compile(optimizer=optimizer,
- loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
- metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
-
- history = model.fit(X_train_rnn, y_train, epochs=epochs_rnn, batch_size=batch_size_rnn,
- validation_split=0.2, verbose=0,
- callbacks=[early_stopping])
-
- y_pred = model.predict(X_test_rnn)
- if problem_type == "Classification":
- y_pred = np.argmax(y_pred, axis=1)
- except Exception as e:
- st.error(f"Error during RNN training: {e}")
- st.stop() # Stop execution if RNN fails
-
- # Evaluation
- if problem_type == "Regression":
- mse = mean_squared_error(y_test, y_pred)
- rmse = np.sqrt(mse)
- mae = mean_absolute_error(y_test, y_pred)
- r2 = r2_score(y_test, y_pred)
- st.write(f"Mean Squared Error: {mse:.4f}")
- st.write(f"Root Mean Squared Error: {rmse:.4f}")
- st.write(f"Mean Absolute Error: {mae:.4f}")
- st.write(f"R-squared: {r2:.4f}")
- else:
- accuracy = accuracy_score(y_test, y_pred)
- precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
- recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
- f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
- st.write(f"Accuracy: {accuracy:.4f}")
- st.write(f"Precision: {precision:.4f}")
- st.write(f"Recall: {recall:.4f}")
- st.write(f"F1 Score: {f1:.4f}")
- st.write("Classification Report:")
- st.text(classification_report(y_test, y_pred))
-
- # Visualization
- st.subheader("Training History")
- fig, ax = plt.subplots() # Use matplotlib directly
-
- ax.plot(history.history['loss'], label='loss')
- ax.plot(history.history['val_loss'], label='val_loss')
- ax.set_xlabel('Epoch')
- ax.set_ylabel('Loss')
- ax.legend()
- st.pyplot(fig) # Display with st.pyplot
-
- st.success("Model trained successfully!")
+ X_train_processed = preprocessor.fit_transform(X_train)
+ X_test_processed = preprocessor.transform(X_test)
+
+ # Neural Network Model
+ model = keras.Sequential()
+ model.add(layers.Input(shape=(X_train_processed.shape[1],)))
+ for _ in range(hidden_layers):
+ model.add(layers.Dense(neurons_per_layer, activation=activation))
+ model.add(layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
+ activation='linear' if problem_type == "Regression" else 'softmax'))
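+ # sparse_categorical_crossentropy (used below for classification) expects integer-encoded class labels;
+ # string targets may need LabelEncoder first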
+
+ # Compile the model
+ optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
+ model.compile(optimizer=optimizer,
+ loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
+ metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
+
+ # Train the model
+ history = model.fit(X_train_processed, y_train, epochs=epochs, batch_size=batch_size,
+ validation_split=0.2, verbose=0)
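+ # Note: the earlier implementation passed an EarlyStopping callback here; without one, training always runs for the full epoch count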
+
+ # Store model and preprocessor
+ st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
+ st.session_state.preprocessor = preprocessor
+
+ # Store the test data
+ st.session_state.X_train_selected = X_train_processed
+ st.session_state.X_test_selected = X_test_processed
+ st.session_state.y_train = y_train
+ st.session_state.y_test = y_test
+
+ # Model Evaluation
+ y_pred = model.predict(X_test_processed)
+ if problem_type == "Classification":
+ y_pred = np.argmax(y_pred, axis=1)  # Keras returns class probabilities; convert to class labels as the previous implementation did
+ if problem_type == "Regression":
+ mse = mean_squared_error(y_test, y_pred)
+ rmse = np.sqrt(mse)
+ mae = mean_absolute_error(y_test, y_pred)
+ r2 = r2_score(y_test, y_pred)
+ st.write(f"Mean Squared Error: {mse:.4f}")
+ st.write(f"Root Mean Squared Error: {rmse:.4f}")
+ st.write(f"Mean Absolute Error: {mae:.4f}")
+ st.write(f"R-squared: {r2:.4f}")
+ else:
+ from sklearn.metrics import classification_report  # local import; not in the top-level metrics import
+ accuracy = accuracy_score(y_test, y_pred)
+ precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
+ recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
+ f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
+ st.write(f"Accuracy: {accuracy:.4f}")
+ st.write(f"Precision: {precision:.4f}")
+ st.write(f"Recall: {recall:.4f}")
+ st.write(f"F1 Score: {f1:.4f}")
+ st.write("Classification Report:")
+ st.text(classification_report(y_test, y_pred))
+
+ # Visualization
+ st.subheader("๐ Training History")
+ fig, ax = plt.subplots()
+ ax.plot(history.history['loss'], label='loss')
+ ax.plot(history.history['val_loss'], label='val_loss')
+ ax.set_xlabel('Epoch')
+ ax.set_ylabel('Loss')
+ ax.legend()
+ st.pyplot(fig)
+
+ st.success("Neural network trained successfully!")
+ except Exception as e:
+ st.error(f"An error occurred during training: {e}")
- except Exception as e:
- st.error(f"An error occurred during training: {e}")
\ No newline at end of file
+ # Model Saving
+ if st.session_state.model is not None:
+ st.subheader("๐พ Save Model")
+ model_filename = st.text_input("Enter Model Filename (without extension)", "neural_network")
+ if st.button("Save Model"):
+ try:
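+ # Note: joblib may not reliably serialize a Pipeline that wraps a Keras model; model.save() is an alternative for the network itself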
+ joblib.dump(st.session_state.model, f"{model_filename}.joblib")
+ st.success(f"Model saved as {model_filename}.joblib")
+ except Exception as e:
+ st.error(f"Error saving model: {e}")
\ No newline at end of file