diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -7,12 +7,9 @@
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
 from sklearn.svm import SVR, SVC
-from sklearn.decomposition import PCA #Import at top
-from sklearn.metrics import silhouette_score #Import at top
-from sklearn.cluster import DBSCAN #Import at top
-from sklearn.feature_selection import SelectKBest #Import at top
-import joblib #Import at top
-from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
+from sklearn.feature_selection import SelectKBest
+import joblib
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
+# Imports required by the new Model Training section below
+from sklearn.neural_network import MLPRegressor, MLPClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.naive_bayes import GaussianNB
 from sklearn.impute import KNNImputer, SimpleImputer
 from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
 from sklearn.compose import ColumnTransformer
@@ -20,847 +17,846 @@
 from sklearn.pipeline import Pipeline
 from ydata_profiling import ProfileReport
 from streamlit_pandas_profiling import st_profile_report
 from io import StringIO
-import joblib
 import requests
 import asyncio
 from io import BytesIO
 import base64
 import seaborn as sns
-import time
-from sklearn.cluster import KMeans
 import scipy.stats as stats
 import mimetypes
 import matplotlib.pyplot as plt
 from sklearn.model_selection import learning_curve
-# Configurations
-st.set_page_config(page_title="Executive Insights Pro", layout="wide", page_icon="๐Ÿ“ˆ")
-
-# ----Load Image----
-@st.cache_data(ttl=3600)
-def load_image(image_url):
-    """Loads an image from a URL and returns bytes."""
-    try:
-        response = requests.get(image_url, stream=True)
-        response.raise_for_status()
-        return response.content
-    except requests.exceptions.RequestException as e:
-        st.error(f"Error loading image: {e}")
-        return None
-
-# ----Function to make and convert background to base 64 code-----
-def set_background():
-    """Sets the background image using base64 encoding."""
-    image_url = "https://www.nasa.gov/sites/default/files/thumbnails/image/web_first_images_release.png" # NASA Image
-    image_data = load_image(image_url)
-    if image_data:
-        # Convert bytes to base64
-        image_base64 = base64.b64encode(image_data).decode()
-        st.markdown(
-            f"""
-            """,
-            unsafe_allow_html=True,
-        )
-    return
-
-# Simplified CSS
-def apply_simplified_theme():
-    """Injects simplified CSS to enhance Streamlit's default style."""
-    st.markdown(
-        """
-        """,
-        unsafe_allow_html=True,
-    )
-    return
-
-# Apply background and simplified theme
-set_background()
-apply_simplified_theme()
-
-def show_loader(message="Loading..."):
-    """Displays an animated loader."""
-    st.markdown(
-        f"""
-
-
- {message} -
- """, - unsafe_allow_html=True - ) +# Enhanced configuration +st.set_page_config( + page_title="Executive Insights Pro", + layout="wide", + page_icon="๐Ÿ“ˆ", + initial_sidebar_state="expanded" +) -@st.cache_data(ttl=3600) #Added allow_output_mutation -def load_data(uploaded_file): - """Load and cache dataset, with file type validation.""" - if uploaded_file is not None: - file_extension = uploaded_file.name.split(".")[-1].lower() - mime_type = mimetypes.guess_type(uploaded_file.name)[0] - - max_file_size_mb = 50 # Set a maximum file size (adjust as needed) - file_size_mb = uploaded_file.size / (1024 * 1024) - if file_size_mb > max_file_size_mb: - st.error(f"File size exceeds the limit of {max_file_size_mb} MB.") - return None - - - try: # Wrap file reading in a try...except - if file_extension == "csv" or mime_type == 'text/csv': - df = pd.read_csv(uploaded_file) - return df - elif file_extension in ["xlsx", "xls"] or mime_type in ['application/vnd.ms-excel', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']: - df = pd.read_excel(uploaded_file) - return df - else: - st.error("Unsupported file type. Please upload a CSV or Excel file.") - return None - except FileNotFoundError: - st.error("File not found. Please check the file path.") - except pd.errors.ParserError: # Catch pandas-specific parsing errors - st.error("Error parsing the file. Make sure it's a valid CSV or Excel file.") - except Exception as e: - st.error(f"An unexpected error occurred: {type(e).__name__} - {str(e)}") - return None # Handle other potential exceptions +# Security: Set allowed file types +ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'parquet', 'feather'} +MAX_FILE_SIZE_MB = 250 # 250MB limit - else: - return None - -@st.cache_data(ttl=3600) -def generate_profile(df): - """Generate automated EDA report""" - return ProfileReport(df, minimal=True) - -# Session State Management -if 'raw_data' not in st.session_state: - st.session_state.raw_data = None -if 'cleaned_data' not in st.session_state: - st.session_state.cleaned_data = None -if 'train_test' not in st.session_state: - st.session_state.train_test = {} -if 'model' not in st.session_state: - st.session_state.model = None -if 'preprocessor' not in st.session_state: - st.session_state.preprocessor = None # to store the column transformer - -# Sidebar Navigation -st.sidebar.title("๐Ÿ”ฎ Data Wizard Pro") - -# Apply custom CSS to change text color in the sidebar -st.markdown( - """ - - """, - unsafe_allow_html=True, -) -# Replace the existing app_mode section with this: -app_mode = st.sidebar.radio("Navigate", [ - "Data Upload", - "Smart Cleaning", - "Advanced EDA", - "Model Training", - "Predictions", - "Visualization Lab", - "Neural Network Studio" # New option -]) - -# --- Main App Logic --- -if app_mode == "Data Upload": - st.title("๐Ÿ“ค Data Upload & Initial Analysis") - - # File Upload Section with improved styling - st.markdown( - """ - - """, - unsafe_allow_html=True, - ) + # Numeric specific checks + if pd.api.types.is_numeric_dtype(df[col]): + col_report.update({ + 'mean': df[col].mean(), + 'std': df[col].std(), + 'zeros': (df[col] == 0).sum(), + 'negatives': (df[col] < 0).sum() if df[col].dtype != 'uint' else 0, + 'outliers': detect_outliers(df[col]) + }) + report['data_health_score'] -= 2 # Deduct 2% per numeric column + + # Categorical specific checks + if pd.api.types.is_string_dtype(df[col]): + col_report.update({ + 'top_value': df[col].mode()[0] if not df[col].empty else None, + 'top_freq': df[col].value_counts().iloc[0]/len(df) if not 
df[col].empty else 0 + }) + report['data_health_score'] -= 1 # Deduct 1% per string column + + report['column_analysis'][col] = col_report + report['data_health_score'] = max(report['data_health_score'], 0) + + return report + +def detect_outliers(series): + """Detect outliers using IQR method""" + q1 = series.quantile(0.25) + q3 = series.quantile(0.75) + iqr = q3 - q1 + return ((series < (q1 - 1.5 * iqr)) | (series > (q3 + 1.5 * iqr))).sum() + +# --- Data Upload Page --- +if app_mode == "Data Upload": + st.title("๐Ÿ“ค Smart Data Hub") + st.markdown(""" + **Upload your dataset** (CSV, Excel, Parquet) for comprehensive analysis. + Get instant data health insights and quality assessment. + """) + + # File upload with enhanced UI uploaded_file = st.file_uploader( - "Choose a CSV or Excel file", type=["csv", "xlsx"], - help="Upload your dataset here. Supported formats: CSV, XLSX" + "Drag & drop or browse files", + type=list(ALLOWED_EXTENSIONS), + help=f"Max file size: {MAX_FILE_SIZE_MB}MB. Supported formats: {', '.join(ALLOWED_EXTENSIONS)}" ) - + if uploaded_file: - df = load_data(uploaded_file) - if df is not None: - # only proceed if load_data returned a valid dataframe - st.session_state.raw_data = df - st.session_state.cleaned_data = df.copy() - - st.subheader("Data Overview") - # Data Overview Cards with more context - col1, col2, col3 = st.columns(3) - with col1: - st.metric("Number of Rows", df.shape[0], help="Total number of entries in the dataset.") - with col2: - st.metric("Number of Columns", df.shape[1], help="Total number of features in the dataset.") - with col3: - num_missing = df.isna().sum().sum() - st.metric("Total Missing Values", num_missing, help="Total number of missing entries across the entire dataset.") - - # Display Data Types - st.write("Column Data Types:") - dtype_counts = df.dtypes.value_counts().to_dict() - for dtype, count in dtype_counts.items(): - st.write(f"- {dtype}: {count} column(s)") + # Validate file + is_valid, message = validate_file(uploaded_file) + if not is_valid: + st.error(f"Upload error: {message}") + st.stop() + + # Load data with progress + with st.spinner(f"Loading {uploaded_file.name}..."): + try: + if uploaded_file.name.endswith('.csv'): + df = pd.read_csv(uploaded_file, low_memory=False) + elif uploaded_file.name.endswith(('.xlsx', '.xls')): + df = pd.read_excel(uploaded_file) + elif uploaded_file.name.endswith('.parquet'): + df = pd.read_parquet(uploaded_file) + elif uploaded_file.name.endswith('.feather'): + df = pd.read_feather(uploaded_file) + + st.session_state.raw_data = df + st.success("Dataset loaded successfully!") + + except Exception as e: + st.error(f"Error loading file: {str(e)}") + st.stop() + + # Data Health Dashboard + st.subheader("๐Ÿ“Š Data Health Dashboard") + report = enhanced_quality_report(df) + + col1, col2, col3, col4 = st.columns(4) + col1.metric("Total Rows", report['basic_stats']['rows']) + col2.metric("Total Columns", report['basic_stats']['columns']) + col3.metric("Missing Values", report['basic_stats']['missing_values']) + col4.metric("Data Health Score", f"{report['data_health_score']}/100") + + # Column Explorer + with st.expander("๐Ÿ” Deep Column Analysis", expanded=True): + selected_col = st.selectbox("Select column to inspect", df.columns) + col_info = report['column_analysis'][selected_col] - # Sample Data Table with improved display - st.subheader("Sample Data") - num_rows_preview = st.slider("Number of Rows to Preview", 5, 20, 10, help="Adjust the number of rows displayed in the sample data.") - 
st.dataframe(df.head(num_rows_preview), use_container_width=True)
+            st.write(f"**Type:** {col_info['type']}")
+            st.write(f"**Unique Values:** {col_info['unique']}")
+            st.write(f"**Missing Values:** {col_info['missing']} ({col_info['missing']/len(df):.1%})")

-            # Column Statistics
-            with st.expander("๐Ÿ“Š Column Statistics"):
-                for col in df.columns:
-                    st.subheader(f"Column: {col}")
-                    st.write(f"Data type: {df[col].dtype}")
-                    if pd.api.types.is_numeric_dtype(df[col]):
-                        st.write("Summary Statistics:")
-                        st.write(df[col].describe())
-                    else:
-                        st.write("Value Counts:")
-                        st.write(df[col].value_counts())

+            if pd.api.types.is_numeric_dtype(df[selected_col]):
+                st.write("**Distribution:**")
+                st.line_chart(df[selected_col])
+                st.write(f"**Outliers Detected:** {col_info['outliers']}")
+            else:
+                st.write("**Most Common Values:**")
+                top_values = df[selected_col].value_counts().head(5)
+                st.bar_chart(top_values)
+
+        # Smart Recommendations
+        with st.expander("๐Ÿ’ก Cleaning Recommendations"):
+            recommendations = []
+            if report['basic_stats']['duplicates'] > 0:
+                recommendations.append(f"๐Ÿšจ Remove {report['basic_stats']['duplicates']} duplicate rows")
+            if report['basic_stats']['missing_values'] > 0:
+                recommendations.append("๐Ÿ”ง Apply advanced imputation strategies")
+            for col, data in report['column_analysis'].items():
+                if data['missing'] > 0.5 * len(df):
+                    recommendations.append(f"โš ๏ธ Consider dropping {col} (>50% missing)")
+                if data['unique'] == len(df):
+                    recommendations.append(f"๐Ÿ” Investigate {col} - potential unique identifier")

-            # Automated EDA Report
-            with st.expander("๐Ÿš€ Automated Data Report"):
-                if st.button("Generate Smart Report"):
-                    show_loader("Generating EDA Report")
-                    pr = generate_profile(df)
-                    st_profile_report(pr)

+            if recommendations:
+                st.write("### Recommended Actions")
+                for rec in recommendations[:5]:  # Show top 5
+                    st.write(f"- {rec}")
+            else:
+                st.success("No critical issues detected - your data looks healthy!")
+
+        # Data Preview
+        with st.expander("๐Ÿ”Ž Data Preview", expanded=True):
+            preview_size = st.slider("Preview rows", 5, 100, 15)
+            st.dataframe(df.head(preview_size).style.highlight_null(color='#FF6666'))
+
+        # Advanced Profiling
+        if st.button("๐Ÿš€ Generate Full Data Profile"):
+            with st.spinner("Generating comprehensive report..."):
+                pr = ProfileReport(df, explorative=True)
+                st_profile_report(pr)
+
+# Smart Cleaning Section
 elif app_mode == "Smart Cleaning":
     st.title("๐Ÿงผ Intelligent Data Cleaning")
-    if st.session_state.raw_data is not None:
-        df = st.session_state.cleaned_data
-
-        # Cleaning Toolkit
-        col1, col2 = st.columns([1, 3])
-        with col1:
-            st.subheader("Cleaning Actions")
-
-            clean_action = st.selectbox("Choose Operation", [
-                "Handle Missing Values",
-                "Clean Text",
-                "Remove Columns", # New option
-                # ... other cleaning operations ...
+    st.markdown("""
+    **Automated Data Cleaning** with smart suggestions and advanced transformations.
+    Clean your data with confidence using AI-powered recommendations.
+ """) + + if 'raw_data' not in st.session_state or st.session_state.raw_data is None: + st.warning("Please upload your data in the Data Upload section first.") + st.stop() + + df = st.session_state.raw_data.copy() + cleaning_actions = [] + + # Data Health Summary + st.subheader("๐Ÿ“Š Data Health Summary") + col1, col2, col3 = st.columns(3) + with col1: + missing_pct = df.isna().mean().mean() + st.metric("Missing Values", f"{missing_pct:.1%}") + with col2: + duplicates = df.duplicated().sum() + st.metric("Duplicates", duplicates) + with col3: + data_types = df.dtypes.value_counts().to_dict() + st.metric("Data Types", str(data_types)) + + # Cleaning Operations + st.subheader("๐Ÿ”ง Cleaning Operations") + + # 1. Missing Value Handling + with st.expander("๐Ÿ•ณ๏ธ Handle Missing Values", expanded=True): + missing_cols = df.columns[df.isna().any()].tolist() + if missing_cols: + st.write("Columns with missing values:") + cols = st.multiselect("Select columns to clean", missing_cols, default=missing_cols) + + method = st.radio("Imputation Method", [ + "Drop Missing", + "Mean/Median/Mode", + "KNN Imputation", + "Advanced Imputation" + ], horizontal=True) + + if method == "Mean/Median/Mode": + strategy = st.selectbox("Strategy", ["mean", "median", "most_frequent"]) + if st.button("Apply Imputation"): + df[cols] = df[cols].fillna(df[cols].agg(strategy)) + cleaning_actions.append(f"Filled missing values in {cols} using {strategy}") + + elif method == "KNN Imputation": + n_neighbors = st.slider("Number of neighbors", 2, 15, 5) + if st.button("Apply KNN Imputation"): + from sklearn.impute import KNNImputer + imputer = KNNImputer(n_neighbors=n_neighbors) + df[cols] = imputer.fit_transform(df[cols]) + cleaning_actions.append(f"Applied KNN imputation (k={n_neighbors}) on {cols}") + + elif method == "Advanced Imputation": + st.write("Coming soon: MICE, Deep Learning imputation") + else: + st.success("No missing values found!") + + # 2. Duplicate Handling + with st.expander("๐Ÿ”„ Handle Duplicates", expanded=True): + if duplicates > 0: + st.write(f"Found {duplicates} duplicate rows") + dup_strategy = st.radio("Duplicate Strategy", [ + "Remove All Duplicates", + "Keep First Occurrence", + "Keep Last Occurrence" ]) + + if st.button("Handle Duplicates"): + df = df.drop_duplicates(keep={ + "Remove All Duplicates": False, + "Keep First Occurrence": 'first', + "Keep Last Occurrence": 'last' + }[dup_strategy]) + cleaning_actions.append(f"Removed duplicates using strategy: {dup_strategy}") + else: + st.success("No duplicates found!") + + # 3. 
Data Type Conversion + with st.expander("๐Ÿ”„ Convert Data Types", expanded=True): + st.write("Current Data Types:") + st.dataframe(df.dtypes.reset_index().rename(columns={ + 0: 'Type', + 'index': 'Column' + })) + + col_to_convert = st.selectbox("Select column to convert", df.columns) + new_type = st.selectbox("New Data Type", [ + "String", "Integer", "Float", + "Boolean", "Datetime", "Category" + ]) + + if st.button("Convert Data Type"): + try: + if new_type == "String": + df[col_to_convert] = df[col_to_convert].astype(str) + elif new_type == "Integer": + df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce').astype('Int64') + elif new_type == "Float": + df[col_to_convert] = pd.to_numeric(df[col_to_convert], errors='coerce') + elif new_type == "Boolean": + df[col_to_convert] = df[col_to_convert].astype(bool) + elif new_type == "Datetime": + df[col_to_convert] = pd.to_datetime(df[col_to_convert], errors='coerce') + elif new_type == "Category": + df[col_to_convert] = df[col_to_convert].astype('category') + + cleaning_actions.append(f"Converted {col_to_convert} to {new_type}") + st.success("Data type converted successfully!") + except Exception as e: + st.error(f"Conversion failed: {str(e)}") + + # 4. Outlier Detection & Handling + with st.expander("๐Ÿ“ˆ Handle Outliers", expanded=True): + numeric_cols = df.select_dtypes(include=np.number).columns.tolist() + if numeric_cols: + outlier_col = st.selectbox("Select numeric column", numeric_cols) + threshold = st.slider("Outlier Threshold (Z-Score)", 1.0, 5.0, 3.0) + + z_scores = (df[outlier_col] - df[outlier_col].mean()) / df[outlier_col].std() + outliers = df[abs(z_scores) > threshold] + + st.write(f"Detected {len(outliers)} outliers") + st.dataframe(outliers) + + if st.button("Handle Outliers"): + df = df[abs(z_scores) <= threshold] + cleaning_actions.append(f"Removed {len(outliers)} outliers from {outlier_col}") + else: + st.info("No numeric columns found for outlier detection") + + # 5. 
Text Cleaning + with st.expander("๐Ÿ“ Clean Text Data", expanded=True): + text_cols = df.select_dtypes(include='object').columns.tolist() + if text_cols: + text_col = st.selectbox("Select text column", text_cols) + options = st.multiselect("Text Cleaning Options", [ + "Lowercase", + "Remove Punctuation", + "Remove Extra Spaces", + "Remove Stopwords", + "Stemming" + ]) + + if st.button("Clean Text"): + if "Lowercase" in options: + df[text_col] = df[text_col].str.lower() + if "Remove Punctuation" in options: + df[text_col] = df[text_col].str.replace(r'[^\w\s]', '', regex=True) + if "Remove Extra Spaces" in options: + df[text_col] = df[text_col].str.strip().str.replace(r'\s+', ' ', regex=True) + if "Remove Stopwords" in options: + from nltk.corpus import stopwords + stop_words = set(stopwords.words('english')) + df[text_col] = df[text_col].apply( + lambda x: ' '.join([word for word in x.split() if word not in stop_words]) + ) + if "Stemming" in options: + from nltk.stem import PorterStemmer + stemmer = PorterStemmer() + df[text_col] = df[text_col].apply( + lambda x: ' '.join([stemmer.stem(word) for word in x.split()]) + ) + + cleaning_actions.append(f"Cleaned text in {text_col}") + st.success("Text cleaned successfully!") + else: + st.info("No text columns found for cleaning") - if clean_action == "Handle Missing Values": - columns_with_missing = df.columns[df.isnull().any()].tolist() - column_to_impute = st.selectbox("Column to Impute", ["All Columns"] + columns_with_missing) - - method = st.selectbox("Imputation Method", [ - "KNN Imputation", - "Median Fill", - "Mean Fill", - "Drop Missing", - "Constant Value Fill" - ]) - if method == "KNN Imputation": - knn_neighbors = st.slider("KNN Neighbors", 2, 10, 5) - elif method == "Constant Value Fill": - constant_value = st.text_input("Constant Value") - - elif clean_action == "Clean Text": - text_column = st.selectbox("Text Column", df.select_dtypes(include='object').columns) - cleaning_operation = st.selectbox("Cleaning Operation", ["Remove Special Characters", "Lowercase", "Uppercase", "Remove Extra Spaces"]) - if cleaning_operation == "Remove Special Characters": - chars_to_remove = st.text_input("Characters to Remove", r'[^a-zA-Z0-9\s]') - - elif clean_action == "Remove Columns": - remove_cols = st.multiselect("Columns to Remove", df.columns) # Multiselect for column removal - - with col2: - st.subheader("Data Preview") # Added Data Preview Section - st.dataframe(df.head(10), use_container_width=True) # Display sample data - - if st.button("Apply Transformation"): - with st.spinner("Applying changes..."): - current_df = df.copy() - # ... (your data history logic) ... 
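+    # Hedged sketch (not in the original changeset): this rewrite drops the old
+    # "Remove Columns" tool visible in the removed code above; a minimal replacement,
+    # reusing the section-level `df` and `cleaning_actions` defined earlier, could be:
+    with st.expander("๐Ÿ—‘๏ธ Remove Columns", expanded=False):
+        remove_cols = st.multiselect("Columns to remove", df.columns.tolist())
+        if remove_cols and st.button("Remove Selected Columns"):
+            df = df.drop(columns=remove_cols)
+            cleaning_actions.append(f"Removed columns: {', '.join(remove_cols)}")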
- - if clean_action == "Handle Missing Values": - if method == "KNN Imputation": - imputer = KNNImputer(n_neighbors=knn_neighbors) - if column_to_impute == "All Columns": - current_df = pd.DataFrame(imputer.fit_transform(current_df), columns=current_df.columns) - else: - current_df[[column_to_impute]] = pd.DataFrame(imputer.fit_transform(current_df[[column_to_impute]]), columns=[column_to_impute]) - elif method == "Median Fill": - if column_to_impute == "All Columns": - current_df = current_df.fillna(current_df.median()) - else: - current_df[column_to_impute] = current_df[column_to_impute].fillna(current_df[column_to_impute].median()) - elif method == "Mean Fill": - if column_to_impute == "All Columns": - current_df = current_df.fillna(current_df.mean()) - else: - current_df[column_to_impute] = current_df[column_to_impute].fillna(current_df[column_to_impute].mean()) - elif method == "Constant Value Fill": - if column_to_impute == "All Columns": - current_df = current_df.fillna(constant_value) - else: - current_df[column_to_impute] = current_df[column_to_impute].fillna(constant_value) - else: - current_df = current_df.dropna() + # Save Cleaned Data + if st.button("๐Ÿ’พ Save Cleaned Data"): + st.session_state.cleaned_data = df + st.success("Cleaned data saved successfully!") + + # Show Cleaning Log + st.subheader("๐Ÿ“ Cleaning Log") + if cleaning_actions: + st.write("### Applied Transformations") + for action in cleaning_actions: + st.write(f"- {action}") + else: + st.info("No transformations applied yet") + +# Advanced EDA Section +elif app_mode == "Advanced EDA": + st.title("๐Ÿ” Advanced Exploratory Data Analysis") + st.markdown(""" + **Interactive Data Exploration** with advanced statistical tools and visualizations. + Uncover hidden patterns and relationships in your data. + """) - elif clean_action == "Clean Text": - import re # moved here since its only used here to avoid library bloat + if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None: + st.warning("Please clean your data in the Smart Cleaning section first.") + st.stop() - def clean_text(text, operation, chars_to_remove=r'[^a-zA-Z0-9\s]'): - if operation == "Remove Special Characters": - text = re.sub(chars_to_remove, '', str(text)) - elif operation == "Lowercase": - text = str(text).lower() - elif operation == "Uppercase": - text = str(text).upper() - elif operation == "Remove Extra Spaces": - text = " ".join(str(text).split()) - return text + df = st.session_state.cleaned_data.copy() + + # Initialize session state for EDA configuration + if 'eda_config' not in st.session_state: + st.session_state.eda_config = { + 'plot_type': "Histogram", + 'x_col': df.columns[0] if len(df.columns) > 0 else None, + 'y_col': df.columns[1] if len(df.columns) > 1 else None, + 'z_col': df.columns[2] if len(df.columns) > 2 else None, + 'color_col': None, + 'size_col': None, + 'time_col': None, + 'value_col': None, + 'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5], + 'color_palette': "Viridis", + 'hover_data_cols': [], + 'filter_col': None, + 'filter_options': [] + } - current_df[text_column] = current_df[text_column].astype(str).apply(lambda x: clean_text(x, cleaning_operation, chars_to_remove)) + # Data Filtering Section + with st.expander("๐Ÿ”Ž Data Filtering", expanded=True): + st.session_state.eda_config['filter_col'] = st.selectbox( + "Filter Column", + [None] + list(df.columns), + help="Choose a column to filter the data." 
+ ) - elif clean_action == "Remove Columns": - if remove_cols: #Check that it is not empty - current_df = current_df.drop(columns=remove_cols) # Drop selected columns + if st.session_state.eda_config['filter_col']: + unique_values = df[st.session_state.eda_config['filter_col']].unique() + st.session_state.eda_config['filter_options'] = st.multiselect( + "Filter Values", + unique_values, + default=unique_values, + help=f"Select values from '{st.session_state.eda_config['filter_col']}'" + ) + df = df[df[st.session_state.eda_config['filter_col']].isin( + st.session_state.eda_config['filter_options'] + )] + + # Visualization Type Selection + st.sidebar.header("๐Ÿ“Š Visualization Configuration") + plot_types = [ + "Histogram", "Scatter Plot", "Box Plot", "Violin Plot", + "Correlation Heatmap", "Parallel Coordinates", "Andrews Curves", + "Pair Plot", "Density Contour", "3D Scatter", "Time Series", + "Sunburst Chart", "Funnel Chart", "Clustering Analysis" + ] + st.session_state.eda_config['plot_type'] = st.sidebar.selectbox( + "Choose Visualization", + plot_types, + index=0 + ) - st.session_state.cleaned_data = current_df - st.success("Transformation applied!") + # Dynamic Controls Based on Plot Type + if st.session_state.eda_config['plot_type'] != "Correlation Heatmap": + st.session_state.eda_config['x_col'] = st.sidebar.selectbox( + "X Axis", + df.columns, + index=df.columns.get_loc(st.session_state.eda_config['x_col']) + if st.session_state.eda_config['x_col'] in df.columns else 0 + ) - if st.button("Refresh Data Preview"): # Button to refresh data preview - st.rerun() - -elif app_mode == "Advanced EDA": - st.title("๐Ÿ” Advanced Exploratory Analysis") - - if st.session_state.cleaned_data is not None: - df = st.session_state.cleaned_data.copy() - - # Initialize session state for plot configuration - if 'plot_config' not in st.session_state: - st.session_state.plot_config = { - 'plot_type': "Histogram", - 'x_col': df.columns[0] if len(df.columns) > 0 else None, - 'y_col': df.columns[1] if len(df.columns) > 1 else None, - 'z_col': df.columns[2] if len(df.columns) > 2 else None, - 'color_col': None, - 'size_col': None, - 'time_col': None, - 'value_col': None, - 'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5], - 'color_palette': "#00f7ff", - 'color_continuous_scale': "Viridis", - 'hover_data_cols': [], - 'filter_col': None, - 'filter_options': [] - } + if st.session_state.eda_config['plot_type'] in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram"]: + st.session_state.eda_config['y_col'] = st.sidebar.selectbox( + "Y Axis", + df.columns, + index=df.columns.get_loc(st.session_state.eda_config['y_col']) + if st.session_state.eda_config['y_col'] in df.columns else 0 + ) - # Data Filtering Section - with st.expander("๐Ÿ”Ž Data Filtering", expanded=False): - # Use direct session state assignment for reactivity - st.session_state.plot_config['filter_col'] = st.selectbox( - "Filter Column", - [None] + list(df.columns), - help="Choose a column to filter the data." 
- ) + if st.session_state.eda_config['plot_type'] == "3D Scatter": + st.session_state.eda_config['z_col'] = st.sidebar.selectbox( + "Z Axis", + df.columns, + index=df.columns.get_loc(st.session_state.eda_config['z_col']) + if st.session_state.eda_config['z_col'] in df.columns else 0 + ) + st.session_state.eda_config['color_col'] = st.sidebar.selectbox( + "Color by", + [None] + list(df.columns) + ) - if st.session_state.plot_config['filter_col']: - unique_values = df[st.session_state.plot_config['filter_col']].unique() - st.session_state.plot_config['filter_options'] = st.multiselect( - "Filter Values", - unique_values, - default=unique_values, - help=f"Select values from '{st.session_state.plot_config['filter_col']}'" - ) - df = df[df[st.session_state.plot_config['filter_col']].isin( - st.session_state.plot_config['filter_options'] - )] - - # Visualization Configuration - st.sidebar.header("๐Ÿ“Š Plot Configuration") - - # Plot type selector - st.session_state.plot_config['plot_type'] = st.sidebar.selectbox( - "Choose Visualization", - [ - "Histogram", "Scatter Plot", "Box Plot", - "Correlation Heatmap", "3D Scatter", - "Violin Plot", "Time Series", "Scatter Matrix" - ], - index=0 # Reset to first option when plot type changes + # Advanced Plot Customization + with st.expander("๐ŸŽจ Advanced Customization", expanded=False): + st.session_state.eda_config['color_palette'] = st.selectbox( + "Color Palette", + ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"] + ) + st.session_state.eda_config['hover_data_cols'] = st.multiselect( + "Hover Data", + df.columns ) - # Dynamic controls based on plot type - if st.session_state.plot_config['plot_type'] != "Correlation Heatmap": - st.session_state.plot_config['x_col'] = st.sidebar.selectbox( - "X Axis", - df.columns, - index=df.columns.get_loc(st.session_state.plot_config['x_col']) - if st.session_state.plot_config['x_col'] in df.columns else 0 + # Plot Generation + try: + fig = None + config = st.session_state.eda_config + + if config['plot_type'] == "Histogram": + fig = px.histogram( + df, x=config['x_col'], y=config['y_col'], + nbins=30, template="plotly_dark", + color_discrete_sequence=[config['color_palette']] ) - if st.session_state.plot_config['plot_type'] in ["Scatter Plot", "Box Plot", - "Violin Plot", "Time Series", - "3D Scatter", "Histogram"]: - st.session_state.plot_config['y_col'] = st.sidebar.selectbox( - "Y Axis", - df.columns, - index=df.columns.get_loc(st.session_state.plot_config['y_col']) - if st.session_state.plot_config['y_col'] in df.columns else 0 + elif config['plot_type'] == "Scatter Plot": + fig = px.scatter( + df, x=config['x_col'], y=config['y_col'], + color=config['color_col'], + size=config['size_col'], + hover_data=config['hover_data_cols'] ) - if st.session_state.plot_config['plot_type'] == "3D Scatter": - st.session_state.plot_config['z_col'] = st.sidebar.selectbox( - "Z Axis", - df.columns, - index=df.columns.get_loc(st.session_state.plot_config['z_col']) - if st.session_state.plot_config['z_col'] in df.columns else 0 - ) - st.session_state.plot_config['color_col'] = st.sidebar.selectbox( - "Color by", - [None] + list(df.columns) + elif config['plot_type'] == "3D Scatter": + fig = px.scatter_3d( + df, x=config['x_col'], y=config['y_col'], z=config['z_col'], + color=config['color_col'], + color_discrete_sequence=[config['color_palette']] ) - # Color configuration - if st.session_state.plot_config['plot_type'] == "Correlation Heatmap": - st.session_state.plot_config['color_continuous_scale'] = 
st.sidebar.selectbox( - "Color Scale", - ['Viridis', 'Plasma', 'Magma', 'Cividis', 'RdBu'] - ) - else: - st.session_state.plot_config['color_palette'] = st.sidebar.selectbox( - "Color Palette", - ['#00f7ff', '#ff00ff', '#f70000', '#0000f7'] - ) + elif config['plot_type'] == "Correlation Heatmap": + numeric_df = df.select_dtypes(include=np.number) + if not numeric_df.empty: + corr = numeric_df.corr() + fig = px.imshow( + corr, text_auto=True, + color_continuous_scale=config['color_palette'] + ) + else: + st.warning("No numerical columns found for correlation heatmap.") - # Additional configurations - if st.session_state.plot_config['plot_type'] == "Scatter Plot": - st.session_state.plot_config['size_col'] = st.sidebar.selectbox( - "Size by", - [None] + list(df.columns) - ) - st.session_state.plot_config['hover_data_cols'] = st.sidebar.multiselect( - "Hover Data", - df.columns + elif config['plot_type'] == "Box Plot": + fig = px.box( + df, x=config['x_col'], y=config['y_col'], + color=config['color_col'] ) - if st.session_state.plot_config['plot_type'] == "Time Series": - st.session_state.plot_config['time_col'] = st.sidebar.selectbox( - "Time Column", - df.columns - ) - st.session_state.plot_config['value_col'] = st.sidebar.selectbox( - "Value Column", - df.columns + elif config['plot_type'] == "Violin Plot": + fig = px.violin( + df, x=config['x_col'], y=config['y_col'], + box=True, points="all", + color=config['color_col'] ) - if st.session_state.plot_config['plot_type'] == "Scatter Matrix": - st.session_state.plot_config['scatter_matrix_cols'] = st.multiselect( - "Columns for Scatter Matrix", - df.select_dtypes(include=np.number).columns, - default=st.session_state.plot_config['scatter_matrix_cols'] + elif config['plot_type'] == "Time Series": + df = df.sort_values(by=config['time_col']) + fig = px.line( + df, x=config['time_col'], y=config['value_col'], + color=config['color_col'] ) - # Plot generation - try: - fig = None - config = st.session_state.plot_config - - if config['plot_type'] == "Histogram": - fig = px.histogram( - df, x=config['x_col'], y=config['y_col'], - nbins=30, template="plotly_dark", - color_discrete_sequence=[config['color_palette']] - ) - - elif config['plot_type'] == "Scatter Plot": - fig = px.scatter( - df, x=config['x_col'], y=config['y_col'], - color_discrete_sequence=[config['color_palette']], - size=config['size_col'], - hover_data=config['hover_data_cols'] - ) - - elif config['plot_type'] == "3D Scatter": - fig = px.scatter_3d( - df, x=config['x_col'], y=config['y_col'], z=config['z_col'], - color=config['color_col'], - color_discrete_sequence=[config['color_palette']] - ) - - elif config['plot_type'] == "Correlation Heatmap": - numeric_df = df.select_dtypes(include=np.number) - if not numeric_df.empty: - corr = numeric_df.corr() - fig = px.imshow( - corr, text_auto=True, - color_continuous_scale=config['color_continuous_scale'] - ) - else: - st.warning("No numerical columns found for correlation heatmap.") - - elif config['plot_type'] == "Box Plot": - fig = px.box( - df, x=config['x_col'], y=config['y_col'], - color_discrete_sequence=[config['color_palette']] - ) - - elif config['plot_type'] == "Violin Plot": - fig = px.violin( - df, x=config['x_col'], y=config['y_col'], - box=True, points="all", - color_discrete_sequence=[config['color_palette']] - ) - - elif config['plot_type'] == "Time Series": - df = df.sort_values(by=config['time_col']) - fig = px.line( - df, x=config['time_col'], y=config['value_col'], - 
color_discrete_sequence=[config['color_palette']]
-                )
-
-            elif config['plot_type'] == "Scatter Matrix":
-                fig = px.scatter_matrix(
-                    df, dimensions=config['scatter_matrix_cols'],
-                    color_discrete_sequence=[config['color_palette']]
-                )
-
-            if fig:
-                st.plotly_chart(fig, use_container_width=True)
-        except Exception as e:
-            st.error(f"An error occurred while generating the plot: {e}")
-
-        with st.expander("๐Ÿงช Hypothesis Testing"):
-            test_type = st.selectbox("Select Test Type", ["T-test", "Chi-Squared Test"])
-            if test_type == "T-test":
-                col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
-                col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
-                if st.button("Run T-test"):
-                    # Example: Split data by category and perform t-test
-                    try:
-                        groups = df.groupby(col2)[col1].apply(list)
-                        if len(groups) == 2:
-                            t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
-                            st.write(f"T-statistic: {t_stat:.4f}")
-                            st.write(f"P-value: {p_value:.4f}")
-                            if p_value < 0.05:
-                                st.write("Reject the null hypothesis.")
-                            else:
-                                st.write("Fail to reject the null hypothesis.")
-                        else:
-                            st.write("Select a categorical column with exactly two categories.")
-                    except Exception as e:
-                        st.error(f"An error occurred during the T-test: {e}")
-
+        elif config['plot_type'] == "Scatter Matrix":
+            fig = px.scatter_matrix(
+                df, dimensions=config['scatter_matrix_cols'],
+                color=config['color_col']
+            )
+
+        if fig:
+            st.plotly_chart(fig, use_container_width=True)
+    except Exception as e:
+        st.error(f"An error occurred while generating the plot: {e}")
+
+    # Statistical Analysis Section
+    with st.expander("๐Ÿ“Š Statistical Analysis", expanded=True):
+        analysis_type = st.selectbox("Select Analysis Type", [
+            "Descriptive Statistics",
+            "Correlation Analysis",
+            "Hypothesis Testing",
+            "Distribution Fitting"
+        ])
+
+        if analysis_type == "Descriptive Statistics":
+            st.write(df.describe(include='all'))
+
+        elif analysis_type == "Correlation Analysis":
+            numeric_cols = df.select_dtypes(include=np.number).columns
+            if len(numeric_cols) >= 2:
+                corr_method = st.selectbox("Correlation Method", [
+                    "Pearson", "Kendall", "Spearman"
+                ])
+                corr_matrix = df[numeric_cols].corr(method=corr_method.lower())
+                st.write(corr_matrix)
+                fig_corr = px.imshow(corr_matrix, text_auto=True,
+                                     color_continuous_scale=config['color_palette'])
+                st.plotly_chart(fig_corr, use_container_width=True)
+            else:
+                st.warning("Need at least 2 numeric columns for correlation analysis")
+
+        elif analysis_type == "Hypothesis Testing":
+            test_type = st.selectbox("Select Test Type", [
+                "T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U"
+            ])
+            if test_type == "T-test":
+                col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
+                col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
+                if st.button("Run T-test"):
+                    groups = df.groupby(col2)[col1].apply(list)
+                    if len(groups) == 2:
+                        t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
+                        st.write(f"T-statistic: {t_stat:.4f}")
+                        st.write(f"P-value: {p_value:.4f}")
+                        if p_value < 0.05:
+                            st.write("Reject the null hypothesis.")
+                        else:
+                            st.write("Fail to reject the null hypothesis.")
+                    else:
+                        st.write("Select a categorical column with exactly two categories.")
+
+        elif analysis_type == "Distribution Fitting":
+            numeric_col = st.selectbox("Select Numeric Column", df.select_dtypes(include=np.number).columns)
+            dist_types = ["Normal", "Log-Normal", "Exponential", "Gamma"]
+            selected_dist = st.selectbox("Select Distribution Type", dist_types)
+            if st.button("Fit Distribution"):
+                from scipy.stats import norm, lognorm, expon, gamma
+                dist_functions = {
+                    "Normal": norm,
+                    "Log-Normal": lognorm,
+                    "Exponential": expon,
+                    "Gamma": gamma
+                }
+                params = dist_functions[selected_dist].fit(df[numeric_col].dropna())
+
st.write(f"Fitted Parameters: {params}") + + # Data Profiling Section + with st.expander("๐Ÿ“ Generate Full Data Profile", expanded=False): + if st.button("๐Ÿš€ Generate Comprehensive Report"): + with st.spinner("Generating report..."): + pr = ProfileReport(df, explorative=True) + st_profile_report(pr) + +# Model Training Section elif app_mode == "Model Training": - st.title("๐Ÿš‚ Model Training") - - if st.session_state.cleaned_data is not None: - df = st.session_state.cleaned_data.copy() - - # Initialize session state for train/test split - if 'X_train_selected' not in st.session_state: - st.session_state.X_train_selected = None - st.session_state.X_test_selected = None - st.session_state.y_train = None - st.session_state.y_test = None - st.session_state.model = None # Initialize model in session state - - # Target Variable Selection - target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.") - - # Problem Type Selection - problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of problem.") - - # Feature Selection - feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose features for training.") - - # Model Selection - Dynamic based on Problem Type - if problem_type == "Regression": - model_options = ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM"] - else: # Classification - model_options = ["Logistic Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Naive Bayes", "KNN"] - - model_name = st.selectbox("Select Model", model_options, help="Choose a model.") - - feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"]) - - # Hyperparameter Tuning - Dynamic based on Model Selection - param_grid = {} # Initialize to empty dictionary - - #Define different paramter values for the model so it works. This is not an optimized number - #The goal is to make sure that all visualizations and graphs work as is. + st.title("๐Ÿš‚ Model Training Studio") + st.markdown(""" + **Train and Evaluate Machine Learning Models** with advanced hyperparameter tuning and performance tracking. + Choose from a wide range of algorithms and configurations. + """) + + if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None: + st.warning("Please clean your data in the Smart Cleaning section first.") + st.stop() + + df = st.session_state.cleaned_data.copy() + + # Target Variable Selection + st.subheader("๐ŸŽฏ Target Variable") + target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.") + + # Problem Type Selection + st.subheader("๐Ÿ“ Problem Type") + problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of machine learning problem.") + + # Feature Selection + st.subheader("๐Ÿ”ง Feature Selection") + use_all_features = st.checkbox("Use All Features", value=True, help="Select to use all features for training. 
Deselect to manually choose features.") + if use_all_features: + feature_columns = df.drop(columns=[target_column]).columns.tolist() + else: + feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose the features you want to use for prediction.") + + # Model Selection + st.subheader("๐Ÿค– Model Selection") + if problem_type == "Regression": + model_options = ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network"] + else: # Classification + model_options = ["Logistic Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "SVM", "Neural Network", "KNN", "Naive Bayes"] + model_name = st.selectbox("Select Model", model_options, help="Choose a model.") + + # Hyperparameter Tuning + st.subheader("๐ŸŽ›๏ธ Hyperparameter Tuning") + with st.expander("Configure Hyperparameters", expanded=True): if model_name == "Random Forest": - st.subheader("Random Forest Hyperparameters") - param_grid = { - 'n_estimators': list(range(100, 101)), #Used 100 so model is trained and not empty and all visuals work - - 'max_depth': list(range(10,11)), #default value 10 so its in model - 'min_samples_split': list(range(2,3)), #New hyperparameter default 2 - 'min_samples_leaf': list(range(1,2)), #New hyperparameter default 1 + n_estimators = st.slider("Number of Estimators", 10, 200, 100) + max_depth = st.slider("Max Depth", 3, 20, 10) + min_samples_split = st.slider("Min Samples Split", 2, 10, 2) + min_samples_leaf = st.slider("Min Samples Leaf", 1, 10, 1) + hyperparams = { + 'n_estimators': n_estimators, + 'max_depth': max_depth, + 'min_samples_split': min_samples_split, + 'min_samples_leaf': min_samples_leaf } - elif model_name == "Gradient Boosting": - st.subheader("Gradient Boosting Hyperparameters") - param_grid = { - 'n_estimators': list(range(100, 101)), - 'learning_rate': [0.1], - 'max_depth': list(range(3,4)) - + learning_rate = st.slider("Learning Rate", 0.01, 1.0, 0.1) + n_estimators = st.slider("Number of Estimators", 10, 200, 100) + max_depth = st.slider("Max Depth", 3, 20, 10) + hyperparams = { + 'learning_rate': learning_rate, + 'n_estimators': n_estimators, + 'max_depth': max_depth } - - elif model_name == "Decision Tree": - st.subheader("Decision Tree Hyperparameters") - param_grid = { - 'criterion': ["gini"], - 'max_depth': list(range(3,4)), + elif model_name == "Neural Network": + hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2) + neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50) + epochs = st.slider("Epochs", 10, 200, 50) + batch_size = st.slider("Batch Size", 16, 128, 32) + hyperparams = { + 'hidden_layers': hidden_layers, + 'neurons_per_layer': neurons_per_layer, + 'epochs': epochs, + 'batch_size': batch_size } + else: + hyperparams = {} - # Train-Test Split - test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.") + # Train-Test Split + st.subheader("โœ‚๏ธ Train-Test Split") + test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.") - if st.button("Train Model"): - with st.spinner("Training model..."): - try: - X = df[feature_columns] - y = df[target_column] + # Model Training + if st.button("๐Ÿš€ Train Model"): + with st.spinner("Training model..."): + try: + X = df[feature_columns] + y = df[target_column] - # Check if X is empty - if X.empty: - st.error("No features were selected. 
Please select feature columns.") - st.stop() + # Check if X is empty + if X.empty: + st.error("No features were selected. Please select feature columns.") + st.stop() - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42) + # Train-Test Split + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42) - # Preprocessing Pipeline - numeric_features = X.select_dtypes(include=np.number).columns - categorical_features = X.select_dtypes(exclude=np.number).columns + # Preprocessing Pipeline + numeric_features = X.select_dtypes(include=np.number).columns + categorical_features = X.select_dtypes(exclude=np.number).columns - numeric_transformer = Pipeline(steps=[ - ('imputer', SimpleImputer(strategy='median')), - ('scaler', StandardScaler()) - ]) + numeric_transformer = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='median')), + ('scaler', StandardScaler()) + ]) - categorical_transformer = Pipeline(steps=[ - ('imputer', SimpleImputer(strategy='most_frequent')), - ('onehot', OneHotEncoder(handle_unknown='ignore')) + categorical_transformer = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='most_frequent')), + ('onehot', OneHotEncoder(handle_unknown='ignore')) + ]) + + preprocessor = ColumnTransformer( + transformers=[ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features) ]) - preprocessor = ColumnTransformer( - transformers=[ - ('num', numeric_transformer, numeric_features), - ('cat', categorical_transformer, categorical_features) - ]) - - X_train_processed = preprocessor.fit_transform(X_train) - X_test_processed = preprocessor.transform(X_test) - - #Feature Selection - if feature_selection_method == "SelectKBest": - k = st.slider("Number of Features to Select", 1, len(feature_columns), len(feature_columns), key = "featureselector") - selector = SelectKBest(k=k) - X_train_selected = selector.fit_transform(X_train_processed, y_train) - X_test_selected = selector.transform(X_test_processed) - else: - X_train_selected = X_train_processed - X_test_selected = X_test_processed - - # Model Training and Hyperparameter Tuning - if model_name == "Linear Regression": - model = LinearRegression() - model.fit(X_train_selected, y_train) - - elif model_name == "Logistic Regression": - model = LogisticRegression(max_iter=1000) - model.fit(X_train_selected, y_train) - elif model_name == "Decision Tree": - if problem_type == "Regression": - model = DecisionTreeRegressor() - model.fit(X_train_selected, y_train) - else: - model = DecisionTreeClassifier() - model.fit(X_train_selected, y_train) - elif model_name == "Random Forest": - if problem_type == "Regression": - model = RandomForestRegressor(random_state=42) - if 'param_grid' in locals() and param_grid: #added param_grid not empty condition - grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error') # Example scoring - grid_search.fit(X_train_selected, y_train) - model = grid_search.best_estimator_ - st.write("Best Parameters:", grid_search.best_params_) - else: - model = RandomForestRegressor(random_state=42) #define if no param_grid - model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined. 
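+                # Hedged addition (not in the original changeset): flag heavily imbalanced
+                # targets before fitting, since plain accuracy is misleading on skewed classes.
+                # Assumes `problem_type` and `y_train` as defined above.
+                if problem_type == "Classification":
+                    class_share = y_train.value_counts(normalize=True)
+                    if class_share.min() < 0.10:
+                        st.warning(f"Imbalanced target: smallest class is {class_share.min():.1%} of training rows; consider stratified splits or class weights.")
+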
+ X_train_processed = preprocessor.fit_transform(X_train) + X_test_processed = preprocessor.transform(X_test) - else: - model = RandomForestClassifier(random_state=42) - if 'param_grid' in locals()and param_grid: #added param_grid not empty condition - grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy') - grid_search.fit(X_train_selected, y_train) - model = grid_search.best_estimator_ - st.write("Best Parameters:", grid_search.best_params_) - else: - model = RandomForestClassifier(random_state=42) #define if no param_grid - model.fit(X_train_selected, y_train) # fit without gridsearch if param_grid is not defined - elif model_name == "Gradient Boosting": - from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier #moved import here to avoid bloat - model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier() - model.fit(X_train_selected, y_train) - elif model_name == "SVM": - model = SVR() if problem_type == "Regression" else SVC() - model.fit(X_train_selected, y_train) - elif model_name == "Naive Bayes": - from sklearn.naive_bayes import GaussianNB - model = GaussianNB() - model.fit(X_train_selected, y_train) - elif model_name == "KNN": - from sklearn.neighbors import KNeighborsClassifier - model = KNeighborsClassifier() - model.fit(X_train_selected, y_train) - - # Store model and preprocessor - st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)]) - st.session_state.preprocessor = preprocessor - - #Store the test data - st.session_state.X_train_selected = X_train_selected - st.session_state.X_test_selected = X_test_selected - st.session_state.y_train = y_train - st.session_state.y_test = y_test - - # Model Evaluation - y_pred = model.predict(X_test_selected) + # Model Training + if model_name == "Linear Regression": + model = LinearRegression() + elif model_name == "Logistic Regression": + model = LogisticRegression(max_iter=1000) + elif model_name == "Decision Tree": if problem_type == "Regression": - mse = mean_squared_error(y_test, y_pred) - r2 = r2_score(y_test, y_pred) - st.write(f"Mean Squared Error: {mse:.4f}") - st.write(f"R-squared: {r2:.4f}") + model = DecisionTreeRegressor() else: - from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat - import seaborn as sns - import matplotlib.pyplot as plt #Added import statement - import numpy as np - import pandas as pd - from sklearn.model_selection import learning_curve, validation_curve - - #Weighted averaging for metrics for multiclass - average_method = "weighted" #changed from None - - accuracy = accuracy_score(y_test, y_pred) - precision = precision_score(y_test, y_pred, average = average_method, zero_division = 0) - recall = recall_score(y_test, y_pred, average = average_method, zero_division = 0) - f1 = f1_score(y_test, y_pred, average = average_method, zero_division = 0) - st.write(f"Accuracy: {accuracy:.4f}") - st.write(f"Precision: {precision:.4f}") - st.write(f"Recall: {recall:.4f}") - st.write(f"F1 Score: {f1:.4f}") - st.write("Classification Report:") - st.text(classification_report(y_test, y_pred, zero_division = 0)) - - - #Confusion Matrix - - conf_matrix = confusion_matrix(y_test, y_pred) - - #Heatmap - fig_conf, ax_conf = plt.subplots() - sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_conf) - ax_conf.set_xlabel('Predicted Labels') - ax_conf.set_ylabel('True Labels') - 
ax_conf.set_title('Confusion Matrix') - st.pyplot(fig_conf) - - - #Added section for model visualization - st.subheader("Model Visualization") - #Use conditional to make sure that everything only executes when the data set is trained and not outside of it. - if st.session_state.model is not None: #Make sure that everything only executes when data set is trained and not outside of it. - try: #All the plotting code here. - if model_name in ["Random Forest", "Gradient Boosting"]:#Used list to define models. - #Make sure you use this inside of a conditional for classification, model, and tree based model. - - #Feature Importance (Tree-based Models) - - importances = model.feature_importances_ # Assumed tree-based model - feat_importances = pd.Series(importances, index=X_train.columns) - feat_importances = feat_importances.nlargest(20) - - fig_feat, ax_feat = plt.subplots() - feat_importances.plot(kind='barh', ax=ax_feat) - ax_feat.set_xlabel('Relative Importance') - ax_feat.set_ylabel('Features') - ax_feat.set_title('Feature Importances') - st.pyplot(fig_feat) - - #Create data that determines the learning and validation curve and what we have to add - train_sizes, train_scores, valid_scores = learning_curve(model, X_train_selected, y_train, cv=5, scoring='accuracy' if problem_type =="Classification" else 'neg_mean_squared_error', n_jobs=-1) #Define cross validation for run - - #Then add a plot for the learning curve and use st.pyplot - train_mean = np.mean(train_scores, axis=1) - train_std = np.std(train_scores, axis=1) - valid_mean = np.mean(valid_scores, axis=1) - valid_std = np.std(valid_scores, axis=1) - - #Plot each of the variables that has to be used. - - fig_lc, ax_lc = plt.subplots() - ax_lc.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training ' + ('Accuracy' if problem_type == "Classification" else "Neg MSE")) - ax_lc.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue') - ax_lc.plot(train_sizes, valid_mean, color='green', linestyle='--', marker='s', markersize=5, label='Validation ' + ('Accuracy' if problem_type == "Classification" else "Neg MSE")) - ax_lc.fill_between(train_sizes, valid_mean + valid_std, valid_mean - valid_std, alpha=0.15, color='green') - - ax_lc.set_title('Learning Curves') - ax_lc.set_xlabel('Training Set Size') - ax_lc.set_ylabel('Score') - ax_lc.legend(loc='best') - st.pyplot(fig_lc) - - except Exception as e: #Local error - st.write(f"Visuals are only available for tree based models or if models are selected prior: {e}") #Write only if error - - except Exception as e: - st.error(f"An error occurred: {e}") - - else: - st.write("Please upload and clean data first.") + model = DecisionTreeClassifier() + elif model_name == "Random Forest": + if problem_type == "Regression": + model = RandomForestRegressor(**hyperparams) + else: + model = RandomForestClassifier(**hyperparams) + elif model_name == "Gradient Boosting": + if problem_type == "Regression": + model = GradientBoostingRegressor(**hyperparams) + else: + model = GradientBoostingClassifier(**hyperparams) + elif model_name == "SVM": + if problem_type == "Regression": + model = SVR() + else: + model = SVC() + elif model_name == "Neural Network": + if problem_type == "Regression": + model = MLPRegressor( + hidden_layer_sizes=[hyperparams['neurons_per_layer']] * hyperparams['hidden_layers'], + max_iter=hyperparams['epochs'], + batch_size=hyperparams['batch_size'] + ) + else: + model = MLPClassifier( + 
hidden_layer_sizes=[hyperparams['neurons_per_layer']] * hyperparams['hidden_layers'], + max_iter=hyperparams['epochs'], + batch_size=hyperparams['batch_size'] + ) + elif model_name == "KNN": + model = KNeighborsClassifier() + elif model_name == "Naive Bayes": + model = GaussianNB() + + # Train the model + model.fit(X_train_processed, y_train) + + # Store model and preprocessor + st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)]) + st.session_state.preprocessor = preprocessor + + # Store the test data + st.session_state.X_train_selected = X_train_processed + st.session_state.X_test_selected = X_test_processed + st.session_state.y_train = y_train + st.session_state.y_test = y_test + + # Model Evaluation + y_pred = model.predict(X_test_processed) + if problem_type == "Regression": + mse = mean_squared_error(y_test, y_pred) + rmse = np.sqrt(mse) + mae = mean_absolute_error(y_test, y_pred) + r2 = r2_score(y_test, y_pred) + st.write(f"Mean Squared Error: {mse:.4f}") + st.write(f"Root Mean Squared Error: {rmse:.4f}") + st.write(f"Mean Absolute Error: {mae:.4f}") + st.write(f"R-squared: {r2:.4f}") + else: + accuracy = accuracy_score(y_test, y_pred) + precision = precision_score(y_test, y_pred, average='weighted', zero_division=0) + recall = recall_score(y_test, y_pred, average='weighted', zero_division=0) + f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0) + st.write(f"Accuracy: {accuracy:.4f}") + st.write(f"Precision: {precision:.4f}") + st.write(f"Recall: {recall:.4f}") + st.write(f"F1 Score: {f1:.4f}") + st.write("Classification Report:") + st.text(classification_report(y_test, y_pred)) + + # Visualization + st.subheader("๐Ÿ“Š Model Performance Visualization") + if problem_type == "Regression": + fig, ax = plt.subplots() + ax.scatter(y_test, y_pred) + ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2) + ax.set_xlabel('Actual') + ax.set_ylabel('Predicted') + ax.set_title('Actual vs Predicted') + st.pyplot(fig) + else: + conf_matrix = confusion_matrix(y_test, y_pred) + fig, ax = plt.subplots() + sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax) + ax.set_xlabel('Predicted Labels') + ax.set_ylabel('True Labels') + ax.set_title('Confusion Matrix') + st.pyplot(fig) + + st.success("Model trained successfully!") + except Exception as e: + st.error(f"An error occurred during training: {e}") - # Model Saving + # Model Saving + if st.session_state.model is not None: + st.subheader("๐Ÿ’พ Save Model") model_filename = st.text_input("Enter Model Filename (without extension)", "trained_model") if st.button("Save Model"): try: @@ -868,498 +864,579 @@ elif app_mode == "Model Training": st.success(f"Model saved as {model_filename}.joblib") except Exception as e: st.error(f"Error saving model: {e}") - # Model loading in a different section - model_file = st.file_uploader("Upload Trained Model", type=["joblib"]) - if model_file is not None: - try: - st.session_state.model = joblib.load(model_file) - st.success("Model loaded successfully!") - except Exception as e: - st.error(f"Error loading model: {e}") - #Model Evaluation Section - run on the saved model - if st.session_state.model is not None and st.session_state.X_test_selected is not None: # added check to make sure it is a loaded model - try: - y_pred = st.session_state.model.predict(st.session_state.X_test_selected) # load from stored +# Visualization Lab Section +elif app_mode == "Visualization Lab": + st.title("๐Ÿ”ฌ Visualization Lab") + st.markdown(""" 
+# Visualization Lab Section
+elif app_mode == "Visualization Lab":
+    st.title("🔬 Visualization Lab")
+    st.markdown("""
+    **Explore and Visualize Your Data** with advanced plotting tools and interactive visualizations.
+    Uncover hidden patterns and relationships in your data.
+    """)
+
+    if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
+        st.warning("Please clean your data in the Smart Cleaning section first.")
+        st.stop()
+
+    df = st.session_state.cleaned_data.copy()
+
+    # Visualization Type Selection
+    st.subheader("📊 Choose Visualization Type")
+    plot_types = [
+        "Histogram", "Scatter Plot", "Box Plot", "Violin Plot",
+        "Correlation Heatmap", "Parallel Coordinates", "Andrews Curves",
+        "Pair Plot", "Density Contour", "3D Scatter", "Time Series",
+        "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
+    ]
+    plot_type = st.selectbox("Select Visualization Type", plot_types)
+
+    # Dynamic Controls Based on Plot Type
+    if plot_type != "Correlation Heatmap":
+        x_col = st.selectbox("X Axis", df.columns)
+
+    if plot_type in ["Scatter Plot", "Box Plot", "Violin Plot", "Time Series", "3D Scatter", "Histogram"]:
+        y_col = st.selectbox("Y Axis", df.columns)
+
+    if plot_type == "3D Scatter":
+        z_col = st.selectbox("Z Axis", df.columns)
+    # Color applies to most plot types, so it is selected outside the 3D-only branch
+    color_col = st.selectbox("Color by", [None] + list(df.columns))
+
+    # Advanced Plot Customization
+    with st.expander("🎨 Advanced Customization", expanded=False):
+        color_palette = st.selectbox("Color Palette", ["Viridis", "Plasma", "Magma", "Cividis", "RdBu", "Rainbow"])
+        hover_data_cols = st.multiselect("Hover Data", df.columns)
+
+    # Plot Generation
+    try:
+        fig = None
-                if problem_type == "Regression":
-                    mse = mean_squared_error(st.session_state.y_test, y_pred)
-                    r2 = r2_score(st.session_state.y_test, y_pred)
-                    st.write(f"Mean Squared Error: {mse:.4f}")
-                    st.write(f"R-squared: {r2:.4f}")
-                else:
-                    from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report #Import here to avoid library bloat
-                    accuracy = accuracy_score(st.session_state.y_test, y_pred)
-                    st.write(f"Accuracy: {accuracy:.4f}")
-        except Exception as e: #local error
-            st.error(f"An error occurred during model evaluation: {e}")
-
-elif app_mode == "Predictions":
-    st.title("🔮 Make Predictions")
+        if plot_type == "Histogram":
+            # Named palettes such as "Viridis" are continuous scales, not single
+            # colors, so no color_discrete_sequence is passed here
+            fig = px.histogram(
+                df, x=x_col, y=y_col,
+                nbins=30, template="plotly_dark"
+            )
-    if st.session_state.model is not None and st.session_state.cleaned_data is not None:
-        df = st.session_state.cleaned_data.copy()
-        model = st.session_state.model.steps[-1][1] #Define model from the state
+        elif plot_type == "Scatter Plot":
+            # px.scatter's `size` expects a single numeric column, so it is not
+            # wired to the hover-data multiselect (see the sketch below)
+            fig = px.scatter(
+                df, x=x_col, y=y_col,
+                color=color_col,
+                hover_data=hover_data_cols
+            )
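+        # Hedged sketch: an optional point-size control could be added with a
+        # dedicated numeric-column selector, e.g.:
+        #   size_col = st.selectbox("Size by", [None] + list(df.select_dtypes(include=np.number).columns))
+        #   fig = px.scatter(df, x=x_col, y=y_col, color=color_col, size=size_col, hover_data=hover_data_cols)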
-        try:
-            numeric_transformer_columns = st.session_state.model.steps[0][1].transformers_[0][2] if hasattr(st.session_state.model.steps[0][1].transformers_[0][2], '__len__') else []
-            categorical_transformer_columns = st.session_state.model.steps[0][1].transformers_[1][2] if hasattr(st.session_state.model.steps[0][1].transformers_[1][2], '__len__') else []
-            model_columns = numeric_transformer_columns + categorical_transformer_columns
-        except AttributeError as e:
-            st.error(f"Error accessing model transformers: {e}. Please ensure a valid model is trained and loaded.")
-            st.stop()
+        elif plot_type == "3D Scatter":
+            # As with the histogram, a palette name is not a valid discrete color,
+            # so only the color column is passed
+            fig = px.scatter_3d(
+                df, x=x_col, y=y_col, z=z_col,
+                color=color_col
+            )
-        model_is_classification = hasattr(model, 'predict_proba') # Check for classification or other problem
-        if not set(model_columns).issubset(set(df.columns)): #Fixed comparison
-            st.error("The model was trained on a dataframe that contains different columns than the currently uploaded dataframe. Please upload the correct dataframe.")
-            st.stop()
-
-        input_data = {}
-        st.subheader("Enter Data for Prediction")
-        for col in model_columns:
-            if pd.api.types.is_numeric_dtype(df[col]):
-                input_data[col] = st.number_input(f"Enter {col}", value=df[col].mean())
+        elif plot_type == "Correlation Heatmap":
+            numeric_df = df.select_dtypes(include=np.number)
+            if not numeric_df.empty:
+                corr = numeric_df.corr()
+                fig = px.imshow(
+                    corr, text_auto=True,
+                    color_continuous_scale=color_palette
+                )
+            else:
-            else:
-                input_data[col] = st.selectbox(f"Select {col}", df[col].unique())
-
-        # Prediction Button
-        if st.button("Make Prediction"):
-            try:
-                input_df = pd.DataFrame([input_data])
-                #Preprocess for model
-                input_processed = st.session_state.preprocessor.transform(input_df)
-                prediction = st.session_state.model.predict(input_processed)[0]
-                st.subheader("Prediction Result")
-                st.write(f"The predicted value is: {prediction}")
-
-                # Show shap values chart
-                show_shap_values = st.checkbox("View SHAP Explanation") #select model to show shap values
-
-                if show_shap_values and model_is_classification and model_name not in ["Linear Regression","Logistic Regression","SVM","Naive Bayes", "KNN"]: #Show shap values if this can perform.
-                    try:
-                        import shap #Import lib
-                        explainer = shap.TreeExplainer(st.session_state.model.steps[-1][1]) #Used tree model because these are easily visualized
-
-                        shap_values = explainer.shap_values(input_processed) #Get output of each values, only used in tree models
-
-                        st.subheader("SHAP Values")
-                        #Plot for each of the different class labels.
-
-                        shap.initjs()
-                        fig_shap, ax_shap = plt.subplots(1, figsize = (10,10))
-                        shap.summary_plot(shap_values, features = input_processed, feature_names = model_columns, plot_type = "bar") #plot for multi class labels
-                        st.pyplot(fig_shap) #Show the figure
-                    except Exception as e:
-                        st.write(f"Can show shap values on tree based model: {e}") #Show error
-                # Additional Feedback (Example for Classification)
-                if hasattr(st.session_state.model.steps[-1][1], 'predict_proba'): #If the end variable has predict_proba and is therefore a predictor
-                    probabilities = st.session_state.model.predict_proba(input_processed)[0]
-                    st.write("Predicted Probabilities:")
-                    st.write(probabilities) #write here
-            except Exception as e:
-                st.error(f"An error occurred during prediction: {e}")
-
-        #Add batch prediction section in prediction tab
-        st.subheader("Batch Predictions")
-        batch_file = st.file_uploader("Upload CSV for Batch Predictions", type=["csv"])
-        if batch_file is not None:
-            try:
-                batch_df = pd.read_csv(batch_file)
-                #Verify data types and if it matches the ones used during the columns
-                for col in model_columns:
-                    if pd.api.types.is_numeric_dtype(df[col]):
-                        try:
-                            batch_df[col] = pd.to_numeric(batch_df[col], errors='raise')
-                        except ValueError:
-                            st.error(f"Column '{col}' must be numeric.")
-                            st.stop()
-                    else:
-                        #ensure columns are type string if that isnt the case
-                        batch_df[col] = batch_df[col].astype(str)
+                st.warning("No numerical columns found for correlation heatmap.")
-                if not set(model_columns).issubset(set(batch_df.columns)): #Fixed comparison
-                    st.error("The batch dataframe that contains different columns than the currently used training dataframe. Please upload the correct dataframe.")
-                    st.stop()
-
-                # Preprocess the batch data
-                batch_processed = st.session_state.preprocessor.transform(batch_df[model_columns])
-                # Make predictions
-                batch_predictions = st.session_state.model.predict(batch_processed)
-                batch_df['Prediction'] = batch_predictions
+        elif plot_type == "Box Plot":
+            fig = px.box(
+                df, x=x_col, y=y_col,
+                color=color_col
+            )
-                #Add probability output if that function is available.
-                if hasattr(st.session_state.model.steps[-1][1], 'predict_proba'):
-                    batch_probabilities = st.session_state.model.predict_proba(batch_processed)
-                    for i in range(batch_probabilities.shape[1]): #Loop through and give each probability
-                        batch_df[f'Probability_Class_{i}'] = batch_probabilities[:, i]
+        elif plot_type == "Violin Plot":
+            fig = px.violin(
+                df, x=x_col, y=y_col,
+                box=True, points="all",
+                color=color_col
+            )
+        elif plot_type == "Time Series":
+            df = df.sort_values(by=x_col)
+            fig = px.line(
+                df, x=x_col, y=y_col,
+                color=color_col
+            )
+        elif plot_type == "Pair Plot":
+            # The selectbox offers "Pair Plot", so this branch must match that label
+            fig = px.scatter_matrix(
+                df, dimensions=[x_col, y_col],
+                color=color_col
+            )
-                st.dataframe(batch_df)
+        if fig:
+            st.plotly_chart(fig, use_container_width=True)
+    except Exception as e:
+        st.error(f"An error occurred while generating the plot: {e}")
+
+    # Statistical Analysis Section
+    with st.expander("📊 Statistical Analysis", expanded=True):
+        analysis_type = st.selectbox("Select Analysis Type", [
+            "Descriptive Statistics",
+            "Correlation Analysis",
+            "Hypothesis Testing",
+            "Distribution Fitting"
+        ])
-
-                # Download predictions
-                csv = batch_df.to_csv(index=False)
-                b64 = base64.b64encode(csv.encode()).decode() # some strings
-                href = f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download Predictions CSV</a>'
-                st.markdown(href, unsafe_allow_html=True)
+        if analysis_type == "Descriptive Statistics":
+            st.write(df.describe(include='all'))
-            except Exception as e:
-                st.error(f"Error processing batch file: {e}")
+        elif analysis_type == "Correlation Analysis":
+            numeric_cols = df.select_dtypes(include=np.number).columns
+            if len(numeric_cols) >= 2:
+                corr_method = st.selectbox("Correlation Method", [
+                    "Pearson", "Kendall", "Spearman"
+                ])
+                corr_matrix = df[numeric_cols].corr(method=corr_method.lower())
+                st.write(corr_matrix)
+                # Streamlit has no st.heatmap; render the matrix with Plotly instead
+                fig_corr = px.imshow(corr_matrix, text_auto=True, color_continuous_scale=color_palette)
+                st.plotly_chart(fig_corr, use_container_width=True)
+            else:
+                st.warning("Need at least 2 numeric columns for correlation analysis")
+        elif analysis_type == "Hypothesis Testing":
+            # Only the T-test is implemented here; a chi-squared sketch appears at
+            # the end of this section
+            test_type = st.selectbox("Select Test Type", [
+                "T-test", "Chi-Squared Test", "ANOVA", "Mann-Whitney U"
+            ])
+            if test_type == "T-test":
+                col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
+                col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
+                if st.button("Run T-test"):
+                    # scipy.stats is imported locally; the top-level import was removed
+                    from scipy import stats
+                    groups = df.groupby(col2)[col1].apply(list)
+                    if len(groups) == 2:
+                        t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1])
+                        st.write(f"T-statistic: {t_stat:.4f}")
+                        st.write(f"P-value: {p_value:.4f}")
+                        if p_value < 0.05:
+                            st.write("Reject the null hypothesis.")
+                        else:
+                            st.write("Fail to reject the null hypothesis.")
+                    else:
+                        st.write("Select a categorical column with exactly two categories.")
+
+        elif analysis_type == "Distribution Fitting":
+            numeric_col = st.selectbox("Select Numeric Column", df.select_dtypes(include=np.number).columns)
+            dist_types = ["Normal", "Log-Normal", "Exponential", "Gamma"]
+            selected_dist = st.selectbox("Select Distribution Type", dist_types)
+            if st.button("Fit Distribution"):
+                from scipy.stats import norm, lognorm, expon, gamma
+                dist_functions = {
+                    "Normal": norm,
+                    "Log-Normal": lognorm,
+                    "Exponential": expon,
+                    "Gamma": gamma
+                }
+                params = dist_functions[selected_dist].fit(df[numeric_col].dropna())
+                st.write(f"Fitted Parameters: {params}")
+
+    # Data Profiling Section
+    with st.expander("📝 Generate Full Data Profile", expanded=False):
+        if st.button("🚀 Generate Comprehensive Report"):
+            with st.spinner("Generating report..."):
+                pr = ProfileReport(df, explorative=True)
+                st_profile_report(pr)
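+    # Hedged sketch for the unimplemented "Chi-Squared Test" option above, assuming
+    # scipy is available; chi2_contingency tests independence of two categoricals:
+    def chi_squared_test(data, col_a, col_b):
+        """Chi-squared test of independence on a crosstab of two categorical columns."""
+        from scipy import stats
+        contingency = pd.crosstab(data[col_a], data[col_b])
+        chi2, p_value, dof, _ = stats.chi2_contingency(contingency)
+        return chi2, p_value, dof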
+
+# Insights Section
+# NOTE: "Insights" also needs to be added to the sidebar radio options, or this
+# branch is unreachable.
+elif app_mode == "Insights":
+    st.title("📊 Model Insights & Explainability")
+    st.markdown("""
+    **Understand and Interpret Your Model** with advanced explainability tools and visualizations.
+    Gain deeper insights into model behavior and predictions.
+    """)
+
+    if 'model' not in st.session_state or st.session_state.model is None:
+        st.warning("Please train a model in the Model Training section first.")
+        st.stop()
+
+    model = st.session_state.model.steps[-1][1]  # Get the trained model
+    preprocessor = st.session_state.model.steps[0][1]  # Get the preprocessor
+
+    # Model Summary
+    st.subheader("📝 Model Summary")
+    # datetime is imported locally; it is not part of the top-level imports
+    from datetime import datetime
+    st.write(f"**Model Type:** {type(model).__name__}")
+    # Every sklearn estimator has .predict, so key off predict_proba instead
+    st.write(f"**Problem Type:** {'Classification' if hasattr(model, 'predict_proba') else 'Regression'}")
+    st.write(f"**Report Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+    # Feature Importance
+    st.subheader("🔍 Feature Importance")
+    if hasattr(model, 'feature_importances_'):
+        import seaborn as sns  # local import; seaborn was dropped from the top of the file
+        importances = model.feature_importances_
+        feature_names = preprocessor.get_feature_names_out()
+        importance_df = pd.DataFrame({
+            'Feature': feature_names,
+            'Importance': importances
+        }).sort_values('Importance', ascending=False)
+
+        fig, ax = plt.subplots()
+        sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), ax=ax)
+        ax.set_title('Top 10 Feature Importances')
+        st.pyplot(fig)
+    else:
+        st.info("Feature importance not available for this model type.")
+
+    # SHAP Values
+    st.subheader("📊 SHAP Values")
+    if st.checkbox("Calculate SHAP Values (Warning: May be slow for large datasets)"):
+        try:
+            import shap
+            explainer = shap.TreeExplainer(model)
+            shap_values = explainer.shap_values(st.session_state.X_test_selected)
+
+            # Summary Plot
+            st.write("### Summary Plot")
+            fig, ax = plt.subplots()
+            shap.summary_plot(shap_values, st.session_state.X_test_selected, feature_names=preprocessor.get_feature_names_out())
+            st.pyplot(fig)
+
+            # Force Plot for Individual Predictions
+            st.write("### Individual Prediction Explanation")
+            sample_idx = st.slider("Select Sample Index", 0, len(st.session_state.X_test_selected)-1, 0)
+            fig, ax = plt.subplots()
+            shap.force_plot(explainer.expected_value, shap_values[sample_idx], st.session_state.X_test_selected[sample_idx],
+                            feature_names=preprocessor.get_feature_names_out(), matplotlib=True, show=False)
+            st.pyplot(fig)
+        except Exception as e:
+            st.error(f"SHAP calculation failed: {e}")
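+    # Note: TreeExplainer only covers tree-based models. A hedged, model-agnostic
+    # fallback (slower, and an assumption rather than part of this app) would be:
+    #   explainer = shap.Explainer(model.predict, st.session_state.X_train_selected)
+    #   shap_values = explainer(st.session_state.X_test_selected)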
+
+    # Partial Dependence Plots
+    st.subheader("📈 Partial Dependence Plots")
+    feature_to_plot = st.selectbox("Select Feature for PDP", preprocessor.get_feature_names_out())
+    if st.button("Generate PDP"):
+        from sklearn.inspection import PartialDependenceDisplay
+        fig, ax = plt.subplots()
+        PartialDependenceDisplay.from_estimator(
+            model, st.session_state.X_test_selected,
+            features=[feature_to_plot],
+            feature_names=preprocessor.get_feature_names_out(),
+            ax=ax
+        )
+        st.pyplot(fig)
+
+    # Model Performance Over Time
+    st.subheader("⏳ Model Performance Over Time")
+    if st.checkbox("Track Performance Over Time"):
+        performance_history = {
+            'timestamp': [],
+            'metric': [],
+            'value': []
+        }
+
+        # hasattr(model, 'predict') is true for every estimator, so use
+        # predict_proba to separate regression from classification
+        y_pred = model.predict(st.session_state.X_test_selected)
+        if not hasattr(model, 'predict_proba'):
+            performance_history['metric'].append('MSE')
+            performance_history['value'].append(mean_squared_error(st.session_state.y_test, y_pred))
+        else:
+            performance_history['metric'].append('Accuracy')
+            performance_history['value'].append(accuracy_score(st.session_state.y_test, y_pred))
+        performance_history['timestamp'].append(datetime.now())
+
+        performance_df = pd.DataFrame(performance_history)
+        st.line_chart(performance_df.set_index('timestamp'))
+
+    # Model Debugging
+    st.subheader("🐛 Model Debugging")
+    if st.checkbox("Enable Debug Mode"):
+        st.write("### Model Parameters")
+        st.json(model.get_params())
+
+        st.write("### Training Data Summary")
+        st.write(f"Number of Samples: {st.session_state.X_train_selected.shape[0]}")
+        st.write(f"Number of Features: {st.session_state.X_train_selected.shape[1]}")
+
+    # Export Insights
+    st.subheader("💾 Export Insights")
+    if st.button("Export Insights as PDF"):
+        try:
+            from fpdf import FPDF
+            pdf = FPDF()
+            pdf.add_page()
+            pdf.set_font("Arial", size=12)
+            pdf.cell(200, 10, txt="Model Insights Report", ln=True, align='C')
+            pdf.cell(200, 10, txt=f"Model Type: {type(model).__name__}", ln=True)
+            pdf.cell(200, 10, txt=f"Problem Type: {'Classification' if hasattr(model, 'predict_proba') else 'Regression'}", ln=True)
+            pdf.output("model_insights.pdf")
+            st.success("Insights exported successfully!")
+        except Exception as e:
+            st.error(f"Export failed: {e}")
+# Predictions Section
+elif app_mode == "Predictions":
+    st.title("🔮 Prediction Studio")
+    st.markdown("""
+    **Make Predictions** with your trained model and explore prediction explanations.
+    Generate batch predictions and export results.
+    """)
+
+    if 'model' not in st.session_state or st.session_state.model is None:
+        st.warning("Please train a model in the Model Training section first.")
+        st.stop()
+
+    model = st.session_state.model.steps[-1][1]  # Get the trained model
+    preprocessor = st.session_state.model.steps[0][1]  # Get the preprocessor
+
+    # Single Prediction
+    st.subheader("🎯 Single Prediction")
+    input_data = {}
+    # get_feature_names_out() returns post-transform names (e.g. 'num__age') that
+    # never match the raw columns; feature_names_in_ holds the original inputs
+    feature_names = preprocessor.feature_names_in_
+    for feature in feature_names:
+        if feature in st.session_state.cleaned_data.columns:
+            if pd.api.types.is_numeric_dtype(st.session_state.cleaned_data[feature]):
+                input_data[feature] = st.number_input(f"Enter {feature}", value=st.session_state.cleaned_data[feature].mean())
+            else:
+                input_data[feature] = st.selectbox(f"Select {feature}", st.session_state.cleaned_data[feature].unique())
-elif app_mode == "Visualization Lab":
-    st.title("🔬 Advanced Data Visualization and Clustering Lab")
+    if st.button("Make Prediction"):
+        try:
+            input_df = pd.DataFrame([input_data])
+            input_processed = preprocessor.transform(input_df)
+            prediction = model.predict(input_processed)[0]
+
+            st.write(f"**Prediction:** {prediction}")
+
+            if hasattr(model, 'predict_proba'):
+                probabilities = model.predict_proba(input_processed)[0]
+                st.write("**Prediction Probabilities:**")
+                st.bar_chart(probabilities)
-    # Initialize session state for cleaned data
-    if 'cleaned_data' not in st.session_state:
-        st.session_state.cleaned_data = None
+            # SHAP Explanation
+            if st.checkbox("Show SHAP Explanation"):
+                try:
+                    import shap
+                    explainer = shap.TreeExplainer(model)
+                    shap_values = explainer.shap_values(input_processed)
+
+                    st.write("### SHAP Values")
+                    fig, ax = plt.subplots()
+                    # SHAP works on the transformed matrix, so use transformed names here
+                    shap.force_plot(explainer.expected_value, shap_values, input_processed,
+                                    feature_names=preprocessor.get_feature_names_out(), matplotlib=True, show=False)
+                    st.pyplot(fig)
+                except Exception as e:
+                    st.error(f"SHAP calculation failed: {e}")
-    # Sample data upload (replace with your data loading logic)
-    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
-    if uploaded_file is not None:
-        try:
-            df = pd.read_csv(uploaded_file)
-            st.session_state.cleaned_data = df
-            st.success("Data loaded successfully!")
+        except Exception as e:
-            st.error(f"Error loading data: {e}")
+            st.error(f"Prediction failed: {e}")
+
-    if st.session_state.cleaned_data is not None:
-        df = st.session_state.cleaned_data.copy()
-
-        # Visualization Type Selection
-        visualization_type = st.selectbox("Select Visualization Type", [
-            "Pair Plot", "Parallel Coordinates Plot", "Andrews Curves", "Pie Chart",
-            "Area Chart", "Density Contour", "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
-        ])
+    # Batch Predictions
+    st.subheader("📂 Batch Predictions")
+    batch_file = st.file_uploader("Upload CSV for Batch Predictions", type=["csv"])
+    if batch_file is not None:
+        try:
+            batch_df = pd.read_csv(batch_file)
+            batch_processed = preprocessor.transform(batch_df)
+            batch_predictions = model.predict(batch_processed)
+            batch_df['Prediction'] = batch_predictions
-        if visualization_type == "Pair Plot":
-            st.subheader("Pair Plot")
-            cols_for_pairplot = st.multiselect("Select Columns for Pair Plot", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
-            if cols_for_pairplot:
-                fig = px.scatter_matrix(df, dimensions=cols_for_pairplot)
-                st.plotly_chart(fig, use_container_width=True)
-
-        elif visualization_type == "Parallel Coordinates Plot":
-            st.subheader("Parallel Coordinates Plot")
-            cols_for_parallel = st.multiselect("Select Columns for Parallel Coordinates", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
-            if cols_for_parallel:
-                fig = px.parallel_coordinates(df[cols_for_parallel], color=df[cols_for_parallel[0]] if cols_for_parallel else None)
-                st.plotly_chart(fig, use_container_width=True)
-
-        elif visualization_type == "Andrews Curves":
-            st.subheader("Andrews Curves")
-            cols_for_andrews = st.multiselect("Select Columns for Andrews Curves", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
-            if cols_for_andrews:
-                fig = px.andrews_curves(df[cols_for_andrews + [df.columns[0]]], class_column=df.columns[0])
-                st.plotly_chart(fig, use_container_width=True)
-
-        elif visualization_type == "Pie Chart":
-            st.subheader("Pie Chart")
-            col_for_pie = st.selectbox("Select Column for Pie Chart", df.columns)
-            fig = px.pie(df, names=col_for_pie)
-            st.plotly_chart(fig, use_container_width=True)
+            if hasattr(model, 'predict_proba'):
+                probabilities = model.predict_proba(batch_processed)
+                for i in range(probabilities.shape[1]):
+                    batch_df[f'Probability_Class_{i}'] = probabilities[:, i]
-        elif visualization_type == "Area Chart":
-            st.subheader("Area Chart")
-            cols_for_area = st.multiselect("Select Columns for Area Chart", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
-            if cols_for_area:
-                fig = px.area(df[cols_for_area])
-                st.plotly_chart(fig, use_container_width=True)
-
-        elif visualization_type == "Density Contour":
-            st.subheader("Density Contour")
-            x_col = st.selectbox("Select X Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
-            y_col = st.selectbox("Select Y Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
-            fig = px.density_contour(df, x=x_col, y=y_col)
-            st.plotly_chart(fig, use_container_width=True)
+            st.write("### Predictions Preview")
+            st.dataframe(batch_df.head())
-        elif visualization_type == "Sunburst Chart":
- st.subheader("Sunburst Chart") - path_cols = st.multiselect("Select Path Columns for Sunburst Chart", df.columns) - if path_cols: - fig = px.sunburst(df, path=path_cols) - st.plotly_chart(fig, use_container_width=True) - - elif visualization_type == "Funnel Chart": - st.subheader("Funnel Chart") - x_col = st.selectbox("Select X Column for Funnel Chart (Values)", df.select_dtypes(include=np.number).columns.tolist()) - y_col = st.selectbox("Select Y Column for Funnel Chart (Categories)", df.columns) - fig = px.funnel(df, x=x_col, y=y_col) - st.plotly_chart(fig, use_container_width=True) + # Download Predictions + csv = batch_df.to_csv(index=False) + b64 = base64.b64encode(csv.encode()).decode() + href = f'Download Predictions CSV' + st.markdown(href, unsafe_allow_html=True) - elif visualization_type == "Clustering Analysis": - st.subheader("Clustering Analysis") - numerical_cols = df.select_dtypes(include=np.number).columns.tolist() + except Exception as e: + st.error(f"Batch prediction failed: {e}") - if not numerical_cols: - st.warning("No numerical columns found for clustering.") + # Prediction Analysis + st.subheader("๐Ÿ“Š Prediction Analysis") + if st.checkbox("Analyze Predictions"): + try: + y_pred = model.predict(st.session_state.X_test_selected) + y_test = st.session_state.y_test + + if hasattr(model, 'predict'): + fig, ax = plt.subplots() + ax.scatter(y_test, y_pred) + ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2) + ax.set_xlabel('Actual') + ax.set_ylabel('Predicted') + ax.set_title('Actual vs Predicted') + st.pyplot(fig) else: - cluster_cols = st.multiselect("Select Columns for Clustering", numerical_cols, default=numerical_cols[:2] if len(numerical_cols) >= 2 else numerical_cols) - - if cluster_cols: - try: - scaler = StandardScaler() - scaled_data = scaler.fit_transform(df[cluster_cols]) - n_clusters = st.slider("Number of Clusters", 2, 10, 3, help="Number of clusters to form.") - kmeans = KMeans(n_clusters=n_clusters, random_state=42) - clusters = kmeans.fit_predict(scaled_data) - df['Cluster'] = clusters - - if len(cluster_cols) == 2: - fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering") - st.plotly_chart(fig, use_container_width=True) - elif len(cluster_cols) == 3: - fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)") - st.plotly_chart(fig, use_container_width=True) - else: - st.write("Clustering visualization is only supported for 2 or 3 selected columns.") - st.success("Clustering applied successfully!") - - #Add clustering performance in clustering analysis - if len(cluster_cols) >= 2: # Evaluate Silhouette Score - try: - silhouette_avg = silhouette_score(scaled_data, clusters) - st.write(f"Silhouette Score: {silhouette_avg:.4f}") - except: - st.write("Could not compute silhouette score") - - #Add dimensionality reduction option and 2d/3d plots - - dimension_reduction = st.selectbox("Dimensionality Reduction", ["None", "PCA"]) - if dimension_reduction == "PCA": - n_components = st.slider("Number of Components", 2, min(3, len(cluster_cols)), 2) - pca = PCA(n_components=n_components) - principal_components = pca.fit_transform(scaled_data) - pca_df = pd.DataFrame(data=principal_components, columns=[f'PC{i + 1}' for i in range(n_components)]) - pca_df['Cluster'] = clusters # Add Cluster - - if len(cluster_cols) >= 2: #plotting section - fig = None #Initialize fig - if dimension_reduction == "None": - if 
-        if not numerical_cols:
-            st.warning("No numerical columns found for clustering.")
+    # Prediction Analysis
+    st.subheader("📊 Prediction Analysis")
+    if st.checkbox("Analyze Predictions"):
+        try:
+            y_pred = model.predict(st.session_state.X_test_selected)
+            y_test = st.session_state.y_test
+
+            # Every estimator exposes .predict, so use predict_proba to tell
+            # regression apart from classification
+            if not hasattr(model, 'predict_proba'):
+                fig, ax = plt.subplots()
+                ax.scatter(y_test, y_pred)
+                ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
+                ax.set_xlabel('Actual')
+                ax.set_ylabel('Predicted')
+                ax.set_title('Actual vs Predicted')
+                st.pyplot(fig)
+            else:
-        else:
-            cluster_cols = st.multiselect("Select Columns for Clustering", numerical_cols, default=numerical_cols[:2] if len(numerical_cols) >= 2 else numerical_cols)
-
-            if cluster_cols:
-                try:
-                    scaler = StandardScaler()
-                    scaled_data = scaler.fit_transform(df[cluster_cols])
-                    n_clusters = st.slider("Number of Clusters", 2, 10, 3, help="Number of clusters to form.")
-                    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
-                    clusters = kmeans.fit_predict(scaled_data)
-                    df['Cluster'] = clusters
-
-                    if len(cluster_cols) == 2:
-                        fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
-                        st.plotly_chart(fig, use_container_width=True)
-                    elif len(cluster_cols) == 3:
-                        fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
-                        st.plotly_chart(fig, use_container_width=True)
-                    else:
-                        st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
-                    st.success("Clustering applied successfully!")
-
-                    #Add clustering performance in clustering analysis
-                    if len(cluster_cols) >= 2: # Evaluate Silhouette Score
-                        try:
-                            silhouette_avg = silhouette_score(scaled_data, clusters)
-                            st.write(f"Silhouette Score: {silhouette_avg:.4f}")
-                        except:
-                            st.write("Could not compute silhouette score")
-
-                    #Add dimensionality reduction option and 2d/3d plots
-                    dimension_reduction = st.selectbox("Dimensionality Reduction", ["None", "PCA"])
-                    if dimension_reduction == "PCA":
-                        n_components = st.slider("Number of Components", 2, min(3, len(cluster_cols)), 2)
-                        pca = PCA(n_components=n_components)
-                        principal_components = pca.fit_transform(scaled_data)
-                        pca_df = pd.DataFrame(data=principal_components, columns=[f'PC{i + 1}' for i in range(n_components)])
-                        pca_df['Cluster'] = clusters # Add Cluster
-
-                    if len(cluster_cols) >= 2: #plotting section
-                        fig = None #Initialize fig
-                        if dimension_reduction == "None":
-                            if len(cluster_cols) == 2:
-                                fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
-                                st.plotly_chart(fig, use_container_width=True)
-                            elif len(cluster_cols) == 3:
-                                fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
-                                st.plotly_chart(fig, use_container_width=True)
-                            else:
-                                st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
-
-                        elif dimension_reduction == "PCA":
-                            if n_components == 2:
-                                fig = px.scatter(pca_df, x='PC1', y='PC2', color='Cluster', title="K-Means Clustering (PCA - 2D)")
-                                st.plotly_chart(fig, use_container_width=True)
-                            elif n_components == 3:
-                                fig = px.scatter_3d(pca_df, x='PC1', y='PC2', z='PC3', color='Cluster', title="K-Means Clustering (PCA - 3D)")
-                                st.plotly_chart(fig, use_container_width=True)
-                            else:
-                                st.write("PCA visualization is only supported for 2 or 3 components.")
-
-                except Exception as e:
-                    st.error(f"An error occurred during clustering: {e}")
+                # seaborn and confusion_matrix are imported locally; the top-level
+                # imports no longer include them
+                import seaborn as sns
+                from sklearn.metrics import confusion_matrix
+                conf_matrix = confusion_matrix(y_test, y_pred)
+                fig, ax = plt.subplots()
+                sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
+                ax.set_xlabel('Predicted Labels')
+                ax.set_ylabel('True Labels')
+                ax.set_title('Confusion Matrix')
+                st.pyplot(fig)
+        except Exception as e:
+            st.error(f"Prediction analysis failed: {e}")
+
+    # Prediction Export
+    st.subheader("💾 Export Predictions")
+    if st.button("Export Predictions as PDF"):
+        try:
+            from fpdf import FPDF
+            pdf = FPDF()
+            pdf.add_page()
+            pdf.set_font("Arial", size=12)
+            pdf.cell(200, 10, txt="Predictions Report", ln=True, align='C')
+            pdf.cell(200, 10, txt=f"Model Type: {type(model).__name__}", ln=True)
+            # Key off predict_proba here as well; .predict exists on every estimator
+            pdf.cell(200, 10, txt=f"Problem Type: {'Classification' if hasattr(model, 'predict_proba') else 'Regression'}", ln=True)
+            pdf.output("predictions_report.pdf")
+            st.success("Predictions exported successfully!")
+        except Exception as e:
+            st.error(f"Export failed: {e}")
+
+# Neural Network Studio Section
+elif app_mode == "Neural Network Studio":
+    st.title("🧠 Neural Network Studio")
+    st.markdown("""
+    **Build and Train Neural Networks** with advanced configurations and visualizations.
+    Explore deep learning models with ease.
+    """)
+
+    if 'cleaned_data' not in st.session_state or st.session_state.cleaned_data is None:
+        st.warning("Please clean your data in the Smart Cleaning section first.")
+        st.stop()
+
+    df = st.session_state.cleaned_data.copy()
+
+    # Target Variable Selection
+    st.subheader("🎯 Target Variable")
+    target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column to predict.")
+
+    # Problem Type Selection
+    st.subheader("📝 Problem Type")
+    problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of machine learning problem.")
+
+    # Feature Selection
+    st.subheader("🔧 Feature Selection")
+    use_all_features = st.checkbox("Use All Features", value=True, help="Select to use all features for training. Deselect to manually choose features.")
+    if use_all_features:
+        feature_columns = df.drop(columns=[target_column]).columns.tolist()
+    else:
+        feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose the features you want to use for prediction.")
+
+    # Neural Network Configuration
+    st.subheader("⚙️ Neural Network Configuration")
+    with st.expander("Configure Neural Network", expanded=True):
+        hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2)
+        neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50)
+        activation = st.selectbox("Activation Function", ["relu", "tanh", "sigmoid"])
+        learning_rate = st.slider("Learning Rate", 0.001, 0.1, 0.01)
+        epochs = st.slider("Epochs", 10, 200, 50)
+        batch_size = st.slider("Batch Size", 16, 128, 32)
+
+    # Train-Test Split
+    st.subheader("✂️ Train-Test Split")
+    test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the dataset to include in the test split.")
+
+    # Model Training
+    if st.button("🚀 Train Neural Network"):
+        with st.spinner("Training neural network..."):
+            try:
+                X = df[feature_columns]
+                y = df[target_column]
-    if st.session_state.cleaned_data is not None:
-        df = st.session_state.cleaned_data.copy()
+                # Train-Test Split
+                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
-        # Target Variable Selection
-        target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column you want to predict.")
+                # Preprocessing Pipeline
+                numeric_features = X.select_dtypes(include=np.number).columns
+                categorical_features = X.select_dtypes(exclude=np.number).columns
-        # Problem Type Selection
-        problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of machine learning problem.")
+                numeric_transformer = Pipeline(steps=[
+                    ('imputer', SimpleImputer(strategy='median')),
+                    ('scaler', StandardScaler())
+                ])
-        # Feature Selection (optional)
-        use_all_features = st.checkbox("Use All Features", value=True, help="Select to use all features for training. Deselect to manually choose features.")
-        if use_all_features:
-            feature_columns = df.drop(columns=[target_column]).columns.tolist()
-        else:
-            feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose the features you want to use for prediction.")
-
-        # Model Selection
-        model_type = st.selectbox("Select Neural Network Model", [
-            "Simple Neural Network", "Convolutional Neural Network (CNN)", "Recurrent Neural Network (RNN)"
-        ], help="Choose the neural network model to use.")
-
-        # Hyperparameter Tuning
-        with st.expander("Hyperparameter Tuning", expanded=False):
-            if model_type == "Simple Neural Network":
-                hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2, help="Number of hidden layers in the network.")
-                neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50, help="Number of neurons in each hidden layer.")
-                epochs = st.slider("Epochs", 10, 200, 50, help="Number of epochs for training.")
-                batch_size = st.slider("Batch Size", 16, 128, 32, help="Batch size for training.")
-            elif model_type == "Convolutional Neural Network (CNN)":
-                epochs_cnn = st.slider("Epochs", 10, 200, 50, help="Number of epochs for CNN training.")
-                batch_size_cnn = st.slider("Batch Size", 16, 128, 32, help="Batch size for CNN training.")
-            elif model_type == "Recurrent Neural Network (RNN)":
-                epochs_rnn = st.slider("Epochs", 10, 200, 50, help="Number of epochs for RNN training.")
-                batch_size_rnn = st.slider("Batch Size", 16, 128, 32, help="Batch size for RNN training.")
-                sequence_length = st.slider("Sequence Length (for RNN)", 10, 100, 30, help="Length of the input sequences for RNN.")
-        # Train-Test Split
-        test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the data to use for testing.")
-
-        # Model Training Button
-        if st.button("Train Neural Network Model"):
-            with st.spinner("Training neural network model..."):
-                try:
-                    # Split data
-                    X = df[feature_columns]
-                    y = df[target_column]
-                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
-
-                    # Preprocessing
-                    numeric_transformer = Pipeline(steps=[
-                        ('imputer', SimpleImputer(strategy='median')),
-                        ('scaler', StandardScaler())
-                    ])
-                    categorical_transformer = Pipeline(steps=[
-                        ('imputer', SimpleImputer(strategy='most_frequent')),
-                        ('onehot', OneHotEncoder(handle_unknown='ignore'))
+                categorical_transformer = Pipeline(steps=[
+                    ('imputer', SimpleImputer(strategy='most_frequent')),
+                    ('onehot', OneHotEncoder(handle_unknown='ignore'))
+                ])
+
+                preprocessor = ColumnTransformer(
+                    transformers=[
+                        ('num', numeric_transformer, numeric_features),
+                        ('cat', categorical_transformer, categorical_features)
+                    ])
-                    ])
-                    numeric_features = X_train.select_dtypes(include=np.number).columns
-                    categorical_features = X_train.select_dtypes(include='object').columns
-
-                    preprocessor = ColumnTransformer(
-                        transformers=[
-                            ('num', numeric_transformer, numeric_features),
-                            ('cat', categorical_transformer, categorical_features)
-                        ])
-
-                    X_train_processed = preprocessor.fit_transform(X_train)
-                    X_test_processed = preprocessor.transform(X_test)
-
-                    # Neural Network Model Selection and Training
-                    tf.random.set_seed(42) # for reproducibility
-
-                    # Callbacks (Early Stopping)
-                    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
-
-                    if model_type == "Simple Neural Network":
-                        model = keras.Sequential()
-                        model.add(layers.Input(shape=(X_train_processed.shape[1],)))
-                        for _ in range(hidden_layers):
-                            model.add(layers.Dense(neurons_per_layer, activation=activation)) # Use the selected activation
-                        model.add(
-                            layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
-                                         activation='linear' if problem_type == "Regression" else 'softmax'))
-
-                        optimizer = keras.optimizers.Adam(learning_rate=learning_rate) # Use the learning rate
-
-                        model.compile(optimizer=optimizer,
-                                      loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
-                                      metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
-
-                        history = model.fit(X_train_processed, y_train, epochs=epochs, batch_size=batch_size,
-                                            validation_split=0.2, verbose=0,
-                                            callbacks=[early_stopping]) # Added early stopping
-
-                        y_pred = model.predict(X_test_processed)
-                        if problem_type == "Classification":
-                            y_pred = np.argmax(y_pred, axis=1)
-
-                    elif model_type == "Convolutional Neural Network (CNN)":
-                        X_train_cnn = np.expand_dims(X_train_processed, axis=2)
-                        X_test_cnn = np.expand_dims(X_test_processed, axis=2)
-
-                        model = keras.Sequential()
-                        model.add(layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu',
-                                                input_shape=(X_train_cnn.shape[1], 1)))
-                        model.add(layers.MaxPooling1D(pool_size=pooling_size))
-                        model.add(layers.Flatten())
-                        model.add(layers.Dense(50, activation='relu'))
-                        model.add(
-                            layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
-                                         activation='linear' if problem_type == "Regression" else 'softmax'))
-
-                        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
-                        model.compile(optimizer=optimizer,
-                                      loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
-                                      metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
-
-                        history = model.fit(X_train_cnn, y_train, epochs=epochs_cnn, batch_size=batch_size_cnn,
-                                            validation_split=0.2, verbose=0,
-                                            callbacks=[early_stopping])
-
-                        y_pred = model.predict(X_test_cnn)
-                        if problem_type == "Classification":
-                            y_pred = np.argmax(y_pred, axis=1)
-
-                    elif model_type == "Recurrent Neural Network (RNN)":
-                        try:
-                            X_train_rnn = np.reshape(X_train_processed, (
-                                X_train_processed.shape[0], sequence_length,
-                                X_train_processed.shape[1] // sequence_length))
-                            X_test_rnn = np.reshape(X_test_processed, (
-                                X_test_processed.shape[0], sequence_length, X_test_processed.shape[1] // sequence_length))
-
-                            model = keras.Sequential()
-                            model.add(layers.SimpleRNN(units, activation='relu', # Use the selected units
-                                                       dropout=dropout_rate,
-                                                       input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])))
-                            model.add(
-                                layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
-                                             activation='linear' if problem_type == "Regression" else 'softmax'))
-
-                            optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
-                            model.compile(optimizer=optimizer,
-                                          loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
-                                          metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
-
-                            history = model.fit(X_train_rnn, y_train, epochs=epochs_rnn, batch_size=batch_size_rnn,
-                                                validation_split=0.2, verbose=0,
-                                                callbacks=[early_stopping])
-
-                            y_pred = model.predict(X_test_rnn)
-                            if problem_type == "Classification":
-                                y_pred = np.argmax(y_pred, axis=1)
-                        except Exception as e:
-                            st.error(f"Error during RNN training: {e}")
-                            st.stop() # Stop execution if RNN fails
-
-                    # Evaluation
-                    if problem_type == "Regression":
-                        mse = mean_squared_error(y_test, y_pred)
-                        rmse = np.sqrt(mse)
-                        mae = mean_absolute_error(y_test, y_pred)
-                        r2 = r2_score(y_test, y_pred)
-                        st.write(f"Mean Squared Error: {mse:.4f}")
st.write(f"Mean Squared Error: {mse:.4f}") - st.write(f"Root Mean Squared Error: {rmse:.4f}") - st.write(f"Mean Absolute Error: {mae:.4f}") - st.write(f"R-squared: {r2:.4f}") - else: - accuracy = accuracy_score(y_test, y_pred) - precision = precision_score(y_test, y_pred, average='weighted', zero_division=0) - recall = recall_score(y_test, y_pred, average='weighted', zero_division=0) - f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0) - st.write(f"Accuracy: {accuracy:.4f}") - st.write(f"Precision: {precision:.4f}") - st.write(f"Recall: {recall:.4f}") - st.write(f"F1 Score: {f1:.4f}") - st.write("Classification Report:") - st.text(classification_report(y_test, y_pred)) - - # Visualization - st.subheader("Training History") - fig, ax = plt.subplots() # Use matplotlib directly - - ax.plot(history.history['loss'], label='loss') - ax.plot(history.history['val_loss'], label='val_loss') - ax.set_xlabel('Epoch') - ax.set_ylabel('Loss') - ax.legend() - st.pyplot(fig) # Display with st.pyplot - - st.success("Model trained successfully!") + X_train_processed = preprocessor.fit_transform(X_train) + X_test_processed = preprocessor.transform(X_test) + + # Neural Network Model + model = keras.Sequential() + model.add(layers.Input(shape=(X_train_processed.shape[1],))) + for _ in range(hidden_layers): + model.add(layers.Dense(neurons_per_layer, activation=activation)) + model.add(layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)), + activation='linear' if problem_type == "Regression" else 'softmax')) + + # Compile the model + optimizer = keras.optimizers.Adam(learning_rate=learning_rate) + model.compile(optimizer=optimizer, + loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy', + metrics=['mae'] if problem_type == "Regression" else ['accuracy']) + + # Train the model + history = model.fit(X_train_processed, y_train, epochs=epochs, batch_size=batch_size, + validation_split=0.2, verbose=0) + + # Store model and preprocessor + st.session_state.model = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)]) + st.session_state.preprocessor = preprocessor + + # Store the test data + st.session_state.X_train_selected = X_train_processed + st.session_state.X_test_selected = X_test_processed + st.session_state.y_train = y_train + st.session_state.y_test = y_test + + # Model Evaluation + y_pred = model.predict(X_test_processed) + if problem_type == "Regression": + mse = mean_squared_error(y_test, y_pred) + rmse = np.sqrt(mse) + mae = mean_absolute_error(y_test, y_pred) + r2 = r2_score(y_test, y_pred) + st.write(f"Mean Squared Error: {mse:.4f}") + st.write(f"Root Mean Squared Error: {rmse:.4f}") + st.write(f"Mean Absolute Error: {mae:.4f}") + st.write(f"R-squared: {r2:.4f}") + else: + accuracy = accuracy_score(y_test, y_pred) + precision = precision_score(y_test, y_pred, average='weighted', zero_division=0) + recall = recall_score(y_test, y_pred, average='weighted', zero_division=0) + f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0) + st.write(f"Accuracy: {accuracy:.4f}") + st.write(f"Precision: {precision:.4f}") + st.write(f"Recall: {recall:.4f}") + st.write(f"F1 Score: {f1:.4f}") + st.write("Classification Report:") + st.text(classification_report(y_test, y_pred)) + + # Visualization + st.subheader("๐Ÿ“Š Training History") + fig, ax = plt.subplots() + ax.plot(history.history['loss'], label='loss') + ax.plot(history.history['val_loss'], label='val_loss') + ax.set_xlabel('Epoch') + ax.set_ylabel('Loss') + ax.legend() 
+    # Model Saving
+    if st.session_state.model is not None:
+        st.subheader("💾 Save Model")
+        model_filename = st.text_input("Enter Model Filename (without extension)", "neural_network")
+        if st.button("Save Model"):
+            try:
+                joblib.dump(st.session_state.model, f"{model_filename}.joblib")
+                st.success(f"Model saved as {model_filename}.joblib")
+            except Exception as e:
+                st.error(f"Error saving model: {e}")
\ No newline at end of file