"""Interactive EDA toolkit: a Streamlit app for quick exploratory analysis.

Data comes from an uploaded CSV/Excel file or from a manually edited grid,
is lightly preprocessed (type conversion, missing-value handling) and is then
summarised with statistics and plots.
"""

from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import openpyxl  # not referenced directly; pandas uses it as the .xlsx engine
import pandas as pd
import seaborn as sns
import streamlit as st
from st_aggrid import AgGrid, GridUpdateMode
from st_aggrid.grid_options_builder import GridOptionsBuilder

_EXCEL_EXTENSIONS = ("xls", "xlsx")
_NUMERIC_FILL_METHODS = [
    "Drop",
    "Fill with mean",
    "Fill with median",
    "Fill with mode",
]
# Mean and median are undefined for non-numeric columns, so they are not
# offered there.
_CATEGORICAL_FILL_METHODS = ["Drop", "Fill with mode"]


def load_data(file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    Args:
        file: File-like object exposing a ``name`` attribute, such as the
            object returned by ``st.file_uploader``.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the extension is
        not supported or the file cannot be parsed. In both failure cases an
        error message is shown in the app.
    """
    file_extension = file.name.split('.')[-1].lower()
    if file_extension == 'csv':
        reader = pd.read_csv
    elif file_extension in _EXCEL_EXTENSIONS:
        reader = pd.read_excel
    else:
        st.error("Unsupported file format. Please upload a CSV, XLS, or XLSX file.")
        return None

    try:
        return reader(file)
    except (ValueError, ImportError) as exc:
        # pandas parser/empty-file errors derive from ValueError; ImportError
        # is raised when the Excel engine for the format is not installed.
        st.error(f"Could not read {file.name}: {exc}")
        return None


def manual_data_entry():
    """Let the user define columns and type values into an editable grid.

    Returns:
        The grid contents (the ``'data'`` entry of the AgGrid result), or
        ``None`` while no column name has been entered.
    """
    st.subheader("Manual Data Entry")
    raw_names = st.text_input("Enter column names separated by commas:").split(',')
    col_names = [name.strip() for name in raw_names if name.strip()]
    if not col_names:
        return None

    num_rows = st.number_input("Enter number of rows:", min_value=1, value=5)
    data = pd.DataFrame(columns=col_names, index=range(num_rows))

    builder = GridOptionsBuilder.from_dataframe(data)
    builder.configure_default_column(editable=True)
    grid_table = AgGrid(
        data,
        gridOptions=builder.build(),
        update_mode=GridUpdateMode.VALUE_CHANGED,
        height=400,
    )
    return grid_table['data']


def _fill_value(series, method):
    """Return the value used to fill gaps in *series*, or ``None`` if none exists.

    ``None`` is returned when the statistic is undefined, which happens when
    the column holds no non-missing values.
    """
    if method == "Fill with mean":
        value = series.mean()
    elif method == "Fill with median":
        value = series.median()
    else:  # "Fill with mode"
        modes = series.mode()
        value = modes.iloc[0] if not modes.empty else None
    return None if pd.isna(value) else value


def preprocess_data(data):
    """Convert column types and resolve missing values interactively.

    Object columns are converted to numeric where every value parses. For
    each column that still has missing values the user picks a strategy from
    a select box: drop the affected rows or fill them with a statistic.

    Args:
        data: The ``pandas.DataFrame`` to clean. It is not modified; the
            function works on a copy.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    st.subheader("Data Preprocessing")
    data = data.copy()

    # Convert first: text columns holding numbers (everything typed into the
    # grid arrives as text) then qualify for the mean/median options below.
    for column in data.columns:
        if data[column].dtype == 'object':
            try:
                data[column] = pd.to_numeric(data[column])
                st.write(f"Converted {column} to numeric.")
            except (ValueError, TypeError):
                st.write(f"Kept {column} as categorical.")

    if data.isnull().values.any():
        st.write("Handling missing values:")
        for column in data.columns:
            # Re-checked per column: dropping rows for an earlier column may
            # already have removed this column's gaps.
            if not data[column].isnull().any():
                continue

            if pd.api.types.is_numeric_dtype(data[column]):
                options = _NUMERIC_FILL_METHODS
            else:
                options = _CATEGORICAL_FILL_METHODS
            method = st.selectbox(f"Choose method for {column}:", options)

            if method == "Drop":
                data = data.dropna(subset=[column])
                continue

            value = _fill_value(data[column], method)
            if value is None:
                st.warning(f"Cannot fill {column}: it has no non-missing values.")
            else:
                # Assign the result back instead of an in-place fillna on the
                # selected column, which is chained assignment.
                data[column] = data[column].fillna(value)

    return data


def _show_figure(fig):
    """Render *fig* in the app, then close it.

    pyplot keeps every figure it creates alive until it is closed, so
    figures are released as soon as they have been drawn.
    """
    st.pyplot(fig)
    plt.close(fig)


def perform_analysis(data):
    """Render summary statistics and standard EDA plots for *data*.

    Shows ``describe()`` output, a correlation heatmap, a pairplot,
    per-column histograms and box plots for ``float64``/``int64`` columns,
    and bar plots of value counts for object columns.

    Args:
        data: The ``pandas.DataFrame`` to analyse.
    """
    st.header("Exploratory Data Analysis")
    if data.empty:
        # Happens, for example, when every row was dropped in preprocessing.
        st.warning("No data available for analysis.")
        return

    st.write("Summary Statistics:")
    st.write(data.describe())

    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    has_numeric = not numeric_data.empty

    st.write("Correlation Heatmap:")
    if has_numeric:
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', ax=ax)
        _show_figure(fig)
    else:
        st.write("No numeric columns available for correlation heatmap.")

    st.write("Pairplot:")
    if has_numeric:
        # pairplot returns a PairGrid; hand Streamlit its Matplotlib figure.
        pair_grid = sns.pairplot(numeric_data)
        _show_figure(pair_grid.figure)
    else:
        st.write("No numeric columns available for pairplot.")

    st.write("Histograms:")
    for column in numeric_data.columns:
        fig, ax = plt.subplots()
        sns.histplot(data[column], kde=True, ax=ax)
        _show_figure(fig)

    st.write("Box Plots:")
    for column in numeric_data.columns:
        fig, ax = plt.subplots()
        sns.boxplot(data=data, y=column, ax=ax)
        _show_figure(fig)

    categorical_columns = data.select_dtypes(include=['object']).columns
    if not categorical_columns.empty:
        st.write("Bar Plots for Categorical Variables:")
        for column in categorical_columns:
            fig, ax = plt.subplots()
            data[column].value_counts().plot(kind='bar', ax=ax)
            # Label through the axes object rather than pyplot's implicit
            # "current axes" state.
            ax.set_title(f"Distribution of {column}")
            ax.set_xlabel(column)
            ax.set_ylabel("Count")
            _show_figure(fig)


def main():
    """Run the app: collect data, preview it, preprocess it and analyse it."""
    st.title("Interactive EDA Toolkit")

    data_input_method = st.radio(
        "Choose data input method:", ("Upload File", "Manual Entry")
    )

    data = None
    if data_input_method == "Upload File":
        uploaded_file = st.file_uploader(
            "Choose a CSV, XLS, or XLSX file", type=["csv", "xls", "xlsx"]
        )
        if uploaded_file is not None:
            data = load_data(uploaded_file)
    else:
        data = manual_data_entry()

    if data is None:
        return

    st.write("Data Preview:")
    st.write(data.head())

    data = preprocess_data(data)
    perform_analysis(data)


if __name__ == "__main__":
    main()