Spaces:
Runtime error
Runtime error
File size: 4,995 Bytes
92a085a f076a08 92a085a f076a08 900c0ad f076a08 900c0ad f076a08 900c0ad f076a08 900c0ad f076a08 900c0ad f076a08 92a085a 900c0ad 92a085a 900c0ad 92a085a 900c0ad 92a085a f076a08 92a085a f076a08 92a085a 900c0ad f076a08 92a085a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from io import StringIO
import openpyxl
from st_aggrid import AgGrid, GridUpdateMode
from st_aggrid.grid_options_builder import GridOptionsBuilder
def load_data(file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    The format is chosen from the text after the last ``.`` in ``file.name``
    (compared case-insensitively).

    Args:
        file: A file-like object exposing a ``name`` attribute.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` (after reporting the
        problem through ``st.error``) when the extension is not csv/xls/xlsx.
    """
    suffix = file.name.rsplit('.', 1)[-1].lower()

    if suffix == 'csv':
        return pd.read_csv(file)
    if suffix in ('xls', 'xlsx'):
        return pd.read_excel(file)

    # Anything else is rejected with a message shown in the app.
    st.error("Unsupported file format. Please upload a CSV, XLS, or XLSX file.")
    return None
def manual_data_entry():
    """Let the user type a table into an editable grid.

    Asks for comma-separated column names; blank names are discarded.  Once
    at least one name is present, asks for a row count and shows an empty,
    fully editable AgGrid of that shape.

    Returns:
        The ``'data'`` entry of the AgGrid result (presumably the edited
        DataFrame -- confirm against st_aggrid), or ``None`` while no column
        name has been entered.
    """
    st.subheader("Manual Data Entry")

    raw_names = st.text_input("Enter column names separated by commas:")
    trimmed = (piece.strip() for piece in raw_names.split(','))
    headers = [piece for piece in trimmed if piece]
    if not headers:
        return None

    row_count = st.number_input("Enter number of rows:", min_value=1, value=5)
    blank_table = pd.DataFrame(columns=headers, index=range(row_count))

    # Every column is made editable so the user can fill the blank table in.
    builder = GridOptionsBuilder.from_dataframe(blank_table)
    builder.configure_default_column(editable=True)
    grid_result = AgGrid(
        blank_table,
        gridOptions=builder.build(),
        update_mode=GridUpdateMode.VALUE_CHANGED,
        height=400,
    )
    return grid_result['data']
def preprocess_data(data):
    """Interactively clean a DataFrame: resolve missing values, then coerce types.

    For every column that still contains missing values, a Streamlit select
    box lets the user choose between dropping the affected rows and filling
    the gaps with the column's mean, median or mode.  Afterwards each
    ``object`` column is converted to a numeric dtype when all of its values
    parse as numbers; otherwise it is kept as is.

    Args:
        data: The ``pandas.DataFrame`` to clean.  It is not modified; all
            changes are applied to a copy.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    st.subheader("Data Preprocessing")

    # Work on a copy so the caller's frame is never mutated by the fills below.
    data = data.copy()

    # Handle missing values
    if data.isnull().sum().sum() > 0:
        st.write("Handling missing values:")
        for column in data.columns:
            # Re-checked against the current frame: an earlier "Drop" may
            # already have removed the rows that were missing here.
            if data[column].isnull().sum() == 0:
                continue

            method = st.selectbox(
                f"Choose method for {column}:",
                ["Drop", "Fill with mean", "Fill with median", "Fill with mode"],
            )

            if method == "Drop":
                data = data.dropna(subset=[column])
            elif method in ("Fill with mean", "Fill with median"):
                # Mean/median are undefined for text columns; computing them
                # either raises or yields a meaningless number.
                if not pd.api.types.is_numeric_dtype(data[column]):
                    st.warning(
                        f"Cannot apply '{method}' to non-numeric column {column}; "
                        "missing values were left unchanged."
                    )
                    continue
                if method == "Fill with mean":
                    fill_value = data[column].mean()
                else:
                    fill_value = data[column].median()
                # Assign the result back instead of a chained in-place fill,
                # which does not update the frame under pandas Copy-on-Write.
                data[column] = data[column].fillna(fill_value)
            elif method == "Fill with mode":
                modes = data[column].mode()
                if modes.empty:
                    # No non-null value exists, so there is no mode to use.
                    st.warning(
                        f"Column {column} has no values to compute a mode from; "
                        "missing values were left unchanged."
                    )
                    continue
                data[column] = data[column].fillna(modes.iloc[0])

    # Convert data types
    for column in data.columns:
        if data[column].dtype == 'object':
            try:
                converted = pd.to_numeric(data[column])
            except (ValueError, TypeError):
                # Unparseable text (ValueError) or non-scalar cells (TypeError).
                st.write(f"Kept {column} as categorical.")
            else:
                data[column] = converted
                st.write(f"Converted {column} to numeric.")

    return data
def _render_figure(fig):
    """Show *fig* with ``st.pyplot`` and then close it.

    Figures created through ``pyplot`` stay registered until they are closed
    explicitly, so each one is closed as soon as it has been handed to
    Streamlit, even if rendering fails.
    """
    try:
        st.pyplot(fig)
    finally:
        plt.close(fig)


def perform_analysis(data):
    """Render an exploratory-data-analysis report for *data* in Streamlit.

    Shows, in order: summary statistics, a correlation heatmap and a pairplot
    of the ``float64``/``int64`` columns, one histogram and one box plot per
    such column, and one bar plot of value counts per ``object`` column.

    Args:
        data: The ``pandas.DataFrame`` to analyse.  When it has no rows or no
            columns a warning is shown and nothing else is rendered.

    Returns:
        None.  All output goes to the Streamlit page.
    """
    st.header("Exploratory Data Analysis")

    # Nothing can be summarised or plotted for an empty frame, and
    # ``describe()`` raises on a frame without columns.
    if data.empty:
        st.warning("No data available for analysis.")
        return

    # Summary statistics
    st.write("Summary Statistics:")
    st.write(data.describe())

    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    has_numeric = not numeric_data.empty

    # Correlation heatmap
    st.write("Correlation Heatmap:")
    if has_numeric:
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', ax=ax)
        _render_figure(fig)
    else:
        st.write("No numeric columns available for correlation heatmap.")

    # Pairplot
    st.write("Pairplot:")
    if has_numeric:
        grid = sns.pairplot(numeric_data)
        # pairplot returns a PairGrid; pass its underlying matplotlib figure.
        _render_figure(grid.figure)
    else:
        st.write("No numeric columns available for pairplot.")

    # Histogram
    st.write("Histograms:")
    for column in numeric_data.columns:
        fig, ax = plt.subplots()
        sns.histplot(data[column], kde=True, ax=ax)
        _render_figure(fig)

    # Box plots for numerical columns
    st.write("Box Plots:")
    for column in numeric_data.columns:
        fig, ax = plt.subplots()
        sns.boxplot(data=data, y=column, ax=ax)
        _render_figure(fig)

    # Bar plots for categorical columns
    categorical_columns = data.select_dtypes(include=['object']).columns
    if not categorical_columns.empty:
        st.write("Bar Plots for Categorical Variables:")
        for column in categorical_columns:
            fig, ax = plt.subplots()
            data[column].value_counts().plot(kind='bar', ax=ax)
            # Label the axes object directly rather than via pyplot's
            # implicit "current axes" state.
            ax.set_title(f"Distribution of {column}")
            ax.set_xlabel(column)
            ax.set_ylabel("Count")
            _render_figure(fig)
def main():
    """Run the Streamlit app: obtain a table, preview it, clean it, analyse it.

    The table comes either from an uploaded CSV/XLS/XLSX file or from the
    manual-entry grid, depending on the radio selection.  Nothing beyond the
    input widgets is shown until a table is available.
    """
    st.title("Interactive EDA Toolkit")
    source = st.radio("Choose data input method:", ("Upload File", "Manual Entry"))

    if source != "Upload File":
        frame = manual_data_entry()
    else:
        upload = st.file_uploader(
            "Choose a CSV, XLS, or XLSX file", type=["csv", "xls", "xlsx"]
        )
        frame = None if upload is None else load_data(upload)

    # No file chosen yet, unsupported format, or no column names entered.
    if frame is None:
        return

    st.write("Data Preview:")
    st.write(frame.head())

    cleaned = preprocess_data(frame)
    perform_analysis(cleaned)
# Script entry point: start the app only when this file is executed directly.
if __name__ == "__main__":
    main()