Spaces:
Runtime error
Runtime error
File size: 11,698 Bytes
6607e79 9d0c2d9 6607e79 5d3671b 7abab37 6607e79 7abab37 6607e79 5d3671b 2aae306 900c0ad b84e319 96b9255 6607e79 cbb0a6e 89d8e3e cbb0a6e 89d8e3e 6607e79 5d3671b 6607e79 5d3671b 6607e79 5d3671b 6607e79 5d89abf 6607e79 5d3671b 6607e79 5d89abf 6607e79 5d89abf 6607e79 5d89abf 6607e79 71227fd 2aae306 6607e79 2aae306 7abab37 5d3671b 2aae306 7abab37 f7f3976 7abab37 71227fd f7f3976 71227fd 6607e79 5d89abf 6607e79 38cbba4 2aae306 38cbba4 900c0ad cc89531 6607e79 5d3671b 92a085a 2aae306 b84e319 2aae306 b84e319 2aae306 38cbba4 bf71d2b cc156a3 cbb0a6e cc156a3 cbb0a6e deb6b04 cc156a3 deb6b04 cbb0a6e 2aae306 5d3671b 38cbba4 5d3671b 92a085a |
|
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from io import StringIO
import openpyxl
import matplotlib.font_manager as fm
from scipy import stats
import os
# Korean font setup.
def set_font():
    """Register the bundled Pretendard font and return matplotlib settings.

    Returns:
        dict: ``font.family`` and ``axes.unicode_minus`` overrides. When the
        font file is not present, ``font.family`` falls back to
        ``'sans-serif'`` so that importing the app does not fail.
    """
    font_path = "Pretendard-Bold.ttf"  # replace with the actual font file path
    if not os.path.isfile(font_path):
        import warnings

        warnings.warn(f"Font file not found: {font_path}; using the default font.")
        return {'font.family': 'sans-serif', 'axes.unicode_minus': False}
    fm.fontManager.addfont(font_path)
    # Use the family name stored inside the font file instead of guessing it
    # from the file name.
    family = fm.FontProperties(fname=font_path).get_name()
    return {'font.family': family, 'axes.unicode_minus': False}
# Fetch the font settings by calling set_font() when the module is loaded.
font_settings = set_font()
# Initialise and manage the session state.
def manage_session_state():
    """Seed ``st.session_state`` with each key listed below when it is absent.

    Keys that already exist are left untouched.
    """
    initial_values = {
        'data': None,
        'processed_data': None,
        'numeric_columns': [],
        'categorical_columns': [],
        'x_var': None,
        'y_var': None,
        'slicers': {},
        'analysis_performed': False,
        'filtered_data': None,
    }
    for state_key, initial in initial_values.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = initial
# Bundled example datasets: display name shown in the UI and the file name
# that load_sample_data() resolves inside the "sample_data" directory.
SAMPLE_DATA_FILES = [
    {"name": "과목별 노력과 성취도", "file": "subject.xlsx"},
    {"name": "성적", "file": "score.xlsx"},
    {"name": "출석일수와 성적", "file": "attendance.xlsx"},
]
def load_sample_data(file_name):
    """Load one of the bundled example files from the ``sample_data`` folder.

    Args:
        file_name: File name relative to ``sample_data``; the extension
            (matched case-insensitively) selects the reader.

    Returns:
        The loaded DataFrame, or ``None`` after showing an error when the
        extension is unsupported or the file does not exist.
    """
    # Path of the example data file.
    file_path = os.path.join("sample_data", file_name)
    extension = os.path.splitext(file_name)[1].lower()
    readers = {
        '.csv': pd.read_csv,
        '.xls': pd.read_excel,
        '.xlsx': pd.read_excel,
    }
    reader = readers.get(extension)
    if reader is None:
        st.error("지원되지 않는 파일 형식입니다.")
        return None
    try:
        return reader(file_path)
    except FileNotFoundError:
        # Report the problem in the UI instead of crashing the whole app.
        st.error(f"예제 데이터 파일을 찾을 수 없습니다: {file_path}")
        return None
# Load data from an uploaded file.
@st.cache_data
def load_data(file):
    """Read an uploaded CSV/XLS/XLSX file into a DataFrame.

    Args:
        file: Uploaded file object; its ``name`` attribute supplies the
            extension that selects the reader.

    Returns:
        The loaded DataFrame, or ``None`` after showing an error when the
        extension is not supported.
    """
    file_extension = file.name.split('.')[-1].lower()
    if file_extension == 'csv':
        data = pd.read_csv(file)
    elif file_extension in ('xls', 'xlsx'):
        data = pd.read_excel(file)
    else:
        st.error("지원되지 않는 파일 형식입니다. CSV, XLS, 또는 XLSX 파일을 업로드해주세요.")
        return None
    # Give empty column names a default label.
    if data.columns.isnull().any():
        data.columns = [
            f'Column_{i+1}' if pd.isnull(col) else col
            for i, col in enumerate(data.columns)
        ]
    return data
def manual_data_entry():
    """Let the user define columns and type data into an editable grid.

    Returns:
        The DataFrame returned by the data editor, or ``None`` while no
        column name has been entered.
    """
    raw_names = st.text_input("열 이름을 쉼표로 구분하여 입력하세요:", key="manual_col_names")
    # Strip blanks and drop repeated names (order preserved) so the frame
    # never has ambiguous duplicate columns.
    col_names = list(dict.fromkeys(
        name.strip() for name in raw_names.split(',') if name.strip()
    ))
    if not col_names:
        return None
    num_rows = st.number_input("초기 행의 수를 입력하세요:", min_value=1, value=5, key="manual_num_rows")
    data = pd.DataFrame(columns=col_names, index=range(int(num_rows)))
    return st.data_editor(data, num_rows="dynamic", key="manual_data_editor")
def preprocess_data(data):
    """Infer numeric columns, handle missing values and classify columns.

    Works on a copy so the caller's DataFrame is not modified. Stores the
    numeric and non-numeric column names in ``st.session_state``.

    Args:
        data: DataFrame to clean.

    Returns:
        The processed DataFrame.
    """
    data = data.copy()

    # Infer and convert data types.
    for column in data.columns:
        if data[column].dtype != 'object':
            continue
        try:
            # Try a numeric conversion; values that cannot be parsed become NaN.
            numeric_converted = pd.to_numeric(data[column], errors='coerce')
        except (ValueError, TypeError):
            st.write(f"'{column}' 열은 범주형으로 유지됩니다.")
            continue
        # Use the converted column unless every value turned into NaN.
        if not numeric_converted.isna().all():
            data[column] = numeric_converted
            st.write(f"'{column}' 열을 숫자형으로 변환했습니다.")

    # Missing-value handling.
    if data.isnull().sum().sum() > 0:
        st.write("결측치 처리:")
        for column in data.columns:
            if data[column].isnull().sum() == 0:
                continue
            method = st.selectbox(
                f"{column} 열의 처리 방법 선택:",
                ["제거", "평균으로 대체", "중앙값으로 대체", "최빈값으로 대체"],
                key=f"missing_{column}",
            )
            # Fills are assigned back to the column: an in-place fill on a
            # column selection is chained assignment and may not reach `data`.
            if method == "제거":
                data = data.dropna(subset=[column])
            elif method == "평균으로 대체":
                if pd.api.types.is_numeric_dtype(data[column]):
                    data[column] = data[column].fillna(data[column].mean())
                else:
                    st.warning(f"{column} 열은 숫자형이 아니어서 평균값으로 대체할 수 없습니다.")
            elif method == "중앙값으로 대체":
                if pd.api.types.is_numeric_dtype(data[column]):
                    data[column] = data[column].fillna(data[column].median())
                else:
                    st.warning(f"{column} 열은 숫자형이 아니어서 중앙값으로 대체할 수 없습니다.")
            elif method == "최빈값으로 대체":
                modes = data[column].mode()
                if modes.empty:
                    # A column with no values at all has no mode.
                    st.warning(f"{column} 열에 값이 없어 최빈값으로 대체할 수 없습니다.")
                else:
                    data[column] = data[column].fillna(modes.iloc[0])

    # Separate numeric and categorical columns (any numeric dtype counts,
    # not only the 64-bit ones).
    st.session_state.numeric_columns = data.select_dtypes(include='number').columns.tolist()
    st.session_state.categorical_columns = data.select_dtypes(exclude='number').columns.tolist()
    return data
def update_filtered_data():
    """Slicer ``on_change`` callback: refresh the filtered data.

    The values stored in ``st.session_state.slicers`` were written during
    the previous script run, so they are first synchronised from the widget
    keys (``slicer_<column>``) to make the filter reflect the selection that
    triggered this callback.
    """
    slicers = st.session_state.slicers
    for col in list(slicers):
        widget_key = f"slicer_{col}"
        if widget_key in st.session_state:
            slicers[col] = st.session_state[widget_key]
    st.session_state.filtered_data = apply_slicers(st.session_state.processed_data)
def create_slicers(data):
    """Render a multiselect slicer for each low-cardinality categorical column.

    Columns listed in ``st.session_state.categorical_columns`` with at most
    10 distinct values get a multiselect whose result is stored in
    ``st.session_state.slicers``.

    Args:
        data: DataFrame supplying the selectable values.
    """
    for col in st.session_state.categorical_columns:
        if data[col].nunique() > 10:
            continue
        # Missing values are left out of the choices, and values that cannot
        # be compared with each other are ordered by their text form, so
        # sorting never raises TypeError.
        values = data[col].dropna().unique().tolist()
        try:
            options = sorted(values)
        except TypeError:
            options = sorted(values, key=str)
        st.session_state.slicers[col] = st.multiselect(
            f"{col} 선택",
            options=options,
            default=options,
            key=f"slicer_{col}",
            on_change=update_filtered_data,
        )
def apply_slicers(data):
    """Return a copy of *data* restricted to the values chosen in each slicer.

    A slicer with an empty selection is skipped, i.e. it does not filter.
    """
    result = data.copy()
    for column_name, chosen in st.session_state.slicers.items():
        if not chosen:
            continue
        keep_rows = result[column_name].isin(chosen)
        result = result[keep_rows]
    return result
def plot_correlation_heatmap(data):
    """Draw a correlation heatmap of the numeric columns.

    Uses the column list stored in ``st.session_state.numeric_columns`` and
    shows a warning instead when there is nothing to plot.

    Args:
        data: DataFrame containing the numeric columns.
    """
    numeric_data = data[st.session_state.numeric_columns]
    if numeric_data.empty:
        st.warning("상관관계 히트맵을 그릴 수 있는 숫자형 열이 없습니다.")
        return
    corr = numeric_data.corr()
    fig = px.imshow(corr, color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
    fig.update_layout(title='상관관계 히트맵')
    st.plotly_chart(fig)
def plot_scatter_with_regression(data, x_var, y_var, color_col='반'):
    """Plot a scatter chart of two variables with a fitted regression line.

    Args:
        data: DataFrame holding both variables.
        x_var: Column name for the x axis.
        y_var: Column name for the y axis.
        color_col: Column used to colour the points when it exists in
            *data*; ignored otherwise.

    The chart is always drawn. The regression line and statistics are
    skipped, with a warning, when fewer than two complete (x, y) pairs
    remain or all x values are identical, because a line cannot be fitted.
    """
    fig = px.scatter(
        data, x=x_var, y=y_var,
        color=color_col if color_col in data.columns else None,
    )

    # Fit only on rows where both values are present.
    points = data[[x_var, y_var]].dropna()
    if len(points) < 2 or points[x_var].nunique() < 2:
        fig.update_layout(
            title=f'{x_var}와 {y_var}의 관계',
            xaxis_title=x_var,
            yaxis_title=y_var,
        )
        st.plotly_chart(fig)
        st.warning("회귀선을 계산하기에 데이터가 충분하지 않습니다.")
        return

    # Add the regression line.
    x = points[x_var]
    y = points[y_var]
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    line_x = np.array([x.min(), x.max()])
    line_y = slope * line_x + intercept
    fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='회귀선'))

    r_squared = r_value ** 2
    fig.update_layout(
        title=f'{x_var}와 {y_var}의 관계 (R-squared: {r_squared:.4f})',
        xaxis_title=x_var,
        yaxis_title=y_var,
        annotations=[
            dict(
                x=0.5,
                y=1.05,
                xref='paper',
                yref='paper',
                text=f'R-squared: {r_squared:.4f}',
                showarrow=False,
            )
        ],
    )
    st.plotly_chart(fig)

    # Additional statistics.
    st.write(f"상관계수: {r_value:.4f}")
    st.write(f"p-value: {p_value:.4f}")
    st.write(f"표준 오차: {std_err:.4f}")
def perform_analysis():
    """Render the exploratory analysis for the processed data.

    Shows the slicers, summary statistics, the correlation heatmap and a
    scatter/regression view of two user-selected numeric columns.
    """
    processed = st.session_state.processed_data
    st.header("탐색적 데이터 분석")

    # Create the slicers, then filter with the values they just returned so
    # the views below always reflect the current selection.
    create_slicers(processed)
    st.session_state.filtered_data = apply_slicers(processed)
    filtered = st.session_state.filtered_data

    # Summary statistics.
    st.write("요약 통계:")
    st.write(filtered.describe())

    # Correlation heatmap.
    st.subheader("상관관계 히트맵")
    plot_correlation_heatmap(filtered)

    # Scatter plot and regression for two user-selected variables.
    st.subheader("두 변수 간의 관계 분석")
    numeric_columns = st.session_state.numeric_columns
    x_var = st.selectbox("X축 변수 선택", options=numeric_columns, key='x_var')
    y_var = st.selectbox(
        "Y축 변수 선택",
        options=[col for col in numeric_columns if col != x_var],
        key='y_var',
    )
    if x_var and y_var:
        plot_scatter_with_regression(filtered, x_var, y_var)
def main():
    """Entry point of the Streamlit app.

    Flow: choose an input method and load data, let the user rename columns
    and edit the data, then preprocess and analyse once the start button has
    been pressed.
    """
    st.title("인터랙티브 EDA 툴킷")
    manage_session_state()

    if st.session_state.data is None:
        data_input_method = st.radio(
            "데이터 입력 방법 선택:",
            ("파일 업로드", "예제 데이터 사용", "수동 입력"),
            key="data_input_method",
        )
        if data_input_method == "파일 업로드":
            uploaded_file = st.file_uploader(
                "CSV, XLS, 또는 XLSX 파일을 선택하세요",
                type=["csv", "xls", "xlsx"],
                key="file_uploader",
            )
            if uploaded_file is not None:
                st.session_state.data = load_data(uploaded_file)
        elif data_input_method == "예제 데이터 사용":
            sample_choice = st.selectbox(
                "예제 데이터 선택",
                options=[sample["name"] for sample in SAMPLE_DATA_FILES],
            )
            if st.button("선택한 예제 데이터 로드"):
                selected_file = next(
                    sample["file"] for sample in SAMPLE_DATA_FILES
                    if sample["name"] == sample_choice
                )
                st.session_state.data = load_sample_data(selected_file)
        else:
            st.session_state.data = manual_data_entry()

    if st.session_state.data is not None:
        st.subheader("열 이름 수정")
        st.write("열 이름을 확인하고 필요한 경우 수정하세요:")

        # Build a one-column frame for editing the column names.
        current_names = list(st.session_state.data.columns)
        column_names = pd.DataFrame({'현재 열 이름': current_names})
        edited_column_names = st.data_editor(
            column_names,
            num_rows="fixed",
            key="column_name_editor",
            column_config={
                "현재 열 이름": st.column_config.TextColumn(
                    "열 이름",
                    help="새로운 열 이름을 입력하세요",
                    max_chars=50,
                )
            },
        )

        # Apply the edited names as a plain list; a cleared or blank cell
        # keeps the previous name so no column ends up unnamed.
        st.session_state.data.columns = [
            new if isinstance(new, str) and new.strip() else old
            for old, new in zip(current_names, edited_column_names['현재 열 이름'])
        ]

        st.subheader("데이터 미리보기 및 수정")
        st.write("데이터를 확인하고 필요한 경우 수정하세요:")
        edited_data = st.data_editor(
            st.session_state.data,
            num_rows="dynamic",
            key="main_data_editor",
        )

        if st.button("데이터 분석 시작", key="start_analysis"):
            st.session_state.analysis_performed = True

        if st.session_state.analysis_performed:
            # Preprocess on every rerun: the missing-value selectboxes live
            # inside preprocess_data(), so running it only on the click run
            # would make them vanish before a choice could take effect.
            st.session_state.processed_data = preprocess_data(edited_data)
            st.session_state.filtered_data = apply_slicers(st.session_state.processed_data)
            perform_analysis()
# Run the app when executed as a script.
if __name__ == "__main__":
    main()