# Page-header residue captured with this file (not part of the program):
#   Spaces: Runtime error
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import plotly.express as px | |
import plotly.graph_objects as go | |
from io import StringIO | |
import openpyxl | |
import matplotlib.font_manager as fm | |
from scipy import stats | |
import os | |
# Korean (Hangul) font setup
def set_font(font_path="Pretendard-Bold.ttf"):
    """Register a TrueType font with matplotlib and return matching settings.

    Args:
        font_path: Path to the font file to register. Defaults to the
            file name the app originally hard-coded.

    Returns:
        A dict of rcParams-style settings. ``'axes.unicode_minus'`` is always
        present and ``False``; ``'font.family'`` is present only when the
        font file exists and was registered.
    """
    settings = {'axes.unicode_minus': False}

    # This function runs at import time (see the call below); a missing font
    # file must not take the whole app down before any page is rendered.
    if not os.path.isfile(font_path):
        return settings

    fm.fontManager.addfont(font_path)
    # Use the family name stored in the font file itself rather than a
    # hard-coded guess derived from the file name.
    settings['font.family'] = fm.FontProperties(fname=font_path).get_name()
    return settings


# Resolve the font settings once, when the module is loaded.
font_settings = set_font()
# Session-state initialization and management
def manage_session_state():
    """Create every session-state entry the app reads, if it is not set yet.

    Entries that already exist in ``st.session_state`` are left untouched.
    """
    # Each value is a zero-argument factory so that list/dict defaults are
    # built fresh at assignment time instead of being shared objects.
    default_factories = {
        'data': lambda: None,
        'processed_data': lambda: None,
        'numeric_columns': list,
        'categorical_columns': list,
        'x_var': lambda: None,
        'y_var': lambda: None,
        'slicers': dict,
        'analysis_performed': lambda: False,
        'filtered_data': lambda: None,
    }
    for state_key, make_default in default_factories.items():
        if state_key not in st.session_state:
            setattr(st.session_state, state_key, make_default())
# Bundled sample datasets: display name shown in the UI -> file name that
# load_sample_data() looks up.
SAMPLE_DATA_FILES = [
    {"name": display_name, "file": file_name}
    for display_name, file_name in (
        ("κ³Όλͺ©λ³ λ Έλ ₯κ³Ό μ±μ·¨λ", "subject.xlsx"),
        ("μ±μ ", "score.xlsx"),
        ("μΆμμΌμμ μ±μ ", "attendance.xlsx"),
    )
]
def load_sample_data(file_name):
    """Load one of the bundled sample datasets from the ``sample_data`` folder.

    Args:
        file_name: File name (not a full path) of a CSV/XLS/XLSX file
            located under ``sample_data``.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` (after showing an error
        in the UI) when the extension is unsupported or the file is missing.
    """
    file_path = os.path.join("sample_data", file_name)

    # Compare the extension case-insensitively, the same way load_data() does
    # for uploaded files.
    extension = os.path.splitext(file_name)[1].lower()
    if extension == '.csv':
        reader = pd.read_csv
    elif extension in ('.xls', '.xlsx'):
        reader = pd.read_excel
    else:
        st.error("μ§μλμ§ μλ νμΌ νμμ λλ€.")
        return None

    try:
        return reader(file_path)
    except FileNotFoundError:
        # Report a missing sample file in the UI instead of crashing the app.
        # Message: "Sample data file not found: <path>"
        st.error(f"예제 데이터 파일을 찾을 수 없습니다: {file_path}")
        return None
# Data loading
def load_data(file):
    """Read an uploaded CSV/XLS/XLSX file into a DataFrame.

    Args:
        file: File-like object with a ``name`` attribute; the text after the
            last dot of the name (lower-cased) selects the reader.

    Returns:
        The loaded ``pandas.DataFrame`` with null column labels replaced by
        ``Column_<position>``, or ``None`` (after showing an error in the UI)
        when the extension is not supported.
    """
    readers = {
        'csv': pd.read_csv,
        'xls': pd.read_excel,
        'xlsx': pd.read_excel,
    }
    suffix = file.name.rsplit('.', 1)[-1].lower()
    reader = readers.get(suffix)
    if reader is None:
        st.error("μ§μλμ§ μλ νμΌ νμμ λλ€. CSV, XLS, λλ XLSX νμΌμ μ λ‘λν΄μ£ΌμΈμ.")
        return None

    frame = reader(file)

    # Give null column labels a positional default name (1-based).
    if frame.columns.isnull().any():
        frame.columns = [
            f'Column_{position}' if pd.isnull(label) else label
            for position, label in enumerate(frame.columns, start=1)
        ]
    return frame
def manual_data_entry():
    """Let the user type column names and fill in an empty table by hand.

    Returns:
        The value returned by ``st.data_editor`` for a blank frame built from
        the entered column names, or ``None`` while no non-empty column name
        has been entered.
    """
    raw_names = st.text_input("μ΄ μ΄λ¦μ μΌνλ‘ κ΅¬λΆνμ¬ μ λ ₯νμΈμ:", key="manual_col_names")
    # Split on commas and discard entries that are empty after trimming.
    col_names = [part.strip() for part in raw_names.split(',') if part.strip()]
    if not col_names:
        return None

    num_rows = st.number_input("μ΄κΈ° νμ μλ₯Ό μ λ ₯νμΈμ:", min_value=1, value=5, key="manual_num_rows")
    blank_frame = pd.DataFrame(columns=col_names, index=range(num_rows))
    return st.data_editor(blank_frame, num_rows="dynamic", key="manual_data_editor")
def preprocess_data(data):
    """Infer column types, handle missing values, and classify columns.

    Steps, in order:
      1. Every object-dtype column is passed through ``pd.to_numeric`` with
         ``errors='coerce'``; if at least one value parses, the column is
         replaced by the numeric version (unparseable entries become NaN).
      2. For each column that has missing values, a select box asks how to
         treat them: drop rows, or fill with mean / median / mode.
      3. Numeric and non-numeric column names are stored in
         ``st.session_state.numeric_columns`` / ``categorical_columns``.

    Args:
        data: Input ``pandas.DataFrame``. It is not modified.

    Returns:
        A new, preprocessed ``pandas.DataFrame``.
    """
    # Work on a copy so the caller's frame is never mutated.
    data = data.copy()

    # --- Type inference and conversion ---
    for column in data.columns:
        if data[column].dtype != 'object':
            continue
        try:
            # errors='coerce' maps unparseable entries to NaN instead of raising.
            numeric_converted = pd.to_numeric(data[column], errors='coerce')
        except (TypeError, ValueError):
            numeric_converted = None

        if numeric_converted is not None and not numeric_converted.isna().all():
            data[column] = numeric_converted
            st.write(f"'{column}' μ΄μ μ«μνμΌλ‘ λ³ννμ΅λλ€.")
        else:
            # Nothing parsed as a number: the column stays as it is.
            st.write(f"'{column}' μ΄μ λ²μ£ΌνμΌλ‘ μ μ§λ©λλ€.")

    # --- Missing-value handling ---
    missing_options = ["μ κ±°", "νκ· μΌλ‘ λ체", "μ€μκ°μΌλ‘ λ체", "μ΅λΉκ°μΌλ‘ λ체"]
    drop_rows, fill_mean, fill_median, fill_mode = missing_options

    if data.isnull().sum().sum() > 0:
        st.write("κ²°μΈ‘μΉ μ²λ¦¬:")
        for column in data.columns:
            if data[column].isnull().sum() == 0:
                continue
            method = st.selectbox(
                f"{column} μ΄μ μ²λ¦¬ λ°©λ² μ ν:",
                missing_options,
                key=f"missing_{column}",
            )
            is_numeric = pd.api.types.is_numeric_dtype(data[column])

            # Fills are assigned back to the column instead of using
            # data[column].fillna(..., inplace=True): the chained in-place
            # form operates on a temporary and is not guaranteed to update
            # the frame.
            if method == drop_rows:
                data = data.dropna(subset=[column])
            elif method == fill_mean:
                if is_numeric:
                    data[column] = data[column].fillna(data[column].mean())
                else:
                    st.warning(f"{column} μ΄μ μ«μνμ΄ μλμ΄μ νκ· κ°μΌλ‘ λ체ν μ μμ΅λλ€.")
            elif method == fill_median:
                if is_numeric:
                    data[column] = data[column].fillna(data[column].median())
                else:
                    st.warning(f"{column} μ΄μ μ«μνμ΄ μλμ΄μ μ€μκ°μΌλ‘ λ체ν μ μμ΅λλ€.")
            elif method == fill_mode:
                modes = data[column].mode()
                # mode() is empty when the column has no non-null value;
                # there is nothing to fill with in that case.
                if not modes.empty:
                    data[column] = data[column].fillna(modes.iloc[0])

    # --- Split numeric and categorical columns ---
    # 'number' covers every numeric dtype, not only float64/int64.
    st.session_state.numeric_columns = data.select_dtypes(include='number').columns.tolist()
    st.session_state.categorical_columns = data.select_dtypes(exclude='number').columns.tolist()
    return data
def update_filtered_data():
    """Widget ``on_change`` callback: recompute the filtered data.

    Streamlit invokes callbacks before the script reruns, so at this point
    ``st.session_state.slicers`` still holds the selections recorded during
    the previous run. The widgets' new values are already available under
    their own keys (``slicer_<column>``), so the slicer selections are
    refreshed from those keys before filtering; otherwise the filtered data
    would lag one interaction behind the UI.
    """
    for col in list(st.session_state.slicers):
        widget_key = f"slicer_{col}"
        if widget_key in st.session_state:
            st.session_state.slicers[col] = st.session_state[widget_key]

    st.session_state.filtered_data = apply_slicers(st.session_state.processed_data)
def create_slicers(data):
    """Render one multiselect filter per low-cardinality categorical column.

    For every column listed in ``st.session_state.categorical_columns`` that
    has at most 10 distinct values, a multiselect is shown with all values
    pre-selected, and the widget's current selection is stored in
    ``st.session_state.slicers[column]``.

    Args:
        data: ``pandas.DataFrame`` whose columns provide the filter options.
    """
    for col in st.session_state.categorical_columns:
        if data[col].nunique() > 10:
            continue

        # Compute the option list once and reuse it for options and default.
        unique_values = list(data[col].unique())
        try:
            options = sorted(unique_values)
        except TypeError:
            # Mixed types (e.g. strings alongside NaN) are not mutually
            # orderable; fall back to ordering by string representation.
            options = sorted(unique_values, key=str)

        st.session_state.slicers[col] = st.multiselect(
            f"{col} μ ν",
            options=options,
            default=options,
            key=f"slicer_{col}",
            on_change=update_filtered_data,
        )
def apply_slicers(data):
    """Return a copy of ``data`` restricted to the current slicer selections.

    Args:
        data: ``pandas.DataFrame`` to filter.

    Returns:
        A new DataFrame keeping only rows whose value, for every slicer
        column with a non-empty selection, is among the selected values.
        Slicers with an empty selection do not filter anything.
    """
    result = data.copy()
    for column, chosen in st.session_state.slicers.items():
        if not chosen:
            continue
        keep_mask = result[column].isin(chosen)
        result = result[keep_mask]
    return result
def plot_correlation_heatmap(data):
    """Draw a correlation heatmap of the numeric columns, or warn if none.

    Args:
        data: ``pandas.DataFrame`` containing the columns listed in
            ``st.session_state.numeric_columns``.
    """
    numeric_only = data[st.session_state.numeric_columns]
    if numeric_only.empty:
        st.warning("μκ΄κ΄κ³ ννΈλ§΅μ 그릴 μ μλ μ«μν μ΄μ΄ μμ΅λλ€.")
        return

    correlation = numeric_only.corr()
    # Fixed colour range so colours map onto the [-1, 1] correlation scale.
    heatmap = px.imshow(correlation, color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
    heatmap.update_layout(title='μκ΄κ΄κ³ ννΈλ§΅')
    st.plotly_chart(heatmap)
def plot_scatter_with_regression(data, x_var, y_var):
    """Show a scatter plot of two columns with a fitted regression line.

    The scatter plot is always drawn. The regression line and the statistics
    below the chart (correlation coefficient, p-value, standard error) are
    added only when a fit is possible.

    Args:
        data: ``pandas.DataFrame`` holding both columns.
        x_var: Name of the column plotted on the x axis.
        y_var: Name of the column plotted on the y axis.
    """
    # Colour the points by this column when the data has it.
    color_column = 'λ°' if 'λ°' in data.columns else None
    fig = px.scatter(data, x=x_var, y=y_var, color=color_column)

    # Fit only on rows where both values are present: NaNs would turn every
    # regression statistic into NaN.
    pairs = data[[x_var, y_var]].dropna()

    # linregress raises when there is no data or all x values are identical
    # (both reachable after filtering), so show the plain scatter instead.
    if len(pairs) < 2 or pairs[x_var].nunique() < 2:
        st.plotly_chart(fig)
        # Message: "At least two valid data points with different x values
        # are required to compute the regression line."
        st.warning("회귀선을 계산하려면 X축 값이 서로 다른 유효한 데이터가 2개 이상 필요합니다.")
        return

    x = pairs[x_var]
    y = pairs[y_var]
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

    # The line is drawn between the smallest and largest observed x.
    line_x = np.array([x.min(), x.max()])
    line_y = slope * line_x + intercept
    fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='νκ·μ '))

    r_squared = r_value ** 2
    fig.update_layout(
        title=f'{x_var}μ {y_var}μ κ΄κ³ (R-squared: {r_squared:.4f})',
        xaxis_title=x_var,
        yaxis_title=y_var,
        annotations=[
            dict(
                x=0.5,
                y=1.05,
                xref='paper',
                yref='paper',
                text=f'R-squared: {r_squared:.4f}',
                showarrow=False,
            )
        ],
    )
    st.plotly_chart(fig)

    # Additional statistics shown below the chart.
    st.write(f"μκ΄κ³μ: {r_value:.4f}")
    st.write(f"p-value: {p_value:.4f}")
    st.write(f"νμ€ μ€μ°¨: {std_err:.4f}")
def perform_analysis():
    """Render the exploratory-analysis section of the page.

    In order: seeds ``filtered_data`` from ``processed_data`` when unset,
    draws the slicers, summary statistics and the correlation heatmap, then
    lets the user pick two numeric columns for a scatter/regression plot.
    """
    state = st.session_state
    if state.filtered_data is None:
        state.filtered_data = state.processed_data.copy()

    st.header("νμμ λ°μ΄ν° λΆμ")

    # Slicers are built from the unfiltered data.
    create_slicers(state.processed_data)

    # Summary statistics
    st.write("μμ½ ν΅κ³:")
    summary = state.filtered_data.describe()
    st.write(summary)

    # Correlation heatmap
    st.subheader("μκ΄κ΄κ³ ννΈλ§΅")
    plot_correlation_heatmap(state.filtered_data)

    # Scatter plot and regression for two user-selected variables
    st.subheader("λ λ³μ κ°μ κ΄κ³ λΆμ")
    numeric_cols = state.numeric_columns
    x_var = st.selectbox("XμΆ λ³μ μ ν", options=numeric_cols, key='x_var')
    # The y choices exclude whichever column is used for x.
    y_candidates = [col for col in numeric_cols if col != x_var]
    y_var = st.selectbox("YμΆ λ³μ μ ν", options=y_candidates, key='y_var')

    if not (x_var and y_var):
        return
    plot_scatter_with_regression(state.filtered_data, x_var, y_var)
def main():
    """Entry point of the Streamlit page.

    While no dataset is loaded, offers three input methods (file upload,
    bundled sample data, manual entry). Once a dataset is present, shows an
    editor for the column names and one for the data, and runs preprocessing
    plus the analysis section after the start button has been pressed.
    """
    st.title("μΈν°λν°λΈ EDA ν΄ν·")
    manage_session_state()

    if st.session_state.data is None:
        # Bind each option label once so the radio choices and the
        # comparisons below cannot drift apart.
        upload_option = "νμΌ μ λ‘λ"
        sample_option = "μμ λ°μ΄ν° μ¬μ©"
        manual_option = "μλ μ λ ₯"
        data_input_method = st.radio(
            "λ°μ΄ν° μ λ ₯ λ°©λ² μ ν:",
            (upload_option, sample_option, manual_option),
            key="data_input_method",
        )

        if data_input_method == upload_option:
            uploaded_file = st.file_uploader(
                "CSV, XLS, λλ XLSX νμΌμ μ ννμΈμ",
                type=["csv", "xls", "xlsx"],
                key="file_uploader",
            )
            if uploaded_file is not None:
                st.session_state.data = load_data(uploaded_file)
        elif data_input_method == sample_option:
            sample_choice = st.selectbox(
                "μμ λ°μ΄ν° μ ν",
                options=[sample["name"] for sample in SAMPLE_DATA_FILES],
            )
            if st.button("μ νν μμ λ°μ΄ν° λ‘λ"):
                selected_file = next(
                    sample["file"]
                    for sample in SAMPLE_DATA_FILES
                    if sample["name"] == sample_choice
                )
                st.session_state.data = load_sample_data(selected_file)
        else:
            st.session_state.data = manual_data_entry()

    if st.session_state.data is not None:
        st.subheader("μ΄ μ΄λ¦ μμ ")
        st.write("μ΄ μ΄λ¦μ νμΈνκ³ νμν κ²½μ° μμ νμΈμ:")

        # One-column frame used as the editing surface for the column names.
        name_column = 'νμ¬ μ΄ μ΄λ¦'
        column_names = pd.DataFrame({name_column: st.session_state.data.columns})
        edited_column_names = st.data_editor(
            column_names,
            num_rows="fixed",
            key="column_name_editor",
            column_config={
                name_column: st.column_config.TextColumn(
                    "μ΄ μ΄λ¦",
                    help="μλ‘μ΄ μ΄ μ΄λ¦μ μ λ ₯νμΈμ",
                    max_chars=50,
                )
            },
        )

        # Apply the edited names as a plain list: assigning the Series itself
        # would also name the columns index after the editor column.
        new_names = edited_column_names[name_column].tolist()
        has_blank = any(pd.isna(name) or not str(name).strip() for name in new_names)
        has_duplicates = len(set(new_names)) != len(new_names)
        if has_blank or has_duplicates:
            # Blank or duplicate labels break later per-column access, so the
            # current names are kept.
            # Message: "Column names must not be empty or duplicated.
            # Keeping the existing column names."
            st.warning("열 이름은 비어 있거나 중복될 수 없습니다. 기존 열 이름을 유지합니다.")
        else:
            st.session_state.data.columns = new_names

        st.subheader("λ°μ΄ν° 미리보기 λ° μμ ")
        st.write("λ°μ΄ν°λ₯Ό νμΈνκ³ νμν κ²½μ° μμ νμΈμ:")
        edited_data = st.data_editor(
            st.session_state.data,
            num_rows="dynamic",
            key="main_data_editor",
        )

        # The button is True only on the run in which it was clicked;
        # analysis_performed keeps the analysis visible on later reruns.
        if st.button("λ°μ΄ν° λΆμ μμ", key="start_analysis") or st.session_state.analysis_performed:
            if not st.session_state.analysis_performed:
                st.session_state.processed_data = preprocess_data(edited_data)
                st.session_state.analysis_performed = True
            perform_analysis()


if __name__ == "__main__":
    main()