Spaces:
Runtime error
Runtime error
File size: 11,698 Bytes
6607e79 9d0c2d9 6607e79 5d3671b 7abab37 6607e79 7abab37 6607e79 5d3671b 2aae306 900c0ad b84e319 96b9255 6607e79 cbb0a6e 89d8e3e cbb0a6e 89d8e3e 6607e79 5d3671b 6607e79 5d3671b 6607e79 5d3671b 6607e79 5d89abf 6607e79 5d3671b 6607e79 5d89abf 6607e79 5d89abf 6607e79 5d89abf 6607e79 71227fd 2aae306 6607e79 2aae306 7abab37 5d3671b 2aae306 7abab37 f7f3976 7abab37 71227fd f7f3976 71227fd 6607e79 5d89abf 6607e79 38cbba4 2aae306 38cbba4 900c0ad cc89531 6607e79 5d3671b 92a085a 2aae306 b84e319 2aae306 b84e319 2aae306 38cbba4 bf71d2b cc156a3 cbb0a6e cc156a3 cbb0a6e deb6b04 cc156a3 deb6b04 cbb0a6e 2aae306 5d3671b 38cbba4 5d3671b 92a085a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 |
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from io import StringIO
import openpyxl
import matplotlib.font_manager as fm
from scipy import stats
import os
# Korean font setup
def set_font(font_path="Pretendard-Bold.ttf"):
    """Register a font file with matplotlib and return matching rc settings.

    Args:
        font_path: Path to the TTF file to register. Defaults to
            ``Pretendard-Bold.ttf`` in the working directory; change it to
            wherever the font file actually lives.

    Returns:
        A dict of rcParams-style settings. ``axes.unicode_minus`` is always
        ``False``; ``font.family`` is present only when the font file was
        found and registered.
    """
    # This function is called at import time, so a missing font file must
    # not crash the whole app: skip registration instead of raising.
    if not os.path.isfile(font_path):
        return {'axes.unicode_minus': False}

    fm.fontManager.addfont(font_path)
    return {'font.family': 'Pretendard-Bold', 'axes.unicode_minus': False}


# Fetch the font settings once at import
font_settings = set_font()
# Session state initialisation and management
def manage_session_state():
    """Seed every session-state key the app uses, leaving existing values alone."""
    # Built inside the function so the list/dict defaults are fresh objects
    # on every call rather than shared between sessions.
    initial_values = {
        'data': None,
        'processed_data': None,
        'numeric_columns': [],
        'categorical_columns': [],
        'x_var': None,
        'y_var': None,
        'slicers': {},
        'analysis_performed': False,
        'filtered_data': None,
    }
    for key, value in initial_values.items():
        if key not in st.session_state:
            st.session_state[key] = value
# Bundled sample datasets. "name" is the label offered in the sample-data
# picker in main(); "file" is the file name handed to load_sample_data().
SAMPLE_DATA_FILES = [
{"name": "κ³Όλͺ©λ³ λ
Έλ ₯κ³Ό μ±μ·¨λ", "file": "subject.xlsx"},
{"name": "μ±μ ", "file": "score.xlsx"},
{"name": "μΆμμΌμμ μ±μ ", "file": "attendance.xlsx"}
]
# Load one of the bundled sample datasets.
# file_name: name of a file inside the "sample_data" directory.
# Returns a DataFrame for .csv / .xls / .xlsx files. For any other extension
# an error is shown with st.error and None is returned.
def load_sample_data(file_name):
# Path of the sample data file
file_path = os.path.join("sample_data", file_name)
# Dispatch on the file-name suffix (case-sensitive match)
if file_name.endswith('.csv'):
return pd.read_csv(file_path)
elif file_name.endswith(('.xls', '.xlsx')):
return pd.read_excel(file_path)
else:
st.error("μ§μλμ§ μλ νμΌ νμμ
λλ€.")
return None
# Data loading
# Read an uploaded CSV / XLS / XLSX file into a DataFrame. The function is
# wrapped in st.cache_data.
# file: uploaded file object; its .name attribute supplies the extension.
# Returns the DataFrame, or None (after showing st.error) for any other
# extension.
@st.cache_data
def load_data(file):
# Extension is the text after the last dot, compared case-insensitively
file_extension = file.name.split('.')[-1].lower()
if file_extension == 'csv':
data = pd.read_csv(file)
elif file_extension in ['xls', 'xlsx']:
data = pd.read_excel(file)
else:
st.error("μ§μλμ§ μλ νμΌ νμμ
λλ€. CSV, XLS, λλ XLSX νμΌμ μ
λ‘λν΄μ£ΌμΈμ.")
return None
# Give empty column names a default value (Column_<1-based position>)
if data.columns.isnull().any():
data.columns = [f'Column_{i+1}' if pd.isnull(col) else col for i, col in enumerate(data.columns)]
return data
# Let the user build a table by hand.
# Reads a comma-separated list of column names from a text input. Once at
# least one non-blank name is given, it asks for an initial row count and
# shows an editable grid with those columns.
# Returns the edited DataFrame, or None while no column names are entered.
def manual_data_entry():
col_names = st.text_input("μ΄ μ΄λ¦μ μΌνλ‘ κ΅¬λΆνμ¬ μ
λ ₯νμΈμ:", key="manual_col_names").split(',')
# Trim surrounding whitespace and drop blank entries
col_names = [name.strip() for name in col_names if name.strip()]
if col_names:
num_rows = st.number_input("μ΄κΈ° νμ μλ₯Ό μ
λ ₯νμΈμ:", min_value=1, value=5, key="manual_num_rows")
# Frame with the requested columns and row count and no values supplied
data = pd.DataFrame(columns=col_names, index=range(num_rows))
edited_data = st.data_editor(data, num_rows="dynamic", key="manual_data_editor")
return edited_data
return None
def preprocess_data(data):
    """Infer column types, resolve missing values, and record the column split.

    Args:
        data: DataFrame to clean. It is copied first, so the caller's frame
            is left untouched.

    Returns:
        The cleaned DataFrame. As a side effect,
        ``st.session_state.numeric_columns`` is overwritten with the
        float64/int64 column names and
        ``st.session_state.categorical_columns`` with all remaining ones.
    """
    data = data.copy()

    # Data type inference and conversion
    for column in data.columns:
        if data[column].dtype != 'object':
            continue
        try:
            # Attempt a numeric conversion; unparseable values become NaN
            numeric_converted = pd.to_numeric(data[column], errors='coerce')
        except (ValueError, TypeError):
            st.write(f"'{column}' μ΄μ λ²μ£ΌνμΌλ‘ μ μ§λ©λλ€.")
            continue
        # Use the converted column unless every value turned into NaN.
        # NOTE(review): a single parseable value is enough to convert the
        # column, turning all other entries into NaN - confirm this is intended.
        if not numeric_converted.isna().all():
            data[column] = numeric_converted
            st.write(f"'{column}' μ΄μ μ«μνμΌλ‘ λ³ννμ΅λλ€.")

    # Missing-value handling
    if data.isnull().any().any():
        st.write("κ²°μΈ‘μΉ μ²λ¦¬:")
        strategies = ["μ κ±°", "νκ· μΌλ‘ λ체", "μ€μκ°μΌλ‘ λ체", "μ΅λΉκ°μΌλ‘ λ체"]
        drop_rows, use_mean, use_median, use_mode = strategies
        for column in data.columns:
            # Re-checked per column: dropping rows for an earlier column may
            # already have removed this column's missing values.
            if not data[column].isnull().any():
                continue
            method = st.selectbox(f"{column} μ΄μ μ²λ¦¬ λ°©λ² μ ν:",
                                  strategies,
                                  key=f"missing_{column}")
            if method == drop_rows:
                data = data.dropna(subset=[column])
            elif method == use_mean:
                if pd.api.types.is_numeric_dtype(data[column]):
                    # Assign the result back instead of fillna(inplace=True)
                    # on a selected column, which pandas does not guarantee
                    # will write through to the frame.
                    data[column] = data[column].fillna(data[column].mean())
                else:
                    st.warning(f"{column} μ΄μ μ«μνμ΄ μλμ΄μ νκ· κ°μΌλ‘ λ체ν μ μμ΅λλ€.")
            elif method == use_median:
                if pd.api.types.is_numeric_dtype(data[column]):
                    data[column] = data[column].fillna(data[column].median())
                else:
                    st.warning(f"{column} μ΄μ μ«μνμ΄ μλμ΄μ μ€μκ°μΌλ‘ λ체ν μ μμ΅λλ€.")
            elif method == use_mode:
                modes = data[column].mode()
                # mode() is empty when the column holds only missing values;
                # there is nothing to fill with in that case.
                if not modes.empty:
                    data[column] = data[column].fillna(modes.iloc[0])

    # Split numeric and categorical columns
    st.session_state.numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
    st.session_state.categorical_columns = data.select_dtypes(exclude=['float64', 'int64']).columns.tolist()
    return data
def update_filtered_data():
    """Widget callback: recompute ``filtered_data`` from the current slicer picks.

    Streamlit runs ``on_change`` callbacks before the script body executes
    again, so ``st.session_state.slicers`` still holds the selections from
    the previous run at this point. The fresh selections are available under
    each widget's own key (``slicer_<column>``), so they are copied into
    ``slicers`` before filtering.
    """
    state = st.session_state
    for col in list(state.slicers):
        widget_key = f"slicer_{col}"
        if widget_key in state:
            state.slicers[col] = state[widget_key]
    state.filtered_data = apply_slicers(state.processed_data)
def create_slicers(data):
    """Render a multiselect filter for each low-cardinality categorical column.

    For every name in ``st.session_state.categorical_columns`` whose column in
    ``data`` has at most 10 distinct values, a multiselect is shown with all
    values preselected. The widget's current selection is stored in
    ``st.session_state.slicers`` under the column name.
    """
    for col in st.session_state.categorical_columns:
        if data[col].nunique() > 10:
            continue
        choices = sorted(data[col].unique())
        selection = st.multiselect(
            f"{col} μ ν",
            options=choices,
            default=list(choices),
            key=f"slicer_{col}",
            on_change=update_filtered_data,
        )
        st.session_state.slicers[col] = selection
def apply_slicers(data):
    """Return a copy of ``data`` restricted to the values picked in the slicers.

    Slicers with an empty selection are ignored, so they do not filter anything.
    """
    result = data.copy()
    active = {
        column: allowed
        for column, allowed in st.session_state.slicers.items()
        if allowed
    }
    for column, allowed in active.items():
        keep = result[column].isin(allowed)
        result = result[keep]
    return result
def plot_correlation_heatmap(data):
    """Draw a correlation heatmap of the numeric columns, or warn if there are none."""
    numeric_only = data[st.session_state.numeric_columns]
    if numeric_only.empty:
        st.warning("μκ΄κ΄κ³ ννΈλ§΅μ 그릴 μ μλ μ«μν μ΄μ΄ μμ΅λλ€.")
        return

    # Fixed colour range so the scale always spans the full -1..1 interval
    heatmap = px.imshow(
        numeric_only.corr(),
        color_continuous_scale='RdBu_r',
        zmin=-1,
        zmax=1,
    )
    heatmap.update_layout(title='μκ΄κ΄κ³ ννΈλ§΅')
    st.plotly_chart(heatmap)
def plot_scatter_with_regression(data, x_var, y_var):
    """Show a scatter plot of two columns with a fitted regression line.

    Args:
        data: DataFrame holding both columns.
        x_var: Name of the column plotted on the x axis.
        y_var: Name of the column plotted on the y axis.

    The scatter plot is always drawn. The regression line, the R-squared
    title/annotation and the extra statistics are added only when a fit is
    possible; otherwise a warning is shown instead.
    """
    fig = px.scatter(data, x=x_var, y=y_var, color='λ°' if 'λ°' in data.columns else None)

    # Fit on complete rows only: linregress raises on empty input or when all
    # x values are identical (both reachable after filtering), and missing
    # values would turn every statistic into NaN.
    points = data[[x_var, y_var]].dropna()
    x = points[x_var]
    y = points[y_var]
    if len(points) < 2 or x.nunique() < 2:
        st.plotly_chart(fig)
        st.warning("Not enough distinct data points to fit a regression line.")
        return

    # Add the regression line
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    line_x = np.array([x.min(), x.max()])
    line_y = slope * line_x + intercept
    fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='νκ·μ '))

    r_squared = r_value ** 2
    fig.update_layout(
        title=f'{x_var}μ {y_var}μ κ΄κ³ (R-squared: {r_squared:.4f})',
        xaxis_title=x_var,
        yaxis_title=y_var,
        annotations=[
            dict(
                x=0.5,
                y=1.05,
                xref='paper',
                yref='paper',
                text=f'R-squared: {r_squared:.4f}',
                showarrow=False,
            )
        ]
    )
    st.plotly_chart(fig)

    # Additional statistics
    st.write(f"μκ΄κ³μ: {r_value:.4f}")
    st.write(f"p-value: {p_value:.4f}")
    st.write(f"νμ€ μ€μ°¨: {std_err:.4f}")
def perform_analysis():
    """Render the analysis page: slicers, summary stats, heatmap and scatter plot."""
    state = st.session_state

    # Before any slicer callback has run, analyse the full processed data
    if state.filtered_data is None:
        state.filtered_data = state.processed_data.copy()

    st.header("νμμ λ°μ΄ν° λΆμ")

    # Slicers are built from the unfiltered data so every value stays selectable
    create_slicers(state.processed_data)

    # Summary statistics
    st.write("μμ½ ν΅κ³:")
    st.write(state.filtered_data.describe())

    # Correlation heatmap
    st.subheader("μκ΄κ΄κ³ ννΈλ§΅")
    plot_correlation_heatmap(state.filtered_data)

    # Scatter plot and regression for two user-chosen variables
    st.subheader("λ λ³μ κ°μ κ΄κ³ λΆμ")
    x_var = st.selectbox("XμΆ λ³μ μ ν", options=state.numeric_columns, key='x_var')
    # The y choices exclude whatever was picked for x
    y_candidates = [col for col in state.numeric_columns if col != x_var]
    y_var = st.selectbox("YμΆ λ³μ μ ν", options=y_candidates, key='y_var')

    if not (x_var and y_var):
        return
    plot_scatter_with_regression(state.filtered_data, x_var, y_var)
# App entry point: collect a dataset (upload, bundled sample, or manual
# entry), let the user rename columns and edit values, then run the analysis.
def main():
st.title("μΈν°λν°λΈ EDA ν΄ν·")
# Make sure every session-state key exists before it is read below
manage_session_state()
# Data source selection is only offered while no dataset is loaded
if st.session_state.data is None:
data_input_method = st.radio("λ°μ΄ν° μ
λ ₯ λ°©λ² μ ν:", ("νμΌ μ
λ‘λ", "μμ λ°μ΄ν° μ¬μ©", "μλ μ
λ ₯"), key="data_input_method")
if data_input_method == "νμΌ μ
λ‘λ":
# First option: file upload
uploaded_file = st.file_uploader("CSV, XLS, λλ XLSX νμΌμ μ ννμΈμ", type=["csv", "xls", "xlsx"], key="file_uploader")
if uploaded_file is not None:
st.session_state.data = load_data(uploaded_file)
elif data_input_method == "μμ λ°μ΄ν° μ¬μ©":
# Second option: pick one of the entries in SAMPLE_DATA_FILES
sample_choice = st.selectbox(
"μμ λ°μ΄ν° μ ν",
options=[sample["name"] for sample in SAMPLE_DATA_FILES],
format_func=lambda x: x
)
if st.button("μ νν μμ λ°μ΄ν° λ‘λ"):
# Map the chosen display name back to its file name
selected_file = next(sample["file"] for sample in SAMPLE_DATA_FILES if sample["name"] == sample_choice)
st.session_state.data = load_sample_data(selected_file)
else:
# Remaining option: manual entry (returns None until columns are named)
st.session_state.data = manual_data_entry()
if st.session_state.data is not None:
st.subheader("μ΄ μ΄λ¦ μμ ")
st.write("μ΄ μ΄λ¦μ νμΈνκ³ νμν κ²½μ° μμ νμΈμ:")
# Build a DataFrame for editing the column names
column_names = pd.DataFrame({'νμ¬ μ΄ μ΄λ¦': st.session_state.data.columns})
edited_column_names = st.data_editor(
column_names,
num_rows="fixed",
key="column_name_editor",
column_config={
"νμ¬ μ΄ μ΄λ¦": st.column_config.TextColumn(
"μ΄ μ΄λ¦",
help="μλ‘μ΄ μ΄ μ΄λ¦μ μ
λ ₯νμΈμ",
max_chars=50
)
}
)
# Apply the edited column names
st.session_state.data.columns = edited_column_names['νμ¬ μ΄ μ΄λ¦']
st.subheader("λ°μ΄ν° 미리보기 λ° μμ ")
st.write("λ°μ΄ν°λ₯Ό νμΈνκ³ νμν κ²½μ° μμ νμΈμ:")
edited_data = st.data_editor(
st.session_state.data,
num_rows="dynamic",
key="main_data_editor" # the key was changed here
)
# Once analysis has been started, keep rendering it on every rerun
if st.button("λ°μ΄ν° λΆμ μμ", key="start_analysis") or st.session_state.analysis_performed:
# Preprocess only the first time; later reruns reuse processed_data
if not st.session_state.analysis_performed:
st.session_state.processed_data = preprocess_data(edited_data)
st.session_state.analysis_performed = True
perform_analysis()
# Script entry point
if __name__ == "__main__":
    main()