File size: 11,698 Bytes
6607e79
 
 
 
 
 
 
 
 
9d0c2d9
6607e79
 
 
 
 
 
 
 
 
 
5d3671b
 
7abab37
 
 
 
6607e79
 
 
 
7abab37
 
 
 
6607e79
 
5d3671b
 
2aae306
 
900c0ad
b84e319
 
 
 
 
 
 
96b9255
 
 
 
 
 
 
 
 
 
 
6607e79
 
 
 
 
 
 
 
 
 
 
cbb0a6e
 
89d8e3e
cbb0a6e
89d8e3e
6607e79
 
 
5d3671b
6607e79
 
 
5d3671b
6607e79
 
5d3671b
6607e79
 
 
 
 
5d89abf
 
 
 
 
 
 
 
 
 
 
 
 
 
6607e79
 
 
 
 
5d3671b
 
6607e79
 
 
5d89abf
 
 
 
6607e79
5d89abf
 
 
 
6607e79
 
 
 
 
5d89abf
6607e79
 
71227fd
2aae306
 
 
6607e79
 
2aae306
7abab37
 
 
5d3671b
2aae306
 
7abab37
 
 
f7f3976
7abab37
71227fd
f7f3976
 
71227fd
6607e79
5d89abf
 
 
 
 
 
 
 
6607e79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38cbba4
 
2aae306
38cbba4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
900c0ad
cc89531
6607e79
5d3671b
92a085a
2aae306
b84e319
2aae306
 
 
 
 
b84e319
 
 
 
 
 
 
 
 
2aae306
 
38cbba4
bf71d2b
cc156a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbb0a6e
cc156a3
 
 
 
 
cbb0a6e
deb6b04
 
 
cc156a3
deb6b04
cbb0a6e
2aae306
 
5d3671b
38cbba4
5d3671b
92a085a
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from io import StringIO
import openpyxl
import matplotlib.font_manager as fm
from scipy import stats
import os

# Korean font setup
def set_font():
    """Register the Pretendard font file with matplotlib and return font settings.

    The font is only registered when the file actually exists. Previously a
    missing file made ``addfont`` raise, and because this function is called
    at module level that aborted the whole app on start-up.

    Returns:
        dict: Always contains ``'axes.unicode_minus': False``. Contains
        ``'font.family': 'Pretendard-Bold'`` only when the font file was
        found and registered.
    """
    font_path = "Pretendard-Bold.ttf"  # replace with the actual font file path
    settings = {}
    if os.path.isfile(font_path):
        fm.fontManager.addfont(font_path)
        settings['font.family'] = 'Pretendard-Bold'
    settings['axes.unicode_minus'] = False
    return settings

# Build the font settings once, when the module is loaded.
font_settings = set_font()

# Session state initialization and management
def manage_session_state():
    """Seed every session-state key the app uses, leaving existing values untouched."""
    # Built fresh on each call, so the list/dict defaults are new objects every time.
    initial_values = {
        'data': None,
        'processed_data': None,
        'numeric_columns': [],
        'categorical_columns': [],
        'x_var': None,
        'y_var': None,
        'slicers': {},
        'analysis_performed': False,
        'filtered_data': None,
    }
    state = st.session_state
    for key, default in initial_values.items():
        if key not in state:
            state[key] = default


# Sample datasets offered in the UI: "name" is the label listed in the sample
# selector, "file" is the file name handed to load_sample_data().
SAMPLE_DATA_FILES = [
    {"name": "κ³Όλͺ©λ³„ λ…Έλ ₯κ³Ό 성취도", "file": "subject.xlsx"},
    {"name": "채점", "file": "score.xlsx"},
    {"name": "μΆœμ„μΌμˆ˜μ™€ 성적", "file": "attendance.xlsx"}
]

def load_sample_data(file_name):
    """Load one of the sample files from the ``sample_data`` directory.

    Args:
        file_name: File name (with extension) relative to ``sample_data``.

    Returns:
        pandas.DataFrame for ``.csv``, ``.xls`` and ``.xlsx`` files, or
        ``None`` (after showing an error in the UI) for any other extension.
    """
    # Path of the sample data file
    file_path = os.path.join("sample_data", file_name)
    # Match the extension case-insensitively (e.g. "DATA.CSV"), consistent
    # with how load_data lower-cases the extension of uploaded files.
    lowered = file_name.lower()
    if lowered.endswith('.csv'):
        return pd.read_csv(file_path)
    if lowered.endswith(('.xls', '.xlsx')):
        return pd.read_excel(file_path)
    st.error("μ§€μ›λ˜μ§€ μ•ŠλŠ” 파일 ν˜•μ‹μž…λ‹ˆλ‹€.")
    return None
        
# Load data
@st.cache_data
def load_data(file):
    """Read an uploaded CSV/XLS/XLSX file into a DataFrame (cached by Streamlit).

    Args:
        file: Uploaded file object; its ``name`` attribute supplies the extension.

    Returns:
        The loaded DataFrame, or ``None`` (after showing an error in the UI)
        when the extension is not csv, xls or xlsx.
    """
    suffix = file.name.rsplit('.', 1)[-1].lower()
    if suffix == 'csv':
        frame = pd.read_csv(file)
    elif suffix in ('xls', 'xlsx'):
        frame = pd.read_excel(file)
    else:
        st.error("μ§€μ›λ˜μ§€ μ•ŠλŠ” 파일 ν˜•μ‹μž…λ‹ˆλ‹€. CSV, XLS, λ˜λŠ” XLSX νŒŒμΌμ„ μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”.")
        return None

    # Give null column labels a positional default name (Column_1, Column_2, ...).
    if frame.columns.isnull().any():
        relabelled = []
        for position, label in enumerate(frame.columns, start=1):
            relabelled.append(f'Column_{position}' if pd.isnull(label) else label)
        frame.columns = relabelled

    return frame

def manual_data_entry():
    """Let the user type column names and fill in an editable table.

    Returns:
        The DataFrame returned by the data editor, or ``None`` while no
        non-blank column name has been entered.
    """
    raw_names = st.text_input("μ—΄ 이름을 μ‰Όν‘œλ‘œ κ΅¬λΆ„ν•˜μ—¬ μž…λ ₯ν•˜μ„Έμš”:", key="manual_col_names")

    # Keep only the non-blank, whitespace-trimmed names.
    col_names = []
    for piece in raw_names.split(','):
        trimmed = piece.strip()
        if trimmed:
            col_names.append(trimmed)

    if not col_names:
        return None

    num_rows = st.number_input("초기 ν–‰μ˜ 수λ₯Ό μž…λ ₯ν•˜μ„Έμš”:", min_value=1, value=5, key="manual_num_rows")
    blank_table = pd.DataFrame(columns=col_names, index=range(num_rows))
    return st.data_editor(blank_table, num_rows="dynamic", key="manual_data_editor")

def preprocess_data(data):
    """Convert column types, handle missing values, and record column kinds.

    Steps:
      1. Object columns are converted to numeric when at least one value
         parses as a number (unparseable values become NaN).
      2. For every column that still has missing values, a select box offers
         four choices: drop the rows, or fill with the mean, median or mode.
      3. The float64/int64 column names are stored in
         ``st.session_state.numeric_columns`` and all other column names in
         ``st.session_state.categorical_columns``.

    Args:
        data: Input DataFrame. It is copied first, so the caller's frame is
            not modified.

    Returns:
        The processed DataFrame.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    data = data.copy()

    # Data type inference and conversion
    for column in data.columns:
        if data[column].dtype == 'object':
            try:
                # Try converting to numbers; values that do not parse become NaN
                numeric_converted = pd.to_numeric(data[column], errors='coerce')
            except (TypeError, ValueError):
                # Conversion is not possible for this column; leave it unchanged
                st.write(f"'{column}' 열은 λ²”μ£Όν˜•μœΌλ‘œ μœ μ§€λ©λ‹ˆλ‹€.")
                continue
            # Use the converted column unless every value turned into NaN
            if not numeric_converted.isna().all():
                data[column] = numeric_converted
                st.write(f"'{column}' 열을 μˆ«μžν˜•μœΌλ‘œ λ³€ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.")

    # Missing value handling
    if data.isnull().sum().sum() > 0:
        st.write("결츑치 처리:")
        for column in data.columns:
            if data[column].isnull().sum() > 0:
                method = st.selectbox(f"{column} μ—΄μ˜ 처리 방법 선택:", 
                                      ["제거", "ν‰κ· μœΌλ‘œ λŒ€μ²΄", "μ€‘μ•™κ°’μœΌλ‘œ λŒ€μ²΄", "μ΅œλΉˆκ°’μœΌλ‘œ λŒ€μ²΄"],
                                      key=f"missing_{column}")
                # Fills assign the result back to the column rather than calling
                # fillna(inplace=True) on a column selection, which is deprecated
                # and has no effect under pandas Copy-on-Write.
                if method == "제거":
                    # Drop the rows where this column is missing
                    data = data.dropna(subset=[column])
                elif method == "ν‰κ· μœΌλ‘œ λŒ€μ²΄":
                    if pd.api.types.is_numeric_dtype(data[column]):
                        data[column] = data[column].fillna(data[column].mean())
                    else:
                        st.warning(f"{column} 열은 μˆ«μžν˜•μ΄ μ•„λ‹ˆμ–΄μ„œ ν‰κ· κ°’μœΌλ‘œ λŒ€μ²΄ν•  수 μ—†μŠ΅λ‹ˆλ‹€.")
                elif method == "μ€‘μ•™κ°’μœΌλ‘œ λŒ€μ²΄":
                    if pd.api.types.is_numeric_dtype(data[column]):
                        data[column] = data[column].fillna(data[column].median())
                    else:
                        st.warning(f"{column} 열은 μˆ«μžν˜•μ΄ μ•„λ‹ˆμ–΄μ„œ μ€‘μ•™κ°’μœΌλ‘œ λŒ€μ²΄ν•  수 μ—†μŠ΅λ‹ˆλ‹€.")
                elif method == "μ΅œλΉˆκ°’μœΌλ‘œ λŒ€μ²΄":
                    modes = data[column].mode()
                    if modes.empty:
                        # A column with no observed values has no mode to fill with
                        st.warning(f"{column}: no non-missing values, so there is no mode to fill with.")
                    else:
                        data[column] = data[column].fillna(modes.iloc[0])

    # Split columns into numeric and categorical
    st.session_state.numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
    st.session_state.categorical_columns = data.select_dtypes(exclude=['float64', 'int64']).columns.tolist()

    return data

def update_filtered_data():
    """Re-apply the slicers to the processed data and store the result in session state."""
    state = st.session_state
    state.filtered_data = apply_slicers(state.processed_data)

def create_slicers(data):
    """Render a multiselect slicer for each categorical column with at most 10 distinct values.

    Each widget's selection is stored in ``st.session_state.slicers[col]``, and
    ``update_filtered_data`` is registered as the widget's ``on_change`` callback.

    Args:
        data: DataFrame whose categorical columns (as listed in
            ``st.session_state.categorical_columns``) provide the options.
    """
    for col in st.session_state.categorical_columns:
        if data[col].nunique() > 10:
            continue

        # Compute the distinct values once and reuse them for options and default.
        distinct = list(data[col].unique())
        try:
            options = sorted(distinct)
        except TypeError:
            # Values that cannot be compared with each other (e.g. strings next
            # to NaN) would make sorted() raise; order them by their text form.
            options = sorted(distinct, key=str)

        st.session_state.slicers[col] = st.multiselect(
            f"{col} 선택",
            options=options,
            default=list(options),
            key=f"slicer_{col}",
            on_change=update_filtered_data
        )

def apply_slicers(data):
    """Return a copy of ``data`` filtered by the current slicer selections.

    A slicer with an empty selection does not filter its column.

    Args:
        data: DataFrame to filter.

    Returns:
        A filtered copy of ``data``.
    """
    filtered_data = data.copy()
    for col, selected_values in st.session_state.slicers.items():
        # This runs from the widgets' on_change callback, i.e. before the script
        # re-executes and refreshes st.session_state.slicers. The widget's own
        # keyed entry already holds the new selection at that point, so prefer
        # it; otherwise the filter lags one interaction behind.
        selected_values = st.session_state.get(f"slicer_{col}", selected_values)
        if selected_values:
            filtered_data = filtered_data[filtered_data[col].isin(selected_values)]
    return filtered_data

def plot_correlation_heatmap(data):
    """Draw a correlation heatmap of the numeric columns, or warn when there is nothing to plot.

    Args:
        data: DataFrame containing the columns named in
            ``st.session_state.numeric_columns``.
    """
    numeric_data = data[st.session_state.numeric_columns]
    if numeric_data.empty:
        st.warning("상관관계 νžˆνŠΈλ§΅μ„ 그릴 수 μžˆλŠ” μˆ«μžν˜• 열이 μ—†μŠ΅λ‹ˆλ‹€.")
        return

    # Fixed colour range so the scale always spans the full -1..1 interval.
    heatmap = px.imshow(
        numeric_data.corr(),
        color_continuous_scale='RdBu_r',
        zmin=-1,
        zmax=1,
    )
    heatmap.update_layout(title='상관관계 히트맡')
    st.plotly_chart(heatmap)

def plot_scatter_with_regression(data, x_var, y_var):
    """Plot ``y_var`` against ``x_var`` with a least-squares regression line and fit statistics.

    Points are coloured by the '반' column when the frame has one. The
    regression is fitted on rows where both variables are present. When fewer
    than two such rows exist, or all x values are identical, no line can be
    fitted, so a warning is shown instead of raising.

    Args:
        data: DataFrame containing both columns.
        x_var: Name of the column used for the x axis.
        y_var: Name of the column used for the y axis.
    """
    # Missing values would turn every regression statistic into NaN, so fit
    # on complete pairs only.
    paired = data[[x_var, y_var]].dropna()
    if len(paired) < 2 or paired[x_var].nunique() < 2:
        st.warning(
            f"Not enough data to fit a regression of {y_var} on {x_var}: "
            "at least two rows with distinct x values are required."
        )
        return

    fig = px.scatter(data, x=x_var, y=y_var, color='반' if '반' in data.columns else None)

    # Add the regression line
    x = paired[x_var]
    y = paired[y_var]
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    # Two end points are enough to draw a straight line across the x range.
    line_x = np.array([x.min(), x.max()])
    line_y = slope * line_x + intercept
    fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='νšŒκ·€μ„ '))

    r_squared = r_value ** 2
    fig.update_layout(
        title=f'{x_var}와 {y_var}의 관계 (R-squared: {r_squared:.4f})',
        xaxis_title=x_var,
        yaxis_title=y_var,
        annotations=[
            dict(
                x=0.5,
                y=1.05,
                xref='paper',
                yref='paper',
                text=f'R-squared: {r_squared:.4f}',
                showarrow=False,
            )
        ]
    )
    st.plotly_chart(fig)

    # Additional statistics
    st.write(f"μƒκ΄€κ³„μˆ˜: {r_value:.4f}")
    st.write(f"p-value: {p_value:.4f}")
    st.write(f"ν‘œμ€€ 였차: {std_err:.4f}")

def perform_analysis():
    """Render the analysis page: slicers, summary statistics, heatmap and a two-variable plot."""
    state = st.session_state

    # First run: start from an unfiltered copy of the processed data.
    if state.filtered_data is None:
        state.filtered_data = state.processed_data.copy()

    st.header("탐색적 데이터 뢄석")

    # Slicers are built from the full processed data.
    create_slicers(state.processed_data)

    # Summary statistics
    st.write("μš”μ•½ 톡계:")
    st.write(state.filtered_data.describe())

    # Correlation heatmap
    st.subheader("상관관계 히트맡")
    plot_correlation_heatmap(state.filtered_data)

    # Scatter plot and regression for the two variables chosen by the user
    st.subheader("두 λ³€μˆ˜ κ°„μ˜ 관계 뢄석")
    numeric_cols = state.numeric_columns
    x_var = st.selectbox("XμΆ• λ³€μˆ˜ 선택", options=numeric_cols, key='x_var')
    # The y choices exclude whichever column was picked for x.
    y_choices = [col for col in numeric_cols if col != x_var]
    y_var = st.selectbox("YμΆ• λ³€μˆ˜ 선택", options=y_choices, key='y_var')

    if not (x_var and y_var):
        return
    plot_scatter_with_regression(state.filtered_data, x_var, y_var)

def main():
    """Streamlit entry point: acquire data, let the user edit it, then run the analysis.

    While ``st.session_state.data`` is None the user picks one of three input
    methods (file upload, sample data, manual entry). Once data is present the
    column names and cell values can be edited, and pressing the start button
    preprocesses the edited data once and then renders the analysis on every
    rerun.
    """
    st.title("μΈν„°λž™ν‹°λΈŒ EDA νˆ΄ν‚·")
    
    manage_session_state()

    # Data acquisition is only offered until some data has been stored.
    if st.session_state.data is None:
        data_input_method = st.radio("데이터 μž…λ ₯ 방법 선택:", ("파일 μ—…λ‘œλ“œ", "μ˜ˆμ‹œ 데이터 μ‚¬μš©", "μˆ˜λ™ μž…λ ₯"), key="data_input_method")
        
        if data_input_method == "파일 μ—…λ‘œλ“œ":
            uploaded_file = st.file_uploader("CSV, XLS, λ˜λŠ” XLSX νŒŒμΌμ„ μ„ νƒν•˜μ„Έμš”", type=["csv", "xls", "xlsx"], key="file_uploader")
            if uploaded_file is not None:
                st.session_state.data = load_data(uploaded_file)
        elif data_input_method == "μ˜ˆμ‹œ 데이터 μ‚¬μš©":
            sample_choice = st.selectbox(
                "μ˜ˆμ‹œ 데이터 선택",
                options=[sample["name"] for sample in SAMPLE_DATA_FILES],
                format_func=lambda x: x
            )
            if st.button("μ„ νƒν•œ μ˜ˆμ‹œ 데이터 λ‘œλ“œ"):
                # Map the chosen display name back to its file name.
                selected_file = next(sample["file"] for sample in SAMPLE_DATA_FILES if sample["name"] == sample_choice)
                st.session_state.data = load_sample_data(selected_file)
        else:
            st.session_state.data = manual_data_entry()

    if st.session_state.data is not None:
        st.subheader("μ—΄ 이름 μˆ˜μ •")
        st.write("μ—΄ 이름을 ν™•μΈν•˜κ³  ν•„μš”ν•œ 경우 μˆ˜μ •ν•˜μ„Έμš”:")
        
        # Build a one-column DataFrame used to edit the column names
        column_names = pd.DataFrame({'ν˜„μž¬ μ—΄ 이름': st.session_state.data.columns})
        edited_column_names = st.data_editor(
            column_names,
            num_rows="fixed",
            key="column_name_editor",
            column_config={
                "ν˜„μž¬ μ—΄ 이름": st.column_config.TextColumn(
                    "μ—΄ 이름",
                    help="μƒˆλ‘œμš΄ μ—΄ 이름을 μž…λ ₯ν•˜μ„Έμš”",
                    max_chars=50
                )
            }
        )
        
        # Apply the edited column names
        st.session_state.data.columns = edited_column_names['ν˜„μž¬ μ—΄ 이름']

        st.subheader("데이터 미리보기 및 μˆ˜μ •")
        st.write("데이터λ₯Ό ν™•μΈν•˜κ³  ν•„μš”ν•œ 경우 μˆ˜μ •ν•˜μ„Έμš”:")
        
        edited_data = st.data_editor(
            st.session_state.data,
            num_rows="dynamic",
            key="main_data_editor"  # the key was changed here
        )
        
        # Keep showing the analysis on later reruns once it has been started;
        # preprocessing itself only runs the first time.
        if st.button("데이터 뢄석 μ‹œμž‘", key="start_analysis") or st.session_state.analysis_performed:
            if not st.session_state.analysis_performed:
                st.session_state.processed_data = preprocess_data(edited_data)
                st.session_state.analysis_performed = True
            perform_analysis()

# Run the app only when this file is executed as the main script.
if __name__ == "__main__":
    main()