File size: 5,422 Bytes
7abab37
 
 
 
 
 
 
 
 
 
 
 
 
900c0ad
71227fd
 
 
 
 
7abab37
 
 
 
 
 
 
 
 
 
71227fd
 
 
 
900c0ad
cc89531
f076a08
7abab37
 
 
71227fd
cc89531
 
71227fd
f076a08
cc89531
 
71227fd
f076a08
cc89531
 
 
f076a08
cc89531
f076a08
eacbd49
 
 
7abab37
 
 
 
eacbd49
7abab37
 
23711c4
eacbd49
7abab37
 
23711c4
 
 
 
 
 
 
7abab37
 
 
23711c4
 
 
 
 
 
 
 
 
 
 
cc89531
92a085a
eacbd49
 
 
 
92a085a
7abab37
 
900c0ad
cc89531
92a085a
7abab37
 
 
 
 
 
 
 
 
f076a08
7abab37
f076a08
7abab37
1a09a89
 
7abab37
1a09a89
7abab37
 
 
 
92a085a
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def initialize_session_state():
    """Create the session-state entries this app relies on, if missing.

    Entries that are already present in ``st.session_state`` are left
    untouched; only absent keys receive their initial value.
    """
    initial_values = {
        'data': None,
        'processed_data': None,
        'slicers': {},
        'x_var': None,
        'y_var': None,
        'analysis_performed': False,
    }
    for key, value in initial_values.items():
        if key not in st.session_state:
            st.session_state[key] = value

def create_slicers(data):
    """Render a multiselect filter for each low-cardinality categorical column.

    For every ``object``/``category`` column of ``data`` with at most 10
    distinct non-missing values, a multiselect widget is shown and the
    user's selection is stored in ``st.session_state.slicers[col]``. On the
    first render of a column every option is selected.

    Args:
        data: DataFrame whose categorical columns should get slicers.

    Returns:
        None. The selections are written to ``st.session_state.slicers``.
    """
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns

    for col in categorical_columns:
        # Only create a slicer when the column has 10 or fewer unique values.
        if data[col].nunique() > 10:
            continue

        # nunique() ignores missing values but unique() keeps them; a NaN
        # mixed in with strings makes sorted() raise TypeError, so drop
        # missing values before building the option list.
        # NOTE(review): rows whose value is missing are therefore not
        # selectable and are excluded while a selection is active.
        options = sorted(data[col].dropna().unique())

        previous = st.session_state.slicers.get(col)
        if previous is None:
            default = options
        else:
            # st.multiselect rejects defaults that are not among the
            # options, so discard any remembered value that is no longer
            # present in the data.
            default = [value for value in previous if value in options]

        st.session_state.slicers[col] = st.multiselect(
            f"{col} 선택",
            options=options,
            default=default,
        )

def apply_slicers(data):
    """Filter ``data`` by the selections stored in ``st.session_state.slicers``.

    Each column with a non-empty selection keeps only the rows whose value
    is one of the selected values; columns with an empty selection impose
    no restriction.

    Args:
        data: DataFrame to filter.

    Returns:
        The filtered DataFrame (``data`` itself when no filter applies).
    """
    filtered = data
    for column, chosen in st.session_state.slicers.items():
        if not chosen:
            # An empty selection means "do not filter on this column".
            continue
        keep_rows = filtered[column].isin(chosen)
        filtered = filtered[keep_rows]
    return filtered

def perform_analysis(data):
    """Render the exploratory analysis page for ``data``.

    The page shows, in order: the categorical slicers, summary statistics
    of the filtered data, a correlation heatmap of its numeric columns, and
    a scatter plot of two user-chosen numeric columns with a least-squares
    regression line and its statistics.

    The regression is fitted only on rows where both chosen columns are
    present, and only when at least two such rows with differing X values
    exist; otherwise the plain scatter plot is shown together with a
    warning instead of raising from ``stats.linregress``.

    Args:
        data: DataFrame to analyse.

    Returns:
        None. Sets ``st.session_state.analysis_performed`` to ``True`` and
        stores the chosen columns in ``st.session_state.x_var``/``y_var``.
    """
    st.header("탐색적 데이터 뢄석")

    # Create the slicers and apply the current selections.
    create_slicers(data)
    filtered_data = apply_slicers(data)

    # Summary statistics.
    st.write("μš”μ•½ 톡계:")
    st.write(filtered_data.describe())

    # Correlation heatmap.
    st.write("상관관계 히트맡:")
    numeric_data = filtered_data.select_dtypes(include=['float64', 'int64'])
    if not numeric_data.empty:
        fig = px.imshow(numeric_data.corr(), color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
        fig.update_layout(title='상관관계 히트맡')
        st.plotly_chart(fig)
    else:
        st.write("상관관계 νžˆνŠΈλ§΅μ„ 그릴 수 μžˆλŠ” μˆ«μžν˜• 열이 μ—†μŠ΅λ‹ˆλ‹€.")

    # Scatter plot and regression analysis for two user-selected variables.
    st.subheader("두 λ³€μˆ˜ κ°„μ˜ 관계 뢄석")
    numeric_columns = filtered_data.select_dtypes(include=['float64', 'int64']).columns

    # Pre-select the previously chosen column when it is still available.
    if st.session_state.x_var in numeric_columns:
        x_index = numeric_columns.get_loc(st.session_state.x_var)
    else:
        x_index = 0
    st.session_state.x_var = st.selectbox(
        "XμΆ• λ³€μˆ˜ 선택",
        options=numeric_columns,
        key='x_var_select',
        index=x_index,
    )

    # The Y choices exclude whatever was picked for X.
    y_options = [col for col in numeric_columns if col != st.session_state.x_var]
    if st.session_state.y_var in y_options:
        y_index = y_options.index(st.session_state.y_var)
    else:
        y_index = 0
    st.session_state.y_var = st.selectbox(
        "YμΆ• λ³€μˆ˜ 선택",
        options=y_options,
        key='y_var_select',
        index=y_index,
    )

    x_var = st.session_state.x_var
    y_var = st.session_state.y_var

    # Compare against None rather than relying on truthiness so that a
    # falsy column name (e.g. 0 or "") is still analysed.
    if x_var is not None and y_var is not None:
        fig = px.scatter(
            filtered_data,
            x=x_var,
            y=y_var,
            color='반' if '반' in filtered_data.columns else None,
        )

        # Fit only on complete pairs: a NaN in either column would turn
        # every regression statistic into NaN.
        pairs = filtered_data[[x_var, y_var]].dropna()
        x = pairs[x_var]
        y = pairs[y_var]

        # linregress raises ValueError with fewer than two points or when
        # all x values are identical, which can happen after slicing.
        can_fit = len(pairs) >= 2 and x.nunique() > 1

        if can_fit:
            # Add the regression line.
            slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
            line_x = np.array([x.min(), x.max()])
            line_y = slope * line_x + intercept
            fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='νšŒκ·€μ„ '))

            r_squared = r_value ** 2
            fig.update_layout(
                title=f'{x_var}와 {y_var}의 관계 (R-squared: {r_squared:.4f})',
                xaxis_title=x_var,
                yaxis_title=y_var,
                annotations=[
                    dict(
                        x=0.5,
                        y=1.05,
                        xref='paper',
                        yref='paper',
                        text=f'R-squared: {r_squared:.4f}',
                        showarrow=False,
                    )
                ]
            )
        else:
            fig.update_layout(xaxis_title=x_var, yaxis_title=y_var)

        st.plotly_chart(fig)

        if can_fit:
            # Additional statistics.
            st.write(f"μƒκ΄€κ³„μˆ˜: {r_value:.4f}")
            st.write(f"p-value: {p_value:.4f}")
            st.write(f"ν‘œμ€€ 였차: {std_err:.4f}")
        else:
            # New message kept in ASCII so it is safe whatever encoding the
            # file is stored in.
            st.warning(
                "Not enough valid data to fit a regression line "
                "(need at least two points with differing X values)."
            )

    st.session_state.analysis_performed = True

def main():
    """Entry point of the Streamlit app: load data, allow edits, analyse.

    While no dataset is stored in the session, the user chooses between
    uploading a file and entering data manually. Once a dataset exists it
    is shown in an editable grid; clicking the start button preprocesses
    the edited data and renders the analysis. The analysis stays visible on
    later reruns, and every new click re-preprocesses the current contents
    of the editor so that edits made after the first analysis take effect.

    ``load_data``, ``manual_data_entry`` and ``preprocess_data`` are
    defined elsewhere in this file; presumably each returns a DataFrame
    (or ``None`` when no data is available yet) -- verify against their
    definitions.
    """
    st.title("μΈν„°λž™ν‹°λΈŒ EDA νˆ΄ν‚·")

    initialize_session_state()

    if st.session_state.data is None:
        data_input_method = st.radio("데이터 μž…λ ₯ 방법 선택:", ("파일 μ—…λ‘œλ“œ", "μˆ˜λ™ μž…λ ₯"))

        if data_input_method == "파일 μ—…λ‘œλ“œ":
            uploaded_file = st.file_uploader("CSV, XLS, λ˜λŠ” XLSX νŒŒμΌμ„ μ„ νƒν•˜μ„Έμš”", type=["csv", "xls", "xlsx"])
            if uploaded_file is not None:
                st.session_state.data = load_data(uploaded_file)
        else:
            st.session_state.data = manual_data_entry()

    if st.session_state.data is None:
        # Nothing to preview or analyse yet.
        return

    st.subheader("데이터 미리보기 및 μˆ˜μ •")
    st.write("데이터λ₯Ό ν™•μΈν•˜κ³  ν•„μš”ν•œ 경우 μˆ˜μ •ν•˜μ„Έμš”:")
    edited_data = st.data_editor(st.session_state.data, num_rows="dynamic")

    start_clicked = st.button("데이터 뢄석 μ‹œμž‘")
    if start_clicked:
        # Re-run preprocessing on every click, not just the first one, so
        # edits made in the grid after an earlier analysis are picked up.
        st.session_state.processed_data = preprocess_data(edited_data)
        # Remembered slicer selections may refer to values that no longer
        # exist in the newly processed data, so start them afresh.
        st.session_state.slicers = {}

    # The button is only True on the rerun triggered by the click; the
    # analysis_performed flag keeps the analysis on screen afterwards.
    if start_clicked or st.session_state.analysis_performed:
        perform_analysis(st.session_state.processed_data)

# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()