JUNGU committed on
Commit
6607e79
·
verified ·
1 Parent(s): 7abab37

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +150 -73
app.py CHANGED
@@ -1,28 +1,107 @@
1
- def initialize_session_state():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  if 'data' not in st.session_state:
3
  st.session_state.data = None
4
  if 'processed_data' not in st.session_state:
5
  st.session_state.processed_data = None
6
- if 'slicers' not in st.session_state:
7
- st.session_state.slicers = {}
 
 
8
  if 'x_var' not in st.session_state:
9
  st.session_state.x_var = None
10
  if 'y_var' not in st.session_state:
11
  st.session_state.y_var = None
12
- if 'analysis_performed' not in st.session_state:
13
- st.session_state.analysis_performed = False
14
 
15
- def create_slicers(data):
16
- categorical_columns = data.select_dtypes(include=['object', 'category']).columns
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- for col in categorical_columns:
 
19
  if data[col].nunique() <= 10: # κ³ μœ κ°’μ΄ 10개 μ΄ν•˜μΈ κ²½μš°μ—λ§Œ μŠ¬λΌμ΄μ„œ 생성
20
- if col not in st.session_state.slicers:
21
- st.session_state.slicers[col] = sorted(data[col].unique())
22
  st.session_state.slicers[col] = st.multiselect(
23
  f"{col} 선택",
24
  options=sorted(data[col].unique()),
25
- default=st.session_state.slicers[col]
26
  )
27
 
28
  def apply_slicers(data):
@@ -31,6 +110,46 @@ def apply_slicers(data):
31
  data = data[data[col].isin(selected_values)]
32
  return data
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def perform_analysis(data):
35
  st.header("탐색적 데이터 뢄석")
36
 
@@ -43,82 +162,40 @@ def perform_analysis(data):
43
  st.write(filtered_data.describe())
44
 
45
  # 상관관계 히트맡
46
- st.write("상관관계 히트맡:")
47
- numeric_data = filtered_data.select_dtypes(include=['float64', 'int64'])
48
- if not numeric_data.empty:
49
- fig = px.imshow(numeric_data.corr(), color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
50
- fig.update_layout(title='상관관계 히트맡')
51
- st.plotly_chart(fig)
52
- else:
53
- st.write("상관관계 νžˆνŠΈλ§΅μ„ 그릴 수 μžˆλŠ” μˆ«μžν˜• 열이 μ—†μŠ΅λ‹ˆλ‹€.")
54
-
55
  # μ‚¬μš©μžκ°€ μ„ νƒν•œ 두 λ³€μˆ˜μ— λŒ€ν•œ 산점도 및 νšŒκ·€ 뢄석
56
  st.subheader("두 λ³€μˆ˜ κ°„μ˜ 관계 뢄석")
57
- numeric_columns = filtered_data.select_dtypes(include=['float64', 'int64']).columns
58
-
59
- st.session_state.x_var = st.selectbox("XμΆ• λ³€μˆ˜ 선택", options=numeric_columns, key='x_var_select', index=numeric_columns.get_loc(st.session_state.x_var) if st.session_state.x_var in numeric_columns else 0)
60
- y_options = [col for col in numeric_columns if col != st.session_state.x_var]
61
- st.session_state.y_var = st.selectbox("YμΆ• λ³€μˆ˜ 선택", options=y_options, key='y_var_select', index=y_options.index(st.session_state.y_var) if st.session_state.y_var in y_options else 0)
62
 
63
- if st.session_state.x_var and st.session_state.y_var:
64
- fig = px.scatter(filtered_data, x=st.session_state.x_var, y=st.session_state.y_var, color='반' if '반' in filtered_data.columns else None)
65
-
66
- # νšŒκ·€μ„  μΆ”κ°€
67
- x = filtered_data[st.session_state.x_var]
68
- y = filtered_data[st.session_state.y_var]
69
- slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
70
- line_x = np.array([x.min(), x.max()])
71
- line_y = slope * line_x + intercept
72
- fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='νšŒκ·€μ„ '))
73
-
74
- r_squared = r_value ** 2
75
- fig.update_layout(
76
- title=f'{st.session_state.x_var}와 {st.session_state.y_var}의 관계 (R-squared: {r_squared:.4f})',
77
- xaxis_title=st.session_state.x_var,
78
- yaxis_title=st.session_state.y_var,
79
- annotations=[
80
- dict(
81
- x=0.5,
82
- y=1.05,
83
- xref='paper',
84
- yref='paper',
85
- text=f'R-squared: {r_squared:.4f}',
86
- showarrow=False,
87
- )
88
- ]
89
- )
90
- st.plotly_chart(fig)
91
-
92
- # μΆ”κ°€ 톡계 정보
93
- st.write(f"μƒκ΄€κ³„μˆ˜: {r_value:.4f}")
94
- st.write(f"p-value: {p_value:.4f}")
95
- st.write(f"ν‘œμ€€ 였차: {std_err:.4f}")
96
-
97
- st.session_state.analysis_performed = True
98
 
99
  def main():
100
  st.title("μΈν„°λž™ν‹°λΈŒ EDA νˆ΄ν‚·")
 
 
101
 
102
- initialize_session_state()
103
-
104
- if st.session_state.data is None:
105
- data_input_method = st.radio("데이터 μž…λ ₯ 방법 선택:", ("파일 μ—…λ‘œλ“œ", "μˆ˜λ™ μž…λ ₯"))
106
-
107
- if data_input_method == "파일 μ—…λ‘œλ“œ":
108
- uploaded_file = st.file_uploader("CSV, XLS, λ˜λŠ” XLSX νŒŒμΌμ„ μ„ νƒν•˜μ„Έμš”", type=["csv", "xls", "xlsx"])
109
- if uploaded_file is not None:
110
- st.session_state.data = load_data(uploaded_file)
111
  else:
112
- st.session_state.data = manual_data_entry()
 
 
113
 
114
  if st.session_state.data is not None:
115
  st.subheader("데이터 미리보기 및 μˆ˜μ •")
116
  st.write("데이터λ₯Ό ν™•μΈν•˜κ³  ν•„μš”ν•œ 경우 μˆ˜μ •ν•˜μ„Έμš”:")
117
  edited_data = st.data_editor(st.session_state.data, num_rows="dynamic")
118
 
119
- if st.button("데이터 뢄석 μ‹œμž‘") or st.session_state.analysis_performed:
120
- if not st.session_state.analysis_performed:
121
- st.session_state.processed_data = preprocess_data(edited_data)
122
  perform_analysis(st.session_state.processed_data)
123
 
124
  if __name__ == "__main__":
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from io import StringIO
7
+ import openpyxl
8
+ import matplotlib.font_manager as fm
9
+ from scipy import stats
10
+
11
+ # Korean font setup
12
def set_font():
    """Register the bundled Korean font and return matplotlib rc settings.

    Returns:
        dict: ``axes.unicode_minus`` is always set to False; ``font.family``
        is included only when the font file was registered successfully.
    """
    import warnings

    font_path = "Pretendard-Bold.ttf"  # change this to the actual font file path
    try:
        fm.fontManager.addfont(font_path)
    except (OSError, RuntimeError) as exc:
        # This runs at import time: a missing or unreadable font file should
        # degrade to the default font instead of preventing the app from starting.
        warnings.warn(f"Could not load font {font_path!r}: {exc}")
        return {'axes.unicode_minus': False}
    # NOTE(review): 'Pretendard-Bold' is assumed to be the family name the file
    # registers under - confirm against the font's metadata.
    return {'font.family': 'Pretendard-Bold', 'axes.unicode_minus': False}


# Fetch the font settings once, at import time
font_settings = set_font()
19
+
20
+ # Initialize session state
21
def init_session_state():
    """Seed ``st.session_state`` with the keys the app uses, keeping any existing values."""
    # Built fresh on every call so the list/dict defaults are never shared objects
    initial_values = {
        'data': None,
        'processed_data': None,
        'numeric_columns': [],
        'categorical_columns': [],
        'x_var': None,
        'y_var': None,
        'slicers': {},
    }
    for name, value in initial_values.items():
        if name not in st.session_state:
            st.session_state[name] = value
36
 
37
+ # Load data
38
@st.cache_data
def load_data(file):
    """Parse a CSV/XLS/XLSX file object into a DataFrame.

    Args:
        file: object with a ``name`` attribute; it is handed directly to the
            pandas reader chosen from the name's extension.

    Returns:
        The parsed DataFrame, or None (after showing an error message) when
        the extension is not csv, xls or xlsx.
    """
    readers = {
        'csv': pd.read_csv,
        'xls': pd.read_excel,
        'xlsx': pd.read_excel,
    }
    suffix = file.name.split('.')[-1].lower()
    reader = readers.get(suffix)
    if reader is None:
        st.error("μ§€μ›λ˜μ§€ μ•ŠλŠ” 파일 ν˜•μ‹μž…λ‹ˆλ‹€. CSV, XLS, λ˜λŠ” XLSX νŒŒμΌμ„ μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”.")
        return None
    return reader(file)
49
+
50
def manual_data_entry():
    """Let the user type column names and fill in an empty table by hand.

    Returns:
        The value returned by the data editor, or None while no column name
        has been entered.
    """
    st.subheader("μˆ˜λ™ 데이터 μž…λ ₯")
    raw_names = st.text_input("μ—΄ 이름을 μ‰Όν‘œλ‘œ κ΅¬λΆ„ν•˜μ—¬ μž…λ ₯ν•˜μ„Έμš”:")
    # Split on commas and drop blank entries
    col_names = [part.strip() for part in raw_names.split(',') if part.strip()]
    if not col_names:
        return None

    num_rows = st.number_input("초기 ν–‰μ˜ 수λ₯Ό μž…λ ₯ν•˜μ„Έμš”:", min_value=1, value=5)
    blank_table = pd.DataFrame(columns=col_names, index=range(num_rows))
    return st.data_editor(blank_table, num_rows="dynamic")
63
+
64
def _imputation_value(series, kind):
    """Return the fill value for *series* ('mean', 'median' or 'mode'), or None if none exists."""
    if kind == 'mode':
        modes = series.mode()
        return modes.iloc[0] if not modes.empty else None
    # Coerce so number-like object columns can be averaged; real text becomes NaN
    numeric = pd.to_numeric(series, errors='coerce')
    value = numeric.mean() if kind == 'mean' else numeric.median()
    return None if pd.isna(value) else value


def preprocess_data(data):
    """Handle missing values, convert number-like text columns, and record column kinds.

    For every column that still has missing values the user picks a strategy
    (drop rows, or fill with mean / median / mode). Object columns are then
    converted to numeric where possible. The numeric and categorical column
    names are stored in ``st.session_state``.

    Args:
        data: pandas DataFrame to clean. It is not modified; a copy is used.

    Returns:
        The cleaned DataFrame.
    """
    st.subheader("데이터 μ „μ²˜λ¦¬")

    # Work on a copy: the caller's frame must not change, and plain column
    # assignment below is then free of chained-assignment pitfalls.
    data = data.copy()

    option_drop = "제거"
    option_mean = "ν‰κ· μœΌλ‘œ λŒ€μ²΄"
    option_median = "μ€‘μ•™κ°’μœΌλ‘œ λŒ€μ²΄"
    option_mode = "μ΅œλΉˆκ°’μœΌλ‘œ λŒ€μ²΄"
    fill_kinds = {option_mean: 'mean', option_median: 'median', option_mode: 'mode'}

    # Missing-value handling
    if data.isnull().sum().sum() > 0:
        st.write("결츑치 처리:")
        for column in data.columns:
            # Re-check each time: dropping rows for an earlier column may
            # already have removed this column's gaps.
            if data[column].isnull().sum() == 0:
                continue
            method = st.selectbox(f"{column} μ—΄μ˜ 처리 방법 선택:",
                                  [option_drop, option_mean, option_median, option_mode])
            if method == option_drop:
                data = data.dropna(subset=[column])
                continue
            fill_value = _imputation_value(data[column], fill_kinds[method])
            if fill_value is None:
                # e.g. mean of a text column, or mode of an all-empty column
                st.warning(f"선택한 방법으로는 {column} 열의 결측치를 대체할 수 없어 그대로 둡니다.")
                continue
            data[column] = data[column].fillna(fill_value)

    # Data type conversion
    for column in data.columns:
        if data[column].dtype == 'object':
            try:
                data[column] = pd.to_numeric(data[column])
                st.write(f"{column} 열을 μˆ«μžν˜•μœΌλ‘œ λ³€ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.")
            except (ValueError, TypeError):
                st.write(f"{column} 열은 λ²”μ£Όν˜•μœΌλ‘œ μœ μ§€λ©λ‹ˆλ‹€.")

    # Split numeric and categorical columns ('number' also covers non-64-bit dtypes)
    st.session_state.numeric_columns = data.select_dtypes(include='number').columns.tolist()
    st.session_state.categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()

    return data
97
 
98
def create_slicers(data):
    """Show a multiselect for each low-cardinality categorical column and store the selection.

    Args:
        data: DataFrame containing the columns listed in
            ``st.session_state.categorical_columns``.
    """
    for col in st.session_state.categorical_columns:
        column_values = data[col]
        # Only columns with at most 10 distinct values get a slicer
        if column_values.nunique() > 10:
            continue
        st.session_state.slicers[col] = st.multiselect(
            f"{col} 선택",
            options=sorted(column_values.unique()),
            default=sorted(column_values.unique()),
        )
106
 
107
  def apply_slicers(data):
 
110
  data = data[data[col].isin(selected_values)]
111
  return data
112
 
113
def plot_correlation_heatmap(data):
    """Draw a correlation heatmap of the numeric columns of *data*.

    Args:
        data: DataFrame to analyse. Only the columns named in
            ``st.session_state.numeric_columns`` that are present in it are used.
    """
    # Ignore recorded names that are not in this frame (avoids a KeyError)
    numeric_cols = [col for col in st.session_state.numeric_columns if col in data.columns]
    if not numeric_cols:
        # Nothing to correlate: tell the user instead of failing inside plotly
        st.write("상관관계 히트맵을 그릴 수 있는 숫자형 열이 없습니다.")
        return

    corr = data[numeric_cols].corr()
    fig = px.imshow(corr, color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
    fig.update_layout(title='상관관계 히트맡')
    st.plotly_chart(fig)
118
+
119
def plot_scatter_with_regression(data, x_var, y_var):
    """Plot *y_var* against *x_var* with a least-squares line and its statistics.

    When a regression cannot be fitted (fewer than two complete pairs, or all
    x values identical) the scatter plot is still shown, with a warning in
    place of the statistics.

    Args:
        data: DataFrame holding both columns.
        x_var: name of the column on the x axis.
        y_var: name of the column on the y axis.
    """
    fig = px.scatter(data, x=x_var, y=y_var, color='반' if '반' in data.columns else None)

    # linregress propagates NaN, so fit on complete pairs only
    pairs = data[[x_var, y_var]].dropna()
    x = pairs[x_var]
    y = pairs[y_var]

    # linregress raises on fewer than two points or a constant x
    if len(pairs) < 2 or x.nunique() < 2:
        fig.update_layout(
            title=f'{x_var}와 {y_var}의 관계',
            xaxis_title=x_var,
            yaxis_title=y_var,
        )
        st.plotly_chart(fig)
        st.warning("회귀선을 계산하려면 서로 다른 X 값을 가진 유효한 데이터가 2개 이상 필요합니다.")
        return

    # Add the regression line
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    line_x = np.array([x.min(), x.max()])
    line_y = slope * line_x + intercept
    fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='νšŒκ·€μ„ '))

    r_squared = r_value ** 2
    fig.update_layout(
        title=f'{x_var}와 {y_var}의 관계 (R-squared: {r_squared:.4f})',
        xaxis_title=x_var,
        yaxis_title=y_var,
        annotations=[
            dict(
                x=0.5,
                y=1.05,
                xref='paper',
                yref='paper',
                text=f'R-squared: {r_squared:.4f}',
                showarrow=False,
            )
        ]
    )
    st.plotly_chart(fig)

    # Additional statistics
    st.write(f"μƒκ΄€κ³„μˆ˜: {r_value:.4f}")
    st.write(f"p-value: {p_value:.4f}")
    st.write(f"ν‘œμ€€ 였차: {std_err:.4f}")
152
+
153
  def perform_analysis(data):
154
  st.header("탐색적 데이터 뢄석")
155
 
 
162
  st.write(filtered_data.describe())
163
 
164
  # 상관관계 히트맡
165
+ st.subheader("상관관계 히트맡")
166
+ plot_correlation_heatmap(filtered_data)
167
+
 
 
 
 
 
 
168
  # μ‚¬μš©μžκ°€ μ„ νƒν•œ 두 λ³€μˆ˜μ— λŒ€ν•œ 산점도 및 νšŒκ·€ 뢄석
169
  st.subheader("두 λ³€μˆ˜ κ°„μ˜ 관계 뢄석")
170
+ x_var = st.selectbox("XμΆ• λ³€μˆ˜ 선택", options=st.session_state.numeric_columns, key='x_var')
171
+ y_var = st.selectbox("YμΆ• λ³€μˆ˜ 선택", options=[col for col in st.session_state.numeric_columns if col != x_var], key='y_var')
 
 
 
172
 
173
+ if x_var and y_var:
174
+ plot_scatter_with_regression(filtered_data, x_var, y_var)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
def main():
    """Run the app: collect data, let the user edit it, then preprocess and analyse it.

    The "start analysis" request is remembered in ``st.session_state`` so the
    analysis stays on screen when the user later interacts with a widget
    inside it; a button alone is True only for the run triggered by its click.
    """
    st.title("μΈν„°λž™ν‹°λΈŒ EDA νˆ΄ν‚·")

    init_session_state()

    upload_option = "파일 μ—…λ‘œλ“œ"
    manual_option = "μˆ˜λ™ μž…λ ₯"
    data_input_method = st.radio("데이터 μž…λ ₯ 방법 선택:", (upload_option, manual_option))

    if data_input_method == upload_option:
        uploaded_file = st.file_uploader("CSV, XLS, λ˜λŠ” XLSX νŒŒμΌμ„ μ„ νƒν•˜μ„Έμš”", type=["csv", "xls", "xlsx"])
        if uploaded_file is not None:
            st.session_state.data = load_data(uploaded_file)
        else:
            st.session_state.data = None
    else:
        st.session_state.data = manual_data_entry()

    if st.session_state.data is None:
        # No data any more: forget the earlier request so a new data set
        # does not jump straight into analysis.
        st.session_state['analysis_started'] = False
        return

    st.subheader("데이터 미리보기 및 μˆ˜μ •")
    st.write("데이터λ₯Ό ν™•μΈν•˜κ³  ν•„μš”ν•œ 경우 μˆ˜μ •ν•˜μ„Έμš”:")
    edited_data = st.data_editor(st.session_state.data, num_rows="dynamic")

    if st.button("데이터 뢄석 μ‹œμž‘"):
        st.session_state['analysis_started'] = True

    # Re-run preprocessing and analysis on every rerun once requested, so the
    # widgets they create keep existing between interactions.
    if st.session_state.get('analysis_started', False):
        st.session_state.processed_data = preprocess_data(edited_data)
        perform_analysis(st.session_state.processed_data)
200
 
201
  if __name__ == "__main__":