JUNGU committed on
Commit
7abab37
·
verified ·
1 Parent(s): eacbd49

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -109
app.py CHANGED
@@ -1,102 +1,42 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import numpy as np
4
- import plotly.express as px
5
- import plotly.graph_objects as go
6
- from io import StringIO
7
- import openpyxl
8
- import matplotlib.font_manager as fm
9
- from scipy import stats
10
-
11
- # 한글 폰트 설정
12
- def set_font():
13
- font_path = "Pretendard-Bold.ttf" # 실제 폰트 파일 경로로 변경해주세요
14
- fm.fontManager.addfont(font_path)
15
- return {'font.family': 'Pretendard-Bold', 'axes.unicode_minus': False}
16
-
17
- # 폰트 설정을 가져옵니다
18
- font_settings = set_font()
19
-
20
- def load_data(file):
21
- file_extension = file.name.split('.')[-1].lower()
22
- if file_extension == 'csv':
23
- data = pd.read_csv(file)
24
- elif file_extension in ['xls', 'xlsx']:
25
- data = pd.read_excel(file)
26
- else:
27
- st.error("지원되지 않는 파일 형식입니다. CSV, XLS, 또는 XLSX 파일을 업로드해주세요.")
28
- return None
29
- return data
30
-
31
- def manual_data_entry():
32
- st.subheader("수동 데이터 입력")
33
- col_names = st.text_input("열 이름을 쉼표로 구분하여 입력하세요:").split(',')
34
- col_names = [name.strip() for name in col_names if name.strip()]
35
-
36
- if col_names:
37
- num_rows = st.number_input("초기 행의 수를 입력하세요:", min_value=1, value=5)
38
- data = pd.DataFrame(columns=col_names, index=range(num_rows))
39
-
40
- edited_data = st.data_editor(data, num_rows="dynamic")
41
-
42
- return edited_data
43
- return None
44
-
45
- def preprocess_data(data):
46
- st.subheader("데이터 전처리")
47
-
48
- # 결측치 처리
49
- if data.isnull().sum().sum() > 0:
50
- st.write("결측치 처리:")
51
- for column in data.columns:
52
- if data[column].isnull().sum() > 0:
53
- method = st.selectbox(f"{column} 열의 처리 방법 선택:",
54
- ["제거", "평균으로 대체", "중앙값으로 대체", "최빈값으로 대체"])
55
- if method == "제거":
56
- data = data.dropna(subset=[column])
57
- elif method == "평균으로 대체":
58
- data[column].fillna(data[column].mean(), inplace=True)
59
- elif method == "중앙값으로 대체":
60
- data[column].fillna(data[column].median(), inplace=True)
61
- elif method == "최빈값으로 대체":
62
- data[column].fillna(data[column].mode()[0], inplace=True)
63
-
64
- # 데이터 타입 변환
65
- for column in data.columns:
66
- if data[column].dtype == 'object':
67
- try:
68
- data[column] = pd.to_numeric(data[column])
69
- st.write(f"{column} 열을 숫자형으로 변환했습니다.")
70
- except ValueError:
71
- st.write(f"{column} 열은 범주형으로 유지됩니다.")
72
-
73
- return data
74
 
75
  def create_slicers(data):
76
- slicers = {}
77
  categorical_columns = data.select_dtypes(include=['object', 'category']).columns
78
 
79
  for col in categorical_columns:
80
  if data[col].nunique() <= 10: # 고유값이 10개 이하인 경우에만 슬라이서 생성
81
- slicers[col] = st.multiselect(f"{col} 선택", options=sorted(data[col].unique()), default=sorted(data[col].unique()))
82
-
83
- return slicers
84
-
85
- def apply_slicers(data, slicers):
86
- for col, selected_values in slicers.items():
 
 
 
 
87
  if selected_values:
88
  data = data[data[col].isin(selected_values)]
89
  return data
90
 
91
-
92
  def perform_analysis(data):
93
  st.header("탐색적 데이터 분석")
94
 
95
- # 슬라이서 생성
96
- slicers = create_slicers(data)
97
-
98
- # 슬라이서 적용
99
- filtered_data = apply_slicers(data, slicers)
100
 
101
  # 요약 통계
102
  st.write("요약 통계:")
@@ -115,15 +55,17 @@ def perform_analysis(data):
115
  # 사용자가 선택한 두 변수에 대한 산점도 및 회귀 분석
116
  st.subheader("두 변수 간의 관계 분석")
117
  numeric_columns = filtered_data.select_dtypes(include=['float64', 'int64']).columns
118
- x_var = st.selectbox("X축 변수 선택", options=numeric_columns)
119
- y_var = st.selectbox("Y축 변수 선택", options=[col for col in numeric_columns if col != x_var])
 
 
120
 
121
- if x_var and y_var:
122
- fig = px.scatter(filtered_data, x=x_var, y=y_var, color='반' if '반' in filtered_data.columns else None)
123
 
124
  # 회귀선 추가
125
- x = filtered_data[x_var]
126
- y = filtered_data[y_var]
127
  slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
128
  line_x = np.array([x.min(), x.max()])
129
  line_y = slope * line_x + intercept
@@ -131,9 +73,9 @@ def perform_analysis(data):
131
 
132
  r_squared = r_value ** 2
133
  fig.update_layout(
134
- title=f'{x_var}와 {y_var}의 관계 (R-squared: {r_squared:.4f})',
135
- xaxis_title=x_var,
136
- yaxis_title=y_var,
137
  annotations=[
138
  dict(
139
  x=0.5,
@@ -152,28 +94,32 @@ def perform_analysis(data):
152
  st.write(f"p-value: {p_value:.4f}")
153
  st.write(f"표준 오차: {std_err:.4f}")
154
 
 
 
155
  def main():
156
  st.title("인터랙티브 EDA 툴킷")
157
 
158
- data_input_method = st.radio("데이터 입력 방법 선택:", ("파일 업로드", "수동 입력"))
159
-
160
- if data_input_method == "파일 업로드":
161
- uploaded_file = st.file_uploader("CSV, XLS, 또는 XLSX 파일을 선택하세요", type=["csv", "xls", "xlsx"])
162
- if uploaded_file is not None:
163
- data = load_data(uploaded_file)
 
 
 
164
  else:
165
- data = None
166
- else:
167
- data = manual_data_entry()
168
 
169
- if data is not None:
170
  st.subheader("데이터 미리보기 및 수정")
171
  st.write("데이터를 확인하고 필요한 경우 수정하세요:")
172
- edited_data = st.data_editor(data, num_rows="dynamic")
173
 
174
- if st.button("데이터 분석 시작"):
175
- processed_data = preprocess_data(edited_data)
176
- perform_analysis(processed_data)
 
177
 
178
  if __name__ == "__main__":
179
  main()
 
1
+ def initialize_session_state():
2
+ if 'data' not in st.session_state:
3
+ st.session_state.data = None
4
+ if 'processed_data' not in st.session_state:
5
+ st.session_state.processed_data = None
6
+ if 'slicers' not in st.session_state:
7
+ st.session_state.slicers = {}
8
+ if 'x_var' not in st.session_state:
9
+ st.session_state.x_var = None
10
+ if 'y_var' not in st.session_state:
11
+ st.session_state.y_var = None
12
+ if 'analysis_performed' not in st.session_state:
13
+ st.session_state.analysis_performed = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  def create_slicers(data):
 
16
  categorical_columns = data.select_dtypes(include=['object', 'category']).columns
17
 
18
  for col in categorical_columns:
19
  if data[col].nunique() <= 10: # 고유값이 10개 이하인 경우에만 슬라이서 생성
20
+ if col not in st.session_state.slicers:
21
+ st.session_state.slicers[col] = sorted(data[col].unique())
22
+ st.session_state.slicers[col] = st.multiselect(
23
+ f"{col} 선택",
24
+ options=sorted(data[col].unique()),
25
+ default=st.session_state.slicers[col]
26
+ )
27
+
28
+ def apply_slicers(data):
29
+ for col, selected_values in st.session_state.slicers.items():
30
  if selected_values:
31
  data = data[data[col].isin(selected_values)]
32
  return data
33
 
 
34
  def perform_analysis(data):
35
  st.header("탐색적 데이터 분석")
36
 
37
+ # 슬라이서 생성 및 적용
38
+ create_slicers(data)
39
+ filtered_data = apply_slicers(data)
 
 
40
 
41
  # 요약 통계
42
  st.write("요약 통계:")
 
55
  # 사용자가 선택한 두 변수에 대한 산점도 및 회귀 분석
56
  st.subheader("두 변수 간의 관계 분석")
57
  numeric_columns = filtered_data.select_dtypes(include=['float64', 'int64']).columns
58
+
59
+ st.session_state.x_var = st.selectbox("X축 변수 선택", options=numeric_columns, key='x_var_select', index=numeric_columns.get_loc(st.session_state.x_var) if st.session_state.x_var in numeric_columns else 0)
60
+ y_options = [col for col in numeric_columns if col != st.session_state.x_var]
61
+ st.session_state.y_var = st.selectbox("Y축 변수 선택", options=y_options, key='y_var_select', index=y_options.index(st.session_state.y_var) if st.session_state.y_var in y_options else 0)
62
 
63
+ if st.session_state.x_var and st.session_state.y_var:
64
+ fig = px.scatter(filtered_data, x=st.session_state.x_var, y=st.session_state.y_var, color='반' if '반' in filtered_data.columns else None)
65
 
66
  # 회귀선 추가
67
+ x = filtered_data[st.session_state.x_var]
68
+ y = filtered_data[st.session_state.y_var]
69
  slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
70
  line_x = np.array([x.min(), x.max()])
71
  line_y = slope * line_x + intercept
 
73
 
74
  r_squared = r_value ** 2
75
  fig.update_layout(
76
+ title=f'{st.session_state.x_var}와 {st.session_state.y_var}의 관계 (R-squared: {r_squared:.4f})',
77
+ xaxis_title=st.session_state.x_var,
78
+ yaxis_title=st.session_state.y_var,
79
  annotations=[
80
  dict(
81
  x=0.5,
 
94
  st.write(f"p-value: {p_value:.4f}")
95
  st.write(f"표준 오차: {std_err:.4f}")
96
 
97
+ st.session_state.analysis_performed = True
98
+
99
  def main():
100
  st.title("인터랙티브 EDA 툴킷")
101
 
102
+ initialize_session_state()
103
+
104
+ if st.session_state.data is None:
105
+ data_input_method = st.radio("데이터 입력 방법 선택:", ("파일 업로드", "수동 입력"))
106
+
107
+ if data_input_method == "파일 업로드":
108
+ uploaded_file = st.file_uploader("CSV, XLS, 또는 XLSX 파일을 선택하세요", type=["csv", "xls", "xlsx"])
109
+ if uploaded_file is not None:
110
+ st.session_state.data = load_data(uploaded_file)
111
  else:
112
+ st.session_state.data = manual_data_entry()
 
 
113
 
114
+ if st.session_state.data is not None:
115
  st.subheader("데이터 미리보기 및 수정")
116
  st.write("데이터를 확인하고 필요한 경우 수정하세요:")
117
+ edited_data = st.data_editor(st.session_state.data, num_rows="dynamic")
118
 
119
+ if st.button("데이터 분석 시작") or st.session_state.analysis_performed:
120
+ if not st.session_state.analysis_performed:
121
+ st.session_state.processed_data = preprocess_data(edited_data)
122
+ perform_analysis(st.session_state.processed_data)
123
 
124
  if __name__ == "__main__":
125
  main()