Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,102 +1,42 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
fm.fontManager.addfont(font_path)
|
15 |
-
return {'font.family': 'Pretendard-Bold', 'axes.unicode_minus': False}
|
16 |
-
|
17 |
-
# ν°νΈ μ€μ μ κ°μ Έμ΅λλ€
|
18 |
-
font_settings = set_font()
|
19 |
-
|
20 |
-
def load_data(file):
    """Read an uploaded file into a DataFrame, chosen by file extension.

    The extension is the text after the last dot of ``file.name``,
    compared case-insensitively.

    Args:
        file: Object exposing a ``name`` attribute that ``pd.read_csv`` /
            ``pd.read_excel`` can read from.

    Returns:
        The loaded DataFrame, or ``None`` (after reporting an error through
        ``st.error``) when the extension is not csv, xls or xlsx.
    """
    suffix = file.name.rsplit('.', 1)[-1].lower()

    if suffix == 'csv':
        return pd.read_csv(file)
    if suffix in ('xls', 'xlsx'):
        return pd.read_excel(file)

    # Unsupported extension: tell the user and signal failure with None.
    st.error("μ§μλμ§ μλ νμΌ νμμλλ€. CSV, XLS, λλ XLSX νμΌμ μλ‘λν΄μ£ΌμΈμ.")
    return None
|
30 |
-
|
31 |
-
def manual_data_entry():
    """Let the user define columns and fill in a table by hand.

    Column names are read from a comma-separated text input; surrounding
    whitespace is stripped and empty names are discarded.

    Returns:
        The value returned by ``st.data_editor`` for a blank table with the
        requested columns and initial row count, or ``None`` while no
        column name has been entered.
    """
    st.subheader("μλ λ°μ΄ν° μλ ₯")

    raw_names = st.text_input("μ΄ μ΄λ¦μ μΌνλ‘ κ΅¬λΆνμ¬ μλ ₯νμΈμ:")
    stripped = (part.strip() for part in raw_names.split(','))
    col_names = [name for name in stripped if name]

    if not col_names:
        return None

    row_count = st.number_input("μ΄κΈ° νμ μλ₯Ό μλ ₯νμΈμ:", min_value=1, value=5)
    blank_table = pd.DataFrame(index=range(row_count), columns=col_names)
    return st.data_editor(blank_table, num_rows="dynamic")
|
44 |
-
|
45 |
-
def preprocess_data(data):
    """Interactively handle missing values and coerce text columns to numbers.

    For every column that contains missing values, a select box lets the
    user choose between dropping the affected rows or filling with the
    column mean, median or mode. Afterwards each ``object`` column is passed
    through ``pd.to_numeric``; columns that cannot be converted are kept
    unchanged.

    Args:
        data: The DataFrame to clean. It is not modified; the function works
            on a copy.

    Returns:
        The cleaned DataFrame.
    """
    # Work on a copy so neither the caller's frame nor a frame held in
    # session state is changed as a side effect.
    data = data.copy()

    st.subheader("λ°μ΄ν° μ μ²λ¦¬")

    drop_label = "μ κ±°"
    mean_label = "νκ· μΌλ‘ λ체"
    median_label = "μ€μκ°μΌλ‘ λ체"
    mode_label = "μ΅λΉκ°μΌλ‘ λ체"

    # Missing-value handling
    if data.isnull().sum().sum() > 0:
        st.write("κ²°μΈ‘μΉ μ²λ¦¬:")
        for column in data.columns:
            if data[column].isnull().sum() == 0:
                continue

            method = st.selectbox(
                f"{column} μ΄μ μ²λ¦¬ λ°©λ² μ ν:",
                [drop_label, mean_label, median_label, mode_label],
            )

            # Filled columns are assigned back to the frame. Calling
            # fillna(inplace=True) on a column selection is chained
            # assignment, which pandas does not guarantee to write through
            # (and which is a no-op under Copy-on-Write).
            if method == drop_label:
                data = data.dropna(subset=[column])
            elif method == mean_label:
                data[column] = data[column].fillna(data[column].mean())
            elif method == median_label:
                data[column] = data[column].fillna(data[column].median())
            elif method == mode_label:
                modes = data[column].mode()
                # An all-missing column has no mode; leave it untouched
                # instead of failing on an empty result.
                if not modes.empty:
                    data[column] = data[column].fillna(modes.iloc[0])

    # Data type conversion
    for column in data.columns:
        if data[column].dtype != 'object':
            continue
        try:
            converted = pd.to_numeric(data[column])
        except (ValueError, TypeError):
            # Not parseable as numbers: keep the column as it is.
            st.write(f"{column} μ΄μ λ²μ£ΌνμΌλ‘ μ μ§λ©λλ€.")
        else:
            data[column] = converted
            st.write(f"{column} μ΄μ μ«μνμΌλ‘ λ³ννμ΅λλ€.")

    return data
|
74 |
|
75 |
def create_slicers(data):
|
76 |
-
slicers = {}
|
77 |
categorical_columns = data.select_dtypes(include=['object', 'category']).columns
|
78 |
|
79 |
for col in categorical_columns:
|
80 |
if data[col].nunique() <= 10: # κ³ μ κ°μ΄ 10κ° μ΄νμΈ κ²½μ°μλ§ μ¬λΌμ΄μ μμ±
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
|
|
|
|
|
|
|
|
87 |
if selected_values:
|
88 |
data = data[data[col].isin(selected_values)]
|
89 |
return data
|
90 |
|
91 |
-
|
92 |
def perform_analysis(data):
|
93 |
st.header("νμμ λ°μ΄ν° λΆμ")
|
94 |
|
95 |
-
# μ¬λΌμ΄μ μμ±
|
96 |
-
|
97 |
-
|
98 |
-
# μ¬λΌμ΄μ μ μ©
|
99 |
-
filtered_data = apply_slicers(data, slicers)
|
100 |
|
101 |
# μμ½ ν΅κ³
|
102 |
st.write("μμ½ ν΅κ³:")
|
@@ -115,15 +55,17 @@ def perform_analysis(data):
|
|
115 |
# μ¬μ©μκ° μ νν λ λ³μμ λν μ°μ λ λ° νκ· λΆμ
|
116 |
st.subheader("λ λ³μ κ°μ κ΄κ³ λΆμ")
|
117 |
numeric_columns = filtered_data.select_dtypes(include=['float64', 'int64']).columns
|
118 |
-
|
119 |
-
|
|
|
|
|
120 |
|
121 |
-
if x_var and y_var:
|
122 |
-
fig = px.scatter(filtered_data, x=x_var, y=y_var, color='λ°' if 'λ°' in filtered_data.columns else None)
|
123 |
|
124 |
# νκ·μ μΆκ°
|
125 |
-
x = filtered_data[x_var]
|
126 |
-
y = filtered_data[y_var]
|
127 |
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
|
128 |
line_x = np.array([x.min(), x.max()])
|
129 |
line_y = slope * line_x + intercept
|
@@ -131,9 +73,9 @@ def perform_analysis(data):
|
|
131 |
|
132 |
r_squared = r_value ** 2
|
133 |
fig.update_layout(
|
134 |
-
title=f'{x_var}μ {y_var}μ κ΄κ³ (R-squared: {r_squared:.4f})',
|
135 |
-
xaxis_title=x_var,
|
136 |
-
yaxis_title=y_var,
|
137 |
annotations=[
|
138 |
dict(
|
139 |
x=0.5,
|
@@ -152,28 +94,32 @@ def perform_analysis(data):
|
|
152 |
st.write(f"p-value: {p_value:.4f}")
|
153 |
st.write(f"νμ€ μ€μ°¨: {std_err:.4f}")
|
154 |
|
|
|
|
|
155 |
def main():
|
156 |
st.title("μΈν°λν°λΈ EDA ν΄ν·")
|
157 |
|
158 |
-
|
159 |
-
|
160 |
-
if
|
161 |
-
|
162 |
-
|
163 |
-
|
|
|
|
|
|
|
164 |
else:
|
165 |
-
data =
|
166 |
-
else:
|
167 |
-
data = manual_data_entry()
|
168 |
|
169 |
-
if data is not None:
|
170 |
st.subheader("λ°μ΄ν° 미리보기 λ° μμ ")
|
171 |
st.write("λ°μ΄ν°λ₯Ό νμΈνκ³ νμν κ²½μ° μμ νμΈμ:")
|
172 |
-
edited_data = st.data_editor(data, num_rows="dynamic")
|
173 |
|
174 |
-
if st.button("λ°μ΄ν° λΆμ μμ"):
|
175 |
-
|
176 |
-
|
|
|
177 |
|
178 |
if __name__ == "__main__":
|
179 |
main()
|
|
|
1 |
+
def initialize_session_state():
    """Seed ``st.session_state`` with the keys used by this app.

    Keys that already exist are left untouched, so values survive reruns.
    """
    # Built on every call so the mutable ``slicers`` default is never shared.
    defaults = {
        'data': None,
        'processed_data': None,
        'slicers': {},
        'x_var': None,
        'y_var': None,
        'analysis_performed': False,
    }
    for key, initial in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = initial
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
def create_slicers(data):
    """Render a multiselect filter for each low-cardinality categorical column.

    Columns of dtype ``object`` or ``category`` with at most 10 distinct
    values get a multiselect. The current selection for each column is kept
    in ``st.session_state.slicers`` and starts out as all values.

    Args:
        data: DataFrame whose categorical columns are offered as filters.
    """
    slicers = st.session_state.slicers
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns

    for col in categorical_columns:
        # Only create a slicer when the column has 10 or fewer unique values.
        if data[col].nunique() > 10:
            continue

        choices = sorted(data[col].unique())
        if col not in slicers:
            # First time this column is seen: everything is selected.
            slicers[col] = list(choices)

        slicers[col] = st.multiselect(
            f"{col} μ ν",
            options=choices,
            default=slicers[col],
        )
|
27 |
+
|
28 |
+
def apply_slicers(data):
    """Filter ``data`` by the selections stored in ``st.session_state.slicers``.

    Each column with a non-empty selection keeps only the rows whose value
    is among the selected ones; columns with an empty selection are not
    filtered.

    Args:
        data: DataFrame to filter.

    Returns:
        The filtered DataFrame (``data`` itself when no filter is active).
    """
    slicers = st.session_state.slicers
    for col in slicers:
        chosen = slicers[col]
        if not chosen:
            continue
        row_mask = data[col].isin(chosen)
        data = data[row_mask]
    return data
|
33 |
|
|
|
34 |
def perform_analysis(data):
|
35 |
st.header("νμμ λ°μ΄ν° λΆμ")
|
36 |
|
37 |
+
# μ¬λΌμ΄μ μμ± λ° μ μ©
|
38 |
+
create_slicers(data)
|
39 |
+
filtered_data = apply_slicers(data)
|
|
|
|
|
40 |
|
41 |
# μμ½ ν΅κ³
|
42 |
st.write("μμ½ ν΅κ³:")
|
|
|
55 |
# μ¬μ©μκ° μ νν λ λ³μμ λν μ°μ λ λ° νκ· λΆμ
|
56 |
st.subheader("λ λ³μ κ°μ κ΄κ³ λΆμ")
|
57 |
numeric_columns = filtered_data.select_dtypes(include=['float64', 'int64']).columns
|
58 |
+
|
59 |
+
st.session_state.x_var = st.selectbox("XμΆ λ³μ μ ν", options=numeric_columns, key='x_var_select', index=numeric_columns.get_loc(st.session_state.x_var) if st.session_state.x_var in numeric_columns else 0)
|
60 |
+
y_options = [col for col in numeric_columns if col != st.session_state.x_var]
|
61 |
+
st.session_state.y_var = st.selectbox("YμΆ λ³μ μ ν", options=y_options, key='y_var_select', index=y_options.index(st.session_state.y_var) if st.session_state.y_var in y_options else 0)
|
62 |
|
63 |
+
if st.session_state.x_var and st.session_state.y_var:
|
64 |
+
fig = px.scatter(filtered_data, x=st.session_state.x_var, y=st.session_state.y_var, color='λ°' if 'λ°' in filtered_data.columns else None)
|
65 |
|
66 |
# νκ·μ μΆκ°
|
67 |
+
x = filtered_data[st.session_state.x_var]
|
68 |
+
y = filtered_data[st.session_state.y_var]
|
69 |
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
|
70 |
line_x = np.array([x.min(), x.max()])
|
71 |
line_y = slope * line_x + intercept
|
|
|
73 |
|
74 |
r_squared = r_value ** 2
|
75 |
fig.update_layout(
|
76 |
+
title=f'{st.session_state.x_var}μ {st.session_state.y_var}μ κ΄κ³ (R-squared: {r_squared:.4f})',
|
77 |
+
xaxis_title=st.session_state.x_var,
|
78 |
+
yaxis_title=st.session_state.y_var,
|
79 |
annotations=[
|
80 |
dict(
|
81 |
x=0.5,
|
|
|
94 |
st.write(f"p-value: {p_value:.4f}")
|
95 |
st.write(f"νμ€ μ€μ°¨: {std_err:.4f}")
|
96 |
|
97 |
+
st.session_state.analysis_performed = True
|
98 |
+
|
99 |
def main():
    """Entry point of the Streamlit app.

    Flow: initialise session state, obtain a dataset (file upload or manual
    entry) if none is stored yet, show it in an editable preview, then run
    preprocessing once and the analysis on every rerun after the start
    button has been pressed.
    """
    st.title("μΈν°λν°λΈ EDA ν΄ν·")

    initialize_session_state()
    state = st.session_state

    if state.data is None:
        upload_choice = "νμΌ μλ‘λ"
        manual_choice = "μλ μλ ₯"
        data_input_method = st.radio("λ°μ΄ν° μλ ₯ λ°©λ² μ ν:", (upload_choice, manual_choice))

        if data_input_method != upload_choice:
            state.data = manual_data_entry()
        else:
            uploaded_file = st.file_uploader(
                "CSV, XLS, λλ XLSX νμΌμ μ ννμΈμ",
                type=["csv", "xls", "xlsx"],
            )
            if uploaded_file is not None:
                state.data = load_data(uploaded_file)

    # Nothing to show until a dataset is available.
    if state.data is None:
        return

    st.subheader("λ°μ΄ν° 미리보기 λ° μμ ")
    st.write("λ°μ΄ν°λ₯Ό νμΈνκ³ νμν κ²½μ° μμ νμΈμ:")
    edited_data = st.data_editor(state.data, num_rows="dynamic")

    # The button is always rendered; the flag keeps the analysis visible on
    # reruns that were not triggered by the button itself.
    start_clicked = st.button("λ°μ΄ν° λΆμ μμ")
    if not (start_clicked or state.analysis_performed):
        return

    if not state.analysis_performed:
        state.processed_data = preprocess_data(edited_data)
    perform_analysis(state.processed_data)
|
123 |
|
124 |
if __name__ == "__main__":
|
125 |
main()
|