PPPDC_example / app.py
JUNGU's picture
Update app.py
7b713b3 verified
raw
history blame
5.78 kB
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from io import StringIO
import openpyxl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Korean font setup
def set_font(font_path="Pretendard-Bold.ttf"):
    """Register a font file with matplotlib and make it the default family.

    Args:
        font_path: Path to the TrueType font file to register. Defaults to
            the same file name the app has always used.

    The family name is read from the font file itself instead of being
    hard-coded: matplotlib looks fonts up by the family name stored inside
    the file, which is not necessarily the file name.
    """
    fm.fontManager.addfont(font_path)
    # Ask matplotlib which family name it associates with this file.
    family_name = fm.FontProperties(fname=font_path).get_name()
    plt.rcParams['font.family'] = family_name
    # Use the ASCII hyphen for negative numbers so the minus sign does not
    # render as a missing glyph with fonts that lack U+2212.
    plt.rcParams['axes.unicode_minus'] = False
# Silence Streamlit's "st.pyplot() called without a figure" deprecation
# warning. Newer Streamlit releases removed this option and raise
# StreamlitAPIException when it is set; ignore that so the app still starts.
try:
    st.set_option('deprecation.showPyplotGlobalUse', False)
except st.errors.StreamlitAPIException:
    pass
set_font()
def load_data(file):
    """Read an uploaded file into a DataFrame based on its extension.

    Args:
        file: File-like object exposing a ``name`` attribute.

    Returns:
        The parsed DataFrame for ``csv``, ``xls`` and ``xlsx`` files
        (extension compared case-insensitively), or ``None`` after showing
        an error message for any other extension.
    """
    suffix = file.name.rsplit('.', 1)[-1].lower()

    if suffix == 'csv':
        return pd.read_csv(file)
    if suffix in ('xls', 'xlsx'):
        return pd.read_excel(file)

    # Unsupported extension: tell the user and signal "no data".
    st.error("μ§€μ›λ˜μ§€ μ•ŠλŠ” 파일 ν˜•μ‹μž…λ‹ˆλ‹€. CSV, XLS, λ˜λŠ” XLSX νŒŒμΌμ„ μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”.")
    return None
def manual_data_entry():
    """Let the user define columns and type data into an editable grid.

    Returns:
        The DataFrame returned by the data editor, or ``None`` while no
        non-blank column name has been entered.
    """
    st.subheader("μˆ˜λ™ 데이터 μž…λ ₯")

    raw_names = st.text_input("μ—΄ 이름을 μ‰Όν‘œλ‘œ κ΅¬λΆ„ν•˜μ—¬ μž…λ ₯ν•˜μ„Έμš”:")
    # Split on commas, trim whitespace and drop blank entries.
    col_names = [part.strip() for part in raw_names.split(',') if part.strip()]
    if not col_names:
        return None

    num_rows = st.number_input("초기 ν–‰μ˜ 수λ₯Ό μž…λ ₯ν•˜μ„Έμš”:", min_value=1, value=5)
    # Start from an empty grid with the requested shape.
    blank_table = pd.DataFrame(columns=col_names, index=range(num_rows))
    return st.data_editor(blank_table, num_rows="dynamic")
def preprocess_data(data):
    """Interactively handle missing values, then convert text columns to numbers.

    For every column that still contains missing values a select box offers:
    drop the affected rows, or fill with the mean, median or most frequent
    value. Afterwards every ``object`` column is converted to a numeric dtype
    when all of its values parse as numbers; otherwise it is left as is.

    Args:
        data: Input DataFrame. It is not modified; a copy is processed.

    Returns:
        The processed DataFrame.
    """
    st.subheader("데이터 μ „μ²˜λ¦¬")

    # Work on a copy so the caller's frame is never mutated.
    data = data.copy()

    # Select-box options, defined once so the comparisons below cannot drift
    # from the labels shown to the user.
    opt_drop = "제거"
    opt_mean = "ν‰κ· μœΌλ‘œ λŒ€μ²΄"
    opt_median = "μ€‘μ•™κ°’μœΌλ‘œ λŒ€μ²΄"
    opt_mode = "μ΅œλΉˆκ°’μœΌλ‘œ λŒ€μ²΄"

    # Missing-value handling
    if data.isnull().sum().sum() > 0:
        st.write("결츑치 처리:")
        for column in data.columns:
            if data[column].isnull().sum() == 0:
                continue

            method = st.selectbox(f"{column} μ—΄μ˜ 처리 방법 선택:",
                                  [opt_drop, opt_mean, opt_median, opt_mode])
            if method == opt_drop:
                data = data.dropna(subset=[column])
                continue

            if method == opt_mode:
                # mode() is empty when the column holds only missing values.
                modes = data[column].mode()
                fill_value = None if modes.empty else modes.iloc[0]
            else:
                # Coerce so numbers stored as text still yield a mean/median
                # and non-numeric text does not raise.
                numeric = pd.to_numeric(data[column], errors="coerce")
                stat = numeric.mean() if method == opt_mean else numeric.median()
                fill_value = None if pd.isna(stat) else stat

            if fill_value is None:
                # "No replacement value can be computed for this column;
                # its missing values are left as they are."
                st.warning(f"{column} 열은 대체할 값을 계산할 수 없어 결측치를 그대로 둡니다.")
            else:
                # Assign back instead of chained inplace fillna, which may
                # act on a temporary and is a no-op under copy-on-write.
                data[column] = data[column].fillna(fill_value)

    # Data type conversion
    for column in data.columns:
        if data[column].dtype == 'object':
            try:
                converted = pd.to_numeric(data[column])
            except (ValueError, TypeError):
                # Values that are not parseable as numbers stay categorical.
                st.write(f"{column} 열은 λ²”μ£Όν˜•μœΌλ‘œ μœ μ§€λ©λ‹ˆλ‹€.")
            else:
                data[column] = converted
                st.write(f"{column} 열을 μˆ«μžν˜•μœΌλ‘œ λ³€ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.")
    return data
def perform_analysis(data):
    """Render the exploratory-analysis section for *data*.

    Shows, in order: summary statistics, a correlation heatmap and a scatter
    matrix of the ``float64``/``int64`` columns (or a notice when there are
    none), one histogram and one box plot per such column, and a bar chart of
    value counts for every ``object`` column.

    Args:
        data: DataFrame to analyse.
    """
    st.header("탐색적 데이터 뢄석")

    # Summary statistics
    st.write("μš”μ•½ 톡계:")
    summary = data.describe()
    st.write(summary)

    # Correlation heatmap
    st.write("상관관계 히트맡:")
    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    has_numeric = not numeric_data.empty
    if has_numeric:
        correlations = numeric_data.corr()
        heatmap = px.imshow(correlations, color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
        heatmap.update_layout(title='상관관계 히트맡')
        st.plotly_chart(heatmap)
    else:
        st.write("상관관계 νžˆνŠΈλ§΅μ„ 그릴 수 μžˆλŠ” μˆ«μžν˜• 열이 μ—†μŠ΅λ‹ˆλ‹€.")

    # Scatter matrix
    st.write("산점도 ν–‰λ ¬:")
    if has_numeric:
        scatter = px.scatter_matrix(numeric_data)
        scatter.update_layout(title='산점도 ν–‰λ ¬')
        st.plotly_chart(scatter)
    else:
        st.write("산점도 행렬을 그릴 수 μžˆλŠ” μˆ«μžν˜• 열이 μ—†μŠ΅λ‹ˆλ‹€.")

    numeric_names = numeric_data.columns

    # Histograms
    st.write("νžˆμŠ€ν† κ·Έλž¨:")
    for name in numeric_names:
        histogram = px.histogram(data, x=name, marginal='box')
        histogram.update_layout(title=f'{name} νžˆμŠ€ν† κ·Έλž¨')
        st.plotly_chart(histogram)

    # Box plots
    st.write("λ°•μŠ€ν”Œλ‘―:")
    for name in numeric_names:
        box = px.box(data, y=name)
        box.update_layout(title=f'{name} λ°•μŠ€ν”Œλ‘―')
        st.plotly_chart(box)

    # Bar charts for categorical variables
    categorical_columns = data.select_dtypes(include=['object']).columns
    if len(categorical_columns) > 0:
        st.write("λ²”μ£Όν˜• λ³€μˆ˜ λ§‰λŒ€ κ·Έλž˜ν”„:")
        for name in categorical_columns:
            counts = data[name].value_counts()
            table = counts.reset_index()
            table.columns = ['category', 'count']
            bars = px.bar(table, x='category', y='count', title=f'{name} 뢄포')
            bars.update_layout(xaxis_title=name, yaxis_title='개수')
            st.plotly_chart(bars)
def main():
    """Run the app: collect data, let the user edit it, then analyse it.

    The data comes either from an uploaded CSV/XLS/XLSX file or from manual
    entry. Once data is available it is shown in an editable preview, and
    after the start button has been clicked the preprocessing and analysis
    sections are rendered.
    """
    st.title("μΈν„°λž™ν‹°λΈŒ EDA νˆ΄ν‚·")

    upload_option = "파일 μ—…λ‘œλ“œ"
    data_input_method = st.radio("데이터 μž…λ ₯ 방법 선택:", (upload_option, "μˆ˜λ™ μž…λ ₯"))

    data = None
    if data_input_method == upload_option:
        uploaded_file = st.file_uploader("CSV, XLS, λ˜λŠ” XLSX νŒŒμΌμ„ μ„ νƒν•˜μ„Έμš”", type=["csv", "xls", "xlsx"])
        if uploaded_file is not None:
            data = load_data(uploaded_file)
    else:
        data = manual_data_entry()

    started_flag = "analysis_started"
    if data is None:
        # No data: forget an earlier click so new data starts from scratch.
        st.session_state[started_flag] = False
        return

    st.subheader("데이터 미리보기 및 μˆ˜μ •")
    st.write("데이터λ₯Ό ν™•μΈν•˜κ³  ν•„μš”ν•œ 경우 μˆ˜μ •ν•˜μ„Έμš”:")
    # The explicit key keeps this editor distinct from the one created in
    # manual_data_entry, which may otherwise be called with identical
    # arguments.
    edited_data = st.data_editor(data, num_rows="dynamic", key="preview_editor")

    # st.button is True only on the rerun triggered by the click. Remember
    # the click so that changing a widget inside the preprocessing section
    # (which reruns the script) does not make the analysis disappear.
    if st.button("데이터 뢄석 μ‹œμž‘"):
        st.session_state[started_flag] = True

    if st.session_state.get(started_flag, False):
        processed_data = preprocess_data(edited_data)
        perform_analysis(processed_data)
# Entry point: build the UI when this file is executed as the main module.
if __name__ == "__main__":
    main()