PPPDC_example / app.py
JUNGU's picture
Update app.py
23711c4 verified
raw
history blame
9.72 kB
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from io import StringIO
import openpyxl
import matplotlib.font_manager as fm
from scipy import stats
# Korean font setup
def set_font():
    """Register the Pretendard font file and build the font settings.

    The file ``Pretendard-Bold.ttf`` is added to matplotlib's font manager.

    Returns:
        dict: ``'font.family'`` mapped to ``'Pretendard-Bold'`` and
        ``'axes.unicode_minus'`` mapped to ``False``.
    """
    # Point this at the real location of the font file.
    pretendard_file = "Pretendard-Bold.ttf"
    fm.fontManager.addfont(pretendard_file)

    rc_overrides = {}
    rc_overrides['font.family'] = 'Pretendard-Bold'
    rc_overrides['axes.unicode_minus'] = False
    return rc_overrides
# Fetch the font settings (this also registers the font as a side effect).
# NOTE(review): font_settings is not referenced anywhere else in the visible source.
font_settings = set_font()
def load_data(file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    Args:
        file: File-like object with a ``name`` attribute. The text after the
            last dot of the name, compared case-insensitively, selects the
            parser.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` after an error has been
        shown with ``st.error`` when the extension is not csv, xls or xlsx.
    """
    suffix = file.name.lower().rsplit('.', 1)[-1]

    if suffix == 'csv':
        return pd.read_csv(file)
    if suffix in ('xls', 'xlsx'):
        return pd.read_excel(file)

    # Anything else is reported to the user instead of being parsed.
    st.error("์ง€์›๋˜์ง€ ์•Š๋Š” ํŒŒ์ผ ํ˜•์‹์ž…๋‹ˆ๋‹ค. CSV, XLS, ๋˜๋Š” XLSX ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”.")
    return None
def manual_data_entry():
    """Let the user name columns and type values into an editable grid.

    The user enters comma-separated column names and an initial row count;
    an empty DataFrame of that shape is then shown in ``st.data_editor``.

    Returns:
        The DataFrame returned by ``st.data_editor``, or ``None`` while no
        column name has been entered.
    """
    st.subheader("์ˆ˜๋™ ๋ฐ์ดํ„ฐ ์ž…๋ ฅ")
    raw_names = st.text_input("์—ด ์ด๋ฆ„์„ ์‰ผํ‘œ๋กœ ๊ตฌ๋ถ„ํ•˜์—ฌ ์ž…๋ ฅํ•˜์„ธ์š”:")

    # dict.fromkeys drops repeated names while keeping first-seen order, so a
    # name typed twice does not produce a frame with duplicate column labels.
    col_names = list(dict.fromkeys(
        name.strip() for name in raw_names.split(',') if name.strip()
    ))
    if not col_names:
        return None

    num_rows = st.number_input("์ดˆ๊ธฐ ํ–‰์˜ ์ˆ˜๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”:", min_value=1, value=5)
    data = pd.DataFrame(columns=col_names, index=range(int(num_rows)))

    # Explicit key so this editor cannot share an auto-generated widget ID
    # with another un-keyed data_editor that shows identical data. The key
    # embeds the schema, so changing columns or row count starts a fresh grid.
    editor_key = f"manual_entry_editor::{'|'.join(col_names)}::{int(num_rows)}"
    return st.data_editor(data, num_rows="dynamic", key=editor_key)
# Choices offered for missing-value handling. These are runtime strings that
# are both displayed and compared, so they are defined once and kept verbatim.
_NA_DROP = "์ œ๊ฑฐ"
_NA_MEAN = "ํ‰๊ท ์œผ๋กœ ๋Œ€์ฒด"
_NA_MEDIAN = "์ค‘์•™๊ฐ’์œผ๋กœ ๋Œ€์ฒด"
_NA_MODE = "์ตœ๋นˆ๊ฐ’์œผ๋กœ ๋Œ€์ฒด"


def _replacement_value(series, method):
    """Return the value used to fill gaps in *series*, or None if there is none."""
    if method == _NA_MODE:
        modes = series.mode()
        # mode() is empty when every entry is missing.
        return None if modes.empty else modes.iloc[0]

    def _center(values):
        return values.mean() if method == _NA_MEAN else values.median()

    try:
        value = _center(series)
    except (TypeError, ValueError):
        # Object-typed columns (e.g. hand-typed numbers) cannot be averaged
        # directly; retry on the entries that parse as numbers.
        value = _center(pd.to_numeric(series, errors='coerce'))
    return None if pd.isna(value) else value


def preprocess_data(data):
    """Handle missing values and convert object columns to numbers.

    For every column containing missing values the user picks, through a
    select box, whether to drop those rows or fill them with the column's
    mean, median or mode. Afterwards each object-typed column is passed to
    ``pd.to_numeric``; columns that cannot be converted are left unchanged.

    Args:
        data: Input ``pandas.DataFrame``. It is copied and never modified.

    Returns:
        The processed ``pandas.DataFrame``.
    """
    st.subheader("๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ")

    # Work on a copy so the caller's frame is left untouched.
    data = data.copy()

    # Missing-value handling
    if data.isnull().values.any():
        st.write("๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ:")
        for column in data.columns:
            if not data[column].isnull().any():
                continue
            method = st.selectbox(
                f"{column} ์—ด์˜ ์ฒ˜๋ฆฌ ๋ฐฉ๋ฒ• ์„ ํƒ:",
                [_NA_DROP, _NA_MEAN, _NA_MEDIAN, _NA_MODE],
            )
            if method == _NA_DROP:
                data = data.dropna(subset=[column])
                continue
            fill_value = _replacement_value(data[column], method)
            if fill_value is None:
                st.warning(
                    f"{column}: could not compute a replacement value for the "
                    "selected method; missing values were left as they are."
                )
                continue
            # Assign the result back instead of `fillna(inplace=True)` on a
            # column selection, which is chained assignment and may not
            # update the frame.
            data[column] = data[column].fillna(fill_value)

    # Data type conversion
    for column in data.columns:
        if data[column].dtype == 'object':
            try:
                converted = pd.to_numeric(data[column])
            except (ValueError, TypeError):
                st.write(f"{column} ์—ด์€ ๋ฒ”์ฃผํ˜•์œผ๋กœ ์œ ์ง€๋ฉ๋‹ˆ๋‹ค.")
            else:
                data[column] = converted
                st.write(f"{column} ์—ด์„ ์ˆซ์žํ˜•์œผ๋กœ ๋ณ€ํ™˜ํ–ˆ์Šต๋‹ˆ๋‹ค.")
    return data
# Column names this analysis looks for in the data. They are runtime keys and
# are reproduced exactly as they appear in the source.
# NOTE(review): the text looks mis-encoded in this view — confirm it matches
# the headers of the real data files.
_SUBJECT_COL = '๊ณผ๋ชฉ'        # grouping column; chosen via the "subject" select box
_SCORE_COL = 'ํ•™์Šตํ‰๊ฐ€'      # y axis of every chart below
_MONTH_COL = '๋‹ฌ'            # x axis of the line charts (presumably a month)
_EFFORT_COL = '์ž๊ธฐ๋…ธ๋ ฅ๋„'  # x axis of the scatter plots; drives the range slider


def _fit_regression(x, y):
    """Fit ``y = slope * x + intercept`` on the usable (x, y) pairs.

    Returns ``(slope, intercept, r_squared, x_min, x_max)``, or ``None`` when
    fewer than two numeric pairs remain or every x is identical, since no
    line can be fitted in those cases.
    """
    xs = pd.to_numeric(x, errors='coerce').to_numpy(dtype=float, na_value=np.nan)
    ys = pd.to_numeric(y, errors='coerce').to_numpy(dtype=float, na_value=np.nan)
    usable = ~(np.isnan(xs) | np.isnan(ys))
    xs, ys = xs[usable], ys[usable]
    if xs.size < 2 or xs.min() == xs.max():
        return None
    slope, intercept, r_value, _p_value, _std_err = stats.linregress(xs, ys)
    return slope, intercept, r_value ** 2, xs.min(), xs.max()


def _show_effort_scatter(points, title, color=None):
    """Draw the effort/score scatter for *points*, with a trend line if one fits."""
    hover = [_MONTH_COL] if _MONTH_COL in points.columns else None
    fig = px.scatter(points, x=_EFFORT_COL, y=_SCORE_COL, color=color, hover_data=hover)

    fit = _fit_regression(points[_EFFORT_COL], points[_SCORE_COL])
    if fit is None:
        # Not enough data for a regression: show the points without a line.
        fig.update_layout(title=title)
    else:
        slope, intercept, r_squared, x_low, x_high = fit
        line_x = np.array([x_low, x_high])
        line_y = slope * line_x + intercept
        fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='ํšŒ๊ท€์„ '))
        fig.update_layout(
            title=f'{title} (R-squared: {r_squared:.4f})',
            annotations=[
                dict(
                    x=0.5,
                    y=1.05,
                    xref='paper',
                    yref='paper',
                    text=f'R-squared: {r_squared:.4f}',
                    showarrow=False,
                )
            ],
        )
    st.plotly_chart(fig)


def perform_analysis(data):
    """Render the exploratory analysis of *data* in the Streamlit page.

    Always shows summary statistics and a correlation heatmap of the numeric
    columns. The remaining charts are drawn only when the columns they need
    are present: a box plot per subject, a line chart over the month column,
    scatter plots of effort against score with a regression line, a slider
    that filters by effort range, and a per-subject drill-down.

    Args:
        data: ``pandas.DataFrame`` to analyse.

    Returns:
        None. All output is written to the Streamlit page.
    """
    st.header("ํƒ์ƒ‰์  ๋ฐ์ดํ„ฐ ๋ถ„์„")

    # Summary statistics
    st.write("์š”์•ฝ ํ†ต๊ณ„:")
    st.write(data.describe())

    # Correlation heatmap; 'number' also admits int32/float32 and similar.
    st.write("์ƒ๊ด€๊ด€๊ณ„ ํžˆํŠธ๋งต:")
    numeric_data = data.select_dtypes(include='number')
    if not numeric_data.empty:
        fig = px.imshow(numeric_data.corr(), color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
        fig.update_layout(title='์ƒ๊ด€๊ด€๊ณ„ ํžˆํŠธ๋งต')
        st.plotly_chart(fig)
    else:
        st.write("์ƒ๊ด€๊ด€๊ณ„ ํžˆํŠธ๋งต์„ ๊ทธ๋ฆด ์ˆ˜ ์žˆ๋Š” ์ˆซ์žํ˜• ์—ด์ด ์—†์Šต๋‹ˆ๋‹ค.")

    has_subject = _SUBJECT_COL in data.columns
    has_score = _SCORE_COL in data.columns
    has_month = _MONTH_COL in data.columns
    has_effort = _EFFORT_COL in data.columns
    # Colour by subject only when that column exists, so its absence does
    # not make plotly raise.
    subject_color = _SUBJECT_COL if has_subject else None

    # Score distribution per subject
    if has_subject and has_score:
        st.write("๊ณผ๋ชฉ๋ณ„ ์ ์ˆ˜ ๋ถ„ํฌ:")
        fig = px.box(data, x=_SUBJECT_COL, y=_SCORE_COL, points="all")
        fig.update_layout(title='๊ณผ๋ชฉ๋ณ„ ํ•™์Šตํ‰๊ฐ€ ์ ์ˆ˜ ๋ถ„ํฌ')
        st.plotly_chart(fig)

    # Score trend over the month column
    if has_month and has_score:
        st.write("์›”๋ณ„ ์ ์ˆ˜ ์ถ”์ด:")
        fig = px.line(data, x=_MONTH_COL, y=_SCORE_COL, color=subject_color, markers=True)
        fig.update_layout(title='์›”๋ณ„ ํ•™์Šตํ‰๊ฐ€ ์ ์ˆ˜ ์ถ”์ด')
        st.plotly_chart(fig)

    # Effort vs. score over all rows (regression line and R-squared)
    if has_effort and has_score:
        st.write("์ž๊ธฐ๋…ธ๋ ฅ๋„์™€ ํ•™์Šตํ‰๊ฐ€ ๊ด€๊ณ„:")
        _show_effort_scatter(data, '์ž๊ธฐ๋…ธ๋ ฅ๋„์™€ ํ•™์Šตํ‰๊ฐ€ ๊ด€๊ณ„', color=subject_color)

    # Interactive filtering
    st.write("์ธํ„ฐ๋ž™ํ‹ฐ๋ธŒ ํ•„ํ„ฐ๋ง:")
    if has_effort:
        effort = pd.to_numeric(data[_EFFORT_COL], errors='coerce')
        if effort.notna().any():
            # floor/ceil rather than int(): truncating the maximum would make
            # rows above the truncated bound impossible to select.
            min_effort = int(np.floor(effort.min()))
            max_effort = int(np.ceil(effort.max()))
            if min_effort < max_effort:
                effort_range = st.slider(
                    "์ž๊ธฐ๋…ธ๋ ฅ๋„ ๋ฒ”์œ„ ์„ ํƒ",
                    min_effort,
                    max_effort,
                    (min_effort, max_effort),
                )
            else:
                # A zero-width range leaves nothing to choose; skip the slider.
                effort_range = (min_effort, max_effort)
            # between() is inclusive on both ends, like the >= / <= pair.
            filtered_data = data[effort.between(effort_range[0], effort_range[1])]
            if has_score:
                _show_effort_scatter(
                    filtered_data,
                    f'์ž๊ธฐ๋…ธ๋ ฅ๋„ {effort_range[0]}-{effort_range[1]} ๋ฒ”์œ„์˜ ํ•™์Šตํ‰๊ฐ€ ๊ด€๊ณ„',
                    color=subject_color,
                )

    # Per-subject drill-down
    if has_subject:
        st.write("๊ณผ๋ชฉ๋ณ„ ์ƒ์„ธ ๋ถ„์„:")
        selected_subject = st.selectbox("๋ถ„์„ํ•  ๊ณผ๋ชฉ ์„ ํƒ", data[_SUBJECT_COL].unique())
        subject_data = data[data[_SUBJECT_COL] == selected_subject]
        if has_month and has_score:
            fig = px.line(subject_data, x=_MONTH_COL, y=_SCORE_COL, markers=True)
            fig.update_layout(title=f'{selected_subject} ์›”๋ณ„ ํ•™์Šตํ‰๊ฐ€ ์ ์ˆ˜ ์ถ”์ด')
            st.plotly_chart(fig)
        if has_effort and has_score:
            _show_effort_scatter(
                subject_data,
                f'{selected_subject} ์ž๊ธฐ๋…ธ๋ ฅ๋„์™€ ํ•™์Šตํ‰๊ฐ€ ๊ด€๊ณ„',
            )
def main():
    """Render the app: pick an input method, preview/edit the data, analyse it.

    Data comes either from an uploaded CSV/XLS/XLSX file or from manual
    entry. Once data is available it is shown in an editable grid, and the
    start button launches preprocessing followed by the analysis.
    """
    st.title("์ธํ„ฐ๋ž™ํ‹ฐ๋ธŒ EDA ํˆดํ‚ท")

    data_input_method = st.radio("๋ฐ์ดํ„ฐ ์ž…๋ ฅ ๋ฐฉ๋ฒ• ์„ ํƒ:", ("ํŒŒ์ผ ์—…๋กœ๋“œ", "์ˆ˜๋™ ์ž…๋ ฅ"))
    if data_input_method == "ํŒŒ์ผ ์—…๋กœ๋“œ":
        uploaded_file = st.file_uploader("CSV, XLS, ๋˜๋Š” XLSX ํŒŒ์ผ์„ ์„ ํƒํ•˜์„ธ์š”", type=["csv", "xls", "xlsx"])
        data = load_data(uploaded_file) if uploaded_file is not None else None
    else:
        data = manual_data_entry()

    if data is None:
        # No data any more: forget an earlier "start" click so the analysis
        # does not reappear by itself when new data is provided.
        st.session_state["analysis_started"] = False
        return

    st.subheader("๋ฐ์ดํ„ฐ ๋ฏธ๋ฆฌ๋ณด๊ธฐ ๋ฐ ์ˆ˜์ •")
    st.write("๋ฐ์ดํ„ฐ๋ฅผ ํ™•์ธํ•˜๊ณ  ํ•„์š”ํ•œ ๊ฒฝ์šฐ ์ˆ˜์ •ํ•˜์„ธ์š”:")
    edited_data = st.data_editor(data, num_rows="dynamic")

    # st.button is True only on the rerun triggered by the click. The
    # preprocessing and analysis steps contain their own widgets, and using
    # any of them reruns the script with the button False again. Remembering
    # the click in session state keeps the analysis on screen.
    if st.button("๋ฐ์ดํ„ฐ ๋ถ„์„ ์‹œ์ž‘"):
        st.session_state["analysis_started"] = True

    if st.session_state.get("analysis_started", False):
        processed_data = preprocess_data(edited_data)
        perform_analysis(processed_data)
# Run the app only when this file is executed as the main script.
if __name__ == "__main__":
    main()