# Page residue captured together with the source (not part of the program):
# Spaces status: Runtime error
import os
from io import StringIO

import matplotlib.font_manager as fm
import numpy as np
import openpyxl
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from scipy import stats
# Korean font setup
def set_font(font_path="Pretendard-Bold.ttf"):
    """Register a font file with matplotlib and return matching rc settings.

    Args:
        font_path: Path of the font file to register. Defaults to
            ``"Pretendard-Bold.ttf"`` in the working directory; pass the
            real location of the font file if it lives elsewhere.

    Returns:
        A dict of matplotlib rc parameters. ``'axes.unicode_minus'`` is
        always ``False``. ``'font.family'`` is present only when the font
        file was found and registered.
    """
    if not os.path.isfile(font_path):
        # Without this guard a missing font file makes addfont() raise, and
        # because this function is called at module level the whole app
        # would fail to start.
        return {'axes.unicode_minus': False}

    fm.fontManager.addfont(font_path)
    # NOTE(review): the family name is hard-coded; confirm it matches the
    # family name stored inside the font file.
    return {'font.family': 'Pretendard-Bold', 'axes.unicode_minus': False}
# Load the font settings
# Evaluated once when the module is loaded; holds the dict returned by set_font().
font_settings = set_font()
def load_data(file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    The reader is chosen from the text after the last dot in ``file.name``
    (case-insensitive).

    Args:
        file: A file-like object exposing a ``name`` attribute.

    Returns:
        The parsed DataFrame, or ``None`` (after showing a Streamlit error)
        when the extension is not csv, xls or xlsx.
    """
    readers = {
        'csv': pd.read_csv,
        'xls': pd.read_excel,
        'xlsx': pd.read_excel,
    }
    suffix = file.name.rsplit('.', 1)[-1].lower()
    reader = readers.get(suffix)
    if reader is None:
        st.error("์ง์๋์ง ์๋ ํ์ผ ํ์์ ๋๋ค. CSV, XLS, ๋๋ XLSX ํ์ผ์ ์ ๋ก๋ํด์ฃผ์ธ์.")
        return None
    return reader(file)
def manual_data_entry():
    """Let the user define columns and type data into an editable grid.

    Column names are read from a comma-separated text input; blank names
    are discarded. When at least one name remains, an empty frame with the
    requested number of rows is shown in a Streamlit data editor.

    Returns:
        The frame returned by the data editor, or ``None`` when no column
        name has been entered.
    """
    st.subheader("์๋ ๋ฐ์ดํฐ ์ ๋ ฅ")
    raw_names = st.text_input("์ด ์ด๋ฆ์ ์ผํ๋ก ๊ตฌ๋ถํ์ฌ ์ ๋ ฅํ์ธ์:")
    stripped = (part.strip() for part in raw_names.split(','))
    columns = [name for name in stripped if name]
    if not columns:
        return None

    row_count = st.number_input("์ด๊ธฐ ํ์ ์๋ฅผ ์ ๋ ฅํ์ธ์:", min_value=1, value=5)
    blank_frame = pd.DataFrame(columns=columns, index=range(row_count))
    return st.data_editor(blank_frame, num_rows="dynamic")
def preprocess_data(data):
    """Handle missing values and convert object columns to numeric.

    For every column that contains missing values the user picks a strategy
    in a select box: drop the rows, or fill with the mean, median or mode.
    Afterwards each object-dtype column is converted with ``pd.to_numeric``
    when every value parses; otherwise it is left unchanged.

    Args:
        data: The DataFrame to clean. It is not modified; a copy is used.

    Returns:
        The cleaned DataFrame.
    """
    st.subheader("๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ")

    # Work on a copy so the caller's frame is never changed as a side effect.
    data = data.copy()

    # Each option label is defined once so the select box and the dispatch
    # below can never drift apart.
    drop_rows = "์ ๊ฑฐ"
    fill_mean = "ํ๊ท ์ผ๋ก ๋์ฒด"
    fill_median = "์ค์๊ฐ์ผ๋ก ๋์ฒด"
    fill_mode = "์ต๋น๊ฐ์ผ๋ก ๋์ฒด"

    # Missing-value handling
    if data.isnull().sum().sum() > 0:
        st.write("๊ฒฐ์ธก์น ์ฒ๋ฆฌ:")
        for column in data.columns:
            if data[column].isnull().sum() == 0:
                continue

            method = st.selectbox(f"{column} ์ด์ ์ฒ๋ฆฌ ๋ฐฉ๋ฒ ์ ํ:",
                                  [drop_rows, fill_mean, fill_median, fill_mode])
            if method == drop_rows:
                data = data.dropna(subset=[column])
                continue

            if method == fill_mode:
                candidates = data[column].mode()
                # mode() is empty when the column holds only missing values.
                fill_value = candidates.iloc[0] if not candidates.empty else None
            else:
                # Mean/median only make sense on numbers; coercing keeps a
                # text column from raising inside the aggregation.
                numeric = pd.to_numeric(data[column], errors='coerce')
                if method == fill_mean:
                    fill_value = numeric.mean()
                elif method == fill_median:
                    fill_value = numeric.median()
                else:
                    fill_value = None

            if fill_value is None or pd.isna(fill_value):
                # No usable replacement exists; leave the column as it is.
                continue

            # Assign the result back instead of calling fillna(inplace=True)
            # on data[column]: the chained in-place form acts on a temporary
            # object and is not guaranteed to update the frame.
            data[column] = data[column].fillna(fill_value)

    # Data type conversion
    for column in data.columns:
        if data[column].dtype == 'object':
            try:
                converted = pd.to_numeric(data[column])
            except (ValueError, TypeError):
                st.write(f"{column} ์ด์ ๋ฒ์ฃผํ์ผ๋ก ์ ์ง๋ฉ๋๋ค.")
            else:
                data[column] = converted
                st.write(f"{column} ์ด์ ์ซ์ํ์ผ๋ก ๋ณํํ์ต๋๋ค.")

    return data
# Column names and labels the analysis looks up; defined once so every chart
# below uses exactly the same spelling.
_COL_SUBJECT = '๊ณผ๋ชฉ'
_COL_SCORE = 'ํ์ตํ๊ฐ'
_COL_MONTH = '๋ฌ'
_COL_EFFORT = '์๊ธฐ๋ ธ๋ ฅ๋'
_REGRESSION_TRACE_NAME = 'ํ๊ท์ '


def _fit_regression(x, y):
    """Return ``(line_x, line_y, r_squared)`` for a linear fit, or ``None``.

    ``None`` is returned when fewer than two usable (numeric, non-missing)
    points remain or when every x value is identical, because no line can
    be fitted in those cases.
    """
    xs = pd.to_numeric(x, errors='coerce')
    ys = pd.to_numeric(y, errors='coerce')
    usable = xs.notna() & ys.notna()
    xs, ys = xs[usable], ys[usable]
    if len(xs) < 2 or xs.nunique() < 2:
        return None

    slope, intercept, r_value, _p_value, _std_err = stats.linregress(xs, ys)
    line_x = np.array([xs.min(), xs.max()])
    return line_x, slope * line_x + intercept, r_value ** 2


def _show_effort_scatter(frame, title_prefix, color_by_subject=True):
    """Render a scatter of the two score columns, adding a regression line when one can be fitted."""
    columns = frame.columns
    color = _COL_SUBJECT if color_by_subject and _COL_SUBJECT in columns else None
    hover = [_COL_MONTH] if _COL_MONTH in columns else None
    fig = px.scatter(frame, x=_COL_EFFORT, y=_COL_SCORE, color=color, hover_data=hover)

    fit = _fit_regression(frame[_COL_EFFORT], frame[_COL_SCORE])
    if fit is None:
        # Not enough data for a fit: show the points with a plain title.
        fig.update_layout(title=title_prefix)
    else:
        line_x, line_y, r_squared = fit
        fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines',
                                 name=_REGRESSION_TRACE_NAME))
        fig.update_layout(
            title=f'{title_prefix} (R-squared: {r_squared:.4f})',
            annotations=[
                dict(
                    x=0.5,
                    y=1.05,
                    xref='paper',
                    yref='paper',
                    text=f'R-squared: {r_squared:.4f}',
                    showarrow=False,
                )
            ],
        )
    st.plotly_chart(fig)


def perform_analysis(data):
    """Render the exploratory analysis of ``data`` in Streamlit.

    Shows summary statistics and a correlation heatmap of the numeric
    columns. Depending on which of the expected columns are present it also
    shows a box plot, line charts, scatter plots with a regression line, a
    slider-driven filtered scatter, and a per-subject drill-down.

    Args:
        data: The DataFrame to analyse.

    Returns:
        None. All output is written to the Streamlit page.
    """
    st.header("ํ์์ ๋ฐ์ดํฐ ๋ถ์")
    columns = data.columns
    has_subject = _COL_SUBJECT in columns
    has_score = _COL_SCORE in columns
    has_month = _COL_MONTH in columns
    has_effort = _COL_EFFORT in columns

    # Summary statistics
    st.write("์์ฝ ํต๊ณ:")
    st.write(data.describe())

    # Correlation heatmap
    st.write("์๊ด๊ด๊ณ ํํธ๋งต:")
    # 'number' covers every numeric width, not only int64/float64.
    numeric_data = data.select_dtypes(include='number')
    if not numeric_data.empty:
        fig = px.imshow(numeric_data.corr(), color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
        fig.update_layout(title='์๊ด๊ด๊ณ ํํธ๋งต')
        st.plotly_chart(fig)
    else:
        st.write("์๊ด๊ด๊ณ ํํธ๋งต์ ๊ทธ๋ฆด ์ ์๋ ์ซ์ํ ์ด์ด ์์ต๋๋ค.")

    # Score distribution per subject
    if has_subject and has_score:
        st.write("๊ณผ๋ชฉ๋ณ ์ ์ ๋ถํฌ:")
        fig = px.box(data, x=_COL_SUBJECT, y=_COL_SCORE, points="all")
        fig.update_layout(title='๊ณผ๋ชฉ๋ณ ํ์ตํ๊ฐ ์ ์ ๋ถํฌ')
        st.plotly_chart(fig)

    # Score trend per month
    if has_month and has_score:
        st.write("์๋ณ ์ ์ ์ถ์ด:")
        # Colour by subject only when that column exists, so the chart does
        # not fail on data without it.
        fig = px.line(data, x=_COL_MONTH, y=_COL_SCORE,
                      color=_COL_SUBJECT if has_subject else None, markers=True)
        fig.update_layout(title='์๋ณ ํ์ตํ๊ฐ ์ ์ ์ถ์ด')
        st.plotly_chart(fig)

    # Effort vs. score, with regression line and R-squared
    if has_effort and has_score:
        st.write("์๊ธฐ๋ ธ๋ ฅ๋์ ํ์ตํ๊ฐ ๊ด๊ณ:")
        _show_effort_scatter(data, '์๊ธฐ๋ ธ๋ ฅ๋์ ํ์ตํ๊ฐ ๊ด๊ณ')

    # Interactive filtering
    st.write("์ธํฐ๋ํฐ๋ธ ํํฐ๋ง:")
    if has_effort:
        effort = pd.to_numeric(data[_COL_EFFORT], errors='coerce')
        if effort.notna().any():
            # floor/ceil instead of int(): truncating the maximum would
            # exclude fractional values even with the full range selected.
            min_effort = int(np.floor(effort.min()))
            max_effort = int(np.ceil(effort.max()))
            if min_effort == max_effort:
                # Widen a degenerate range so the slider has a real span.
                max_effort = min_effort + 1
            effort_range = st.slider("์๊ธฐ๋ ธ๋ ฅ๋ ๋ฒ์ ์ ํ", min_effort, max_effort,
                                     (min_effort, max_effort))
            filtered_data = data[effort.between(effort_range[0], effort_range[1])]
            if has_score:
                _show_effort_scatter(
                    filtered_data,
                    f'์๊ธฐ๋ ธ๋ ฅ๋ {effort_range[0]}-{effort_range[1]} ๋ฒ์์ ํ์ตํ๊ฐ ๊ด๊ณ',
                )

    # Per-subject drill-down
    if has_subject:
        st.write("๊ณผ๋ชฉ๋ณ ์์ธ ๋ถ์:")
        selected_subject = st.selectbox("๋ถ์ํ ๊ณผ๋ชฉ ์ ํ", data[_COL_SUBJECT].unique())
        subject_data = data[data[_COL_SUBJECT] == selected_subject]
        if has_month and has_score:
            fig = px.line(subject_data, x=_COL_MONTH, y=_COL_SCORE, markers=True)
            fig.update_layout(title=f'{selected_subject} ์๋ณ ํ์ตํ๊ฐ ์ ์ ์ถ์ด')
            st.plotly_chart(fig)
        if has_effort and has_score:
            _show_effort_scatter(
                subject_data,
                f'{selected_subject} ์๊ธฐ๋ ธ๋ ฅ๋์ ํ์ตํ๊ฐ ๊ด๊ณ',
                color_by_subject=False,
            )
# Session-state key remembering that the user has started the analysis.
_ANALYSIS_STARTED_KEY = "analysis_started"


def main():
    """Run the Streamlit app: collect data, let the user edit it, analyse it.

    Data comes either from an uploaded file or from manual entry. Once data
    is available it is shown in an editable grid, and pressing the start
    button runs preprocessing followed by the analysis.
    """
    st.title("์ธํฐ๋ํฐ๋ธ EDA ํดํท")

    upload_option = "ํ์ผ ์ ๋ก๋"
    manual_option = "์๋ ์ ๋ ฅ"
    data_input_method = st.radio("๋ฐ์ดํฐ ์ ๋ ฅ ๋ฐฉ๋ฒ ์ ํ:", (upload_option, manual_option))

    if data_input_method == upload_option:
        uploaded_file = st.file_uploader("CSV, XLS, ๋๋ XLSX ํ์ผ์ ์ ํํ์ธ์",
                                         type=["csv", "xls", "xlsx"])
        data = load_data(uploaded_file) if uploaded_file is not None else None
    else:
        data = manual_data_entry()

    if data is None:
        # No data on screen: require the start button again next time.
        st.session_state[_ANALYSIS_STARTED_KEY] = False
        return

    st.subheader("๋ฐ์ดํฐ ๋ฏธ๋ฆฌ๋ณด๊ธฐ ๋ฐ ์์ ")
    st.write("๋ฐ์ดํฐ๋ฅผ ํ์ธํ๊ณ ํ์ํ ๊ฒฝ์ฐ ์์ ํ์ธ์:")
    edited_data = st.data_editor(data, num_rows="dynamic")

    # A button is True only on the rerun triggered by its own click. The
    # analysis contains widgets (select boxes, a slider); touching any of
    # them reruns the script with the button False again, which would hide
    # the analysis. Remembering the click in session state keeps it visible.
    if st.button("๋ฐ์ดํฐ ๋ถ์ ์์"):
        st.session_state[_ANALYSIS_STARTED_KEY] = True

    if st.session_state.get(_ANALYSIS_STARTED_KEY, False):
        processed_data = preprocess_data(edited_data)
        perform_analysis(processed_data)


if __name__ == "__main__":
    main()