Spaces:

JUNGU
/

PPPDC_example

Build error

App Files Files Community

PPPDC_example / app.py

JUNGU

Update app.py

eacbd49 verified 12 months ago

raw

history blame

6.76 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import plotly.express as px
	import plotly.graph_objects as go
	from io import StringIO
	import openpyxl
	import matplotlib.font_manager as fm
	from scipy import stats

	# 한글 폰트 설정
	def set_font():
	    """Register the Korean font with matplotlib and return font settings.

	    Returns:
	        dict: rcParams-style settings. ``'axes.unicode_minus'`` is always
	        ``False``. ``'font.family'`` is included only when the font file
	        was found and registered.
	    """
	    import os
	    import warnings

	    font_path = "Pretendard-Bold.ttf"  # Replace with the real font file path.

	    # This function is called at module level, so a missing font file would
	    # otherwise abort the whole app before any UI is rendered.
	    if not os.path.isfile(font_path):
	        warnings.warn(
	            f"Font file {font_path!r} not found; falling back to the default font."
	        )
	        return {'axes.unicode_minus': False}

	    fm.fontManager.addfont(font_path)
	    return {'font.family': 'Pretendard-Bold', 'axes.unicode_minus': False}

	# Fetch the font settings once, when the module is loaded.
	# NOTE(review): font_settings is not referenced anywhere else in the visible
	# file — confirm whether it is still needed.
	font_settings = set_font()

	def load_data(file):
	    """Read an uploaded file into a DataFrame based on its extension.

	    Args:
	        file: File-like object exposing a ``name`` attribute; the text after
	            the last ``.`` (case-insensitive) selects the parser.

	    Returns:
	        The parsed DataFrame for ``csv``, ``xls`` or ``xlsx`` files. For any
	        other extension an error is shown through Streamlit and ``None`` is
	        returned.
	    """
	    suffix = file.name.rsplit('.', 1)[-1].lower()

	    if suffix == 'csv':
	        return pd.read_csv(file)

	    if suffix in ('xls', 'xlsx'):
	        return pd.read_excel(file)

	    st.error("지원되지 않는 파일 형식입니다. CSV, XLS, 또는 XLSX 파일을 업로드해주세요.")
	    return None

	def manual_data_entry():
	    """Let the user define columns and fill in a table by hand.

	    Returns:
	        The value returned by ``st.data_editor`` for the blank table, or
	        ``None`` when no non-blank column name has been entered.
	    """
	    st.subheader("수동 데이터 입력")
	    raw_names = st.text_input("열 이름을 쉼표로 구분하여 입력하세요:")
	    col_names = [part.strip() for part in raw_names.split(',') if part.strip()]

	    # Nothing to edit until at least one column name has been provided.
	    if not col_names:
	        return None

	    num_rows = st.number_input("초기 행의 수를 입력하세요:", min_value=1, value=5)
	    blank_table = pd.DataFrame(index=range(num_rows), columns=col_names)
	    return st.data_editor(blank_table, num_rows="dynamic")

	def preprocess_data(data):
	    """Interactively handle missing values, then coerce text columns to numbers.

	    For every column that still contains missing values the user picks a
	    strategy (drop rows, or fill with mean / median / mode). Afterwards each
	    object-dtype column is converted with ``pd.to_numeric`` when possible.

	    Args:
	        data: Input DataFrame. It is copied first, so the caller's object is
	            not modified.

	    Returns:
	        The preprocessed DataFrame.
	    """
	    st.subheader("데이터 전처리")

	    # Work on a copy so fills never write into the caller's DataFrame.
	    data = data.copy()

	    # Missing-value handling
	    if data.isnull().values.any():
	        st.write("결측치 처리:")
	        for column in data.columns:
	            # Re-check on the current frame: dropping rows for an earlier
	            # column may already have removed this column's missing values.
	            if not data[column].isnull().any():
	                continue

	            method = st.selectbox(
	                f"{column} 열의 처리 방법 선택:",
	                ["제거", "평균으로 대체", "중앙값으로 대체", "최빈값으로 대체"],
	            )

	            if method == "제거":
	                data = data.dropna(subset=[column])
	                continue

	            if method == "최빈값으로 대체":
	                modes = data[column].mode()
	                # mode() is empty when every value in the column is missing.
	                if modes.empty:
	                    st.warning(f"{column} 열에는 최빈값이 없어 결측치를 대체할 수 없습니다.")
	                    continue
	                fill_value = modes.iloc[0]
	            else:
	                try:
	                    if method == "평균으로 대체":
	                        fill_value = data[column].mean()
	                    else:
	                        fill_value = data[column].median()
	                except (TypeError, ValueError):
	                    # Mean/median are undefined for non-numeric columns.
	                    st.warning(f"{column} 열은 숫자형이 아니므로 {method} 방법을 적용할 수 없습니다.")
	                    continue

	            # Assign the result back instead of using inplace=True on a
	            # column selection, which may operate on a temporary copy.
	            data[column] = data[column].fillna(fill_value)

	    # Data type conversion
	    for column in data.columns:
	        if data[column].dtype == 'object':
	            try:
	                converted = pd.to_numeric(data[column])
	            except (ValueError, TypeError):
	                st.write(f"{column} 열은 범주형으로 유지됩니다.")
	            else:
	                data[column] = converted
	                st.write(f"{column} 열을 숫자형으로 변환했습니다.")

	    return data

	def create_slicers(data):
	    """Create a multiselect filter for each low-cardinality categorical column.

	    Args:
	        data: DataFrame whose object/category columns are inspected.

	    Returns:
	        dict: Maps column name to the values selected in its multiselect.
	        Columns with more than 10 unique values get no slicer.
	    """
	    slicers = {}
	    categorical_columns = data.select_dtypes(include=['object', 'category']).columns

	    for col in categorical_columns:
	        # Only build a slicer when there are at most 10 unique values.
	        if data[col].nunique() > 10:
	            continue

	        values = data[col].unique()
	        try:
	            options = sorted(values)
	        except TypeError:
	            # Mixed types (e.g. strings alongside NaN or numbers) cannot be
	            # compared directly; order them by their text form instead.
	            options = sorted(values, key=str)

	        slicers[col] = st.multiselect(f"{col} 선택", options=options, default=list(options))

	    return slicers

	def apply_slicers(data, slicers):
	    """Filter rows so each sliced column only holds its selected values.

	    Args:
	        data: DataFrame to filter.
	        slicers: Mapping of column name to the selected values. A column
	            whose selection is empty is not filtered.

	    Returns:
	        The filtered DataFrame (``data`` itself when no filter applies).
	    """
	    active = {col: chosen for col, chosen in slicers.items() if chosen}

	    result = data
	    for col in active:
	        keep = result[col].isin(active[col])
	        result = result.loc[keep]
	    return result


	def perform_analysis(data):
	    """Render the exploratory analysis: slicers, summary, heatmap, regression.

	    The data is first filtered through the categorical slicers. Summary
	    statistics and a correlation heatmap are then shown, followed by a
	    scatter plot with a fitted regression line for two user-selected
	    numeric columns.

	    Args:
	        data: Preprocessed DataFrame to analyse.

	    Returns:
	        None. All output is rendered through Streamlit.
	    """
	    st.header("탐색적 데이터 분석")

	    # Build the slicers and apply them.
	    slicers = create_slicers(data)
	    filtered_data = apply_slicers(data, slicers)

	    # Summary statistics
	    st.write("요약 통계:")
	    st.write(filtered_data.describe())

	    # Select every numeric dtype, not only float64/int64, so columns such as
	    # int32 or float32 are not silently ignored.
	    numeric_data = filtered_data.select_dtypes(include='number')

	    # Correlation heatmap
	    st.write("상관관계 히트맵:")
	    if not numeric_data.empty:
	        fig = px.imshow(numeric_data.corr(), color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
	        fig.update_layout(title='상관관계 히트맵')
	        st.plotly_chart(fig)
	    else:
	        st.write("상관관계 히트맵을 그릴 수 있는 숫자형 열이 없습니다.")

	    # Scatter plot and regression for two user-selected variables
	    st.subheader("두 변수 간의 관계 분석")
	    numeric_columns = numeric_data.columns
	    x_var = st.selectbox("X축 변수 선택", options=numeric_columns)
	    y_var = st.selectbox("Y축 변수 선택", options=[col for col in numeric_columns if col != x_var])

	    # Compare against None rather than truthiness: a column label such as 0
	    # is falsy but still a valid selection.
	    if x_var is None or y_var is None:
	        return

	    # Points are coloured by the '반' column when the data has one.
	    fig = px.scatter(filtered_data, x=x_var, y=y_var, color='반' if '반' in filtered_data.columns else None)

	    # Fit only on rows where both values are present; NaNs would otherwise
	    # propagate into every regression statistic.
	    pairs = filtered_data[[x_var, y_var]].dropna()
	    if len(pairs) < 2 or pairs[x_var].nunique() < 2:
	        # A line cannot be fitted to fewer than two points or a constant x.
	        st.plotly_chart(fig)
	        st.warning("회귀선을 계산하기에 유효한 데이터가 충분하지 않습니다.")
	        return

	    x = pairs[x_var].astype(float)
	    y = pairs[y_var].astype(float)
	    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

	    # Add the regression line across the observed x range.
	    line_x = np.array([x.min(), x.max()])
	    line_y = slope * line_x + intercept
	    fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='회귀선'))

	    r_squared = r_value ** 2
	    fig.update_layout(
	        title=f'{x_var}와 {y_var}의 관계 (R-squared: {r_squared:.4f})',
	        xaxis_title=x_var,
	        yaxis_title=y_var,
	        annotations=[
	            dict(
	                x=0.5,
	                y=1.05,
	                xref='paper',
	                yref='paper',
	                text=f'R-squared: {r_squared:.4f}',
	                showarrow=False,
	            )
	        ],
	    )
	    st.plotly_chart(fig)

	    # Additional statistics
	    st.write(f"상관계수: {r_value:.4f}")
	    st.write(f"p-value: {p_value:.4f}")
	    st.write(f"표준 오차: {std_err:.4f}")

	def main():
	    """Run the Streamlit app: collect data, let the user edit it, analyse it.

	    The data comes either from an uploaded CSV/XLS/XLSX file or from manual
	    entry. Once the user presses the start button, preprocessing and
	    analysis are rendered on every rerun until the data source goes away.
	    """
	    st.title("인터랙티브 EDA 툴킷")

	    data_input_method = st.radio("데이터 입력 방법 선택:", ("파일 업로드", "수동 입력"))

	    data = None
	    if data_input_method == "파일 업로드":
	        uploaded_file = st.file_uploader("CSV, XLS, 또는 XLSX 파일을 선택하세요", type=["csv", "xls", "xlsx"])
	        if uploaded_file is not None:
	            data = load_data(uploaded_file)
	    else:
	        data = manual_data_entry()

	    if data is None:
	        # No data source: forget any earlier "start analysis" click so a new
	        # dataset does not get analysed before the user asks for it.
	        st.session_state["analysis_started"] = False
	        return

	    st.subheader("데이터 미리보기 및 수정")
	    st.write("데이터를 확인하고 필요한 경우 수정하세요:")
	    edited_data = st.data_editor(data, num_rows="dynamic")

	    # st.button is True only for the rerun triggered by the click. The
	    # preprocessing and analysis steps contain their own widgets, and touching
	    # any of them causes a rerun in which the button is False again. Persist
	    # the click in session state so the analysis stays on screen.
	    if st.button("데이터 분석 시작"):
	        st.session_state["analysis_started"] = True

	    if st.session_state.get("analysis_started", False):
	        processed_data = preprocess_data(edited_data)
	        perform_analysis(processed_data)

	# Entry point: call main() only when this file is run as the main module.
	if __name__ == "__main__":
	    main()