Spaces:

JUNGU
/

PPPDC_example

Runtime error

App Files Files Community

PPPDC_example / app.py

JUNGU

Update app.py

23711c4 verified 6 months ago

raw

history blame

9.72 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import plotly.express as px
	import plotly.graph_objects as go
	from io import StringIO
	import openpyxl
	import matplotlib.font_manager as fm
	from scipy import stats

	# Korean font setup.
	def set_font(font_path="Pretendard-Bold.ttf"):
	    """Register a font file with matplotlib and build matching rc settings.

	    Args:
	        font_path: Path of the font file to register.

	    Returns:
	        dict: ``{'font.family': <name>, 'axes.unicode_minus': False}``.
	        ``<name>`` is the family name read from the font file, or
	        ``'sans-serif'`` when the file could not be loaded.
	    """
	    import warnings

	    settings = {'font.family': 'sans-serif', 'axes.unicode_minus': False}
	    try:
	        fm.fontManager.addfont(font_path)
	        # Ask the font itself for its family name: matplotlib registers the
	        # font under that name, which need not equal the file's base name.
	        settings['font.family'] = fm.FontProperties(fname=font_path).get_name()
	    except (OSError, RuntimeError) as exc:
	        # This runs at import time, so a missing or unreadable font must not
	        # abort the whole app; keep the generic family instead.
	        warnings.warn(f"Could not load font {font_path!r}: {exc}")
	    return settings

	# Build the font settings once, when the module is loaded.
	# NOTE(review): font_settings is not referenced anywhere else in the visible
	# source — confirm it is applied somewhere (e.g. to matplotlib rcParams).
	font_settings = set_font()

	def load_data(file):
	    """Read an uploaded CSV or Excel file into a DataFrame.

	    Args:
	        file: File-like object with a ``name`` attribute; the extension of
	            that name selects the parser.

	    Returns:
	        pandas.DataFrame with the parsed contents, or None (after showing
	        an error in the UI) when the extension is unsupported or the file
	        cannot be parsed.
	    """
	    # rpartition tells "no dot at all" apart from a real extension, so a
	    # file literally named "csv" is not mistaken for a CSV file.
	    _, dot, suffix = file.name.rpartition('.')
	    file_extension = suffix.lower() if dot else ''

	    if file_extension == 'csv':
	        reader = pd.read_csv
	    elif file_extension in ('xls', 'xlsx'):
	        reader = pd.read_excel
	    else:
	        st.error("지원되지 않는 파일 형식입니다. CSV, XLS, 또는 XLSX 파일을 업로드해주세요.")
	        return None

	    try:
	        return reader(file)
	    except (ValueError, ImportError) as exc:
	        # ValueError covers pandas' empty-file, parser and decoding errors;
	        # ImportError is what read_excel raises when the optional engine
	        # for the given Excel format is not installed.
	        st.error(f"파일을 읽는 중 오류가 발생했습니다: {exc}")
	        return None

	def manual_data_entry():
	    """Render widgets that let the user type a table in by hand.

	    The user enters comma-separated column names and an initial row count;
	    an empty frame of that shape is then shown in an editable grid.

	    Returns:
	        The value returned by ``st.data_editor`` for that grid, or None
	        while no column name has been entered.
	    """
	    st.subheader("수동 데이터 입력")
	    raw_names = st.text_input("열 이름을 쉼표로 구분하여 입력하세요:")
	    stripped = (name.strip() for name in raw_names.split(','))
	    # dict.fromkeys drops repeated names while keeping first-seen order;
	    # the data editor requires unique column names.
	    col_names = list(dict.fromkeys(name for name in stripped if name))
	    if not col_names:
	        return None

	    num_rows = st.number_input("초기 행의 수를 입력하세요:", min_value=1, value=5)
	    data = pd.DataFrame(columns=col_names, index=range(num_rows))
	    return st.data_editor(data, num_rows="dynamic")

	def preprocess_data(data):
	    """Convert column types and impute missing values interactively.

	    Object columns are converted to numeric where every value parses. Then,
	    for each column that still has missing values, a select box offers
	    dropping the affected rows or filling with the mean, median or mode.

	    Args:
	        data: pandas.DataFrame to clean. It is not modified.

	    Returns:
	        pandas.DataFrame: a cleaned copy of ``data``.
	    """
	    st.subheader("데이터 전처리")

	    # Work on a copy so the caller's frame is left untouched.
	    data = data.copy()

	    # Type conversion runs first so that mean/median below see real numbers
	    # instead of failing on object columns that merely hold numeric text.
	    for column in data.columns:
	        if data[column].dtype == 'object':
	            try:
	                data[column] = pd.to_numeric(data[column])
	                st.write(f"{column} 열을 숫자형으로 변환했습니다.")
	            except (ValueError, TypeError):
	                st.write(f"{column} 열은 범주형으로 유지됩니다.")

	    # Missing-value handling.
	    if data.isnull().sum().sum() > 0:
	        st.write("결측치 처리:")
	        for column in data.columns:
	            # Re-checked per column: dropping rows for an earlier column can
	            # remove the missing values of a later one.
	            if not data[column].isnull().any():
	                continue
	            method = st.selectbox(f"{column} 열의 처리 방법 선택:",
	                                  ["제거", "평균으로 대체", "중앙값으로 대체", "최빈값으로 대체"])
	            if method == "제거":
	                data = data.dropna(subset=[column])
	            elif method in ("평균으로 대체", "중앙값으로 대체"):
	                if not pd.api.types.is_numeric_dtype(data[column]):
	                    st.warning(f"{column} 열은 숫자형이 아니어서 평균 또는 중앙값으로 대체할 수 없습니다.")
	                    continue
	                if method == "평균으로 대체":
	                    fill_value = data[column].mean()
	                else:
	                    fill_value = data[column].median()
	                # Assign the result back rather than using inplace=True on
	                # a column selection, which may act on a temporary copy.
	                data[column] = data[column].fillna(fill_value)
	            elif method == "최빈값으로 대체":
	                modes = data[column].mode()
	                # mode() is empty when every value in the column is missing.
	                if not modes.empty:
	                    data[column] = data[column].fillna(modes.iloc[0])

	    return data

	def _fit_effort_regression(frame):
	    """Fit 학습평가 against 자기노력도 by least squares.

	    Returns:
	        ``(line_x, line_y, r_squared)`` for drawing the fitted line, or None
	        when fewer than two distinct numeric x values are available (the
	        fit is undefined there and ``stats.linregress`` would raise).
	    """
	    pairs = frame[['자기노력도', '학습평가']].apply(pd.to_numeric, errors='coerce').dropna()
	    if len(pairs) < 2 or pairs['자기노력도'].nunique() < 2:
	        return None
	    slope, intercept, r_value, _p_value, _std_err = stats.linregress(
	        pairs['자기노력도'], pairs['학습평가'])
	    line_x = np.array([pairs['자기노력도'].min(), pairs['자기노력도'].max()])
	    return line_x, slope * line_x + intercept, r_value ** 2


	def _plot_effort_scatter(frame, title, color_by_subject=True):
	    """Show an effort-vs-score scatter, with a regression line when one exists.

	    Args:
	        frame: DataFrame containing the 자기노력도 and 학습평가 columns.
	        title: Chart title; the R-squared value is appended when a fit exists.
	        color_by_subject: Colour points by 과목 when that column is present.
	    """
	    color = '과목' if color_by_subject and '과목' in frame.columns else None
	    hover = ['달'] if '달' in frame.columns else None
	    fig = px.scatter(frame, x='자기노력도', y='학습평가', color=color, hover_data=hover)

	    fit = _fit_effort_regression(frame)
	    if fit is None:
	        fig.update_layout(title=title)
	    else:
	        line_x, line_y, r_squared = fit
	        fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='회귀선'))
	        fig.update_layout(
	            title=f'{title} (R-squared: {r_squared:.4f})',
	            annotations=[
	                dict(
	                    x=0.5,
	                    y=1.05,
	                    xref='paper',
	                    yref='paper',
	                    text=f'R-squared: {r_squared:.4f}',
	                    showarrow=False,
	                )
	            ],
	        )
	    st.plotly_chart(fig)


	def perform_analysis(data):
	    """Render the exploratory-analysis section for ``data``.

	    Always shows summary statistics and a correlation heatmap of the numeric
	    columns. The remaining charts are shown only when the columns they need
	    (과목, 달, 학습평가, 자기노력도) are present.

	    Args:
	        data: pandas.DataFrame to analyse.
	    """
	    st.header("탐색적 데이터 분석")

	    has_subject = '과목' in data.columns
	    has_month = '달' in data.columns
	    has_score = '학습평가' in data.columns
	    has_effort = '자기노력도' in data.columns

	    # Summary statistics.
	    st.write("요약 통계:")
	    st.write(data.describe())

	    # Correlation heatmap over every numeric column, whatever its width.
	    st.write("상관관계 히트맵:")
	    numeric_data = data.select_dtypes(include='number')
	    if not numeric_data.empty:
	        fig = px.imshow(numeric_data.corr(), color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
	        fig.update_layout(title='상관관계 히트맵')
	        st.plotly_chart(fig)
	    else:
	        st.write("상관관계 히트맵을 그릴 수 있는 숫자형 열이 없습니다.")

	    # Score distribution per subject.
	    if has_subject and has_score:
	        st.write("과목별 점수 분포:")
	        fig = px.box(data, x='과목', y='학습평가', points="all")
	        fig.update_layout(title='과목별 학습평가 점수 분포')
	        st.plotly_chart(fig)

	    # Monthly score trend; split by subject only if that column exists.
	    if has_month and has_score:
	        st.write("월별 점수 추이:")
	        fig = px.line(data, x='달', y='학습평가',
	                      color='과목' if has_subject else None, markers=True)
	        fig.update_layout(title='월별 학습평가 점수 추이')
	        st.plotly_chart(fig)

	    # Effort vs. score, with regression line and R-squared.
	    if has_effort and has_score:
	        st.write("자기노력도와 학습평가 관계:")
	        _plot_effort_scatter(data, '자기노력도와 학습평가 관계')

	    # Interactive filtering by effort range.
	    st.write("인터랙티브 필터링:")
	    if has_effort:
	        effort = pd.to_numeric(data['자기노력도'], errors='coerce')
	        if effort.notna().any():
	            # floor/ceil so the integer slider bounds enclose fractional values.
	            min_effort = int(np.floor(effort.min()))
	            max_effort = int(np.ceil(effort.max()))
	            if min_effort < max_effort:
	                effort_range = st.slider("자기노력도 범위 선택", min_effort, max_effort,
	                                         (min_effort, max_effort))
	            else:
	                # Nothing to select when all values coincide.
	                # NOTE(review): st.slider is believed to reject
	                # min_value == max_value — confirm for the deployed version.
	                effort_range = (min_effort, max_effort)

	            filtered_data = data[effort.between(effort_range[0], effort_range[1])]

	            if has_score:
	                _plot_effort_scatter(
	                    filtered_data,
	                    f'자기노력도 {effort_range[0]}-{effort_range[1]} 범위의 학습평가 관계')

	    # Per-subject detail.
	    if has_subject:
	        st.write("과목별 상세 분석:")
	        selected_subject = st.selectbox("분석할 과목 선택", data['과목'].dropna().unique())
	        subject_data = data[data['과목'] == selected_subject]

	        if has_month and has_score:
	            fig = px.line(subject_data, x='달', y='학습평가', markers=True)
	            fig.update_layout(title=f'{selected_subject} 월별 학습평가 점수 추이')
	            st.plotly_chart(fig)

	        if has_effort and has_score:
	            _plot_effort_scatter(subject_data,
	                                 f'{selected_subject} 자기노력도와 학습평가 관계',
	                                 color_by_subject=False)

	def main():
	    """Drive the app: collect data, let the user edit it, then analyse it.

	    Data comes either from an uploaded CSV/Excel file or from manual entry.
	    Once data is available it is shown in an editable grid, and a button
	    starts preprocessing followed by the exploratory analysis.
	    """
	    st.title("인터랙티브 EDA 툴킷")

	    data_input_method = st.radio("데이터 입력 방법 선택:", ("파일 업로드", "수동 입력"))

	    if data_input_method == "파일 업로드":
	        uploaded_file = st.file_uploader("CSV, XLS, 또는 XLSX 파일을 선택하세요", type=["csv", "xls", "xlsx"])
	        data = load_data(uploaded_file) if uploaded_file is not None else None
	    else:
	        data = manual_data_entry()

	    if data is None:
	        # No data: forget an earlier click so the next data set does not
	        # start analysing by itself.
	        st.session_state["analysis_started"] = False
	        return

	    st.subheader("데이터 미리보기 및 수정")
	    st.write("데이터를 확인하고 필요한 경우 수정하세요:")
	    # Explicit key so this editor cannot collide with the keyless editor
	    # that manual_data_entry() renders with the same arguments.
	    edited_data = st.data_editor(data, num_rows="dynamic", key="preview_editor")

	    # st.button is True only on the run triggered by the click. The analysis
	    # contains widgets of its own, and touching any of them reruns the
	    # script, so the click is remembered in session state.
	    if st.button("데이터 분석 시작"):
	        st.session_state["analysis_started"] = True

	    if st.session_state.get("analysis_started", False):
	        processed_data = preprocess_data(edited_data)
	        perform_analysis(processed_data)

	# Run the app when this file is executed as a script.
	if __name__ == "__main__":
	    main()