import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from io import StringIO
import openpyxl
import matplotlib.font_manager as fm
from scipy import stats

# Korean font setup
def set_font():
    """Register the Pretendard font file with Matplotlib and return font settings.

    The font file is looked up relative to the current working directory.
    If it cannot be read, a warning is issued instead of raising, so a
    missing font file does not prevent the module from loading.

    Returns:
        dict: Settings keyed by Matplotlib rcParams names
        (``font.family`` and ``axes.unicode_minus``).
    """
    import warnings

    font_path = "Pretendard-Bold.ttf"  # replace with the actual font file path
    try:
        fm.fontManager.addfont(font_path)
    except (OSError, RuntimeError) as exc:
        # This function runs at import time; an absent or unreadable font
        # file should degrade to the default font rather than crash the app.
        warnings.warn(f"Could not register font '{font_path}': {exc}")
    # NOTE(review): the family name Matplotlib registers for this file may
    # differ from 'Pretendard-Bold' -- confirm against the font's metadata.
    return {'font.family': 'Pretendard-Bold', 'axes.unicode_minus': False}

# Fetch the font settings once, when the module is loaded.
# NOTE(review): `font_settings` is not referenced anywhere else in the
# visible part of this file -- confirm whether it is still needed.
font_settings = set_font()

def _read_csv_with_fallback(file, encodings=("utf-8", "cp949")):
    """Parse *file* as CSV, trying each encoding in order; None if all fail."""
    for encoding in encodings:
        # Rewind first: an earlier read or a failed decoding attempt may
        # have left the stream position past the start of the data.
        file.seek(0)
        try:
            return pd.read_csv(file, encoding=encoding)
        except UnicodeDecodeError:
            continue
    return None


def load_data(file):
    """Load an uploaded CSV or Excel file into a DataFrame.

    The format is chosen from the extension of ``file.name``
    (case-insensitive). CSV files are decoded as UTF-8 first and, if that
    fails, as CP949, so files that are not UTF-8 encoded no longer abort
    with ``UnicodeDecodeError``.

    Args:
        file: A seekable file-like object with a ``name`` attribute.

    Returns:
        The loaded DataFrame, or ``None`` (after showing an error message)
        when the extension is unsupported or the CSV cannot be decoded.
    """
    file_extension = file.name.split('.')[-1].lower()
    if file_extension == 'csv':
        data = _read_csv_with_fallback(file)
        if data is None:
            st.error("CSV 파일의 인코딩을 인식할 수 없습니다. UTF-8 또는 CP949(EUC-KR)로 저장한 뒤 다시 업로드해주세요.")
        return data
    if file_extension in ['xls', 'xlsx']:
        return pd.read_excel(file)
    st.error("지원되지 않는 파일 형식입니다. CSV, XLS, 또는 XLSX 파일을 업로드해주세요.")
    return None

def manual_data_entry():
    """Let the user define columns and type data into an editable table.

    Column names are read from a comma-separated text input; blank entries
    are ignored and repeated names are collapsed to their first occurrence.

    Returns:
        The DataFrame returned by the data editor, or ``None`` while no
        column name has been entered.
    """
    st.subheader("수동 데이터 입력")
    raw_names = st.text_input("열 이름을 쉼표로 구분하여 입력하세요:")
    stripped_names = (name.strip() for name in raw_names.split(','))
    # dict.fromkeys removes repeats while keeping first-seen order; the data
    # editor does not accept a frame with duplicate column names.
    col_names = list(dict.fromkeys(name for name in stripped_names if name))

    if not col_names:
        return None

    num_rows = st.number_input("초기 행의 수를 입력하세요:", min_value=1, value=5)
    data = pd.DataFrame(columns=col_names, index=range(num_rows))

    # An explicit key keeps this editor's widget identity distinct from any
    # other data editor rendered on the same page with the same contents.
    return st.data_editor(data, num_rows="dynamic", key="manual_entry_editor")

def _imputation_value(series, method):
    """Return the fill value for *method*, or None when it cannot be computed."""
    if method == "최빈값으로 대체":
        modes = series.mode()
        # An all-missing column has no mode.
        return None if modes.empty else modes.iloc[0]

    values = series
    if series.dtype == object:
        # Object columns may hold numbers stored as text; use them only
        # when every non-missing entry is numeric.
        non_missing = series.dropna()
        converted = pd.to_numeric(non_missing, errors="coerce")
        if non_missing.empty or converted.isna().any():
            return None
        values = converted
    try:
        result = values.mean() if method == "평균으로 대체" else values.median()
    except TypeError:
        # The dtype does not support mean/median (e.g. categorical).
        return None
    return None if pd.isna(result) else result


def preprocess_data(data):
    """Interactively handle missing values, then convert text columns to numbers.

    For every column containing missing values the user picks a strategy
    (drop rows, or fill with mean / median / mode). If the chosen fill
    value cannot be computed for that column, a warning is shown and the
    column is left unchanged. Afterwards each object-dtype column is
    converted to numeric when all of its values allow it.

    Args:
        data: Input DataFrame. It is copied, not modified.

    Returns:
        The preprocessed DataFrame.
    """
    st.subheader("데이터 전처리")

    # Work on a copy so the caller's frame is never modified.
    data = data.copy()

    # Missing-value handling
    if data.isnull().sum().sum() > 0:
        st.write("결측치 처리:")
        for column in data.columns:
            if data[column].isnull().sum() == 0:
                continue
            method = st.selectbox(f"{column} 열의 처리 방법 선택:",
                                  ["제거", "평균으로 대체", "중앙값으로 대체", "최빈값으로 대체"])
            if method == "제거":
                data = data.dropna(subset=[column])
                continue
            fill_value = _imputation_value(data[column], method)
            if fill_value is None:
                st.warning(f"{column} 열에는 '{method}' 방법을 적용할 수 없어 결측치를 그대로 둡니다.")
                continue
            # Assign the result back instead of fillna(inplace=True) on a
            # column selection, which is not guaranteed to update the frame.
            data[column] = data[column].fillna(fill_value)

    # Data type conversion
    for column in data.columns:
        if data[column].dtype == 'object':
            try:
                data[column] = pd.to_numeric(data[column])
                st.write(f"{column} 열을 숫자형으로 변환했습니다.")
            except (ValueError, TypeError):
                st.write(f"{column} 열은 범주형으로 유지됩니다.")

    return data

def _sorted_options(values):
    """Return *values* as a sorted list, tolerating values with no natural order."""
    values = list(values)
    try:
        return sorted(values)
    except TypeError:
        # Mixed types (e.g. NaN next to strings) cannot be compared
        # directly; fall back to ordering by their text form.
        return sorted(values, key=str)


def create_slicers(data, max_unique=10):
    """Render one multiselect filter per low-cardinality categorical column.

    Args:
        data: DataFrame whose object/category columns are candidates.
        max_unique: Largest number of distinct (non-missing) values a
            column may have to get a slicer.

    Returns:
        dict: Maps each column that received a slicer to the list of values
        currently selected in its multiselect. All values start selected.
    """
    slicers = {}
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns

    for col in categorical_columns:
        # Only columns with few distinct values get a slicer.
        if data[col].nunique() > max_unique:
            continue
        options = _sorted_options(data[col].unique())
        slicers[col] = st.multiselect(f"{col} 선택", options=options, default=options)

    return slicers

def apply_slicers(data, slicers):
    """Filter *data* down to the rows matching every active slicer.

    Args:
        data: DataFrame to filter.
        slicers: Mapping of column name to the collection of allowed values.
            An empty selection leaves that column unfiltered.

    Returns:
        The filtered DataFrame; the input object itself when no slicer has
        a non-empty selection.
    """
    remaining = data
    for column_name in slicers:
        allowed = slicers[column_name]
        if not allowed:
            # Nothing selected for this column: do not restrict on it.
            continue
        keep_rows = remaining[column_name].isin(allowed)
        remaining = remaining[keep_rows]
    return remaining

def _add_regression_line(fig, x, y):
    """Overlay a least-squares line on *fig*; return R-squared, or None if no fit is possible."""
    points = pd.DataFrame({
        'x': pd.to_numeric(x, errors='coerce').to_numpy(),
        'y': pd.to_numeric(y, errors='coerce').to_numpy(),
    }).dropna()
    # A line needs at least two points with distinct x values.
    if len(points) < 2 or points['x'].nunique() < 2:
        return None
    fit = stats.linregress(points['x'], points['y'])
    line_x = np.array([points['x'].min(), points['x'].max()])
    line_y = fit.slope * line_x + fit.intercept
    fig.add_trace(go.Scatter(x=line_x, y=line_y, mode='lines', name='회귀선'))
    return fit.rvalue ** 2


def perform_analysis(data):
    """Render the exploratory analysis section for *data*.

    Shows slicers, summary statistics and a correlation heatmap. When the
    columns '출석일수' and '성적' exist, also shows a scatter plot with a
    regression line and a box plot of '성적' per '출석일수' bin. When '반'
    and '성적' exist, shows a box plot of '성적' per '반'. The '반' column
    is used for colouring only when it is present.

    Args:
        data: DataFrame to analyse. It is not modified.
    """
    st.header("탐색적 데이터 분석")

    # Build the slicers and apply the current selection.
    slicers = create_slicers(data)
    filtered_data = apply_slicers(data, slicers)

    if filtered_data.empty:
        st.warning("분석할 데이터가 없습니다.")
        return

    columns = filtered_data.columns
    has_attendance_and_score = '출석일수' in columns and '성적' in columns
    # Colour by class only when that column exists.
    color_column = '반' if '반' in columns else None

    # Summary statistics
    st.write("요약 통계:")
    st.write(filtered_data.describe())

    # Correlation heatmap
    st.write("상관관계 히트맵:")
    # 'number' covers every numeric dtype, not only int64/float64.
    numeric_data = filtered_data.select_dtypes(include='number')
    if not numeric_data.empty:
        fig = px.imshow(numeric_data.corr(), color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
        fig.update_layout(title='상관관계 히트맵')
        st.plotly_chart(fig)
    else:
        st.write("상관관계 히트맵을 그릴 수 있는 숫자형 열이 없습니다.")

    # Attendance vs. score
    if has_attendance_and_score:
        st.write("출석일수와 성적 관계:")
        fig = px.scatter(filtered_data, x='출석일수', y='성적', color=color_column,
                         hover_data=list(filtered_data.columns))

        # Regression line over all filtered rows (skipped when it cannot be fitted).
        r_squared = _add_regression_line(fig, filtered_data['출석일수'], filtered_data['성적'])
        if r_squared is None:
            fig.update_layout(title='출석일수와 성적 관계')
        else:
            fig.update_layout(
                title=f'출석일수와 성적 관계 (R-squared: {r_squared:.4f})',
                annotations=[
                    dict(
                        x=0.5,
                        y=1.05,
                        xref='paper',
                        yref='paper',
                        text=f'R-squared: {r_squared:.4f}',
                        showarrow=False,
                    )
                ]
            )
        st.plotly_chart(fig)

    # Score distribution per class
    if '반' in columns and '성적' in columns:
        st.write("반별 성적 분포:")
        fig = px.box(filtered_data, x='반', y='성적', points="all")
        fig.update_layout(title='반별 성적 분포')
        st.plotly_chart(fig)

    # Score distribution per attendance bin
    if has_attendance_and_score:
        st.write("출석일수 구간별 성적 분포:")
        attendance = pd.to_numeric(filtered_data['출석일수'], errors='coerce')
        if attendance.notna().any():
            bins = pd.cut(attendance, bins=5)
            # Interval objects cannot be serialized for the chart, so plot
            # their text labels and pass the bin order explicitly.
            bin_labels = [str(interval) for interval in bins.cat.categories]
            # assign() builds a new frame, leaving the input untouched.
            plot_data = filtered_data.assign(**{'출석일수_구간': bins.astype(str)})
            plot_data = plot_data.loc[bins.notna().to_numpy()]
            fig = px.box(plot_data, x='출석일수_구간', y='성적', color=color_column,
                         category_orders={'출석일수_구간': bin_labels})
            fig.update_layout(title='출석일수 구간별 성적 분포')
            st.plotly_chart(fig)
        else:
            st.write("출석일수 열에 숫자 데이터가 없어 구간별 분포를 그릴 수 없습니다.")

def main():
    """Run the app: choose an input method, preview/edit the data, then analyse it.

    The "start analysis" state is kept in ``st.session_state`` because
    ``st.button`` returns True only for the run triggered by the click;
    without it, interacting with any widget rendered by the analysis would
    make the analysis disappear on the next rerun.
    """
    st.title("인터랙티브 EDA 툴킷")

    data_input_method = st.radio("데이터 입력 방법 선택:", ("파일 업로드", "수동 입력"))

    if data_input_method == "파일 업로드":
        uploaded_file = st.file_uploader("CSV, XLS, 또는 XLSX 파일을 선택하세요", type=["csv", "xls", "xlsx"])
        data = load_data(uploaded_file) if uploaded_file is not None else None
    else:
        data = manual_data_entry()

    if data is None:
        # No data available: forget any earlier "start analysis" click.
        st.session_state["analysis_started"] = False
        return

    st.subheader("데이터 미리보기 및 수정")
    st.write("데이터를 확인하고 필요한 경우 수정하세요:")
    # An explicit key keeps this editor's widget identity distinct from any
    # other data editor rendered on the same page with the same contents.
    edited_data = st.data_editor(data, num_rows="dynamic", key="preview_editor")

    if st.button("데이터 분석 시작"):
        st.session_state["analysis_started"] = True

    if st.session_state.get("analysis_started", False):
        processed_data = preprocess_data(edited_data)
        perform_analysis(processed_data)

# Script entry point: start the app only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()