Spaces:

JUNGU
/

PPPDC_example

Build error

App Files Files Community

JUNGU committed on Jul 19, 2024

Commit

cc89531

verified ·

1 Parent(s): 4e08e76

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -55

app.py CHANGED Viewed

@@ -1,12 +1,18 @@
 import streamlit as st
 import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
 import numpy as np
 from io import StringIO
 import openpyxl
 from st_aggrid import AgGrid, GridUpdateMode
 from st_aggrid.grid_options_builder import GridOptionsBuilder
 def load_data(file):
     file_extension = file.name.split('.')[-1].lower()
@@ -15,17 +21,17 @@ def load_data(file):
     elif file_extension in ['xls', 'xlsx']:
         data = pd.read_excel(file)
     else:
-        st.error("Unsupported file format. Please upload a CSV, XLS, or XLSX file.")
         return None
     return data
 def manual_data_entry():
-    st.subheader("Manual Data Entry")
-    col_names = st.text_input("Enter column names separated by commas:").split(',')
     col_names = [name.strip() for name in col_names if name.strip()]
     if col_names:
-        num_rows = st.number_input("Enter number of rows:", min_value=1, value=5)
         data = pd.DataFrame(columns=col_names, index=range(num_rows))
         gd = GridOptionsBuilder.from_dataframe(data)
@@ -40,93 +46,91 @@ def manual_data_entry():
     return None
 def preprocess_data(data):
-    st.subheader("Data Preprocessing")
-    # Handle missing values
     if data.isnull().sum().sum() > 0:
-        st.write("Handling missing values:")
         for column in data.columns:
             if data[column].isnull().sum() > 0:
-                method = st.selectbox(f"Choose method for {column}:",
-                                      ["Drop", "Fill with mean", "Fill with median", "Fill with mode"])
-                if method == "Drop":
                     data = data.dropna(subset=[column])
-                elif method == "Fill with mean":
                     data[column].fillna(data[column].mean(), inplace=True)
-                elif method == "Fill with median":
                     data[column].fillna(data[column].median(), inplace=True)
-                elif method == "Fill with mode":
                     data[column].fillna(data[column].mode()[0], inplace=True)
-    # Convert data types
     for column in data.columns:
         if data[column].dtype == 'object':
             try:
                 data[column] = pd.to_numeric(data[column])
-                st.write(f"Converted {column} to numeric.")
             except ValueError:
-                st.write(f"Kept {column} as categorical.")
     return data
 def perform_analysis(data):
-    st.header("Exploratory Data Analysis")
-    # Summary statistics
-    st.write("Summary Statistics:")
     st.write(data.describe())
-    # Correlation heatmap
-    st.write("Correlation Heatmap:")
     numeric_data = data.select_dtypes(include=['float64', 'int64'])
     if not numeric_data.empty:
-        fig, ax = plt.subplots(figsize=(10, 8))
-        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', ax=ax)
-        st.pyplot(fig)
     else:
-        st.write("No numeric columns available for correlation heatmap.")
-    # Pairplot
-    st.write("Pairplot:")
     if not numeric_data.empty:
-        fig = sns.pairplot(numeric_data)
-        st.pyplot(fig)
     else:
-        st.write("No numeric columns available for pairplot.")
-    # Histogram
-    st.write("Histograms:")
     for column in numeric_data.columns:
-        fig, ax = plt.subplots()
-        sns.histplot(data[column], kde=True, ax=ax)
-        st.pyplot(fig)
-    # Box plots for numerical columns
-    st.write("Box Plots:")
     for column in numeric_data.columns:
-        fig, ax = plt.subplots()
-        sns.boxplot(data=data, y=column, ax=ax)
-        st.pyplot(fig)
-    # Bar plots for categorical columns
     categorical_columns = data.select_dtypes(include=['object']).columns
     if not categorical_columns.empty:
-        st.write("Bar Plots for Categorical Variables:")
         for column in categorical_columns:
-            fig, ax = plt.subplots()
-            data[column].value_counts().plot(kind='bar', ax=ax)
-            plt.title(f"Distribution of {column}")
-            plt.xlabel(column)
-            plt.ylabel("Count")
-            st.pyplot(fig)
 def main():
-    st.title("Interactive EDA Toolkit")
-    data_input_method = st.radio("Choose data input method:", ("Upload File", "Manual Entry"))
-    if data_input_method == "Upload File":
-        uploaded_file = st.file_uploader("Choose a CSV, XLS, or XLSX file", type=["csv", "xls", "xlsx"])
         if uploaded_file is not None:
             data = load_data(uploaded_file)
         else:
@@ -135,7 +139,7 @@ def main():
         data = manual_data_entry()
     if data is not None:
-        st.write("Data Preview:")
         st.write(data.head())
         data = preprocess_data(data)

 import streamlit as st
 import pandas as pd
 import numpy as np
+import plotly.express as px
+import plotly.graph_objects as go
 from io import StringIO
 import openpyxl
 from st_aggrid import AgGrid, GridUpdateMode
 from st_aggrid.grid_options_builder import GridOptionsBuilder
+import matplotlib.font_manager as fm
# Korean font setup for matplotlib.
# pyplot is imported right here because no other visible import binds `plt`;
# without it the `plt.rc` call below raises NameError as soon as the app starts.
import matplotlib.pyplot as plt

font_path = "./Pretendard-Bold.ttf"  # replace with the path to the actual font file
fm.fontManager.addfont(font_path)
# Use the family name stored inside the font file instead of a hard-coded
# guess, so the rc setting always matches the font that was just registered.
plt.rc('font', family=fm.FontProperties(fname=font_path).get_name())
 def load_data(file):
     file_extension = file.name.split('.')[-1].lower()
     elif file_extension in ['xls', 'xlsx']:
         data = pd.read_excel(file)
     else:
+        st.error("지원되지 않는 파일 형식입니다. CSV, XLS, 또는 XLSX 파일을 업로드해주세요.")
         return None
     return data
 def manual_data_entry():
+    st.subheader("수동 데이터 입력")
+    col_names = st.text_input("열 이름을 쉼표로 구분하여 입력하세요:").split(',')
     col_names = [name.strip() for name in col_names if name.strip()]
     if col_names:
+        num_rows = st.number_input("행의 수를 입력하세요:", min_value=1, value=5)
         data = pd.DataFrame(columns=col_names, index=range(num_rows))
         gd = GridOptionsBuilder.from_dataframe(data)
     return None
 def preprocess_data(data):
     """Interactively clean a DataFrame: resolve missing values, then coerce types.

     For every column containing missing values, the user picks a strategy
     from a Streamlit select box (drop the rows, or fill with the mean,
     median or mode). Afterwards every object-dtype column is converted to a
     numeric dtype when all of its values parse; otherwise it is left as-is.

     Args:
         data: pandas DataFrame to clean. The caller's frame is not modified;
             all changes are applied to a copy.

     Returns:
         The cleaned pandas DataFrame.
     """
     # Work on a copy so the caller's frame is never mutated.
     data = data.copy()

     st.subheader("데이터 전처리")

     # Missing-value handling
     if data.isnull().sum().sum() > 0:
         st.write("결측치 처리:")
         for column in data.columns:
             if data[column].isnull().sum() == 0:
                 continue

             # Mean and median are only defined for numeric columns; offering
             # them for text columns would raise when the user selects them.
             if pd.api.types.is_numeric_dtype(data[column]):
                 options = ["제거", "평균으로 대체", "중앙값으로 대체", "최빈값으로 대체"]
             else:
                 options = ["제거", "최빈값으로 대체"]
             method = st.selectbox(f"{column} 열의 처리 방법 선택:", options)

             # Assign the filled column back instead of calling
             # fillna(inplace=True) on the selected Series: the chained
             # in-place form does not update the frame under copy-on-write.
             if method == "제거":
                 data = data.dropna(subset=[column])
             elif method == "평균으로 대체":
                 data[column] = data[column].fillna(data[column].mean())
             elif method == "중앙값으로 대체":
                 data[column] = data[column].fillna(data[column].median())
             elif method == "최빈값으로 대체":
                 modes = data[column].mode()
                 # mode() is empty when every value is missing; in that case
                 # there is nothing to fill with, so the column is left alone.
                 if not modes.empty:
                     data[column] = data[column].fillna(modes.iloc[0])

     # Data type conversion
     for column in data.columns:
         if data[column].dtype == 'object':
             try:
                 converted = pd.to_numeric(data[column])
             except (ValueError, TypeError):
                 # At least one value is not parseable as a number.
                 st.write(f"{column} 열은 범주형으로 유지됩니다.")
             else:
                 data[column] = converted
                 st.write(f"{column} 열을 숫자형으로 변환했습니다.")
     return data
 def perform_analysis(data):
     """Render an exploratory-data-analysis report for *data* in Streamlit.

     Sections are written in this order: summary statistics, correlation
     heatmap, scatter matrix, one histogram and one box plot per numeric
     column, and one bar chart of value counts per object-dtype column.

     Args:
         data: pandas DataFrame to analyse.

     Returns:
         None. All output is written to the Streamlit page.
     """
     st.header("탐색적 데이터 분석")

     # Summary statistics
     st.write("요약 통계:")
     st.write(data.describe())

     # Select every numeric dtype (not only float64/int64) so columns such as
     # int32 or float32 are not silently left out of the charts.
     numeric_data = data.select_dtypes(include='number')
     has_numeric = not numeric_data.empty

     # Correlation heatmap
     st.write("상관관계 히트맵:")
     if has_numeric:
         fig = px.imshow(numeric_data.corr(), color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
         fig.update_layout(title='상관관계 히트맵')
         st.plotly_chart(fig)
     else:
         st.write("상관관계 히트맵을 그릴 수 있는 숫자형 열이 없습니다.")

     # Scatter matrix
     st.write("산점도 행렬:")
     if has_numeric:
         fig = px.scatter_matrix(numeric_data)
         fig.update_layout(title='산점도 행렬')
         st.plotly_chart(fig)
     else:
         st.write("산점도 행렬을 그릴 수 있는 숫자형 열이 없습니다.")

     # Histograms
     st.write("히스토그램:")
     for column in numeric_data.columns:
         fig = px.histogram(data, x=column, marginal='box')
         fig.update_layout(title=f'{column} 히스토그램')
         st.plotly_chart(fig)

     # Box plots
     st.write("박스플롯:")
     for column in numeric_data.columns:
         fig = px.box(data, y=column)
         fig.update_layout(title=f'{column} 박스플롯')
         st.plotly_chart(fig)

     # Bar charts for categorical variables
     categorical_columns = data.select_dtypes(include=['object']).columns
     if not categorical_columns.empty:
         st.write("범주형 변수 막대 그래프:")
         for column in categorical_columns:
             value_counts = data[column].value_counts()
             # Build the plotting frame with explicit column names. The names
             # produced by value_counts().reset_index() differ between pandas
             # versions, so relying on 'index' breaks on newer releases.
             counts = pd.DataFrame({
                 'category': value_counts.index,
                 'count': value_counts.to_numpy(),
             })
             fig = px.bar(
                 counts,
                 x='category',
                 y='count',
                 labels={'category': str(column), 'count': '개수'},
             )
             fig.update_layout(title=f'{column} 분포', xaxis_title=column, yaxis_title='개수')
             st.plotly_chart(fig)
 def main():
+    st.title("인터랙티브 EDA 툴킷")
+    data_input_method = st.radio("데이터 입력 방법 선택:", ("파일 업로드", "수동 입력"))
+    if data_input_method == "파일 업로드":
+        uploaded_file = st.file_uploader("CSV, XLS, 또는 XLSX 파일을 선택하세요", type=["csv", "xls", "xlsx"])
         if uploaded_file is not None:
             data = load_data(uploaded_file)
         else:
         data = manual_data_entry()
     if data is not None:
+        st.write("데이터 미리보기:")
         st.write(data.head())
         data = preprocess_data(data)