JUNGU committed on
Commit
cc89531
·
verified ·
1 Parent(s): 4e08e76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -55
app.py CHANGED
@@ -1,12 +1,18 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import matplotlib.pyplot as plt
4
- import seaborn as sns
5
  import numpy as np
 
 
6
  from io import StringIO
7
  import openpyxl
8
  from st_aggrid import AgGrid, GridUpdateMode
9
  from st_aggrid.grid_options_builder import GridOptionsBuilder
 
 
 
 
 
 
10
 
11
  def load_data(file):
12
  file_extension = file.name.split('.')[-1].lower()
@@ -15,17 +21,17 @@ def load_data(file):
15
  elif file_extension in ['xls', 'xlsx']:
16
  data = pd.read_excel(file)
17
  else:
18
- st.error("Unsupported file format. Please upload a CSV, XLS, or XLSX file.")
19
  return None
20
  return data
21
 
22
  def manual_data_entry():
23
- st.subheader("Manual Data Entry")
24
- col_names = st.text_input("Enter column names separated by commas:").split(',')
25
  col_names = [name.strip() for name in col_names if name.strip()]
26
 
27
  if col_names:
28
- num_rows = st.number_input("Enter number of rows:", min_value=1, value=5)
29
  data = pd.DataFrame(columns=col_names, index=range(num_rows))
30
 
31
  gd = GridOptionsBuilder.from_dataframe(data)
@@ -40,93 +46,91 @@ def manual_data_entry():
40
  return None
41
 
def preprocess_data(data):
    """Interactively clean a DataFrame: treat missing values, then coerce types.

    For each column that still contains missing values, a Streamlit select box
    asks how to handle them (drop the affected rows, or fill with the column's
    mean, median, or mode). Afterwards every object-dtype column is passed
    through ``pd.to_numeric``; columns that cannot be converted are kept as-is.

    Args:
        data: The pandas DataFrame to clean.

    Returns:
        The cleaned DataFrame. All work is done on a copy, so the frame passed
        in is left untouched.
    """
    st.subheader("Data Preprocessing")

    # Work on a copy: the "Drop" path rebinds `data` to a new frame while the
    # fill paths write into it, so without a copy the caller's frame would be
    # mutated only some of the time.
    data = data.copy()

    # Handle missing values
    if data.isnull().sum().sum() > 0:
        st.write("Handling missing values:")
        for column in data.columns:
            # Re-checked per column: dropping rows for an earlier column may
            # already have removed this column's missing values.
            if data[column].isnull().sum() > 0:
                method = st.selectbox(f"Choose method for {column}:",
                                      ["Drop", "Fill with mean", "Fill with median", "Fill with mode"])
                if method == "Drop":
                    data = data.dropna(subset=[column])
                elif method == "Fill with mean":
                    # Assign the result back instead of calling
                    # `fillna(inplace=True)` on the selected column, which is
                    # a chained operation that may act on a temporary object.
                    data[column] = data[column].fillna(data[column].mean())
                elif method == "Fill with median":
                    data[column] = data[column].fillna(data[column].median())
                elif method == "Fill with mode":
                    modes = data[column].mode()
                    # An all-missing column has no mode; leave it unchanged
                    # rather than failing on an empty result.
                    if not modes.empty:
                        data[column] = data[column].fillna(modes.iloc[0])

    # Convert data types
    for column in data.columns:
        if data[column].dtype == 'object':
            try:
                converted = pd.to_numeric(data[column])
            except (ValueError, TypeError):
                st.write(f"Kept {column} as categorical.")
            else:
                data[column] = converted
                st.write(f"Converted {column} to numeric.")

    return data
71
 
def _render_and_close(fig):
    """Send a matplotlib figure to Streamlit, then release it."""
    st.pyplot(fig)
    # Figures created through pyplot stay registered until closed; without
    # this they pile up every time the script is re-executed.
    plt.close(fig)


def perform_analysis(data):
    """Render an exploratory-data-analysis report for ``data`` in Streamlit.

    Sections, in order: summary statistics, a correlation heatmap, a pairplot,
    one histogram and one box plot per numeric column, and one bar chart of
    value counts per object-dtype column. "Numeric" here means columns of
    dtype float64 or int64.

    Args:
        data: The pandas DataFrame to analyse.

    Returns:
        None. All output is written to the Streamlit page.
    """
    st.header("Exploratory Data Analysis")

    # Summary statistics
    st.write("Summary Statistics:")
    st.write(data.describe())

    # Correlation heatmap
    st.write("Correlation Heatmap:")
    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    if not numeric_data.empty:
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', ax=ax)
        _render_and_close(fig)
    else:
        st.write("No numeric columns available for correlation heatmap.")

    # Pairplot
    st.write("Pairplot:")
    if not numeric_data.empty:
        grid = sns.pairplot(numeric_data)
        # pairplot returns a grid object, not a Figure; pull out the
        # underlying figure (`.figure`, or `.fig` on older seaborn) so it can
        # be rendered and closed like the others.
        pair_fig = getattr(grid, "figure", None)
        if pair_fig is None:
            pair_fig = grid.fig
        _render_and_close(pair_fig)
    else:
        st.write("No numeric columns available for pairplot.")

    # Histogram
    st.write("Histograms:")
    for column in numeric_data.columns:
        fig, ax = plt.subplots()
        sns.histplot(data[column], kde=True, ax=ax)
        _render_and_close(fig)

    # Box plots for numerical columns
    st.write("Box Plots:")
    for column in numeric_data.columns:
        fig, ax = plt.subplots()
        sns.boxplot(data=data, y=column, ax=ax)
        _render_and_close(fig)

    # Bar plots for categorical columns
    categorical_columns = data.select_dtypes(include=['object']).columns
    if not categorical_columns.empty:
        st.write("Bar Plots for Categorical Variables:")
        for column in categorical_columns:
            fig, ax = plt.subplots()
            data[column].value_counts().plot(kind='bar', ax=ax)
            # Label through the axes object rather than pyplot's implicit
            # "current axes" state.
            ax.set_title(f"Distribution of {column}")
            ax.set_xlabel(column)
            ax.set_ylabel("Count")
            _render_and_close(fig)
122
 
123
  def main():
124
- st.title("Interactive EDA Toolkit")
125
 
126
- data_input_method = st.radio("Choose data input method:", ("Upload File", "Manual Entry"))
127
 
128
- if data_input_method == "Upload File":
129
- uploaded_file = st.file_uploader("Choose a CSV, XLS, or XLSX file", type=["csv", "xls", "xlsx"])
130
  if uploaded_file is not None:
131
  data = load_data(uploaded_file)
132
  else:
@@ -135,7 +139,7 @@ def main():
135
  data = manual_data_entry()
136
 
137
  if data is not None:
138
- st.write("Data Preview:")
139
  st.write(data.head())
140
 
141
  data = preprocess_data(data)
 
1
  import streamlit as st
2
  import pandas as pd
 
 
3
  import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
  from io import StringIO
7
  import openpyxl
8
  from st_aggrid import AgGrid, GridUpdateMode
9
  from st_aggrid.grid_options_builder import GridOptionsBuilder
10
+ import matplotlib.font_manager as fm
11
+
# Korean font setup for matplotlib.
# `matplotlib.pyplot` is no longer among this file's imports, so the rc call
# goes through the top-level matplotlib module instead of `plt`.
import matplotlib as mpl

font_path = "./Pretendard-Bold.ttf"  # replace with the actual path to the font file
fm.fontManager.addfont(font_path)
mpl.rc('font', family='Pretendard-Bold')  # replace with the actual font family name
16
 
17
  def load_data(file):
18
  file_extension = file.name.split('.')[-1].lower()
 
21
  elif file_extension in ['xls', 'xlsx']:
22
  data = pd.read_excel(file)
23
  else:
24
+ st.error("μ§€μ›λ˜μ§€ μ•ŠλŠ” 파일 ν˜•μ‹μž…λ‹ˆλ‹€. CSV, XLS, λ˜λŠ” XLSX νŒŒμΌμ„ μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”.")
25
  return None
26
  return data
27
 
28
  def manual_data_entry():
29
+ st.subheader("μˆ˜λ™ 데이터 μž…λ ₯")
30
+ col_names = st.text_input("μ—΄ 이름을 μ‰Όν‘œλ‘œ κ΅¬λΆ„ν•˜μ—¬ μž…λ ₯ν•˜μ„Έμš”:").split(',')
31
  col_names = [name.strip() for name in col_names if name.strip()]
32
 
33
  if col_names:
34
+ num_rows = st.number_input("ν–‰μ˜ 수λ₯Ό μž…λ ₯ν•˜μ„Έμš”:", min_value=1, value=5)
35
  data = pd.DataFrame(columns=col_names, index=range(num_rows))
36
 
37
  gd = GridOptionsBuilder.from_dataframe(data)
 
46
  return None
47
 
def preprocess_data(data):
    """Interactively clean a DataFrame: treat missing values, then coerce types.

    For each column that still contains missing values, a Streamlit select box
    asks how to handle them (the four options are, in order: drop the affected
    rows, fill with the mean, fill with the median, fill with the mode).
    Afterwards every object-dtype column is passed through ``pd.to_numeric``;
    columns that cannot be converted are kept as-is.

    NOTE(review): the user-facing string literals look mis-encoded in this
    view of the file; they are reproduced verbatim here - confirm against the
    real source file.

    Args:
        data: The pandas DataFrame to clean.

    Returns:
        The cleaned DataFrame. All work is done on a copy, so the frame passed
        in is left untouched.
    """
    st.subheader("데이터 μ „μ²˜λ¦¬")

    # Work on a copy: the drop path rebinds `data` to a new frame while the
    # fill paths write into it, so without a copy the caller's frame would be
    # mutated only some of the time.
    data = data.copy()

    # Handle missing values
    if data.isnull().sum().sum() > 0:
        st.write("결츑치 처리:")
        # Option labels, in order: drop rows, fill with mean, fill with
        # median, fill with mode. Defined once so the comparisons below can
        # never drift from what the select box offers.
        options = ["제거", "ν‰κ· μœΌλ‘œ λŒ€μ²΄", "μ€‘μ•™κ°’μœΌλ‘œ λŒ€μ²΄", "μ΅œλΉˆκ°’μœΌλ‘œ λŒ€μ²΄"]
        drop_rows, fill_mean, fill_median, fill_mode = options
        for column in data.columns:
            # Re-checked per column: dropping rows for an earlier column may
            # already have removed this column's missing values.
            if data[column].isnull().sum() > 0:
                method = st.selectbox(f"{column} μ—΄μ˜ 처리 방법 선택:",
                                      options)
                if method == drop_rows:
                    data = data.dropna(subset=[column])
                elif method == fill_mean:
                    # Assign the result back instead of calling
                    # `fillna(inplace=True)` on the selected column, which is
                    # a chained operation that may act on a temporary object.
                    data[column] = data[column].fillna(data[column].mean())
                elif method == fill_median:
                    data[column] = data[column].fillna(data[column].median())
                elif method == fill_mode:
                    modes = data[column].mode()
                    # An all-missing column has no mode; leave it unchanged
                    # rather than failing on an empty result.
                    if not modes.empty:
                        data[column] = data[column].fillna(modes.iloc[0])

    # Convert data types
    for column in data.columns:
        if data[column].dtype == 'object':
            try:
                converted = pd.to_numeric(data[column])
            except (ValueError, TypeError):
                st.write(f"{column} 열은 λ²”μ£Όν˜•μœΌλ‘œ μœ μ§€λ©λ‹ˆλ‹€.")
            else:
                data[column] = converted
                st.write(f"{column} 열을 μˆ«μžν˜•μœΌλ‘œ λ³€ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.")

    return data
77
 
def perform_analysis(data):
    """Render an exploratory-data-analysis report for ``data`` with Plotly.

    Sections, in order: summary statistics, a correlation heatmap, a scatter
    matrix, one histogram (with a marginal box plot) and one box plot per
    numeric column, and one bar chart of value counts per object-dtype column.
    "Numeric" here means columns of dtype float64 or int64.

    NOTE(review): the user-facing string literals look mis-encoded in this
    view of the file; they are reproduced verbatim here - confirm against the
    real source file.

    Args:
        data: The pandas DataFrame to analyse.

    Returns:
        None. All output is written to the Streamlit page.
    """
    st.header("탐색적 데이터 뢄석")

    # Summary statistics
    st.write("μš”μ•½ 톡계:")
    st.write(data.describe())

    # Correlation heatmap
    st.write("상관관계 히트맡:")
    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    if not numeric_data.empty:
        fig = px.imshow(numeric_data.corr(), color_continuous_scale='RdBu_r', zmin=-1, zmax=1)
        fig.update_layout(title='상관관계 히트맡')
        st.plotly_chart(fig)
    else:
        st.write("상관관계 νžˆνŠΈλ§΅μ„ 그릴 수 μžˆλŠ” μˆ«μžν˜• 열이 μ—†μŠ΅λ‹ˆλ‹€.")

    # Scatter matrix
    st.write("산점도 ν–‰λ ¬:")
    if not numeric_data.empty:
        fig = px.scatter_matrix(numeric_data)
        fig.update_layout(title='산점도 ν–‰λ ¬')
        st.plotly_chart(fig)
    else:
        st.write("산점도 행렬을 그릴 수 μžˆλŠ” μˆ«μžν˜• 열이 μ—†μŠ΅λ‹ˆλ‹€.")

    # Histograms
    st.write("νžˆμŠ€ν† κ·Έλž¨:")
    for column in numeric_data.columns:
        fig = px.histogram(data, x=column, marginal='box')
        fig.update_layout(title=f'{column} νžˆμŠ€ν† κ·Έλž¨')
        st.plotly_chart(fig)

    # Box plots
    st.write("λ°•μŠ€ν”Œλ‘―:")
    for column in numeric_data.columns:
        fig = px.box(data, y=column)
        fig.update_layout(title=f'{column} λ°•μŠ€ν”Œλ‘―')
        st.plotly_chart(fig)

    # Bar charts for categorical variables
    categorical_columns = data.select_dtypes(include=['object']).columns
    if not categorical_columns.empty:
        st.write("λ²”μ£Όν˜• λ³€μˆ˜ λ§‰λŒ€ κ·Έλž˜ν”„:")
        for column in categorical_columns:
            counts = data[column].value_counts()
            # Build the plotting frame with explicit column names. The names
            # produced by `value_counts().reset_index()` differ between pandas
            # versions, so relying on an 'index' column breaks on newer ones.
            counts_frame = pd.DataFrame({
                "category": counts.index,
                "count": counts.to_numpy(),
            })
            fig = px.bar(counts_frame, x="category", y="count")
            # Axis titles are set here, so the internal frame column names
            # never reach the rendered chart.
            fig.update_layout(title=f'{column} 뢄포', xaxis_title=column, yaxis_title='개수')
            st.plotly_chart(fig)
 
 
 
126
 
127
  def main():
128
+ st.title("μΈν„°λž™ν‹°λΈŒ EDA νˆ΄ν‚·")
129
 
130
+ data_input_method = st.radio("데이터 μž…λ ₯ 방법 선택:", ("파일 μ—…λ‘œλ“œ", "μˆ˜λ™ μž…λ ₯"))
131
 
132
+ if data_input_method == "파일 μ—…λ‘œλ“œ":
133
+ uploaded_file = st.file_uploader("CSV, XLS, λ˜λŠ” XLSX νŒŒμΌμ„ μ„ νƒν•˜μ„Έμš”", type=["csv", "xls", "xlsx"])
134
  if uploaded_file is not None:
135
  data = load_data(uploaded_file)
136
  else:
 
139
  data = manual_data_entry()
140
 
141
  if data is not None:
142
+ st.write("데이터 미리보기:")
143
  st.write(data.head())
144
 
145
  data = preprocess_data(data)