Spaces:
Running
Running
File size: 4,716 Bytes
43b66f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import streamlit as st
import pandas as pd
import json
import io
from utils.dataset_utils import get_dataset_info, detect_dataset_format
def render_dataset_uploader():
"""
Renders the dataset upload component that supports CSV and JSON formats.
"""
st.markdown("""
<div class="upload-container">
<p>Upload your dataset in CSV or JSON format</p>
</div>
""", unsafe_allow_html=True)
# File uploader
uploaded_file = st.file_uploader(
"Choose a file",
type=["csv", "json"],
help="Upload a CSV or JSON file containing your dataset"
)
# Sample dataset option
st.markdown("Or use a sample dataset:")
sample_dataset = st.selectbox(
"Select a sample dataset",
["None", "Iris Dataset", "Titanic Dataset", "Boston Housing Dataset"]
)
# Process uploaded file
if uploaded_file is not None:
try:
# Check file extension
file_extension = uploaded_file.name.split(".")[-1].lower()
if file_extension == "csv":
df = pd.read_csv(uploaded_file)
dataset_type = "csv"
elif file_extension == "json":
# Try different JSON formats
try:
# First try parsing as a regular JSON with records orientation
df = pd.read_json(uploaded_file)
dataset_type = "json"
except:
# If that fails, try to parse as JSON Lines
try:
df = pd.read_json(uploaded_file, lines=True)
dataset_type = "jsonl"
except:
# If that also fails, load raw JSON and convert
content = json.loads(uploaded_file.getvalue().decode("utf-8"))
if isinstance(content, list):
df = pd.DataFrame(content)
elif isinstance(content, dict):
# Handle nested dict structures
if any(isinstance(v, list) for v in content.values()):
# Find the list field and use it
for key, value in content.items():
if isinstance(value, list):
df = pd.DataFrame(value)
break
else:
# Flat dict or dict of dicts
df = pd.DataFrame([content])
dataset_type = "json"
else:
st.error(f"Unsupported file format: {file_extension}")
return
# Store dataset and its info in session state
st.session_state.dataset = df
st.session_state.dataset_name = uploaded_file.name
st.session_state.dataset_type = dataset_type
st.session_state.dataset_info = get_dataset_info(df)
except Exception as e:
st.error(f"Error loading dataset: {str(e)}")
# Process sample dataset
elif sample_dataset != "None":
try:
if sample_dataset == "Iris Dataset":
# Load Iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target
dataset_type = "csv"
elif sample_dataset == "Titanic Dataset":
# URL for Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)
dataset_type = "csv"
elif sample_dataset == "Boston Housing Dataset":
# Load Boston Housing dataset
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
df['target'] = housing.target
dataset_type = "csv"
# Store dataset and its info in session state
st.session_state.dataset = df
st.session_state.dataset_name = sample_dataset
st.session_state.dataset_type = dataset_type
st.session_state.dataset_info = get_dataset_info(df)
except Exception as e:
st.error(f"Error loading sample dataset: {str(e)}")
|