# Spaces:
# Runtime error
# Runtime error
# (NOTE: the three lines above are page-scrape artifacts from the hosting
# Space's status banner, preserved here as comments so the file parses.)
import time

import pandas as pd
import streamlit as st

from data_utils import process_python_dataset, list_available_datasets, get_dataset_info
from utils import set_page_config, display_sidebar, add_log
# --- Page scaffolding -------------------------------------------------------
# Configure the Streamlit page (title/layout live in utils.set_page_config)
# and render the shared sidebar before any page content.
set_page_config()
display_sidebar()

# Page header
st.title("Dataset Management")
st.markdown("Upload and manage your Python code datasets for model training.")

# Two tabs: one for ingesting new datasets, one for browsing existing ones.
tab1, tab2 = st.tabs(["Upload Dataset", "View Datasets"])
with tab1:
    st.subheader("Upload a New Dataset")

    # Name under which the processed dataset will be registered.
    dataset_name = st.text_input("Dataset Name", placeholder="e.g., python_functions")

    # Accept raw Python source, JSON snippet lists, or CSV with a 'code' column.
    uploaded_file = st.file_uploader(
        "Upload Python Code Dataset",
        type=["py", "json", "csv"],
        help="Upload Python code files (.py), JSON files containing code snippets, or CSV files with code columns"
    )

    # Side-by-side help text and processing options.
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("### Dataset Format")
        st.markdown("""
        - **Python files (.py)**: Will be split into examples by function/class definitions
        - **JSON files (.json)**: Should contain a list of objects with a 'code' field
        - **CSV files (.csv)**: Should have a 'code' column
        """)
    with col2:
        st.markdown("### Processing Options")
        auto_split = st.checkbox("Automatically split into train/validation sets", value=True)
        # NOTE(review): auto_split/split_ratio are collected but never passed to
        # process_python_dataset below — confirm whether the helper reads them
        # from elsewhere or whether this is an unfinished feature.
        split_ratio = st.slider("Validation Split Ratio", min_value=0.1, max_value=0.3, value=0.2, step=0.05, disabled=not auto_split)

    # Validate inputs, then delegate the actual parsing/splitting to
    # data_utils.process_python_dataset.
    if st.button("Process Dataset"):
        if not dataset_name:
            st.error("Please provide a dataset name")
        elif not uploaded_file:
            st.error("Please upload a file")
        elif dataset_name in list_available_datasets():
            st.error(f"Dataset with name '{dataset_name}' already exists. Please choose a different name.")
        else:
            with st.spinner("Processing dataset..."):
                success = process_python_dataset(uploaded_file, dataset_name)
                if success:
                    st.success(f"Dataset '{dataset_name}' processed successfully!")
                    add_log(f"Dataset '{dataset_name}' uploaded and processed")
                    # Brief pause so the success toast is visible before rerun.
                    time.sleep(1)
                    # Fix: st.experimental_rerun() is deprecated (removed in
                    # Streamlit >= 1.27); use st.rerun(), matching the call
                    # already used in the "View Datasets" tab.
                    st.rerun()
                else:
                    st.error("Failed to process dataset. Check logs for details.")
with tab2:
    st.subheader("Available Datasets")

    # Registered dataset names come from the data_utils registry.
    available_datasets = list_available_datasets()

    if not available_datasets:
        st.info("No datasets available. Upload a dataset in the 'Upload Dataset' tab.")
    else:
        selected_dataset = st.selectbox("Select a Dataset", available_datasets)

        if selected_dataset:
            # Summary metadata (sizes, creation time, column names).
            dataset_info = get_dataset_info(selected_dataset)

            if dataset_info:
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown("### Dataset Information")
                    st.markdown(f"**Name:** {dataset_info['name']}")
                    st.markdown(f"**Total Examples:** {dataset_info['size']}")
                    st.markdown(f"**Training Examples:** {dataset_info['train_size']}")
                    st.markdown(f"**Validation Examples:** {dataset_info['validation_size']}")
                    st.markdown(f"**Created:** {dataset_info['created_at']}")
                with col2:
                    st.markdown("### Dataset Structure")
                    for col in dataset_info.get('columns', []):
                        st.markdown(f"- {col}")

                # Preview the first few training examples as Python code.
                st.markdown("### Sample Data")
                # NOTE(review): assumes st.session_state.datasets[name]['data']
                # is a dict of splits whose 'train' split holds examples with a
                # 'code' field — verify against data_utils.process_python_dataset.
                dataset = st.session_state.datasets[selected_dataset]['data']
                if 'train' in dataset and len(dataset['train']) > 0:
                    sample_size = min(5, len(dataset['train']))
                    for i in range(sample_size):
                        with st.expander(f"Example {i+1}"):
                            st.code(dataset['train'][i].get('code', '# No code available'), language='python')
                else:
                    st.info("No examples available to display")

                # Destructive actions on the selected dataset.
                st.markdown("### Actions")
                if st.button("Delete Dataset", key="delete_dataset"):
                    if selected_dataset in st.session_state.datasets:
                        del st.session_state.datasets[selected_dataset]
                        add_log(f"Dataset '{selected_dataset}' deleted")
                        st.success(f"Dataset '{selected_dataset}' deleted successfully!")
                        # Brief pause so the success toast is visible before rerun.
                        time.sleep(1)
                        st.rerun()