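"""Streamlit page for uploading, inspecting, and deleting Python code datasets.

The imports below are project-local helpers; st.session_state.datasets is
assumed to be populated and kept in sync by data_utils.
"""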
import streamlit as st
import pandas as pd
import time
from data_utils import process_python_dataset, list_available_datasets, get_dataset_info
from utils import set_page_config, display_sidebar, add_log

# Set page configuration
set_page_config()

# Display sidebar
display_sidebar()

# Title
st.title("Dataset Management")
st.markdown("Upload and manage your Python code datasets for model training.")

# Create tabs for different dataset operations
tab1, tab2 = st.tabs(["Upload Dataset", "View Datasets"])

with tab1:
    st.subheader("Upload a New Dataset")
    
    # Dataset name input
    dataset_name = st.text_input("Dataset Name", placeholder="e.g., python_functions")
    
    # File uploader
    uploaded_file = st.file_uploader(
        "Upload Python Code Dataset", 
        type=["py", "json", "csv"],
        help="Upload Python code files (.py), JSON files containing code snippets, or CSV files with code columns"
    )
    
    # Dataset upload options
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("### Dataset Format")
        st.markdown("""
        - **Python files (.py)**: Will be split into examples by function/class definitions
        - **JSON files (.json)**: Should contain a list of objects with a 'code' field
        - **CSV files (.csv)**: Should have a 'code' column

        See *Example input formats* below for copyable samples.
        """)
    
    with col2:
        st.markdown("### Processing Options")
        auto_split = st.checkbox("Automatically split into train/validation sets", value=True)
        split_ratio = st.slider("Validation Split Ratio", min_value=0.1, max_value=0.3, value=0.2, step=0.05, disabled=not auto_split)
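
    # Illustrative sketch only (not consumed by processing): minimal examples
    # of the JSON and CSV shapes described above, so users can copy the
    # structure. The 'code' field/column name follows the format notes;
    # process_python_dataset is assumed to look for it.
    with st.expander("Example input formats"):
        st.markdown("**JSON**")
        st.code('[{"code": "def add(a, b):\\n    return a + b"}]', language="json")
        st.markdown("**CSV**")
        st.code('code\n"def add(a, b): return a + b"', language="text")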
    
    # Process button
    if st.button("Process Dataset"):
        if not dataset_name:
            st.error("Please provide a dataset name")
        elif not uploaded_file:
            st.error("Please upload a file")
        elif dataset_name in list_available_datasets():
            st.error(f"Dataset with name '{dataset_name}' already exists. Please choose a different name.")
        else:
            with st.spinner("Processing dataset..."):
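                # The split options above (auto_split, split_ratio) are not
                # forwarded in this call; process_python_dataset is assumed to
                # apply its own split behavior to the uploaded file.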
                success = process_python_dataset(uploaded_file, dataset_name)
                if success:
                    st.success(f"Dataset '{dataset_name}' processed successfully!")
                    add_log(f"Dataset '{dataset_name}' uploaded and processed")
                    time.sleep(1)
                    st.rerun()
                else:
                    st.error("Failed to process dataset. Check logs for details.")

with tab2:
    st.subheader("Available Datasets")
    
    # Get available datasets
    available_datasets = list_available_datasets()
    
    if not available_datasets:
        st.info("No datasets available. Upload a dataset in the 'Upload Dataset' tab.")
    else:
        # Dataset selection
        selected_dataset = st.selectbox("Select a Dataset", available_datasets)
        
        if selected_dataset:
            # Get dataset info
            dataset_info = get_dataset_info(selected_dataset)
            
            if dataset_info:
                # Display dataset information
                col1, col2 = st.columns(2)
                
                with col1:
                    st.markdown("### Dataset Information")
                    st.markdown(f"**Name:** {dataset_info['name']}")
                    st.markdown(f"**Total Examples:** {dataset_info['size']}")
                    st.markdown(f"**Training Examples:** {dataset_info['train_size']}")
                    st.markdown(f"**Validation Examples:** {dataset_info['validation_size']}")
                    st.markdown(f"**Created:** {dataset_info['created_at']}")
                
                with col2:
                    st.markdown("### Dataset Structure")
                    columns = dataset_info.get('columns', [])
                    for col in columns:
                        st.markdown(f"- {col}")
                
                # Display sample data
                st.markdown("### Sample Data")
                
                # Get the raw dataset from session state; it is assumed to hold
                # 'train'/'validation' example lists, each example carrying at
                # least a 'code' field.
                dataset = st.session_state.datasets[selected_dataset]['data']
                
                # Display first few examples
                if 'train' in dataset and len(dataset['train']) > 0:
                    sample_size = min(5, len(dataset['train']))
                    for i in range(sample_size):
                        with st.expander(f"Example {i+1}"):
                            st.code(dataset['train'][i].get('code', '# No code available'), language='python')
                else:
                    st.info("No examples available to display")
                
                # Actions
                st.markdown("### Actions")
                if st.button("Delete Dataset", key="delete_dataset"):
                    if selected_dataset in st.session_state.datasets:
                        del st.session_state.datasets[selected_dataset]
                        add_log(f"Dataset '{selected_dataset}' deleted")
                        st.success(f"Dataset '{selected_dataset}' deleted successfully!")
                        time.sleep(1)
                        st.rerun()