# CodeCraftLab / data_utils.py
import streamlit as st
import pandas as pd
import json
from utils import add_log
# Handle missing dependencies: fall back to lightweight stand-ins that mimic
# the small subset of the `datasets` API this module actually uses.
try:
    from datasets import Dataset, DatasetDict
except ImportError:
    class Dataset:
        """Minimal stand-in for datasets.Dataset, backed by a list of dicts."""

        def __init__(self, data):
            self.data = data  # list of example dicts

        @classmethod
        def from_list(cls, items):
            return cls(list(items))

        @classmethod
        def from_dict(cls, dict_obj):
            # Convert a column-oriented dict ({col: [values]}) into row dicts
            keys = list(dict_obj.keys())
            rows = [dict(zip(keys, values)) for values in zip(*dict_obj.values())]
            return cls(rows)

        @classmethod
        def from_pandas(cls, df):
            return cls(df.to_dict(orient="records"))

        def __len__(self):
            return len(self.data)

        @property
        def column_names(self):
            return list(self.data[0].keys()) if self.data else []

        def train_test_split(self, test_size=0.2):
            # Ordered split without shuffling; sufficient for the fallback
            split_at = int(len(self.data) * (1 - test_size))
            return {
                'train': Dataset(self.data[:split_at]),
                'test': Dataset(self.data[split_at:]),
            }

    class DatasetDict(dict):
        pass
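
# Illustrative check of the fallback path (the names here are hypothetical),
# assuming the `datasets` package is absent:
#
#     ds = Dataset.from_list([{'code': 'pass'}] * 10)
#     parts = ds.train_test_split(test_size=0.2)
#     len(parts['train']), len(parts['test'])  # -> (8, 2)
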
def process_python_dataset(uploaded_file, dataset_name):
"""
Process an uploaded Python dataset file.
Supports .py, .json, and .csv formats.
Args:
uploaded_file: The uploaded file object
dataset_name: Name to identify the dataset
Returns:
bool: Success status
"""
try:
file_extension = uploaded_file.name.split('.')[-1].lower()
if file_extension == 'py':
# Process Python file
content = uploaded_file.read().decode('utf-8')
# Split by function or class definitions for separate examples
examples = split_python_file(content)
dataset = create_dataset_from_examples(examples)
elif file_extension == 'json':
# Process JSON file
content = json.loads(uploaded_file.read().decode('utf-8'))
if isinstance(content, list):
dataset = Dataset.from_list(content)
else:
dataset = Dataset.from_dict(content)
elif file_extension == 'csv':
# Process CSV file
df = pd.read_csv(uploaded_file)
dataset = Dataset.from_pandas(df)
else:
add_log(f"Unsupported file format: {file_extension}", "ERROR")
return False
        # Split into train/validation sets
        splits = dataset.train_test_split(test_size=0.2)

        # Create a DatasetDict with the two splits
        dataset_dict = DatasetDict({
            'train': splits['train'],
            'validation': splits['test']
        })

        # Store in session state, creating the container on first use
        if 'datasets' not in st.session_state:
            st.session_state.datasets = {}
        st.session_state.datasets[dataset_name] = {
            'data': dataset_dict,
            'info': {
                'name': dataset_name,
                'size': len(dataset),
                'train_size': len(splits['train']),
                'validation_size': len(splits['test']),
                'columns': dataset.column_names,
                'created_at': pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
            }
        }
add_log(f"Dataset '{dataset_name}' processed successfully with {len(dataset)} examples")
return True
except Exception as e:
add_log(f"Error processing dataset: {str(e)}", "ERROR")
return False
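
# Illustrative call site (a hedged sketch; the widget labels and page wiring
# are assumptions, not part of this module):
#
#     uploaded = st.file_uploader("Upload dataset", type=["py", "json", "csv"])
#     if uploaded is not None and st.button("Process dataset"):
#         if process_python_dataset(uploaded, dataset_name="my_dataset"):
#             st.success("Dataset processed")
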
def split_python_file(content):
"""
Split a Python file content into separate code examples.
Args:
content: String content of Python file
Returns:
list: List of code examples
"""
examples = []
# Simple splitting by function or class definitions
lines = content.split('\n')
current_example = []
for line in lines:
if (line.startswith('def ') or line.startswith('class ')) and current_example:
# Start of a new function/class, save the previous one
examples.append('\n'.join(current_example))
current_example = [line]
else:
current_example.append(line)
# Add the last example
if current_example:
examples.append('\n'.join(current_example))
# If no examples were extracted, use the whole file as one example
if not examples:
examples = [content]
return [{'code': example} for example in examples]
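
# Illustrative behavior (hypothetical input). Because the split is line-based
# rather than AST-based, module-level code before the first definition becomes
# its own example:
#
#     split_python_file("import os\ndef a():\n    pass\ndef b():\n    pass")
#     # -> [{'code': 'import os'},
#     #     {'code': 'def a():\n    pass'},
#     #     {'code': 'def b():\n    pass'}]
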
def create_dataset_from_examples(examples):
"""
Create a dataset from code examples.
Args:
examples: List of code examples
Returns:
Dataset: Hugging Face dataset
"""
return Dataset.from_list(examples)
def validate_dataset_structure(dataset):
"""
Validate that the dataset has the required structure for training.
Args:
dataset: Hugging Face dataset
Returns:
bool: True if valid, False otherwise
"""
if 'code' not in dataset.column_names:
add_log("Dataset missing 'code' column required for training", "ERROR")
return False
return True
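
# Illustrative usage (hypothetical dataset name): validate the train split
# before launching a training run.
#
#     ds = get_dataset("my_dataset")
#     if ds is not None and validate_dataset_structure(ds['train']):
#         ...  # safe to train on ds['train']
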
def list_available_datasets():
"""
List all available datasets in session state.
Returns:
list: List of dataset names
"""
if 'datasets' in st.session_state:
return list(st.session_state.datasets.keys())
return []
def get_dataset_info(dataset_name):
"""
Get information about a dataset.
Args:
dataset_name: Name of the dataset
Returns:
dict: Dataset information
"""
if 'datasets' in st.session_state and dataset_name in st.session_state.datasets:
return st.session_state.datasets[dataset_name]['info']
return None
def get_dataset(dataset_name):
"""
Get a dataset by name.
Args:
dataset_name: Name of the dataset
Returns:
Dataset: The dataset object
"""
if 'datasets' in st.session_state and dataset_name in st.session_state.datasets:
return st.session_state.datasets[dataset_name]['data']
return None
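
if __name__ == "__main__":
    # Minimal smoke test (a hedged sketch, not part of the Streamlit app):
    # exercises the pure helpers without touching st.session_state, which is
    # only populated when the app runs under `streamlit run`.
    sample = (
        "import os\n"
        "def greet(name):\n"
        "    return f'Hello, {name}'\n"
        "class Greeter:\n"
        "    pass\n"
    )
    examples = split_python_file(sample)
    dataset = create_dataset_from_examples(examples)
    print(f"Extracted {len(examples)} examples; columns: {dataset.column_names}")
    print(f"Structure valid: {validate_dataset_structure(dataset)}")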