import json

import pandas as pd
import streamlit as st

from utils import add_log

# Handle missing dependencies: fall back to a minimal stand-in that mimics
# the small slice of the `datasets` API this module uses
try:
    from datasets import Dataset, DatasetDict
except ImportError:
    class Dataset:
        """Lightweight stand-in exposing the attributes used below."""

        def __init__(self, data, column_names):
            self.data = data
            self.column_names = column_names

        @classmethod
        def from_list(cls, items):
            return cls(items, list(items[0].keys()) if items else [])

        @classmethod
        def from_dict(cls, dict_obj):
            return cls(dict_obj, list(dict_obj.keys()))

        @classmethod
        def from_pandas(cls, df):
            return cls(df, df.columns.tolist())

        def __len__(self):
            if isinstance(self.data, dict):
                # Column-oriented mapping: row count of any one column
                return len(next(iter(self.data.values()), []))
            return len(self.data)

        def train_test_split(self, test_size=0.2):
            # Naive fallback: both splits reference the full dataset
            return {"train": self, "test": self}

    class DatasetDict(dict):
        pass

def process_python_dataset(uploaded_file, dataset_name):
    """
    Process an uploaded Python dataset file.
    Supports .py, .json, and .csv formats.

    Args:
        uploaded_file: The uploaded file object
        dataset_name: Name to identify the dataset

    Returns:
        bool: Success status
    """
    try:
        file_extension = uploaded_file.name.split('.')[-1].lower()

        if file_extension == 'py':
            # Process Python file: one example per top-level function/class
            content = uploaded_file.read().decode('utf-8')
            examples = split_python_file(content)
            dataset = create_dataset_from_examples(examples)
        elif file_extension == 'json':
            # Process JSON file: a list becomes rows, a dict becomes columns
            content = json.loads(uploaded_file.read().decode('utf-8'))
            if isinstance(content, list):
                dataset = Dataset.from_list(content)
            else:
                dataset = Dataset.from_dict(content)
        elif file_extension == 'csv':
            # Process CSV file
            df = pd.read_csv(uploaded_file)
            dataset = Dataset.from_pandas(df)
        else:
            add_log(f"Unsupported file format: {file_extension}", "ERROR")
            return False

        # Split into train/validation sets
        split = dataset.train_test_split(test_size=0.2)

        # Create a DatasetDict
        dataset_dict = DatasetDict({
            'train': split['train'],
            'validation': split['test']
        })

        # Ensure the session-state container exists before storing into it
        if 'datasets' not in st.session_state:
            st.session_state.datasets = {}

        # Store in session state
        st.session_state.datasets[dataset_name] = {
            'data': dataset_dict,
            'info': {
                'name': dataset_name,
                'size': len(dataset),
                'train_size': len(split['train']),
                'validation_size': len(split['test']),
                'columns': dataset.column_names,
                'created_at': pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
            }
        }

        add_log(f"Dataset '{dataset_name}' processed successfully with {len(dataset)} examples")
        return True
    except Exception as e:
        add_log(f"Error processing dataset: {str(e)}", "ERROR")
        return False
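
# Sketch of a typical call site in a Streamlit page (assumed wiring, not part
# of this module): `st.file_uploader` returns the UploadedFile object that
# process_python_dataset expects.
#
#   uploaded = st.file_uploader("Upload dataset", type=["py", "json", "csv"])
#   if uploaded is not None and st.button("Process"):
#       if process_python_dataset(uploaded, uploaded.name):
#           st.success("Dataset processed")
#       else:
#           st.error("Processing failed; see logs")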

def split_python_file(content):
    """
    Split a Python file content into separate code examples.

    Args:
        content: String content of Python file

    Returns:
        list: List of code examples
    """
    examples = []

    # Simple splitting on top-level function or class definitions
    # (startswith only matches column 0, so nested defs stay attached)
    lines = content.split('\n')
    current_example = []

    for line in lines:
        if (line.startswith('def ') or line.startswith('class ')) and current_example:
            # Start of a new function/class, save the previous one
            examples.append('\n'.join(current_example))
            current_example = [line]
        else:
            current_example.append(line)

    # Add the last example
    if current_example:
        examples.append('\n'.join(current_example))

    # If no examples were extracted, use the whole file as one example
    if not examples:
        examples = [content]

    return [{'code': example} for example in examples]
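
# Worked example (hypothetical input): a file with a leading import and two
# top-level definitions yields three examples; the preamble becomes its own
# chunk because a split only happens when a new `def`/`class` line begins.
#
#   split_python_file("import math\n\ndef f():\n    pass\n\nclass C:\n    pass")
#   # -> [{'code': 'import math\n'},
#   #     {'code': 'def f():\n    pass\n'},
#   #     {'code': 'class C:\n    pass'}]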

def create_dataset_from_examples(examples):
    """
    Create a dataset from code examples.

    Args:
        examples: List of code examples

    Returns:
        Dataset: Hugging Face dataset
    """
    return Dataset.from_list(examples)

def validate_dataset_structure(dataset):
    """
    Validate that the dataset has the required structure for training.

    Args:
        dataset: Hugging Face dataset

    Returns:
        bool: True if valid, False otherwise
    """
    if 'code' not in dataset.column_names:
        add_log("Dataset missing 'code' column required for training", "ERROR")
        return False
    return True
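
# Sketch: the validator is meant for an individual split rather than the
# DatasetDict built above (assumed usage; `get_dataset` is defined below):
#
#   ds = get_dataset("my_dataset")
#   if ds is not None and validate_dataset_structure(ds['train']):
#       pass  # safe to hand ds['train'] to the trainer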

def list_available_datasets():
    """
    List all available datasets in session state.

    Returns:
        list: List of dataset names
    """
    if 'datasets' in st.session_state:
        return list(st.session_state.datasets.keys())
    return []

def get_dataset_info(dataset_name):
    """
    Get information about a dataset.

    Args:
        dataset_name: Name of the dataset

    Returns:
        dict: Dataset information
    """
    if 'datasets' in st.session_state and dataset_name in st.session_state.datasets:
        return st.session_state.datasets[dataset_name]['info']
    return None

def get_dataset(dataset_name):
    """
    Get a dataset by name.

    Args:
        dataset_name: Name of the dataset

    Returns:
        Dataset: The dataset object
    """
    if 'datasets' in st.session_state and dataset_name in st.session_state.datasets:
        return st.session_state.datasets[dataset_name]['data']
    return None
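
# --- Illustrative smoke test (added sketch, not part of the app's UI flow).
# Exercises only the pure helpers; importing this module still requires the
# local `utils` module for add_log. ---
if __name__ == "__main__":
    sample = (
        "import math\n"
        "\n"
        "def square(x):\n"
        "    return x * x\n"
        "\n"
        "class Greeter:\n"
        "    def hello(self):\n"
        "        return 'hi'\n"
    )
    chunks = split_python_file(sample)
    ds = create_dataset_from_examples(chunks)
    # Works with the real `datasets` package or with the fallback stand-in,
    # since both expose column_names and support len()
    print(f"{len(chunks)} examples extracted, columns: {ds.column_names}")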