# Hugging Face Spaces page header captured with the file (Space status: Sleeping); not code.
# data_uploader.py
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
from PIL import Image | |
import torch | |
from torchvision import transforms | |
from torch.utils.data import Dataset, DataLoader | |
from datasets import Dataset as HFDataset, DatasetDict | |
from huggingface_hub import HfApi # For Hugging Face Hub interaction | |
import os | |
# Hugging Face Hub credentials and target dataset repository.
HF_TOKEN = os.environ.get("HF_TOKEN")  # None when the variable is not set
REPO_ID = "louiecerv/american_sign_language"  # Replace with your dataset repo name

# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
st.write(f"Enabled GPU = {device.type == 'cuda'}")
class MyDataset(Dataset):
    """In-memory dataset of feature/label tensors stored on the module-level ``device``.

    Args:
        x_df: Features. Anything ``torch.tensor`` accepts (NumPy array, nested
            list, ...) or a pandas ``DataFrame``/``Series``.
        y_df: Integer class labels; same accepted types as ``x_df``.
    """

    def __init__(self, x_df, y_df):
        # Features are stored as float32 and labels as long (int64), both on
        # ``device``, so indexing never triggers a host-to-device copy.
        self.xs = self._as_tensor(x_df, torch.float32)
        self.ys = self._as_tensor(y_df, torch.long)

    @staticmethod
    def _as_tensor(data, dtype):
        """Return ``data`` as a tensor of ``dtype`` placed on ``device``."""
        # torch.tensor is documented for lists, tuples, NumPy arrays and
        # scalars; pandas objects are unwrapped to NumPy first so that
        # DataFrame/Series inputs (as the parameter names suggest) work too.
        if hasattr(data, "to_numpy"):
            data = data.to_numpy()
        return torch.tensor(data, dtype=dtype).to(device)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.xs[idx], self.ys[idx]

    def __len__(self):
        """Return the number of stored feature rows."""
        return len(self.xs)
# Load the dataset and convert to Hugging Face Dataset
def load_and_convert_to_hf_dataset(x, y, split="train"):
    """Build a Hugging Face dataset of preprocessed 28x28 images and labels.

    Args:
        x: Iterable of images; each item must hold 784 values because it is
            reshaped to 28x28 and cast to uint8.
        y: Labels aligned with ``x``.
        split: Currently unused; kept so callers that pass a split name keep
            working.

    Returns:
        A ``datasets.Dataset`` in "torch" format with a ``label`` column and a
        ``pixel_values`` column holding one (1, 28, 28) tensor per image.
    """
    df = pd.DataFrame({"image": list(x), "label": y})
    hf_dataset = HFDataset.from_pandas(df)

    # The pipeline is identical for every image, so build it once instead of
    # once per image inside the mapped function.
    transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((28, 28)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ])

    # Preprocess images (Important for Hugging Face)
    def preprocess_function(examples):
        pixel_values = [
            transform(Image.fromarray(np.array(img).reshape(28, 28).astype("uint8")))
            for img in examples["image"]
        ]
        # Return only the new column: map() keeps any column the function
        # returns, even one listed in remove_columns, so handing back the whole
        # batch would re-add the raw "image" column.
        return {"pixel_values": torch.stack(pixel_values)}

    hf_dataset = hf_dataset.map(preprocess_function, batched=True, remove_columns=["image"])
    hf_dataset.set_format("torch")  # Set format to PyTorch
    return hf_dataset
def upload_dataset_to_hub(dataset, repo_id):
    """Create the dataset repository if needed and push ``dataset`` to it.

    Args:
        dataset: Object exposing ``push_to_hub`` (the caller in this file
            passes a ``DatasetDict``).
        repo_id: Target repository on the Hugging Face Hub, "user/name".
    """
    api = HfApi(token=HF_TOKEN)
    api.create_repo(repo_id, repo_type="dataset", exist_ok=True)  # Create repo if it doesn't exist
    # Push with the same token used to create the repo; when HF_TOKEN is None
    # this falls back to the library's default credential lookup, as before.
    dataset.push_to_hub(repo_id, token=HF_TOKEN)
    print(f"Dataset uploaded to {repo_id}")
def main():
    """Render the app page and, if execution gets that far, upload the dataset."""
    st.title("American Sign Language Dataset Uploader")

    about_markdown = """
## About This App
This app is designed to load, preprocess, and upload datasets to the Hugging Face Hub. The main functionalities are encapsulated in the following components:
### Custom Dataset Class
The `MyDataset` class inherits from `torch.utils.data.Dataset` and is used to handle the dataset.
- **Initialization (`__init__`)**:
- Converts input dataframes `x_df` and `y_df` to PyTorch tensors with explicit data types (`float32` for features and `long` for labels).
- Moves the tensors to the specified device (e.g., GPU).
- **Get Item (`__getitem__`)**:
- Retrieves the feature (`x`) and label (`y`) tensors at a given index `idx`.
- **Length (`__len__`)**:
- Returns the length of the dataset.
### Load and Convert to Hugging Face Dataset
The `load_and_convert_to_hf_dataset` function converts input data into a Hugging Face dataset.
- **DataFrame Creation**:
- Creates a Pandas DataFrame from the input features (`x`) and labels (`y`).
- **Preprocessing Function**:
- Reshapes images to 28x28 pixels.
- Converts images to PIL format and applies transformations (grayscale, resize, tensor conversion, and normalization).
- Stacks the transformed images into a tensor.
- **Dataset Mapping**:
- Applies the preprocessing function to the dataset.
- Sets the dataset format to PyTorch.
### Data Loading and Conversion
The app loads training and validation data from CSV files and converts them into Hugging Face datasets.
- **Training Data**:
- Loads data from `sign_mnist_train.csv`.
- Separates features and labels.
- Converts to a Hugging Face dataset.
- **Validation Data**:
- Loads data from `sign_mnist_valid.csv`.
- Separates features and labels.
- Converts to a Hugging Face dataset.
### Upload Dataset to Hugging Face Hub
The `upload_dataset_to_hub` function uploads the dataset to the Hugging Face Hub.
- **Repository Creation**:
- Creates a repository if it doesn't exist.
- **Dataset Upload**:
- Pushes the dataset to the specified repository.
### Main Function
The `main` function orchestrates the entire process.
- Loads and preprocesses training and validation data.
- Creates a `DatasetDict` containing both datasets.
- Uploads the dataset to the Hugging Face Hub.
### Execution
The script is executed by calling the `main` function if the script is run as the main module.
```python
if __name__ == "__main__":
main()"""

    with st.expander("About", expanded=True):
        st.write(about_markdown)

    for message in (
        "## Instructions",
        "Do not run this code on Huggingface. Download the code and run it on your local machine.",
        "Make sure you have the required files in the data/asl_data folder.",
    ):
        st.write(message)

    # NOTE: this halts the script unconditionally, so the upload code below
    # never runs until this call is removed.
    st.stop()

    try:
        # Read each CSV and split it into pixel values and labels.
        csv_paths = {
            "train": "data/asl_data/sign_mnist_train.csv",
            "validation": "data/asl_data/sign_mnist_valid.csv",
        }
        arrays = {}
        for split_name, csv_path in csv_paths.items():
            frame = pd.read_csv(csv_path)
            labels = frame.pop('label').values
            arrays[split_name] = (frame.values, labels)

        # Convert both splits and bundle them into a DatasetDict.
        full_dataset = DatasetDict({
            split_name: load_and_convert_to_hf_dataset(pixels, labels, split_name)
            for split_name, (pixels, labels) in arrays.items()
        })

        upload_dataset_to_hub(full_dataset, REPO_ID)  # Upload the DatasetDict
        st.write("Data upload complete.")
    except Exception as exc:
        # Top-level boundary: report any failure in the page instead of crashing.
        st.error(f"An error occurred: {exc}")


if __name__ == "__main__":
    main()