File size: 2,503 Bytes
9321c59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c92fb7d
9321c59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5196c8
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from datasets import Dataset, DatasetDict, load_dataset, Features, ClassLabel, Array3D
from huggingface_hub import HfApi
import os
from PIL import Image
import numpy as np
import streamlit as st

# Target (width, height) every image is resized to before being stored.
IMG_SIZE = (128, 128)

def load_images_from_folder(folder):
    """Build a Hugging Face ``Dataset`` from a folder-per-class image layout.

    Expects ``folder`` to contain one subdirectory per class; every
    ``.png``/``.jpg``/``.jpeg`` file inside a class subdirectory becomes one
    sample. Images are converted to RGB, resized to ``IMG_SIZE`` and stored
    as uint8 arrays. Unreadable images are skipped with a printed warning.

    Args:
        folder: Path to the root directory of the class subfolders.

    Returns:
        datasets.Dataset with an ``image`` Array3D feature and a ``label``
        ClassLabel feature whose names are the sorted class folder names.
    """
    images = []
    labels = []
    # Only directories define classes: stray files in `folder` (e.g.
    # .DS_Store) must not become ClassLabel names or shift label indices.
    label_names = sorted(
        entry for entry in os.listdir(folder)
        if os.path.isdir(os.path.join(folder, entry))
    )

    for i, label in enumerate(label_names):
        label_folder = os.path.join(folder, label)
        for img_file in os.listdir(label_folder):
            img_path = os.path.join(label_folder, img_file)
            if not img_path.lower().endswith((".png", ".jpg", ".jpeg")):
                continue
            try:
                img = Image.open(img_path).convert("RGB")
                img = img.resize(IMG_SIZE)
                images.append(np.array(img, dtype=np.uint8))
                labels.append(i)
            except Exception as e:
                print(f"Error processing image {img_path}: {e}")
                continue  # Skip problematic images

    # Derive the feature shape from IMG_SIZE instead of hard-coding
    # (128, 128, 3) so the constant and the schema cannot drift apart.
    # NOTE: PIL's resize takes (width, height) while np.array yields
    # (height, width, 3); they agree here because IMG_SIZE is square.
    return Dataset.from_dict(
        {"image": images, "label": labels},
        features=Features({
            "image": Array3D(shape=(IMG_SIZE[1], IMG_SIZE[0], 3), dtype="uint8"),
            "label": ClassLabel(names=label_names)
        })
    )

def main():
    """Streamlit entry point: build the cats/dogs dataset and push it to the Hub.

    Shows download instructions, then (once the ``st.stop()`` guard is
    removed and the script runs locally) loads the train/test image folders
    and uploads the resulting DatasetDict to the Hugging Face Hub using the
    ``HF_TOKEN`` environment variable for authentication.
    """
    st.title("Upload Cats and Dogs Dataset to Hugging Face Hub")
    st.write("Download the archive of images from [this link](https://drive.google.com/file/d/11rYftkuiAUA4cdejsGEntUfhNfEPTrM8/view?usp=sharing) and extract it to the `dataset` folder.")
    st.warning("This script will not work when run from the HuggingFace Space.")

    # Deliberate guard: delete this stop only after extracting a local copy
    # of the dataset — the upload must be run from your own machine, not
    # from the hosted Space.
    st.stop()

    hub_token = os.getenv("HF_TOKEN")
    target_repo = "cats_dogs_dataset"  # Choose a dataset name

    # Assemble the train/test splits from the extracted folders.
    splits = DatasetDict({
        "train": load_images_from_folder("dataset/train_set"),
        "test": load_images_from_folder("dataset/test_set"),
    })

    splits.push_to_hub(target_repo, token=hub_token, commit_message="Initial dataset upload")

    st.write(f"Dataset uploaded to {target_repo}")

# Run the Streamlit app when this file is executed as a script.
if __name__ == "__main__":
    main()