Spaces: Sleeping
import os

import gradio as gr
from datasets import Audio, Dataset, Features, Value, load_dataset
from huggingface_hub import HfApi, create_repo

# --- Configuration --- (moved inside functions where needed, for Gradio)
# Default keyword list used when the user supplies no keywords: common animal
# names, matched case-insensitively as substrings of each example's prompt.
animal_keywords = [
    "dog", "cat", "bird", "fish", "horse", "cow", "sheep", "pig",
    "chicken", "duck", "goat", "lion", "tiger", "bear", "elephant", "monkey",
    "zebra", "giraffe", "rhino", "hippo", "crocodile", "snake", "frog", "turtle",
    "lizard", "spider", "ant", "bee", "butterfly", "wolf", "fox", "deer",
    "rabbit", "squirrel", "mouse", "rat", "hamster", "guinea pig", "parrot", "owl",
    "eagle", "hawk", "penguin", "dolphin", "whale", "shark", "seal", "octopus",
    "crab", "lobster", "shrimp", "snail", "worm", "kangaroo", "koala", "panda",
    "sloth", "hedgehog", "raccoon", "skunk", "beaver", "otter", "platypus", "jaguar",
    "leopard", "cheetah", "puma", "ostrich", "emu", "flamingo", "peacock", "swan",
    "goose", "turkey", "pigeon", "seagull", "antelope", "bison", "buffalo", "camel",
    "llama", "alpaca", "donkey", "mule", "ferret", "mongoose", "meerkat", "wombat",
    "dingo", "armadillo", "badger", "chipmunk", "porcupine",
]
def filter_and_push(dataset_name, split_name, keywords_text, new_dataset_repo_id, hf_token):
    """Filter a dataset by keyword matches in its 'prompt' column and push the subset to the Hub.

    Args:
        dataset_name: Source dataset repo id (e.g. "declare-lab/audio-alpaca").
        split_name: Split to filter (e.g. "train").
        keywords_text: Comma-separated keywords; falls back to ``animal_keywords``
            when empty after stripping.
        new_dataset_repo_id: Target dataset repo id (e.g. "username/my-subset").
        hf_token: Hugging Face token with write access.

    Returns:
        Tuple of (status message, dataset URL or None) for the two Gradio outputs.
    """
    if not hf_token:
        return "Error: Hugging Face token is required. Please provide it.", None

    try:
        # Normalize keywords: split on commas, strip whitespace, lowercase for
        # case-insensitive matching. Empty input falls back to the default list.
        keywords = [kw.strip().lower() for kw in keywords_text.split(",") if kw.strip()]
        if not keywords:
            keywords = animal_keywords

        def _matches(example):
            # Case-insensitive substring match against the prompt text.
            prompt = example["prompt"].lower()
            return any(kw in prompt for kw in keywords)

        # --- 1. Stream over the ORIGINAL dataset and record the positions of
        # matching rows. NOTE: the previous implementation enumerated the
        # *filtered* stream, which always yields indices 0..k-1 and made
        # .select() below return the first k rows instead of the matches.
        streamed = load_dataset(dataset_name, split=split_name, streaming=True)
        matching_indices = [i for i, example in enumerate(streamed) if _matches(example)]
        if not matching_indices:
            return "No matching examples found with the provided keywords.", None

        # --- 2. Materialize only the matching rows from the non-streaming dataset.
        full_dataset = load_dataset(dataset_name, split=split_name, streaming=False)
        subset_dataset = full_dataset.select(matching_indices)

        # --- 3. Cast to a fixed schema so the uploaded dataset is consistent.
        features = Features({
            "prompt": Value(dtype="string", id=None),
            "audio": Audio(sampling_rate=16000),  # keep original sampling rate; adjust if needed
            "strategy": Value(dtype="string", id=None),
            "seed": Value(dtype="int64", id=None),
        })
        try:
            subset_dataset = subset_dataset.cast(features)
        except Exception as e:
            return f"An error occurred during casting; please ensure that the selected dataset has the correct columns: {e}", None

        # --- 4. Create the target repository, tolerating "already exists".
        try:
            create_repo(new_dataset_repo_id, token=hf_token, repo_type="dataset")
            print(f"Repository '{new_dataset_repo_id}' created.")
        except Exception as e:
            if "Repo already exists" not in str(e):
                return f"Error creating repository: {e}", None

        # --- 5. Upload. Pass the token explicitly so the push works without a
        # cached CLI login (the previous HfApi instance was created but unused).
        subset_dataset.push_to_hub(new_dataset_repo_id, token=hf_token)
        dataset_url = f"https://huggingface.co/datasets/{new_dataset_repo_id}"
        return f"Subset dataset uploaded successfully! {len(matching_indices)} Examples Found", dataset_url

    except Exception as e:
        return f"An error occurred: {e}", None
# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown("# Dataset Filter and Push")

    # Source dataset selection.
    with gr.Row():
        dataset_name_input = gr.Textbox(
            label="Source Dataset Name (e.g., declare-lab/audio-alpaca)",
            value="declare-lab/audio-alpaca",
        )
        split_name_input = gr.Textbox(label="Split Name (e.g., train)", value="train")

    # Comma-separated keyword filter; an empty box falls back to animal_keywords.
    keywords_input = gr.Textbox(
        label="Keywords (comma-separated, e.g., dog, cat, bird)",
        value="dog, cat, bird",
    )

    # Destination repository and credentials.
    with gr.Row():
        new_dataset_repo_id_input = gr.Textbox(
            label="New Dataset Repo ID (e.g., your_username/your_dataset)"
        )
        hf_token_input = gr.Textbox(label="Hugging Face Token", type="password")

    submit_button = gr.Button("Filter and Push")

    # Status message and, on success, a link to the uploaded dataset.
    with gr.Row():
        output_text = gr.Textbox(label="Status")
        dataset_output_link = gr.Textbox(label="Dataset URL")

    submit_button.click(
        filter_and_push,
        inputs=[
            dataset_name_input,
            split_name_input,
            keywords_input,
            new_dataset_repo_id_input,
            hf_token_input,
        ],
        outputs=[output_text, dataset_output_link],
    )

if __name__ == "__main__":
    demo.launch()
Key changes and explanations: | |
Gradio Integration: The code is now structured to work within a Gradio interface. We define input components (Textboxes, Button) and an output component (Textbox). | |
filter_and_push Function: The core logic is encapsulated in a function. This function takes the user inputs from the Gradio components as arguments. This is crucial for Gradio to work correctly. | |
Error Handling: The code includes comprehensive try...except blocks to catch potential errors, such as: | |
Invalid Hugging Face token. | |
Problems loading the dataset. | |
Repository creation errors (checks if the repo already exists). | |
Issues during the dataset upload. | |
No matching examples are found. | |
Casting error | |
These errors are reported to the user through the Gradio output Textbox, providing helpful feedback. | |
Keyword Processing: The keywords_input is now a single Textbox where users can enter comma-separated keywords. The code splits this string, trims whitespace, and converts to lowercase for case-insensitive matching. An empty keyword list defaults to the original animal keywords | |
HF Token Handling: The HF token is now an input field (with type="password" for security). It's passed directly to the filter_and_push function. | |
Return Values for Gradio: The filter_and_push function now returns two values: | |
A status message (string) to display in the output_text Textbox. | |
The dataset URL (string) or None to the dataset output link. | |
Dataset URL Output: After a successful upload, the URL of the newly created dataset is displayed in the dataset_output_link Textbox, making it easy for the user to access their filtered dataset. | |
Clearer Instructions and Labels: The Gradio interface has descriptive labels for each input field, making it user-friendly. | |
if __name__ == "__main__": block: This standard Python construct ensures that demo.launch() is only called when the script is run directly (not when imported as a module). | |
No Hardcoded Values: All user-configurable parameters are now taken as inputs through the Gradio interface, making the space more flexible. | |
How to run this: | |
Install Libraries:
pip install gradio datasets huggingface_hub
Save: Save the code as a Python file (e.g., app.py). | |
Run:
python app.py
Open in Browser: Gradio will provide a local URL (usually http://127.0.0.1:7860) that you can open in your web browser. | |
Hugging Face Login (Important): While you don't need to use huggingface-cli login with this Gradio app (because you're entering the token directly), you do need a Hugging Face account, and you need to generate an API token with "write" access. You can create a token here: https://huggingface.co/settings/tokens | |
This improved version provides a robust and user-friendly Hugging Face Space for filtering and uploading datasets. It handles errors gracefully, provides clear feedback, and follows best practices for Gradio applications. |