Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
with gr.Blocks() as demo:
|
2 |
gr.Markdown("# Dataset Filter and Push")
|
3 |
|
@@ -22,4 +112,77 @@ with gr.Blocks() as demo:
|
|
22 |
filter_and_push,
|
23 |
inputs=[dataset_name_input, split_name_input, keywords_input, new_dataset_repo_id_input, hf_token_input],
|
24 |
outputs=[output_text, dataset_output_link],
|
25 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from datasets import load_dataset, Features, Value, Audio, Dataset
|
3 |
+
from huggingface_hub import HfApi, create_repo
|
4 |
+
import os
|
5 |
+
|
6 |
+
# --- Configuration ---
# Default keyword list: used as the filter when the user submits no keywords.
animal_keywords = [
    "dog", "cat", "bird", "fish", "horse", "cow", "sheep", "pig",
    "chicken", "duck", "goat", "lion", "tiger", "bear", "elephant",
    "monkey", "zebra", "giraffe", "rhino", "hippo", "crocodile", "snake",
    "frog", "turtle", "lizard", "spider", "ant", "bee", "butterfly",
    "wolf", "fox", "deer", "rabbit", "squirrel", "mouse", "rat",
    "hamster", "guinea pig", "parrot", "owl", "eagle", "hawk", "penguin",
    "dolphin", "whale", "shark", "seal", "octopus", "crab", "lobster",
    "shrimp", "snail", "worm", "kangaroo", "koala", "panda", "sloth",
    "hedgehog", "raccoon", "skunk", "beaver", "otter", "platypus",
    "jaguar", "leopard", "cheetah", "puma", "ostrich", "emu", "flamingo",
    "peacock", "swan", "goose", "turkey", "pigeon", "seagull", "antelope",
    "bison", "buffalo", "camel", "llama", "alpaca", "donkey", "mule",
    "ferret", "mongoose", "meerkat", "wombat", "dingo", "armadillo",
    "badger", "chipmunk", "porcupine",
]
|
21 |
+
|
22 |
+
|
23 |
+
def filter_and_push(dataset_name, split_name, keywords_text, new_dataset_repo_id, hf_token):
    """Filter a Hub dataset by keyword matches in its ``prompt`` column and push the subset.

    Args:
        dataset_name: Repo id of the source dataset (e.g. ``user/dataset``).
        split_name: Split to filter (e.g. ``train``).
        keywords_text: Comma-separated keywords; blank falls back to ``animal_keywords``.
        new_dataset_repo_id: Repo id to create (if needed) for the filtered subset.
        hf_token: Hugging Face API token with write access.

    Returns:
        A ``(status_message, dataset_url_or_None)`` tuple for the Gradio outputs.
    """
    if not hf_token:
        return "Error: Hugging Face token is required. Please provide it.", None

    try:
        # --- 1. Load the dataset in streaming mode (cheap first pass) ---
        dataset = load_dataset(dataset_name, split=split_name, streaming=True)

        # --- 2. Process keywords: split the comma-separated string, strip
        # whitespace, lowercase for case-insensitive matching. Empty input
        # falls back to the default animal keyword list. ---
        keywords = [keyword.strip().lower() for keyword in keywords_text.split(',') if keyword.strip()]
        if not keywords:
            keywords = animal_keywords

        def _matches(example):
            # Case-insensitive substring match against the prompt text.
            return any(keyword in example["prompt"].lower() for keyword in keywords)

        # --- 3. Collect indices of matching rows ---
        # BUG FIX: the previous version enumerated the *filtered* stream, which
        # yields consecutive indices 0..k-1 rather than positions in the source
        # dataset, so the later .select() grabbed the first k rows regardless
        # of content. Enumerate the unfiltered stream and test each row.
        matching_indices = [i for i, example in enumerate(dataset) if _matches(example)]

        if not matching_indices:
            return "No matching examples found with the provided keywords.", None

        # --- 4. Create the subset using .select() on a non-streaming load ---
        full_dataset = load_dataset(dataset_name, split=split_name, streaming=False)
        subset_dataset = full_dataset.select(matching_indices)

        # --- 5. Define features so the pushed subset has a consistent schema ---
        features = Features({
            'prompt': Value(dtype='string', id=None),
            'audio': Audio(sampling_rate=16000),  # adjust if the source uses a different rate
            'strategy': Value(dtype='string', id=None),
            'seed': Value(dtype='int64', id=None)
        })

        try:
            subset_dataset = subset_dataset.cast(features)  # Cast to ensure features match
        except Exception as e:
            return f"An error occurred during casting; please ensure the selected dataset has the correct columns: {e}", None

        # --- 6. Create the target repository and upload the subset ---
        try:
            # exist_ok=True makes creation idempotent, replacing the fragile
            # string-matching on "Repo already exists" in the error message.
            create_repo(new_dataset_repo_id, token=hf_token, repo_type="dataset", exist_ok=True)
        except Exception as e:
            return f"Error creating repository: {e}", None

        # Pass the token explicitly: the Space process has no cached login.
        subset_dataset.push_to_hub(new_dataset_repo_id, token=hf_token)
        dataset_url = f"https://huggingface.co/datasets/{new_dataset_repo_id}"
        return f"Subset dataset uploaded successfully! {len(matching_indices)} Examples Found", dataset_url

    except Exception as e:
        return f"An error occurred: {e}", None
|
88 |
+
|
89 |
+
|
90 |
+
# --- Gradio Interface ---
|
91 |
with gr.Blocks() as demo:
|
92 |
gr.Markdown("# Dataset Filter and Push")
|
93 |
|
|
|
112 |
filter_and_push,
|
113 |
inputs=[dataset_name_input, split_name_input, keywords_input, new_dataset_repo_id_input, hf_token_input],
|
114 |
outputs=[output_text, dataset_output_link],
|
115 |
+
)
|
116 |
+
|
117 |
+
# Start the Gradio server only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
|
119 |
+
|
120 |
+
|
121 |
+
Key changes and explanations:
|
122 |
+
|
123 |
+
Gradio Integration: The code is now structured to work within a Gradio interface. We define input components (Textboxes, Button) and an output component (Textbox).
|
124 |
+
|
125 |
+
filter_and_push Function: The core logic is encapsulated in a function. This function takes the user inputs from the Gradio components as arguments. This is crucial for Gradio to work correctly.
|
126 |
+
|
127 |
+
Error Handling: The code includes comprehensive try...except blocks to catch potential errors, such as:
|
128 |
+
|
129 |
+
Invalid Hugging Face token.
|
130 |
+
|
131 |
+
Problems loading the dataset.
|
132 |
+
|
133 |
+
Repository creation errors (checks if the repo already exists).
|
134 |
+
|
135 |
+
Issues during the dataset upload.
|
136 |
+
|
137 |
+
No matching examples are found.
|
138 |
+
|
139 |
+
Casting error
|
140 |
+
These errors are reported to the user through the Gradio output Textbox, providing helpful feedback.
|
141 |
+
|
142 |
+
Keyword Processing: The keywords_input is now a single Textbox where users can enter comma-separated keywords. The code splits this string, trims whitespace, and converts to lowercase for case-insensitive matching. An empty keyword list defaults to the original animal keywords.
|
143 |
+
|
144 |
+
HF Token Handling: The HF token is now an input field (with type="password" for security). It's passed directly to the filter_and_push function.
|
145 |
+
|
146 |
+
Return Values for Gradio: The filter_and_push function now returns two values:
|
147 |
+
|
148 |
+
A status message (string) to display in the output_text Textbox.
|
149 |
+
|
150 |
+
The dataset URL (string) or None to the dataset output link.
|
151 |
+
|
152 |
+
Dataset URL Output: After a successful upload, the URL of the newly created dataset is displayed in the dataset_output_link Textbox, making it easy for the user to access their filtered dataset.
|
153 |
+
|
154 |
+
Clearer Instructions and Labels: The Gradio interface has descriptive labels for each input field, making it user-friendly.
|
155 |
+
|
156 |
+
if __name__ == "__main__": block: This standard Python construct ensures that demo.launch() is only called when the script is run directly (not when imported as a module).
|
157 |
+
|
158 |
+
No Hardcoded Values: All user-configurable parameters are now taken as inputs through the Gradio interface, making the space more flexible.
|
159 |
+
|
160 |
+
How to run this:
|
161 |
+
|
162 |
+
Install Libraries:
|
163 |
+
|
164 |
+
pip install gradio datasets huggingface_hub
|
165 |
+
(Run this command in a Bash shell.)
|
171 |
+
|
172 |
+
Save: Save the code as a Python file (e.g., app.py).
|
173 |
+
|
174 |
+
Run:
|
175 |
+
|
176 |
+
python app.py
|
177 |
+
(Run this command in a Bash shell.)
|
183 |
+
|
184 |
+
Open in Browser: Gradio will provide a local URL (usually http://127.0.0.1:7860) that you can open in your web browser.
|
185 |
+
|
186 |
+
Hugging Face Login (Important): While you don't need to use huggingface-cli login with this Gradio app (because you're entering the token directly), you do need a Hugging Face account, and you need to generate an API token with "write" access. You can create a token here: https://huggingface.co/settings/tokens
|
187 |
+
|
188 |
+
This improved version provides a robust and user-friendly Hugging Face Space for filtering and uploading datasets. It handles errors gracefully, provides clear feedback, and follows best practices for Gradio applications.
|