Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
with gr.Blocks() as demo:
|
2 |
gr.Markdown("# Dataset Filter and Push")
|
3 |
|
@@ -22,4 +112,77 @@ with gr.Blocks() as demo:
|
|
22 |
filter_and_push,
|
23 |
inputs=[dataset_name_input, split_name_input, keywords_input, new_dataset_repo_id_input, hf_token_input],
|
24 |
outputs=[output_text, dataset_output_link],
|
25 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from datasets import load_dataset, Features, Value, Audio, Dataset
|
3 |
+
from huggingface_hub import HfApi, create_repo
|
4 |
+
import os
|
5 |
+
|
6 |
+
# --- Configuration ---
# Default keyword list: used as the filter when the user submits no keywords.
animal_keywords = [
    "dog", "cat", "bird", "fish", "horse", "cow", "sheep", "pig",
    "chicken", "duck", "goat", "lion", "tiger", "bear", "elephant",
    "monkey", "zebra", "giraffe", "rhino", "hippo", "crocodile", "snake",
    "frog", "turtle", "lizard", "spider", "ant", "bee", "butterfly",
    "wolf", "fox", "deer", "rabbit", "squirrel", "mouse", "rat",
    "hamster", "guinea pig", "parrot", "owl", "eagle", "hawk", "penguin",
    "dolphin", "whale", "shark", "seal", "octopus", "crab", "lobster",
    "shrimp", "snail", "worm", "kangaroo", "koala", "panda", "sloth",
    "hedgehog", "raccoon", "skunk", "beaver", "otter", "platypus",
    "jaguar", "leopard", "cheetah", "puma", "ostrich", "emu", "flamingo",
    "peacock", "swan", "goose", "turkey", "pigeon", "seagull", "antelope",
    "bison", "buffalo", "camel", "llama", "alpaca", "donkey", "mule",
    "ferret", "mongoose", "meerkat", "wombat", "dingo", "armadillo",
    "badger", "chipmunk", "porcupine",
]
|
21 |
+
|
22 |
+
|
23 |
+
def filter_and_push(dataset_name, split_name, keywords_text, new_dataset_repo_id, hf_token):
    """Filter a Hub dataset by keyword matches in its ``prompt`` column and push the subset.

    Args:
        dataset_name: Repo id of the source dataset (e.g. ``user/dataset``).
        split_name: Split to filter (e.g. ``train``).
        keywords_text: Comma-separated keywords; blank falls back to ``animal_keywords``.
        new_dataset_repo_id: Repo id to create (if needed) for the filtered subset.
        hf_token: Hugging Face API token with write access.

    Returns:
        A ``(status_message, dataset_url_or_None)`` tuple for the Gradio outputs.
    """
    if not hf_token:
        return "Error: Hugging Face token is required. Please provide it.", None

    try:
        # --- 1. Load the dataset in streaming mode (cheap first pass) ---
        dataset = load_dataset(dataset_name, split=split_name, streaming=True)

        # --- 2. Process keywords: split the comma-separated string, strip
        # whitespace, lowercase for case-insensitive matching. Empty input
        # falls back to the default animal keyword list. ---
        keywords = [keyword.strip().lower() for keyword in keywords_text.split(',') if keyword.strip()]
        if not keywords:
            keywords = animal_keywords

        def _matches(example):
            # Case-insensitive substring match against the prompt text.
            return any(keyword in example["prompt"].lower() for keyword in keywords)

        # --- 3. Collect indices of matching rows ---
        # BUG FIX: the previous version enumerated the *filtered* stream, which
        # yields consecutive indices 0..k-1 rather than positions in the source
        # dataset, so the later .select() grabbed the first k rows regardless
        # of content. Enumerate the unfiltered stream and test each row.
        matching_indices = [i for i, example in enumerate(dataset) if _matches(example)]

        if not matching_indices:
            return "No matching examples found with the provided keywords.", None

        # --- 4. Create the subset using .select() on a non-streaming load ---
        full_dataset = load_dataset(dataset_name, split=split_name, streaming=False)
        subset_dataset = full_dataset.select(matching_indices)

        # --- 5. Define features so the pushed subset has a consistent schema ---
        features = Features({
            'prompt': Value(dtype='string', id=None),
            'audio': Audio(sampling_rate=16000),  # adjust if the source uses a different rate
            'strategy': Value(dtype='string', id=None),
            'seed': Value(dtype='int64', id=None)
        })

        try:
            subset_dataset = subset_dataset.cast(features)  # Cast to ensure features match
        except Exception as e:
            return f"An error occurred during casting; please ensure the selected dataset has the correct columns: {e}", None

        # --- 6. Create the target repository and upload the subset ---
        try:
            # exist_ok=True makes creation idempotent, replacing the fragile
            # string-matching on "Repo already exists" in the error message.
            create_repo(new_dataset_repo_id, token=hf_token, repo_type="dataset", exist_ok=True)
        except Exception as e:
            return f"Error creating repository: {e}", None

        # Pass the token explicitly: the Space process has no cached login.
        subset_dataset.push_to_hub(new_dataset_repo_id, token=hf_token)
        dataset_url = f"https://huggingface.co/datasets/{new_dataset_repo_id}"
        return f"Subset dataset uploaded successfully! {len(matching_indices)} Examples Found", dataset_url

    except Exception as e:
        return f"An error occurred: {e}", None
|
88 |
+
|
89 |
+
|
90 |
+
# --- Gradio Interface ---
|
91 |
with gr.Blocks() as demo:
|
92 |
gr.Markdown("# Dataset Filter and Push")
|
93 |
|
|
|
112 |
filter_and_push,
|
113 |
inputs=[dataset_name_input, split_name_input, keywords_input, new_dataset_repo_id_input, hf_token_input],
|
114 |
outputs=[output_text, dataset_output_link],
|
115 |
+
)
|
116 |
+
|
117 |
+
# Start the Gradio server only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
|
119 |
+
|
120 |
+
|
121 |
+
Key changes and explanations:
|
122 |
+
|
123 |
+
Gradio Integration: The code is now structured to work within a Gradio interface. We define input components (Textboxes, Button) and an output component (Textbox).
|
124 |
+
|
125 |
+
filter_and_push Function: The core logic is encapsulated in a function. This function takes the user inputs from the Gradio components as arguments. This is crucial for Gradio to work correctly.
|
126 |
+
|
127 |
+
Error Handling: The code includes comprehensive try...except blocks to catch potential errors, such as:
|
128 |
+
|
129 |
+
Invalid Hugging Face token.
|
130 |
+
|
131 |
+
Problems loading the dataset.
|
132 |
+
|
133 |
+
Repository creation errors (checks if the repo already exists).
|
134 |
+
|
135 |
+
Issues during the dataset upload.
|
136 |
+
|
137 |
+
No matching examples are found.
|
138 |
+
|
139 |
+
Casting error
|
140 |
+
These errors are reported to the user through the Gradio output Textbox, providing helpful feedback.
|
141 |
+
|
142 |
+
Keyword Processing: The keywords_input is now a single Textbox where users can enter comma-separated keywords. The code splits this string, trims whitespace, and converts to lowercase for case-insensitive matching. An empty keyword list defaults to the original animal keywords.
|
143 |
+
|
144 |
+
HF Token Handling: The HF token is now an input field (with type="password" for security). It's passed directly to the filter_and_push function.
|
145 |
+
|
146 |
+
Return Values for Gradio: The filter_and_push function now returns two values:
|
147 |
+
|
148 |
+
A status message (string) to display in the output_text Textbox.
|
149 |
+
|
150 |
+
The dataset URL (string) or None to the dataset output link.
|
151 |
+
|
152 |
+
Dataset URL Output: After a successful upload, the URL of the newly created dataset is displayed in the dataset_output_link Textbox, making it easy for the user to access their filtered dataset.
|
153 |
+
|
154 |
+
Clearer Instructions and Labels: The Gradio interface has descriptive labels for each input field, making it user-friendly.
|
155 |
+
|
156 |
+
if __name__ == "__main__": block: This standard Python construct ensures that demo.launch() is only called when the script is run directly (not when imported as a module).
|
157 |
+
|
158 |
+
No Hardcoded Values: All user-configurable parameters are now taken as inputs through the Gradio interface, making the space more flexible.
|
159 |
+
|
160 |
+
How to run this:
|
161 |
+
|
162 |
+
Install Libraries:
|
163 |
+
|
164 |
+
pip install gradio datasets huggingface_hub
|
165 |
+
(Run this command in a Bash shell.)
|
171 |
+
|
172 |
+
Save: Save the code as a Python file (e.g., app.py).
|
173 |
+
|
174 |
+
Run:
|
175 |
+
|
176 |
+
python app.py
|
177 |
+
(Run this command in a Bash shell.)
|
183 |
+
|
184 |
+
Open in Browser: Gradio will provide a local URL (usually http://127.0.0.1:7860) that you can open in your web browser.
|
185 |
+
|
186 |
+
Hugging Face Login (Important): While you don't need to use huggingface-cli login with this Gradio app (because you're entering the token directly), you do need a Hugging Face account, and you need to generate an API token with "write" access. You can create a token here: https://huggingface.co/settings/tokens
|
187 |
+
|
188 |
+
This improved version provides a robust and user-friendly Hugging Face Space for filtering and uploading datasets. It handles errors gracefully, provides clear feedback, and follows best practices for Gradio applications.
|