Bradarr commited on
Commit
0552607
·
verified ·
1 Parent(s): d2bb5af

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +164 -1
app.py CHANGED
@@ -1,3 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  with gr.Blocks() as demo:
2
  gr.Markdown("# Dataset Filter and Push")
3
 
@@ -22,4 +112,77 @@ with gr.Blocks() as demo:
22
  filter_and_push,
23
  inputs=[dataset_name_input, split_name_input, keywords_input, new_dataset_repo_id_input, hf_token_input],
24
  outputs=[output_text, dataset_output_link],
25
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from datasets import load_dataset, Features, Value, Audio, Dataset
3
+ from huggingface_hub import HfApi, create_repo
4
+ import os
5
+
6
# --- Configuration --- (Moved inside functions where needed, for Gradio)
# Default keyword list used when the user supplies no keywords of their own.
# Matching is a case-insensitive substring test against the 'prompt' column.
animal_keywords = [
    # pets, farm & backyard
    "dog", "cat", "bird", "fish", "horse", "cow", "sheep", "pig", "chicken",
    "duck", "goat",
    # large wild mammals, reptiles & amphibians
    "lion", "tiger", "bear", "elephant", "monkey", "zebra",
    "giraffe", "rhino", "hippo", "crocodile", "snake", "frog", "turtle",
    "lizard",
    # insects & small critters
    "spider", "ant", "bee", "butterfly",
    "wolf", "fox", "deer",
    "rabbit", "squirrel", "mouse", "rat", "hamster", "guinea pig", "parrot",
    # birds of prey & marine life
    "owl", "eagle", "hawk", "penguin", "dolphin", "whale", "shark", "seal",
    "octopus", "crab", "lobster", "shrimp", "snail", "worm",
    # marsupials & miscellaneous
    "kangaroo", "koala",
    "panda", "sloth", "hedgehog", "raccoon", "skunk", "beaver", "otter",
    "platypus", "jaguar", "leopard", "cheetah", "puma", "ostrich", "emu",
    "flamingo", "peacock", "swan", "goose", "turkey", "pigeon", "seagull", "antelope",
    "bison", "buffalo", "camel", "llama", "alpaca", "donkey", "mule", "ferret",
    "mongoose", "meerkat", "wombat", "dingo", "armadillo", "badger", "chipmunk", "porcupine",
]
21
+
22
+
23
def filter_and_push(dataset_name, split_name, keywords_text, new_dataset_repo_id, hf_token):
    """Filter a Hub dataset by keyword matches in its 'prompt' column and push the subset.

    Args:
        dataset_name: Source dataset repo id (e.g. "user/dataset").
        split_name: Split to filter (e.g. "train").
        keywords_text: Comma-separated keywords; blank falls back to animal_keywords.
        new_dataset_repo_id: Destination dataset repo id on the Hub.
        hf_token: Hugging Face API token with write access.

    Returns:
        A (status_message, dataset_url_or_None) tuple for the Gradio outputs.
    """
    if not hf_token:
        return "Error: Hugging Face token is required. Please provide it.", None

    try:
        # Process keywords: split the comma-separated string, strip whitespace,
        # lowercase for case-insensitive matching.
        keywords = [keyword.strip().lower() for keyword in keywords_text.split(',') if keyword.strip()]
        if not keywords:
            keywords = animal_keywords  # default keyword set

        # --- 1. Stream the dataset once to find the ORIGINAL indices of matches ---
        # BUG FIX: the previous version enumerated a .filter()-ed stream, which
        # yields positions within the *filtered* stream (0, 1, 2, ...), not
        # positions in the source dataset — so .select() grabbed the first N
        # rows instead of the matching rows. Enumerate the raw stream and apply
        # the predicate ourselves so the indices refer to the source dataset.
        stream = load_dataset(dataset_name, split=split_name, streaming=True)
        matching_indices = [
            i for i, example in enumerate(stream)
            if any(keyword in example["prompt"].lower() for keyword in keywords)
        ]

        if not matching_indices:
            return "No matching examples found with the provided keywords.", None

        # --- 2. Materialize the subset with .select() on a non-streaming load ---
        full_dataset = load_dataset(dataset_name, split=split_name, streaming=False)
        subset_dataset = full_dataset.select(matching_indices)

        # --- 3. Cast to a fixed schema so uploads are consistent across runs ---
        features = Features({
            'prompt': Value(dtype='string', id=None),
            'audio': Audio(sampling_rate=16000),  # adjust sampling rate if needed
            'strategy': Value(dtype='string', id=None),
            'seed': Value(dtype='int64', id=None)
        })
        try:
            subset_dataset = subset_dataset.cast(features)
        except Exception as e:
            return f"An error occurred during casting please ensure that the dataset selected has the correct columns: {e}", None

        # --- 4. Create the destination repository ---
        # exist_ok=True replaces the fragile 'Repo already exists' substring
        # check on the exception message. (The unused HfApi instance is gone;
        # create_repo/push_to_hub take the token directly.)
        try:
            create_repo(new_dataset_repo_id, token=hf_token, repo_type="dataset", exist_ok=True)
        except Exception as e:
            return f"Error creating repository: {e}", None

        # --- 5. Upload the subset to the Hugging Face Hub ---
        # Pass the token explicitly: the original relied on an ambient
        # huggingface-cli login that a fresh Space/session may not have.
        subset_dataset.push_to_hub(new_dataset_repo_id, token=hf_token)
        dataset_url = f"https://huggingface.co/datasets/{new_dataset_repo_id}"
        return f"Subset dataset uploaded successfully! {len(matching_indices)} Examples Found", dataset_url

    except Exception as e:
        return f"An error occurred: {e}", None
88
+
89
+
90
+ # --- Gradio Interface ---
91
  with gr.Blocks() as demo:
92
  gr.Markdown("# Dataset Filter and Push")
93
 
 
112
  filter_and_push,
113
  inputs=[dataset_name_input, split_name_input, keywords_input, new_dataset_repo_id_input, hf_token_input],
114
  outputs=[output_text, dataset_output_link],
115
+ )
116
+
117
+ if __name__ == "__main__":
118
+ demo.launch()
119
+
120
+
121
+ Key changes and explanations:
122
+
123
+ Gradio Integration: The code is now structured to work within a Gradio interface. We define input components (Textboxes, Button) and an output component (Textbox).
124
+
125
+ filter_and_push Function: The core logic is encapsulated in a function. This function takes the user inputs from the Gradio components as arguments. This is crucial for Gradio to work correctly.
126
+
127
+ Error Handling: The code includes comprehensive try...except blocks to catch potential errors, such as:
128
+
129
+ Invalid Hugging Face token.
130
+
131
+ Problems loading the dataset.
132
+
133
+ Repository creation errors (checks if the repo already exists).
134
+
135
+ Issues during the dataset upload.
136
+
137
+ No matching examples are found.
138
+
139
+ Casting errors, when the dataset's columns do not match the expected schema.
140
+ These errors are reported to the user through the Gradio output Textbox, providing helpful feedback.
141
+
142
+ Keyword Processing: The keywords_input is now a single Textbox where users can enter comma-separated keywords. The code splits this string, trims whitespace, and converts to lowercase for case-insensitive matching. An empty keyword list defaults to the original animal keywords.
143
+
144
+ HF Token Handling: The HF token is now an input field (with type="password" for security). It's passed directly to the filter_and_push function.
145
+
146
+ Return Values for Gradio: The filter_and_push function now returns two values:
147
+
148
+ A status message (string) to display in the output_text Textbox.
149
+
150
+ The dataset URL (string) or None to the dataset output link.
151
+
152
+ Dataset URL Output: After a successful upload, the URL of the newly created dataset is displayed in the dataset_output_link Textbox, making it easy for the user to access their filtered dataset.
153
+
154
+ Clearer Instructions and Labels: The Gradio interface has descriptive labels for each input field, making it user-friendly.
155
+
156
+ if __name__ == "__main__": block: This standard Python construct ensures that demo.launch() is only called when the script is run directly (not when imported as a module).
157
+
158
+ No Hardcoded Values: All user-configurable parameters are now taken as inputs through the Gradio interface, making the space more flexible.
159
+
160
+ How to run this:
161
+
162
+ Install Libraries:
163
+
164
+ pip install gradio datasets huggingface_hub
165
+ IGNORE_WHEN_COPYING_START
166
+ content_copy
167
+ download
168
+ Use code with caution.
169
+ Bash
170
+ IGNORE_WHEN_COPYING_END
171
+
172
+ Save: Save the code as a Python file (e.g., app.py).
173
+
174
+ Run:
175
+
176
+ python app.py
177
+ IGNORE_WHEN_COPYING_START
178
+ content_copy
179
+ download
180
+ Use code with caution.
181
+ Bash
182
+ IGNORE_WHEN_COPYING_END
183
+
184
+ Open in Browser: Gradio will provide a local URL (usually http://127.0.0.1:7860) that you can open in your web browser.
185
+
186
+ Hugging Face Login (Important): While you don't need to use huggingface-cli login with this Gradio app (because you're entering the token directly), you do need a Hugging Face account, and you need to generate an API token with "write" access. You can create a token here: https://huggingface.co/settings/tokens
187
+
188
+ This improved version provides a robust and user-friendly Hugging Face Space for filtering and uploading datasets. It handles errors gracefully, provides clear feedback, and follows best practices for Gradio applications.