# Spaces: Sleeping
# (Hugging Face Spaces status banner captured with the source; kept as a comment)
import gradio as gr
from datasets import load_dataset, Features, Value, Audio, Dataset
from huggingface_hub import HfApi, create_repo
import pandas as pd  # pandas is used to build and display the filtered dataset
def filter_dataset(dataset_name, split_name, keywords_text):
    """Filter a Hugging Face dataset by prompt keywords.

    Args:
        dataset_name: Hub repo id of the source dataset (e.g. "declare-lab/audio-alpaca").
        split_name: split to stream (e.g. "train").
        keywords_text: comma-separated keywords; an example matches if ANY
            keyword appears (case-insensitive) in its "prompt" field.

    Returns:
        (DataFrame, status message). The DataFrame has columns
        prompt / strategy / seed / audio, where "audio" holds the decoded
        NumPy sample array. On error the DataFrame is empty and the message
        describes the failure.
    """
    try:
        # Stream so we never have to download the full dataset up front.
        dataset = load_dataset(dataset_name, split=split_name, streaming=True)

        keywords = [kw.strip().lower() for kw in keywords_text.split(',') if kw.strip()]
        if not keywords:
            return pd.DataFrame(), "Error: No keywords provided."

        filtered_dataset = dataset.filter(
            lambda example: any(keyword in example["prompt"].lower() for keyword in keywords)
        )

        # Collect matching rows; the row count itself serves as the match
        # count (the original kept a separate, redundant index list).
        data_for_df = []
        for example in filtered_dataset:
            data_for_df.append({
                'prompt': example['prompt'],
                'strategy': example['strategy'],
                'seed': example['seed'],
                # Decode eagerly: keep only the raw NumPy sample array.
                'audio': example['audio']['array'],
            })

        if not data_for_df:
            return pd.DataFrame(), "No matching examples found."

        df = pd.DataFrame(data_for_df)
        return df, f"Found {len(df)} matching examples."

    except Exception as e:
        # Surface any failure (bad repo id, missing split, schema mismatch)
        # to the UI instead of crashing the app.
        return pd.DataFrame(), f"An error occurred: {e}"
def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token):
    """Push a filtered subset (serialized to JSON by the UI) to the HF Hub.

    Args:
        df_json: JSON string produced by DataFrame.to_json() holding the
            filtered rows (prompt / strategy / seed / audio).
        dataset_name, split_name: source dataset coordinates. Retained for
            interface compatibility with the UI wiring; no longer used here
            (the original re-downloaded the entire source dataset only to
            check it was non-empty, which is wasteful and unnecessary).
        new_dataset_repo_id: destination dataset repo id ("user/name").
        hf_token: Hugging Face access token with write permission.

    Returns:
        (status message, dataset URL or None).
    """
    if not hf_token:
        return "Error: Hugging Face token is required.", None
    try:
        # Rebuild the DataFrame from its JSON carrier.
        df = pd.read_json(df_json)
        if df.empty:
            return "Error: Cannot push an empty dataset", None

        # The filter step stored only the raw sample array; the Audio()
        # cast below expects {'array': ..., 'sampling_rate': ...} dicts,
        # so re-attach the sampling rate here.
        df['audio'] = df['audio'].map(
            lambda a: a if isinstance(a, dict) else {'array': a, 'sampling_rate': 16000}
        )

        dataset = Dataset.from_pandas(df)

        # Explicit features keep the uploaded schema consistent with the
        # source dataset regardless of what pandas inferred.
        features = Features({
            'prompt': Value(dtype='string', id=None),
            'audio': Audio(sampling_rate=16000),
            'strategy': Value(dtype='string', id=None),
            'seed': Value(dtype='int64', id=None)
        })
        try:
            dataset = dataset.cast(features)
        except Exception as e:
            return f"An error occurred: {e}", None

        # exist_ok=True replaces the original's fragile string-match on the
        # "Repo already exists" error message.
        try:
            create_repo(new_dataset_repo_id, token=hf_token, repo_type="dataset", exist_ok=True)
        except Exception as e:
            return f"Error creating repository: {e}", None

        # Pass the token explicitly rather than relying on a cached login.
        dataset.push_to_hub(new_dataset_repo_id, token=hf_token)
        dataset_url = f"https://huggingface.co/datasets/{new_dataset_repo_id}"
        return "Subset dataset uploaded successfully!", dataset_url

    except Exception as e:
        return f"An error occurred during push: {e}", None
# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown("# Dataset Filter and Push")

    # Source-dataset selection controls.
    with gr.Row():
        dataset_name_input = gr.Textbox(label="Source Dataset Name", value="declare-lab/audio-alpaca")
        split_name_input = gr.Textbox(label="Split Name", value="train")

    keywords_input = gr.Textbox(label="Keywords (comma-separated)", value="dog, cat")
    filter_button = gr.Button("Filter Dataset")

    # Filter results panel.
    filtered_data_output = gr.Dataframe(label="Filtered Data")
    filter_status_output = gr.Textbox(label="Filter Status")

    # Destination repo + credentials.
    with gr.Row():
        new_dataset_repo_id_input = gr.Textbox(label="New Dataset Repo ID")
        hf_token_input = gr.Textbox(label="Hugging Face Token", type="password")

    push_button = gr.Button("Push to Hub")
    push_status_output = gr.Textbox(label="Push Status")
    dataset_url_output = gr.Textbox(label="Dataset URL")

    # Hidden carrier: the filtered DataFrame serialized to JSON, so the
    # push handler receives the exact rows the user filtered.
    filtered_data_json = gr.JSON(visible=False)

    # Filter, then stash the result as JSON for the push step.
    filter_button.click(
        filter_dataset,
        inputs=[dataset_name_input, split_name_input, keywords_input],
        outputs=[filtered_data_output, filter_status_output],
    ).then(
        lambda df: df.to_json(),
        inputs=[filtered_data_output],
        outputs=[filtered_data_json],
    )

    # Upload the stashed subset to the Hub.
    push_button.click(
        push_to_hub,
        inputs=[
            filtered_data_json,
            dataset_name_input,
            split_name_input,
            new_dataset_repo_id_input,
            hf_token_input,
        ],
        outputs=[push_status_output, dataset_url_output],
    )

if __name__ == "__main__":
    demo.launch()