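"""Gradio Space for filtering a Hugging Face dataset by keyword and pushing the
matching subset to a new dataset repo on the Hub.

Expected source schema: a text "prompt" column plus "chosen"/"rejected" Audio
columns (a DPO-style audio-preference layout).
"""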
import io

import gradio as gr
import pandas as pd
from datasets import Audio, Dataset, Features, Value, load_dataset
from huggingface_hub import create_repo
def filter_dataset(dataset_name, split_name, keywords_text):
    """Filters a dataset based on keywords and returns a Pandas DataFrame."""
    try:
        # --- 1. Load the dataset in streaming mode ---
        dataset = load_dataset(dataset_name, split=split_name, streaming=True)

        # --- 2. Filter the dataset (streaming) ---
        keywords = [keyword.strip().lower() for keyword in keywords_text.split(',') if keyword.strip()]
        if not keywords:
            return pd.DataFrame(), "Error: No keywords provided."

        # Filtering function that tolerates rows without a "prompt" column
        def filter_func(example):
            prompt_value = example.get("prompt", "")  # Default to empty string
            return any(keyword in prompt_value.lower() for keyword in keywords)

        filtered_dataset = dataset.filter(filter_func)

        # --- 3. Collect matching examples ---
        def audio_fields(value):
            # Keep only the JSON-serializable parts of a decoded Audio example
            # (the samples and their sampling rate) so the DataFrame survives
            # the round trip through the hidden gr.JSON component below.
            if isinstance(value, dict) and value.get("array") is not None:
                array = value["array"]
                return {
                    "array": array.tolist() if hasattr(array, "tolist") else array,
                    "sampling_rate": value.get("sampling_rate"),
                }
            return None

        data_for_df = []
        for example in filtered_dataset:
            # Extract fields defensively; .get() avoids KeyErrors on malformed rows.
            data_for_df.append({
                'prompt': example.get('prompt', None),
                'chosen': audio_fields(example.get('chosen')),
                'rejected': audio_fields(example.get('rejected')),
            })

        if not data_for_df:
            return pd.DataFrame(), "No matching examples found."

        # --- 4. Create Pandas DataFrame ---
        df = pd.DataFrame(data_for_df)
        return df, f"Found {len(df)} matching examples."
    except Exception as e:
        return pd.DataFrame(), f"An error occurred: {e}"
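# Usage sketch outside the UI (hypothetical dataset id, assuming it has the
# prompt/chosen/rejected schema described in the module docstring):
#   df, status = filter_dataset("your-user/audio-dpo-dataset", "train", "dog, cat")
#   print(status)  # e.g. "Found 12 matching examples."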
def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token):
    """Pushes the filtered DataFrame (via the hidden JSON component) to the Hugging Face Hub.

    dataset_name and split_name are accepted to match the Gradio wiring but are
    not needed for the upload itself.
    """
    if not hf_token:
        return "Error: Hugging Face Token is required.", None
    try:
        # gr.JSON hands the stored value back as a parsed Python object,
        # while DataFrame.to_json() produced a string; handle both cases.
        if isinstance(df_json, str):
            df = pd.read_json(io.StringIO(df_json))
        else:
            df = pd.DataFrame(df_json)
        if df.empty:
            return "Error: Cannot push an empty dataset.", None

        # Convert DataFrame to Hugging Face Dataset
        dataset = Dataset.from_pandas(df)

        # --- 5. Define features (for a consistent schema) ---
        features = Features({
            'prompt': Value(dtype='string'),
            'chosen': Audio(sampling_rate=16000),    # Assuming 16 kHz; adjust if needed
            'rejected': Audio(sampling_rate=16000),  # Assuming 16 kHz
        })
        try:
            dataset = dataset.cast(features)
        except Exception as e:
            return f"An error occurred during casting: {e}", None

        # --- 6. Upload to the Hugging Face Hub ---
        # exist_ok=True makes repo creation idempotent, so re-pushing works.
        create_repo(new_dataset_repo_id, token=hf_token, repo_type="dataset", exist_ok=True)
        dataset.push_to_hub(new_dataset_repo_id, token=hf_token)
        dataset_url = f"https://huggingface.co/datasets/{new_dataset_repo_id}"
        return "Subset dataset uploaded successfully!", dataset_url
    except Exception as e:
        return f"An error occurred during push: {e}", None
# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown("# Dataset Filter and Push")

    with gr.Row():
        # Default is a placeholder; point this at a dataset that actually has
        # prompt/chosen/rejected columns before filtering.
        dataset_name_input = gr.Textbox(label="Source Dataset Name", value="ashraq/esc50")
        split_name_input = gr.Textbox(label="Split Name", value="train")

    keywords_input = gr.Textbox(label="Keywords (comma-separated)", value="dog, cat")
    filter_button = gr.Button("Filter Dataset")

    # Display the filtered data; 'label' is important for presentation.
    filtered_data_output = gr.Dataframe(label="Filtered Data")
    filter_status_output = gr.Textbox(label="Filter Status")

    with gr.Row():
        new_dataset_repo_id_input = gr.Textbox(label="New Dataset Repo ID")
        hf_token_input = gr.Textbox(label="Hugging Face Token", type="password")

    push_button = gr.Button("Push to Hub")
    push_status_output = gr.Textbox(label="Push Status")
    dataset_url_output = gr.Textbox(label="Dataset URL")  # Display the new dataset's URL

    # Hidden component to store the filtered dataset (as JSON)
    filtered_data_json = gr.JSON(visible=False)

    # Connect the filter button; .then() chains the JSON snapshot after filtering
    filter_button.click(
        filter_dataset,
        inputs=[dataset_name_input, split_name_input, keywords_input],
        outputs=[filtered_data_output, filter_status_output],
    ).then(
        lambda df: df.to_json(),  # Serialize the DataFrame for the hidden component
        inputs=[filtered_data_output],
        outputs=[filtered_data_json],
    )

    # Connect the push button
    push_button.click(
        push_to_hub,
        inputs=[filtered_data_json, dataset_name_input, split_name_input,
                new_dataset_repo_id_input, hf_token_input],
        outputs=[push_status_output, dataset_url_output],
    )

if __name__ == "__main__":
    demo.launch()
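# Run locally with `python app.py` (assuming this file is app.py) and open the
# printed URL; on Spaces, the app is launched automatically.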