File size: 5,716 Bytes
0552607
 
 
a6596f7
0552607
 
52420e4
 
0552607
 
 
 
 
 
 
52420e4
0552607
cc93bc7
 
 
 
 
 
 
0552607
 
 
52420e4
0552607
 
a6596f7
52420e4
a6596f7
 
 
52420e4
 
0552607
 
52420e4
 
 
 
 
 
 
 
 
 
 
 
 
a6596f7
52420e4
 
 
 
a6596f7
52420e4
a6596f7
 
52420e4
 
0552607
 
 
a6596f7
0552607
a6596f7
 
 
 
 
 
0552607
a6596f7
0552607
a6596f7
0552607
52420e4
 
0552607
 
 
 
52420e4
0552607
 
52420e4
0552607
52420e4
0552607
 
52420e4
0552607
d2bb5af
 
 
 
a6596f7
52420e4
d2bb5af
52420e4
d2bb5af
52420e4
d2bb5af
52420e4
 
 
d2bb5af
 
52420e4
 
d2bb5af
52420e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2bb5af
52420e4
 
 
 
 
0552607
 
 
9e2a193
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import gradio as gr
from datasets import load_dataset, Features, Value, Audio, Dataset
from huggingface_hub import HfApi, create_repo
import pandas as pd


def filter_dataset(dataset_name, split_name, keywords_text):
    """Filter a streamed Hugging Face dataset by keywords in its 'prompt' field.

    Args:
        dataset_name: Hub id of the source dataset (e.g. "user/name").
        split_name: Split to stream (e.g. "train").
        keywords_text: Comma-separated keywords; matching is case-insensitive
            substring search against each example's 'prompt'.

    Returns:
        Tuple of (pandas.DataFrame, status message). The DataFrame is empty
        on error or when nothing matches.
    """
    # Validate input BEFORE any network access, so bad input fails fast.
    keywords = [kw.strip().lower() for kw in keywords_text.split(',') if kw.strip()]
    if not keywords:
        return pd.DataFrame(), "Error: No keywords provided."

    try:
        # Streaming mode: never materializes the full dataset in memory.
        dataset = load_dataset(dataset_name, split=split_name, streaming=True)

        def filter_func(example):
            # 'prompt' may be absent OR present-but-None; normalize to ""
            # so .lower() is always safe.
            prompt_value = example.get("prompt") or ""
            return any(keyword in prompt_value.lower() for keyword in keywords)

        filtered_dataset = dataset.filter(filter_func)

        # Collect matching rows. 'chosen'/'rejected' appear to be audio dicts
        # carrying an 'array' key — TODO confirm against the source schema.
        data_for_df = []
        for example in filtered_dataset:
            chosen = example.get('chosen')
            rejected = example.get('rejected')
            data_for_df.append({
                'prompt': example.get('prompt'),
                'chosen': chosen.get('array') if isinstance(chosen, dict) else None,
                'rejected': rejected.get('array') if isinstance(rejected, dict) else None,
            })

        if not data_for_df:
            return pd.DataFrame(), "No matching examples found."

        df = pd.DataFrame(data_for_df)
        return df, f"Found {len(df)} matching examples."

    except Exception as e:
        # Surface any failure (bad repo id, auth, schema) to the UI textbox.
        return pd.DataFrame(), f"An error occurred: {e}"


def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token):
    """Push a previously filtered subset to the Hugging Face Hub.

    Args:
        df_json: The filtered data as produced by ``DataFrame.to_json()``.
            May arrive as a JSON string or, depending on how the gr.JSON
            component round-trips it, an already-parsed mapping — both are
            accepted (TODO confirm which form gr.JSON actually delivers).
        dataset_name: Source dataset id (unused; kept for interface
            compatibility with the Gradio wiring).
        split_name: Source split name (unused; see above).
        new_dataset_repo_id: Target repo id, e.g. "user/my-subset".
        hf_token: Hugging Face access token with write permission.

    Returns:
        Tuple of (status message, dataset URL or None).
    """
    if not hf_token:
        return "Error: Hugging Face Token is required.", None

    try:
        from io import StringIO  # local import: only needed in this function

        # Rebuild the DataFrame. Wrap strings in StringIO — passing a raw
        # JSON string to read_json is deprecated in recent pandas.
        if isinstance(df_json, str):
            df = pd.read_json(StringIO(df_json))
        else:
            df = pd.DataFrame(df_json)

        if df.empty:
            return "Error: Cannot push an empty dataset", None

        dataset = Dataset.from_pandas(df)

        # Fixed schema so consumers get consistent feature types.
        features = Features({
            'prompt': Value(dtype='string', id=None),
            'chosen': Audio(sampling_rate=16000),    # assumed 16 kHz — TODO confirm
            'rejected': Audio(sampling_rate=16000),  # assumed 16 kHz — TODO confirm
        })

        try:
            dataset = dataset.cast(features)
        except Exception as e:
            return f"An error occurred during casting: {e}", None

        # exist_ok=True replaces fragile matching on the error-message text
        # when the repository already exists.
        create_repo(new_dataset_repo_id, token=hf_token,
                    repo_type="dataset", exist_ok=True)

        # Pass the token explicitly so we don't depend on a cached login.
        dataset.push_to_hub(new_dataset_repo_id, token=hf_token)
        dataset_url = f"https://huggingface.co/datasets/{new_dataset_repo_id}"
        return "Subset dataset uploaded successfully!", dataset_url

    except Exception as e:
        return f"An error occurred during push: {e}", None
# --- Gradio Interface ---
# NOTE: component creation order determines the on-screen layout, and the
# event wiring below references these component variables — keep the order.
with gr.Blocks() as demo:
    gr.Markdown("# Dataset Filter and Push")

    # Source dataset selection.
    with gr.Row():
        dataset_name_input = gr.Textbox(label="Source Dataset Name", value="ashraq/esc50") # Example with chosen/rejected
        split_name_input = gr.Textbox(label="Split Name", value="train")

    # Comma-separated keywords; matched case-insensitively against 'prompt'.
    keywords_input = gr.Textbox(label="Keywords (comma-separated)", value="dog, cat")

    filter_button = gr.Button("Filter Dataset")

    #  Display the filtered data.  'label' is important for presentation.
    filtered_data_output = gr.Dataframe(label="Filtered Data")
    filter_status_output = gr.Textbox(label="Filter Status")

    # Target repository and credentials for the push step.
    with gr.Row():
        new_dataset_repo_id_input = gr.Textbox(label="New Dataset Repo ID")
        hf_token_input = gr.Textbox(label="Hugging Face Token", type="password")

    push_button = gr.Button("Push to Hub")
    push_status_output = gr.Textbox(label="Push Status")
    dataset_url_output = gr.Textbox(label="Dataset URL")  # Display the dataset URL

    # Hidden component to store the filtered dataset (as JSON) so the push
    # handler can receive it without re-running the filter.
    filtered_data_json = gr.JSON(visible=False)

    # Connect the filter button: show the DataFrame + status, then stash the
    # DataFrame as JSON in the hidden component for the push step.
    filter_button.click(
        filter_dataset,
        inputs=[dataset_name_input, split_name_input, keywords_input],
        outputs=[filtered_data_output, filter_status_output]
    ).then(  # Use .then() to chain actions
        lambda df: df.to_json(),  # Convert DataFrame to JSON
        inputs=[filtered_data_output],
        outputs=[filtered_data_json]  # Store in the hidden JSON component
    )

    # Connect the push button: reads the stashed JSON plus the repo/token
    # fields and reports status + the new dataset's URL.
    push_button.click(
        push_to_hub,
        inputs=[filtered_data_json, dataset_name_input, split_name_input, new_dataset_repo_id_input, hf_token_input],
        outputs=[push_status_output, dataset_url_output]
    )

# Standard script entry point: launch the Gradio app.
if __name__ == "__main__":
    demo.launch()