File size: 5,716 Bytes
0552607
 
 
a6596f7
0552607
 
52420e4
 
0552607
 
 
 
 
 
 
52420e4
0552607
cc93bc7
 
 
 
 
 
 
0552607
 
 
52420e4
0552607
 
a6596f7
52420e4
a6596f7
 
 
52420e4
 
0552607
 
52420e4
 
 
 
 
 
 
 
 
 
 
 
 
a6596f7
52420e4
 
 
 
a6596f7
52420e4
a6596f7
 
52420e4
 
0552607
 
 
a6596f7
0552607
a6596f7
 
 
 
 
 
0552607
a6596f7
0552607
a6596f7
0552607
52420e4
 
0552607
 
 
 
52420e4
0552607
 
52420e4
0552607
52420e4
0552607
 
52420e4
0552607
d2bb5af
 
 
 
a6596f7
52420e4
d2bb5af
52420e4
d2bb5af
52420e4
d2bb5af
52420e4
 
 
d2bb5af
 
52420e4
 
d2bb5af
52420e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2bb5af
52420e4
 
 
 
 
0552607
 
 
9e2a193
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import gradio as gr
from datasets import load_dataset, Features, Value, Audio, Dataset
from huggingface_hub import HfApi, create_repo
import pandas as pd


def filter_dataset(dataset_name, split_name, keywords_text):
    """Filter a streamed Hugging Face dataset by keywords in its 'prompt' field.

    Args:
        dataset_name: Hub id of the source dataset (e.g. "user/name").
        split_name: Split to stream (e.g. "train").
        keywords_text: Comma-separated keywords; matching is case-insensitive
            substring search against each example's 'prompt'.

    Returns:
        Tuple of (pandas.DataFrame, status message). The DataFrame is empty
        on error or when nothing matches.
    """
    # Validate input BEFORE any network access, so bad input fails fast.
    keywords = [kw.strip().lower() for kw in keywords_text.split(',') if kw.strip()]
    if not keywords:
        return pd.DataFrame(), "Error: No keywords provided."

    try:
        # Streaming mode: never materializes the full dataset in memory.
        dataset = load_dataset(dataset_name, split=split_name, streaming=True)

        def filter_func(example):
            # 'prompt' may be absent OR present-but-None; normalize to ""
            # so .lower() is always safe.
            prompt_value = example.get("prompt") or ""
            return any(keyword in prompt_value.lower() for keyword in keywords)

        filtered_dataset = dataset.filter(filter_func)

        # Collect matching rows. 'chosen'/'rejected' appear to be audio dicts
        # carrying an 'array' key — TODO confirm against the source schema.
        data_for_df = []
        for example in filtered_dataset:
            chosen = example.get('chosen')
            rejected = example.get('rejected')
            data_for_df.append({
                'prompt': example.get('prompt'),
                'chosen': chosen.get('array') if isinstance(chosen, dict) else None,
                'rejected': rejected.get('array') if isinstance(rejected, dict) else None,
            })

        if not data_for_df:
            return pd.DataFrame(), "No matching examples found."

        df = pd.DataFrame(data_for_df)
        return df, f"Found {len(df)} matching examples."

    except Exception as e:
        # Surface any failure (bad repo id, auth, schema) to the UI textbox.
        return pd.DataFrame(), f"An error occurred: {e}"


def push_to_hub(df_json, dataset_name, split_name, new_dataset_repo_id, hf_token):
    """Push a previously filtered subset to the Hugging Face Hub.

    Args:
        df_json: The filtered data as produced by ``DataFrame.to_json()``.
            May arrive as a JSON string or, depending on how the gr.JSON
            component round-trips it, an already-parsed mapping — both are
            accepted (TODO confirm which form gr.JSON actually delivers).
        dataset_name: Source dataset id (unused; kept for interface
            compatibility with the Gradio wiring).
        split_name: Source split name (unused; see above).
        new_dataset_repo_id: Target repo id, e.g. "user/my-subset".
        hf_token: Hugging Face access token with write permission.

    Returns:
        Tuple of (status message, dataset URL or None).
    """
    if not hf_token:
        return "Error: Hugging Face Token is required.", None

    try:
        from io import StringIO  # local import: only needed in this function

        # Rebuild the DataFrame. Wrap strings in StringIO — passing a raw
        # JSON string to read_json is deprecated in recent pandas.
        if isinstance(df_json, str):
            df = pd.read_json(StringIO(df_json))
        else:
            df = pd.DataFrame(df_json)

        if df.empty:
            return "Error: Cannot push an empty dataset", None

        dataset = Dataset.from_pandas(df)

        # Fixed schema so consumers get consistent feature types.
        features = Features({
            'prompt': Value(dtype='string', id=None),
            'chosen': Audio(sampling_rate=16000),    # assumed 16 kHz — TODO confirm
            'rejected': Audio(sampling_rate=16000),  # assumed 16 kHz — TODO confirm
        })

        try:
            dataset = dataset.cast(features)
        except Exception as e:
            return f"An error occurred during casting: {e}", None

        # exist_ok=True replaces fragile matching on the error-message text
        # when the repository already exists.
        create_repo(new_dataset_repo_id, token=hf_token,
                    repo_type="dataset", exist_ok=True)

        # Pass the token explicitly so we don't depend on a cached login.
        dataset.push_to_hub(new_dataset_repo_id, token=hf_token)
        dataset_url = f"https://huggingface.co/datasets/{new_dataset_repo_id}"
        return "Subset dataset uploaded successfully!", dataset_url

    except Exception as e:
        return f"An error occurred during push: {e}", None
# --- Gradio Interface ---
# NOTE: component creation order determines the on-screen layout, and the
# event wiring below references these component variables — keep the order.
with gr.Blocks() as demo:
    gr.Markdown("# Dataset Filter and Push")

    # Source dataset selection.
    with gr.Row():
        dataset_name_input = gr.Textbox(label="Source Dataset Name", value="ashraq/esc50") # Example with chosen/rejected
        split_name_input = gr.Textbox(label="Split Name", value="train")

    # Comma-separated keywords; matched case-insensitively against 'prompt'.
    keywords_input = gr.Textbox(label="Keywords (comma-separated)", value="dog, cat")

    filter_button = gr.Button("Filter Dataset")

    #  Display the filtered data.  'label' is important for presentation.
    filtered_data_output = gr.Dataframe(label="Filtered Data")
    filter_status_output = gr.Textbox(label="Filter Status")

    # Target repository and credentials for the push step.
    with gr.Row():
        new_dataset_repo_id_input = gr.Textbox(label="New Dataset Repo ID")
        hf_token_input = gr.Textbox(label="Hugging Face Token", type="password")

    push_button = gr.Button("Push to Hub")
    push_status_output = gr.Textbox(label="Push Status")
    dataset_url_output = gr.Textbox(label="Dataset URL")  # Display the dataset URL

    # Hidden component to store the filtered dataset (as JSON) so the push
    # handler can receive it without re-running the filter.
    filtered_data_json = gr.JSON(visible=False)

    # Connect the filter button: show the DataFrame + status, then stash the
    # DataFrame as JSON in the hidden component for the push step.
    filter_button.click(
        filter_dataset,
        inputs=[dataset_name_input, split_name_input, keywords_input],
        outputs=[filtered_data_output, filter_status_output]
    ).then(  # Use .then() to chain actions
        lambda df: df.to_json(),  # Convert DataFrame to JSON
        inputs=[filtered_data_output],
        outputs=[filtered_data_json]  # Store in the hidden JSON component
    )

    # Connect the push button: reads the stashed JSON plus the repo/token
    # fields and reports status + the new dataset's URL.
    push_button.click(
        push_to_hub,
        inputs=[filtered_data_json, dataset_name_input, split_name_input, new_dataset_repo_id_input, hf_token_input],
        outputs=[push_status_output, dataset_url_output]
    )

# Standard script entry point: launch the Gradio app.
if __name__ == "__main__":
    demo.launch()