Datasets-Convertor

Running

File size: 3,654 Bytes

d2b9031
49e25d2
 
ff86828
d2b9031
0df8fba
 
 
 
 
 
 
 
 
 
 
1fd0c30
0df8fba
 
 
90f89f0
0df8fba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2bf1e25
90f89f0
0df8fba
 
 
2bf1e25
90f89f0
0df8fba
90f89f0
 
 
0df8fba
90f89f0
7773ef1
49e25d2
90f89f0
0df8fba
 
 
 
 
 
 
2bf1e25
90f89f0
 
fcd8f70
90f89f0
 
0df8fba
 
 
90f89f0
49e25d2
72dd3ca
 
ff86828

import os
import tempfile
from io import BytesIO
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import requests

# Seconds to wait on the remote server before abandoning a download, so a
# stalled connection cannot block a worker forever.
_DOWNLOAD_TIMEOUT_SECONDS = 30

# Input extension -> (label of the converted format, name of the converted file).
_CONVERSION_TARGETS = {
    ".csv": ("Parquet", "output.parquet"),
    ".parquet": ("CSV", "output.csv"),
}


def _extension_of(name):
    """Return the lower-cased extension of *name*, dot included ('' if none)."""
    return os.path.splitext(name)[1].lower()


def _describe_upload(input_file):
    """Return ``(source, path)`` for an upload given as a path or a file object.

    ``source`` is the name reported back to the user. ``path`` is a readable
    filesystem path when one is available, otherwise ``None`` (the caller then
    falls back to ``input_file.read()``).
    """
    # NOTE(review): depending on the Gradio version the upload may arrive as a
    # path string or as a file object exposing ``.name`` -- confirm against the
    # installed Gradio. Both shapes are accepted here.
    if isinstance(input_file, (str, os.PathLike)):
        path = os.fspath(input_file)
        return path, path
    name = str(getattr(input_file, "name", ""))
    return name, (name if os.path.isfile(name) else None)


def _normalise_hub_url(file_url):
    """Return ``(url, file_name)`` after checking the URL targets huggingface.co.

    A missing scheme defaults to ``https://``. The host is validated on the
    parsed URL rather than by substring search, so a foreign host that merely
    mentions "huggingface.co" in its path or query string is rejected.
    """
    url = file_url.strip()
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    parsed = urlparse(url)
    host = parsed.hostname or ""
    if host != "huggingface.co" and not host.endswith(".huggingface.co"):
        raise ValueError("Please provide a URL from Hugging Face datasets.")
    # Use the path component only, so a query string such as "?download=true"
    # does not hide the file extension.
    return url, parsed.path.rsplit("/", 1)[-1]


def convert_hf_dataset(input_file, file_url):
    """Convert a CSV file to Parquet, or a Parquet file to CSV.

    The input is either an uploaded file or a URL of a file hosted on
    huggingface.co. The input format is detected from the file extension and
    the data is written out in the opposite format. An uploaded file takes
    precedence over the URL when both are supplied.

    Args:
        input_file: The uploaded file (a path or a file object with a
            ``name``), or ``None`` when no file was uploaded.
        file_url: URL of a ``.csv`` or ``.parquet`` file on huggingface.co,
            with or without a scheme. May be ``None`` or blank when a file
            is uploaded.

    Returns:
        A tuple ``(output_file, info_message)``: the path of the converted
        file, and a text summary with the input name, the converted format
        and a preview of the first 10 rows.

    Raises:
        ValueError: If neither input is provided, the URL host is not
            huggingface.co, or the extension is neither ``.csv`` nor
            ``.parquet``.
        requests.HTTPError: If the download returns an error status.
    """
    has_url = bool(file_url and file_url.strip())
    if input_file is None and not has_url:
        raise ValueError("Please provide an uploaded file or a Hugging Face dataset URL.")

    if input_file is not None:
        source, upload_path = _describe_upload(input_file)
        extension = _extension_of(source)
        # Validate before touching the file so unsupported uploads fail fast.
        if extension not in _CONVERSION_TARGETS:
            raise ValueError("Uploaded file must have a .csv or .parquet extension.")
        if upload_path is not None:
            data = upload_path
        else:
            data = BytesIO(input_file.read())
    else:
        file_url, source = _normalise_hub_url(file_url)
        extension = _extension_of(source)
        # Validate before downloading so no bandwidth is spent on a file that
        # would be rejected anyway.
        if extension not in _CONVERSION_TARGETS:
            raise ValueError("The URL must point to a .csv or .parquet file.")
        response = requests.get(file_url, timeout=_DOWNLOAD_TIMEOUT_SECONDS)
        response.raise_for_status()
        data = BytesIO(response.content)

    df = pd.read_csv(data) if extension == ".csv" else pd.read_parquet(data)

    converted_format, output_name = _CONVERSION_TARGETS[extension]
    # Each call writes into its own temporary directory so simultaneous
    # requests cannot overwrite one another's result; the file name itself
    # stays "output.<ext>". The directory is left in place because the caller
    # still has to serve the file.
    output_file = os.path.join(tempfile.mkdtemp(prefix="dataset_convert_"), output_name)
    if converted_format == "Parquet":
        df.to_parquet(output_file, index=False)
    else:
        df.to_csv(output_file, index=False)

    # Create a preview (top 10 rows) of the DataFrame.
    preview = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {source}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )

    return output_file, info_message

# Input widgets: an optional file upload and an optional Hugging Face file URL.
_upload_widget = gr.File(label="Uploaded File (Optional)")
_url_widget = gr.Textbox(
    label="Hugging Face Dataset URL (Optional)",
    placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv",
)

# Output widgets: the converted file and a text preview of its first rows.
_converted_widget = gr.File(label="Converted File")
_preview_widget = gr.Textbox(label="Preview (Top 10 Rows)", lines=15)

_APP_TITLE = "Hugging Face CSV <-> Parquet Converter"
_APP_DESCRIPTION = (
    "Upload a file or enter the URL of a Hugging Face dataset file. "
    "The app automatically detects the file type (.csv or .parquet), converts it to the opposite format, "
    "and displays a preview of the top 10 rows."
)

# The Gradio interface that wires the widgets above to convert_hf_dataset.
demo = gr.Interface(
    fn=convert_hf_dataset,
    inputs=[_upload_widget, _url_widget],
    outputs=[_converted_widget, _preview_widget],
    title=_APP_TITLE,
    description=_APP_DESCRIPTION,
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()