# Hugging Face Space app (Space status when this listing was captured: Running).
import os
import tempfile
from io import BytesIO
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import requests
def convert_hf_dataset(input_file, file_url):
    """Convert a CSV file to Parquet, or a Parquet file to CSV.

    The input is either an uploaded file or a URL of a file hosted on
    huggingface.co. An upload takes precedence when both are supplied. The
    direction of the conversion is chosen from the file extension.

    Args:
        input_file: The uploaded file, or None. May be a file-like object
            exposing ``.name`` and ``.read()``, or a filesystem path string
            (optionally carrying a ``.name`` attribute).
        file_url: URL of a ``.csv`` or ``.parquet`` file on huggingface.co,
            or None/blank. A missing scheme defaults to ``https://``.

    Returns:
        A ``(output_path, info_message)`` tuple: the path of the converted
        file and a text summary containing a preview of the first 10 rows.

    Raises:
        ValueError: If neither input is given, the extension is not
            ``.csv``/``.parquet``, or the URL host is not huggingface.co.
        requests.HTTPError: If the download returns an error status.
    """
    supported = (".csv", ".parquet")

    has_url = bool(file_url and file_url.strip())
    if input_file is None and not has_url:
        raise ValueError("Please provide an uploaded file or a Hugging Face dataset URL.")

    if input_file is not None:
        # The upload may arrive as a file-like object or as a plain path
        # string, so resolve the name without assuming either one.
        source = str(getattr(input_file, "name", input_file))
        extension = os.path.splitext(source)[1].lower()
        if extension not in supported:
            raise ValueError("Uploaded file must have a .csv or .parquet extension.")
        if hasattr(input_file, "read"):
            data = BytesIO(input_file.read())
        else:
            data = source  # pandas reads directly from a path
    else:
        url = file_url.strip()
        if not url.lower().startswith(("http://", "https://")):
            url = "https://" + url
        parsed = urlparse(url)
        # Compare the real hostname: a substring test would also accept
        # e.g. "https://evil.example/huggingface.co/data.csv".
        host = (parsed.hostname or "").lower()
        if host != "huggingface.co" and not host.endswith(".huggingface.co"):
            raise ValueError("Please provide a URL from Hugging Face datasets.")
        # Use only the path component so query strings such as
        # "?download=true" do not hide the extension.
        source = parsed.path.rsplit("/", 1)[-1]
        extension = os.path.splitext(source)[1].lower()
        if extension not in supported:
            raise ValueError("The URL must point to a .csv or .parquet file.")
        # Validate before downloading; the timeout keeps a stalled
        # connection from blocking the worker indefinitely.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        data = BytesIO(response.content)

    if extension == ".csv":
        df = pd.read_csv(data)
        converted_format, output_name = "Parquet", "output.parquet"
    else:
        df = pd.read_parquet(data)
        converted_format, output_name = "CSV", "output.csv"

    # Write into a fresh per-call directory so overlapping requests cannot
    # overwrite each other's result; the file name itself stays the same.
    output_file = os.path.join(tempfile.mkdtemp(prefix="hf_convert_"), output_name)
    if converted_format == "Parquet":
        df.to_parquet(output_file, index=False)
    else:
        df.to_csv(output_file, index=False)

    # Create a preview (top 10 rows) of the DataFrame.
    preview = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {source}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )
    return output_file, info_message
# Gradio UI: two optional inputs (an uploaded file or a Hugging Face URL)
# feed convert_hf_dataset, which yields the converted file and a text preview.
demo = gr.Interface(
    title="Hugging Face CSV <-> Parquet Converter",
    description=(
        "Upload a file or enter the URL of a Hugging Face dataset file. The app "
        "automatically detects the file type (.csv or .parquet), converts it to "
        "the opposite format, and displays a preview of the top 10 rows."
    ),
    fn=convert_hf_dataset,
    # Order matches the (input_file, file_url) parameters of the callback.
    inputs=[
        gr.File(label="Uploaded File (Optional)"),
        gr.Textbox(
            placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv",
            label="Hugging Face Dataset URL (Optional)",
        ),
    ],
    # Order matches the (output_file, info_message) tuple the callback returns.
    outputs=[
        gr.File(label="Converted File"),
        gr.Textbox(lines=15, label="Preview (Top 10 Rows)"),
    ],
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()