import os
import tempfile
from io import BytesIO
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import requests


# Seconds to wait on the remote download before failing instead of hanging the request.
_DOWNLOAD_TIMEOUT_SECONDS = 30


def _conversion_plan(filename):
    """Map a file name to (pandas reader, target format label, output file name), or None."""
    lowered = filename.lower()
    if lowered.endswith(".csv"):
        return pd.read_csv, "Parquet", "output.parquet"
    if lowered.endswith(".parquet"):
        return pd.read_parquet, "CSV", "output.csv"
    return None


def _trusted_hf_url(file_url):
    """Return (absolute URL, last path segment) for a huggingface.co URL, else raise ValueError."""
    url = file_url.strip()
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    # Validate the parsed host rather than searching the raw string: a substring
    # test accepts "evil.example/huggingface.co/x.csv" or "huggingface.co@evil.example".
    try:
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. unbalanced IPv6 brackets).
        parsed, host = None, ""
    if host != "huggingface.co" and not host.endswith(".huggingface.co"):
        raise ValueError("Please provide a URL from Hugging Face datasets.")

    # Use the path only, so "?download=true" or a "#fragment" does not hide the extension.
    return url, parsed.path.rsplit("/", 1)[-1]


def convert_hf_dataset(input_file, file_url):
    """Convert a CSV file to Parquet, or a Parquet file to CSV, and preview it.

    The input is either an uploaded file or a Hugging Face dataset file URL;
    when both are given the upload wins. The source format is taken from the
    file extension, and the extension is checked before any file or network
    I/O is performed.

    Args:
        input_file: The upload, or None. Either a filesystem path
            (str / os.PathLike) or a file-like object exposing ``.name`` and
            ``.read()`` returning bytes. Both forms are accepted because the
            value Gradio's File component hands over depends on its version
            and ``type`` setting.
        file_url: URL of a ``.csv`` or ``.parquet`` file on huggingface.co, or
            None / blank. A missing scheme defaults to ``https://``.

    Returns:
        A ``(output_path, info_message)`` tuple. ``output_path`` points to
        ``output.parquet`` or ``output.csv`` inside a fresh temporary
        directory; ``info_message`` names the input, the converted format,
        and shows the first 10 rows.

    Raises:
        ValueError: If neither input is provided, the URL host is not
            huggingface.co, or the extension is not ``.csv`` / ``.parquet``.
        requests.RequestException: If the download fails, times out, or
            returns an HTTP error status.
    """
    if input_file is None and not (file_url and file_url.strip()):
        raise ValueError("Please provide an uploaded file or a Hugging Face dataset URL.")

    if input_file is not None:
        is_path = isinstance(input_file, (str, os.PathLike))
        source = os.fspath(input_file) if is_path else input_file.name
        plan = _conversion_plan(source)
        if plan is None:
            raise ValueError("Uploaded file must have a .csv or .parquet extension.")
        if is_path:
            with open(source, "rb") as handle:
                payload = handle.read()
        else:
            payload = input_file.read()
    else:
        url, source = _trusted_hf_url(file_url)
        plan = _conversion_plan(source)
        if plan is None:
            raise ValueError("The URL must point to a .csv or .parquet file.")
        # A bounded timeout keeps a stalled remote from pinning a worker forever.
        response = requests.get(url, timeout=_DOWNLOAD_TIMEOUT_SECONDS)
        response.raise_for_status()
        payload = response.content

    reader, converted_format, output_name = plan
    df = reader(BytesIO(payload))

    # Write into a per-call directory: a fixed path in the working directory
    # would let concurrent requests overwrite each other's result.
    output_file = os.path.join(tempfile.mkdtemp(prefix="hf_convert_"), output_name)
    if converted_format == "Parquet":
        df.to_parquet(output_file, index=False)
    else:
        df.to_csv(output_file, index=False)

    preview = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {source}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )

    return output_file, info_message


# Web UI wiring: two optional inputs (an upload and a URL) feed convert_hf_dataset,
# whose two return values fill the download slot and the preview box, in that order.
demo = gr.Interface(
    convert_hf_dataset,
    [
        gr.File(label="Uploaded File (Optional)"),
        gr.Textbox(
            label="Hugging Face Dataset URL (Optional)",
            placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv",
        ),
    ],
    [
        gr.File(label="Converted File"),
        gr.Textbox(label="Preview (Top 10 Rows)", lines=15),
    ],
    title="Hugging Face CSV <-> Parquet Converter",
    description=(
        "Upload a file or enter the URL of a Hugging Face dataset file. "
        "The app automatically detects the file type (.csv or .parquet), converts it to the opposite format, "
        "and displays a preview of the top 10 rows."
    ),
)


if __name__ == "__main__":
    # Launch the app only when run as a script, not when imported.
    demo.launch()