|
from io import BytesIO
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import requests
|
|
|
def convert_hf_dataset(file_url: str):
    """Download a Hugging Face CSV or Parquet file and convert it to the other format.

    A ``.csv`` source is written out as ``output.parquet``; a ``.parquet``
    source is written out as ``output.csv``. The output file is created
    relative to the current working directory and overwritten on each call.

    Args:
        file_url: URL of the file to convert. The scheme is optional
            (``https://`` is assumed when missing). The host must be
            ``huggingface.co`` or one of its subdomains, and the URL path
            (ignoring any query string) must end in ``.csv`` or ``.parquet``.

    Returns:
        A ``(output_file, info_message)`` tuple: the path of the converted
        file and a text summary containing the input file name, the target
        format, and a preview of the first 10 rows.

    Raises:
        ValueError: If the URL is not hosted on huggingface.co or does not
            point to a ``.csv`` / ``.parquet`` file.
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    file_url = file_url.strip()

    # Add the scheme first: without it urlparse cannot identify the host.
    if not file_url.lower().startswith(("http://", "https://")):
        file_url = "https://" + file_url

    # Validate the parsed hostname rather than searching the raw string, so
    # URLs that merely mention "huggingface.co" in their path, query, or as a
    # prefix of another domain cannot make the server fetch arbitrary hosts.
    parsed = urlparse(file_url)
    host = parsed.hostname or ""
    if host != "huggingface.co" and not host.endswith(".huggingface.co"):
        raise ValueError("Please provide a URL from Hugging Face datasets.")

    # Decide the conversion from the URL path only, so links carrying a query
    # string (e.g. "?download=true") are still recognised. Doing this before
    # the request avoids downloading files that cannot be converted.
    url_path = parsed.path
    lowered_path = url_path.lower()
    if lowered_path.endswith(".csv"):
        source_is_csv = True
    elif lowered_path.endswith(".parquet"):
        source_is_csv = False
    else:
        raise ValueError("The URL must point to a .csv or .parquet file.")

    # A timeout keeps an unresponsive server from blocking the app forever.
    response = requests.get(file_url, timeout=30)
    response.raise_for_status()
    payload = BytesIO(response.content)

    if source_is_csv:
        df = pd.read_csv(payload)
        output_file = "output.parquet"
        df.to_parquet(output_file, index=False)
        converted_format = "Parquet"
    else:
        df = pd.read_parquet(payload)
        output_file = "output.csv"
        df.to_csv(output_file, index=False)
        converted_format = "CSV"

    preview = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {url_path.rsplit('/', 1)[-1]}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )

    return output_file, info_message
|
|
|
# --- Gradio UI wiring -------------------------------------------------------
# Components are created in the same order as they are handed to the
# Interface: the URL input first, then the two outputs.
_url_input = gr.Textbox(
    label="Hugging Face Dataset URL",
    placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv",
)

_result_outputs = [
    gr.File(label="Converted File"),
    gr.Textbox(label="Preview (Top 10 Rows)", lines=15),
]

# Help text shown under the title; the sentences are joined with single spaces.
_description = " ".join(
    [
        "Enter the URL of a Hugging Face dataset file (must end with .csv or .parquet).",
        "The app will automatically detect the file type, convert it to the opposite format,",
        "and display a preview of the top 10 rows.",
    ]
)

# Single-function app: one URL in, converted file plus text preview out.
demo = gr.Interface(
    fn=convert_hf_dataset,
    inputs=_url_input,
    outputs=_result_outputs,
    title="Hugging Face CSV <-> Parquet Converter",
    description=_description,
)


# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|
|