Spaces:
Running
Running
import os
import tempfile
from io import BytesIO
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import requests
def convert_hf_dataset(file_url: str):
    """Download a CSV/Parquet file from Hugging Face and convert it to the other format.

    A ``.csv`` input is written out as Parquet; a ``.parquet`` input is
    written out as CSV.

    Args:
        file_url: URL of a ``.csv`` or ``.parquet`` file hosted on
            ``huggingface.co`` (or one of its subdomains). If the scheme is
            missing, ``https://`` is prepended.

    Returns:
        A ``(output_file, info_message)`` tuple: the path of the converted
        file, and a text summary that includes a preview of the first 10 rows.

    Raises:
        ValueError: If the URL host is not Hugging Face, or the URL path does
            not end in ``.csv`` or ``.parquet``.
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    file_url = (file_url or "").strip()

    # Ensure the URL has a scheme; if not, add "https://"
    if not file_url.lower().startswith(("http://", "https://")):
        file_url = "https://" + file_url

    # Validate the parsed host rather than searching the raw string: a
    # substring test would accept e.g. "https://evil.example/?x=huggingface.co"
    # and make this server fetch an arbitrary host.
    parsed = urlparse(file_url)
    host = (parsed.hostname or "").lower()
    if host != "huggingface.co" and not host.endswith(".huggingface.co"):
        raise ValueError("Please provide a URL from Hugging Face datasets.")

    # Decide the conversion direction from the URL *path* so that query
    # strings such as "?download=true" do not hide the extension, and do it
    # before downloading so unsupported files are rejected without a request.
    url_path = parsed.path.lower()
    if url_path.endswith(".csv"):
        source_is_csv = True
    elif url_path.endswith(".parquet"):
        source_is_csv = False
    else:
        raise ValueError("The URL must point to a .csv or .parquet file.")

    # Download the content; the timeout (seconds) keeps a stalled connection
    # from blocking the worker indefinitely.
    response = requests.get(file_url, timeout=60)
    response.raise_for_status()
    content = response.content

    # Write into a fresh temporary directory so concurrent requests cannot
    # overwrite each other's output; the file name itself is unchanged.
    output_dir = tempfile.mkdtemp(prefix="hf_convert_")
    if source_is_csv:
        df = pd.read_csv(BytesIO(content))
        output_file = os.path.join(output_dir, "output.parquet")
        df.to_parquet(output_file, index=False)
        converted_format = "Parquet"
    else:
        df = pd.read_parquet(BytesIO(content))
        output_file = os.path.join(output_dir, "output.csv")
        df.to_csv(output_file, index=False)
        converted_format = "CSV"

    # Create a preview of the top 10 rows
    preview = df.head(10).to_string(index=False)
    input_name = parsed.path.rsplit("/", 1)[-1]
    info_message = (
        f"Input file: {input_name}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )
    return output_file, info_message
def _build_interface():
    """Assemble the Gradio interface that exposes ``convert_hf_dataset``."""
    # Single text input holding the dataset file URL.
    url_box = gr.Textbox(
        label="Hugging Face Dataset URL",
        placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv",
    )
    # Two outputs, matching the (output_file, info_message) return tuple.
    result_widgets = [
        gr.File(label="Converted File"),
        gr.Textbox(label="Preview (Top 10 Rows)", lines=15),
    ]
    summary = (
        "Enter the URL of a Hugging Face dataset file (must end with .csv or .parquet). "
        "The app will automatically detect the file type, convert it to the opposite format, "
        "and display a preview of the top 10 rows."
    )
    return gr.Interface(
        fn=convert_hf_dataset,
        inputs=url_box,
        outputs=result_widgets,
        title="Hugging Face CSV <-> Parquet Converter",
        description=summary,
    )


demo = _build_interface()

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()