# app.py (2.32 kB) -- copied from the Hugging Face file viewer.
# Last commit: "Update app.py" (90f89f0, verified) by openfree.
import tempfile
from io import BytesIO
from pathlib import Path
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import requests
def convert_hf_dataset(file_url: str):
    """Download a CSV or Parquet file from Hugging Face and convert it to the other format.

    A ``.csv`` input is converted to Parquet; a ``.parquet`` input is
    converted to CSV.

    Args:
        file_url: URL of a file hosted on ``huggingface.co`` (or one of its
            subdomains). The scheme may be omitted, in which case
            ``https://`` is assumed. The URL *path* must end with ``.csv``
            or ``.parquet`` (case-insensitive); any query string or
            fragment is ignored when detecting the file type.

    Returns:
        A tuple ``(output_file, info_message)`` where ``output_file`` is the
        path (``str``) of the converted file and ``info_message`` is a text
        summary containing a preview of the first 10 rows.

    Raises:
        ValueError: If the URL host is not ``huggingface.co`` or a subdomain
            of it, or if the URL path does not end with ``.csv`` or
            ``.parquet``.
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
    """
    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block this worker indefinitely.
    download_timeout = 30

    file_url = file_url.strip()

    # Ensure the URL has a scheme; urlparse only recognises the host part
    # when one is present.
    if not file_url.lower().startswith(("http://", "https://")):
        file_url = "https://" + file_url

    # Validate the actual host instead of searching the whole URL for a
    # substring: "https://evil.example/huggingface.co/x.csv" must be rejected.
    parsed = urlparse(file_url)
    host = parsed.hostname or ""
    if host != "huggingface.co" and not host.endswith(".huggingface.co"):
        raise ValueError("Please provide a URL from Hugging Face datasets.")

    # Detect the file type from the path only, so that URLs such as
    # ".../data.csv?download=true" are accepted. This happens before the
    # download so unsupported URLs fail fast without any network traffic.
    url_path = parsed.path
    lowered_path = url_path.lower()
    if lowered_path.endswith(".csv"):
        source_is_csv = True
    elif lowered_path.endswith(".parquet"):
        source_is_csv = False
    else:
        raise ValueError("The URL must point to a .csv or .parquet file.")

    # Download the content from the URL.
    response = requests.get(file_url, timeout=download_timeout)
    response.raise_for_status()
    payload = BytesIO(response.content)

    # Write into a fresh directory per call so that concurrent requests do
    # not overwrite each other's output; the file name itself is unchanged.
    output_dir = Path(tempfile.mkdtemp(prefix="hf_convert_"))

    if source_is_csv:
        df = pd.read_csv(payload)
        output_path = output_dir / "output.parquet"
        df.to_parquet(output_path, index=False)
        converted_format = "Parquet"
    else:
        df = pd.read_parquet(payload)
        output_path = output_dir / "output.csv"
        df.to_csv(output_path, index=False)
        converted_format = "CSV"

    # Create a preview of the top 10 rows.
    preview = df.head(10).to_string(index=False)
    input_name = url_path.rsplit("/", 1)[-1]
    info_message = (
        f"Input file: {input_name}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )
    return str(output_path), info_message
# ---------------------------------------------------------------------------
# Gradio UI: one text input (the file URL) and two outputs (the converted
# file for download and a text preview), wired to convert_hf_dataset.
# ---------------------------------------------------------------------------

# Input widget: the URL of the file to convert.
_url_box = gr.Textbox(
    placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv",
    label="Hugging Face Dataset URL",
)

# Output widgets, in the same order as the values returned by the handler.
_result_widgets = [
    gr.File(label="Converted File"),
    gr.Textbox(lines=15, label="Preview (Top 10 Rows)"),
]

# Help text shown under the title.
_blurb = "".join(
    [
        "Enter the URL of a Hugging Face dataset file (must end with .csv or .parquet). ",
        "The app will automatically detect the file type, convert it to the opposite format, ",
        "and display a preview of the top 10 rows.",
    ]
)

demo = gr.Interface(
    title="Hugging Face CSV <-> Parquet Converter",
    description=_blurb,
    fn=convert_hf_dataset,
    inputs=_url_box,
    outputs=_result_widgets,
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()