# Scraped Hugging Face page header (not program code):
#   openfree's picture
#   Update app.py
#   0df8fba verified
#   raw
#   history blame
#   3.65 kB
from io import BytesIO
from urllib.parse import urlsplit

import gradio as gr
import pandas as pd
import requests
# Seconds to wait on the remote server before aborting a download, so a
# stalled connection cannot block the app indefinitely.
_DOWNLOAD_TIMEOUT_SECONDS = 30


def _conversion_plan(name):
    """Map a file name to ``(reader, converted_format, output_file)``.

    The match is a case-insensitive test on the ``.csv`` / ``.parquet``
    suffix. Returns ``None`` when *name* has neither suffix.
    """
    lowered = name.lower()
    if lowered.endswith(".csv"):
        return pd.read_csv, "Parquet", "output.parquet"
    if lowered.endswith(".parquet"):
        return pd.read_parquet, "CSV", "output.csv"
    return None


def _read_upload(input_file):
    """Return the raw bytes of an upload given as a file object or a path."""
    read = getattr(input_file, "read", None)
    if callable(read):
        return read()
    # No ``read`` method: treat the value as a path string, or as an object
    # whose ``name`` attribute holds the path.
    path = input_file if isinstance(input_file, str) else input_file.name
    with open(path, "rb") as handle:
        return handle.read()


def _normalize_hf_url(file_url):
    """Return ``(url, path)`` for a Hugging Face URL, adding ``https://`` if needed.

    Raises:
        ValueError: If the URL's host is not ``huggingface.co`` or one of its
            subdomains.
    """
    url = file_url.strip()
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    parts = urlsplit(url)
    host = (parts.hostname or "").lower()
    # Compare the parsed host rather than searching the whole string, so that
    # "huggingface.co" appearing in a path or query of another site is rejected.
    if host != "huggingface.co" and not host.endswith(".huggingface.co"):
        raise ValueError("Please provide a URL from Hugging Face datasets.")
    return url, parts.path


def convert_hf_dataset(input_file, file_url):
    """Convert a CSV file to Parquet, or a Parquet file to CSV.

    The input is either an uploaded file or a Hugging Face URL; the upload
    takes precedence when both are given. The direction of the conversion is
    chosen from the file extension.

    Args:
        input_file: The uploaded file, or ``None``. May be an object exposing
            ``name`` and ``read()``, or a path string.
        file_url: URL of a ``.csv`` / ``.parquet`` file hosted on
            ``huggingface.co`` (the scheme is optional), or ``None`` / blank.

    Returns:
        A ``(output_file, info_message)`` tuple: the path of the converted
        file (``"output.parquet"`` or ``"output.csv"``) and a text summary
        containing the input name, the target format and the first 10 rows.

    Raises:
        ValueError: If neither input is provided, the extension is not
            ``.csv`` / ``.parquet``, or the URL is not on ``huggingface.co``.
        requests.HTTPError: If the download returns an error status.
    """
    if input_file is None and not (file_url and file_url.strip()):
        raise ValueError("Please provide an uploaded file or a Hugging Face dataset URL.")

    if input_file is not None:
        source = getattr(input_file, "name", None) or str(input_file)
        plan = _conversion_plan(source)
        if plan is None:
            raise ValueError("Uploaded file must have a .csv or .parquet extension.")
        payload = _read_upload(input_file)
    else:
        url, url_path = _normalize_hf_url(file_url)
        # Inspect only the path component so a query string such as
        # "?download=true" does not hide the extension, and validate before
        # downloading so unsupported files are never fetched.
        plan = _conversion_plan(url_path)
        if plan is None:
            raise ValueError("The URL must point to a .csv or .parquet file.")
        source = url_path.rsplit("/", 1)[-1]
        response = requests.get(url, timeout=_DOWNLOAD_TIMEOUT_SECONDS)
        response.raise_for_status()
        payload = response.content

    reader, converted_format, output_file = plan
    df = reader(BytesIO(payload))

    # NOTE(review): the output is a fixed file name in the current working
    # directory, so overlapping conversions would overwrite each other's
    # result -- confirm the app only runs one conversion at a time.
    if converted_format == "Parquet":
        df.to_parquet(output_file, index=False)
    else:
        df.to_csv(output_file, index=False)

    preview = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {source}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )
    return output_file, info_message
# Input widgets, listed in the positional order of convert_hf_dataset's
# parameters (uploaded file first, then URL).
_upload_input = gr.File(label="Uploaded File (Optional)")
_url_input = gr.Textbox(
    label="Hugging Face Dataset URL (Optional)",
    placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv",
)

# Output widgets, matching the (output_file, info_message) return tuple.
_converted_output = gr.File(label="Converted File")
_preview_output = gr.Textbox(label="Preview (Top 10 Rows)", lines=15)

_DESCRIPTION = (
    "Upload a file or enter the URL of a Hugging Face dataset file. "
    "The app automatically detects the file type (.csv or .parquet), converts it to the opposite format, "
    "and displays a preview of the top 10 rows."
)

# Gradio interface wiring the converter function to the widgets above.
demo = gr.Interface(
    fn=convert_hf_dataset,
    inputs=[_upload_input, _url_input],
    outputs=[_converted_output, _preview_output],
    title="Hugging Face CSV <-> Parquet Converter",
    description=_DESCRIPTION,
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()