Spaces:
Running
Running
File size: 3,654 Bytes
d2b9031 49e25d2 ff86828 d2b9031 0df8fba 1fd0c30 0df8fba 90f89f0 0df8fba 2bf1e25 90f89f0 0df8fba 2bf1e25 90f89f0 0df8fba 90f89f0 0df8fba 90f89f0 7773ef1 49e25d2 90f89f0 0df8fba 2bf1e25 90f89f0 fcd8f70 90f89f0 0df8fba 90f89f0 49e25d2 72dd3ca ff86828 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
import os
import tempfile
from io import BytesIO
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import requests
# Seconds to wait on the Hugging Face server before abandoning a download, so a
# stalled connection cannot block a worker indefinitely.
_DOWNLOAD_TIMEOUT_SECONDS = 60


def _target_format(name):
    """Return ``(converted_format, output_name)`` for *name*, or ``None``.

    The decision is made purely from the (case-insensitive) file extension:
    ``.csv`` inputs are converted to Parquet and ``.parquet`` inputs to CSV.
    """
    lowered = name.lower()
    if lowered.endswith(".csv"):
        return "Parquet", "output.parquet"
    if lowered.endswith(".parquet"):
        return "CSV", "output.csv"
    return None


def _read_upload(input_file):
    """Return ``(display_name, target, raw_bytes)`` for an uploaded file.

    *input_file* may be a filesystem path (``str`` / ``os.PathLike``) or an
    object exposing ``.name`` and ``.read()``; both shapes are accepted so the
    function works regardless of which one the caller hands over.
    """
    is_path = isinstance(input_file, (str, os.PathLike))
    path = os.fspath(input_file) if is_path else input_file.name

    # Validate the extension before touching the file contents.
    target = _target_format(path)
    if target is None:
        raise ValueError("Uploaded file must have a .csv or .parquet extension.")

    if is_path:
        with open(path, "rb") as handle:
            payload = handle.read()
    else:
        payload = input_file.read()
    return os.path.basename(path), target, payload


def _download_hf_file(file_url):
    """Return ``(display_name, target, raw_bytes)`` for a Hugging Face file URL."""
    url = file_url.strip()
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    # Check the parsed host name rather than searching the raw string, so that
    # "evil.example/huggingface.co/x.csv" or "huggingface.co.evil.example" are
    # rejected instead of being fetched by the server.
    parsed = urlparse(url)
    host = (parsed.hostname or "").lower()
    if host != "huggingface.co" and not host.endswith(".huggingface.co"):
        raise ValueError("Please provide a URL from Hugging Face datasets.")

    # Look at the path only, so a query string such as "?download=true" does
    # not hide the extension; fail before spending time on the download.
    target = _target_format(parsed.path)
    if target is None:
        raise ValueError("The URL must point to a .csv or .parquet file.")

    response = requests.get(url, timeout=_DOWNLOAD_TIMEOUT_SECONDS)
    response.raise_for_status()
    return parsed.path.split("/")[-1], target, response.content


def convert_hf_dataset(input_file, file_url):
    """Convert a CSV file to Parquet, or a Parquet file to CSV.

    The input comes either from an uploaded file or from a Hugging Face URL;
    when both are supplied the uploaded file takes precedence. The input format
    is determined from the file extension and the data is written out in the
    opposite format.

    Args:
        input_file: Uploaded file, given as a filesystem path or as an object
            with ``.name`` and ``.read()``; ``None`` when nothing was uploaded.
        file_url: URL of a ``.csv`` or ``.parquet`` file hosted on
            huggingface.co (the scheme may be omitted); may be ``None`` or
            blank when a file was uploaded.

    Returns:
        A ``(output_path, info_message)`` tuple: the path of the converted
        file and a text summary containing the input name, the converted
        format and a preview of the first 10 rows.

    Raises:
        ValueError: If neither input is provided, the URL host is not
            huggingface.co, or the extension is neither .csv nor .parquet.
        requests.HTTPError: If the download returns an error status.
    """
    if input_file is None and (file_url is None or file_url.strip() == ""):
        raise ValueError("Please provide an uploaded file or a Hugging Face dataset URL.")

    if input_file is not None:
        source, target, payload = _read_upload(input_file)
    else:
        source, target, payload = _download_hf_file(file_url)
    converted_format, output_name = target

    buffer = BytesIO(payload)
    if converted_format == "Parquet":
        df = pd.read_csv(buffer)
    else:
        df = pd.read_parquet(buffer)

    # Write into a fresh directory per call: a fixed path in the working
    # directory would let concurrent requests overwrite each other's output.
    # NOTE: these directories are not removed here; cleanup is left to the host.
    output_file = os.path.join(tempfile.mkdtemp(prefix="hf_convert_"), output_name)
    if converted_format == "Parquet":
        df.to_parquet(output_file, index=False)
    else:
        df.to_csv(output_file, index=False)

    preview = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {source}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )
    return output_file, info_message
# Gradio UI: two optional inputs (an upload or a URL) are passed positionally to
# convert_hf_dataset, and its two return values fill the download widget and the
# preview text box in that order.
demo = gr.Interface(
    fn=convert_hf_dataset,
    inputs=[
        gr.File(label="Uploaded File (Optional)"),
        gr.Textbox(
            label="Hugging Face Dataset URL (Optional)",
            # The example uses the Hub's "resolve/<revision>/" route, which is
            # the URL shape that returns the raw file contents.
            placeholder="e.g., huggingface.co/datasets/username/dataset/resolve/main/filename.csv",
        ),
    ],
    outputs=[
        gr.File(label="Converted File"),
        gr.Textbox(label="Preview (Top 10 Rows)", lines=15),
    ],
    title="Hugging Face CSV <-> Parquet Converter",
    description=(
        "Upload a file or enter the URL of a Hugging Face dataset file. "
        "The app automatically detects the file type (.csv or .parquet), converts it to the opposite format, "
        "and displays a preview of the top 10 rows."
    ),
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|