Datasets-Convertor

Running

App Files Community

Datasets-Convertor / app.py

openfree

Update app.py

0df8fba verified 5 months ago

raw

history blame

3.65 kB

	import os
	import tempfile
	from io import BytesIO
	from urllib.parse import urlparse

	import gradio as gr
	import pandas as pd
	import requests

	# Upper bound, in seconds, on how long a remote download may block a request.
	_DOWNLOAD_TIMEOUT_SECONDS = 30


	def _detect_conversion(name):
	    """Map a file name to ``(reader, converted_format, output_name)``.

	    Returns ``None`` when *name* ends in neither ``.csv`` nor ``.parquet``
	    (the comparison is case-insensitive).
	    """
	    lowered = name.lower()
	    if lowered.endswith(".csv"):
	        return pd.read_csv, "Parquet", "output.parquet"
	    if lowered.endswith(".parquet"):
	        return pd.read_parquet, "CSV", "output.csv"
	    return None


	def _is_hugging_face_host(hostname):
	    """Return True when *hostname* is huggingface.co or one of its subdomains."""
	    host = (hostname or "").lower()
	    return host == "huggingface.co" or host.endswith(".huggingface.co")


	def convert_hf_dataset(input_file, file_url):
	    """Convert a CSV file to Parquet, or a Parquet file to CSV.

	    The input is either an uploaded file or the URL of a file hosted on
	    huggingface.co. The format is detected from the file extension, and the
	    data is written out in the opposite format.

	    Args:
	        input_file: The uploaded file, or ``None``. Either a filesystem path
	            or an object exposing ``.name`` (and optionally ``.read()``).
	            When given, ``file_url`` is ignored.
	        file_url: URL of a ``.csv`` or ``.parquet`` file on huggingface.co.
	            A missing scheme defaults to ``https://``. Only used when
	            ``input_file`` is ``None``.

	    Returns:
	        A ``(output_path, info_message)`` tuple. ``output_path`` is the path
	        of the converted file (named ``output.parquet`` or ``output.csv``)
	        and ``info_message`` is a text summary containing the input name, the
	        converted format and the first 10 rows.

	    Raises:
	        ValueError: If neither input is provided, the URL host is not
	            huggingface.co, or the file extension is not ``.csv`` /
	            ``.parquet``.
	        requests.RequestException: If the download fails, times out or
	            returns an HTTP error status.
	    """
	    if input_file is None and (file_url is None or file_url.strip() == ""):
	        raise ValueError("Please provide an uploaded file or a Hugging Face dataset URL.")

	    if input_file is not None:
	        # NOTE(review): the upload presumably arrives either as a path string
	        # or as a temp-file wrapper depending on the Gradio version; accept
	        # both instead of assuming ``.name`` and ``.read()`` exist.
	        source = str(getattr(input_file, "name", input_file))
	        conversion = _detect_conversion(source)
	        if conversion is None:
	            raise ValueError("Uploaded file must have a .csv or .parquet extension.")
	        if hasattr(input_file, "read"):
	            payload = BytesIO(input_file.read())
	        else:
	            payload = source  # pandas reads directly from a path
	    else:
	        url = file_url.strip()
	        if not url.lower().startswith(("http://", "https://")):
	            url = "https://" + url

	        # Validate the parsed host rather than searching the raw string:
	        # a substring test would accept "https://evil.example/huggingface.co".
	        parsed = urlparse(url)
	        if not _is_hugging_face_host(parsed.hostname):
	            raise ValueError("Please provide a URL from Hugging Face datasets.")

	        # Use the URL path so a query string ("?download=true") or fragment
	        # does not hide the extension, and reject before downloading.
	        source = parsed.path.rsplit("/", 1)[-1]
	        conversion = _detect_conversion(parsed.path)
	        if conversion is None:
	            raise ValueError("The URL must point to a .csv or .parquet file.")

	        response = requests.get(url, timeout=_DOWNLOAD_TIMEOUT_SECONDS)
	        response.raise_for_status()
	        payload = BytesIO(response.content)

	    reader, converted_format, output_name = conversion
	    df = reader(payload)

	    # Write into a fresh directory per call so concurrent requests cannot
	    # overwrite each other's result; the file name itself stays unchanged.
	    output_dir = tempfile.mkdtemp(prefix="dataset_convert_")
	    output_file = os.path.join(output_dir, output_name)
	    if converted_format == "Parquet":
	        df.to_parquet(output_file, index=False)
	    else:
	        df.to_csv(output_file, index=False)

	    preview = df.head(10).to_string(index=False)
	    info_message = (
	        f"Input file: {source}\n"
	        f"Converted file format: {converted_format}\n\n"
	        f"Preview (Top 10 Rows):\n{preview}"
	    )

	    return output_file, info_message

	# Gradio front end: one optional file upload and one optional URL textbox go
	# in; the converted file and a text preview come out.
	demo = gr.Interface(
	    title="Hugging Face CSV <-> Parquet Converter",
	    description=(
	        "Upload a file or enter the URL of a Hugging Face dataset file. "
	        "The app automatically detects the file type (.csv or .parquet), converts it to the opposite format, "
	        "and displays a preview of the top 10 rows."
	    ),
	    fn=convert_hf_dataset,
	    inputs=[
	        gr.File(label="Uploaded File (Optional)"),
	        gr.Textbox(
	            label="Hugging Face Dataset URL (Optional)",
	            placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv",
	        ),
	    ],
	    outputs=[
	        gr.File(label="Converted File"),
	        gr.Textbox(label="Preview (Top 10 Rows)", lines=15),
	    ],
	)

	# Launch the interface only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()