import os
import tempfile
from io import BytesIO
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import requests

# Connect/read timeout for the download so a stalled server cannot hang a worker.
_REQUEST_TIMEOUT_SECONDS = 30

# Input extension -> (pandas reader, label of the converted format, output file name).
_CONVERSIONS = {
    ".csv": (pd.read_csv, "Parquet", "output.parquet"),
    ".parquet": (pd.read_parquet, "CSV", "output.csv"),
}


def _detect_extension(name):
    """Return the supported extension that *name* ends with, or None."""
    lowered = name.lower()
    for extension in _CONVERSIONS:
        if lowered.endswith(extension):
            return extension
    return None


def _upload_path(input_file):
    """Return the filesystem path / name of an upload given as a path or file object."""
    if isinstance(input_file, (str, os.PathLike)):
        return os.fspath(input_file)
    return input_file.name


def _read_upload(input_file, path):
    """Return the raw bytes of an upload, preferring the file on disk."""
    if isinstance(input_file, (str, os.PathLike)) or os.path.isfile(path):
        # Reading from disk does not depend on the state (position, closed)
        # of whatever handle the caller passed in.
        with open(path, "rb") as handle:
            return handle.read()
    # File-like object that is not backed by a file at `path`.
    return input_file.read()


def _normalize_hf_url(file_url):
    """Return (url, parsed_url) for a Hugging Face URL, adding https:// if missing."""
    url = file_url.strip()
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    parsed = urlparse(url)
    host = (parsed.hostname or "").lower()
    # Validate the actual host: a substring test would also accept URLs such
    # as "evil.example/huggingface.co/x.csv".
    if host != "huggingface.co" and not host.endswith(".huggingface.co"):
        raise ValueError("Please provide a URL from Hugging Face datasets.")
    return url, parsed


def convert_hf_dataset(input_file, file_url):
    """Convert a CSV file to Parquet, or a Parquet file to CSV.

    The input is either an uploaded file or a Hugging Face URL; the upload
    takes precedence when both are given. The input format is detected from
    the file extension and the data is written out in the opposite format.

    Args:
        input_file: Uploaded file, either a filesystem path or a file-like
            object exposing ``name`` (and ``read()``), or None.
        file_url: URL of a ``.csv`` or ``.parquet`` file on huggingface.co.
            The scheme may be omitted; a query string is ignored when
            detecting the format. May be None or blank if a file is uploaded.

    Returns:
        A tuple ``(output_file, info_message)``: the path of the converted
        file and a text summary containing a preview of the first 10 rows.

    Raises:
        ValueError: If neither input is provided, the URL is not on
            huggingface.co, or the extension is not .csv or .parquet.
        requests.HTTPError: If the download returns an error status.
    """
    # If no file is provided via upload and URL is empty, raise an error.
    if input_file is None and (not file_url or not file_url.strip()):
        raise ValueError("Please provide an uploaded file or a Hugging Face dataset URL.")

    if input_file is not None:
        # Process the uploaded file.
        source = _upload_path(input_file)
        extension = _detect_extension(source)
        if extension is None:
            raise ValueError("Uploaded file must have a .csv or .parquet extension.")
        payload = _read_upload(input_file, source)
    else:
        # Process the URL input.
        url, parsed = _normalize_hf_url(file_url)
        source = parsed.path.rsplit("/", 1)[-1]
        # Check the path only, so "file.csv?download=true" is still recognized,
        # and do it before downloading to avoid fetching unsupported files.
        extension = _detect_extension(parsed.path)
        if extension is None:
            raise ValueError("The URL must point to a .csv or .parquet file.")
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        payload = response.content

    reader, converted_format, output_name = _CONVERSIONS[extension]
    df = reader(BytesIO(payload))

    # Write into a fresh temporary directory so concurrent requests do not
    # overwrite each other's output; the file name itself stays the same.
    output_file = os.path.join(tempfile.mkdtemp(prefix="hf_convert_"), output_name)

    # Convert the file: if CSV, convert to Parquet; if Parquet, convert to CSV.
    if converted_format == "Parquet":
        df.to_parquet(output_file, index=False)
    else:
        df.to_csv(output_file, index=False)

    # Create a preview (top 10 rows) of the DataFrame.
    preview = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {source}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )
    return output_file, info_message


# Gradio UI: optional file upload plus optional URL in, converted file plus preview out.
demo = gr.Interface(
    fn=convert_hf_dataset,
    inputs=[
        gr.File(label="Uploaded File (Optional)"),
        gr.Textbox(
            label="Hugging Face Dataset URL (Optional)",
            placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv",
        ),
    ],
    outputs=[
        gr.File(label="Converted File"),
        gr.Textbox(label="Preview (Top 10 Rows)", lines=15),
    ],
    title="Hugging Face CSV <-> Parquet Converter",
    description=(
        "Upload a file or enter the URL of a Hugging Face dataset file. "
        "The app automatically detects the file type (.csv or .parquet), converts it to the opposite format, "
        "and displays a preview of the top 10 rows."
    ),
)

if __name__ == "__main__":
    demo.launch()