"""Gradio app that converts a Hugging Face dataset file between CSV and Parquet."""

from io import BytesIO
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import requests

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# forever on a stalled connection and the UI worker never returns.
REQUEST_TIMEOUT = (10, 60)


def _normalize_hf_url(file_url: str) -> str:
    """Return *file_url* stripped, with a scheme, verified to be on huggingface.co.

    Raises:
        ValueError: If the URL's host is not huggingface.co or a subdomain.
    """
    url = file_url.strip()

    # Add the scheme first so urlparse sees a real network location;
    # "huggingface.co/x" alone would be parsed as a bare path.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    # Compare the parsed hostname rather than searching the whole string:
    # a substring test also accepts "evil.example/huggingface.co/x.csv" or
    # "huggingface.co.evil.example", letting callers make the server fetch
    # arbitrary hosts.
    host = urlparse(url).hostname or ""
    if host != "huggingface.co" and not host.endswith(".huggingface.co"):
        raise ValueError("Please provide a URL from Hugging Face datasets.")

    return url


def convert_hf_dataset(file_url: str):
    """Download a Hugging Face CSV/Parquet file and convert it to the other format.

    Args:
        file_url: URL of a ``.csv`` or ``.parquet`` file hosted on
            huggingface.co. The scheme may be omitted (``https://`` is
            assumed) and a query string such as ``?download=true`` is allowed.

    Returns:
        A ``(output_file, info_message)`` tuple: the path of the converted
        file written to the working directory (``output.parquet`` or
        ``output.csv``) and a text summary with a preview of the first
        10 rows.

    Raises:
        ValueError: If the URL is not on huggingface.co or does not point to
            a ``.csv`` or ``.parquet`` file.
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    url = _normalize_hf_url(file_url)

    # Detect the type from the URL *path* so a query string or fragment
    # does not hide the extension.
    url_path = urlparse(url).path
    lowered_path = url_path.lower()
    is_csv = lowered_path.endswith(".csv")
    is_parquet = lowered_path.endswith(".parquet")

    # Reject unsupported types before downloading anything.
    if not (is_csv or is_parquet):
        raise ValueError("The URL must point to a .csv or .parquet file.")

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    payload = BytesIO(response.content)

    if is_csv:
        # CSV in -> Parquet out.
        df = pd.read_csv(payload)
        output_file = "output.parquet"
        df.to_parquet(output_file, index=False)
        converted_format = "Parquet"
    else:
        # Parquet in -> CSV out.
        df = pd.read_parquet(payload)
        output_file = "output.csv"
        df.to_csv(output_file, index=False)
        converted_format = "CSV"

    # Create a preview of the top 10 rows.
    preview = df.head(10).to_string(index=False)
    input_name = url_path.rsplit("/", 1)[-1]
    info_message = (
        f"Input file: {input_name}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )
    return output_file, info_message


demo = gr.Interface(
    fn=convert_hf_dataset,
    inputs=gr.Textbox(
        label="Hugging Face Dataset URL",
        placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv",
    ),
    outputs=[
        gr.File(label="Converted File"),
        gr.Textbox(label="Preview (Top 10 Rows)", lines=15),
    ],
    title="Hugging Face CSV <-> Parquet Converter",
    description=(
        "Enter the URL of a Hugging Face dataset file (must end with .csv or .parquet). "
        "The app will automatically detect the file type, convert it to the opposite format, "
        "and display a preview of the top 10 rows."
    ),
)

if __name__ == "__main__":
    demo.launch()