File size: 2,319 Bytes
d2b9031
49e25d2
 
ff86828
d2b9031
90f89f0
 
 
 
 
2bf1e25
90f89f0
 
 
1fd0c30
90f89f0
 
 
 
 
 
 
 
 
2bf1e25
 
90f89f0
 
 
 
2bf1e25
 
90f89f0
 
 
2bf1e25
90f89f0
2bf1e25
90f89f0
 
 
 
 
2bf1e25
90f89f0
7773ef1
49e25d2
90f89f0
 
 
 
 
2bf1e25
90f89f0
 
fcd8f70
90f89f0
 
 
 
 
 
49e25d2
72dd3ca
 
ff86828
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import tempfile
from io import BytesIO
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import requests

def convert_hf_dataset(file_url: str):
    """Download a Hugging Face CSV/Parquet file and convert it to the other format.

    A ``.csv`` input is written out as Parquet and a ``.parquet`` input is
    written out as CSV. The file type is taken from the extension of the URL
    *path*, so query strings such as ``?download=true`` are tolerated.

    Args:
        file_url: URL of the file to convert. Surrounding whitespace is
            stripped and ``https://`` is prepended when no http(s) scheme is
            present.

    Returns:
        A ``(output_file, info_message)`` tuple: the path of the converted
        file, and a text summary containing the input file name, the target
        format and a preview of the first 10 rows.

    Raises:
        ValueError: If the URL host is not ``huggingface.co`` (or one of its
            subdomains), or the path does not end in ``.csv`` / ``.parquet``.
        requests.RequestException: If the download fails, times out or
            returns an HTTP error status.
    """
    file_url = file_url.strip()

    # Ensure the URL has a scheme; if not, add "https://"
    if not file_url.lower().startswith(("http://", "https://")):
        file_url = "https://" + file_url

    # Validate the parsed host rather than searching the raw string: a
    # substring test would also accept "evil.example/huggingface.co/x.csv"
    # or "huggingface.co.evil.example".
    parsed = urlparse(file_url)
    host = (parsed.hostname or "").lower()
    if host != "huggingface.co" and not host.endswith(".huggingface.co"):
        raise ValueError("Please provide a URL from Hugging Face datasets.")

    # Decide the conversion from the path's extension *before* downloading so
    # unsupported files are rejected without fetching them, and so a query
    # string after the file name does not hide the extension.
    input_name = parsed.path.rsplit("/", 1)[-1]
    extension = os.path.splitext(input_name)[1].lower()
    if extension not in (".csv", ".parquet"):
        raise ValueError("The URL must point to a .csv or .parquet file.")

    # Download the content; the timeout keeps a stalled server from blocking
    # this request forever (requests waits indefinitely by default).
    response = requests.get(file_url, timeout=60)
    response.raise_for_status()
    content = response.content

    # Write into a fresh temporary directory so overlapping requests never
    # overwrite each other's result; the file keeps its "output.<ext>" name.
    output_dir = tempfile.mkdtemp(prefix="hf_convert_")

    if extension == ".csv":
        # If it's a CSV, read it and convert to Parquet
        df = pd.read_csv(BytesIO(content))
        output_file = os.path.join(output_dir, "output.parquet")
        df.to_parquet(output_file, index=False)
        converted_format = "Parquet"
    else:
        # If it's a Parquet file, read it and convert to CSV
        df = pd.read_parquet(BytesIO(content))
        output_file = os.path.join(output_dir, "output.csv")
        df.to_csv(output_file, index=False)
        converted_format = "CSV"

    # Create a preview of the top 10 rows
    preview = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {input_name}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview}"
    )

    return output_file, info_message

# Text shown under the app title.
_APP_DESCRIPTION = (
    "Enter the URL of a Hugging Face dataset file (must end with .csv or .parquet). "
    "The app will automatically detect the file type, convert it to the opposite format, "
    "and display a preview of the top 10 rows."
)

# Single input: the URL of the file to convert.
_url_box = gr.Textbox(
    placeholder="e.g., huggingface.co/datasets/username/dataset/filename.csv",
    label="Hugging Face Dataset URL",
)

# Outputs, in the same order as convert_hf_dataset's return values:
# the converted file, then the text summary with the row preview.
_result_widgets = [
    gr.File(label="Converted File"),
    gr.Textbox(lines=15, label="Preview (Top 10 Rows)"),
]

demo = gr.Interface(
    title="Hugging Face CSV <-> Parquet Converter",
    description=_APP_DESCRIPTION,
    fn=convert_hf_dataset,
    inputs=_url_box,
    outputs=_result_widgets,
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()