Datasets-Convertor

Running

File size: 1,534 Bytes

d2b9031
49e25d2
 
ff86828
d2b9031
a92db70
 
49e25d2
 
 
 
a92db70
ff86828
49e25d2
 
a92db70
 
9faad6b
 
a92db70
 
9faad6b
 
a92db70
 
 
 
 
9abfad3
a92db70
 
ff86828
7773ef1
49e25d2
a92db70
49e25d2
a92db70
 
 
49e25d2
72dd3ca
 
ff86828

import os
import tempfile
from io import BytesIO

import gradio as gr
import pandas as pd
import requests

def convert_parquet_to_csv(parquet_file=None, parquet_url=None):
    """Convert a Parquet file (uploaded or fetched from a URL) to a CSV file.

    An uploaded file takes precedence over the URL when both are supplied.

    Args:
        parquet_file: The uploaded Parquet file, given either as a filesystem
            path (``str`` / ``os.PathLike``) or as an object exposing the
            path through a ``.name`` attribute. ``None`` when nothing was
            uploaded.
        parquet_url: URL of a downloadable Parquet file. ``None``, an empty
            string, or a whitespace-only string all mean "no URL given".

    Returns:
        Path of the written CSV file, named ``output.csv`` inside a freshly
        created temporary directory.

    Raises:
        ValueError: If neither a file nor a non-blank URL is provided.
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # Seconds to wait for the remote server; without a timeout a stalled
    # connection would block this call forever.
    download_timeout = 60

    # A text input left empty arrives as "" rather than None, so blank
    # strings have to be treated as "no URL".
    url = parquet_url.strip() if isinstance(parquet_url, str) else parquet_url

    if parquet_file is not None:
        # Accept a plain path as well as a file wrapper carrying the path in
        # `.name`. Paths are checked first because `pathlib.Path.name` is
        # only the basename, not a usable location.
        if isinstance(parquet_file, (str, os.PathLike)):
            source = parquet_file
        else:
            source = parquet_file.name
        df = pd.read_parquet(source)
    elif url:
        response = requests.get(url, timeout=download_timeout)
        response.raise_for_status()  # Surface HTTP errors instead of parsing an error page
        df = pd.read_parquet(BytesIO(response.content))
    else:
        raise ValueError("Either parquet_file or parquet_url must be provided")

    # Replace anything that cannot be encoded as UTF-8 (e.g. lone surrogates)
    # so that writing the CSV below cannot fail on bad text.
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].apply(
            lambda x: x.encode("utf-8", errors="replace").decode("utf-8", errors="replace")
            if isinstance(x, str) else x
        )

    # Each call gets its own directory so that concurrent requests cannot
    # overwrite one another's result, while the download keeps the familiar
    # "output.csv" name. The directory is not removed here.
    output_dir = tempfile.mkdtemp(prefix="parquet_to_csv_")
    output_file_path = os.path.join(output_dir, "output.csv")

    # Write straight to disk rather than building the whole CSV as a string.
    df.to_csv(output_file_path, index=False, encoding="utf-8")

    return output_file_path

# Gradio UI: an optional file upload and an optional URL box are passed to
# the converter, whose returned path is offered as the CSV download.
demo = gr.Interface(
    title="Parquet to CSV Converter",
    description="Convert a Parquet file to CSV format from a downloadable link or file upload",
    fn=convert_parquet_to_csv,
    inputs=[
        gr.File(label="Parquet File"),
        gr.Textbox(label="Parquet File URL"),
    ],
    outputs=[
        gr.File(label="CSV Output"),
    ],
)

# Launch the interface only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()