"""Gradio app that converts files between CSV and Parquet.

The uploaded file is converted to the other format and a text preview of the
first 10 rows is shown alongside the converted file.
"""

import os
import tempfile
from io import BytesIO

import chardet
import gradio as gr
import pandas as pd

def detect_encoding(file_bytes):
    """Return chardet's guess for the text encoding of ``file_bytes``.

    The returned value is whatever chardet stores under the ``'encoding'``
    key of its detection result; it is passed through unchanged.
    """
    detection = chardet.detect(file_bytes)
    guessed_encoding = detection['encoding']
    return guessed_encoding

# Encodings tried, in order, when the detected encoding cannot parse the CSV.
# cp1252 is listed before latin1 because latin1 maps every possible byte and
# therefore never fails to decode; anything placed after it is unreachable.
_FALLBACK_ENCODINGS = ('utf-8', 'cp1252', 'latin1')


def _read_csv_bytes(file_bytes, encoding):
    """Parse CSV bytes into a DataFrame, retrying with common encodings.

    Args:
        file_bytes: Raw content of the CSV file.
        encoding: Encoding to try first (the caller's detected encoding).

    Returns:
        A ``(df, encoding_used, error_text)`` tuple. ``df`` is ``None`` when
        every attempt failed; ``error_text`` is the message of the first
        failure, or ``None`` when the first attempt succeeded.
    """
    try:
        return pd.read_csv(BytesIO(file_bytes), encoding=encoding), encoding, None
    except Exception as first_error:
        error_text = str(first_error)

    for candidate in _FALLBACK_ENCODINGS:
        try:
            df = pd.read_csv(BytesIO(file_bytes), encoding=candidate)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate.
            continue
        return df, candidate, error_text
    return None, encoding, error_text


def convert_file(input_file, conversion_type):
    """Convert an uploaded CSV file to Parquet, or a Parquet file to CSV.

    Args:
        input_file: Either an object exposing ``.read()`` and ``.name`` or a
            filesystem path to the uploaded file. ``None`` means nothing was
            uploaded.
        conversion_type: ``"CSV to Parquet"`` or ``"Parquet to CSV"``.

    Returns:
        A ``(output_path, message)`` tuple. On success ``output_path`` is the
        path of the converted file and ``message`` describes the conversion
        and contains a preview of the first 10 rows. On failure
        ``output_path`` is ``None`` and ``message`` explains the problem.

    Raises:
        OSError: If ``input_file`` is a path that cannot be opened for reading.
    """
    if input_file is None:
        return None, "Please upload a file."

    try:
        # File-like upload: read the bytes directly.
        file_bytes = input_file.read()
        file_name = input_file.name
    except AttributeError:
        # No .read()/.name available: treat the value as a filesystem path.
        file_name = input_file
        with open(file_name, "rb") as f:
            file_bytes = f.read()

    # Take the extension from the final path component only, so dots in
    # directory names and extension-less files are not misinterpreted.
    _, dot, suffix = os.path.basename(file_name).rpartition('.')
    file_extension = suffix.lower() if dot else ""

    try:
        if conversion_type == "CSV to Parquet":
            if file_extension != "csv":
                return None, "For CSV to Parquet conversion, please upload a CSV file."

            encoding = detect_encoding(file_bytes)
            df, encoding, read_error = _read_csv_bytes(file_bytes, encoding)
            if df is None:
                return None, f"Failed to read CSV with any encoding. Error: {read_error}"

            # Write into a fresh temp directory per call: a fixed path in the
            # working directory would be overwritten by concurrent requests.
            # The directory is left in place because the caller still needs
            # the file after this function returns.
            output_file = os.path.join(
                tempfile.mkdtemp(prefix="csv_parquet_"), "output.parquet"
            )
            df.to_parquet(output_file, index=False)
            converted_format = "Parquet"

        elif conversion_type == "Parquet to CSV":
            if file_extension != "parquet":
                return None, "For Parquet to CSV conversion, please upload a Parquet file."

            df = pd.read_parquet(BytesIO(file_bytes))
            output_file = os.path.join(
                tempfile.mkdtemp(prefix="csv_parquet_"), "output.csv"
            )
            df.to_csv(output_file, index=False, encoding='utf-8')
            converted_format = "CSV"
        else:
            return None, "Invalid conversion type selected."

        # Generate a preview of the top 10 rows
        preview = df.head(10).to_string(index=False)
        info_message = (
            f"Input file: {file_name}\n"
            f"Converted file format: {converted_format}\n"
        )
        if conversion_type == "CSV to Parquet":
            info_message += f"Detected encoding: {encoding}\n"

        info_message += f"\nPreview (Top 10 Rows):\n{preview}"

        return output_file, info_message

    except Exception as e:
        # UI boundary: report any conversion failure as text instead of raising.
        return None, f"Error during conversion: {str(e)}"

# Custom CSS for a modern and sleek look.
# This string is handed to gr.Blocks(css=...) when the UI is built below.
# The `.gradio-button` rules target the class attached to the Convert button
# through its elem_classes argument.
# NOTE(review): the remaining selectors (.gradio-container, .gradio-input,
# .gradio-output) presumably match class names emitted by Gradio itself —
# confirm against the installed Gradio version.
custom_css = """
body {
    background-color: #f4f4f4;
    font-family: 'Helvetica Neue', Arial, sans-serif;
}
.gradio-container {
    max-width: 900px;
    margin: 40px auto;
    padding: 20px;
    background-color: #ffffff;
    border-radius: 12px;
    box-shadow: 0 8px 16px rgba(0,0,0,0.1);
}
h1, h2 {
    color: #333333;
}
.gradio-input, .gradio-output {
    margin-bottom: 20px;
}
.gradio-button {
    background-color: #4CAF50 !important;
    color: white !important;
    border: none !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
    border-radius: 6px !important;
    cursor: pointer;
}
.gradio-button:hover {
    background-color: #45a049 !important;
}
"""

# Build the UI. Components are declared in display order inside nested
# Blocks/Row/Column contexts, so the statement order here defines the layout.
with gr.Blocks(css=custom_css, title="CSV <-> Parquet Converter") as demo:
    # Page heading and a short description of what the app does.
    gr.Markdown("# CSV <-> Parquet Converter")
    gr.Markdown("Upload a CSV or Parquet file and select the conversion type. The app converts the file to the opposite format and displays a preview of the top 10 rows.")
    
    # Input row: the file upload in the first column, the conversion
    # direction selector in the second.
    with gr.Row():
        with gr.Column(scale=1):
            input_file = gr.File(label="Upload CSV or Parquet File")
        with gr.Column(scale=1):
            conversion_type = gr.Radio(
                choices=["CSV to Parquet", "Parquet to CSV"], 
                label="Conversion Type",
                value="CSV to Parquet"  # Set default value
            )
    
    # elem_classes attaches the "gradio-button" class styled in custom_css.
    convert_button = gr.Button("Convert", elem_classes=["gradio-button"])
    
    # Output row: the converted file next to the text preview.
    with gr.Row():
        output_file = gr.File(label="Converted File")
        preview = gr.Textbox(label="Preview (Top 10 Rows)", lines=15)
    
    # convert_file returns (output path or None, message); the two values map
    # onto output_file and preview in that order.
    convert_button.click(fn=convert_file, inputs=[input_file, conversion_type], outputs=[output_file, preview])
    
    # Static usage notes shown below the converter.
    gr.Markdown("""
    ### Notes:
    - This converter can handle various CSV encodings
    - Parquet files are always encoded in UTF-8
    - The preview shows only the first 10 rows of data
    """)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()