File size: 5,170 Bytes
d2b9031 49e25d2 fa41b98 ff86828 d2b9031 fa41b98 6a1564b fa41b98 4f2568a fa41b98 4f2568a 6a1564b fa41b98 7773ef1 31c7995 fa41b98 31c7995 fa41b98 72dd3ca fa41b98 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import os
import tempfile
from io import BytesIO

import chardet
import gradio as gr
import pandas as pd
def detect_encoding(file_bytes):
    """Guess the text encoding of raw file contents.

    Args:
        file_bytes: The file's contents as bytes.

    Returns:
        The value stored under the ``'encoding'`` key of chardet's
        detection result.
    """
    # chardet does the actual sniffing; only the encoding name is needed here.
    return chardet.detect(file_bytes)['encoding']
def _read_csv_with_fallback(file_bytes, detected_encoding):
    """Parse CSV bytes with the detected encoding, then with common fallbacks.

    Args:
        file_bytes: Raw CSV contents.
        detected_encoding: Encoding to try first (may be ``None``).

    Returns:
        A ``(df, encoding, error)`` tuple. On success ``df`` is the parsed
        DataFrame, ``encoding`` is the encoding that worked and ``error`` is
        ``None``. If every attempt fails, ``df`` is ``None`` and ``error`` is
        the exception raised by the attempt with ``detected_encoding``.
    """
    try:
        df = pd.read_csv(BytesIO(file_bytes), encoding=detected_encoding)
        return df, detected_encoding, None
    except Exception as exc:
        # Keep the first failure under its own name: the ``as`` target is
        # unbound as soon as the handler exits.
        first_error = exc

    # Same candidate list and order as before. Best effort: any failure just
    # moves on to the next candidate, but KeyboardInterrupt/SystemExit are no
    # longer swallowed the way a bare ``except:`` would.
    for candidate in ('utf-8', 'latin1', 'iso-8859-1', 'cp1252'):
        try:
            df = pd.read_csv(BytesIO(file_bytes), encoding=candidate)
            return df, candidate, None
        except Exception:
            continue

    return None, detected_encoding, first_error


def _new_output_path(file_name):
    """Return a path for *file_name* inside a freshly created temp directory."""
    # A private directory per conversion keeps the download name stable while
    # preventing one conversion from overwriting another's output.
    return os.path.join(tempfile.mkdtemp(prefix="converted_"), file_name)


def convert_file(input_file, conversion_type):
    """Convert an uploaded CSV file to Parquet, or a Parquet file to CSV.

    Args:
        input_file: Either an object exposing ``read()`` and ``name``, or a
            path that can be passed to ``open``. ``None`` means nothing was
            uploaded.
        conversion_type: ``"CSV to Parquet"`` or ``"Parquet to CSV"``.

    Returns:
        A ``(output_path, message)`` tuple. On success ``output_path`` is the
        path of the converted file and ``message`` summarises the conversion
        and includes a preview of the first 10 rows. On failure
        ``output_path`` is ``None`` and ``message`` explains what went wrong.
    """
    if input_file is None:
        return None, "Please upload a file."

    try:
        # File-like upload object.
        file_bytes = input_file.read()
        file_name = input_file.name
    except AttributeError:
        # Not file-like: treat the value as a path on disk.
        file_name = input_file
        try:
            with open(file_name, "rb") as f:
                file_bytes = f.read()
        except OSError as exc:
            # Report unreadable paths the same way as every other failure.
            return None, f"Error reading file: {exc}"

    # splitext only reports a real suffix, so a dot-less name such as
    # "parquet" is no longer mistaken for a file with that extension.
    file_extension = os.path.splitext(str(file_name))[1].lower()

    encoding = None
    try:
        if conversion_type == "CSV to Parquet":
            if file_extension != ".csv":
                return None, "For CSV to Parquet conversion, please upload a CSV file."

            df, encoding, read_error = _read_csv_with_fallback(
                file_bytes, detect_encoding(file_bytes)
            )
            if df is None:
                return None, f"Failed to read CSV with any encoding. Error: {read_error}"

            output_file = _new_output_path("output.parquet")
            df.to_parquet(output_file, index=False)
            converted_format = "Parquet"

        elif conversion_type == "Parquet to CSV":
            if file_extension != ".parquet":
                return None, "For Parquet to CSV conversion, please upload a Parquet file."

            df = pd.read_parquet(BytesIO(file_bytes))
            output_file = _new_output_path("output.csv")
            df.to_csv(output_file, index=False, encoding='utf-8')
            converted_format = "CSV"

        else:
            return None, "Invalid conversion type selected."

        # Generate a preview of the top 10 rows.
        preview = df.head(10).to_string(index=False)
        info_message = (
            f"Input file: {file_name}\n"
            f"Converted file format: {converted_format}\n"
        )
        if conversion_type == "CSV to Parquet":
            info_message += f"Detected encoding: {encoding}\n"
        info_message += f"\nPreview (Top 10 Rows):\n{preview}"
        return output_file, info_message

    except Exception as e:
        # Top-level boundary for the UI: surface the failure as a message
        # instead of letting the request crash.
        return None, f"Error during conversion: {str(e)}"
# Custom CSS for a modern and sleek look.
# This stylesheet is handed to gr.Blocks(css=...) when the interface is built.
# The `.gradio-button` rules target the class the Convert button receives via
# elem_classes=["gradio-button"].
# NOTE(review): `.gradio-input` / `.gradio-output` are not assigned through
# elem_classes anywhere in the visible code - confirm they match classes that
# Gradio itself emits, otherwise these rules have no effect.
custom_css = """
body {
background-color: #f4f4f4;
font-family: 'Helvetica Neue', Arial, sans-serif;
}
.gradio-container {
max-width: 900px;
margin: 40px auto;
padding: 20px;
background-color: #ffffff;
border-radius: 12px;
box-shadow: 0 8px 16px rgba(0,0,0,0.1);
}
h1, h2 {
color: #333333;
}
.gradio-input, .gradio-output {
margin-bottom: 20px;
}
.gradio-button {
background-color: #4CAF50 !important;
color: white !important;
border: none !important;
padding: 10px 20px !important;
font-size: 16px !important;
border-radius: 6px !important;
cursor: pointer;
}
.gradio-button:hover {
background-color: #45a049 !important;
}
"""
# Build the Gradio interface. Components are created in the order they should
# appear on the page: heading, upload + conversion selector, Convert button,
# result widgets, and finally the usage notes.
with gr.Blocks(css=custom_css, title="CSV <-> Parquet Converter") as demo:
    gr.Markdown("# CSV <-> Parquet Converter")
    gr.Markdown(
        "Upload a CSV or Parquet file and select the conversion type. "
        "The app converts the file to the opposite format and displays a preview "
        "of the top 10 rows."
    )

    # Inputs: the uploaded file on the left, the conversion direction on the right.
    with gr.Row():
        with gr.Column(scale=1):
            input_file = gr.File(label="Upload CSV or Parquet File")
        with gr.Column(scale=1):
            conversion_type = gr.Radio(
                label="Conversion Type",
                choices=["CSV to Parquet", "Parquet to CSV"],
                value="CSV to Parquet",  # default selection
            )

    convert_button = gr.Button("Convert", elem_classes=["gradio-button"])

    # Outputs: the downloadable converted file and a text preview.
    with gr.Row():
        output_file = gr.File(label="Converted File")
        preview = gr.Textbox(label="Preview (Top 10 Rows)", lines=15)

    # Wire the button to the converter: (file, type) in, (file, preview text) out.
    convert_button.click(
        fn=convert_file,
        inputs=[input_file, conversion_type],
        outputs=[output_file, preview],
    )

    gr.Markdown("""
### Notes:
- This converter can handle various CSV encodings
- Parquet files are always encoded in UTF-8
- The preview shows only the first 10 rows of data
""")
# Start the web app only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()