openfree committed on
Commit
bcf63da
·
verified ·
1 Parent(s): 5e382d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -30
app.py CHANGED
@@ -1,53 +1,111 @@
1
  import gradio as gr
2
  import pandas as pd
 
3
  from io import BytesIO
 
4
 
5
- def convert_file(input_file, conversion_type):
6
- # Check if a file was uploaded
7
- if input_file is None:
8
- raise ValueError("Please upload a file.")
 
9
 
10
- # Determine if input_file is a file-like object or a file path string.
11
- try:
12
- # Try reading from file-like object
13
- file_bytes = input_file.read()
14
- file_name = input_file.name
15
- except AttributeError:
16
- # If there's an AttributeError, treat input_file as a file path.
17
- file_name = input_file
18
- with open(file_name, "rb") as f:
19
- file_bytes = f.read()
20
 
21
- file_extension = file_name.lower().split('.')[-1]
22
- df = None
23
- output_file = None
24
- converted_format = None
25
-
26
  # Conversion: CSV to Parquet
27
  if conversion_type == "CSV to Parquet":
28
- if file_extension != "csv":
29
  raise ValueError("For CSV to Parquet conversion, please upload a CSV file.")
30
  df = pd.read_csv(BytesIO(file_bytes))
31
  output_file = "output.parquet"
32
  df.to_parquet(output_file, index=False)
33
  converted_format = "Parquet"
 
 
34
  # Conversion: Parquet to CSV
35
  elif conversion_type == "Parquet to CSV":
36
- if file_extension != "parquet":
37
  raise ValueError("For Parquet to CSV conversion, please upload a Parquet file.")
38
  df = pd.read_parquet(BytesIO(file_bytes))
39
  output_file = "output.csv"
40
  df.to_csv(output_file, index=False)
41
  converted_format = "CSV"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  else:
43
  raise ValueError("Invalid conversion type selected.")
44
 
45
- # Generate a preview of the top 10 rows
46
- preview = df.head(10).to_string(index=False)
47
  info_message = (
48
- f"Input file: {file_name}\n"
49
  f"Converted file format: {converted_format}\n\n"
50
- f"Preview (Top 10 Rows):\n{preview}"
51
  )
52
  return output_file, info_message
53
 
@@ -85,15 +143,24 @@ h1, h2 {
85
  }
86
  """
87
 
88
- with gr.Blocks(css=custom_css, title="CSV <-> Parquet Converter") as demo:
89
- gr.Markdown("# CSV <-> Parquet Converter")
90
- gr.Markdown("Upload a CSV or Parquet file and select the conversion type. The app converts the file to the opposite format and displays a preview of the top 10 rows.")
 
 
 
91
 
92
  with gr.Row():
93
  with gr.Column(scale=1):
94
  input_file = gr.File(label="Upload CSV or Parquet File")
95
  with gr.Column(scale=1):
96
- conversion_type = gr.Radio(choices=["CSV to Parquet", "Parquet to CSV"], label="Conversion Type")
 
 
 
 
 
 
97
 
98
  convert_button = gr.Button("Convert", elem_classes=["gradio-button"])
99
 
@@ -101,6 +168,10 @@ with gr.Blocks(css=custom_css, title="CSV <-> Parquet Converter") as demo:
101
  output_file = gr.File(label="Converted File")
102
  preview = gr.Textbox(label="Preview (Top 10 Rows)", lines=15)
103
 
104
- convert_button.click(fn=convert_file, inputs=[input_file, conversion_type], outputs=[output_file, preview])
 
 
 
 
105
 
106
  demo.launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import json
4
  from io import BytesIO
5
+ import requests
6
 
7
def dataset_converter(input_file, conversion_type, parquet_url):
    """Convert an uploaded CSV/Parquet file (or a Parquet URL) to another format.

    Supported values of ``conversion_type``:

    * ``"CSV to Parquet"``   -> writes ``output.parquet``
    * ``"Parquet to CSV"``   -> writes ``output.csv``
    * ``"CSV to JSONL"``     -> writes ``metadata.jsonl``; each line is
      ``{"file_name": ..., "ground_truth": "<JSON string of the whole row>"}``
    * ``"Parquet to JSONL"`` -> writes ``output.jsonl``; the source is the
      uploaded file if present, otherwise ``parquet_url``

    Args:
        input_file: ``None``, a path string, or an object exposing ``read()``
            and ``name``.
        conversion_type: One of the four labels listed above.
        parquet_url: Optional URL, used only for "Parquet to JSONL" when no
            file was uploaded. Empty or whitespace-only values count as absent.

    Returns:
        A ``(output_path, info_message)`` tuple. ``info_message`` names the
        input, the converted format, and a text preview of the first 10 rows.

    Raises:
        ValueError: If the upload is missing or has the wrong extension for
            the chosen conversion, or the conversion type is unknown.
        requests.HTTPError: If downloading ``parquet_url`` returns an error
            status.

    NOTE(review): output files use fixed names in the working directory, so
    concurrent conversions overwrite each other's results.
    """
    file_bytes, file_name, file_extension = _read_upload(input_file)

    if conversion_type == "CSV to Parquet":
        if input_file is None or file_extension != "csv":
            raise ValueError("For CSV to Parquet conversion, please upload a CSV file.")
        df = pd.read_csv(BytesIO(file_bytes))
        output_file = "output.parquet"
        df.to_parquet(output_file, index=False)
        converted_format = "Parquet"

    elif conversion_type == "Parquet to CSV":
        if input_file is None or file_extension != "parquet":
            raise ValueError("For Parquet to CSV conversion, please upload a Parquet file.")
        df = pd.read_parquet(BytesIO(file_bytes))
        output_file = "output.csv"
        df.to_csv(output_file, index=False)
        converted_format = "CSV"

    elif conversion_type == "CSV to JSONL":
        if input_file is None or file_extension != "csv":
            raise ValueError("For CSV to JSONL conversion, please upload a CSV file.")
        df = pd.read_csv(BytesIO(file_bytes), encoding="latin1")
        output_file = "metadata.jsonl"
        records = []
        # to_dict(orient="records") keeps each column's own dtype; iterrows()
        # would upcast mixed int/float rows and hand back numpy scalars that
        # json.dumps cannot serialize.
        for row in df.to_dict(orient="records"):
            data = _to_jsonable(row)
            records.append(
                {"file_name": data.get("file_name"), "ground_truth": json.dumps(data)}
            )
        _write_jsonl(output_file, records, ensure_ascii=True)
        converted_format = "JSONL"

    elif conversion_type == "Parquet to JSONL":
        url = (parquet_url or "").strip()
        if input_file is not None:
            df = pd.read_parquet(BytesIO(file_bytes))
        elif url:
            # A timeout keeps a stalled server from hanging the request forever.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            df = pd.read_parquet(BytesIO(response.content))
            file_name = "from_url.parquet"
        else:
            raise ValueError("For Parquet to JSONL conversion, please upload a file or provide a URL.")

        output_file = "output.jsonl"
        records = [_to_jsonable(row) for row in df.to_dict(orient="records")]
        _write_jsonl(output_file, records, ensure_ascii=False)
        converted_format = "JSONL"

    else:
        raise ValueError("Invalid conversion type selected.")

    preview_str = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {file_name if file_name is not None else 'N/A'}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview_str}"
    )
    return output_file, info_message


def _read_upload(input_file):
    """Return ``(bytes, name, lowercase_extension)`` for an upload, or three ``None``s."""
    if input_file is None:
        return None, None, None
    try:
        # File-like upload: read it directly.
        file_bytes = input_file.read()
        file_name = str(input_file.name)
    except AttributeError:
        # Otherwise treat the value as a filesystem path.
        file_name = str(input_file)
        with open(file_name, "rb") as handle:
            file_bytes = handle.read()
    # A name without a dot has no extension (rather than being its own extension).
    _, dot, suffix = file_name.lower().rpartition(".")
    return file_bytes, file_name, suffix if dot else ""


def _to_jsonable(value):
    """Recursively convert *value* into something ``json.dumps`` accepts."""
    if isinstance(value, bytes):
        return value.decode("utf-8", errors="replace")
    if isinstance(value, dict):
        return {key: _to_jsonable(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_jsonable(item) for item in value]
    # Missing scalars (NaN, NaT, pd.NA, None) become JSON null instead of the
    # non-standard bare NaN token.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    if hasattr(value, "isoformat"):
        # Timestamps, datetimes and dates serialize as ISO-8601 strings.
        return value.isoformat()
    if hasattr(value, "tolist"):
        # numpy arrays and numpy scalars -> plain Python lists/scalars.
        return _to_jsonable(value.tolist())
    return value


def _write_jsonl(path, records, ensure_ascii):
    """Write *records* to *path* as UTF-8 JSON Lines, replacing any existing file."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=ensure_ascii) + "\n" for record in records
        )
111
 
 
143
  }
144
  """
145
 
146
+ with gr.Blocks(css=custom_css, title="Comprehensive Dataset Converter") as demo:
147
+ gr.Markdown("# Comprehensive Dataset Converter")
148
+ gr.Markdown(
149
+ "Upload a CSV or Parquet file (or provide a Parquet file URL for Parquet to JSONL conversion) "
150
+ "and select the conversion type. The app converts the file to the desired format and displays a preview of the top 10 rows."
151
+ )
152
 
153
  with gr.Row():
154
  with gr.Column(scale=1):
155
  input_file = gr.File(label="Upload CSV or Parquet File")
156
  with gr.Column(scale=1):
157
+ conversion_type = gr.Radio(
158
+ choices=["CSV to Parquet", "Parquet to CSV", "CSV to JSONL", "Parquet to JSONL"],
159
+ label="Conversion Type"
160
+ )
161
+
162
+ # Optional URL input for Parquet to JSONL conversion
163
+ parquet_url = gr.Textbox(label="Parquet File URL (Optional)", placeholder="Enter URL if not uploading a file")
164
 
165
  convert_button = gr.Button("Convert", elem_classes=["gradio-button"])
166
 
 
168
  output_file = gr.File(label="Converted File")
169
  preview = gr.Textbox(label="Preview (Top 10 Rows)", lines=15)
170
 
171
+ convert_button.click(
172
+ fn=dataset_converter,
173
+ inputs=[input_file, conversion_type, parquet_url],
174
+ outputs=[output_file, preview]
175
+ )
176
 
177
  demo.launch()