DocSrvNyk committed on
Commit
e38d45e
·
1 Parent(s): 38d7b01

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -0
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import os
4
+ import zipfile
5
+
6
def process_csv(uploaded_file):
    """
    Encode categorical columns and impute numeric gaps in an uploaded CSV.

    Each column is handled by the first rule that matches:

    1. Text columns, and numeric columns with fewer than six distinct
       non-missing values, are replaced by integer codes. A value's code is
       its position in order of first appearance (a missing value still
       consumes a position); missing cells themselves are coded as -9999.
    2. Any other numeric column that contains missing cells has them filled
       with the column median.

    Args:
        uploaded_file: A filesystem path to the CSV, or an object whose
            ``name`` attribute holds that path (such as the file object
            supplied by a Gradio file input).

    Returns:
        The path of the archive that was written, ``"processed_files.zip"``
        in the current working directory. The archive contains
        ``modified_data.csv``, ``legend.csv`` (one row per coded column with
        its value-to-code mapping) and ``data_added_details.csv`` (one row
        per median-filled column).

    Raises:
        ValueError: If ``uploaded_file`` is ``None``.
    """
    if uploaded_file is None:
        raise ValueError("No file was uploaded.")

    missing_code = -9999

    # Accept both a plain path and a wrapper object exposing ``.name``.
    csv_path = getattr(uploaded_file, "name", uploaded_file)
    data = pd.read_csv(csv_path)

    # Column name -> {original value: code}
    legend_dict = {}

    # One [column, method, value] row per column that had gaps filled
    data_added_details = []

    for col in data.columns:
        column = data[col]
        is_numeric = pd.api.types.is_numeric_dtype(column)
        # Text may arrive as ``object`` or as a dedicated string dtype,
        # depending on the pandas version, so check for both.
        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )

        if is_text or (is_numeric and column.nunique() < 6):
            mapping = {
                value: (code if pd.notna(value) else missing_code)
                for code, value in enumerate(column.unique())
            }
            legend_dict[col] = mapping
            codes = column.map(mapping)
            # NaN never compares equal to itself, so do not rely on the
            # dictionary lookup for missing cells; set the sentinel directly.
            data[col] = codes.where(column.notna(), missing_code).astype("int64")
        elif is_numeric and column.isna().any():
            median_value = column.median()
            # Assign the result back: an in-place fillna on ``data[col]``
            # works on a temporary and is not guaranteed to update ``data``.
            data[col] = column.fillna(median_value)
            data_added_details.append([col, "Median", median_value])

    # NOTE: the archive name is fixed, so concurrent calls sharing a working
    # directory overwrite each other's output.
    zip_name = "processed_files.zip"

    legend_df = pd.DataFrame(list(legend_dict.items()), columns=['Column', 'Mapping'])
    data_added_df = pd.DataFrame(
        data_added_details, columns=['Column', 'Method', 'Value Added']
    )

    # Write the CSV text straight into the archive so no intermediate files
    # are left behind in the working directory.
    with zipfile.ZipFile(zip_name, 'w') as zipf:
        zipf.writestr("modified_data.csv", data.to_csv(index=False))
        zipf.writestr("legend.csv", legend_df.to_csv(index=False))
        zipf.writestr("data_added_details.csv", data_added_df.to_csv(index=False))

    return zip_name
54
+
55
# Gradio Interface
# The ``gr.inputs`` / ``gr.outputs`` namespaces were deprecated in Gradio 3 and
# removed in Gradio 4, so use the top-level ``gr.File`` component when the
# installed version provides it and fall back to the legacy classes otherwise.
if hasattr(gr, "File"):
    csv_input = gr.File(label="Upload CSV File")
    zip_output = gr.File(label="Download Processed Files")
else:
    csv_input = gr.inputs.File(type="file", label="Upload CSV File")
    zip_output = gr.outputs.File(label="Download Processed Files")

# Single-function UI: upload one CSV, download the resulting zip archive.
iface = gr.Interface(
    fn=process_csv,
    inputs=csv_input,
    outputs=zip_output,
    live=False,
)

iface.launch()