Haseeb-001 committed on
Commit ac98ac9 · verified · 1 Parent(s): 0bf34ea

Update app.py

Files changed (1):
  1. app.py +162 -130
app.py CHANGED
@@ -1,132 +1,164 @@
- import os
  import pandas as pd
- import re
- from groq import Groq
- import gradio as gr
- from nltk.corpus import stopwords
- import nltk
-
- # Download stopwords for text cleaning
- nltk.download('stopwords')
- STOPWORDS = set(stopwords.words('english'))
-
- # Set Groq API Key
- GROQ_API_KEY = "gsk_********"  # key redacted: a live secret was hard-coded in this revision
- client = Groq(api_key=GROQ_API_KEY)
-
- # Function: Generate Missing Data Report
- def missing_data_report(data):
-     missing_report = data.isnull().sum()
-     total_missing = missing_report.sum()
-     return f"Missing Data Report:\n\n{missing_report}\n\nTotal Missing Values: {total_missing}"
-
- # Function: Auto-label Columns
- def auto_label_columns(data):
-     if not all(data.columns):
-         data.columns = [f"column_{i}" if not col else col for i, col in enumerate(data.columns)]
-     return data
-
- # Function: Clean Dataset
- def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=False):
-     # Auto-label columns if missing
-     data = auto_label_columns(data)
-
-     # Fill missing values
-     data.fillna(method='ffill', inplace=True)
-     data.fillna(method='bfill', inplace=True)
-
-     # Remove duplicates
-     data = data.drop_duplicates()
-
-     # Normalize and clean text columns
-     for col in data.select_dtypes(include=['object']).columns:
-         if lowercase:
-             data[col] = data[col].str.lower()
-         if remove_punctuation:
-             data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
-         if remove_stopwords:
-             data[col] = data[col].apply(lambda x: ' '.join([word for word in str(x).split() if word not in STOPWORDS]))
-
-     return data
-
- # Function: Chunk Text
- def chunk_text(text, max_length=100):
-     words = text.split()
-     return [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
-
- # Function: Generate Embeddings
- def generate_embeddings(chunk):
-     chat_completion = client.chat.completions.create(
-         messages=[{"role": "user", "content": chunk}],
-         model="llama3-8b-8192",
-         stream=False,
-     )
-     return chat_completion.choices[0].message.content
-
- # Main Function: Process Data
- def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
-     # Load data
-     data = pd.read_csv(file)
-
-     # Generate missing data report
-     missing_report = missing_data_report(data)
-
-     # Step 1: Clean data
-     cleaned_data = clean_data(data, lowercase, remove_punctuation, remove_stopwords)
-
-     # Step 2: Create chunks
-     cleaned_data['chunks'] = cleaned_data['text_column'].apply(lambda x: chunk_text(x, max_length=chunk_size))
-
-     # Step 3: Generate embeddings
-     cleaned_data['embeddings'] = cleaned_data['chunks'].apply(
-         lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
-     )
-
-     # Save cleaned data with embeddings
-     output_file = 'processed_data.csv'
-     cleaned_data.to_csv(output_file, index=False)
-
-     # Display sample embeddings
-     embedding_sample = cleaned_data['embeddings'].head(5)
-
-     return missing_report, embedding_sample, output_file
-
- # Gradio UI
- def gradio_interface(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
-     missing_report, embedding_sample, output_file = process_dataset(
-         file, chunk_size, lowercase, remove_punctuation, remove_stopwords
-     )
-     return (
-         missing_report,
-         f"Sample Embeddings:\n{embedding_sample}",
-         output_file
-     )
-
- # Gradio App
- ui = gr.Interface(
-     fn=gradio_interface,
-     inputs=[
-         gr.File(label="📁 Upload CSV Dataset"),
-         gr.Slider(50, 500, step=50, value=100, label="🔒 Chunk Size (words)"),
-         gr.Checkbox(label="🔠 Convert Text to Lowercase", value=True),
-         gr.Checkbox(label="❌ Remove Punctuation", value=True),
-         gr.Checkbox(label="📝 Remove Stopwords", value=False),
-     ],
-     outputs=[
-         gr.Textbox(label="📊 Missing Data Report"),
-         gr.Textbox(label="🧩 Embedding Sample"),
-         gr.File(label="📥 Download Processed Dataset"),
-     ],
-     title="✨ Professional Data Cleaning & Embedding Tool",
-     description=(
-         "Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API. "
-         "Customize text cleaning options and chunk size to suit your needs, or use the default settings. "
-         "Missing column labels will be auto-generated."
-     ),
-     theme="huggingface",
-     live=True,
- )
-
- # Launch App
  if __name__ == "__main__":
-     ui.launch()
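A note on the removed version: generate_embeddings above calls Groq's chat-completions endpoint, so the "embeddings" column held free-form model text rather than numeric vectors. A minimal sketch of producing actual vector embeddings, using sentence-transformers as a stand-in (my assumption; neither the library nor the model name appears in this repo):

    # Sketch only, not part of either revision of app.py.
    # "all-MiniLM-L6-v2" is a common default model, chosen here as an illustration.
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")

    def generate_embeddings(chunk):
        # encode() returns a numpy vector; tolist() keeps it CSV-serializable
        return model.encode(chunk).tolist()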
 
 
  import pandas as pd
+ import numpy as np
+ import streamlit as st
+ import os
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+
+ try:
+     import tabula
+     from tabula import read_pdf
+ except ImportError:  # tabula-py is optional; PDF uploads are disabled without it
+     read_pdf = None
+
+ # ----------- File Upload Handler ----------- #
+ def file_upload(file):
+     file_ext = os.path.splitext(file.name)[1].lower()
+     try:
+         if file_ext == '.csv':
+             df = pd.read_csv(file)
+         elif file_ext in ['.xls', '.xlsx']:
+             df = pd.read_excel(file)
+         elif file_ext == '.json':
+             df = pd.read_json(file)
+         elif file_ext == '.pdf' and read_pdf:
+             df = read_pdf(file, pages='all', multiple_tables=False)[0]
+         else:
+             st.error("❌ Unsupported file type or missing dependencies for PDF.")
+             return None
+         return df
+     except Exception as e:
+         st.error(f"⚠️ Error loading file: {e}")
+         return None
+
+ # ----------- Cleaning Functions ----------- #
+ def remove_empty_rows(df):
+     st.info("🔍 Null values before cleaning:")
+     st.write(df.isnull().sum())
+     df_cleaned = df.dropna()
+     st.success("✅ Null values removed.")
+     return df_cleaned
+
+ def replace_nulls(df, value):
+     st.info("🔍 Null values before replacement:")
+     st.write(df.isnull().sum())
+     df_filled = df.fillna(value)
+     st.success("✅ Null values replaced.")
+     return df_filled
+
+ def remove_noise(df):
+     noise_words = {'the', 'is', 'an', 'a', 'in', 'of', 'to'}
+     def clean_text(val):
+         if isinstance(val, str):
+             return ' '.join(word for word in val.split() if word.lower() not in noise_words)
+         return val
+     df_cleaned = df.applymap(clean_text)  # note: DataFrame.applymap is deprecated in pandas 2.1+; DataFrame.map is the newer spelling
+     st.success("✅ Noise words removed.")
+     return df_cleaned
+
+ def remove_duplicates(df):
+     df_deduped = df.drop_duplicates()
+     st.success("✅ Duplicate rows removed.")
+     return df_deduped
+
+ def convert_column_dtype(df, column, dtype):
+     try:
+         df[column] = df[column].astype(dtype)
+         st.success(f"✅ Converted '{column}' to {dtype}")
+     except Exception as e:
+         st.error(f"⚠️ Conversion error: {e}")
+     return df
+
+ def detect_outliers(df, column):
+     if column in df.select_dtypes(include=['float', 'int']).columns:
+         Q1 = df[column].quantile(0.25)
+         Q3 = df[column].quantile(0.75)
+         IQR = Q3 - Q1
+         lower = Q1 - 1.5 * IQR
+         upper = Q3 + 1.5 * IQR
+         outliers = df[(df[column] < lower) | (df[column] > upper)]
+         st.write(f"🔍 Found {len(outliers)} outliers in column '{column}'")
+         return outliers
+     else:
+         st.warning("⚠️ Column must be numeric to detect outliers.")
+         return pd.DataFrame()
+
+ def plot_distributions(df):
+     st.subheader("📊 Data Distributions")
+     numeric_cols = df.select_dtypes(include=['float', 'int']).columns
+     for col in numeric_cols:
+         fig, ax = plt.subplots()
+         sns.histplot(df[col].dropna(), kde=True, ax=ax)
+         ax.set_title(f"Distribution of {col}")
+         st.pyplot(fig)
+
+ def plot_missing_data(df):
+     st.subheader("📉 Missing Data Heatmap")
+     fig, ax = plt.subplots()
+     sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)  # draw on the axes just created, not the implicit current figure
+     st.pyplot(fig)
+
+ def main():
+     st.set_page_config(page_title="🧹 Smart Dataset Cleaner", layout="wide")
+     st.title("🧹 Smart Dataset Cleaner")
+     st.caption("✨ Clean, analyze, and preprocess your dataset with ease")
+
+     uploaded_file = st.file_uploader("📂 Upload your dataset", type=["csv", "xlsx", "xls", "json", "pdf"])
+     if uploaded_file:
+         df = file_upload(uploaded_file)
+         if df is not None:
+             st.subheader("📋 Original Dataset Preview")
+             st.dataframe(df.head())
+
+             st.markdown("## 🧰 Data Cleaning Tools")
+             with st.expander("➕ Replace Null Values"):
+                 fill_value = st.text_input("Enter value to replace nulls with:")
+                 if st.button("Replace Nulls"):
+                     df = replace_nulls(df, fill_value)
+                     st.dataframe(df)
+
+             if st.button("🧼 Remove Empty Rows"):
+                 df = remove_empty_rows(df)
+                 st.dataframe(df)
+
+             if st.button("🧹 Remove Duplicate Rows"):
+                 df = remove_duplicates(df)
+                 st.dataframe(df)
+
+             if st.button("📉 Remove Noise Words from Text"):
+                 df = remove_noise(df)
+                 st.dataframe(df)
+
+             with st.expander("🔁 Convert Column DataType"):
+                 selected_col = st.selectbox("Select column", df.columns)
+                 dtype = st.selectbox("Select target type", ["int", "float", "str", "bool"])
+                 if st.button("Convert"):
+                     df = convert_column_dtype(df, selected_col, dtype)
+                     st.dataframe(df)
+
+             st.markdown("## 📊 Data Visualizations")
+             if st.checkbox("📈 Show Summary Stats"):
+                 st.write(df.describe(include='all'))
+
+             if st.checkbox("📌 Plot Column Distributions"):
+                 plot_distributions(df)
+
+             if st.checkbox("📍 Show Missing Data Heatmap"):
+                 plot_missing_data(df)
+
+             st.markdown("## 🚨 Outlier Detection")
+             outlier_col = st.selectbox("Select numeric column", df.select_dtypes(include=['float', 'int']).columns)
+             if st.button("Detect Outliers"):
+                 outliers = detect_outliers(df, outlier_col)
+                 if not outliers.empty:
+                     st.write(outliers)
+
+             st.markdown("## 💾 Download Cleaned Dataset")
+             file_name = st.text_input("Filename:", "cleaned_dataset.csv")
+             # offer the download directly: nesting st.download_button inside
+             # st.button hides it again on the rerun the outer click triggers
+             st.download_button("📄 Download CSV", df.to_csv(index=False), file_name, mime="text/csv")
+     else:
+         st.warning("⚠️ Please upload a supported file to begin.")
+
  if __name__ == "__main__":
+     main()
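One behavior of the new Streamlit version worth knowing: the script reruns from the top on every widget interaction, so a DataFrame cleaned by one button click is rebuilt from the uploaded file on the next click and the earlier edit is lost. A minimal sketch of keeping the working copy across reruns with st.session_state (the key names are illustrative, not part of this commit):

    # Sketch only: persist the parsed DataFrame in session state so that
    # successive cleaning clicks accumulate instead of resetting.
    import pandas as pd
    import streamlit as st

    uploaded = st.file_uploader("Upload CSV", type=["csv"])
    if uploaded is not None:
        # parse once per uploaded file; later reruns reuse the stored copy
        if st.session_state.get("source_name") != uploaded.name:
            st.session_state["source_name"] = uploaded.name
            st.session_state["df"] = pd.read_csv(uploaded)

        if st.button("Remove Duplicate Rows"):
            st.session_state["df"] = st.session_state["df"].drop_duplicates()

        st.dataframe(st.session_state["df"].head())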