Haseeb-001 committed
Commit 0bf34ea · verified · 1 Parent(s): c409139

Update app.py

Files changed (1):
  app.py: +30 -23
app.py CHANGED
@@ -1,12 +1,12 @@
 import os
 import pandas as pd
 import re
-import gradio as gr
 from groq import Groq
+import gradio as gr
 from nltk.corpus import stopwords
 import nltk
 
-# Download stopwords
+# Download stopwords for text cleaning
 nltk.download('stopwords')
 STOPWORDS = set(stopwords.words('english'))
 
@@ -20,19 +20,24 @@ def missing_data_report(data):
     total_missing = missing_report.sum()
     return f"Missing Data Report:\n\n{missing_report}\n\nTotal Missing Values: {total_missing}"
 
+# Function: Auto-label Columns
+def auto_label_columns(data):
+    if not all(data.columns):
+        data.columns = [f"column_{i}" if not col else col for i, col in enumerate(data.columns)]
+    return data
+
 # Function: Clean Dataset
 def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=False):
+    # Auto-label columns if missing
+    data = auto_label_columns(data)
+
     # Fill missing values
     data.fillna(method='ffill', inplace=True)
     data.fillna(method='bfill', inplace=True)
-
-    # Auto-generate column labels if missing
-    if data.columns.isnull().any():
-        data.columns = [f"Column_{i + 1}" for i in range(data.shape[1])]
-
+
     # Remove duplicates
     data = data.drop_duplicates()
-
+
     # Normalize and clean text columns
     for col in data.select_dtypes(include=['object']).columns:
         if lowercase:
@@ -41,7 +46,7 @@ def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=F
             data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
         if remove_stopwords:
             data[col] = data[col].apply(lambda x: ' '.join([word for word in str(x).split() if word not in STOPWORDS]))
-
+
     return data
 
 # Function: Chunk Text
@@ -68,25 +73,25 @@ def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=Tru
 
     # Step 1: Clean data
     cleaned_data = clean_data(data, lowercase, remove_punctuation, remove_stopwords)
-
+
     # Step 2: Create chunks
     cleaned_data['chunks'] = cleaned_data['text_column'].apply(lambda x: chunk_text(x, max_length=chunk_size))
-
+
     # Step 3: Generate embeddings
     cleaned_data['embeddings'] = cleaned_data['chunks'].apply(
         lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
     )
-
+
     # Save cleaned data with embeddings
     output_file = 'processed_data.csv'
     cleaned_data.to_csv(output_file, index=False)
-
+
     # Display sample embeddings
     embedding_sample = cleaned_data['embeddings'].head(5)
-
+
     return missing_report, embedding_sample, output_file
 
-# Gradio Interface Function
+# Gradio UI
 def gradio_interface(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     missing_report, embedding_sample, output_file = process_dataset(
         file, chunk_size, lowercase, remove_punctuation, remove_stopwords
@@ -102,21 +107,23 @@ ui = gr.Interface(
     fn=gradio_interface,
     inputs=[
         gr.File(label="📁 Upload CSV Dataset"),
-        gr.Slider(50, 500, step=50, default=100, label="📏 Chunk Size (words)"),
-        gr.Checkbox(label="🔠 Convert Text to Lowercase", default=True),
-        gr.Checkbox(label="❌ Remove Punctuation", default=True),
-        gr.Checkbox(label="🗑️ Remove Stopwords", default=False),
+        gr.Slider(50, 500, step=50, value=100, label="🔒 Chunk Size (words)"),
+        gr.Checkbox(label="🔠 Convert Text to Lowercase", value=True),
+        gr.Checkbox(label="❌ Remove Punctuation", value=True),
+        gr.Checkbox(label="📝 Remove Stopwords", value=False),
     ],
     outputs=[
         gr.Textbox(label="📊 Missing Data Report"),
-        gr.Textbox(label="✨ Embedding Sample"),
-        gr.File(label="⬇️ Download Processed Dataset"),
+        gr.Textbox(label="🧩 Embedding Sample"),
+        gr.File(label="📥 Download Processed Dataset"),
     ],
-    title="🔍 Advanced Data Cleaning & Embedding Tool",
+    title="✨ Professional Data Cleaning & Embedding Tool",
     description=(
         "Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API. "
-        "Customize text cleaning options, chunk size, and more. Automatically adds column labels if missing."
+        "Customize text cleaning options and chunk size to suit your needs, or use the default settings. "
+        "Missing column labels will be auto-generated."
     ),
+    theme="huggingface",
     live=True,
 )
129