Haseeb-001 committed on
Commit c409139 · verified · 1 Parent(s): ac3f2e1

Update app.py

Files changed (1)
  1. app.py +44 -46
app.py CHANGED
@@ -1,25 +1,14 @@
 import os
 import pandas as pd
 import re
-from groq import Groq
 import gradio as gr
+from groq import Groq
 from nltk.corpus import stopwords
-# Removed tqdm import due to compatibility issues
-
-# Set stopwords for text cleaning (you can hard-code them if you face download issues)
-STOPWORDS = set([
-    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
-    "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
-    "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these",
-    "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
-    "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by",
-    "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below",
-    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here",
-    "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
-    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should",
-    "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven", "isn",
-    "ma", "mightn", "mustn", "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn"
-])
+import nltk
+
+# Download stopwords
+nltk.download('stopwords')
+STOPWORDS = set(stopwords.words('english'))

 # Set Groq API Key
 GROQ_API_KEY = "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"
@@ -34,27 +23,31 @@ def missing_data_report(data):
 # Function: Clean Dataset
 def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     # Fill missing values
-    data.fillna(method="ffill", inplace=True)
-    data.fillna(method="bfill", inplace=True)
+    data.fillna(method='ffill', inplace=True)
+    data.fillna(method='bfill', inplace=True)
+
+    # Auto-generate column labels if missing
+    if data.columns.isnull().any():
+        data.columns = [f"Column_{i + 1}" for i in range(data.shape[1])]

     # Remove duplicates
     data = data.drop_duplicates()

     # Normalize and clean text columns
-    for col in data.select_dtypes(include=["object"]).columns:
+    for col in data.select_dtypes(include=['object']).columns:
         if lowercase:
             data[col] = data[col].str.lower()
         if remove_punctuation:
-            data[col] = data[col].apply(lambda x: re.sub(r"[^\w\s]", "", str(x)))
+            data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
         if remove_stopwords:
-            data[col] = data[col].apply(lambda x: " ".join([word for word in str(x).split() if word not in STOPWORDS]))
+            data[col] = data[col].apply(lambda x: ' '.join([word for word in str(x).split() if word not in STOPWORDS]))

     return data

 # Function: Chunk Text
 def chunk_text(text, max_length=100):
     words = text.split()
-    return [" ".join(words[i : i + max_length]) for i in range(0, len(words), max_length)]
+    return [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]

 # Function: Generate Embeddings
 def generate_embeddings(chunk):
@@ -68,7 +61,7 @@ def generate_embeddings(chunk):
 # Main Function: Process Data
 def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     # Load data
-    data = pd.read_csv(file.name)
+    data = pd.read_csv(file)

     # Generate missing data report
     missing_report = missing_data_report(data)
@@ -76,50 +69,55 @@ def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=Tru
     # Step 1: Clean data
     cleaned_data = clean_data(data, lowercase, remove_punctuation, remove_stopwords)

-    # Step 2: Create chunks (removed tqdm)
-    if "text_column" in cleaned_data.columns:
-        cleaned_data["chunks"] = cleaned_data["text_column"].apply(lambda x: chunk_text(x, max_length=chunk_size))
-    else:
-        return "Error: 'text_column' not found in the dataset.", None, None
+    # Step 2: Create chunks
+    cleaned_data['chunks'] = cleaned_data['text_column'].apply(lambda x: chunk_text(x, max_length=chunk_size))

-    # Step 3: Generate embeddings (removed tqdm)
-    cleaned_data["embeddings"] = cleaned_data["chunks"].apply(
+    # Step 3: Generate embeddings
+    cleaned_data['embeddings'] = cleaned_data['chunks'].apply(
         lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
     )

     # Save cleaned data with embeddings
-    output_file = "processed_data.csv"
+    output_file = 'processed_data.csv'
     cleaned_data.to_csv(output_file, index=False)

     # Display sample embeddings
-    embedding_sample = cleaned_data["embeddings"].head(5).to_string()
+    embedding_sample = cleaned_data['embeddings'].head(5)

     return missing_report, embedding_sample, output_file

-# Gradio UI
-def gradio_interface(file, chunk_size, lowercase, remove_punctuation, remove_stopwords):
+# Gradio Interface Function
+def gradio_interface(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     missing_report, embedding_sample, output_file = process_dataset(
         file, chunk_size, lowercase, remove_punctuation, remove_stopwords
     )
-    return missing_report, embedding_sample, output_file
+    return (
+        missing_report,
+        f"Sample Embeddings:\n{embedding_sample}",
+        output_file
+    )

 # Gradio App
 ui = gr.Interface(
     fn=gradio_interface,
     inputs=[
-        gr.File(label="Upload CSV Dataset"),
-        gr.Slider(50, 500, step=50, value=100, label="Chunk Size (words)"),
-        gr.Checkbox(label="Convert Text to Lowercase", value=True),
-        gr.Checkbox(label="Remove Punctuation", value=True),
-        gr.Checkbox(label="Remove Stopwords", value=False),
+        gr.File(label="📁 Upload CSV Dataset"),
+        gr.Slider(50, 500, step=50, default=100, label="📏 Chunk Size (words)"),
+        gr.Checkbox(label="🔠 Convert Text to Lowercase", default=True),
+        gr.Checkbox(label="❌ Remove Punctuation", default=True),
+        gr.Checkbox(label="🗑️ Remove Stopwords", default=False),
     ],
     outputs=[
-        gr.Text(label="Missing Data Report"),
-        gr.Text(label="Embedding Sample"),
-        gr.File(label="Download Processed Dataset"),
+        gr.Textbox(label="📊 Missing Data Report"),
+        gr.Textbox(label="✨ Embedding Sample"),
+        gr.File(label="⬇️ Download Processed Dataset"),
     ],
-    title="Enhanced Data Cleaning and Embedding Tool",
-    description="Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API.",
+    title="🔍 Advanced Data Cleaning & Embedding Tool",
+    description=(
+        "Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API. "
+        "Customize text cleaning options, chunk size, and more. Automatically adds column labels if missing."
+    ),
+    live=True,
 )

 # Launch App
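
The updated file builds STOPWORDS from the NLTK corpus at import time instead of the hard-coded list. A minimal, self-contained sketch of what that setup does; it assumes the runtime can reach the NLTK download server at startup, and the `remove_stopword_tokens` helper is illustrative only (the app applies the same expression inline in `clean_data`):

```python
import nltk
from nltk.corpus import stopwords

# Same module-level setup as the updated app.py; stopwords.words('english')
# raises a LookupError if the corpus has not been downloaded first.
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words('english'))

def remove_stopword_tokens(text):
    # The filtering clean_data() applies to each object column
    return ' '.join(word for word in str(text).split() if word not in STOPWORDS)

print(remove_stopword_tokens("this is a small example of the cleaning step"))
# -> small example cleaning step
```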
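`clean_data` still fills gaps with `fillna(method='ffill')` followed by `fillna(method='bfill')`. On pandas 2.1 and later the `method=` keyword emits a FutureWarning; the diff does not pin a pandas version, so assuming a recent release, the same forward-then-backward fill can be written as:

```python
import pandas as pd

def fill_missing(data: pd.DataFrame) -> pd.DataFrame:
    # Forward-fill from earlier rows, then back-fill any leading NaNs,
    # matching the ffill/bfill order used in clean_data().
    return data.ffill().bfill()
```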
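For reference, the chunking step groups whitespace-separated words into fixed-size windows; a quick check of `chunk_text` with a small `max_length` shows the behaviour the embedding loop relies on:

```python
def chunk_text(text, max_length=100):
    words = text.split()
    return [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]

print(chunk_text("one two three four five", max_length=2))
# -> ['one two', 'three four', 'five']
```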
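The rewritten inputs pass `default=` to `gr.Slider` and `gr.Checkbox`. That keyword belongs to the Gradio 2.x API; Gradio 3.x/4.x components take `value=` instead (which is what the removed lines used), so whether the new calls run depends on the gradio version the Space pins. A sketch of the same controls under the assumption of a Gradio 3+ runtime:

```python
import gradio as gr

# Same controls as the updated ui definition, written with the `value=`
# keyword expected by Gradio 3.x/4.x components.
inputs = [
    gr.File(label="Upload CSV Dataset"),
    gr.Slider(50, 500, step=50, value=100, label="Chunk Size (words)"),
    gr.Checkbox(label="Convert Text to Lowercase", value=True),
    gr.Checkbox(label="Remove Punctuation", value=True),
    gr.Checkbox(label="Remove Stopwords", value=False),
]
```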