Haseeb-001 committed
Commit bce8bc7 · verified · 1 Parent(s): e108ee1

Update app.py

Files changed (1):
  1. app.py +25 -74
app.py CHANGED
@@ -3,27 +3,13 @@ import pandas as pd
 import re
 from groq import Groq
 import gradio as gr
-from nltk.corpus import stopwords
-from tqdm import tqdm
-import nltk
 
-
-# Download stopwords for text cleaning
-nltk.download('stopwords')
-STOPWORDS = set(stopwords.words('english'))
-
-# Set Groq API Key (Consider environment variables for security)
+# Set Groq API Key
 GROQ_API_KEY = "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"
 client = Groq(api_key=GROQ_API_KEY)
 
-# Function: Generate Missing Data Report
-def missing_data_report(data):
-    missing_report = data.isnull().sum()
-    total_missing = missing_report.sum()
-    return f"Missing Data Report:\n\n{missing_report}\n\nTotal Missing Values: {total_missing}"
-
 # Function: Clean Dataset
-def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=False):
+def clean_data(data):
    # Fill missing values
    data.fillna(method='ffill', inplace=True)
    data.fillna(method='bfill', inplace=True)
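
Two things this hunk keeps are worth flagging. First, a live-looking Groq API key stays hardcoded in the file (the removed comment even suggested environment variables). On Spaces the usual pattern is to store the key as a repository secret and read it at startup; a minimal sketch, assuming the Space defines a GROQ_API_KEY secret:

    import os
    from groq import Groq

    # Sketch: read the key from the environment instead of the source file.
    client = Groq(api_key=os.environ["GROQ_API_KEY"])

Second, the fillna(method='ffill') / fillna(method='bfill') calls in the retained context lines are deprecated as of pandas 2.1; the direct equivalents are data.ffill() and data.bfill().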
@@ -33,12 +19,7 @@ def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=False):
 
    # Normalize and clean text columns
    for col in data.select_dtypes(include=['object']).columns:
-        if lowercase:
-            data[col] = data[col].str.lower()
-        if remove_punctuation:
-            data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
-        if remove_stopwords:
-            data[col] = data[col].apply(lambda x: ' '.join([word for word in str(x).split() if word not in STOPWORDS]))
+        data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x).lower()))
 
    return data
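
This hunk folds the three configurable cleaning options (lowercasing, punctuation stripping, stopword removal) into one unconditional pass, which is why the nltk and stopword machinery disappears from the imports above. Note that [^\w\s] is more aggressive than "remove punctuation" suggests: it also deletes apostrophes and hyphens inside words. A quick illustration of the new one-liner on a sample value:

    import re

    # The replacement cleaning step from this hunk, applied to one string.
    s = "Hello, World! It's sold-out."
    print(re.sub(r'[^\w\s]', '', s.lower()))  # prints: hello world its soldout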
@@ -57,69 +38,39 @@ def generate_embeddings(chunk):
    return chat_completion.choices[0].message.content
 
 # Main Function: Process Data
-def process_dataset(file_path, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
+def process_dataset(file):
    # Load data
-    data = pd.read_csv(file_path)
-
-    # Generate missing data report
-    missing_report = missing_data_report(data)
-
+    data = pd.read_csv(file)
+
    # Step 1: Clean data
-    cleaned_data = clean_data(data, lowercase, remove_punctuation, remove_stopwords)
+    cleaned_data = clean_data(data)
 
    # Step 2: Create chunks
-    tqdm.pandas(desc="Chunking Data")
-    cleaned_data['chunks'] = cleaned_data['text_column'].progress_apply(lambda x: chunk_text(x, max_length=chunk_size))
+    cleaned_data['chunks'] = cleaned_data['text_column'].apply(chunk_text)
 
    # Step 3: Generate embeddings
-    tqdm.pandas(desc="Generating Embeddings")
-    cleaned_data['embeddings'] = cleaned_data['chunks'].progress_apply(
-        lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
-    )
-
-    # Save cleaned data with embeddings (adjust based on Spaces file system)
-    output_file = 'processed_data.csv'  # Replace with appropriate path within Spaces
-    cleaned_data.to_csv(output_file, index=False)
 
-    return missing_report, cleaned_data
+    cleaned_data['embeddings'] = cleaned_data['chunks'].apply(lambda chunks: [generate_embeddings(chunk) for chunk in chunks])
 
-# Modified Gradio interface for Hugging Face Spaces
-def hf_spaces_interface(file_path, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
-    """
-    Modified interface for Hugging Face Spaces.
-    Assumes the input file is already uploaded to the Spaces environment.
-    """
-    missing_report, processed_data = process_dataset(
-        file_path, chunk_size, lowercase, remove_punctuation, remove_stopwords
-    )
+    # Save cleaned data with embeddings
+    cleaned_data.to_csv('processed_data.csv', index=False)
+    return "Dataset cleaned, chunked, and embedded successfully! Saved as 'processed_data.csv'."
 
-    # Display results directly
-    return processed_data.to_markdown(index=False), missing_report
+# Gradio UI
+def gradio_interface(file):
+    result = process_dataset(file.name)
+    return result
 
-# Create the Gradio interface
+# Gradio App
 ui = gr.Interface(
-    fn=hf_spaces_interface,
-    inputs=[
-        gr.inputs.Textbox(label="File Path (within Spaces)", lines=1),  # Get file path from user
-        gr.inputs.Slider(50, 500, step=50, default=100, label="Chunk Size (words)"),
-        gr.inputs.Checkbox(label="Convert Text to Lowercase", default=True),
-        gr.inputs.Checkbox(label="Remove Punctuation", default=True),
-        gr.inputs.Checkbox(label="Remove Stopwords", default=False),
-    ],
-    outputs=[
-        gr.outputs.Markdown(label="Processed Data"),
-        gr.outputs.Textbox(label="Missing Data Report"),
-    ],
-    title="Enhanced Data Cleaning and Embedding Tool",
-    description=(
-        "Upload your dataset to Spaces and provide the file path here. "
-        "Clean, chunk, and generate embeddings using Llama LLM with Groq API. "
-        "Customize text cleaning options and chunk size to suit your needs, or use the default settings."
-    ),
-    theme="huggingface",
+    fn=gradio_interface,
+    inputs=gr.inputs.File(label="Upload CSV Dataset"),
+    outputs=gr.outputs.Textbox(label="Processing Result"),
+    title="Data Cleaning and Embedding Tool",
+    description="Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API. Perfect for deployment on Hugging Face.",
+    theme="compact",
    live=True,
 )
 
-# Launch the app (for Spaces, this might be handled differently)
+# Launch App
 if __name__ == "__main__":
-    ui.launch(share=True)  # Share the app publicly (optional)
+    ui.launch()
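
Both versions still route every chunk through generate_embeddings, whose body lies outside the changed lines; its surviving context line (return chat_completion.choices[0].message.content) shows it returns the text of a chat completion, so the 'embeddings' column holds LLM-generated text rather than numeric vectors. A hypothetical reconstruction of that unchanged helper, with the prompt and model name as assumptions rather than values taken from the diff:

    # Hypothetical sketch of the unchanged helper, inferred from the hunk
    # header and its context line; prompt and model name are assumptions.
    def generate_embeddings(chunk):
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": f"Represent this text for retrieval: {chunk}"}],
            model="llama-3.1-8b-instant",
        )
        return chat_completion.choices[0].message.content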
 
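chunk_text is likewise defined outside the hunks shown here. The old call passed max_length=chunk_size (default 100 words); the new call relies on the function's own default. Under those assumptions, a plausible word-window splitter would look like:

    # Hypothetical sketch of the chunker both versions call; only its name
    # and the old max_length keyword are visible in this diff.
    def chunk_text(text, max_length=100):
        words = str(text).split()
        return [' '.join(words[i:i + max_length])
                for i in range(0, len(words), max_length)]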
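The new interface still uses the Gradio 2.x namespaces gr.inputs.File and gr.outputs.Textbox plus a string theme, all of which were removed in Gradio 3/4; on a current Spaces image this constructor raises AttributeError unless an old gradio version is pinned. A sketch of the same UI against Gradio 4.x (an assumed version; the repo's pin is not shown in this diff):

    # Gradio 4.x sketch: components moved to the top level, string themes
    # were dropped, and gr.File passes the function a filepath string, so
    # process_dataset can be wired in directly without the .name wrapper.
    ui = gr.Interface(
        fn=process_dataset,
        inputs=gr.File(label="Upload CSV Dataset"),
        outputs=gr.Textbox(label="Processing Result"),
        title="Data Cleaning and Embedding Tool",
    )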
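One last failure mode worth noting: process_dataset indexes cleaned_data['text_column'], so the uploaded CSV must contain a column literally named text_column or the app fails mid-run with a KeyError. A hypothetical guard, not part of this commit:

    # Hypothetical guard: fail fast with a readable message instead of a
    # KeyError from the later apply() call on 'text_column'.
    if 'text_column' not in data.columns:
        raise ValueError("Expected a 'text_column' column in the uploaded CSV.")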