Spaces:

Haseeb-001
/

Smart_Data_Cleaner

Runtime error

App Files Community

Haseeb-001 committed on Jan 15

Commit

e108ee1

verified ·

1 Parent(s): b52603a

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -26

app.py CHANGED Viewed

@@ -7,11 +7,12 @@ from nltk.corpus import stopwords
 from tqdm import tqdm
 import nltk
 # Download stopwords for text cleaning
 nltk.download('stopwords')
 STOPWORDS = set(stopwords.words('english'))
-# Set Groq API Key
 GROQ_API_KEY = "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"
 client = Groq(api_key=GROQ_API_KEY)
@@ -56,9 +57,9 @@ def generate_embeddings(chunk):
     return chat_completion.choices[0].message.content
 # Main Function: Process Data
-def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     # Load data
-    data = pd.read_csv(file)
     # Generate missing data report
     missing_report = missing_data_report(data)
@@ -76,50 +77,49 @@ def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=Tru
         lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
     )
-    # Save cleaned data with embeddings
-    output_file = 'processed_data.csv'
     cleaned_data.to_csv(output_file, index=False)
-    # Display sample embeddings
-    embedding_sample = cleaned_data['embeddings'].head(5)
-    return missing_report, embedding_sample, output_file
-# Gradio UI
-def gradio_interface(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
-    missing_report, embedding_sample, output_file = process_dataset(
-        file, chunk_size, lowercase, remove_punctuation, remove_stopwords
-    )
-    return (
-        missing_report,
-        f"Sample Embeddings:\n{embedding_sample}",
-        output_file
     )
-# Gradio App
 ui = gr.Interface(
-    fn=gradio_interface,
     inputs=[
-        gr.inputs.File(label="Upload CSV Dataset"),
         gr.inputs.Slider(50, 500, step=50, default=100, label="Chunk Size (words)"),
         gr.inputs.Checkbox(label="Convert Text to Lowercase", default=True),
         gr.inputs.Checkbox(label="Remove Punctuation", default=True),
         gr.inputs.Checkbox(label="Remove Stopwords", default=False),
     ],
     outputs=[
         gr.outputs.Textbox(label="Missing Data Report"),
-        gr.outputs.Textbox(label="Embedding Sample"),
-        gr.outputs.File(label="Download Processed Dataset"),
     ],
     title="Enhanced Data Cleaning and Embedding Tool",
     description=(
-        "Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API. "
         "Customize text cleaning options and chunk size to suit your needs, or use the default settings."
     ),
     theme="huggingface",
     live=True,
 )
-# Launch App
 if __name__ == "__main__":
-    ui.launch()

 from tqdm import tqdm
 import nltk
 # Download stopwords for text cleaning
 nltk.download('stopwords')
 STOPWORDS = set(stopwords.words('english'))
+# Set Groq API Key (Consider environment variables for security)
 GROQ_API_KEY = "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"
 client = Groq(api_key=GROQ_API_KEY)
     return chat_completion.choices[0].message.content
 # Main Function: Process Data
+def process_dataset(file_path, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     # Load data
+    data = pd.read_csv(file_path)
     # Generate missing data report
     missing_report = missing_data_report(data)
         lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
     )
+    # Save cleaned data with embeddings (adjust based on Spaces file system)
+    output_file = 'processed_data.csv'  # Replace with appropriate path within Spaces
     cleaned_data.to_csv(output_file, index=False)
+    return missing_report, cleaned_data
+# Modified Gradio interface for Hugging Face Spaces
+def hf_spaces_interface(file_path, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
+    """
+    Modified interface for Hugging Face Spaces.
+    Assumes the input file is already uploaded to the Spaces environment.
+    """
+    missing_report, processed_data = process_dataset(
+        file_path, chunk_size, lowercase, remove_punctuation, remove_stopwords
     )
+    # Display results directly
+    return processed_data.to_markdown(index=False), missing_report
+# Create the Gradio interface
 ui = gr.Interface(
+    fn=hf_spaces_interface,
     inputs=[
+        gr.inputs.Textbox(label="File Path (within Spaces)", lines=1),  # Get file path from user
         gr.inputs.Slider(50, 500, step=50, default=100, label="Chunk Size (words)"),
         gr.inputs.Checkbox(label="Convert Text to Lowercase", default=True),
         gr.inputs.Checkbox(label="Remove Punctuation", default=True),
         gr.inputs.Checkbox(label="Remove Stopwords", default=False),
     ],
     outputs=[
+        gr.outputs.Markdown(label="Processed Data"),
         gr.outputs.Textbox(label="Missing Data Report"),
     ],
     title="Enhanced Data Cleaning and Embedding Tool",
     description=(
+        "Upload your dataset to Spaces and provide the file path here. "
+        "Clean, chunk, and generate embeddings using Llama LLM with Groq API. "
         "Customize text cleaning options and chunk size to suit your needs, or use the default settings."
     ),
     theme="huggingface",
     live=True,
 )
+# Launch the app (for Spaces, this might be handled differently)
 if __name__ == "__main__":
+    ui.launch(share=True)  # Share the app publicly (optional)