Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -7,11 +7,12 @@ from nltk.corpus import stopwords
|
|
7 |
from tqdm import tqdm
|
8 |
import nltk
|
9 |
|
|
|
10 |
# Download stopwords for text cleaning
|
11 |
nltk.download('stopwords')
|
12 |
STOPWORDS = set(stopwords.words('english'))
|
13 |
|
14 |
-
# Set Groq API Key: read it from the environment (for example a Hugging Face
# Spaces secret) instead of committing the secret to the repository. Any key
# that was ever committed in plain text should be revoked and regenerated.
import os

GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)
|
17 |
|
@@ -56,9 +57,9 @@ def generate_embeddings(chunk):
|
|
56 |
return chat_completion.choices[0].message.content
|
57 |
|
58 |
# Main Function: Process Data
|
59 |
-
def process_dataset(
|
60 |
# Load data
|
61 |
-
data = pd.read_csv(
|
62 |
|
63 |
# Generate missing data report
|
64 |
missing_report = missing_data_report(data)
|
@@ -76,50 +77,49 @@ def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=Tru
|
|
76 |
lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
|
77 |
)
|
78 |
|
79 |
-
# Save cleaned data with embeddings
|
80 |
-
output_file = 'processed_data.csv'
|
81 |
cleaned_data.to_csv(output_file, index=False)
|
82 |
|
83 |
-
|
84 |
-
embedding_sample = cleaned_data['embeddings'].head(5)
|
85 |
-
|
86 |
-
return missing_report, embedding_sample, output_file
|
87 |
|
88 |
-
# Gradio
|
89 |
-
def
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
output_file
|
97 |
)
|
98 |
|
99 |
-
#
|
|
|
|
|
|
|
100 |
ui = gr.Interface(
|
101 |
-
fn=
|
102 |
inputs=[
|
103 |
-
gr.inputs.
|
104 |
gr.inputs.Slider(50, 500, step=50, default=100, label="Chunk Size (words)"),
|
105 |
gr.inputs.Checkbox(label="Convert Text to Lowercase", default=True),
|
106 |
gr.inputs.Checkbox(label="Remove Punctuation", default=True),
|
107 |
gr.inputs.Checkbox(label="Remove Stopwords", default=False),
|
108 |
],
|
109 |
outputs=[
|
|
|
110 |
gr.outputs.Textbox(label="Missing Data Report"),
|
111 |
-
gr.outputs.Textbox(label="Embedding Sample"),
|
112 |
-
gr.outputs.File(label="Download Processed Dataset"),
|
113 |
],
|
114 |
title="Enhanced Data Cleaning and Embedding Tool",
|
115 |
description=(
|
116 |
-
"Upload your dataset to
|
|
|
117 |
"Customize text cleaning options and chunk size to suit your needs, or use the default settings."
|
118 |
),
|
119 |
theme="huggingface",
|
120 |
live=True,
|
121 |
)
|
122 |
|
123 |
-
# Launch
|
124 |
if __name__ == "__main__":
|
125 |
-
ui.launch()
|
|
|
7 |
from tqdm import tqdm
|
8 |
import nltk
|
9 |
|
10 |
+
|
11 |
# Download stopwords for text cleaning
|
12 |
nltk.download('stopwords')
|
13 |
STOPWORDS = set(stopwords.words('english'))
|
14 |
|
15 |
+
# Set Groq API Key: read it from the environment (for example a Hugging Face
# Spaces secret) instead of committing the secret to the repository. Any key
# that was ever committed in plain text should be revoked and regenerated.
import os

GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)
|
18 |
|
|
|
57 |
return chat_completion.choices[0].message.content
|
58 |
|
59 |
# Main Function: Process Data
|
60 |
+
def process_dataset(file_path, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
|
61 |
# Load data
|
62 |
+
data = pd.read_csv(file_path)
|
63 |
|
64 |
# Generate missing data report
|
65 |
missing_report = missing_data_report(data)
|
|
|
77 |
lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
|
78 |
)
|
79 |
|
80 |
+
# Save cleaned data with embeddings (adjust based on Spaces file system)
|
81 |
+
output_file = 'processed_data.csv' # Replace with appropriate path within Spaces
|
82 |
cleaned_data.to_csv(output_file, index=False)
|
83 |
|
84 |
+
return missing_report, cleaned_data
|
|
|
|
|
|
|
85 |
|
86 |
+
# Modified Gradio interface for Hugging Face Spaces
|
87 |
+
def hf_spaces_interface(file_path, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
    """Run the cleaning/embedding pipeline on a CSV stored in the Space.

    Args:
        file_path: Path of the CSV file inside the Spaces file system.
        chunk_size: Chunk size in words, forwarded to ``process_dataset``.
        lowercase: Forwarded to ``process_dataset``.
        remove_punctuation: Forwarded to ``process_dataset``.
        remove_stopwords: Forwarded to ``process_dataset``.

    Returns:
        A ``(processed_markdown, missing_report)`` pair, in the order the
        Gradio outputs are declared. When ``file_path`` is empty or does not
        point to an existing file, the first element is a short message and
        the second is an empty string; the pipeline is not run.
    """
    import os  # local import keeps this block self-contained

    # The text box can be submitted empty or half-typed; answer with a
    # readable message instead of letting the CSV reader raise.
    path = (file_path or "").strip()
    if not path:
        return "Please enter the path of a CSV file stored in this Space.", ""
    if not os.path.isfile(path):
        return f"File not found: `{path}`", ""

    missing_report, processed_data = process_dataset(
        path, chunk_size, lowercase, remove_punctuation, remove_stopwords
    )

    # Display results directly. NOTE: DataFrame.to_markdown relies on the
    # optional `tabulate` package being installed.
    return processed_data.to_markdown(index=False), missing_report
|
98 |
+
|
99 |
+
# Create the Gradio interface
|
100 |
# Build the Gradio UI with top-level components (`gr.Textbox`, `gr.Slider`, ...)
# and `value=` defaults. The legacy `gr.inputs` / `gr.outputs` namespaces and
# their `default=` keyword were removed from Gradio, and they never provided a
# Markdown output class.
ui = gr.Interface(
    fn=hf_spaces_interface,
    inputs=[
        gr.Textbox(label="File Path (within Spaces)", lines=1),  # Get file path from user
        gr.Slider(minimum=50, maximum=500, step=50, value=100, label="Chunk Size (words)"),
        gr.Checkbox(label="Convert Text to Lowercase", value=True),
        gr.Checkbox(label="Remove Punctuation", value=True),
        gr.Checkbox(label="Remove Stopwords", value=False),
    ],
    outputs=[
        gr.Markdown(label="Processed Data"),
        gr.Textbox(label="Missing Data Report"),
    ],
    title="Enhanced Data Cleaning and Embedding Tool",
    description=(
        "Upload your dataset to Spaces and provide the file path here. "
        "Clean, chunk, and generate embeddings using Llama LLM with Groq API. "
        "Customize text cleaning options and chunk size to suit your needs, or use the default settings."
    ),
    # Not live: a live interface re-runs the whole pipeline (including the
    # Groq API calls) on every keystroke in the path box. With live=False the
    # user triggers a run explicitly with the Submit button.
    live=False,
)
|
122 |
|
123 |
+
# Launch the app. Hugging Face Spaces already serves the app publicly and
# Gradio does not support share links there, so `share=True` is not passed.
if __name__ == "__main__":
    ui.launch()
|