Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,25 +1,14 @@
 import os
 import pandas as pd
 import re
-from groq import Groq
 import gradio as gr
+from groq import Groq
 from nltk.corpus import stopwords
-
-
-# Manually defined stopwords
-STOPWORDS = set([
-    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
-    "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
-    "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these",
-    "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
-    "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by",
-    "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below",
-    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here",
-    "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
-    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should",
-    "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven", "isn",
-    "ma", "mightn", "mustn", "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn"
-])
+import nltk
+
+# Download stopwords
+nltk.download('stopwords')
+STOPWORDS = set(stopwords.words('english'))
 
 # Set Groq API Key
 GROQ_API_KEY = "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"
|
@@ -34,27 +23,31 @@ def missing_data_report(data):
 # Function: Clean Dataset
 def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     # Fill missing values
-    data.fillna(method="ffill", inplace=True)
-    data.fillna(method="bfill", inplace=True)
+    data.fillna(method='ffill', inplace=True)
+    data.fillna(method='bfill', inplace=True)
+
+    # Auto-generate column labels if missing
+    if data.columns.isnull().any():
+        data.columns = [f"Column_{i + 1}" for i in range(data.shape[1])]
 
     # Remove duplicates
     data = data.drop_duplicates()
 
     # Normalize and clean text columns
-    for col in data.select_dtypes(include=["object"]).columns:
+    for col in data.select_dtypes(include=['object']).columns:
         if lowercase:
             data[col] = data[col].str.lower()
         if remove_punctuation:
-            data[col] = data[col].apply(lambda x: re.sub(r"[^\w\s]", "", str(x)))
+            data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
         if remove_stopwords:
-            data[col] = data[col].apply(lambda x: " ".join([word for word in str(x).split() if word not in STOPWORDS]))
+            data[col] = data[col].apply(lambda x: ' '.join([word for word in str(x).split() if word not in STOPWORDS]))
 
     return data
 
 # Function: Chunk Text
 def chunk_text(text, max_length=100):
     words = text.split()
-    return [" ".join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
+    return [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
 
 # Function: Generate Embeddings
 def generate_embeddings(chunk):
@@ -68,7 +61,7 @@ def generate_embeddings(chunk):
 # Main Function: Process Data
 def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     # Load data
-    data = pd.read_csv(file.name)
+    data = pd.read_csv(file)
 
     # Generate missing data report
     missing_report = missing_data_report(data)
@@ -76,50 +69,55 @@ def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     # Step 1: Clean data
     cleaned_data = clean_data(data, lowercase, remove_punctuation, remove_stopwords)
 
-    # Step 2: Create chunks
-    if "text_column" in cleaned_data.columns:
-        cleaned_data["chunks"] = cleaned_data["text_column"].apply(lambda x: chunk_text(x, max_length=chunk_size))
-    else:
-        return "Error: 'text_column' not found in the dataset.", None, None
+    # Step 2: Create chunks
+    cleaned_data['chunks'] = cleaned_data['text_column'].apply(lambda x: chunk_text(x, max_length=chunk_size))
 
-    # Step 3: Generate embeddings
-    cleaned_data["embeddings"] = cleaned_data["chunks"].apply(
+    # Step 3: Generate embeddings
+    cleaned_data['embeddings'] = cleaned_data['chunks'].apply(
         lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
     )
 
     # Save cleaned data with embeddings
-    output_file = "processed_data.csv"
+    output_file = 'processed_data.csv'
     cleaned_data.to_csv(output_file, index=False)
 
     # Display sample embeddings
-    embedding_sample = cleaned_data["embeddings"].head(5)
+    embedding_sample = cleaned_data['embeddings'].head(5)
 
     return missing_report, embedding_sample, output_file
 
-# Gradio
-def gradio_interface(file, chunk_size, lowercase, remove_punctuation, remove_stopwords):
+# Gradio Interface Function
+def gradio_interface(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     missing_report, embedding_sample, output_file = process_dataset(
         file, chunk_size, lowercase, remove_punctuation, remove_stopwords
     )
-    return missing_report, embedding_sample, output_file
+    return (
+        missing_report,
+        f"Sample Embeddings:\n{embedding_sample}",
+        output_file
+    )
 
 # Gradio App
 ui = gr.Interface(
     fn=gradio_interface,
     inputs=[
-        gr.File(label="Upload CSV Dataset"),
-        gr.Slider(50, 500, step=50, label="Chunk Size"),
-        gr.Checkbox(label="Convert Text to Lowercase", default=True),
-        gr.Checkbox(label="Remove Punctuation", default=True),
-        gr.Checkbox(label="Remove Stopwords", default=False),
+        gr.File(label="Upload CSV Dataset"),
+        gr.Slider(50, 500, step=50, default=100, label="Chunk Size (words)"),
+        gr.Checkbox(label="Convert Text to Lowercase", default=True),
+        gr.Checkbox(label="Remove Punctuation", default=True),
+        gr.Checkbox(label="Remove Stopwords", default=False),
     ],
     outputs=[
-        gr.Textbox(label="Missing Data Report"),
-        gr.Textbox(label="Embedding Sample"),
-        gr.File(label="Download Processed Dataset"),
+        gr.Textbox(label="Missing Data Report"),
+        gr.Textbox(label="Embedding Sample"),
+        gr.File(label="Download Processed Dataset"),
     ],
-    title="Data Cleaning & Embedding Tool",
-    description="Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API.",
+    title="Advanced Data Cleaning & Embedding Tool",
+    description=(
+        "Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API. "
+        "Customize text cleaning options, chunk size, and more. Automatically adds column labels if missing."
+    ),
+    live=True,
 )
 
 # Launch App
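Review notes:

Both the removed and the added fillna lines use the method= argument, which pandas deprecated in 2.1 and removed in 3.0; inplace=True also mutates the DataFrame the caller passed in. A minimal sketch of the forward-compatible spelling with the same fill behavior:

    # Forward-fill then back-fill without the deprecated method= argument;
    # returns a new frame instead of mutating the caller's object.
    data = data.ffill().bfill()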
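The new auto-labeling branch is effectively dead code: pd.read_csv never produces NaN column names (blank header cells become "Unnamed: 0" and so on), so data.columns.isnull().any() stays False. A sketch of what handling a genuinely headerless CSV would look like, assuming the caller knows the file has no header row:

    # Hypothetical headerless-CSV path: header=None stops read_csv from
    # consuming the first data row as column names.
    data = pd.read_csv(file, header=None)
    data.columns = [f"Column_{i + 1}" for i in range(data.shape[1])]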
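The commit also drops the old guard around text_column, so any uploaded CSV without a column literally named text_column now raises a KeyError inside process_dataset instead of returning the friendly error message. A minimal sketch restoring the check before the chunking step:

    # Guard restored from the removed branch: fail gracefully when the
    # expected text column is absent.
    if 'text_column' not in cleaned_data.columns:
        return "Error: 'text_column' not found in the dataset.", None, None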
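Note that to_csv stringifies the per-row Python lists in the chunks and embeddings columns, so they cannot be read back as arrays without re-parsing. If round-tripping matters, a pickle output is one alternative (assuming the downstream consumer is also Python):

    # Lists survive a pickle round-trip; CSV turns them into strings.
    cleaned_data.to_pickle('processed_data.pkl')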
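The Slider and Checkbox inputs pass default=, which is Gradio 2.x syntax; Gradio 3 renamed it to value=, and default= fails at startup on current releases, which is consistent with the Space's "Runtime error" status. A sketch of the same inputs under that assumption:

    # Gradio 3+ takes value= for the initial state (default= is the old 2.x name).
    gr.Slider(50, 500, step=50, value=100, label="Chunk Size (words)"),
    gr.Checkbox(label="Convert Text to Lowercase", value=True),
    gr.Checkbox(label="Remove Punctuation", value=True),
    gr.Checkbox(label="Remove Stopwords", value=False),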
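Finally, the Groq API key is committed in plaintext at the top of app.py; a key pushed to a public Space should be treated as leaked and rotated. A sketch reading it from the environment instead (assuming a Spaces secret named GROQ_API_KEY):

    # Read the key from the environment rather than hardcoding it in the repo.
    import os
    GROQ_API_KEY = os.environ["GROQ_API_KEY"]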