Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -3,27 +3,13 @@ import pandas as pd
|
|
3 |
import re
|
4 |
from groq import Groq
|
5 |
import gradio as gr
|
6 |
-
from nltk.corpus import stopwords
|
7 |
-
from tqdm import tqdm
|
8 |
-
import nltk
|
9 |
|
10 |
-
|
11 |
-
# Download stopwords for text cleaning
|
12 |
-
nltk.download('stopwords')
|
13 |
-
STOPWORDS = set(stopwords.words('english'))
|
14 |
-
|
15 |
-
# Set Groq API Key (Consider environment variables for security)
|
16 |
import os

# Read the Groq API key from the environment instead of hard-coding it.
# On Hugging Face Spaces, define it as a secret named GROQ_API_KEY.
# SECURITY: a literal key was previously committed on this line; it remains in
# the repository history and must be revoked and rotated.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail early with an actionable message rather than on the first API call.
    raise RuntimeError(
        "GROQ_API_KEY environment variable is not set; "
        "configure it as a secret before starting the app."
    )
client = Groq(api_key=GROQ_API_KEY)
|
18 |
|
19 |
-
# Function: Generate Missing Data Report
|
20 |
-
def missing_data_report(data):
    """Return a plain-text summary of missing values in *data*.

    Args:
        data: A pandas DataFrame.

    Returns:
        A string listing the null count of every column, followed by the
        total number of null cells.
    """
    missing_per_column = data.isnull().sum()
    total_missing = int(missing_per_column.sum())
    # Use to_string() rather than the Series repr: the repr is truncated at
    # pandas' display row limit (hiding columns of wide datasets) and ends
    # with a "dtype: ..." line that is noise in a user-facing report.
    return (
        "Missing Data Report:\n\n"
        f"{missing_per_column.to_string()}\n\n"
        f"Total Missing Values: {total_missing}"
    )
|
24 |
-
|
25 |
# Function: Clean Dataset
|
26 |
-
def clean_data(data
|
27 |
# Fill missing values
|
28 |
data.fillna(method='ffill', inplace=True)
|
29 |
data.fillna(method='bfill', inplace=True)
|
@@ -33,12 +19,7 @@ def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=F
|
|
33 |
|
34 |
# Normalize and clean text columns
|
35 |
for col in data.select_dtypes(include=['object']).columns:
|
36 |
-
|
37 |
-
data[col] = data[col].str.lower()
|
38 |
-
if remove_punctuation:
|
39 |
-
data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
|
40 |
-
if remove_stopwords:
|
41 |
-
data[col] = data[col].apply(lambda x: ' '.join([word for word in str(x).split() if word not in STOPWORDS]))
|
42 |
|
43 |
return data
|
44 |
|
@@ -57,69 +38,39 @@ def generate_embeddings(chunk):
|
|
57 |
return chat_completion.choices[0].message.content
|
58 |
|
59 |
# Main Function: Process Data
|
60 |
-
def process_dataset(
|
61 |
# Load data
|
62 |
-
data = pd.read_csv(
|
63 |
-
|
64 |
-
# Generate missing data report
|
65 |
-
missing_report = missing_data_report(data)
|
66 |
-
|
67 |
# Step 1: Clean data
|
68 |
-
cleaned_data = clean_data(data
|
69 |
|
70 |
# Step 2: Create chunks
|
71 |
-
|
72 |
-
cleaned_data['chunks'] = cleaned_data['text_column'].progress_apply(lambda x: chunk_text(x, max_length=chunk_size))
|
73 |
|
74 |
# Step 3: Generate embeddings
|
75 |
-
|
76 |
-
cleaned_data['embeddings'] = cleaned_data['chunks'].progress_apply(
|
77 |
-
lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
|
78 |
-
)
|
79 |
-
|
80 |
-
# Save cleaned data with embeddings (adjust based on Spaces file system)
|
81 |
-
output_file = 'processed_data.csv' # Replace with appropriate path within Spaces
|
82 |
-
cleaned_data.to_csv(output_file, index=False)
|
83 |
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
def hf_spaces_interface(file_path, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
|
88 |
-
"""
|
89 |
-
Modified interface for Hugging Face Spaces.
|
90 |
-
Assumes the input file is already uploaded to the Spaces environment.
|
91 |
-
"""
|
92 |
-
missing_report, processed_data = process_dataset(
|
93 |
-
file_path, chunk_size, lowercase, remove_punctuation, remove_stopwords
|
94 |
-
)
|
95 |
|
96 |
-
|
97 |
-
|
|
|
|
|
98 |
|
99 |
-
#
|
100 |
ui = gr.Interface(
|
101 |
-
fn=
|
102 |
-
inputs=
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
gr.inputs.Checkbox(label="Remove Stopwords", default=False),
|
108 |
-
],
|
109 |
-
outputs=[
|
110 |
-
gr.outputs.Markdown(label="Processed Data"),
|
111 |
-
gr.outputs.Textbox(label="Missing Data Report"),
|
112 |
-
],
|
113 |
-
title="Enhanced Data Cleaning and Embedding Tool",
|
114 |
-
description=(
|
115 |
-
"Upload your dataset to Spaces and provide the file path here. "
|
116 |
-
"Clean, chunk, and generate embeddings using Llama LLM with Groq API. "
|
117 |
-
"Customize text cleaning options and chunk size to suit your needs, or use the default settings."
|
118 |
-
),
|
119 |
-
theme="huggingface",
|
120 |
live=True,
|
121 |
)
|
122 |
|
123 |
-
# Launch
|
124 |
if __name__ == "__main__":
|
125 |
-
ui.launch(
|
|
|
3 |
import re
|
4 |
from groq import Groq
|
5 |
import gradio as gr
|
|
|
|
|
|
|
6 |
|
7 |
+
# Set Groq API Key
|
|
|
|
|
|
|
|
|
|
|
8 |
import os

# Read the Groq API key from the environment instead of hard-coding it.
# On Hugging Face Spaces, define it as a secret named GROQ_API_KEY.
# SECURITY: a literal key was previously committed on this line; it remains in
# the repository history and must be revoked and rotated.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Fail early with an actionable message rather than on the first API call.
    raise RuntimeError(
        "GROQ_API_KEY environment variable is not set; "
        "configure it as a secret before starting the app."
    )
client = Groq(api_key=GROQ_API_KEY)
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
# Function: Clean Dataset
|
12 |
+
def clean_data(data):
|
13 |
# Fill missing values
|
14 |
data.fillna(method='ffill', inplace=True)
|
15 |
data.fillna(method='bfill', inplace=True)
|
|
|
19 |
|
20 |
# Normalize and clean text columns
|
21 |
for col in data.select_dtypes(include=['object']).columns:
|
22 |
+
data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x).lower()))
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
return data
|
25 |
|
|
|
38 |
return chat_completion.choices[0].message.content
|
39 |
|
40 |
# Main Function: Process Data
|
41 |
+
def process_dataset(file, text_column="text_column", output_path="processed_data.csv"):
    """Clean a CSV dataset, chunk its text column, and attach LLM output.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        text_column: Name of the column whose text is chunked. Defaults to
            the previously hard-coded ``"text_column"``.
        output_path: Destination CSV for the processed frame. Defaults to
            the previously hard-coded ``"processed_data.csv"``.

    Returns:
        A human-readable status message naming the output file.

    Raises:
        KeyError: If ``text_column`` is not present after cleaning.
    """
    # Load data
    data = pd.read_csv(file)

    # Step 1: Clean data
    cleaned_data = clean_data(data)

    # Fail with a descriptive message before any (slow, billable) LLM calls
    # are made, instead of a bare KeyError from the column lookup.
    if text_column not in cleaned_data.columns:
        raise KeyError(
            f"Column '{text_column}' not found in dataset; "
            f"available columns: {list(cleaned_data.columns)}"
        )

    # Step 2: Create chunks (chunk_text is expected to return an iterable of
    # chunks per row -- it is iterated over below).
    cleaned_data["chunks"] = cleaned_data[text_column].apply(chunk_text)

    # Step 3: Generate embeddings -- one generate_embeddings() call per chunk.
    cleaned_data["embeddings"] = cleaned_data["chunks"].apply(
        lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
    )

    # Save cleaned data with embeddings
    cleaned_data.to_csv(output_path, index=False)
    return f"Dataset cleaned, chunked, and embedded successfully! Saved as '{output_path}'."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
+
# Gradio UI
|
59 |
+
def gradio_interface(file):
    """Gradio callback: run the processing pipeline on an uploaded CSV.

    Args:
        file: The value supplied by the file-upload component. May be
            ``None`` (nothing uploaded, or the input was cleared), a path
            string, or an object exposing the path as ``.name``.

    Returns:
        The status message from ``process_dataset``, or a prompt to upload
        a file when no file was provided.
    """
    if file is None:
        # Previously this raised AttributeError on `None.name`.
        return "Please upload a CSV file to process."

    # Accept both a plain path string and a tempfile-like wrapper with .name.
    csv_path = getattr(file, "name", file)
    return process_dataset(csv_path)
|
62 |
|
63 |
+
# Gradio App
|
64 |
# Components come from the top-level `gr` namespace: the legacy `gr.inputs` /
# `gr.outputs` modules were removed in Gradio 4, so referencing them raises
# AttributeError while the module is loading and the app never starts.
ui = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Upload CSV Dataset"),
    outputs=gr.Textbox(label="Processing Result"),
    title="Data Cleaning and Embedding Tool",
    description=(
        "Upload your dataset to clean, chunk, and generate embeddings using "
        "Llama LLM with Groq API. Perfect for deployment on Hugging Face."
    ),
    # The legacy theme="compact" string is intentionally omitted: it is not a
    # built-in theme name in current Gradio, so the default theme is used.
    live=True,  # re-run automatically whenever the uploaded file changes
)

# Launch App
if __name__ == "__main__":
    ui.launch()
|