Haseeb-001 committed on
Commit c409139 · verified · 1 Parent(s): ac3f2e1

Update app.py

Files changed (1)
  1. app.py +44 -46
app.py CHANGED
@@ -1,25 +1,14 @@
 import os
 import pandas as pd
 import re
-from groq import Groq
 import gradio as gr
+from groq import Groq
 from nltk.corpus import stopwords
-# Removed tqdm import due to compatibility issues
-
-# Set stopwords for text cleaning (you can hard-code them if you face download issues)
-STOPWORDS = set([
-    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
-    "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
-    "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these",
-    "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
-    "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by",
-    "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below",
-    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here",
-    "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
-    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should",
-    "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven", "isn",
-    "ma", "mightn", "mustn", "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn"
-])
+import nltk
+
+# Download stopwords
+nltk.download('stopwords')
+STOPWORDS = set(stopwords.words('english'))

 # Set Groq API Key
 GROQ_API_KEY = "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"
@@ -34,27 +23,31 @@ def missing_data_report(data):
 # Function: Clean Dataset
 def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     # Fill missing values
-    data.fillna(method="ffill", inplace=True)
-    data.fillna(method="bfill", inplace=True)
+    data.fillna(method='ffill', inplace=True)
+    data.fillna(method='bfill', inplace=True)
+
+    # Auto-generate column labels if missing
+    if data.columns.isnull().any():
+        data.columns = [f"Column_{i + 1}" for i in range(data.shape[1])]

     # Remove duplicates
     data = data.drop_duplicates()

     # Normalize and clean text columns
-    for col in data.select_dtypes(include=["object"]).columns:
+    for col in data.select_dtypes(include=['object']).columns:
         if lowercase:
             data[col] = data[col].str.lower()
         if remove_punctuation:
-            data[col] = data[col].apply(lambda x: re.sub(r"[^\w\s]", "", str(x)))
+            data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
         if remove_stopwords:
-            data[col] = data[col].apply(lambda x: " ".join([word for word in str(x).split() if word not in STOPWORDS]))
+            data[col] = data[col].apply(lambda x: ' '.join([word for word in str(x).split() if word not in STOPWORDS]))

     return data

 # Function: Chunk Text
 def chunk_text(text, max_length=100):
     words = text.split()
-    return [" ".join(words[i : i + max_length]) for i in range(0, len(words), max_length)]
+    return [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]

 # Function: Generate Embeddings
 def generate_embeddings(chunk):
@@ -68,7 +61,7 @@ def generate_embeddings(chunk):
 # Main Function: Process Data
 def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     # Load data
-    data = pd.read_csv(file.name)
+    data = pd.read_csv(file)

     # Generate missing data report
     missing_report = missing_data_report(data)
@@ -76,50 +69,55 @@ def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=Tru
     # Step 1: Clean data
     cleaned_data = clean_data(data, lowercase, remove_punctuation, remove_stopwords)

-    # Step 2: Create chunks (removed tqdm)
-    if "text_column" in cleaned_data.columns:
-        cleaned_data["chunks"] = cleaned_data["text_column"].apply(lambda x: chunk_text(x, max_length=chunk_size))
-    else:
-        return "Error: 'text_column' not found in the dataset.", None, None
+    # Step 2: Create chunks
+    cleaned_data['chunks'] = cleaned_data['text_column'].apply(lambda x: chunk_text(x, max_length=chunk_size))

-    # Step 3: Generate embeddings (removed tqdm)
-    cleaned_data["embeddings"] = cleaned_data["chunks"].apply(
+    # Step 3: Generate embeddings
+    cleaned_data['embeddings'] = cleaned_data['chunks'].apply(
         lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
     )

     # Save cleaned data with embeddings
-    output_file = "processed_data.csv"
+    output_file = 'processed_data.csv'
     cleaned_data.to_csv(output_file, index=False)

     # Display sample embeddings
-    embedding_sample = cleaned_data["embeddings"].head(5).to_string()
+    embedding_sample = cleaned_data['embeddings'].head(5)

     return missing_report, embedding_sample, output_file

-# Gradio UI
-def gradio_interface(file, chunk_size, lowercase, remove_punctuation, remove_stopwords):
+# Gradio Interface Function
+def gradio_interface(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
     missing_report, embedding_sample, output_file = process_dataset(
         file, chunk_size, lowercase, remove_punctuation, remove_stopwords
     )
-    return missing_report, embedding_sample, output_file
+    return (
+        missing_report,
+        f"Sample Embeddings:\n{embedding_sample}",
+        output_file
+    )

 # Gradio App
 ui = gr.Interface(
     fn=gradio_interface,
     inputs=[
-        gr.File(label="Upload CSV Dataset"),
-        gr.Slider(50, 500, step=50, value=100, label="Chunk Size (words)"),
-        gr.Checkbox(label="Convert Text to Lowercase", value=True),
-        gr.Checkbox(label="Remove Punctuation", value=True),
-        gr.Checkbox(label="Remove Stopwords", value=False),
+        gr.File(label="📁 Upload CSV Dataset"),
+        gr.Slider(50, 500, step=50, default=100, label="📏 Chunk Size (words)"),
+        gr.Checkbox(label="🔠 Convert Text to Lowercase", default=True),
+        gr.Checkbox(label="❌ Remove Punctuation", default=True),
+        gr.Checkbox(label="🗑️ Remove Stopwords", default=False),
     ],
     outputs=[
-        gr.Text(label="Missing Data Report"),
-        gr.Text(label="Embedding Sample"),
-        gr.File(label="Download Processed Dataset"),
+        gr.Textbox(label="📊 Missing Data Report"),
+        gr.Textbox(label="✨ Embedding Sample"),
+        gr.File(label="⬇️ Download Processed Dataset"),
     ],
-    title="Enhanced Data Cleaning and Embedding Tool",
-    description="Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API.",
+    title="🔍 Advanced Data Cleaning & Embedding Tool",
+    description=(
+        "Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API. "
+        "Customize text cleaning options, chunk size, and more. Automatically adds column labels if missing."
+    ),
+    live=True,
 )

 # Launch App
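
The updated file builds STOPWORDS from the NLTK corpus at import time instead of the hard-coded list. A minimal, self-contained sketch of what that setup does; it assumes the runtime can reach the NLTK download server at startup, and the `remove_stopword_tokens` helper is illustrative only (the app applies the same expression inline in `clean_data`):

```python
import nltk
from nltk.corpus import stopwords

# Same module-level setup as the updated app.py; stopwords.words('english')
# raises a LookupError if the corpus has not been downloaded first.
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words('english'))

def remove_stopword_tokens(text):
    # The filtering clean_data() applies to each object column
    return ' '.join(word for word in str(text).split() if word not in STOPWORDS)

print(remove_stopword_tokens("this is a small example of the cleaning step"))
# -> small example cleaning step
```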
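`clean_data` still fills gaps with `fillna(method='ffill')` followed by `fillna(method='bfill')`. On pandas 2.1 and later the `method=` keyword emits a FutureWarning; the diff does not pin a pandas version, so assuming a recent release, the same forward-then-backward fill can be written as:

```python
import pandas as pd

def fill_missing(data: pd.DataFrame) -> pd.DataFrame:
    # Forward-fill from earlier rows, then back-fill any leading NaNs,
    # matching the ffill/bfill order used in clean_data().
    return data.ffill().bfill()
```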
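For reference, the chunking step groups whitespace-separated words into fixed-size windows; a quick check of `chunk_text` with a small `max_length` shows the behaviour the embedding loop relies on:

```python
def chunk_text(text, max_length=100):
    words = text.split()
    return [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]

print(chunk_text("one two three four five", max_length=2))
# -> ['one two', 'three four', 'five']
```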
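The rewritten inputs pass `default=` to `gr.Slider` and `gr.Checkbox`. That keyword belongs to the Gradio 2.x API; Gradio 3.x/4.x components take `value=` instead (which is what the removed lines used), so whether the new calls run depends on the gradio version the Space pins. A sketch of the same controls under the assumption of a Gradio 3+ runtime:

```python
import gradio as gr

# Same controls as the updated ui definition, written with the `value=`
# keyword expected by Gradio 3.x/4.x components.
inputs = [
    gr.File(label="Upload CSV Dataset"),
    gr.Slider(50, 500, step=50, value=100, label="Chunk Size (words)"),
    gr.Checkbox(label="Convert Text to Lowercase", value=True),
    gr.Checkbox(label="Remove Punctuation", value=True),
    gr.Checkbox(label="Remove Stopwords", value=False),
]
```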