Haseeb-001 committed
Commit bce8bc7 · verified · 1 Parent(s): e108ee1

Update app.py

Files changed (1):
  1. app.py +25 -74
app.py CHANGED
@@ -3,27 +3,13 @@ import pandas as pd
 import re
 from groq import Groq
 import gradio as gr
-from nltk.corpus import stopwords
-from tqdm import tqdm
-import nltk
 
-
-# Download stopwords for text cleaning
-nltk.download('stopwords')
-STOPWORDS = set(stopwords.words('english'))
-
-# Set Groq API Key (Consider environment variables for security)
+# Set Groq API Key
 GROQ_API_KEY = "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"
 client = Groq(api_key=GROQ_API_KEY)
 
-# Function: Generate Missing Data Report
-def missing_data_report(data):
-    missing_report = data.isnull().sum()
-    total_missing = missing_report.sum()
-    return f"Missing Data Report:\n\n{missing_report}\n\nTotal Missing Values: {total_missing}"
-
 # Function: Clean Dataset
-def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=False):
+def clean_data(data):
    # Fill missing values
    data.fillna(method='ffill', inplace=True)
    data.fillna(method='bfill', inplace=True)
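
Two things this hunk keeps are worth flagging. First, a live-looking Groq API key stays hardcoded in the file (the removed comment even suggested environment variables). On Spaces the usual pattern is to store the key as a repository secret and read it at startup; a minimal sketch, assuming the Space defines a GROQ_API_KEY secret:

    import os
    from groq import Groq

    # Sketch: read the key from the environment instead of the source file.
    client = Groq(api_key=os.environ["GROQ_API_KEY"])

Second, the fillna(method='ffill') / fillna(method='bfill') calls in the retained context lines are deprecated as of pandas 2.1; the direct equivalents are data.ffill() and data.bfill().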
@@ -33,12 +19,7 @@ def clean_data(data, lowercase=True, remove_punctuation=True, remove_stopwords=False):
 
    # Normalize and clean text columns
    for col in data.select_dtypes(include=['object']).columns:
-        if lowercase:
-            data[col] = data[col].str.lower()
-        if remove_punctuation:
-            data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
-        if remove_stopwords:
-            data[col] = data[col].apply(lambda x: ' '.join([word for word in str(x).split() if word not in STOPWORDS]))
+        data[col] = data[col].apply(lambda x: re.sub(r'[^\w\s]', '', str(x).lower()))
 
    return data
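
This hunk folds the three configurable cleaning options (lowercasing, punctuation stripping, stopword removal) into one unconditional pass, which is why the nltk and stopword machinery disappears from the imports above. Note that [^\w\s] is more aggressive than "remove punctuation" suggests: it also deletes apostrophes and hyphens inside words. A quick illustration of the new one-liner on a sample value:

    import re

    # The replacement cleaning step from this hunk, applied to one string.
    s = "Hello, World! It's sold-out."
    print(re.sub(r'[^\w\s]', '', s.lower()))  # prints: hello world its soldout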
@@ -57,69 +38,39 @@ def generate_embeddings(chunk):
    return chat_completion.choices[0].message.content
 
 # Main Function: Process Data
-def process_dataset(file_path, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
+def process_dataset(file):
    # Load data
-    data = pd.read_csv(file_path)
-
-    # Generate missing data report
-    missing_report = missing_data_report(data)
-
+    data = pd.read_csv(file)
+
    # Step 1: Clean data
-    cleaned_data = clean_data(data, lowercase, remove_punctuation, remove_stopwords)
+    cleaned_data = clean_data(data)
 
    # Step 2: Create chunks
-    tqdm.pandas(desc="Chunking Data")
-    cleaned_data['chunks'] = cleaned_data['text_column'].progress_apply(lambda x: chunk_text(x, max_length=chunk_size))
+    cleaned_data['chunks'] = cleaned_data['text_column'].apply(chunk_text)
 
    # Step 3: Generate embeddings
-    tqdm.pandas(desc="Generating Embeddings")
-    cleaned_data['embeddings'] = cleaned_data['chunks'].progress_apply(
-        lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
-    )
-
-    # Save cleaned data with embeddings (adjust based on Spaces file system)
-    output_file = 'processed_data.csv'  # Replace with appropriate path within Spaces
-    cleaned_data.to_csv(output_file, index=False)
 
-    return missing_report, cleaned_data
+    cleaned_data['embeddings'] = cleaned_data['chunks'].apply(lambda chunks: [generate_embeddings(chunk) for chunk in chunks])
 
-# Modified Gradio interface for Hugging Face Spaces
-def hf_spaces_interface(file_path, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
-    """
-    Modified interface for Hugging Face Spaces.
-    Assumes the input file is already uploaded to the Spaces environment.
-    """
-    missing_report, processed_data = process_dataset(
-        file_path, chunk_size, lowercase, remove_punctuation, remove_stopwords
-    )
+    # Save cleaned data with embeddings
+    cleaned_data.to_csv('processed_data.csv', index=False)
+    return "Dataset cleaned, chunked, and embedded successfully! Saved as 'processed_data.csv'."
 
-    # Display results directly
-    return processed_data.to_markdown(index=False), missing_report
+# Gradio UI
+def gradio_interface(file):
+    result = process_dataset(file.name)
+    return result
 
-# Create the Gradio interface
+# Gradio App
 ui = gr.Interface(
-    fn=hf_spaces_interface,
-    inputs=[
-        gr.inputs.Textbox(label="File Path (within Spaces)", lines=1),  # Get file path from user
-        gr.inputs.Slider(50, 500, step=50, default=100, label="Chunk Size (words)"),
-        gr.inputs.Checkbox(label="Convert Text to Lowercase", default=True),
-        gr.inputs.Checkbox(label="Remove Punctuation", default=True),
-        gr.inputs.Checkbox(label="Remove Stopwords", default=False),
-    ],
-    outputs=[
-        gr.outputs.Markdown(label="Processed Data"),
-        gr.outputs.Textbox(label="Missing Data Report"),
-    ],
-    title="Enhanced Data Cleaning and Embedding Tool",
-    description=(
-        "Upload your dataset to Spaces and provide the file path here. "
-        "Clean, chunk, and generate embeddings using Llama LLM with Groq API. "
-        "Customize text cleaning options and chunk size to suit your needs, or use the default settings."
-    ),
-    theme="huggingface",
+    fn=gradio_interface,
+    inputs=gr.inputs.File(label="Upload CSV Dataset"),
+    outputs=gr.outputs.Textbox(label="Processing Result"),
+    title="Data Cleaning and Embedding Tool",
+    description="Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API. Perfect for deployment on Hugging Face.",
+    theme="compact",
    live=True,
 )
 
-# Launch the app (for Spaces, this might be handled differently)
+# Launch App
 if __name__ == "__main__":
-    ui.launch(share=True)  # Share the app publicly (optional)
+    ui.launch()
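
Both versions still route every chunk through generate_embeddings, whose body lies outside the changed lines; its surviving context line (return chat_completion.choices[0].message.content) shows it returns the text of a chat completion, so the 'embeddings' column holds LLM-generated text rather than numeric vectors. A hypothetical reconstruction of that unchanged helper, with the prompt and model name as assumptions rather than values taken from the diff:

    # Hypothetical sketch of the unchanged helper, inferred from the hunk
    # header and its context line; prompt and model name are assumptions.
    def generate_embeddings(chunk):
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": f"Represent this text for retrieval: {chunk}"}],
            model="llama-3.1-8b-instant",
        )
        return chat_completion.choices[0].message.content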
 
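chunk_text is likewise defined outside the hunks shown here. The old call passed max_length=chunk_size (default 100 words); the new call relies on the function's own default. Under those assumptions, a plausible word-window splitter would look like:

    # Hypothetical sketch of the chunker both versions call; only its name
    # and the old max_length keyword are visible in this diff.
    def chunk_text(text, max_length=100):
        words = str(text).split()
        return [' '.join(words[i:i + max_length])
                for i in range(0, len(words), max_length)]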
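The new interface still uses the Gradio 2.x namespaces gr.inputs.File and gr.outputs.Textbox plus a string theme, all of which were removed in Gradio 3/4; on a current Spaces image this constructor raises AttributeError unless an old gradio version is pinned. A sketch of the same UI against Gradio 4.x (an assumed version; the repo's pin is not shown in this diff):

    # Gradio 4.x sketch: components moved to the top level, string themes
    # were dropped, and gr.File passes the function a filepath string, so
    # process_dataset can be wired in directly without the .name wrapper.
    ui = gr.Interface(
        fn=process_dataset,
        inputs=gr.File(label="Upload CSV Dataset"),
        outputs=gr.Textbox(label="Processing Result"),
        title="Data Cleaning and Embedding Tool",
    )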
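One last failure mode worth noting: process_dataset indexes cleaned_data['text_column'], so the uploaded CSV must contain a column literally named text_column or the app fails mid-run with a KeyError. A hypothetical guard, not part of this commit:

    # Hypothetical guard: fail fast with a readable message instead of a
    # KeyError from the later apply() call on 'text_column'.
    if 'text_column' not in data.columns:
        raise ValueError("Expected a 'text_column' column in the uploaded CSV.")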