Haseeb-001 committed on
Commit
e108ee1
·
verified ·
1 Parent(s): b52603a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -26
app.py CHANGED
@@ -7,11 +7,12 @@ from nltk.corpus import stopwords
7
  from tqdm import tqdm
8
  import nltk
9
 
 
10
  # Download stopwords for text cleaning
11
  nltk.download('stopwords')
12
  STOPWORDS = set(stopwords.words('english'))
13
 
14
- # Set Groq API Key
15
  GROQ_API_KEY = "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"
16
  client = Groq(api_key=GROQ_API_KEY)
17
 
@@ -56,9 +57,9 @@ def generate_embeddings(chunk):
56
  return chat_completion.choices[0].message.content
57
 
58
  # Main Function: Process Data
59
- def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
60
  # Load data
61
- data = pd.read_csv(file)
62
 
63
  # Generate missing data report
64
  missing_report = missing_data_report(data)
@@ -76,50 +77,49 @@ def process_dataset(file, chunk_size=100, lowercase=True, remove_punctuation=Tru
76
  lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
77
  )
78
 
79
- # Save cleaned data with embeddings
80
- output_file = 'processed_data.csv'
81
  cleaned_data.to_csv(output_file, index=False)
82
 
83
- # Display sample embeddings
84
- embedding_sample = cleaned_data['embeddings'].head(5)
85
-
86
- return missing_report, embedding_sample, output_file
87
 
88
- # Gradio UI
89
- def gradio_interface(file, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
90
- missing_report, embedding_sample, output_file = process_dataset(
91
- file, chunk_size, lowercase, remove_punctuation, remove_stopwords
92
- )
93
- return (
94
- missing_report,
95
- f"Sample Embeddings:\n{embedding_sample}",
96
- output_file
97
  )
98
 
99
- # Gradio App
 
 
 
100
  ui = gr.Interface(
101
- fn=gradio_interface,
102
  inputs=[
103
- gr.inputs.File(label="Upload CSV Dataset"),
104
  gr.inputs.Slider(50, 500, step=50, default=100, label="Chunk Size (words)"),
105
  gr.inputs.Checkbox(label="Convert Text to Lowercase", default=True),
106
  gr.inputs.Checkbox(label="Remove Punctuation", default=True),
107
  gr.inputs.Checkbox(label="Remove Stopwords", default=False),
108
  ],
109
  outputs=[
 
110
  gr.outputs.Textbox(label="Missing Data Report"),
111
- gr.outputs.Textbox(label="Embedding Sample"),
112
- gr.outputs.File(label="Download Processed Dataset"),
113
  ],
114
  title="Enhanced Data Cleaning and Embedding Tool",
115
  description=(
116
- "Upload your dataset to clean, chunk, and generate embeddings using Llama LLM with Groq API. "
 
117
  "Customize text cleaning options and chunk size to suit your needs, or use the default settings."
118
  ),
119
  theme="huggingface",
120
  live=True,
121
  )
122
 
123
- # Launch App
124
  if __name__ == "__main__":
125
- ui.launch()
 
7
  from tqdm import tqdm
8
  import nltk
9
 
10
+
11
  # Download stopwords for text cleaning
12
  nltk.download('stopwords')
13
  STOPWORDS = set(stopwords.words('english'))
14
 
15
# Set Groq API Key.
# SECURITY: the original hard-coded a live API key in the source file; a key
# committed to a public repo is leaked and must be revoked. Read it from the
# environment instead (configure it as a Spaces secret named GROQ_API_KEY).
import os

GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")  # empty string if unset; Groq calls will then fail fast
client = Groq(api_key=GROQ_API_KEY)
18
 
 
57
  return chat_completion.choices[0].message.content
58
 
59
  # Main Function: Process Data
60
+ def process_dataset(file_path, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
61
  # Load data
62
+ data = pd.read_csv(file_path)
63
 
64
  # Generate missing data report
65
  missing_report = missing_data_report(data)
 
77
  lambda chunks: [generate_embeddings(chunk) for chunk in chunks]
78
  )
79
 
80
+ # Save cleaned data with embeddings (adjust based on Spaces file system)
81
+ output_file = 'processed_data.csv' # Replace with appropriate path within Spaces
82
  cleaned_data.to_csv(output_file, index=False)
83
 
84
+ return missing_report, cleaned_data
 
 
 
85
 
86
# Gradio entry point adapted for Hugging Face Spaces.
def hf_spaces_interface(file_path, chunk_size=100, lowercase=True, remove_punctuation=True, remove_stopwords=False):
    """Run the cleaning/embedding pipeline on a file already present in Spaces.

    Parameters mirror ``process_dataset``: a CSV path plus the chunking and
    text-cleaning options. Returns a pair of
    (processed data rendered as a Markdown table, missing-data report)
    matching the two Gradio outputs.
    """
    report, frame = process_dataset(
        file_path,
        chunk_size,
        lowercase,
        remove_punctuation,
        remove_stopwords,
    )

    # Render the processed rows for direct display in the UI.
    rendered = frame.to_markdown(index=False)
    return rendered, report
98
+
99
# Create the Gradio interface.
# NOTE(review): the previous version used the Gradio 1.x/2.x `gr.inputs` /
# `gr.outputs` namespaces with the `default=` keyword; those were removed in
# Gradio 3.0, and current Spaces images ship Gradio >= 3, so the old calls
# raise AttributeError at import time. Rewritten with the top-level component
# API (`value=` replaces `default=`). The invalid `theme="huggingface"` string
# is dropped — modern Gradio expects a theme object or a registered theme name.
ui = gr.Interface(
    fn=hf_spaces_interface,
    inputs=[
        # Path to a CSV already uploaded into the Spaces filesystem.
        gr.Textbox(label="File Path (within Spaces)", lines=1),
        gr.Slider(50, 500, step=50, value=100, label="Chunk Size (words)"),
        gr.Checkbox(label="Convert Text to Lowercase", value=True),
        gr.Checkbox(label="Remove Punctuation", value=True),
        gr.Checkbox(label="Remove Stopwords", value=False),
    ],
    outputs=[
        gr.Markdown(label="Processed Data"),
        gr.Textbox(label="Missing Data Report"),
    ],
    title="Enhanced Data Cleaning and Embedding Tool",
    description=(
        "Upload your dataset to Spaces and provide the file path here. "
        "Clean, chunk, and generate embeddings using Llama LLM with Groq API. "
        "Customize text cleaning options and chunk size to suit your needs, or use the default settings."
    ),
    live=True,
)
122
 
123
# Script entry point: start the Gradio server when executed directly.
# (On Spaces the platform typically launches the app itself.)
if __name__ == "__main__":
    # share=True publishes a temporary public link (optional).
    ui.launch(share=True)