SantanuBanerjee committed
Commit 35e172a · verified · 1 Parent(s): 7f8700c

Update app.py

Files changed (1)
  1. app.py (+145, -11)
app.py CHANGED
@@ -113,16 +113,49 @@ from nltk.tokenize import word_tokenize
 nltk.download('punkt')
 nltk.download('stopwords')

-def combined_text_processing(text):
-    # Remove punctuation, numbers, URLs, and special characters
-    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
-    text = re.sub(r'\d+', '', text)  # Remove numbers
-    text = re.sub(r'http\S+', '', text)  # Remove URLs
+# def combined_text_processing(text):
+#     # Remove punctuation, numbers, URLs, and special characters
+#     text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
+#     text = re.sub(r'\d+', '', text)  # Remove numbers
+#     text = re.sub(r'http\S+', '', text)  # Remove URLs
+
+#     # Tokenize and remove stopwords
+#     tokens = word_tokenize(text.lower())  # Convert to lowercase
+#     stop_words = set(stopwords.words('english'))
+#     tokens = [word for word in tokens if word not in stop_words]
+
+#     # Lemmatize tokens using SpaCy
+#     doc = nlp(' '.join(tokens))
+#     lemmatized_text = ' '.join([token.lemma_ for token in doc])
+
+#     # Apply Hugging Face Transformers
+#     inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
+#     with torch.no_grad():
+#         outputs = model(**inputs)
+
+#     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+
+
+def text_processing_for_domain(text):
+    # Text Cleaning
+    text = re.sub(r'[^\w\s]', '', text)
+    text = re.sub(r'\d+', '', text)
+    text = re.sub(r'http\S+', '', text)  # Remove https URLs
+    text = re.sub(r'www\.\S+', '', text)  # Remove www URLs

     # Tokenize and remove stopwords
-    tokens = word_tokenize(text.lower())  # Convert to lowercase
+    tokens = word_tokenize(text.lower())
     stop_words = set(stopwords.words('english'))
-    tokens = [word for word in tokens if word not in stop_words]
+    custom_stopwords = {'example', 'another'}  # Add custom stopwords
+    tokens = [word for word in tokens if word not in stop_words and word not in custom_stopwords]
+
+    # NER - Remove named entities
+    doc = nlp(' '.join(tokens))
+    tokens = [token.text for token in doc if not token.ent_type_]
+
+    # POS Tagging (optional)
+    pos_tags = nltk.pos_tag(tokens)
+    tokens = [word for word, pos in pos_tags if pos in ['NN', 'NNS']]  # Filter nouns

     # Lemmatize tokens using SpaCy
     doc = nlp(' '.join(tokens))
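Note on the new text_processing_for_domain helper in this hunk: it relies on the module-level spaCy nlp object that app.py appears to load elsewhere, and nltk.pos_tag additionally needs the averaged_perceptron_tagger resource, which is not among the downloads shown above. Also, because punctuation is stripped before the URL patterns run, the www\.\S+ pattern no longer has a dot to match, so bare www URLs can slip through. The snippet below is a minimal, self-contained sketch of the same clean, tokenize, stopword/NER/POS filter, lemmatize flow with the URL stripping done first; the helper name clean_for_domain, the model name en_core_web_sm, and the sample sentence are illustrative rather than taken from app.py.

import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')   # needed by nltk.pos_tag (resource name can vary across NLTK versions)
nlp = spacy.load('en_core_web_sm')            # assumption: app.py loads its own spaCy model elsewhere

def clean_for_domain(text):
    # Strip URLs first, then punctuation and digits, so the http/www patterns still see their dots.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    stop = set(stopwords.words('english'))
    tokens = [t for t in word_tokenize(text.lower()) if t not in stop]
    doc = nlp(' '.join(tokens))
    tokens = [tok.text for tok in doc if not tok.ent_type_]                  # drop named entities
    tokens = [w for w, pos in nltk.pos_tag(tokens) if pos in ('NN', 'NNS')]  # keep nouns only
    return ' '.join(tok.lemma_ for tok in nlp(' '.join(tokens)))             # lemmatize what is left

print(clean_for_domain("Flooding near 3 villages: see https://example.org or www.example.org for the 2023 report"))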
 
@@ -139,6 +172,97 @@ def combined_text_processing(text):



+# # 2. Clustering from ChatGPT
+# # Libraries: scikit-learn, sentence-transformers
+# # Use sentence embeddings and clustering algorithms to group similar project proposals.
+# from bertopic import BERTopic
+# def perform_clustering(texts, n_clusters):
+#     topic_model = BERTopic(n_topics=n_clusters)
+#     topics, _ = topic_model.fit_transform(texts)
+#     return topics, topic_model
+# # Clustering function call
+# clustered_df, cluster_centers = clustering(processed_df)
+# Method 1: Sentence Transformers + KMeans
+
+# # 2. Clustering: from Claude
+# # Use BERTopic for advanced topic modeling and clustering.
+# from bertopic import BERTopic
+# def perform_clustering(texts, n_clusters):
+#     topic_model = BERTopic(n_topics=n_clusters)
+#     topics, _ = topic_model.fit_transform(texts)
+#     return topics, topic_model
+# # Clustering function call
+# problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
+# location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)
+# After this Method 2: BERTopic function, the following need to be done:
+# processed_df['Problem_Cluster'] = problem_clusters
+
+
+
+# 2. Meta AI Function: Sentence Transformers + Hierarchical Clustering + Silhouette Analysis
+# Now this also includes:
+# Topic Modeling using BERTopic: Integrated BERTopic to extract representative words for each cluster.
+# Cluster Visualization: Added a simple visualization to display the top words in each cluster.
+# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
+
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.metrics import silhouette_score
+from bertopic import BERTopic
+
+# def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
+def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
+
+    # Select Model (can we also optimize model selection automatically?)
+    # model = SentenceTransformer('all-MiniLM-L6-v2')
+    model = SentenceTransformer('all-mpnet-base-v2')
+    # model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+    # Generate embeddings
+    embeddings = model.encode(df[text_column].tolist())
+
+    # Perform hierarchical clustering with Silhouette Analysis
+    silhouette_scores = []
+    for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+        clustering = AgglomerativeClustering(n_clusters=n_clusters)
+        cluster_labels = clustering.fit_predict(embeddings)
+        silhouette_avg = silhouette_score(embeddings, cluster_labels)
+        silhouette_scores.append(silhouette_avg)
+
+    # Determine the optimal number of clusters
+    optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+
+    # Perform clustering with the optimal number of clusters
+    clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
+    cluster_labels = clustering.fit_predict(embeddings)
+
+
+    # BERTopic for topic modelling
+    topic_model = BERTopic(n_topics=optimal_n_clusters)
+    topics, _ = topic_model.fit_transform(df[text_column].tolist())
+
+
+    # Get representative words for each cluster
+    cluster_representations = {}
+    for i in range(optimal_n_clusters):
+        cluster_representations[i] = topic_model.get_topic_info(i)['words'][:top_words]
+
+    # Map cluster labels to representative words
+    df["Problem_Cluster"] = cluster_labels
+    df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
+
+    # # Print clusters and their representative words
+    # for cluster_label, words in cluster_representations.items():
+    #     print(f"Domain {cluster_label}: {', '.join(words)}")
+
+    # return df.assign(cluster=cluster_labels), optimal_n_clusters
+
+    # df[new_column_name] = clustering.fit_predict(embeddings)
+    return df, optimal_n_clusters
+
+# Usage
+# clustered_df, optimal_n_clusters = optimal_Problem_clustering(processed_df)
+# print(f'Optimal number of clusters: {optimal_n_clusters}')
+

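Reviewer note on extract_problem_domains: in the BERTopic releases I am aware of, the constructor argument is nr_topics rather than n_topics, and per-topic words are normally read with get_topic(topic_id) (get_topic_info returns a summary DataFrame), so the cluster_representations loop may need adjusting before it runs. The BERTopic topics are also fitted independently of the AgglomerativeClustering labels, so indexing the representations by the agglomerative label mixes two different labelings. Below is a small, self-contained sketch of the same silhouette-based choice of cluster count, plus a scikit-learn-only way to pull frequent terms that stays keyed to the agglomerative labels; the helper names pick_n_clusters and top_words_per_cluster and the toy texts are illustrative, not part of app.py.

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score

def pick_n_clusters(embeddings, lo=2, hi=6):
    # Same idea as the hunk above: scan a range of cluster counts and keep the best silhouette score.
    scores = {k: silhouette_score(embeddings,
                                  AgglomerativeClustering(n_clusters=k).fit_predict(embeddings))
              for k in range(lo, hi + 1)}
    return max(scores, key=scores.get)

def top_words_per_cluster(texts, labels, top_n=5):
    # Most frequent non-stopword terms in each cluster, keyed by the same labels
    # that AgglomerativeClustering produced.
    out = {}
    for c in sorted(set(labels)):
        vec = CountVectorizer(stop_words='english')
        counts = vec.fit_transform([' '.join(t for t, l in zip(texts, labels) if l == c)])
        terms = np.asarray(vec.get_feature_names_out())
        out[c] = terms[np.argsort(counts.toarray()[0])[::-1][:top_n]].tolist()
    return out

texts = ["flooding in coastal villages", "river flooding damaged many homes",
         "rural schools lack trained teachers", "teacher shortage in village schools",
         "crop failure after the drought", "the drought ruined this year's harvest"]
model = SentenceTransformer('all-mpnet-base-v2')   # same embedding model as the hunk above
embeddings = model.encode(texts)
k = pick_n_clusters(embeddings, lo=2, hi=4)
labels = AgglomerativeClustering(n_clusters=k).fit_predict(embeddings)
print(k, top_words_per_cluster(texts, labels))

Plain term counts per cluster are only one option; a class-based c-TF-IDF weighting (which BERTopic uses internally) would give more contrastive words for each cluster.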
 
 
@@ -153,15 +277,25 @@ def combined_text_processing(text):

 def nlp_pipeline(original_df):
     # Data Preprocessing
-    processed_df = data_pre_processing(original_df)
+    processed_df = data_pre_processing(original_df) # merged_dataset
+
+
+    # Starting the Pipeline for Domain Extraction
+    # Apply the text_processing_for_domain function to the DataFrame
+    processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
+    # Domain Clustering
+    domain_df, optimal_n_clusters = extract_problem_domains(processed_df)
+
+
+    # problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
+    # location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)


-    # Apply the combined function to your DataFrame
-    processed_df['Processed_ProblemDescription'] = processed_df['Problem_Description'].apply(combined_text_processing)



-    return processed_df
+    # return processed_df
+    return domain_df


 def process_excel(file):
     try:
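For reference, a toy driver showing the input shape the reworked nlp_pipeline expects and the columns this commit adds to the frame it returns; the column names come from the hunks above, the sample rows are invented, and data_pre_processing is assumed to be the helper already defined elsewhere in app.py.

import pandas as pd

# Toy input in the shape nlp_pipeline expects (column names as used in app.py).
toy = pd.DataFrame({
    "Problem_Description": [
        "Recurring flooding damages homes in coastal villages.",
        "Rural schools lack trained teachers and learning materials.",
    ],
    "Geographical_Location": ["Coastal district", "Northern uplands"],
})

# domain_df = nlp_pipeline(toy)   # returns only the DataFrame; optimal_n_clusters is computed but not returned
# Columns this commit adds to the returned frame:
#   Processed_ProblemDescription_forDomainExtraction  - cleaned text used for clustering
#   Problem_Cluster                                    - AgglomerativeClustering label
#   Problem_Category_Words                             - representative words for that label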