SantanuBanerjee committed (verified)
Commit 2e7a421 · 1 Parent(s): 0e7ae0f

Update app.py

Files changed (1):
  1. app.py +67 -70
app.py CHANGED
@@ -210,97 +210,94 @@ def text_processing_for_domain(text):
 # Cluster Visualization: Added a simple visualization to display the top words in each cluster.
 # Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
 
+
+
 from sentence_transformers import SentenceTransformer
-from sklearn.cluster import AgglomerativeClustering
+from sklearn.cluster import AgglomerativeClustering, KMeans
+from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics import silhouette_score
-# from bertopic import BERTopic
+from bertopic import BERTopic
 from collections import Counter
+import numpy as np
 
 
-# def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
-# def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
 def extract_problem_domains(df,
                             text_column='Problem_Description',
                             cluster_range=(10, 50),
-                            top_words=17):
-
-    # Select Model (can we also optimize model selection automatically?)
-    # model = SentenceTransformer('all-MiniLM-L6-v2')
-    model = SentenceTransformer('all-mpnet-base-v2')
-    # model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
-    # Generate embeddings
-    embeddings = model.encode(df[text_column].tolist())
-
-    # Perform hierarchical clustering with Silhouette Analysis
-    silhouette_scores = []
-    for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
-        clustering = AgglomerativeClustering(n_clusters=n_clusters)
-        cluster_labels = clustering.fit_predict(embeddings)
-        silhouette_avg = silhouette_score(embeddings, cluster_labels)
-        silhouette_scores.append(silhouette_avg)
-
-    # Determine the optimal number of clusters
-    optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-
-    # Perform clustering with the optimal number of clusters
-    clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
-    cluster_labels = clustering.fit_predict(embeddings)
-
-    # # BERTopic for topic modelling
-    # # topic_model = BERTopic(num_topics=optimal_n_clusters)
-    # # topics, _ = topic_model.fit_transform(df[text_column].tolist())
+                            top_words=17,
+                            # method='sentence_transformers'
+                            method='tfidf_kmeans'
+                            ):
+
+    if method == 'sentence_transformers':
+        # Sentence Transformers approach
+        model = SentenceTransformer('all-mpnet-base-v2')
+        embeddings = model.encode(df[text_column].tolist())
+
+        # Perform hierarchical clustering with Silhouette Analysis
+        silhouette_scores = []
+        for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+            clustering = AgglomerativeClustering(n_clusters=n_clusters)
+            cluster_labels = clustering.fit_predict(embeddings)
+            silhouette_avg = silhouette_score(embeddings, cluster_labels)
+            silhouette_scores.append(silhouette_avg)
+
+        # Determine the optimal number of clusters
+        optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+
+        # Perform clustering with the optimal number of clusters
+        clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
+        cluster_labels = clustering.fit_predict(embeddings)
+
+    elif method == 'tfidf_kmeans':
+        # TF-IDF vectorization and K-Means approach
+        vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
+        X = vectorizer.fit_transform(df[text_column])
+
+        # Perform K-Means clustering with Silhouette Analysis
+        silhouette_scores = []
+        for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+            cluster_labels = kmeans.fit_predict(X)
+            silhouette_avg = silhouette_score(X, cluster_labels)
+            silhouette_scores.append(silhouette_avg)
+
+        # Determine the optimal number of clusters
+        optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+
+        # Perform final clustering with the optimal number of clusters
+        kmeans = KMeans(n_clusters=optimal_n_clusters, random_state=42)
+        cluster_labels = kmeans.fit_predict(X)
+
+    # # BERTopic approach (commented out)
     # topic_model = BERTopic()
     # topics, _ = topic_model.fit_transform(df[text_column].tolist())
     # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
+    # cluster_labels = topics
 
-
-    # # Get representative words for each cluster
-    # cluster_representations = {}
-    # for i in range(optimal_n_clusters):
-    #     # cluster_representations[i] = topic_model.get_topic_info(i)['words'][:top_words]
-    #     cluster_representations[i] = topic_model.get_topic_info(i).get('words', [])[:top_words]
-
-    # Get representative words for each cluster (without BERTopic)
-    cluster_representations = {}
-    for i in range(optimal_n_clusters):
-        # Use the most common words in each cluster as representative words
-        cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
-        cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
-
+    # Get representative words for each cluster
+    if method == 'sentence_transformers':
+        cluster_representations = {}
+        for i in range(optimal_n_clusters):
+            cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
+            cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
+    elif method == 'tfidf_kmeans':
+        feature_names = vectorizer.get_feature_names_out()
+        cluster_representations = {}
+        for i in range(optimal_n_clusters):
+            center = kmeans.cluster_centers_[i]
+            top_word_indices = center.argsort()[-top_words:][::-1]
+            # Distinct name: assigning back to top_words would shadow the
+            # parameter and break the slice on the next loop iteration
+            top_cluster_words = [feature_names[index] for index in top_word_indices]
+            cluster_representations[i] = top_cluster_words
 
     # Map cluster labels to representative words
     df["Problem_Cluster"] = cluster_labels
     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
-
-    # # Print clusters and their representative words
-    # for cluster_label, words in cluster_representations.items():
-    #     print(f"Domain {cluster_label}: {', '.join(words)}")
-
-    # return df.assign(cluster=cluster_labels), optimal_n_clusters
-
-    # df[new_column_name] = clustering.fit_predict(embeddings)
+
     return df, optimal_n_clusters
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 
 
 # Usage
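The hunk is truncated at the # Usage marker. Below is a minimal, hypothetical usage sketch for the new default tfidf_kmeans path; the toy DataFrame and the shrunken cluster_range are illustrative assumptions, not part of this commit (silhouette analysis needs 2 <= n_clusters <= n_samples - 1, so the (10, 50) default requires a correspondingly larger dataset).

# Hypothetical usage sketch (not from the commit): exercises
# extract_problem_domains on toy data with a toy-sized cluster_range.
import pandas as pd

toy_df = pd.DataFrame({
    "Problem_Description": [
        "potholes damage cars on the highway",
        "roads are full of potholes after winter",
        "hospital wait times are far too long",
        "patients wait hours in the emergency room",
        "school funding keeps shrinking every year",
        "teachers lack basic classroom supplies",
    ]
})

clustered_df, n_domains = extract_problem_domains(
    toy_df,
    text_column="Problem_Description",
    cluster_range=(2, 4),  # shrunk from the (10, 50) default to fit 6 rows
    top_words=5,
    method="tfidf_kmeans",
)
print(f"Optimal number of problem domains: {n_domains}")
print(clustered_df[["Problem_Cluster", "Problem_Category_Words"]])

Note that the function mutates and returns the input DataFrame, adding the Problem_Cluster and Problem_Category_Words columns alongside the selected cluster count.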