SantanuBanerjee committed on
Commit
0e7ae0f
·
verified ·
1 Parent(s): b187752

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -16
app.py CHANGED
@@ -213,7 +213,9 @@ def text_processing_for_domain(text):
213
  from sentence_transformers import SentenceTransformer
214
  from sklearn.cluster import AgglomerativeClustering
215
  from sklearn.metrics import silhouette_score
216
- from bertopic import BERTopic
 
 
217
 
218
  # def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
219
  # def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
@@ -239,30 +241,66 @@ def extract_problem_domains(df,
239
 
240
  # Determine the optimal number of clusters
241
  optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
242
-
243
- # BERTopic for topic modeling
244
- topic_model = BERTopic(nr_topics=optimal_n_clusters)
245
- topics, probs = topic_model.fit_transform(df[text_column].tolist())
246
-
247
- # Get representative words for each cluster
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  cluster_representations = {}
249
- for topic in range(len(topic_model.get_topic_info())):
250
- topic_words = topic_model.get_topic(topic)
251
- if isinstance(topic_words, list) and len(topic_words) > 0 and isinstance(topic_words[0], tuple):
252
- words = [word for word, _ in topic_words[:top_words]]
253
- else:
254
- words = []
255
- cluster_representations[topic] = words
 
 
256
 
257
  # Map cluster labels to representative words
258
- df["Problem_Cluster"] = topics
259
- df['Problem_Category_Words'] = [cluster_representations.get(label, []) for label in topics]
 
 
 
 
 
 
260
 
 
261
  return df, optimal_n_clusters
262
 
263
 
264
 
265
 
 
 
 
 
 
 
 
 
 
 
266
 
267
 
268
  # Usage
 
213
  from sentence_transformers import SentenceTransformer
214
  from sklearn.cluster import AgglomerativeClustering
215
  from sklearn.metrics import silhouette_score
216
+ # from bertopic import BERTopic
217
+ from collections import Counter
218
+
219
 
220
  # def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
221
  # def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
 
241
 
242
  # Determine the optimal number of clusters
243
  optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
244
+
245
+ # Perform clustering with the optimal number of clusters
246
+ clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
247
+ cluster_labels = clustering.fit_predict(embeddings)
248
+
249
+
250
+ # # BERTopic for topic modelling
251
+ # # topic_model = BERTopic(num_topics=optimal_n_clusters)
252
+ # # topics, _ = topic_model.fit_transform(df[text_column].tolist())
253
+ # topic_model = BERTopic()
254
+ # topics, _ = topic_model.fit_transform(df[text_column].tolist())
255
+ # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
256
+
257
+
258
+ # # Get representative words for each cluster
259
+ # cluster_representations = {}
260
+ # for i in range(optimal_n_clusters):
261
+ # # cluster_representations[i] = topic_model.get_topic_info(i)['words'][:top_words]
262
+ # cluster_representations[i] = topic_model.get_topic_info(i).get('words', [])[:top_words]
263
+
264
+
265
+
266
+ # Get representative words for each cluster (without BERTopic)
267
  cluster_representations = {}
268
+ for i in range(optimal_n_clusters):
269
+ # Use the most common words in each cluster as representative words
270
+ cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
271
+ cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
272
+
273
+
274
+
275
+
276
+
277
 
278
  # Map cluster labels to representative words
279
+ df["Problem_Cluster"] = cluster_labels
280
+ df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
281
+
282
+ # # Print clusters and their representative words
283
+ # for cluster_label, words in cluster_representations.items():
284
+ # print(f"Domain {cluster_label}: {', '.join(words)}")
285
+
286
+ # return df.assign(cluster=cluster_labels), optimal_n_clusters
287
 
288
+ # df[new_column_name] = clustering.fit_predict(embeddings)
289
  return df, optimal_n_clusters
290
 
291
 
292
 
293
 
294
+
295
+
296
+
297
+
298
+
299
+
300
+
301
+
302
+
303
+
304
 
305
 
306
  # Usage