SantanuBanerjee committed on
Commit
ee949ff
·
verified ·
1 Parent(s): 46a0ef6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -26
app.py CHANGED
@@ -216,7 +216,11 @@ from sklearn.metrics import silhouette_score
216
  from bertopic import BERTopic
217
 
218
  # def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
219
- def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
 
 
 
 
220
 
221
  # Select Model (can we also optimize model selection automatically?)
222
  # model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -235,38 +239,27 @@ def extract_problem_domains(df, text_column='Problem_Description', cluster_range
235
 
236
  # Determine the optimal number of clusters
237
  optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
238
-
239
- # Perform clustering with the optimal number of clusters
240
- clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
241
- cluster_labels = clustering.fit_predict(embeddings)
242
-
243
-
244
- # BERTopic for topic modelling
245
- # topic_model = BERTopic(num_topics=optimal_n_clusters)
246
- # topics, _ = topic_model.fit_transform(df[text_column].tolist())
247
- topic_model = BERTopic()
248
- topics, _ = topic_model.fit_transform(df[text_column].tolist())
249
- topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
250
-
251
-
252
  # Get representative words for each cluster
253
  cluster_representations = {}
254
- for i in range(optimal_n_clusters):
255
- # cluster_representations[i] = topic_model.get_topic_info(i)['words'][:top_words]
256
- cluster_representations[i] = topic_model.get_topic_info(i).get('words', [])[:top_words]
257
 
258
  # Map cluster labels to representative words
259
- df["Problem_Cluster"] = cluster_labels
260
- df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
261
 
262
- # # Print clusters and their representative words
263
- # for cluster_label, words in cluster_representations.items():
264
- # print(f"Domain {cluster_label}: {', '.join(words)}")
265
 
266
- # return df.assign(cluster=cluster_labels), optimal_n_clusters
267
 
268
- # df[new_column_name] = clustering.fit_predict(embeddings)
269
- return df, optimal_n_clusters
 
270
 
271
  # Usage
272
  # clustered_df, optimal_n_clusters = optimal_Problem_clustering(processed_df)
 
216
  from bertopic import BERTopic
217
 
218
  # def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
219
+ # def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
220
+ def extract_problem_domains(df,
221
+ text_column='Problem_Description',
222
+ cluster_range=(10, 50),
223
+ top_words=17):
224
 
225
  # Select Model (can we also optimize model selection automatically?)
226
  # model = SentenceTransformer('all-MiniLM-L6-v2')
 
239
 
240
  # Determine the optimal number of clusters
241
  optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
242
+
243
+ # BERTopic for topic modeling
244
+ topic_model = BERTopic(nr_topics=optimal_n_clusters)
245
+ topics, probs = topic_model.fit_transform(df[text_column].tolist())
246
+
 
 
 
 
 
 
 
 
 
247
  # Get representative words for each cluster
248
  cluster_representations = {}
249
+ for topic in range(len(topic_model.get_topic_info())):
250
+ words, _ = zip(*topic_model.get_topic(topic))
251
+ cluster_representations[topic] = list(words)[:top_words]
252
 
253
  # Map cluster labels to representative words
254
+ df["Problem_Cluster"] = topics
255
+ df['Problem_Category_Words'] = [cluster_representations.get(label, []) for label in topics]
256
 
257
+ return df, optimal_n_clusters
 
 
258
 
 
259
 
260
+
261
+
262
+
263
 
264
  # Usage
265
  # clustered_df, optimal_n_clusters = optimal_Problem_clustering(processed_df)