SantanuBanerjee committed on
Commit
887a7c1
·
verified ·
1 Parent(s): 9d1c4e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -25
app.py CHANGED
@@ -261,6 +261,140 @@ def text_processing_for_domain(text):
261
  # Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
262
 
263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
  from sentence_transformers import SentenceTransformer
266
  from sklearn.cluster import AgglomerativeClustering, KMeans
@@ -268,18 +402,15 @@ from sklearn.feature_extraction.text import TfidfVectorizer
268
  from sklearn.metrics import silhouette_score
269
  from bertopic import BERTopic
270
  from collections import Counter
271
-
272
 
273
  def extract_problem_domains(df,
274
  text_column='Processed_ProblemDescription_forDomainExtraction',
275
- # text_column='Problem_Description',
276
  cluster_range=(5, 15),
277
  top_words=10,
278
- # method='sentence_transformers'
279
  method='tfidf_kmeans'
280
  ):
281
 
282
-
283
  console_messages.append("Extracting Problem Domains...")
284
 
285
  if method == 'sentence_transformers':
@@ -339,32 +470,19 @@ def extract_problem_domains(df,
339
  feature_names = vectorizer.get_feature_names_out()
340
  cluster_representations = {}
341
  for i in range(optimal_n_clusters):
342
- # center = kmeans.cluster_centers_[i]
343
-
344
- # # print(f"top_words: {top_words}, type: {type(top_words)}")
345
- # # print(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
346
-
347
- # console_messages.append(f"top_words: {top_words}, type: {type(top_words)}")
348
- # console_messages.append(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
349
-
350
- # # top_word_indices = center.argsort()[-top_words:][::-1]
351
- # top_word_indices = center.argsort()[-top_words:][::-1].tolist() # Indexes of top words
352
-
353
- # top_words = [feature_names[index] for index in top_word_indices]
354
- # cluster_representations[i] = top_words
355
-
356
  try:
357
  center = kmeans.cluster_centers_[i]
358
  console_messages.append(f"Processing cluster {i}")
359
  console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
360
 
361
- if not isinstance(center, np.ndarray):
362
  center = np.array(center)
363
 
364
  # Remove NaN values
365
- center = center[~np.isnan(center)]
 
366
 
367
- sorted_indices = np.array(center.argsort())
368
 
369
  top_word_indices = sorted_indices[-top_words:][::-1]
370
 
@@ -381,8 +499,6 @@ def extract_problem_domains(df,
381
  console_messages.append(f"Error processing cluster {i}: {str(e)}")
382
  console_messages.append(f"Center: {center}")
383
 
384
-
385
-
386
  console_messages.append(f"Number of clusters: {optimal_n_clusters}")
387
  console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
388
 
@@ -390,10 +506,9 @@ def extract_problem_domains(df,
390
  df["Problem_Cluster"] = cluster_labels
391
  df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
392
 
393
- # console_messages.append("Returning from Problem Domain Extraction function.")
394
  console_messages.append("Problem Domain Extraction completed.")
395
  return df, optimal_n_clusters
396
-
397
 
398
  # Usage
399
  # clustered_df, optimal_n_clusters = extract_problem_domains(processed_df)
 
261
  # Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
262
 
263
 
264
+ # From here Sanban
265
+ # from sentence_transformers import SentenceTransformer
266
+ # from sklearn.cluster import AgglomerativeClustering, KMeans
267
+ # from sklearn.feature_extraction.text import TfidfVectorizer
268
+ # from sklearn.metrics import silhouette_score
269
+ # from bertopic import BERTopic
270
+ # from collections import Counter
271
+
272
+
273
+ # def extract_problem_domains(df,
274
+ # text_column='Processed_ProblemDescription_forDomainExtraction',
275
+ # # text_column='Problem_Description',
276
+ # cluster_range=(5, 15),
277
+ # top_words=10,
278
+ # # method='sentence_transformers'
279
+ # method='tfidf_kmeans'
280
+ # ):
281
+
282
+
283
+ # console_messages.append("Extracting Problem Domains...")
284
+
285
+ # if method == 'sentence_transformers':
286
+ # # Sentence Transformers approach
287
+ # model = SentenceTransformer('all-mpnet-base-v2')
288
+ # embeddings = model.encode(df[text_column].tolist())
289
+
290
+ # # Perform hierarchical clustering with Silhouette Analysis
291
+ # silhouette_scores = []
292
+ # for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
293
+ # clustering = AgglomerativeClustering(n_clusters=n_clusters)
294
+ # cluster_labels = clustering.fit_predict(embeddings)
295
+ # silhouette_avg = silhouette_score(embeddings, cluster_labels)
296
+ # silhouette_scores.append(silhouette_avg)
297
+
298
+ # # Determine the optimal number of clusters
299
+ # optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
300
+
301
+ # # Perform clustering with the optimal number of clusters
302
+ # clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
303
+ # cluster_labels = clustering.fit_predict(embeddings)
304
+
305
+ # elif method == 'tfidf_kmeans':
306
+ # # TF-IDF Vectorization and K-Means approach
307
+ # vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
308
+ # X = vectorizer.fit_transform(df[text_column])
309
+
310
+ # # Perform K-Means clustering with Silhouette Analysis
311
+ # silhouette_scores = []
312
+ # for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
313
+ # kmeans = KMeans(n_clusters=n_clusters)#, random_state=42)
314
+ # cluster_labels = kmeans.fit_predict(X)
315
+ # silhouette_avg = silhouette_score(X, cluster_labels)
316
+ # silhouette_scores.append(silhouette_avg)
317
+
318
+ # # Determine the optimal number of clusters
319
+ # optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
320
+
321
+ # # Perform final clustering with optimal number of clusters
322
+ # kmeans = KMeans(n_clusters=optimal_n_clusters) #, random_state=42)
323
+ # cluster_labels = kmeans.fit_predict(X)
324
+
325
+ # # # BERTopic approach (commented out)
326
+ # console_messages.append("BERT is currently commented...")
327
+ # # topic_model = BERTopic()
328
+ # # topics, _ = topic_model.fit_transform(df[text_column].tolist())
329
+ # # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
330
+ # # cluster_labels = topics
331
+
332
+ # # Get representative words for each cluster
333
+ # if method == 'sentence_transformers':
334
+ # cluster_representations = {}
335
+ # for i in range(optimal_n_clusters):
336
+ # cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
337
+ # cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
338
+ # elif method == 'tfidf_kmeans':
339
+ # feature_names = vectorizer.get_feature_names_out()
340
+ # cluster_representations = {}
341
+ # for i in range(optimal_n_clusters):
342
+ # # center = kmeans.cluster_centers_[i]
343
+
344
+ # # # print(f"top_words: {top_words}, type: {type(top_words)}")
345
+ # # # print(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
346
+
347
+ # # console_messages.append(f"top_words: {top_words}, type: {type(top_words)}")
348
+ # # console_messages.append(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
349
+
350
+ # # # top_word_indices = center.argsort()[-top_words:][::-1]
351
+ # # top_word_indices = center.argsort()[-top_words:][::-1].tolist() # Indexes of top words
352
+
353
+ # # top_words = [feature_names[index] for index in top_word_indices]
354
+ # # cluster_representations[i] = top_words
355
+
356
+ # try:
357
+ # center = kmeans.cluster_centers_[i]
358
+ # console_messages.append(f"Processing cluster {i}")
359
+ # console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
360
+
361
+ # if not isinstance(center, np.ndarray):
362
+ # center = np.array(center)
363
+
364
+ # # Remove NaN values
365
+ # center = center[~np.isnan(center)]
366
+
367
+ # sorted_indices = np.array(center.argsort())
368
+
369
+ # top_word_indices = sorted_indices[-top_words:][::-1]
370
+
371
+ # # Check for valid indices
372
+ # if np.any(top_word_indices < 0) or np.any(top_word_indices >= len(feature_names)):
373
+ # console_messages.append(f"Invalid top word indices for cluster {i}")
374
+ # continue
375
+
376
+ # top_words = [feature_names[index] for index in top_word_indices]
377
+ # console_messages.append(f"Top words: {top_words}")
378
+ # cluster_representations[i] = top_words
379
+
380
+ # except Exception as e:
381
+ # console_messages.append(f"Error processing cluster {i}: {str(e)}")
382
+ # console_messages.append(f"Center: {center}")
383
+
384
+
385
+
386
+ # console_messages.append(f"Number of clusters: {optimal_n_clusters}")
387
+ # console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
388
+
389
+ # # Map cluster labels to representative words
390
+ # df["Problem_Cluster"] = cluster_labels
391
+ # df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
392
+
393
+ # # console_messages.append("Returning from Problem Domain Extraction function.")
394
+ # console_messages.append("Problem Domain Extraction completed.")
395
+ # return df, optimal_n_clusters
396
+ # Till here sanban
397
+
398
 
399
  from sentence_transformers import SentenceTransformer
400
  from sklearn.cluster import AgglomerativeClustering, KMeans
 
402
  from sklearn.metrics import silhouette_score
403
  from bertopic import BERTopic
404
  from collections import Counter
405
+ import numpy as np
406
 
407
  def extract_problem_domains(df,
408
  text_column='Processed_ProblemDescription_forDomainExtraction',
 
409
  cluster_range=(5, 15),
410
  top_words=10,
 
411
  method='tfidf_kmeans'
412
  ):
413
 
 
414
  console_messages.append("Extracting Problem Domains...")
415
 
416
  if method == 'sentence_transformers':
 
470
  feature_names = vectorizer.get_feature_names_out()
471
  cluster_representations = {}
472
  for i in range(optimal_n_clusters):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  try:
474
  center = kmeans.cluster_centers_[i]
475
  console_messages.append(f"Processing cluster {i}")
476
  console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
477
 
478
+ if isinstance(center, list):
479
  center = np.array(center)
480
 
481
  # Replace NaN values with 0 (np.nan_to_num) instead of dropping them, so positions in center are unchanged
482
+ if np.any(np.isnan(center)):
483
+ center = np.nan_to_num(center)
484
 
485
+ sorted_indices = np.argsort(center)
486
 
487
  top_word_indices = sorted_indices[-top_words:][::-1]
488
 
 
499
  console_messages.append(f"Error processing cluster {i}: {str(e)}")
500
  console_messages.append(f"Center: {center}")
501
 
 
 
502
  console_messages.append(f"Number of clusters: {optimal_n_clusters}")
503
  console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
504
 
 
506
  df["Problem_Cluster"] = cluster_labels
507
  df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
508
 
 
509
  console_messages.append("Problem Domain Extraction completed.")
510
  return df, optimal_n_clusters
511
+
512
 
513
  # Usage
514
  # clustered_df, optimal_n_clusters = extract_problem_domains(processed_df)