SantanuBanerjee committed on
Commit b4b6a14 · verified · 1 Parent(s): 887a7c1

Update app.py

Files changed (1)
  1. app.py +24 -299
app.py CHANGED
@@ -104,17 +104,6 @@ except OSError:
 tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
 model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")
 
-# def combined_text_processing(text):
-#     # Basic NLP processing using SpaCy
-#     doc = nlp(text)
-#     lemmatized_text = ' '.join([token.lemma_ for token in doc])
-
-#     # Advanced text representation using Hugging Face Transformers
-#     inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
-#     with torch.no_grad():
-#         outputs = model(**inputs)
-
-#     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
 
 
 
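The helper removed in this hunk built a sentence embedding by mean-pooling the transformer's token states. A minimal runnable sketch of that idea, with two hedged adjustments that are not in the original: truncation=True keeps long inputs under the model's 512-token limit (the removed code disabled truncation), and the pooling is attention-mask-aware so padding tokens do not dilute the average.

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")

def embed(text: str):
    # Tokenize; truncation=True is an adjustment over the removed helper
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mask-aware mean pooling over the token embeddings
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    return (summed / mask.sum(dim=1)).squeeze(0).numpy()

print(embed("Access to clean water in rural districts").shape)  # (768,)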
@@ -130,29 +119,6 @@ nltk.download('averaged_perceptron_tagger')
 
 
 
-# def combined_text_processing(text):
-#     # Remove punctuation, numbers, URLs, and special characters
-#     text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
-#     text = re.sub(r'\d+', '', text)  # Remove numbers
-#     text = re.sub(r'http\S+', '', text)  # Remove URLs
-
-#     # Tokenize and remove stopwords
-#     tokens = word_tokenize(text.lower())  # Convert to lowercase
-#     stop_words = set(stopwords.words('english'))
-#     tokens = [word for word in tokens if word not in stop_words]
-
-#     # Lemmatize tokens using SpaCy
-#     doc = nlp(' '.join(tokens))
-#     lemmatized_text = ' '.join([token.lemma_ for token in doc])
-
-#     # Apply Hugging Face Transformers
-#     inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
-#     with torch.no_grad():
-#         outputs = model(**inputs)
-
-#     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
-
-
 import numpy as np
 import sentencepiece as sp
 from transformers import pipeline
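The second removed helper was a classic cleaning pipeline: regex stripping, NLTK tokenization and stopword removal, then spaCy lemmatization. A self-contained sketch of the same steps; en_core_web_sm stands in for whatever model app.py loads as nlp, and URLs are stripped before punctuation so the http pattern sees them intact.

import re

import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
nlp = spacy.load("en_core_web_sm")  # assumption: any English spaCy model works here

def clean_text(text: str) -> str:
    text = re.sub(r"http\S+", "", text)   # URLs first, while they are still intact
    text = re.sub(r"[^\w\s]", "", text)   # punctuation and special characters
    text = re.sub(r"\d+", "", text)       # numbers
    tokens = word_tokenize(text.lower())  # lowercase, then tokenize
    stop_words = set(stopwords.words("english"))
    tokens = [w for w in tokens if w not in stop_words]
    doc = nlp(" ".join(tokens))           # lemmatize with spaCy
    return " ".join(tok.lemma_ for tok in doc)

print(clean_text("Flooding affects 12,000 homes; see http://example.org"))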
@@ -223,297 +189,56 @@ def text_processing_for_domain(text):
 
 
 
-
-
-
-
-# # 2. Clustering from ChatGPT
-# # Libraries: scikit-learn, sentence-transformers
-# # Use sentence embeddings and clustering algorithms to group similar project proposals.
-# from bertopic import BERTopic
-# def perform_clustering(texts, n_clusters):
-#     topic_model = BERTopic(n_topics=n_clusters)
-#     topics, _ = topic_model.fit_transform(texts)
-#     return topics, topic_model
-# # Clustering function call
-# clustered_df, cluster_centers = clustering(processed_df)
-# Method 1: Sentence Transformers + KMeans
-
-# # 2. Clustering: from Claude
-# # Use BERTopic for advanced topic modeling and clustering.
-# from bertopic import BERTopic
-# def perform_clustering(texts, n_clusters):
-#     topic_model = BERTopic(n_topics=n_clusters)
-#     topics, _ = topic_model.fit_transform(texts)
-#     return topics, topic_model
-# # Clustering function call
-# problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
-# location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)
-# After this Method 2: BERTopic function, the following need to be done:
-# processed_df['Problem_Cluster'] = problem_clusters
-
-
-
-# 2. Meta AI Function: Sentence Transformers + Hierarchical Clustering + Silhouette Analysis
-# Now this also includes:
-# Topic Modeling using BERTopic: Integrated BERTopic to extract representative words for each cluster.
-# Cluster Visualization: Added a simple visualization to display the top words in each cluster.
-# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
-
-
-# From here Sanban
-# from sentence_transformers import SentenceTransformer
-# from sklearn.cluster import AgglomerativeClustering, KMeans
-# from sklearn.feature_extraction.text import TfidfVectorizer
-# from sklearn.metrics import silhouette_score
-# from bertopic import BERTopic
-# from collections import Counter
-
-
-# def extract_problem_domains(df,
-#                             text_column='Processed_ProblemDescription_forDomainExtraction',
-#                             # text_column='Problem_Description',
-#                             cluster_range=(5, 15),
-#                             top_words=10,
-#                             # method='sentence_transformers'
-#                             method='tfidf_kmeans'
-#                             ):
-
-
-#     console_messages.append("Extracting Problem Domains...")
-
-#     if method == 'sentence_transformers':
-#         # Sentence Transformers approach
-#         model = SentenceTransformer('all-mpnet-base-v2')
-#         embeddings = model.encode(df[text_column].tolist())
-
-#         # Perform hierarchical clustering with Silhouette Analysis
-#         silhouette_scores = []
-#         for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
-#             clustering = AgglomerativeClustering(n_clusters=n_clusters)
-#             cluster_labels = clustering.fit_predict(embeddings)
-#             silhouette_avg = silhouette_score(embeddings, cluster_labels)
-#             silhouette_scores.append(silhouette_avg)
-
-#         # Determine the optimal number of clusters
-#         optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-
-#         # Perform clustering with the optimal number of clusters
-#         clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
-#         cluster_labels = clustering.fit_predict(embeddings)
-
-#     elif method == 'tfidf_kmeans':
-#         # TF-IDF Vectorization and K-Means approach
-#         vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
-#         X = vectorizer.fit_transform(df[text_column])
-
-#         # Perform K-Means clustering with Silhouette Analysis
-#         silhouette_scores = []
-#         for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
-#             kmeans = KMeans(n_clusters=n_clusters)  # , random_state=42)
-#             cluster_labels = kmeans.fit_predict(X)
-#             silhouette_avg = silhouette_score(X, cluster_labels)
-#             silhouette_scores.append(silhouette_avg)
-
-#         # Determine the optimal number of clusters
-#         optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-
-#         # Perform final clustering with optimal number of clusters
-#         kmeans = KMeans(n_clusters=optimal_n_clusters)  # , random_state=42)
-#         cluster_labels = kmeans.fit_predict(X)
-
-#     # # BERTopic approach (commented out)
-#     console_messages.append("BERT is currently commented...")
-#     # topic_model = BERTopic()
-#     # topics, _ = topic_model.fit_transform(df[text_column].tolist())
-#     # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
-#     # cluster_labels = topics
-
-#     # Get representative words for each cluster
-#     if method == 'sentence_transformers':
-#         cluster_representations = {}
-#         for i in range(optimal_n_clusters):
-#             cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
-#             cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
-#     elif method == 'tfidf_kmeans':
-#         feature_names = vectorizer.get_feature_names_out()
-#         cluster_representations = {}
-#         for i in range(optimal_n_clusters):
-#             # center = kmeans.cluster_centers_[i]
-
-#             # # print(f"top_words: {top_words}, type: {type(top_words)}")
-#             # # print(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
-
-#             # console_messages.append(f"top_words: {top_words}, type: {type(top_words)}")
-#             # console_messages.append(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
-
-#             # # top_word_indices = center.argsort()[-top_words:][::-1]
-#             # top_word_indices = center.argsort()[-top_words:][::-1].tolist()  # Indexes of top words
-
-#             # top_words = [feature_names[index] for index in top_word_indices]
-#             # cluster_representations[i] = top_words
-
-#             try:
-#                 center = kmeans.cluster_centers_[i]
-#                 console_messages.append(f"Processing cluster {i}")
-#                 console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
-
-#                 if not isinstance(center, np.ndarray):
-#                     center = np.array(center)
-
-#                 # Remove NaN values
-#                 center = center[~np.isnan(center)]
-
-#                 sorted_indices = np.array(center.argsort())
-
-#                 top_word_indices = sorted_indices[-top_words:][::-1]
-
-#                 # Check for valid indices
-#                 if np.any(top_word_indices < 0) or np.any(top_word_indices >= len(feature_names)):
-#                     console_messages.append(f"Invalid top word indices for cluster {i}")
-#                     continue
-
-#                 top_words = [feature_names[index] for index in top_word_indices]
-#                 console_messages.append(f"Top words: {top_words}")
-#                 cluster_representations[i] = top_words
-
-#             except Exception as e:
-#                 console_messages.append(f"Error processing cluster {i}: {str(e)}")
-#                 console_messages.append(f"Center: {center}")
-
-
-
-#     console_messages.append(f"Number of clusters: {optimal_n_clusters}")
-#     console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
-
-#     # Map cluster labels to representative words
-#     df["Problem_Cluster"] = cluster_labels
-#     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
-
-#     # console_messages.append("Returning from Problem Domain Extraction function.")
-#     console_messages.append("Problem Domain Extraction completed.")
-#     return df, optimal_n_clusters
-# Till here sanban
-
-
 from sentence_transformers import SentenceTransformer
 from sklearn.cluster import AgglomerativeClustering, KMeans
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics import silhouette_score
 from bertopic import BERTopic
 from collections import Counter
-import numpy as np
+
 
 def extract_problem_domains(df,
                             text_column='Processed_ProblemDescription_forDomainExtraction',
                             cluster_range=(5, 15),
                             top_words=10,
-                            method='tfidf_kmeans'
+                            method='sentence_transformers'
                             ):
-
     console_messages.append("Extracting Problem Domains...")
-
-    if method == 'sentence_transformers':
-        # Sentence Transformers approach
-        model = SentenceTransformer('all-mpnet-base-v2')
-        embeddings = model.encode(df[text_column].tolist())
-
-        # Perform hierarchical clustering with Silhouette Analysis
-        silhouette_scores = []
-        for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
-            clustering = AgglomerativeClustering(n_clusters=n_clusters)
-            cluster_labels = clustering.fit_predict(embeddings)
-            silhouette_avg = silhouette_score(embeddings, cluster_labels)
-            silhouette_scores.append(silhouette_avg)
-
-        # Determine the optimal number of clusters
-        optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-
-        # Perform clustering with the optimal number of clusters
-        clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
+
+    # Sentence Transformers approach
+    model = SentenceTransformer('all-mpnet-base-v2')
+    embeddings = model.encode(df[text_column].tolist())
+
+    # Perform hierarchical clustering with Silhouette Analysis
+    silhouette_scores = []
+    for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+        clustering = AgglomerativeClustering(n_clusters=n_clusters)
         cluster_labels = clustering.fit_predict(embeddings)
-
-    elif method == 'tfidf_kmeans':
-        # TF-IDF Vectorization and K-Means approach
-        vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
-        X = vectorizer.fit_transform(df[text_column])
-
-        # Perform K-Means clustering with Silhouette Analysis
-        silhouette_scores = []
-        for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
-            kmeans = KMeans(n_clusters=n_clusters)  # , random_state=42)
-            cluster_labels = kmeans.fit_predict(X)
-            silhouette_avg = silhouette_score(X, cluster_labels)
-            silhouette_scores.append(silhouette_avg)
-
-        # Determine the optimal number of clusters
-        optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-
-        # Perform final clustering with optimal number of clusters
-        kmeans = KMeans(n_clusters=optimal_n_clusters)  # , random_state=42)
-        cluster_labels = kmeans.fit_predict(X)
+        silhouette_avg = silhouette_score(embeddings, cluster_labels)
+        silhouette_scores.append(silhouette_avg)
+
+    # Determine the optimal number of clusters
+    optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
 
-    # # BERTopic approach (commented out)
-    console_messages.append("BERT is currently commented...")
-    # topic_model = BERTopic()
-    # topics, _ = topic_model.fit_transform(df[text_column].tolist())
-    # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
-    # cluster_labels = topics
+    # Perform clustering with the optimal number of clusters
+    clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
+    cluster_labels = clustering.fit_predict(embeddings)
 
     # Get representative words for each cluster
-    if method == 'sentence_transformers':
-        cluster_representations = {}
-        for i in range(optimal_n_clusters):
-            cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
-            cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
-    elif method == 'tfidf_kmeans':
-        feature_names = vectorizer.get_feature_names_out()
-        cluster_representations = {}
-        for i in range(optimal_n_clusters):
-            try:
-                center = kmeans.cluster_centers_[i]
-                console_messages.append(f"Processing cluster {i}")
-                console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
-
-                if isinstance(center, list):
-                    center = np.array(center)
-
-                # Remove NaN values
-                if np.any(np.isnan(center)):
-                    center = np.nan_to_num(center)
-
-                sorted_indices = np.argsort(center)
-
-                top_word_indices = sorted_indices[-top_words:][::-1]
-
-                # Check for valid indices
-                if np.any(top_word_indices < 0) or np.any(top_word_indices >= len(feature_names)):
-                    console_messages.append(f"Invalid top word indices for cluster {i}")
-                    continue
-
-                top_words = [feature_names[index] for index in top_word_indices]
-                console_messages.append(f"Top words: {top_words}")
-                cluster_representations[i] = top_words
-
-            except Exception as e:
-                console_messages.append(f"Error processing cluster {i}: {str(e)}")
-                console_messages.append(f"Center: {center}")
-
-    console_messages.append(f"Number of clusters: {optimal_n_clusters}")
-    console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
+    cluster_representations = {}
+    for i in range(optimal_n_clusters):
+        cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
+        cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
 
     # Map cluster labels to representative words
     df["Problem_Cluster"] = cluster_labels
     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
 
+    # console_messages.append("Returning from Problem Domain Extraction function.")
     console_messages.append("Problem Domain Extraction completed.")
     return df, optimal_n_clusters
 
 
-# Usage
-# clustered_df, optimal_n_clusters = optimal_Problem_clustering(processed_df)
-# print(f'Optimal number of clusters: {optimal_n_clusters}')
-
 
 
 
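What survives after this commit is the sentence-transformers path: embed each processed description with all-mpnet-base-v2, sweep the cluster count over cluster_range, keep the count with the best silhouette score, and label each cluster by its most frequent tokens. A runnable end-to-end sketch on a toy corpus; the six texts and the 2-4 sweep are illustrative, app.py sweeps (5, 15) over a dataframe column.

from collections import Counter

from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

texts = [
    "well water contamination in villages",
    "arsenic in rural drinking water",
    "teacher shortage in primary schools",
    "school dropout rates among girls",
    "potholes on the highway bypass",
    "bridge repair delayed for years",
]

model = SentenceTransformer("all-mpnet-base-v2")
embeddings = model.encode(texts)

# Silhouette analysis: higher means tighter, better-separated clusters
scores = {}
for k in range(2, 5):
    labels = AgglomerativeClustering(n_clusters=k).fit_predict(embeddings)
    scores[k] = silhouette_score(embeddings, labels)
best_k = max(scores, key=scores.get)

# Re-cluster at the winning count, as the updated function does
labels = AgglomerativeClustering(n_clusters=best_k).fit_predict(embeddings)

# Representative words per cluster: most frequent tokens, as in the new code
for c in range(best_k):
    words = " ".join(t for t, l in zip(texts, labels) if l == c).split()
    print(c, [w for w, _ in Counter(words).most_common(5)])

The silhouette sweep is a cheap stand-in for choosing the cluster count by hand, which is why the function can return optimal_n_clusters alongside the labeled dataframe.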
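For comparison, the tfidf_kmeans path this commit deletes, reduced to its working core: each cluster's representative words are the vocabulary entries with the highest weights in that cluster's KMeans centroid. The deleted version appears to have needed its try/except because it re-bound its top_words parameter to a word list inside the loop, so later clusters sliced the centroid with a list instead of an int. A minimal sketch under that reading; texts are illustrative.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

texts = [
    "well water contamination in villages",
    "arsenic in rural drinking water",
    "teacher shortage in primary schools",
    "school dropout rates among girls",
]

vectorizer = TfidfVectorizer(stop_words="english", max_features=3000)
X = vectorizer.fit_transform(texts)

kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)

feature_names = vectorizer.get_feature_names_out()
for i, center in enumerate(kmeans.cluster_centers_):
    # Highest-weighted vocabulary indices for this centroid, descending;
    # stored in a fresh name so the loop never shadows its own parameters
    top_idx = np.argsort(center)[-3:][::-1]
    print(i, [feature_names[j] for j in top_idx])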