Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -261,6 +261,140 @@ def text_processing_for_domain(text):
|
|
261 |
# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
|
262 |
|
263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
264 |
|
265 |
from sentence_transformers import SentenceTransformer
|
266 |
from sklearn.cluster import AgglomerativeClustering, KMeans
|
@@ -268,18 +402,15 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
|
268 |
from sklearn.metrics import silhouette_score
|
269 |
from bertopic import BERTopic
|
270 |
from collections import Counter
|
271 |
-
|
272 |
|
273 |
def extract_problem_domains(df,
|
274 |
text_column='Processed_ProblemDescription_forDomainExtraction',
|
275 |
-
# text_column='Problem_Description',
|
276 |
cluster_range=(5, 15),
|
277 |
top_words=10,
|
278 |
-
# method='sentence_transformers'
|
279 |
method='tfidf_kmeans'
|
280 |
):
|
281 |
|
282 |
-
|
283 |
console_messages.append("Extracting Problem Domains...")
|
284 |
|
285 |
if method == 'sentence_transformers':
|
@@ -339,32 +470,19 @@ def extract_problem_domains(df,
|
|
339 |
feature_names = vectorizer.get_feature_names_out()
|
340 |
cluster_representations = {}
|
341 |
for i in range(optimal_n_clusters):
|
342 |
-
# center = kmeans.cluster_centers_[i]
|
343 |
-
|
344 |
-
# # print(f"top_words: {top_words}, type: {type(top_words)}")
|
345 |
-
# # print(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
|
346 |
-
|
347 |
-
# console_messages.append(f"top_words: {top_words}, type: {type(top_words)}")
|
348 |
-
# console_messages.append(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
|
349 |
-
|
350 |
-
# # top_word_indices = center.argsort()[-top_words:][::-1]
|
351 |
-
# top_word_indices = center.argsort()[-top_words:][::-1].tolist() # Indexes of top words
|
352 |
-
|
353 |
-
# top_words = [feature_names[index] for index in top_word_indices]
|
354 |
-
# cluster_representations[i] = top_words
|
355 |
-
|
356 |
try:
|
357 |
center = kmeans.cluster_centers_[i]
|
358 |
console_messages.append(f"Processing cluster {i}")
|
359 |
console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
|
360 |
|
361 |
-
if
|
362 |
center = np.array(center)
|
363 |
|
364 |
# Remove NaN values
|
365 |
-
|
|
|
366 |
|
367 |
-
sorted_indices = np.
|
368 |
|
369 |
top_word_indices = sorted_indices[-top_words:][::-1]
|
370 |
|
@@ -381,8 +499,6 @@ def extract_problem_domains(df,
|
|
381 |
console_messages.append(f"Error processing cluster {i}: {str(e)}")
|
382 |
console_messages.append(f"Center: {center}")
|
383 |
|
384 |
-
|
385 |
-
|
386 |
console_messages.append(f"Number of clusters: {optimal_n_clusters}")
|
387 |
console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
|
388 |
|
@@ -390,10 +506,9 @@ def extract_problem_domains(df,
|
|
390 |
df["Problem_Cluster"] = cluster_labels
|
391 |
df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
|
392 |
|
393 |
-
# console_messages.append("Returning from Problem Domain Extraction function.")
|
394 |
console_messages.append("Problem Domain Extraction completed.")
|
395 |
return df, optimal_n_clusters
|
396 |
-
|
397 |
|
398 |
# Usage
|
399 |
# clustered_df, optimal_n_clusters = extract_problem_domains(processed_df)
|
|
|
261 |
# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
|
262 |
|
263 |
|
264 |
+
# From here Sanban
|
265 |
+
# from sentence_transformers import SentenceTransformer
|
266 |
+
# from sklearn.cluster import AgglomerativeClustering, KMeans
|
267 |
+
# from sklearn.feature_extraction.text import TfidfVectorizer
|
268 |
+
# from sklearn.metrics import silhouette_score
|
269 |
+
# from bertopic import BERTopic
|
270 |
+
# from collections import Counter
|
271 |
+
|
272 |
+
|
273 |
+
# def extract_problem_domains(df,
|
274 |
+
# text_column='Processed_ProblemDescription_forDomainExtraction',
|
275 |
+
# # text_column='Problem_Description',
|
276 |
+
# cluster_range=(5, 15),
|
277 |
+
# top_words=10,
|
278 |
+
# # method='sentence_transformers'
|
279 |
+
# method='tfidf_kmeans'
|
280 |
+
# ):
|
281 |
+
|
282 |
+
|
283 |
+
# console_messages.append("Extracting Problem Domains...")
|
284 |
+
|
285 |
+
# if method == 'sentence_transformers':
|
286 |
+
# # Sentence Transformers approach
|
287 |
+
# model = SentenceTransformer('all-mpnet-base-v2')
|
288 |
+
# embeddings = model.encode(df[text_column].tolist())
|
289 |
+
|
290 |
+
# # Perform hierarchical clustering with Silhouette Analysis
|
291 |
+
# silhouette_scores = []
|
292 |
+
# for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
|
293 |
+
# clustering = AgglomerativeClustering(n_clusters=n_clusters)
|
294 |
+
# cluster_labels = clustering.fit_predict(embeddings)
|
295 |
+
# silhouette_avg = silhouette_score(embeddings, cluster_labels)
|
296 |
+
# silhouette_scores.append(silhouette_avg)
|
297 |
+
|
298 |
+
# # Determine the optimal number of clusters
|
299 |
+
# optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
|
300 |
+
|
301 |
+
# # Perform clustering with the optimal number of clusters
|
302 |
+
# clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
|
303 |
+
# cluster_labels = clustering.fit_predict(embeddings)
|
304 |
+
|
305 |
+
# elif method == 'tfidf_kmeans':
|
306 |
+
# # TF-IDF Vectorization and K-Means approach
|
307 |
+
# vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
|
308 |
+
# X = vectorizer.fit_transform(df[text_column])
|
309 |
+
|
310 |
+
# # Perform K-Means clustering with Silhouette Analysis
|
311 |
+
# silhouette_scores = []
|
312 |
+
# for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
|
313 |
+
# kmeans = KMeans(n_clusters=n_clusters)#, random_state=42)
|
314 |
+
# cluster_labels = kmeans.fit_predict(X)
|
315 |
+
# silhouette_avg = silhouette_score(X, cluster_labels)
|
316 |
+
# silhouette_scores.append(silhouette_avg)
|
317 |
+
|
318 |
+
# # Determine the optimal number of clusters
|
319 |
+
# optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
|
320 |
+
|
321 |
+
# # Perform final clustering with optimal number of clusters
|
322 |
+
# kmeans = KMeans(n_clusters=optimal_n_clusters) #, random_state=42)
|
323 |
+
# cluster_labels = kmeans.fit_predict(X)
|
324 |
+
|
325 |
+
# # # BERTopic approach (commented out)
|
326 |
+
# console_messages.append("BERT is currently commented...")
|
327 |
+
# # topic_model = BERTopic()
|
328 |
+
# # topics, _ = topic_model.fit_transform(df[text_column].tolist())
|
329 |
+
# # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
|
330 |
+
# # cluster_labels = topics
|
331 |
+
|
332 |
+
# # Get representative words for each cluster
|
333 |
+
# if method == 'sentence_transformers':
|
334 |
+
# cluster_representations = {}
|
335 |
+
# for i in range(optimal_n_clusters):
|
336 |
+
# cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
|
337 |
+
# cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
|
338 |
+
# elif method == 'tfidf_kmeans':
|
339 |
+
# feature_names = vectorizer.get_feature_names_out()
|
340 |
+
# cluster_representations = {}
|
341 |
+
# for i in range(optimal_n_clusters):
|
342 |
+
# # center = kmeans.cluster_centers_[i]
|
343 |
+
|
344 |
+
# # # print(f"top_words: {top_words}, type: {type(top_words)}")
|
345 |
+
# # # print(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
|
346 |
+
|
347 |
+
# # console_messages.append(f"top_words: {top_words}, type: {type(top_words)}")
|
348 |
+
# # console_messages.append(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
|
349 |
+
|
350 |
+
# # # top_word_indices = center.argsort()[-top_words:][::-1]
|
351 |
+
# # top_word_indices = center.argsort()[-top_words:][::-1].tolist() # Indexes of top words
|
352 |
+
|
353 |
+
# # top_words = [feature_names[index] for index in top_word_indices]
|
354 |
+
# # cluster_representations[i] = top_words
|
355 |
+
|
356 |
+
# try:
|
357 |
+
# center = kmeans.cluster_centers_[i]
|
358 |
+
# console_messages.append(f"Processing cluster {i}")
|
359 |
+
# console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
|
360 |
+
|
361 |
+
# if not isinstance(center, np.ndarray):
|
362 |
+
# center = np.array(center)
|
363 |
+
|
364 |
+
# # Remove NaN values
|
365 |
+
# center = center[~np.isnan(center)]
|
366 |
+
|
367 |
+
# sorted_indices = np.array(center.argsort())
|
368 |
+
|
369 |
+
# top_word_indices = sorted_indices[-top_words:][::-1]
|
370 |
+
|
371 |
+
# # Check for valid indices
|
372 |
+
# if np.any(top_word_indices < 0) or np.any(top_word_indices >= len(feature_names)):
|
373 |
+
# console_messages.append(f"Invalid top word indices for cluster {i}")
|
374 |
+
# continue
|
375 |
+
|
376 |
+
# top_words = [feature_names[index] for index in top_word_indices]
|
377 |
+
# console_messages.append(f"Top words: {top_words}")
|
378 |
+
# cluster_representations[i] = top_words
|
379 |
+
|
380 |
+
# except Exception as e:
|
381 |
+
# console_messages.append(f"Error processing cluster {i}: {str(e)}")
|
382 |
+
# console_messages.append(f"Center: {center}")
|
383 |
+
|
384 |
+
|
385 |
+
|
386 |
+
# console_messages.append(f"Number of clusters: {optimal_n_clusters}")
|
387 |
+
# console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
|
388 |
+
|
389 |
+
# # Map cluster labels to representative words
|
390 |
+
# df["Problem_Cluster"] = cluster_labels
|
391 |
+
# df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
|
392 |
+
|
393 |
+
# # console_messages.append("Returning from Problem Domain Extraction function.")
|
394 |
+
# console_messages.append("Problem Domain Extraction completed.")
|
395 |
+
# return df, optimal_n_clusters
|
396 |
+
# Till here Sanban
|
397 |
+
|
398 |
|
399 |
from sentence_transformers import SentenceTransformer
|
400 |
from sklearn.cluster import AgglomerativeClustering, KMeans
|
|
|
402 |
from sklearn.metrics import silhouette_score
|
403 |
from bertopic import BERTopic
|
404 |
from collections import Counter
|
405 |
+
import numpy as np
|
406 |
|
407 |
def extract_problem_domains(df,
|
408 |
text_column='Processed_ProblemDescription_forDomainExtraction',
|
|
|
409 |
cluster_range=(5, 15),
|
410 |
top_words=10,
|
|
|
411 |
method='tfidf_kmeans'
|
412 |
):
|
413 |
|
|
|
414 |
console_messages.append("Extracting Problem Domains...")
|
415 |
|
416 |
if method == 'sentence_transformers':
|
|
|
470 |
feature_names = vectorizer.get_feature_names_out()
|
471 |
cluster_representations = {}
|
472 |
for i in range(optimal_n_clusters):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
473 |
try:
|
474 |
center = kmeans.cluster_centers_[i]
|
475 |
console_messages.append(f"Processing cluster {i}")
|
476 |
console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
|
477 |
|
478 |
+
if isinstance(center, list):
|
479 |
center = np.array(center)
|
480 |
|
481 |
# Replace NaN values with 0 (instead of dropping them) so positions still line up with feature_names
|
482 |
+
if np.any(np.isnan(center)):
|
483 |
+
center = np.nan_to_num(center)
|
484 |
|
485 |
+
sorted_indices = np.argsort(center)
|
486 |
|
487 |
top_word_indices = sorted_indices[-top_words:][::-1]
|
488 |
|
|
|
499 |
console_messages.append(f"Error processing cluster {i}: {str(e)}")
|
500 |
console_messages.append(f"Center: {center}")
|
501 |
|
|
|
|
|
502 |
console_messages.append(f"Number of clusters: {optimal_n_clusters}")
|
503 |
console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
|
504 |
|
|
|
506 |
df["Problem_Cluster"] = cluster_labels
|
507 |
df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
|
508 |
|
|
|
509 |
console_messages.append("Problem Domain Extraction completed.")
|
510 |
return df, optimal_n_clusters
|
511 |
+
|
512 |
|
513 |
# Usage
|
514 |
# clustered_df, optimal_n_clusters = extract_problem_domains(processed_df)
|