Spaces:

SantanuBanerjee
/

TaxDirection

Sleeping

App Files Community

SantanuBanerjee committed on Aug 4, 2024

Commit

ba1e210

verified ·

1 Parent(s): cf3adb9

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -9

app.py CHANGED Viewed

@@ -274,7 +274,7 @@ def extract_problem_domains(df,
         optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
         # Perform final clustering with optimal number of clusters
-        kmeans = KMeans(n_clusters=optimal_n_clusters, random_state=42)
         cluster_labels = kmeans.fit_predict(X)
     # # BERTopic approach (commented out)
@@ -312,14 +312,14 @@ def extract_problem_domains(df,
             try:
                 center = kmeans.cluster_centers_[i]
-                console_messages.append(f"Processing cluster {i}")
-                console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
                 top_word_indices = center.argsort()[-top_words:][::-1].tolist()
-                console_messages.append(f"Top word indices: {top_word_indices}")
                 top_words = [feature_names[index] for index in top_word_indices]
-                console_messages.append(f"Top words: {top_words}")
                 cluster_representations[i] = top_words
             except Exception as e:
@@ -327,15 +327,15 @@ def extract_problem_domains(df,
                 console_messages.append(f"Center: {center}")
     # Map cluster labels to representative words
     df["Problem_Cluster"] = cluster_labels
     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
-    console_messages.append("Returning from Problem Domain Extraction function.")
     return df, optimal_n_clusters

         optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
         # Perform final clustering with optimal number of clusters
+        kmeans = KMeans(n_clusters=optimal_n_clusters) #, random_state=42)
         cluster_labels = kmeans.fit_predict(X)
     # # BERTopic approach (commented out)
             try:
                 center = kmeans.cluster_centers_[i]
+                # console_messages.append(f"Processing cluster {i}")
+                # console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
                 top_word_indices = center.argsort()[-top_words:][::-1].tolist()
+                # console_messages.append(f"Top word indices: {top_word_indices}")
                 top_words = [feature_names[index] for index in top_word_indices]
+                # console_messages.append(f"Top words: {top_words}")
                 cluster_representations[i] = top_words
             except Exception as e:
                 console_messages.append(f"Center: {center}")
+        console_messages.append(f"Number of clusters: {optimal_n_clusters}")
+        console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
     # Map cluster labels to representative words
     df["Problem_Cluster"] = cluster_labels
     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
+    # console_messages.append("Returning from Problem Domain Extraction function.")
+    console_messages.append("Problem Domain Extraction completed.")
     return df, optimal_n_clusters