Update app.py

app.py CHANGED
@@ -210,97 +210,94 @@ def text_processing_for_domain(text):
-# Cluster Visualization: Added a simple visualization to display the top words in each cluster.
-# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
-
-from sentence_transformers import SentenceTransformer
-from sklearn.cluster import AgglomerativeClustering
-from sklearn.metrics import silhouette_score
-from collections import Counter
-
-
-# def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
-# def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
-def extract_problem_domains(df,
-                            text_column='Problem_Description',
-                            cluster_range=(10, 50),
-                            top_words=17):
-    model = SentenceTransformer('all-mpnet-base-v2')
-    # model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
-    # Generate embeddings
-    embeddings = model.encode(df[text_column].tolist())
-
-    # Perform hierarchical clustering with Silhouette Analysis
-    silhouette_scores = []
-    for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
-        clustering = AgglomerativeClustering(n_clusters=n_clusters)
-        cluster_labels = clustering.fit_predict(embeddings)
-        silhouette_avg = silhouette_score(embeddings, cluster_labels)
-        silhouette_scores.append(silhouette_avg)
-
-    # Determine the optimal number of clusters
-    optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-
-    # Perform clustering with the optimal number of clusters
-    clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
-    cluster_labels = clustering.fit_predict(embeddings)
-
-    # topic_model = BERTopic()
-    # topics, _ = topic_model.fit_transform(df[text_column].tolist())
-    # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
-
-    # # Get representative words for each cluster
-    # cluster_representations = {}
-    # for i in range(optimal_n_clusters):
-    #     # cluster_representations[i] = topic_model.get_topic_info(i)['words'][:top_words]
-    #     cluster_representations[i] = topic_model.get_topic_info(i).get('words', [])[:top_words]
-
-    # Get representative words for each cluster (without BERTopic)
-    cluster_representations = {}
-    for i in range(optimal_n_clusters):
-        # Use the most common words in each cluster as representative words
-        cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
-        cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
-
-    # Map cluster labels to representative words
-    df["Problem_Cluster"] = cluster_labels
-    df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
-
-    # # Print clusters and their representative words
-    # for cluster_label, words in cluster_representations.items():
-    #     print(f"Domain {cluster_label}: {', '.join(words)}")
-
-    # return df.assign(cluster=cluster_labels), optimal_n_clusters
-
-    # df[new_column_name] = clustering.fit_predict(embeddings)
-    return df, optimal_n_clusters
-
-# Usage
+# Cluster Visualization: Added a simple visualization to display the top words in each cluster.
+# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
+
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import AgglomerativeClustering, KMeans
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import silhouette_score
+from bertopic import BERTopic
+from collections import Counter
+import numpy as np
+
+
+def extract_problem_domains(df,
+                            text_column='Problem_Description',
+                            cluster_range=(10, 50),
+                            top_words=17,
+                            # method='sentence_transformers'
+                            method='tfidf_kmeans'
+                            ):
+    if method == 'sentence_transformers':
+        # Sentence Transformers approach
+        model = SentenceTransformer('all-mpnet-base-v2')
+        embeddings = model.encode(df[text_column].tolist())
+
+        # Perform hierarchical clustering with Silhouette Analysis
+        silhouette_scores = []
+        for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+            clustering = AgglomerativeClustering(n_clusters=n_clusters)
+            cluster_labels = clustering.fit_predict(embeddings)
+            silhouette_avg = silhouette_score(embeddings, cluster_labels)
+            silhouette_scores.append(silhouette_avg)
+
+        # Determine the optimal number of clusters
+        optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+
+        # Perform clustering with the optimal number of clusters
+        clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
+        cluster_labels = clustering.fit_predict(embeddings)
+
+    elif method == 'tfidf_kmeans':
+        # TF-IDF Vectorization and K-Means approach
+        vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
+        X = vectorizer.fit_transform(df[text_column])
+
+        # Perform K-Means clustering with Silhouette Analysis
+        silhouette_scores = []
+        for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+            cluster_labels = kmeans.fit_predict(X)
+            silhouette_avg = silhouette_score(X, cluster_labels)
+            silhouette_scores.append(silhouette_avg)
+
+        # Determine the optimal number of clusters
+        optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+
+        # Perform final clustering with optimal number of clusters
+        kmeans = KMeans(n_clusters=optimal_n_clusters, random_state=42)
+        cluster_labels = kmeans.fit_predict(X)
+
+    # # BERTopic approach (commented out)
+    # topic_model = BERTopic()
+    # topics, _ = topic_model.fit_transform(df[text_column].tolist())
+    # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
+    # cluster_labels = topics
+
+    # Get representative words for each cluster
+    if method == 'sentence_transformers':
+        cluster_representations = {}
+        for i in range(optimal_n_clusters):
+            cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
+            cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
+    elif method == 'tfidf_kmeans':
+        feature_names = vectorizer.get_feature_names_out()
+        cluster_representations = {}
+        for i in range(optimal_n_clusters):
+            center = kmeans.cluster_centers_[i]
+            top_word_indices = center.argsort()[-top_words:][::-1]
+            # Build the keyword list without rebinding the top_words parameter,
+            # which would break the slice above on later loop iterations
+            cluster_representations[i] = [feature_names[index] for index in top_word_indices]
+
+    # Map cluster labels to representative words
+    df["Problem_Cluster"] = cluster_labels
+    df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
+
+    return df, optimal_n_clusters
+
+# Usage
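
A minimal usage sketch, not part of the commit: the hunk cuts off at the `# Usage` marker, so the real call site is not shown. The DataFrame below is purely illustrative, `extract_problem_domains` is assumed to be in scope from app.py, and `cluster_range` is narrowed because silhouette analysis requires fewer clusters than samples.

import pandas as pd

# Illustrative sample only; the real df is built elsewhere in app.py
df = pd.DataFrame({
    'Problem_Description': [
        "Water pump fails after two weeks of use",
        "Pump motor overheats under continuous load",
        "Solar panel output drops sharply in winter",
        "Inverter faults cut solar output at midday",
        "Crop yields decline due to soil erosion",
        "Eroded topsoil washes away during heavy rain",
    ]
})

# cluster_range narrowed for the tiny sample; the defaults assume a larger dataset
df, n_clusters = extract_problem_domains(df, cluster_range=(2, 4), top_words=5)
print(f"Optimal clusters: {n_clusters}")
print(df[['Problem_Cluster', 'Problem_Category_Words']])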
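For reference, the keyword step this commit adds can be exercised in isolation: the highest-weighted TF-IDF dimensions of each K-Means centroid serve as that cluster's representative words. A standalone sketch with a toy corpus (all data and parameter values here are illustrative):

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "pump broken water pump leaks",
    "pump motor fails water pressure low",
    "solar panel cracked solar output low",
    "solar inverter fault panel output drops",
]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(X)

feature_names = vectorizer.get_feature_names_out()
for i, center in enumerate(kmeans.cluster_centers_):
    # argsort is ascending: take the last k indices, reversed, for the top-weighted terms
    top_indices = center.argsort()[-3:][::-1]
    print(f"Cluster {i}: {[feature_names[j] for j in top_indices]}")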