Update app.py

app.py CHANGED
@@ -210,97 +210,94 @@ def text_processing_for_domain(text):
-# Cluster Visualization: Added a simple visualization to display the top words in each cluster.
-# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
-
-from sentence_transformers import SentenceTransformer
-from sklearn.cluster import AgglomerativeClustering
-from sklearn.metrics import silhouette_score
-from collections import Counter
-
-
-# def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
-# def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
-def extract_problem_domains(df,
-                            text_column='Problem_Description',
-                            cluster_range=(10, 50),
-                            top_words=17):
-    model = SentenceTransformer('all-mpnet-base-v2')
-    # model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
-    # Generate embeddings
-    embeddings = model.encode(df[text_column].tolist())
-
-    # Perform hierarchical clustering with Silhouette Analysis
-    silhouette_scores = []
-    for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
-        clustering = AgglomerativeClustering(n_clusters=n_clusters)
-        cluster_labels = clustering.fit_predict(embeddings)
-        silhouette_avg = silhouette_score(embeddings, cluster_labels)
-        silhouette_scores.append(silhouette_avg)
-
-    # Determine the optimal number of clusters
-    optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-
-    # Perform clustering with the optimal number of clusters
-    clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
-    cluster_labels = clustering.fit_predict(embeddings)
-
-    # topic_model = BERTopic()
-    # topics, _ = topic_model.fit_transform(df[text_column].tolist())
-    # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
-
-    # # Get representative words for each cluster
-    # cluster_representations = {}
-    # for i in range(optimal_n_clusters):
-    #     # cluster_representations[i] = topic_model.get_topic_info(i)['words'][:top_words]
-    #     cluster_representations[i] = topic_model.get_topic_info(i).get('words', [])[:top_words]
-
-    # Get representative words for each cluster (without BERTopic)
-    cluster_representations = {}
-    for i in range(optimal_n_clusters):
-        # Use the most common words in each cluster as representative words
-        cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
-        cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
-
-    # Map cluster labels to representative words
-    df["Problem_Cluster"] = cluster_labels
-    df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
-
-    # # Print clusters and their representative words
-    # for cluster_label, words in cluster_representations.items():
-    #     print(f"Domain {cluster_label}: {', '.join(words)}")
-
-    # return df.assign(cluster=cluster_labels), optimal_n_clusters
-
-    # df[new_column_name] = clustering.fit_predict(embeddings)
-    return df, optimal_n_clusters
-
-# Usage
+# Cluster Visualization: Added a simple visualization to display the top words in each cluster.
+# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
+
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import AgglomerativeClustering, KMeans
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import silhouette_score
+from bertopic import BERTopic
+from collections import Counter
+import numpy as np
+
+
+def extract_problem_domains(df,
+                            text_column='Problem_Description',
+                            cluster_range=(10, 50),
+                            top_words=17,
+                            # method='sentence_transformers'
+                            method='tfidf_kmeans'
+                            ):
+    if method == 'sentence_transformers':
+        # Sentence Transformers approach
+        model = SentenceTransformer('all-mpnet-base-v2')
+        embeddings = model.encode(df[text_column].tolist())
+
+        # Perform hierarchical clustering with Silhouette Analysis
+        silhouette_scores = []
+        for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+            clustering = AgglomerativeClustering(n_clusters=n_clusters)
+            cluster_labels = clustering.fit_predict(embeddings)
+            silhouette_avg = silhouette_score(embeddings, cluster_labels)
+            silhouette_scores.append(silhouette_avg)
+
+        # Determine the optimal number of clusters
+        optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+
+        # Perform clustering with the optimal number of clusters
+        clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
+        cluster_labels = clustering.fit_predict(embeddings)
+
+    elif method == 'tfidf_kmeans':
+        # TF-IDF Vectorization and K-Means approach
+        vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
+        X = vectorizer.fit_transform(df[text_column])
+
+        # Perform K-Means clustering with Silhouette Analysis
+        silhouette_scores = []
+        for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+            cluster_labels = kmeans.fit_predict(X)
+            silhouette_avg = silhouette_score(X, cluster_labels)
+            silhouette_scores.append(silhouette_avg)
+
+        # Determine the optimal number of clusters
+        optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+
+        # Perform final clustering with optimal number of clusters
+        kmeans = KMeans(n_clusters=optimal_n_clusters, random_state=42)
+        cluster_labels = kmeans.fit_predict(X)
+
+    # # BERTopic approach (commented out)
+    # topic_model = BERTopic()
+    # topics, _ = topic_model.fit_transform(df[text_column].tolist())
+    # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
+    # cluster_labels = topics
+
+    # Get representative words for each cluster
+    if method == 'sentence_transformers':
+        cluster_representations = {}
+        for i in range(optimal_n_clusters):
+            cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
+            cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
+    elif method == 'tfidf_kmeans':
+        feature_names = vectorizer.get_feature_names_out()
+        cluster_representations = {}
+        for i in range(optimal_n_clusters):
+            center = kmeans.cluster_centers_[i]
+            top_word_indices = center.argsort()[-top_words:][::-1]
+            # Build the keyword list without rebinding the top_words parameter,
+            # which would break the slice above on later loop iterations
+            cluster_representations[i] = [feature_names[index] for index in top_word_indices]
+
+    # Map cluster labels to representative words
+    df["Problem_Cluster"] = cluster_labels
+    df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
+
+    return df, optimal_n_clusters
+
+# Usage
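
A minimal usage sketch, not part of the commit: the hunk cuts off at the `# Usage` marker, so the real call site is not shown. The DataFrame below is purely illustrative, `extract_problem_domains` is assumed to be in scope from app.py, and `cluster_range` is narrowed because silhouette analysis requires fewer clusters than samples.

import pandas as pd

# Illustrative sample only; the real df is built elsewhere in app.py
df = pd.DataFrame({
    'Problem_Description': [
        "Water pump fails after two weeks of use",
        "Pump motor overheats under continuous load",
        "Solar panel output drops sharply in winter",
        "Inverter faults cut solar output at midday",
        "Crop yields decline due to soil erosion",
        "Eroded topsoil washes away during heavy rain",
    ]
})

# cluster_range narrowed for the tiny sample; the defaults assume a larger dataset
df, n_clusters = extract_problem_domains(df, cluster_range=(2, 4), top_words=5)
print(f"Optimal clusters: {n_clusters}")
print(df[['Problem_Cluster', 'Problem_Category_Words']])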
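For reference, the keyword step this commit adds can be exercised in isolation: the highest-weighted TF-IDF dimensions of each K-Means centroid serve as that cluster's representative words. A standalone sketch with a toy corpus (all data and parameter values here are illustrative):

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "pump broken water pump leaks",
    "pump motor fails water pressure low",
    "solar panel cracked solar output low",
    "solar inverter fault panel output drops",
]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(X)

feature_names = vectorizer.get_feature_names_out()
for i, center in enumerate(kmeans.cluster_centers_):
    # argsort is ascending: take the last k indices, reversed, for the top-weighted terms
    top_indices = center.argsort()[-3:][::-1]
    print(f"Cluster {i}: {[feature_names[j] for j in top_indices]}")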