SantanuBanerjee committed on
Commit b4b6a14 · verified · 1 Parent(s): 887a7c1

Update app.py

Files changed (1)
  1. app.py +24 -299
app.py CHANGED
@@ -104,17 +104,6 @@ except OSError:
 tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
 model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")
 
-# def combined_text_processing(text):
-#     # Basic NLP processing using SpaCy
-#     doc = nlp(text)
-#     lemmatized_text = ' '.join([token.lemma_ for token in doc])
-
-#     # Advanced text representation using Hugging Face Transformers
-#     inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
-#     with torch.no_grad():
-#         outputs = model(**inputs)
-
-#     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
 
 
 
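The helper removed in this hunk built a sentence embedding by mean-pooling the transformer's token states. A minimal runnable sketch of that idea, with two hedged adjustments that are not in the original: truncation=True keeps long inputs under the model's 512-token limit (the removed code disabled truncation), and the pooling is attention-mask-aware so padding tokens do not dilute the average.

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")

def embed(text: str):
    # Tokenize; truncation=True is an adjustment over the removed helper
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mask-aware mean pooling over the token embeddings
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    return (summed / mask.sum(dim=1)).squeeze(0).numpy()

print(embed("Access to clean water in rural districts").shape)  # (768,)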
@@ -130,29 +119,6 @@ nltk.download('averaged_perceptron_tagger')
 
 
 
-# def combined_text_processing(text):
-#     # Remove punctuation, numbers, URLs, and special characters
-#     text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
-#     text = re.sub(r'\d+', '', text)  # Remove numbers
-#     text = re.sub(r'http\S+', '', text)  # Remove URLs
-
-#     # Tokenize and remove stopwords
-#     tokens = word_tokenize(text.lower())  # Convert to lowercase
-#     stop_words = set(stopwords.words('english'))
-#     tokens = [word for word in tokens if word not in stop_words]
-
-#     # Lemmatize tokens using SpaCy
-#     doc = nlp(' '.join(tokens))
-#     lemmatized_text = ' '.join([token.lemma_ for token in doc])
-
-#     # Apply Hugging Face Transformers
-#     inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
-#     with torch.no_grad():
-#         outputs = model(**inputs)
-
-#     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
-
-
 import numpy as np
 import sentencepiece as sp
 from transformers import pipeline
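The second removed helper was a classic cleaning pipeline: regex stripping, NLTK tokenization and stopword removal, then spaCy lemmatization. A self-contained sketch of the same steps; en_core_web_sm stands in for whatever model app.py loads as nlp, and URLs are stripped before punctuation so the http pattern sees them intact.

import re

import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
nlp = spacy.load("en_core_web_sm")  # assumption: any English spaCy model works here

def clean_text(text: str) -> str:
    text = re.sub(r"http\S+", "", text)   # URLs first, while they are still intact
    text = re.sub(r"[^\w\s]", "", text)   # punctuation and special characters
    text = re.sub(r"\d+", "", text)       # numbers
    tokens = word_tokenize(text.lower())  # lowercase, then tokenize
    stop_words = set(stopwords.words("english"))
    tokens = [w for w in tokens if w not in stop_words]
    doc = nlp(" ".join(tokens))           # lemmatize with spaCy
    return " ".join(tok.lemma_ for tok in doc)

print(clean_text("Flooding affects 12,000 homes; see http://example.org"))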
@@ -223,297 +189,56 @@ def text_processing_for_domain(text):
 
 
 
-
-
-
-
-# # 2. Clustering from ChatGPT
-# # Libraries: scikit-learn, sentence-transformers
-# # Use sentence embeddings and clustering algorithms to group similar project proposals.
-# from bertopic import BERTopic
-# def perform_clustering(texts, n_clusters):
-#     topic_model = BERTopic(n_topics=n_clusters)
-#     topics, _ = topic_model.fit_transform(texts)
-#     return topics, topic_model
-# # Clustering function call
-# clustered_df, cluster_centers = clustering(processed_df)
-# Method 1: Sentence Transformers + KMeans
-
-# # 2. Clustering: from Claude
-# # Use BERTopic for advanced topic modeling and clustering.
-# from bertopic import BERTopic
-# def perform_clustering(texts, n_clusters):
-#     topic_model = BERTopic(n_topics=n_clusters)
-#     topics, _ = topic_model.fit_transform(texts)
-#     return topics, topic_model
-# # Clustering function call
-# problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
-# location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)
-# After this Method 2: BERTopic function, the following need to be done:
-# processed_df['Problem_Cluster'] = problem_clusters
-
-
-
-# 2. Meta AI Function: Sentence Transformers + Hierarchical Clustering + Silhouette Analysis
-# Now this also includes:
-# Topic Modeling using BERTopic: Integrated BERTopic to extract representative words for each cluster.
-# Cluster Visualization: Added a simple visualization to display the top words in each cluster.
-# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
-
-
-# From here Sanban
-# from sentence_transformers import SentenceTransformer
-# from sklearn.cluster import AgglomerativeClustering, KMeans
-# from sklearn.feature_extraction.text import TfidfVectorizer
-# from sklearn.metrics import silhouette_score
-# from bertopic import BERTopic
-# from collections import Counter
-
-
-# def extract_problem_domains(df,
-#                             text_column='Processed_ProblemDescription_forDomainExtraction',
-#                             # text_column='Problem_Description',
-#                             cluster_range=(5, 15),
-#                             top_words=10,
-#                             # method='sentence_transformers'
-#                             method='tfidf_kmeans'
-#                             ):
-
-
-#     console_messages.append("Extracting Problem Domains...")
-
-#     if method == 'sentence_transformers':
-#         # Sentence Transformers approach
-#         model = SentenceTransformer('all-mpnet-base-v2')
-#         embeddings = model.encode(df[text_column].tolist())
-
-#         # Perform hierarchical clustering with Silhouette Analysis
-#         silhouette_scores = []
-#         for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
-#             clustering = AgglomerativeClustering(n_clusters=n_clusters)
-#             cluster_labels = clustering.fit_predict(embeddings)
-#             silhouette_avg = silhouette_score(embeddings, cluster_labels)
-#             silhouette_scores.append(silhouette_avg)
-
-#         # Determine the optimal number of clusters
-#         optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-
-#         # Perform clustering with the optimal number of clusters
-#         clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
-#         cluster_labels = clustering.fit_predict(embeddings)
-
-#     elif method == 'tfidf_kmeans':
-#         # TF-IDF Vectorization and K-Means approach
-#         vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
-#         X = vectorizer.fit_transform(df[text_column])
-
-#         # Perform K-Means clustering with Silhouette Analysis
-#         silhouette_scores = []
-#         for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
-#             kmeans = KMeans(n_clusters=n_clusters)  # , random_state=42)
-#             cluster_labels = kmeans.fit_predict(X)
-#             silhouette_avg = silhouette_score(X, cluster_labels)
-#             silhouette_scores.append(silhouette_avg)
-
-#         # Determine the optimal number of clusters
-#         optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-
-#         # Perform final clustering with optimal number of clusters
-#         kmeans = KMeans(n_clusters=optimal_n_clusters)  # , random_state=42)
-#         cluster_labels = kmeans.fit_predict(X)
-
-#     # # BERTopic approach (commented out)
-#     console_messages.append("BERT is currently commented...")
-#     # topic_model = BERTopic()
-#     # topics, _ = topic_model.fit_transform(df[text_column].tolist())
-#     # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
-#     # cluster_labels = topics
-
-#     # Get representative words for each cluster
-#     if method == 'sentence_transformers':
-#         cluster_representations = {}
-#         for i in range(optimal_n_clusters):
-#             cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
-#             cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
-#     elif method == 'tfidf_kmeans':
-#         feature_names = vectorizer.get_feature_names_out()
-#         cluster_representations = {}
-#         for i in range(optimal_n_clusters):
-#             # center = kmeans.cluster_centers_[i]
-
-#             # # print(f"top_words: {top_words}, type: {type(top_words)}")
-#             # # print(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
-
-#             # console_messages.append(f"top_words: {top_words}, type: {type(top_words)}")
-#             # console_messages.append(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
-
-#             # # top_word_indices = center.argsort()[-top_words:][::-1]
-#             # top_word_indices = center.argsort()[-top_words:][::-1].tolist()  # Indexes of top words
-
-#             # top_words = [feature_names[index] for index in top_word_indices]
-#             # cluster_representations[i] = top_words
-
-#             try:
-#                 center = kmeans.cluster_centers_[i]
-#                 console_messages.append(f"Processing cluster {i}")
-#                 console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
-
-#                 if not isinstance(center, np.ndarray):
-#                     center = np.array(center)
-
-#                 # Remove NaN values
-#                 center = center[~np.isnan(center)]
-
-#                 sorted_indices = np.array(center.argsort())
-
-#                 top_word_indices = sorted_indices[-top_words:][::-1]
-
-#                 # Check for valid indices
-#                 if np.any(top_word_indices < 0) or np.any(top_word_indices >= len(feature_names)):
-#                     console_messages.append(f"Invalid top word indices for cluster {i}")
-#                     continue
-
-#                 top_words = [feature_names[index] for index in top_word_indices]
-#                 console_messages.append(f"Top words: {top_words}")
-#                 cluster_representations[i] = top_words
-
-#             except Exception as e:
-#                 console_messages.append(f"Error processing cluster {i}: {str(e)}")
-#                 console_messages.append(f"Center: {center}")
-
-
-
-#     console_messages.append(f"Number of clusters: {optimal_n_clusters}")
-#     console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
-
-#     # Map cluster labels to representative words
-#     df["Problem_Cluster"] = cluster_labels
-#     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
-
-#     # console_messages.append("Returning from Problem Domain Extraction function.")
-#     console_messages.append("Problem Domain Extraction completed.")
-#     return df, optimal_n_clusters
-# Till here sanban
-
-
 from sentence_transformers import SentenceTransformer
 from sklearn.cluster import AgglomerativeClustering, KMeans
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics import silhouette_score
 from bertopic import BERTopic
 from collections import Counter
-import numpy as np
+
 
 def extract_problem_domains(df,
                             text_column='Processed_ProblemDescription_forDomainExtraction',
                             cluster_range=(5, 15),
                             top_words=10,
-                            method='tfidf_kmeans'
+                            method='sentence_transformers'
                             ):
-
     console_messages.append("Extracting Problem Domains...")
-
-    if method == 'sentence_transformers':
-        # Sentence Transformers approach
-        model = SentenceTransformer('all-mpnet-base-v2')
-        embeddings = model.encode(df[text_column].tolist())
-
-        # Perform hierarchical clustering with Silhouette Analysis
-        silhouette_scores = []
-        for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
-            clustering = AgglomerativeClustering(n_clusters=n_clusters)
-            cluster_labels = clustering.fit_predict(embeddings)
-            silhouette_avg = silhouette_score(embeddings, cluster_labels)
-            silhouette_scores.append(silhouette_avg)
-
-        # Determine the optimal number of clusters
-        optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-
-        # Perform clustering with the optimal number of clusters
-        clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
+
+    # Sentence Transformers approach
+    model = SentenceTransformer('all-mpnet-base-v2')
+    embeddings = model.encode(df[text_column].tolist())
+
+    # Perform hierarchical clustering with Silhouette Analysis
+    silhouette_scores = []
+    for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+        clustering = AgglomerativeClustering(n_clusters=n_clusters)
         cluster_labels = clustering.fit_predict(embeddings)
-
-    elif method == 'tfidf_kmeans':
-        # TF-IDF Vectorization and K-Means approach
-        vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
-        X = vectorizer.fit_transform(df[text_column])
-
-        # Perform K-Means clustering with Silhouette Analysis
-        silhouette_scores = []
-        for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
-            kmeans = KMeans(n_clusters=n_clusters)  # , random_state=42)
-            cluster_labels = kmeans.fit_predict(X)
-            silhouette_avg = silhouette_score(X, cluster_labels)
-            silhouette_scores.append(silhouette_avg)
-
-        # Determine the optimal number of clusters
-        optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-
-        # Perform final clustering with optimal number of clusters
-        kmeans = KMeans(n_clusters=optimal_n_clusters)  # , random_state=42)
-        cluster_labels = kmeans.fit_predict(X)
+        silhouette_avg = silhouette_score(embeddings, cluster_labels)
+        silhouette_scores.append(silhouette_avg)
+
+    # Determine the optimal number of clusters
+    optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
 
-    # # BERTopic approach (commented out)
-    console_messages.append("BERT is currently commented...")
-    # topic_model = BERTopic()
-    # topics, _ = topic_model.fit_transform(df[text_column].tolist())
-    # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
-    # cluster_labels = topics
+    # Perform clustering with the optimal number of clusters
+    clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
+    cluster_labels = clustering.fit_predict(embeddings)
 
     # Get representative words for each cluster
-    if method == 'sentence_transformers':
-        cluster_representations = {}
-        for i in range(optimal_n_clusters):
-            cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
-            cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
-    elif method == 'tfidf_kmeans':
-        feature_names = vectorizer.get_feature_names_out()
-        cluster_representations = {}
-        for i in range(optimal_n_clusters):
-            try:
-                center = kmeans.cluster_centers_[i]
-                console_messages.append(f"Processing cluster {i}")
-                console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
-
-                if isinstance(center, list):
-                    center = np.array(center)
-
-                # Remove NaN values
-                if np.any(np.isnan(center)):
-                    center = np.nan_to_num(center)
-
-                sorted_indices = np.argsort(center)
-
-                top_word_indices = sorted_indices[-top_words:][::-1]
-
-                # Check for valid indices
-                if np.any(top_word_indices < 0) or np.any(top_word_indices >= len(feature_names)):
-                    console_messages.append(f"Invalid top word indices for cluster {i}")
-                    continue
-
-                top_words = [feature_names[index] for index in top_word_indices]
-                console_messages.append(f"Top words: {top_words}")
-                cluster_representations[i] = top_words
-
-            except Exception as e:
-                console_messages.append(f"Error processing cluster {i}: {str(e)}")
-                console_messages.append(f"Center: {center}")
-
-    console_messages.append(f"Number of clusters: {optimal_n_clusters}")
-    console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
+    cluster_representations = {}
+    for i in range(optimal_n_clusters):
+        cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
+        cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
 
     # Map cluster labels to representative words
     df["Problem_Cluster"] = cluster_labels
     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
 
+    # console_messages.append("Returning from Problem Domain Extraction function.")
     console_messages.append("Problem Domain Extraction completed.")
     return df, optimal_n_clusters
 
 
-# Usage
-# clustered_df, optimal_n_clusters = optimal_Problem_clustering(processed_df)
-# print(f'Optimal number of clusters: {optimal_n_clusters}')
-
 
 
 
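What survives after this commit is the sentence-transformers path: embed each processed description with all-mpnet-base-v2, sweep the cluster count over cluster_range, keep the count with the best silhouette score, and label each cluster by its most frequent tokens. A runnable end-to-end sketch on a toy corpus; the six texts and the 2-4 sweep are illustrative, app.py sweeps (5, 15) over a dataframe column.

from collections import Counter

from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

texts = [
    "well water contamination in villages",
    "arsenic in rural drinking water",
    "teacher shortage in primary schools",
    "school dropout rates among girls",
    "potholes on the highway bypass",
    "bridge repair delayed for years",
]

model = SentenceTransformer("all-mpnet-base-v2")
embeddings = model.encode(texts)

# Silhouette analysis: higher means tighter, better-separated clusters
scores = {}
for k in range(2, 5):
    labels = AgglomerativeClustering(n_clusters=k).fit_predict(embeddings)
    scores[k] = silhouette_score(embeddings, labels)
best_k = max(scores, key=scores.get)

# Re-cluster at the winning count, as the updated function does
labels = AgglomerativeClustering(n_clusters=best_k).fit_predict(embeddings)

# Representative words per cluster: most frequent tokens, as in the new code
for c in range(best_k):
    words = " ".join(t for t, l in zip(texts, labels) if l == c).split()
    print(c, [w for w, _ in Counter(words).most_common(5)])

The silhouette sweep is a cheap stand-in for choosing the cluster count by hand, which is why the function can return optimal_n_clusters alongside the labeled dataframe.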
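For comparison, the tfidf_kmeans path this commit deletes, reduced to its working core: each cluster's representative words are the vocabulary entries with the highest weights in that cluster's KMeans centroid. The deleted version appears to have needed its try/except because it re-bound its top_words parameter to a word list inside the loop, so later clusters sliced the centroid with a list instead of an int. A minimal sketch under that reading; texts are illustrative.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

texts = [
    "well water contamination in villages",
    "arsenic in rural drinking water",
    "teacher shortage in primary schools",
    "school dropout rates among girls",
]

vectorizer = TfidfVectorizer(stop_words="english", max_features=3000)
X = vectorizer.fit_transform(texts)

kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)

feature_names = vectorizer.get_feature_names_out()
for i, center in enumerate(kmeans.cluster_centers_):
    # Highest-weighted vocabulary indices for this centroid, descending;
    # stored in a fresh name so the loop never shadows its own parameters
    top_idx = np.argsort(center)[-3:][::-1]
    print(i, [feature_names[j] for j in top_idx])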