Update app.py
app.py CHANGED
@@ -113,16 +113,49 @@ from nltk.tokenize import word_tokenize
 nltk.download('punkt')
 nltk.download('stopwords')

-def combined_text_processing(text):
-
-
-
-

     # Tokenize and remove stopwords
-    tokens = word_tokenize(text.lower())
     stop_words = set(stopwords.words('english'))
-

     # Lemmatize tokens using SpaCy
     doc = nlp(' '.join(tokens))

@@ -139,6 +172,97 @@ def combined_text_processing(text):






@@ -153,15 +277,25 @@ def combined_text_processing(text):

 def nlp_pipeline(original_df):
     # Data Preprocessing
-    processed_df = data_pre_processing(original_df)


-    # Apply the combined function to your DataFrame
-    processed_df['Processed_ProblemDescription'] = processed_df['Problem_Description'].apply(combined_text_processing)



-    return processed_df

 def process_excel(file):
     try:
 nltk.download('punkt')
 nltk.download('stopwords')

+# def combined_text_processing(text):
+#     # Remove punctuation, numbers, URLs, and special characters
+#     text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
+#     text = re.sub(r'\d+', '', text)  # Remove numbers
+#     text = re.sub(r'http\S+', '', text)  # Remove URLs
+
+#     # Tokenize and remove stopwords
+#     tokens = word_tokenize(text.lower())  # Convert to lowercase
+#     stop_words = set(stopwords.words('english'))
+#     tokens = [word for word in tokens if word not in stop_words]
+
+#     # Lemmatize tokens using SpaCy
+#     doc = nlp(' '.join(tokens))
+#     lemmatized_text = ' '.join([token.lemma_ for token in doc])
+
+#     # Apply Hugging Face Transformers
+#     inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
+#     with torch.no_grad():
+#         outputs = model(**inputs)
+
+#     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+
+
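The commented-out combined_text_processing() above ends by mean-pooling the transformer's last hidden state into one vector. A minimal, self-contained sketch of that pooling step is below; the checkpoint name and the embed_text helper are illustrative assumptions, since the diff does not show how app.py builds its `tokenizer` and `model`:

import torch
from transformers import AutoTokenizer, AutoModel

# Assumed checkpoint, for illustration only; app.py's own tokenizer/model may differ.
_checkpoint = "distilbert-base-uncased"
_tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
_model = AutoModel.from_pretrained(_checkpoint)

def embed_text(text):
    # Tokenize; truncation=True avoids overflowing the model's maximum sequence
    # length (the original snippet passes truncation=False).
    inputs = _tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = _model(**inputs)
    # Mean-pool the token embeddings into one fixed-size vector per input text.
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()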
+def text_processing_for_domain(text):
+    # Text Cleaning
+    text = re.sub(r'[^\w\s]', '', text)
+    text = re.sub(r'\d+', '', text)
+    text = re.sub(r'http\S+', '', text)  # Remove https URLs
+    text = re.sub(r'www\.\S+', '', text)  # Remove www URLs

     # Tokenize and remove stopwords
+    tokens = word_tokenize(text.lower())
     stop_words = set(stopwords.words('english'))
+    custom_stopwords = {'example', 'another'}  # Add custom stopwords
+    tokens = [word for word in tokens if word not in stop_words and word not in custom_stopwords]
+
+    # NER - Remove named entities
+    doc = nlp(' '.join(tokens))
+    tokens = [token.text for token in doc if not token.ent_type_]
+
+    # POS Tagging (optional)
+    pos_tags = nltk.pos_tag(tokens)
+    tokens = [word for word, pos in pos_tags if pos in ['NN', 'NNS']]  # Filter nouns

     # Lemmatize tokens using SpaCy
     doc = nlp(' '.join(tokens))
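The new text_processing_for_domain() leans on NLTK tokenization, stopwords, and POS tagging plus a spaCy pipeline bound to `nlp`, none of which are set up inside this hunk. A rough setup-and-call sketch under those assumptions; the spaCy model name is a guess, and the extra NLTK download covers nltk.pos_tag(), which this diff never fetches data for:

import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Resources the function needs at runtime.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# app.py presumably loads some spaCy pipeline into `nlp`; the model name here is an assumption.
nlp = spacy.load('en_core_web_sm')

print(text_processing_for_domain("Flooding damaged 1,200 homes near https://example.org in 2023."))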



+# # 2. Clustering from ChatGPT
+# # Libraries: scikit-learn, sentence-transformers
+# # Use sentence embeddings and clustering algorithms to group similar project proposals.
+# from bertopic import BERTopic
+# def perform_clustering(texts, n_clusters):
+#     topic_model = BERTopic(n_topics=n_clusters)
+#     topics, _ = topic_model.fit_transform(texts)
+#     return topics, topic_model
+# # Clustering function call
+# clustered_df, cluster_centers = clustering(processed_df)
+# Method 1: Sentence Transformers + KMeans
+
+# # 2. Clustering: from Claude
+# # Use BERTopic for advanced topic modeling and clustering.
+# from bertopic import BERTopic
+# def perform_clustering(texts, n_clusters):
+#     topic_model = BERTopic(n_topics=n_clusters)
+#     topics, _ = topic_model.fit_transform(texts)
+#     return topics, topic_model
+# # Clustering function call
+# problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
+# location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)
+# After this Method 2: BERTopic function, the following need to be done:
+# processed_df['Problem_Cluster'] = problem_clusters
+
+
+
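The notes above mention a "Method 1: Sentence Transformers + KMeans" without showing it. A hedged sketch of that variant, reusing the all-mpnet-base-v2 encoder chosen further down; the function name and the example call are illustrative, not part of the commit:

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

def kmeans_clustering(texts, n_clusters):
    # Encode each proposal into a dense sentence embedding.
    encoder = SentenceTransformer('all-mpnet-base-v2')
    embeddings = encoder.encode(list(texts))
    # Partition the embeddings into n_clusters groups.
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    labels = km.fit_predict(embeddings)
    return labels, km

# labels, km = kmeans_clustering(processed_df['Problem_Description'], n_clusters=10)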
+# 2. Meta AI Function: Sentence Transformers + Hierarchical Clustering + Silhouette Analysis
+# Now this also includes:
+# Topic Modeling using BERTopic: Integrated BERTopic to extract representative words for each cluster.
+# Cluster Visualization: Added a simple visualization to display the top words in each cluster.
+# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
+
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.metrics import silhouette_score
+from bertopic import BERTopic
+
+# def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster", cluster_range=(30, 70)):
+def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
+
+    # Select Model (can we also optimize model selection automatically?)
+    # model = SentenceTransformer('all-MiniLM-L6-v2')
+    model = SentenceTransformer('all-mpnet-base-v2')
+    # model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+    # Generate embeddings
+    embeddings = model.encode(df[text_column].tolist())
+
+    # Perform hierarchical clustering with Silhouette Analysis
+    silhouette_scores = []
+    for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+        clustering = AgglomerativeClustering(n_clusters=n_clusters)
+        cluster_labels = clustering.fit_predict(embeddings)
+        silhouette_avg = silhouette_score(embeddings, cluster_labels)
+        silhouette_scores.append(silhouette_avg)
+
+    # Determine the optimal number of clusters
+    optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+
+    # Perform clustering with the optimal number of clusters
+    clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
+    cluster_labels = clustering.fit_predict(embeddings)
+
+
+    # BERTopic for topic modelling
+    topic_model = BERTopic(n_topics=optimal_n_clusters)
+    topics, _ = topic_model.fit_transform(df[text_column].tolist())
+
+
+    # Get representative words for each cluster
+    cluster_representations = {}
+    for i in range(optimal_n_clusters):
+        cluster_representations[i] = topic_model.get_topic_info(i)['words'][:top_words]
+
+    # Map cluster labels to representative words
+    df["Problem_Cluster"] = cluster_labels
+    df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
+
+    # # Print clusters and their representative words
+    # for cluster_label, words in cluster_representations.items():
+    #     print(f"Domain {cluster_label}: {', '.join(words)}")
+
+    # return df.assign(cluster=cluster_labels), optimal_n_clusters
+
+    # df[new_column_name] = clustering.fit_predict(embeddings)
+    return df, optimal_n_clusters
+
+# Usage
+# clustered_df, optimal_n_clusters = optimal_Problem_clustering(processed_df)
+# print(f'Optimal number of clusters: {optimal_n_clusters}')
+



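Two notes on the block above: to my knowledge BERTopic's constructor exposes the topic count as `nr_topics` rather than `n_topics`, and per-topic words come from `get_topic(topic_id)` as (word, weight) pairs rather than `get_topic_info(i)['words']`; also, BERTopic fits its own topics, so its topic ids are not guaranteed to line up with the agglomerative `cluster_labels`. A hedged alternative sketch that derives representative words directly from the agglomerative clusters with a plain per-cluster term count (the helper name is illustrative, not part of the commit):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def top_words_per_cluster(texts, cluster_labels, top_words=17):
    # Build one document-term matrix over the whole corpus.
    vectorizer = CountVectorizer(stop_words='english')
    counts = vectorizer.fit_transform(texts)
    vocab = np.array(vectorizer.get_feature_names_out())
    labels = np.asarray(cluster_labels)

    representations = {}
    for label in np.unique(labels):
        # Sum term counts over the documents assigned to this cluster.
        cluster_counts = np.asarray(counts[labels == label].sum(axis=0)).ravel()
        # Keep the most frequent terms as the cluster's representative words.
        top_idx = cluster_counts.argsort()[::-1][:top_words]
        representations[int(label)] = vocab[top_idx].tolist()
    return representations

# cluster_representations = top_words_per_cluster(df[text_column].tolist(), cluster_labels)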

 def nlp_pipeline(original_df):
     # Data Preprocessing
+    processed_df = data_pre_processing(original_df)  # merged_dataset
+
+
+    # Starting the Pipeline for Domain Extraction
+    # Apply the text_processing_for_domain function to the DataFrame
+    processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
+    # Domain Clustering
+    domain_df, optimal_n_clusters = extract_problem_domains(processed_df)
+
+
+    # problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
+    # location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)




+    # return processed_df
+    return domain_df


 def process_excel(file):
     try:
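With these changes nlp_pipeline() returns domain_df carrying the new Problem_Cluster and Problem_Category_Words columns. A rough driver sketch of how the pipeline might be exercised outside process_excel(); the file name and the read step are illustrative, since the real app receives the workbook through process_excel():

import pandas as pd

# Assumed to contain a Problem_Description column.
original_df = pd.read_excel("project_proposals.xlsx")
domain_df = nlp_pipeline(original_df)

print(domain_df['Problem_Cluster'].nunique(), "problem domains found")
print(domain_df[['Problem_Description', 'Problem_Cluster', 'Problem_Category_Words']].head())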