Update app.py
app.py CHANGED
@@ -113,16 +113,49 @@ from nltk.tokenize import word_tokenize
 nltk.download('punkt')
 nltk.download('stopwords')

-def combined_text_processing(text):
-
-
-
-

     # Tokenize and remove stopwords
-    tokens = word_tokenize(text.lower())
     stop_words = set(stopwords.words('english'))
-

     # Lemmatize tokens using SpaCy
     doc = nlp(' '.join(tokens))

@@ -139,6 +172,97 @@ def combined_text_processing(text):






@@ -153,15 +277,25 @@ def combined_text_processing(text):

 def nlp_pipeline(original_df):
     # Data Preprocessing
-    processed_df = data_pre_processing(original_df)


-    # Apply the combined function to your DataFrame
-    processed_df['Processed_ProblemDescription'] = processed_df['Problem_Description'].apply(combined_text_processing)



-    return processed_df

 def process_excel(file):
     try:
 nltk.download('punkt')
 nltk.download('stopwords')

+# def combined_text_processing(text):
+#     # Remove punctuation, numbers, URLs, and special characters
+#     text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
+#     text = re.sub(r'\d+', '', text)  # Remove numbers
+#     text = re.sub(r'http\S+', '', text)  # Remove URLs
+
+#     # Tokenize and remove stopwords
+#     tokens = word_tokenize(text.lower())  # Convert to lowercase
+#     stop_words = set(stopwords.words('english'))
+#     tokens = [word for word in tokens if word not in stop_words]
+
+#     # Lemmatize tokens using SpaCy
+#     doc = nlp(' '.join(tokens))
+#     lemmatized_text = ' '.join([token.lemma_ for token in doc])
+
+#     # Apply Hugging Face Transformers
+#     inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
+#     with torch.no_grad():
+#         outputs = model(**inputs)
+
+#     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+
+
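The commented-out combined_text_processing() above ends by mean-pooling the transformer's last hidden state into one vector. A minimal, self-contained sketch of that pooling step is below; the checkpoint name and the embed_text helper are illustrative assumptions, since the diff does not show how app.py builds its `tokenizer` and `model`:

import torch
from transformers import AutoTokenizer, AutoModel

# Assumed checkpoint, for illustration only; app.py's own tokenizer/model may differ.
_checkpoint = "distilbert-base-uncased"
_tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
_model = AutoModel.from_pretrained(_checkpoint)

def embed_text(text):
    # Tokenize; truncation=True avoids overflowing the model's maximum sequence
    # length (the original snippet passes truncation=False).
    inputs = _tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = _model(**inputs)
    # Mean-pool the token embeddings into one fixed-size vector per input text.
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()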
+def text_processing_for_domain(text):
+    # Text Cleaning
+    text = re.sub(r'[^\w\s]', '', text)
+    text = re.sub(r'\d+', '', text)
+    text = re.sub(r'http\S+', '', text)  # Remove https URLs
+    text = re.sub(r'www\.\S+', '', text)  # Remove www URLs

     # Tokenize and remove stopwords
+    tokens = word_tokenize(text.lower())
     stop_words = set(stopwords.words('english'))
+    custom_stopwords = {'example', 'another'}  # Add custom stopwords
+    tokens = [word for word in tokens if word not in stop_words and word not in custom_stopwords]
+
+    # NER - Remove named entities
+    doc = nlp(' '.join(tokens))
+    tokens = [token.text for token in doc if not token.ent_type_]
+
+    # POS Tagging (optional)
+    pos_tags = nltk.pos_tag(tokens)
+    tokens = [word for word, pos in pos_tags if pos in ['NN', 'NNS']]  # Filter nouns

     # Lemmatize tokens using SpaCy
     doc = nlp(' '.join(tokens))
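The new text_processing_for_domain() leans on NLTK tokenization, stopwords, and POS tagging plus a spaCy pipeline bound to `nlp`, none of which are set up inside this hunk. A rough setup-and-call sketch under those assumptions; the spaCy model name is a guess, and the extra NLTK download covers nltk.pos_tag(), which this diff never fetches data for:

import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Resources the function needs at runtime.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# app.py presumably loads some spaCy pipeline into `nlp`; the model name here is an assumption.
nlp = spacy.load('en_core_web_sm')

print(text_processing_for_domain("Flooding damaged 1,200 homes near https://example.org in 2023."))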



+# # 2. Clustering from ChatGPT
+# # Libraries: scikit-learn, sentence-transformers
+# # Use sentence embeddings and clustering algorithms to group similar project proposals.
+# from bertopic import BERTopic
+# def perform_clustering(texts, n_clusters):
+#     topic_model = BERTopic(n_topics=n_clusters)
+#     topics, _ = topic_model.fit_transform(texts)
+#     return topics, topic_model
+# # Clustering function call
+# clustered_df, cluster_centers = clustering(processed_df)
+# Method 1: Sentence Transformers + KMeans
+
+# # 2. Clustering: from Claude
+# # Use BERTopic for advanced topic modeling and clustering.
+# from bertopic import BERTopic
+# def perform_clustering(texts, n_clusters):
+#     topic_model = BERTopic(n_topics=n_clusters)
+#     topics, _ = topic_model.fit_transform(texts)
+#     return topics, topic_model
+# # Clustering function call
+# problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
+# location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)
+# After this Method 2: BERTopic function, the following need to be done:
+# processed_df['Problem_Cluster'] = problem_clusters
+
+
+
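The notes above mention a "Method 1: Sentence Transformers + KMeans" without showing it. A hedged sketch of that variant, reusing the all-mpnet-base-v2 encoder chosen further down; the function name and the example call are illustrative, not part of the commit:

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

def kmeans_clustering(texts, n_clusters):
    # Encode each proposal into a dense sentence embedding.
    encoder = SentenceTransformer('all-mpnet-base-v2')
    embeddings = encoder.encode(list(texts))
    # Partition the embeddings into n_clusters groups.
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    labels = km.fit_predict(embeddings)
    return labels, km

# labels, km = kmeans_clustering(processed_df['Problem_Description'], n_clusters=10)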
+# 2. Meta AI Function: Sentence Transformers + Hierarchical Clustering + Silhouette Analysis
+# Now this also includes:
+# Topic Modeling using BERTopic: Integrated BERTopic to extract representative words for each cluster.
+# Cluster Visualization: Added a simple visualization to display the top words in each cluster.
+# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
+
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.metrics import silhouette_score
+from bertopic import BERTopic
+
+# def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster", cluster_range=(30, 70)):
+def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
+
+    # Select Model (can we also optimize model selection automatically?)
+    # model = SentenceTransformer('all-MiniLM-L6-v2')
+    model = SentenceTransformer('all-mpnet-base-v2')
+    # model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+    # Generate embeddings
+    embeddings = model.encode(df[text_column].tolist())
+
+    # Perform hierarchical clustering with Silhouette Analysis
+    silhouette_scores = []
+    for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+        clustering = AgglomerativeClustering(n_clusters=n_clusters)
+        cluster_labels = clustering.fit_predict(embeddings)
+        silhouette_avg = silhouette_score(embeddings, cluster_labels)
+        silhouette_scores.append(silhouette_avg)
+
+    # Determine the optimal number of clusters
+    optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+
+    # Perform clustering with the optimal number of clusters
+    clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
+    cluster_labels = clustering.fit_predict(embeddings)
+
+
+    # BERTopic for topic modelling
+    topic_model = BERTopic(n_topics=optimal_n_clusters)
+    topics, _ = topic_model.fit_transform(df[text_column].tolist())
+
+
+    # Get representative words for each cluster
+    cluster_representations = {}
+    for i in range(optimal_n_clusters):
+        cluster_representations[i] = topic_model.get_topic_info(i)['words'][:top_words]
+
+    # Map cluster labels to representative words
+    df["Problem_Cluster"] = cluster_labels
+    df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
+
+    # # Print clusters and their representative words
+    # for cluster_label, words in cluster_representations.items():
+    #     print(f"Domain {cluster_label}: {', '.join(words)}")
+
+    # return df.assign(cluster=cluster_labels), optimal_n_clusters
+
+    # df[new_column_name] = clustering.fit_predict(embeddings)
+    return df, optimal_n_clusters
+
+# Usage
+# clustered_df, optimal_n_clusters = optimal_Problem_clustering(processed_df)
+# print(f'Optimal number of clusters: {optimal_n_clusters}')
+



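Two notes on the block above: to my knowledge BERTopic's constructor exposes the topic count as `nr_topics` rather than `n_topics`, and per-topic words come from `get_topic(topic_id)` as (word, weight) pairs rather than `get_topic_info(i)['words']`; also, BERTopic fits its own topics, so its topic ids are not guaranteed to line up with the agglomerative `cluster_labels`. A hedged alternative sketch that derives representative words directly from the agglomerative clusters with a plain per-cluster term count (the helper name is illustrative, not part of the commit):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def top_words_per_cluster(texts, cluster_labels, top_words=17):
    # Build one document-term matrix over the whole corpus.
    vectorizer = CountVectorizer(stop_words='english')
    counts = vectorizer.fit_transform(texts)
    vocab = np.array(vectorizer.get_feature_names_out())
    labels = np.asarray(cluster_labels)

    representations = {}
    for label in np.unique(labels):
        # Sum term counts over the documents assigned to this cluster.
        cluster_counts = np.asarray(counts[labels == label].sum(axis=0)).ravel()
        # Keep the most frequent terms as the cluster's representative words.
        top_idx = cluster_counts.argsort()[::-1][:top_words]
        representations[int(label)] = vocab[top_idx].tolist()
    return representations

# cluster_representations = top_words_per_cluster(df[text_column].tolist(), cluster_labels)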

 def nlp_pipeline(original_df):
     # Data Preprocessing
+    processed_df = data_pre_processing(original_df)  # merged_dataset
+
+
+    # Starting the Pipeline for Domain Extraction
+    # Apply the text_processing_for_domain function to the DataFrame
+    processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
+    # Domain Clustering
+    domain_df, optimal_n_clusters = extract_problem_domains(processed_df)
+
+
+    # problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
+    # location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)




+    # return processed_df
+    return domain_df


 def process_excel(file):
     try:
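With these changes nlp_pipeline() returns domain_df carrying the new Problem_Cluster and Problem_Category_Words columns. A rough driver sketch of how the pipeline might be exercised outside process_excel(); the file name and the read step are illustrative, since the real app receives the workbook through process_excel():

import pandas as pd

# Assumed to contain a Problem_Description column.
original_df = pd.read_excel("project_proposals.xlsx")
domain_df = nlp_pipeline(original_df)

print(domain_df['Problem_Cluster'].nunique(), "problem domains found")
print(domain_df[['Problem_Description', 'Problem_Cluster', 'Problem_Category_Words']].head())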