SantanuBanerjee committed on
Commit
5a0f2dc
·
verified ·
1 Parent(s): 452c821

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -0
app.py CHANGED
@@ -239,8 +239,83 @@ def extract_problem_domains(df,
239
 
240
 
241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
  # def nlp_pipeline(original_df):
246
  def nlp_pipeline(original_df):
@@ -281,6 +356,27 @@ def nlp_pipeline(original_df):
281
  # problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
282
  # location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)
283
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
 
286
 
 
239
 
240
 
241
 
242
+
243
+
244
+
245
+
246
def Extract_Location(text):
    """Return all GPE/LOC named entities in *text*, joined by spaces.

    Relies on the module-level spaCy pipeline ``nlp``; an input with no
    matching entities yields the empty string.
    """
    found = []
    for ent in nlp(text).ents:
        # Keep only geo-political entities and physical locations.
        if ent.label_ == 'GPE' or ent.label_ == 'LOC':
            found.append(ent.text)
    return ' '.join(found)
250
+
251
def text_processing_for_location(text):
    """Extract and normalize location mentions from *text* for clustering.

    Pipeline: NER location extraction -> lemmatization -> strip everything
    but letters/whitespace -> lowercase tokenization -> stopword removal.
    Returns the cleaned location words joined by spaces, or the fallback
    string "India" when nothing survives the pipeline.
    """
    # NER pass: pull GPE/LOC entities out of the raw text.
    cleaned = Lemmatize_text(Extract_Location(text))

    # Drop digits, punctuation and other special characters.
    cleaned = re.sub(r'[^a-zA-Z\s]', '', cleaned)

    # Lowercase, tokenize, and filter English stopwords.
    stop_words = set(stopwords.words('english'))
    kept = [tok for tok in word_tokenize(cleaned.lower()) if tok not in stop_words]

    result = ' '.join(kept)
    # Fallback keeps downstream clustering from seeing empty documents.
    return result if result else "India"
270
 
271
 
272
def extract_location_clusters(df,
                              text_column='Processed_LocationText_forClustering',
                              cluster_range=(3, 10),
                              top_words=5):
    """Cluster the location texts in ``df[text_column]`` and label each row.

    Embeds the texts with a SentenceTransformer model, sweeps hierarchical
    (agglomerative) clustering over ``cluster_range`` picking the cluster
    count with the best silhouette score, then annotates the DataFrame.

    Args:
        df: DataFrame holding the processed location text (mutated in place).
        text_column: column containing the text to embed and cluster.
        cluster_range: inclusive (min, max) cluster counts to evaluate.
        top_words: number of most-frequent words kept per cluster label.

    Returns:
        (df, optimal_n_clusters): the annotated DataFrame — with new columns
        'Location_Cluster' (int label) and 'Location_Category_Words'
        (list of representative words) — and the chosen cluster count.

    Raises:
        ValueError: if the DataFrame is too small to evaluate any cluster
            count in ``cluster_range``.
    """
    console_messages.append("Extracting Location Clusters...")

    # Sentence-Transformers embeddings for the location texts.
    model = SentenceTransformer('all-mpnet-base-v2')
    embeddings = model.encode(df[text_column].tolist())

    # silhouette_score requires 2 <= n_clusters <= n_samples - 1, so clamp
    # the sweep's upper bound to the data size instead of crashing opaquely.
    upper = min(cluster_range[1], len(df) - 1)
    if upper < cluster_range[0]:
        raise ValueError(
            f"Not enough rows ({len(df)}) to form {cluster_range[0]} clusters."
        )

    # Silhouette sweep; cache each fit's labels so the winner need not be
    # re-clustered afterwards (the original code fit the best model twice).
    silhouette_scores = []
    labels_by_n = {}
    for n_clusters in range(cluster_range[0], upper + 1):
        clustering = AgglomerativeClustering(n_clusters=n_clusters)
        labels_by_n[n_clusters] = clustering.fit_predict(embeddings)
        silhouette_scores.append(
            silhouette_score(embeddings, labels_by_n[n_clusters])
        )

    # Best silhouette score determines the cluster count.
    optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
    cluster_labels = labels_by_n[optimal_n_clusters]

    # Representative words: the top_words most frequent tokens per cluster.
    cluster_representations = {}
    for i in range(optimal_n_clusters):
        cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
        cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]

    # Annotate each row with its cluster id and that cluster's keywords.
    df['Location_Cluster'] = cluster_labels
    df['Location_Category_Words'] = [cluster_representations[label] for label in cluster_labels]

    console_messages.append("Location Clustering completed.")
    return df, optimal_n_clusters
309
+
310
+
311
+
312
+
313
+
314
+
315
+
316
+
317
+
318
+
319
 
320
  # def nlp_pipeline(original_df):
321
  def nlp_pipeline(original_df):
 
356
  # problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
357
  # location_clusters, location_model = perform_clustering(processed_df['Geographical_Location'], n_clusters=5)
358
 
359
+ console_messages.append("Starting NLP pipeline for location extraction...")
360
+
361
+ # Apply the text_processing_for_location function to the DataFrame
362
+ processed_df['Processed_LocationText_forClustering'] = processed_df['Problem_Description'].apply(text_processing_for_location)
363
+
364
+ # Location Clustering
365
+ try:
366
+ location_df, optimal_n_clusters = extract_location_clusters(processed_df)
367
+ console_messages.append("NLP pipeline for location extraction completed.")
368
+ return location_df
369
+ except Exception as e:
370
+ console_messages.append(f"Error in extract_location_clusters: {str(e)}")
371
+ return processed_df
372
+
373
+
374
+
375
+
376
+
377
+
378
+
379
+
380
 
381
 
382