SantanuBanerjee committed on
Commit
efb3f07
·
verified ·
1 Parent(s): 86142db

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +150 -31
app.py CHANGED
@@ -243,31 +243,63 @@ def extract_problem_domains(df,
243
 
244
 
245
 
246
- def Extract_Location(text):
247
- doc = nlp(text)
248
- locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
249
- return ' '.join(locations)
250
 
251
- def text_processing_for_location(text):
252
- # Extract locations
253
- locations_text = Extract_Location(text)
254
-
255
- # Perform further text cleaning if necessary
256
- processed_locations_text = Lemmatize_text(locations_text)
257
-
258
- # Remove special characters, digits, and punctuation
259
- processed_locations_text = re.sub(r'[^a-zA-Z\s]', '', processed_locations_text)
260
-
261
- # Tokenize and remove stopwords
262
- tokens = word_tokenize(processed_locations_text.lower())
263
- stop_words = set(stopwords.words('english'))
264
- tokens = [word for word in tokens if word not in stop_words]
265
-
266
- # Join location words into a single string
267
- final_locations_text = ' '.join(tokens)
268
-
269
- return final_locations_text if final_locations_text else "India"
270
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  def extract_location_clusters(df,
273
  text_column='Processed_LocationText_forClustering',
@@ -314,10 +346,97 @@ def extract_location_clusters(df,
314
 
315
 
316
 
317
- import copy
318
 
319
 
320
- # def nlp_pipeline(original_df):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  def nlp_pipeline(original_df):
322
  console_messages.append("Starting NLP pipeline...")
323
 
@@ -344,8 +463,10 @@ def nlp_pipeline(original_df):
344
 
345
 
346
  console_messages.append("Starting NLP pipeline for Location extraction with text processing.")
 
347
  # Apply the text_processing_for_location function to the DataFrame
348
- processed_df['Processed_LocationText_forClustering'] = processed_df['Problem_Description'].apply(text_processing_for_location)
 
349
 
350
  # Location Clustering
351
  try:
@@ -355,12 +476,10 @@ def nlp_pipeline(original_df):
355
  console_messages.append(f"Error in extract_location_clusters: {str(e)}")
356
  console_messages.append("NLP pipeline for location extraction completed.")
357
 
358
-
359
  console_messages.append("NLP pipeline completed.")
360
  return processed_df
361
-
362
-
363
-
364
 
365
 
366
 
 
243
 
244
 
245
 
 
 
 
 
246
 
247
+
248
+
249
+
250
+
251
+ import spacy
252
+ from geopy.geocoders import Nominatim
253
+ from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
254
+ import pandas as pd
255
+
256
+ nlp = spacy.load('en_core_web_sm')
257
+ geolocator = Nominatim(user_agent="my_agent")
258
+
259
+ def extract_and_geocode_locations(text, user_locations):
260
+ # Extract locations from text
261
+ doc = nlp(text)
262
+ extracted_locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
263
+
264
+ # Combine extracted locations with user-provided locations
265
+ all_locations = list(set(extracted_locations + user_locations.split(', ')))
266
+
267
+ geocoded_locations = []
268
+ for loc in all_locations:
269
+ try:
270
+ location = geolocator.geocode(loc)
271
+ if location:
272
+ geocoded_locations.append({
273
+ 'name': loc,
274
+ 'latitude': location.latitude,
275
+ 'longitude': location.longitude,
276
+ 'country': location.raw.get('display_name', '').split(', ')[-1]
277
+ })
278
+ else:
279
+ # If geocoding fails, add the location without coordinates
280
+ geocoded_locations.append({
281
+ 'name': loc,
282
+ 'latitude': None,
283
+ 'longitude': None,
284
+ 'country': None
285
+ })
286
+ except (GeocoderTimedOut, GeocoderUnavailable):
287
+ print(f"Geocoding failed for {loc}")
288
+ # Add the location without coordinates
289
+ geocoded_locations.append({
290
+ 'name': loc,
291
+ 'latitude': None,
292
+ 'longitude': None,
293
+ 'country': None
294
+ })
295
+
296
+ return geocoded_locations
297
+
298
+ def text_processing_for_location(row):
299
+ locations = extract_and_geocode_locations(row['Problem_Description'], row['Geographical_Location'])
300
+ location_text = ' '.join([loc['name'] for loc in locations])
301
+ processed_text = Lemmatize_text(location_text)
302
+ return processed_text, locations
303
 
304
  def extract_location_clusters(df,
305
  text_column='Processed_LocationText_forClustering',
 
346
 
347
 
348
 
 
349
 
350
 
351
+
352
+
353
+
354
+
355
+
356
+
357
+
358
+ # def Extract_Location(text):
359
+ # doc = nlp(text)
360
+ # locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
361
+ # return ' '.join(locations)
362
+
363
+ # def text_processing_for_location(text):
364
+ # # Extract locations
365
+ # locations_text = Extract_Location(text)
366
+
367
+ # # Perform further text cleaning if necessary
368
+ # processed_locations_text = Lemmatize_text(locations_text)
369
+
370
+ # # Remove special characters, digits, and punctuation
371
+ # processed_locations_text = re.sub(r'[^a-zA-Z\s]', '', processed_locations_text)
372
+
373
+ # # Tokenize and remove stopwords
374
+ # tokens = word_tokenize(processed_locations_text.lower())
375
+ # stop_words = set(stopwords.words('english'))
376
+ # tokens = [word for word in tokens if word not in stop_words]
377
+
378
+ # # Join location words into a single string
379
+ # final_locations_text = ' '.join(tokens)
380
+
381
+ # return final_locations_text if final_locations_text else "India"
382
+
383
+
384
+ # def extract_location_clusters(df,
385
+ # text_column='Processed_LocationText_forClustering',
386
+ # cluster_range=(3, 10),
387
+ # top_words=5):
388
+ # console_messages.append("Extracting Location Clusters...")
389
+
390
+ # # Sentence Transformers approach for embeddings
391
+ # model = SentenceTransformer('all-mpnet-base-v2')
392
+ # embeddings = model.encode(df[text_column].tolist())
393
+
394
+ # # Perform hierarchical clustering with Silhouette Analysis
395
+ # silhouette_scores = []
396
+ # for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
397
+ # clustering = AgglomerativeClustering(n_clusters=n_clusters)
398
+ # cluster_labels = clustering.fit_predict(embeddings)
399
+ # silhouette_avg = silhouette_score(embeddings, cluster_labels)
400
+ # silhouette_scores.append(silhouette_avg)
401
+
402
+ # # Determine the optimal number of clusters
403
+ # optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
404
+
405
+ # # Perform clustering with the optimal number of clusters
406
+ # clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
407
+ # cluster_labels = clustering.fit_predict(embeddings)
408
+
409
+ # # Get representative words for each cluster
410
+ # cluster_representations = {}
411
+ # for i in range(optimal_n_clusters):
412
+ # cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
413
+ # cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
414
+
415
+ # # Map cluster labels to representative words
416
+ # df["Location_Cluster"] = cluster_labels
417
+ # df['Location_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
418
+
419
+ # console_messages.append("Location Clustering completed.")
420
+ # return df, optimal_n_clusters
421
+
422
+
423
+
424
+
425
+
426
+
427
+
428
+
429
+
430
+
431
+
432
+
433
+
434
+
435
+
436
+
437
+
438
+
439
+
440
  def nlp_pipeline(original_df):
441
  console_messages.append("Starting NLP pipeline...")
442
 
 
463
 
464
 
465
  console_messages.append("Starting NLP pipeline for Location extraction with text processing.")
466
+
467
  # Apply the text_processing_for_location function to the DataFrame
468
+ # processed_df['Processed_LocationText_forClustering'] = processed_df['Problem_Description'].apply(text_processing_for_location)
469
+ processed_df['Processed_LocationText_forClustering'], processed_df['Extracted_Locations'] = zip(*processed_df.apply(text_processing_for_location, axis=1))
470
 
471
  # Location Clustering
472
  try:
 
476
  console_messages.append(f"Error in extract_location_clusters: {str(e)}")
477
  console_messages.append("NLP pipeline for location extraction completed.")
478
 
479
+
480
  console_messages.append("NLP pipeline completed.")
481
  return processed_df
482
+
 
 
483
 
484
 
485