SantanuBanerjee committed
Commit 89ee992 · verified · 1 parent: 31a5c30

Update app.py

Files changed (1): app.py (+34 −134)
app.py CHANGED
@@ -248,58 +248,36 @@ def extract_problem_domains(df,
 
 
 
-import spacy
-from geopy.geocoders import Nominatim
-from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
-import pandas as pd
-
-nlp = spacy.load('en_core_web_sm')
-geolocator = Nominatim(user_agent="my_agent")
-
-def extract_and_geocode_locations(text, user_locations):
-    # Extract locations from text
+def Extract_Location(text):
     doc = nlp(text)
-    extracted_locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
-
-    # Combine extracted locations with user-provided locations
-    all_locations = list(set(extracted_locations + user_locations.split(', ')))
-
-    geocoded_locations = []
-    for loc in all_locations:
-        try:
-            location = geolocator.geocode(loc)
-            if location:
-                geocoded_locations.append({
-                    'name': loc,
-                    'latitude': location.latitude,
-                    'longitude': location.longitude,
-                    'country': location.raw.get('display_name', '').split(', ')[-1]
-                })
-            else:
-                # If geocoding fails, add the location without coordinates
-                geocoded_locations.append({
-                    'name': loc,
-                    'latitude': None,
-                    'longitude': None,
-                    'country': None
-                })
-        except (GeocoderTimedOut, GeocoderUnavailable):
-            print(f"Geocoding failed for {loc}")
-            # Add the location without coordinates
-            geocoded_locations.append({
-                'name': loc,
-                'latitude': None,
-                'longitude': None,
-                'country': None
-            })
-
-    return geocoded_locations
-
-def text_processing_for_location(row):
-    locations = extract_and_geocode_locations(row['Problem_Description'], row['Geographical_Location'])
-    location_text = ' '.join([loc['name'] for loc in locations])
-    processed_text = Lemmatize_text(location_text)
-    return processed_text, locations
+    locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
+    return ' '.join(locations)
+
+def text_processing_for_location(text):
+    # Extract locations
+    locations_text = Extract_Location(text)
+
+    # Perform further text cleaning if necessary
+    processed_locations_text = Lemmatize_text(locations_text)
+
+    # Remove special characters, digits, and punctuation
+    processed_locations_text = re.sub(r'[^a-zA-Z\s]', '', processed_locations_text)
+
+    # Tokenize and remove stopwords
+    tokens = word_tokenize(processed_locations_text.lower())
+    stop_words = set(stopwords.words('english'))
+    tokens = [word for word in tokens if word not in stop_words]
+
+    # Join location words into a single string
+    final_locations_text = ' '.join(tokens)
+
+    return final_locations_text if final_locations_text else "India"
 
 def extract_location_clusters(df,
                               text_column='Processed_LocationText_forClustering',
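
This hunk swaps the geopy-backed geocoding pipeline for plain spaCy entity extraction. For context, a minimal runnable sketch of the new behavior (the sample sentence is an illustrative assumption; en_core_web_sm must be installed):

    import spacy

    nlp = spacy.load('en_core_web_sm')

    def Extract_Location(text):
        doc = nlp(text)
        locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
        return ' '.join(locations)

    # GPE/LOC entities are returned as plain strings; unlike the removed geopy
    # version, no coordinates are looked up and no network calls are made.
    print(Extract_Location("Flooding displaced families in Mumbai and across rural India."))
    # e.g. "Mumbai India" (exact output depends on the model version)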
@@ -355,86 +333,6 @@ def extract_location_clusters(df,
 
 
 
-# def Extract_Location(text):
-#     doc = nlp(text)
-#     locations = [ent.text for ent in doc.ents if ent.label_ in ['GPE', 'LOC']]
-#     return ' '.join(locations)
-
-# def text_processing_for_location(text):
-#     # Extract locations
-#     locations_text = Extract_Location(text)
-
-#     # Perform further text cleaning if necessary
-#     processed_locations_text = Lemmatize_text(locations_text)
-
-#     # Remove special characters, digits, and punctuation
-#     processed_locations_text = re.sub(r'[^a-zA-Z\s]', '', processed_locations_text)
-
-#     # Tokenize and remove stopwords
-#     tokens = word_tokenize(processed_locations_text.lower())
-#     stop_words = set(stopwords.words('english'))
-#     tokens = [word for word in tokens if word not in stop_words]
-
-#     # Join location words into a single string
-#     final_locations_text = ' '.join(tokens)
-
-#     return final_locations_text if final_locations_text else "India"
-
-
-# def extract_location_clusters(df,
-#                               text_column='Processed_LocationText_forClustering',
-#                               cluster_range=(3, 10),
-#                               top_words=5):
-#     console_messages.append("Extracting Location Clusters...")
-
-#     # Sentence Transformers approach for embeddings
-#     model = SentenceTransformer('all-mpnet-base-v2')
-#     embeddings = model.encode(df[text_column].tolist())
-
-#     # Perform hierarchical clustering with Silhouette Analysis
-#     silhouette_scores = []
-#     for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
-#         clustering = AgglomerativeClustering(n_clusters=n_clusters)
-#         cluster_labels = clustering.fit_predict(embeddings)
-#         silhouette_avg = silhouette_score(embeddings, cluster_labels)
-#         silhouette_scores.append(silhouette_avg)
-
-#     # Determine the optimal number of clusters
-#     optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-
-#     # Perform clustering with the optimal number of clusters
-#     clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
-#     cluster_labels = clustering.fit_predict(embeddings)
-
-#     # Get representative words for each cluster
-#     cluster_representations = {}
-#     for i in range(optimal_n_clusters):
-#         cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
-#         cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
-
-#     # Map cluster labels to representative words
-#     df["Location_Cluster"] = cluster_labels
-#     df['Location_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
-
-#     console_messages.append("Location Clustering completed.")
-#     return df, optimal_n_clusters
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 
 
  def nlp_pipeline(original_df):
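
The comment block deleted above mirrors the silhouette-driven cluster-count selection still used by the live extract_location_clusters. A condensed, runnable sketch of that technique (the sample texts and the narrower cluster_range are illustrative assumptions; app.py itself uses cluster_range=(3, 10)):

    from sentence_transformers import SentenceTransformer
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.metrics import silhouette_score

    texts = ["mumbai india", "delhi india", "berlin germany",
             "paris france", "chennai india", "lyon france"]
    embeddings = SentenceTransformer('all-mpnet-base-v2').encode(texts)

    # Try each candidate cluster count and keep the one with the best
    # silhouette score; (2, 4) keeps this toy six-sample example valid.
    cluster_range = (2, 4)
    scores = [silhouette_score(embeddings,
                               AgglomerativeClustering(n_clusters=n).fit_predict(embeddings))
              for n in range(cluster_range[0], cluster_range[1] + 1)]
    optimal_n = cluster_range[0] + scores.index(max(scores))

    labels = AgglomerativeClustering(n_clusters=optimal_n).fit_predict(embeddings)
    print(optimal_n, labels)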
@@ -519,14 +417,16 @@ def process_excel(file):
 
 
 
+example_files = ['#TaxDirection (Responses)_BasicExample.xlsx',
+                 '#TaxDirection (Responses)_IntermediateExample.xlsx',
+                 '#TaxDirection (Responses)_UltimateExample.xlsx'
+                 ]
+
 # example_files = ['#TaxDirection (Responses)_BasicExample.xlsx',
 #                  '#TaxDirection (Responses)_IntermediateExample.xlsx',
-#                  '#TaxDirection (Responses)_UltimateExample.xlsx'
-#                  ]
+#                  ]
 
-example_files = ['#TaxDirection (Responses)_BasicExample.xlsx',
-                 '#TaxDirection (Responses)_IntermediateExample.xlsx',
-                 ]
+# example_files = ['#TaxDirection (Responses)_BasicExample.xlsx',]
 
 
 import random
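
With the Ultimate workbook restored to the example list, a hypothetical sanity check that each file loads (pandas with openpyxl assumed installed; the sheet layout is not taken from app.py):

    import pandas as pd

    example_files = ['#TaxDirection (Responses)_BasicExample.xlsx',
                     '#TaxDirection (Responses)_IntermediateExample.xlsx',
                     '#TaxDirection (Responses)_UltimateExample.xlsx']

    for path in example_files:
        df = pd.read_excel(path)  # reads the first sheet by default
        print(path, df.shape)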
 