ValadisCERTH committed on
Commit
8201122
·
1 Parent(s): 4ae82fa

Update helper.py

Browse files
Files changed (1) hide show
  1. helper.py +77 -33
helper.py CHANGED
@@ -11,6 +11,7 @@ import re
11
  from transformers import BertTokenizer, BertModel
12
  import torch
13
 
 
14
  # initial loads
15
 
16
  # load the spacy model
@@ -339,50 +340,93 @@ def identify_locations(sentence):
339
  return (0, "LOCATION", "unknown_error")
340
 
341
 
342
def identify_locations2(sentence):
    """
    Identify all the possible Country and City references in the given sentence, using different approaches in a hybrid manner

    The sentence is passed through the NER, geoparse, regex and embeddings helpers,
    the combined candidates are de-duplicated case-insensitively and then checked
    by validate_locations.

    Returns a dictionary that maps every location type reported by
    validate_locations to the list of validated locations of that type.
    """

    # ner
    ner_hits = identify_loc_ner(sentence)

    # geoparse libs (also yields the country / city candidates handed to the embeddings step)
    geoparse_list, countries, cities = identify_loc_geoparselibs(sentence)

    # regex
    regex_hits = identify_loc_regex(sentence)

    # embeddings
    embedding_hits = identify_loc_embeddings(sentence, countries, cities)

    # Flatten once, after every approach has contributed. Previously the regex and
    # embeddings results were appended to intermediate lists that were never read
    # again, so only the NER and geoparse results reached the validation step.
    # NOTE(review): assumes the regex / embeddings helpers return strings or nested
    # lists of strings, like the NER and geoparse helpers - confirm.
    locations_flat = list(flatten([ner_hits, geoparse_list, regex_hits, embedding_hits]))

    # remove duplicates regardless of capitalization: str.capitalize() lowercases
    # everything after the first character, so 'italy' and 'Italy' collapse into 'Italy'
    loc_capitalization = list({loc.capitalize() for loc in locations_flat})

    # validate that indeed each one of the countries/cities are indeed countries/cities
    validated_locations = validate_locations(loc_capitalization)

    # create a proper dictionary with country/city tags and the relevant entries as a result
    locations_dict = {}
    for location, loc_type in validated_locations:
        locations_dict.setdefault(loc_type, []).append(location)

    return locations_dict
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  from transformers import BertTokenizer, BertModel
12
  import torch
13
 
14
+
15
  # initial loads
16
 
17
  # load the spacy model
 
340
  return (0, "LOCATION", "unknown_error")
341
 
342
 
343
def identify_locations(sentence):
    """
    Identify all the possible Country and City references in the given sentence, using different approaches in a hybrid manner

    The sentence is passed through the NER, geoparse, regex and embeddings helpers,
    the combined candidates are de-duplicated case-insensitively and then checked
    by validate_locations.

    Returns the dictionary of validated locations (location type -> list of names)
    when exactly one country and at most one city were found. Otherwise returns an
    error tuple of the form (0, "LOCATION", reason), where reason is one of:
      - "no_country": no country was validated
      - "more_country": several countries and no city were validated
      - "more_city_or_country": a city was validated and there are several
        countries and/or several cities
      - "unknown_error": any exception raised while identifying the locations
    """

    try:

        # ner
        ner_hits = identify_loc_ner(sentence)

        # geoparse libs (also yields the country / city candidates handed to the embeddings step)
        geoparse_list, countries, cities = identify_loc_geoparselibs(sentence)

        # regex
        regex_hits = identify_loc_regex(sentence)

        # embeddings
        embedding_hits = identify_loc_embeddings(sentence, countries, cities)

        # Flatten once, after every approach has contributed. Previously the regex and
        # embeddings results were appended to intermediate lists that were never read
        # again, so only the NER and geoparse results reached the validation step.
        # NOTE(review): assumes the regex / embeddings helpers return strings or nested
        # lists of strings, like the NER and geoparse helpers - confirm.
        locations_flat = list(flatten([ner_hits, geoparse_list, regex_hits, embedding_hits]))

        # remove duplicates regardless of capitalization: str.capitalize() lowercases
        # everything after the first character, so 'italy' and 'Italy' collapse into 'Italy'
        loc_capitalization = list({loc.capitalize() for loc in locations_flat})

        # validate that indeed each one of the countries/cities are indeed countries/cities
        validated_locations = validate_locations(loc_capitalization)

        # create a proper dictionary with country/city tags and the relevant entries as a result
        locations_dict = {}
        for location, loc_type in validated_locations:
            locations_dict.setdefault(loc_type, []).append(location)

        # conditions for multiple references
        # it is mandatory that a country will exist; .get() is required here because
        # indexing a missing 'country' key raises KeyError, which used to be reported
        # as "unknown_error" instead of "no_country"
        found_countries = locations_dict.get('country')
        if not found_countries:
            return (0, "LOCATION", "no_country")

        if 'city' in locations_dict:
            # with a city present we accept exactly one country and one city
            if len(found_countries) != 1 or len(locations_dict['city']) > 1:
                return (0, "LOCATION", "more_city_or_country")
        elif len(found_countries) != 1:
            # without a city we only accept one country
            return (0, "LOCATION", "more_country")

        # capitalize because there may be cases that it will return 'italy'
        found_countries[0] = found_countries[0].capitalize()
        return locations_dict

    except Exception:
        # handle the exception if any errors occur while identifying a country/city
        # (Exception rather than a bare except, so KeyboardInterrupt / SystemExit still propagate)
        return (0, "LOCATION", "unknown_error")