Spaces:

ValadisCERTH
/

CountriesCitiesModuleSerco

Sleeping

App Files Files Community

ValadisCERTH committed on Mar 30, 2023

Commit

8201122

1 Parent(s): 4ae82fa

Update helper.py

Browse files

Files changed (1) hide show

helper.py +77 -33

helper.py CHANGED Viewed

@@ -11,6 +11,7 @@ import re
 from transformers import BertTokenizer, BertModel
 import torch
 # initial loads
 # load the spacy model
@@ -339,50 +340,93 @@ def identify_locations(sentence):
       return (0, "LOCATION", "unknown_error")
-def identify_locations2(sentence):
     """
     Identify all the possible Country and City references in the given sentence, using different approaches in a hybrid manner
     """
     locations = []
-    # ner
-    locations.append(identify_loc_ner(sentence))
-    # geoparse libs
-    geoparse_list, countries, cities = identify_loc_geoparselibs(sentence)
-    locations.append(geoparse_list)
-    # flatten the geoparse list
-    locations_flat_1 = list(flatten(locations))
-    # regex
-    locations_flat_1.append(identify_loc_regex(sentence))
-    # flatten the regex list
-    locations_flat_2 = list(flatten(locations))
-    # embeddings
-    locations_flat_2.append(identify_loc_embeddings(sentence, countries, cities))
-    # flatten the embeddings list
-    locations_flat_3 = list(flatten(locations))
-    # remove duplicates while also taking under consideration capitalization (e.g. a reference of italy should be valid, while also a reference of Italy and italy)
-    # Lowercase the words and get their unique references using set()
-    loc_unique = set([loc.lower() for loc in locations_flat_3])
-    # Create a new list of locations with initial capitalization, removing duplicates
-    loc_capitalization = list(set([loc.capitalize() if loc.lower() in loc_unique else loc.lower() for loc in locations_flat_3]))
-    # validate that indeed each one of the countries/cities are indeed countries/cities
-    validated_locations = validate_locations(loc_capitalization)
-    # create a proper dictionary with country/city tags and the relevant entries as a result
-    locations_dict = {}
-    for location, loc_type in validated_locations:
-        if loc_type not in locations_dict:
-            locations_dict[loc_type] = []
-        locations_dict[loc_type].append(location)
-    return locations_dict

 from transformers import BertTokenizer, BertModel
 import torch
 # initial loads
 # load the spacy model
       return (0, "LOCATION", "unknown_error")
def _select_single_location(locations_dict):
    """
    Apply the "exactly one country, at most one city" rule to the grouped locations.

    Returns the (mutated) dictionary when the rule holds, otherwise an error
    tuple of the form (0, "LOCATION", <reason>).
    """
    # .get() instead of indexing: a missing 'country' key must be reported as
    # "no_country", not surface as a KeyError
    countries = locations_dict.get('country')
    if not countries:
        return (0, "LOCATION", "no_country")

    if 'city' not in locations_dict:
        # no city at all is acceptable, but the country must be unique
        if len(countries) != 1:
            return (0, "LOCATION", "more_country")
    elif len(countries) != 1 or len(locations_dict['city']) > 1:
        # a city entry exists: accept one country together with zero or one city
        return (0, "LOCATION", "more_city_or_country")

    # capitalize because there may be cases that it will return 'italy'
    countries[0] = countries[0].capitalize()
    return locations_dict


def identify_locations(sentence):
    """
    Identify all the possible Country and City references in the given sentence, using different approaches in a hybrid manner.

    The sentence is run through the NER, geoparsing, regex and embeddings
    helpers; their candidates are merged, de-duplicated regardless of
    capitalization, validated, and grouped by location type.

    Returns:
        dict: location type -> list of names (the types used here are
            'country' and 'city'), when exactly one country and at most
            one city were identified.
        tuple: (0, "LOCATION", reason) otherwise, where reason is one of
            "no_country", "more_country", "more_city_or_country" or
            "unknown_error" (any failure inside the pipeline).
    """
    try:
        # ner
        locations = [identify_loc_ner(sentence)]

        # geoparse libs
        geoparse_list, countries, cities = identify_loc_geoparselibs(sentence)
        locations.append(geoparse_list)

        # regex and embeddings: these must be appended to `locations` itself,
        # otherwise they are dropped when the list is flattened
        # NOTE(review): assumes both helpers return (possibly nested) lists of
        # strings, like the NER / geoparse helpers -- confirm against their definitions
        locations.append(identify_loc_regex(sentence))
        locations.append(identify_loc_embeddings(sentence, countries, cities))

        # flatten once, after every approach has contributed
        candidates = list(flatten(locations))

        # remove duplicates while ignoring capitalization (e.g. 'italy' and
        # 'Italy' collapse into the single entry 'Italy')
        unique_names = list({name.capitalize() for name in candidates})

        # validate that indeed each one of the countries/cities are indeed countries/cities
        validated_locations = validate_locations(unique_names)

        # create a proper dictionary with country/city tags and the relevant entries as a result
        locations_dict = {}
        for location, loc_type in validated_locations:
            locations_dict.setdefault(loc_type, []).append(location)

        # conditions for multiple references
        return _select_single_location(locations_dict)

    except Exception:
        # handle the exception if any errors occur while identifying a country/city;
        # log it so the root cause is not lost behind the generic error tuple
        import logging
        logging.getLogger(__name__).exception("identify_locations failed")
        return (0, "LOCATION", "unknown_error")