Spaces:

ValadisCERTH
/

CountriesCitiesModuleSerco

Sleeping

App Files Files Community

ValadisCERTH committed on Mar 31, 2023

Commit

581a861

1 Parent(s): a7fdc57

Update helper.py

Browse files

Files changed (1) hide show

helper.py +130 -4

helper.py CHANGED Viewed

@@ -116,6 +116,7 @@ def validate_locations(locations):
     return validated_loc
 def identify_loc_ner(sentence):
     """
     Identify all the geopolitical and location entities with the spacy tool
@@ -140,6 +141,7 @@ def identify_loc_ner(sentence):
     return ner_locations
 def identify_loc_geoparselibs(sentence):
     """
     Identify cities and countries with 3 different geoparsing libraries
@@ -189,6 +191,7 @@ def identify_loc_geoparselibs(sentence):
     return (geoparse_locations, countries, cities)
 def identify_loc_regex(sentence):
     """
     Identify cities and countries with regular expression matching
@@ -206,6 +209,7 @@ def identify_loc_regex(sentence):
     return regex_locations
 def identify_loc_embeddings(sentence, countries, cities):
     """
     Identify cities and countries with the BERT pre-trained embeddings matching
@@ -248,6 +252,120 @@ def identify_loc_embeddings(sentence, countries, cities):
     return embd_locations
 def identify_locations(sentence):
     """
     Identify all the possible Country and City references in the given sentence, using different approaches in a hybrid manner
@@ -257,6 +375,9 @@ def identify_locations(sentence):
     try:
       # ner
       locations.append(identify_loc_ner(sentence))
@@ -290,12 +411,17 @@ def identify_locations(sentence):
       validated_locations = validate_locations(loc_capitalization)
       # create a proper dictionary with country/city tags and the relevant entries as a result
-      locations_dict = {}
       for location, loc_type in validated_locations:
-          if loc_type not in locations_dict:
-              locations_dict[loc_type] = []
-          locations_dict[loc_type].append(location)
       # conditions for multiple references
       # it is mandatory that a country will exist

     return validated_loc
 def identify_loc_ner(sentence):
     """
     Identify all the geopolitical and location entities with the spacy tool
     return ner_locations
 def identify_loc_geoparselibs(sentence):
     """
     Identify cities and countries with 3 different geoparsing libraries
     return (geoparse_locations, countries, cities)
 def identify_loc_regex(sentence):
     """
     Identify cities and countries with regular expression matching
     return regex_locations
 def identify_loc_embeddings(sentence, countries, cities):
     """
     Identify cities and countries with the BERT pre-trained embeddings matching
     return embd_locations
def _drop_contained_names(names):
    """
    Return the entries of `names` that are not contained in another entry.

    The comparison is case-insensitive and works in two steps:
      1. case-insensitive duplicates are collapsed, keeping the first occurrence
      2. every remaining name that appears as a substring of a different remaining
         name is dropped

    Note that `names` is sorted in place (longest first, stable), so the returned
    list is ordered from the longest to the shortest name.
    """
    # longest first, so that the most complete reference of a location is the one kept
    names.sort(key=len, reverse=True)

    # step 1: collapse duplicates. This has to happen before the substring check,
    # otherwise two identical names would contain each other and both would be dropped
    unique_names = []
    seen_lowered = set()
    for name in names:
        lowered = name.lower()
        if lowered not in seen_lowered:
            seen_lowered.add(lowered)
            unique_names.append(name)

    # step 2: drop every name that is a substring of a different name
    lowered_names = [name.lower() for name in unique_names]
    return [
        name
        for idx, name in enumerate(unique_names)
        if not any(
            lowered_names[idx] in other
            for other_idx, other in enumerate(lowered_names)
            if other_idx != idx
        )
    ]


def multiple_country_city_identifications_solve(country_city_dict):
    """
    Solve the appearance of multiple identifications of the same country or city.

    For the 'country' and the 'city' entries of the input dictionary, any element that
    exists (case-insensitively) as a substring inside a longer element of the same entry
    is deleted, and exact duplicates are collapsed. In that sense, a dictionary of the sort
    {'city': ['Port moresby', 'Port'], 'country': ['Guinea', 'Papua new guinea']} will be converted into
    {'city': ['Port moresby'], 'country': ['Papua new guinea']}.
    The reason for this function is that such inconsistencies were identified during
    country/city identification, probably relevant to the geoparsing libraries in use.

    Only entries with more than one element are processed; the rest are left untouched.
    The input dictionary is updated in place and is also the object that is returned.

    NOTE: the matching is a plain substring check, not a whole-word check, so e.g. 'Niger'
    would be removed when 'Nigeria' is present in the same entry.

    Returns:
      - the (updated) input dictionary, when it is not empty
      - None, when the input dictionary is empty
      - the tuple (0, "LOCATION", "unknown_error"), when the input cannot be processed
        (e.g. it is not a dictionary, or an entry is not a list of strings)
    """
    try:
        # to avoid examining any element in any case, first find which of the two entries
        # exist and hold more than one element (which is the target case for us)
        entries_to_clean = [
            loc_type
            for loc_type in ('country', 'city')
            if loc_type in country_city_dict and len(country_city_dict[loc_type]) > 1
        ]

        # cope with country references first and city references afterwards
        for loc_type in entries_to_clean:
            country_city_dict[loc_type] = _drop_contained_names(country_city_dict[loc_type])

        # return the final dictionary (an empty dictionary results in None)
        if country_city_dict:
            return country_city_dict
        return None

    except Exception:
        # malformed input is reported with the error tuple instead of being raised;
        # KeyboardInterrupt / SystemExit are deliberately not intercepted
        return (0, "LOCATION", "unknown_error")
 def identify_locations(sentence):
     """
     Identify all the possible Country and City references in the given sentence, using different approaches in a hybrid manner
     try:
+      # # # this is because there were cases where a city followed by a comma was not understood by the system
+      sentence = sentence.replace(",", " x$x ")
       # ner
       locations.append(identify_loc_ner(sentence))
       validated_locations = validate_locations(loc_capitalization)
       # create a proper dictionary with country/city tags and the relevant entries as a result
+      loc_dict = {}
       for location, loc_type in validated_locations:
+          if loc_type not in loc_dict:
+              loc_dict[loc_type] = []
+          loc_dict[loc_type].append(location)
+      # bring sentence on previous form
+      sentence = sentence.replace(" x$x ",",")
+      # cope with cases of iterative country or city reference due to geoparse lib issues
+      locations_dict = multiple_country_city_identifications_solve(loc_dict)
       # conditions for multiple references
       # it is mandatory that a country will exist