Spaces:

ValadisCERTH
/

CountriesCitiesModuleSerco

Sleeping

App Files Files Community

ValadisCERTH commited on Mar 23, 2023

Commit

2ae3a4d

1 Parent(s): 9186715

Create helper.py

Browse files

Files changed (1) hide show

helper.py +295 -0

helper.py ADDED Viewed

	@@ -0,0 +1,295 @@

+import spacy
+from geopy.geocoders import Nominatim
+import geonamescache
+import pycountry
+from geotext import GeoText
+import re
+from transformers import BertTokenizer, BertModel
+import torch
+# initial loads
+# load the spacy model
+spacy.cli.download("en_core_web_lg")
+nlp = spacy.load("en_core_web_lg")
+# load the pre-trained BERT tokenizer and model
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+model = BertModel.from_pretrained('bert-base-uncased')
+# Load valid city names from geonamescache
+gc = geonamescache.GeonamesCache()
+city_names = set([city['name'] for city in gc.get_cities().values()])
+def flatten(lst):
+    """
+    Define a helper function to flatten the list recursively
+    """
+    for item in lst:
+      if isinstance(item, list):
+          yield from flatten(item)
+      else:
+          yield item
+def is_country(reference):
+    """
+    Check if a given reference is a valid country name
+    """
+    try:
+        # use the pycountry library to verify if an input is a country
+        country = pycountry.countries.search_fuzzy(reference)[0]
+        return True
+    except LookupError:
+        return False
+def is_city(reference):
+    """
+    Check if the given reference is a valid city name
+    """
+    # Check if the reference is a valid city name
+    if reference in city_names:
+        return True
+    # Load the Nomatim (open street maps) api
+    geolocator = Nominatim(user_agent="certh_serco_validate_city_app")
+    location = geolocator.geocode(reference, language="en")
+    # If a reference is identified as a 'city', 'town', or 'village', then it is indeed a city
+    if location.raw['type'] in ['city', 'town', 'village']:
+        return True
+    # If a reference is identified as 'administrative' (e.g. administrative area),
+    # then we further examine if the retrieved info is a single token (meaning a country) or a series of tokens (meaning a city)
+    # that condition takes place to separate some cases where small cities were identified as administrative areas
+    elif location.raw['type'] == 'administrative':
+        if len(location.raw['display_name'].split(",")) > 1:
+            return True
+    return False
+def validate_locations(locations):
+    """
+    Validate that the identified references are indeed a Country and a City
+    """
+    validated_loc = []
+    for location in locations:
+        if is_city(location):
+            validated_loc.append((location, 'city'))
+        elif is_country(location):
+            validated_loc.append((location, 'country'))
+        else:
+            # Check if the location is a multi-word name
+            words = location.split()
+            if len(words) > 1:
+                # Try to find the country or city name among the words
+                for i in range(len(words)):
+                    name = ' '.join(words[i:])
+                    if is_country(name):
+                        validated_loc.append((name, 'country'))
+                        break
+                    elif is_city(name):
+                        validated_loc.append((name, 'city'))
+                        break
+    return validated_loc
+def identify_loc_ner(sentence):
+    """
+    Identify all the geopolitical and location entities with the spacy tool
+    """
+    doc = nlp(sentence)
+    ner_locations = []
+    # GPE and LOC are the labels for location entities in spaCy
+    for ent in doc.ents:
+        if ent.label_ in ['GPE', 'LOC']:
+            if len(ent.text.split()) > 1:
+                ner_locations.append(ent.text)
+            else:
+                for token in ent:
+                    if token.ent_type_ == 'GPE':
+                        ner_locations.append(ent.text)
+                        break
+    return ner_locations
+def identify_loc_geoparselibs(sentence):
+    """
+    Identify cities and countries with 3 different geoparsing libraries
+    """
+    geoparse_locations = []
+    # Geoparsing library 1
+    # Load geonames cache to check if a city name is valid
+    gc = geonamescache.GeonamesCache()
+    # Get a list of many countries/cities
+    countries = gc.get_countries()
+    cities = gc.get_cities()
+    city_names = [city['name'] for city in cities.values()]
+    country_names = [country['name'] for country in countries.values()]
+    # if any word sequence in our sentence is one of those countries/cities identify it
+    words = sentence.split()
+    for i in range(len(words)):
+        for j in range(i+1, len(words)+1):
+            word_seq = ' '.join(words[i:j])
+            if word_seq in city_names or word_seq in country_names:
+                geoparse_locations.append(word_seq)
+    # Geoparsing library 2
+    # similarly with the pycountry library
+    for country in pycountry.countries:
+        if country.name in sentence:
+            geoparse_locations.append(country.name)
+    # Geoparsing library 3
+    # similarly with the geotext library
+    places = GeoText(sentence)
+    cities = list(places.cities)
+    countries = list(places.countries)
+    if cities:
+        geoparse_locations += cities
+    if countries:
+        geoparse_locations += countries
+    return (geoparse_locations, countries, cities)
+def identify_loc_regex(sentence):
+    """
+    Identify cities and countries with regular expression matching
+    """
+    regex_locations = []
+    # Country references can be preceded by 'in', 'from' or 'of'
+    pattern = r"\b(in|from|of)\b\s([\w\s]+)"
+    additional_refs = re.findall(pattern, sentence)
+    for match in additional_refs:
+        regex_locations.append(match[1])
+    return regex_locations
+def identify_loc_embeddings(sentence, countries, cities):
+    """
+    Identify cities and countries with the BERT pre-trained embeddings matching
+    """
+    embd_locations = []
+    # Define a list of country and city names (those are given by the geonamescache library before)
+    countries_cities = countries + cities
+    # Concatenate multi-word countries and cities into a single string
+    multiword_countries = [c.replace(' ', '_') for c in countries if ' ' in c]
+    multiword_cities = [c.replace(' ', '_') for c in cities if ' ' in c]
+    countries_cities += multiword_countries + multiword_cities
+    # Preprocess the input sentence
+    tokens = tokenizer.tokenize(sentence)
+    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
+    # Get the BERT embeddings for the input sentence
+    with torch.no_grad():
+        embeddings = model(input_ids)[0][0]
+    # Find the country and city names in the input sentence
+    for i in range(len(tokens)):
+        token = tokens[i]
+        if token in countries_cities:
+            embd_locations.append(token)
+        else:
+            word_vector = embeddings[i]
+            similarity_scores = torch.nn.functional.cosine_similarity(word_vector.unsqueeze(0), embeddings)
+            similar_tokens = [tokens[j] for j in similarity_scores.argsort(descending=True)[1:6]]
+            for word in similar_tokens:
+                if word in countries_cities and similarity_scores[tokens.index(word)] > 0.5:
+                    embd_locations.append(word)
+    # Convert back multi-word country and city names to original form
+    embd_locations = [loc.replace('_', ' ') if '_' in loc else loc for loc in embd_locations]
+    return embd_locations
+def identify_locations(sentence):
+    """
+    Identify all the possible Country and City references in the given sentence, using different approaches in a hybrid manner
+    """
+    locations = []
+    # add all the identified country/cities results in a list
+    try:
+      # ner
+      locations.append(identify_loc_ner(sentence))
+      # geoparse libs
+      geoparse_list, countries, cities = identify_loc_geoparselibs(sentence)
+      locations.append(geoparse_list)
+      # flatten the geoparse list
+      locations_flat_1 = list(flatten(locations))
+      # regex
+      locations_flat_1.append(identify_loc_regex(sentence))
+      # flatten the regex list
+      locations_flat_2 = list(flatten(locations))
+      # embeddings
+      locations_flat_2.append(identify_loc_embeddings(sentence, countries, cities))
+      # flatten the embeddings list
+      locations_flat_3 = list(flatten(locations))
+      # acquire the unique country/city names (because it is possible that many different approaches will capture the same countries/cities)
+      flat_loc_list = set(locations_flat_3)
+      # validate that indeed each one of the countries/cities are indeed countries/cities
+      validated_locations = validate_locations(flat_loc_list)
+      # create a proper dictionary with country/city tags and the relevant entries as a result
+      locations_dict = {}
+      for location, loc_type in validated_locations:
+          if loc_type not in locations_dict:
+              locations_dict[loc_type] = []
+          locations_dict[loc_type].append(location)
+      return locations_dict
+    except:
+      # handle the exception if any errors occur while identifying a country/city
+      print(f"An error occurred while checking if a city or country exists")
+      return ""