Spaces:

ValadisCERTH
/

NaturalLanguageModule_complete

Runtime error

App Files Community

ValadisCERTH committed on May 9, 2023

Commit

7cd3150

1 Parent(s): b6076c4

Update countriesIdentification.py

Browse files

Files changed (1) hide show

countriesIdentification.py +3 -58

countriesIdentification.py CHANGED Viewed

@@ -8,18 +8,11 @@ from geotext import GeoText
 import re
-from transformers import BertTokenizer, BertModel
-import torch
 spacy.cli.download("en_core_web_lg")
 # Load the spacy model with GloVe embeddings
 nlp = spacy.load("en_core_web_lg")
-# load the pre-trained BERT tokenizer and model
-tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
-model = BertModel.from_pretrained('bert-base-cased')
 # Load valid city names from geonamescache
 gc = geonamescache.GeonamesCache()
@@ -267,48 +260,6 @@ def identify_loc_regex(sentence):
     return regex_locations
-def identify_loc_embeddings(sentence, countries, cities):
-    """
-    Identify cities and countries with the BERT pre-trained embeddings matching
-    """
-    embd_locations = []
-    # Define a list of country and city names (those are given by the geonamescache library before)
-    countries_cities = countries + cities
-    # Concatenate multi-word countries and cities into a single string
-    multiword_countries = [c.replace(' ', '_') for c in countries if ' ' in c]
-    multiword_cities = [c.replace(' ', '_') for c in cities if ' ' in c]
-    countries_cities += multiword_countries + multiword_cities
-    # Preprocess the input sentence
-    tokens = tokenizer.tokenize(sentence)
-    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
-    # Get the BERT embeddings for the input sentence
-    with torch.no_grad():
-        embeddings = model(input_ids)[0][0]
-    # Find the country and city names in the input sentence
-    for i in range(len(tokens)):
-        token = tokens[i]
-        if token in countries_cities:
-            embd_locations.append(token)
-        else:
-            word_vector = embeddings[i]
-            similarity_scores = torch.nn.functional.cosine_similarity(word_vector.unsqueeze(0), embeddings)
-            similar_tokens = [tokens[j] for j in similarity_scores.argsort(descending=True)[1:6]]
-            for word in similar_tokens:
-                if word in countries_cities and similarity_scores[tokens.index(word)] > 0.5:
-                    embd_locations.append(word)
-    # Convert back multi-word country and city names to original form
-    embd_locations = [loc.replace('_', ' ') if '_' in loc else loc for loc in embd_locations]
-    return embd_locations
 def multiple_country_city_identifications_solve(country_city_dict):
     """
@@ -580,19 +531,13 @@ def identify_locations(sentence):
         # flatten the regex list
         locations_flat_2 = list(flatten(locations))
-        # embeddings
-        locations_flat_2.append(identify_loc_embeddings(sentence, countries, cities))
-        # flatten the embeddings list
-        locations_flat_3 = list(flatten(locations))
         # Remove duplicates case-insensitively (e.g. a lowercase reference such as "italy" is valid, and "Italy" and "italy" count as the same location)
         # Lowercase the words and get their unique references using set()
-        loc_unique = set([loc.lower() for loc in locations_flat_3])
         # Create a new list of locations with initial capitalization, removing duplicates
         loc_capitalization = list(
-            set([loc.capitalize() if loc.lower() in loc_unique else loc.lower() for loc in locations_flat_3]))
         # That calculation checks whether there are substrings contained in another string. E.g. for the case of [timor leste, timor], it should remove "timor"
         if extra_serco_countries:
@@ -705,5 +650,5 @@ def identify_locations(sentence):
                 return (0, "LOCATION", "no_country")
     except:
-        # handle the exception if any errors occur while  identifying a country/city
         return (0, "LOCATION", "unknown_error")

 import re
 spacy.cli.download("en_core_web_lg")
 # Load the spacy model with GloVe embeddings
 nlp = spacy.load("en_core_web_lg")
 # Load valid city names from geonamescache
 gc = geonamescache.GeonamesCache()
     return regex_locations
 def multiple_country_city_identifications_solve(country_city_dict):
     """
         # flatten the regex list
         locations_flat_2 = list(flatten(locations))
         # Remove duplicates case-insensitively (e.g. a lowercase reference such as "italy" is valid, and "Italy" and "italy" count as the same location)
         # Lowercase the words and get their unique references using set()
+        loc_unique = set([loc.lower() for loc in locations_flat_2])
         # Create a new list of locations with initial capitalization, removing duplicates
         loc_capitalization = list(
+            set([loc.capitalize() if loc.lower() in loc_unique else loc.lower() for loc in locations_flat_2]))
         # That calculation checks whether there are substrings contained in another string. E.g. for the case of [timor leste, timor], it should remove "timor"
         if extra_serco_countries:
                 return (0, "LOCATION", "no_country")
     except:
+        # handle the exception if any errors occur while identifying a country/city
         return (0, "LOCATION", "unknown_error")