Commit 65f09a7 (parent: 2be1d3d)
Create countriesIdentification

countriesIdentification · ADDED · +706 -0
@@ -0,0 +1,706 @@
import spacy

from geopy.geocoders import Nominatim
import geonamescache
import pycountry

from geotext import GeoText

import re

from transformers import BertTokenizer, BertModel
import torch


# Load the spaCy model with GloVe word embeddings
nlp = spacy.load("en_core_web_lg")

# Load the pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained('bert-base-cased')

# Load valid city names from geonamescache
gc = geonamescache.GeonamesCache()

# There is a bug with geonamescache where some countries also appear as cities (e.g. Albania),
# so below we exclude names that appear in both from each set.

# Get a set of all country names
original_countries = set(country['name'] for country in gc.get_countries().values())

# Get a set of all the original city names
original_cities = set(city['name'] for city in gc.get_cities().values())

# Get a set of all country names, excluding names that also appear as city names
country_names = set(
    country['name'] for country in gc.get_countries().values() if country['name'] not in original_cities)

# We also add the following cases (and their capitalization variants) because they were requested by SERCO
country_names.add("Guinea Bissau")
country_names.add("Guinea bissau")
country_names.add("guinea Bissau")
country_names.add("guinea bissau")
country_names.add("Timor Leste")
country_names.add("Timor leste")
country_names.add("timor Leste")
country_names.add("timor leste")
country_names.add("UAE")
country_names.add("uae")
country_names.add("Uae")
country_names.add("Uk")
country_names.add("uK")
country_names.add("uk")
country_names.add("USa")
country_names.add("Usa")
country_names.add("usa")
country_names.add("uSa")
country_names.add("usA")
country_names.add("uSA")
country_names.add("Palestine")

# Get a set of all city names, excluding country names
city_names = set(city['name'] for city in gc.get_cities().values() if city['name'] not in original_countries)

city_names.add("Puebla de sanabria")

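
# Illustrative sketch of how these sets behave (actual contents depend on the installed
# geonamescache data; the specific values below are expectations, not guaranteed entries):
#   "Italy" in country_names          -> True (expected)
#   "Rome" in city_names              -> True (expected)
#   "Guinea Bissau" in country_names  -> True (added explicitly above)
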
def flatten(lst):
    """
    Helper function to flatten a nested list recursively.
    """

    for item in lst:
        if isinstance(item, list):
            yield from flatten(item)
        else:
            yield item

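
# Illustrative sketch: flatten() merges the nested candidate lists produced by the steps below
# into a single flat list, e.g.
#   list(flatten([["Rome", "Italy"], "Paris", [["Berlin"]]]))
#   -> ['Rome', 'Italy', 'Paris', 'Berlin']
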
def is_country(reference):
    """
    Check if a given reference is a valid country name.
    """
    try:
        # Check if the reference is a known country name from the sets built above
        if reference in country_names:
            return True

        else:
            # If not, use the pycountry library to verify whether the input is a country
            country = pycountry.countries.search_fuzzy(reference)[0]

            temp_country_names = []

            if country:
                if hasattr(country, 'name') or hasattr(country, 'official_name') or hasattr(country, 'common_name'):

                    if hasattr(country, 'official_name'):
                        temp_country_names.append(country.official_name.lower())
                    if hasattr(country, 'name'):
                        temp_country_names.append(country.name.lower())
                    if hasattr(country, 'common_name'):
                        temp_country_names.append(country.common_name.lower())
                    if any(reference.lower() == elem for elem in temp_country_names):
                        return True

        return False

    except LookupError:
        return False

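
# Illustrative sketch (results depend on the installed geonamescache/pycountry data):
#   is_country("Italy")    -> True, since it appears in country_names
#   is_country("Atlantis") -> False, since pycountry's fuzzy search raises LookupError
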
def is_city(reference):
    """
    Check if a given reference is a valid city name.
    """

    reference = reference.replace("x$x", "").strip()

    # Check if the reference is a valid city name
    if reference in city_names:
        return True

    # Query the Nominatim (OpenStreetMap) geocoding API
    geolocator = Nominatim(user_agent="certh_serco_validate_city_app")
    location = geolocator.geocode(reference, language="en", timeout=10)

    # If Nominatim cannot geocode the reference at all, it is not a city
    if location is None:
        return False

    # If a reference is identified as a 'city', 'town', or 'village', then it is indeed a city
    if location.raw['type'] in ['city', 'town', 'village']:
        return True

    # If a reference is identified as 'administrative' (e.g. an administrative area), we further examine
    # whether the retrieved display name consists of a single token (meaning a country) or a series of
    # tokens (meaning a city); this condition separates small cities that are labelled as administrative areas
    elif location.raw['type'] == 'administrative':

        if len(location.raw['display_name'].split(",")) > 1:
            return True

    return False

def validate_locations(locations):
    """
    Validate that the identified references are indeed a country or a city.
    """

    validated_loc = []

    for location in locations:

        # validate whether it is a country
        if is_country(location):
            validated_loc.append((location, 'country'))

        # validate whether it is a city
        elif is_city(location):
            validated_loc.append((location, 'city'))

        else:
            # Check if the location is a multi-word name
            words = location.split()
            if len(words) > 1:

                # Try to find the country or city name among the words
                for i in range(len(words)):
                    name = ' '.join(words[i:])

                    if is_country(name):
                        validated_loc.append((name, 'country'))
                        break

                    elif is_city(name):
                        validated_loc.append((name, 'city'))
                        break

    return validated_loc

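
# Illustrative sketch (is_city() queries the Nominatim web service, so the outcome depends on
# network access and the service response):
#   validate_locations(["Italy", "Rome"]) would be expected to return
#   [('Italy', 'country'), ('Rome', 'city')]
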
def identify_loc_ner(sentence):
    """
    Identify all the geopolitical and location entities with the spaCy NER tool.
    """

    doc = nlp(sentence)

    ner_locations = []

    # GPE and LOC are the labels for location entities in spaCy
    for ent in doc.ents:
        if ent.label_ in ['GPE', 'LOC']:

            if len(ent.text.split()) > 1:
                ner_locations.append(ent.text)
            else:
                for token in ent:
                    if token.ent_type_ == 'GPE':
                        ner_locations.append(ent.text)
                        break

    return ner_locations

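
# Illustrative sketch (the exact entities depend on the spaCy model version):
#   identify_loc_ner("I moved from Paris to Rome last year") would typically return
#   ['Paris', 'Rome'] when both are tagged as GPE entities
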
def identify_loc_geoparselibs(sentence):
    """
    Identify cities and countries with three different geoparsing libraries.
    """

    geoparse_locations = []

    # Geoparsing library 1: geonamescache

    # Load the geonames cache to check if a city name is valid
    gc = geonamescache.GeonamesCache()

    # Get the dictionaries of known countries/cities
    countries = gc.get_countries()
    cities = gc.get_cities()

    city_names = [city['name'] for city in cities.values()]
    country_names = [country['name'] for country in countries.values()]

    # If any word sequence in the sentence is one of those countries/cities, record it
    words = sentence.split()
    for i in range(len(words)):
        for j in range(i + 1, len(words) + 1):
            word_seq = ' '.join(words[i:j])
            if word_seq in city_names or word_seq in country_names:
                geoparse_locations.append(word_seq)

    # Geoparsing library 2: pycountry

    # Similarly with the pycountry library
    for country in pycountry.countries:
        if country.name in sentence:
            geoparse_locations.append(country.name)

    # Geoparsing library 3: geotext

    # Similarly with the geotext library
    places = GeoText(sentence)
    cities = list(places.cities)
    countries = list(places.countries)

    if cities:
        geoparse_locations += cities
    if countries:
        geoparse_locations += countries

    return (geoparse_locations, countries, cities)

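
# Illustrative sketch (exact candidates depend on the geonamescache/geotext data):
#   geoparse_list, geo_countries, geo_cities = identify_loc_geoparselibs("She flew to Lisbon in Portugal")
#   geoparse_list would typically contain 'Lisbon' and 'Portugal' (possibly repeated; duplicates are
#   removed later), while geo_countries and geo_cities are the GeoText lists reused by the embedding step.
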
def identify_loc_regex(sentence):
    """
    Identify cities and countries with regular expression matching.
    """

    regex_locations = []

    # Country and city references can be preceded by 'in', 'from' or 'of'
    pattern = r"\b(in|from|of)\b\s([\w\s]+)"
    additional_refs = re.findall(pattern, sentence)

    for match in additional_refs:
        regex_locations.append(match[1])

    return regex_locations

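
# Illustrative sketch: the pattern captures the text that follows 'in', 'from' or 'of', e.g.
#   identify_loc_regex("She lives in Paris")  -> ['Paris']
#   identify_loc_regex("A report from Kenya") -> ['Kenya']
# Because "[\w\s]+" is greedy, trailing words after the place name may also be captured;
# the later validation step filters such candidates out.
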
def identify_loc_embeddings(sentence, countries, cities):
    """
    Identify cities and countries by matching against the pre-trained BERT embeddings.
    """

    embd_locations = []

    # Build a list of country and city names (provided earlier by the geoparsing step)
    countries_cities = countries + cities

    # Concatenate multi-word countries and cities into a single string
    multiword_countries = [c.replace(' ', '_') for c in countries if ' ' in c]
    multiword_cities = [c.replace(' ', '_') for c in cities if ' ' in c]
    countries_cities += multiword_countries + multiword_cities

    # Preprocess the input sentence
    tokens = tokenizer.tokenize(sentence)
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

    # Get the BERT embeddings for the input sentence
    with torch.no_grad():
        embeddings = model(input_ids)[0][0]

    # Find the country and city names in the input sentence
    for i in range(len(tokens)):
        token = tokens[i]
        if token in countries_cities:
            embd_locations.append(token)
        else:
            word_vector = embeddings[i]
            similarity_scores = torch.nn.functional.cosine_similarity(word_vector.unsqueeze(0), embeddings)
            similar_tokens = [tokens[j] for j in similarity_scores.argsort(descending=True)[1:6]]
            for word in similar_tokens:
                if word in countries_cities and similarity_scores[tokens.index(word)] > 0.5:
                    embd_locations.append(word)

    # Convert multi-word country and city names back to their original form
    embd_locations = [loc.replace('_', ' ') if '_' in loc else loc for loc in embd_locations]

    return embd_locations

def multiple_country_city_identifications_solve(country_city_dict):
    """
    Resolve multiple overlapping identifications of countries and cities.

    The function checks all the elements of the input dictionary and, if any shorter element exists as a
    substring of a longer element, it deletes the shorter one. In that sense, a dictionary of the sort
    {'city': ['Port moresby', 'Port'], 'country': ['Guinea', 'Papua new guinea']} will be converted into
    {'city': ['Port moresby'], 'country': ['Papua new guinea']}.

    The reason for this function is that such inconsistencies were identified during country/city
    identification, probably related to the geoparsing libraries in use.
    """

    try:

        country_flag = False
        city_flag = False

        # To avoid examining elements needlessly, we check that both a country and a city exist in the
        # input dictionary and that they have more than one entry (which is the target case for us)
        if 'country' in country_city_dict:
            if len(country_city_dict['country']) > 1:
                country_flag = True

        if 'city' in country_city_dict:
            if len(country_city_dict['city']) > 1:
                city_flag = True

        # First handle multiple overlapping country references
        if country_flag:

            # Sort the countries by length, longest first
            country_city_dict['country'].sort(key=lambda x: len(x), reverse=True)

            # Create a new list of countries that don't contain any substrings
            cleaned_countries = []
            for i in range(len(country_city_dict['country'])):
                is_substring = False
                for j in range(len(cleaned_countries)):
                    if country_city_dict['country'][i].lower().find(cleaned_countries[j].lower()) != -1:
                        # If the i-th country contains an already-cleaned country, skip it
                        is_substring = True
                        break
                if not is_substring:
                    cleaned_countries.append(country_city_dict['country'][i])

            # Replace the original list of countries with the cleaned one
            country_city_dict['country'] = cleaned_countries

            # Create a new list of countries that are not substrings of other countries
            final_countries = []
            for i in range(len(country_city_dict['country'])):
                is_superstring = False
                for j in range(len(country_city_dict['country'])):
                    if i == j:
                        continue
                    if country_city_dict['country'][j].lower().find(country_city_dict['country'][i].lower()) != -1:
                        # If the i-th country is a substring of a different country, skip it
                        is_superstring = True
                        break
                if not is_superstring:
                    final_countries.append(country_city_dict['country'][i])

            # Replace the original list of countries with the final one
            country_city_dict['country'] = final_countries

        # Then handle multiple overlapping city references
        if city_flag:

            # Sort the cities by length, longest first
            country_city_dict['city'].sort(key=lambda x: len(x), reverse=True)

            # Create a new list of cities that don't contain any substrings
            cleaned_cities = []
            for i in range(len(country_city_dict['city'])):
                is_substring = False
                for j in range(len(cleaned_cities)):
                    if country_city_dict['city'][i].lower().find(cleaned_cities[j].lower()) != -1:
                        # If the i-th city contains an already-cleaned city, skip it
                        is_substring = True
                        break
                if not is_substring:
                    cleaned_cities.append(country_city_dict['city'][i])

            # Replace the original list of cities with the cleaned one
            country_city_dict['city'] = cleaned_cities

            # Create a new list of cities that are not substrings of other cities
            final_cities = []
            for i in range(len(country_city_dict['city'])):
                is_superstring = False
                for j in range(len(country_city_dict['city'])):
                    if i == j:
                        continue
                    if country_city_dict['city'][j].lower().find(country_city_dict['city'][i].lower()) != -1:
                        # If the i-th city is a substring of a different city, skip it
                        is_superstring = True
                        break
                if not is_superstring:
                    final_cities.append(country_city_dict['city'][i])

            # Replace the original list of cities with the final one
            country_city_dict['city'] = final_cities

        # Return the final dictionary
        if country_city_dict:
            return country_city_dict

    except:
        return (0, "LOCATION", "unknown_error")

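
# Illustrative sketch, using the example from the docstring above:
#   multiple_country_city_identifications_solve(
#       {'city': ['Port moresby', 'Port'], 'country': ['Guinea', 'Papua new guinea']})
#   -> {'city': ['Port moresby'], 'country': ['Papua new guinea']}
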
def helper_resolve_cities(sentence, locations):
    """
    Verify that a captured city does not merely repeat the captured country. If it does, delete it, unless
    there is also a second reference in the original sentence (which might be the case of a city whose name
    is similar to, or a substring of, a country name).
    """

    if 'country' in locations and 'city' in locations:

        # Check if any city names are also present in the corresponding country name
        for country in locations['country']:
            for city in locations['city']:

                if city.lower() in country.lower():
                    # If the city name is found in the country name, check how many times it appears in the sentence
                    city_count = len(re.findall(city, sentence, re.IGNORECASE))
                    if city_count == 1:
                        # If the city appears only once, remove it from the locations dictionary
                        locations['city'] = [c for c in locations['city'] if c != city]

    return locations

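
# Illustrative sketch: a 'city' candidate that merely repeats the country name and occurs only
# once in the sentence is dropped:
#   helper_resolve_cities("I come from Guinea", {'country': ['Guinea'], 'city': ['Guinea']})
#   -> {'country': ['Guinea'], 'city': []}
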
def helper_delete_city_reference(locations):
    """
    If the word 'city' was captured by mistake as part of a city name, remove it, unless the name belongs
    to the cities that genuinely contain it (e.g. Mexico City).
    """

    city_cities = ["Adamstown City", "Alexander City", "Angeles City", "Antipolo City", "Arizona City", "Arkansas City",
                   "Ashley City", "Atlantic City", "Bacolod City", "Bacoor City", "Bago City", "Baguio City",
                   "Baker City", "Baltimore City", "Batangas City", "Bay City", "Belgrade City", "Belize City",
                   "Benin City", "Big Bear City", "Bossier City", "Boulder City", "Brazil City", "Bridge City",
                   "Brigham City", "Brighton City", "Bristol City", "Buckeye City", "Bullhead City", "Butuan City",
                   "Cabanatuan City", "Calamba City", "Calbayog City", "California City", "Caloocan City",
                   "Calumet City", "Candon City", "Canon City", "Carcar City", "Carson City", "Castries City",
                   "Cathedral City", "Cavite City", "Cebu City", "Cedar City", "Central Falls City", "Century City",
                   "Cestos City", "City Bell", "City Terrace", "City of Balikpapan", "City of Calamba",
                   "City of Gold Coast", "City of Industry", "City of Isabela", "City of Orange", "City of Paranaque",
                   "City of Parramatta", "City of Shoalhaven", "Collier City", "Columbia City", "Commerce City",
                   "Cooper City", "Cotabato City", "Crescent City", "Crescent City North", "Culver City",
                   "Dagupan City", "Dale City", "Dali City", "Daly City", "Danao City", "Dasmariñas City", "Davao City",
                   "De Forest City", "Del City", "Dhaka City", "Dipolog City", "Dodge City", "Dumaguete City",
                   "El Centro City", "Elizabeth City", "Elk City", "Ellicott City", "Emeryville City", "Fernley City",
                   "Florida City", "Forest City", "Forrest City", "Foster City", "Freeport City", "Garden City",
                   "Gdynia City", "General Santos City", "General Trias City", "Gloucester City", "Granite City",
                   "Green City", "Grove City", "Guatemala City", "Haines City", "Haltom City", "Harbor City",
                   "Havre City", "Highland City", "Ho Chi Minh City", "Holiday City", "Horizon City", "Hyderabad City",
                   "Iligan City", "Iloilo City", "Imus City", "Iowa City", "Iriga City", "Isabela City", "Jacinto City",
                   "James City County", "Jefferson City", "Jersey City", "Jhang City", "Jincheng City", "Johnson City",
                   "Junction City", "Kaiyuan City", "Kansas City", "King City", "Kingman City", "Kingston City",
                   "Koror City", "Kowloon City", "Kuwait City", "Lake City", "Lake Havasu City", "Laoag City",
                   "Lapu-Lapu City", "Las Pinas City", "Las Piñas City", "League City", "Legazpi City", "Leisure City",
                   "Lenoir City", "Ligao City", "Lincoln City", "Linyi City", "Lipa City", "Loma Linda City",
                   "Lucena City", "Madrid City", "Makati City", "Malabon City", "Mandaluyong City", "Mandaue City",
                   "Manukau City", "Marawi City", "Marikina City", "Maryland City", "Mason City", "McKee City",
                   "Mexico City", "Mexico City Beach", "Michigan City", "Midwest City", "Mineral City", "Missouri City",
                   "Morehead City", "Morgan City", "Muntinlupa City", "Naga City", "Nagasaki City", "National City",
                   "Navotas City", "Nay Pyi Taw City", "Nevada City", "New City", "New York City", "Norwich City",
                   "Ocean City", "Oil City", "Oklahoma City", "Olongapo City", "Orange City", "Oregon City",
                   "Ozamiz City", "Pagadian City", "Palayan City", "Palm City", "Panabo City", "Panama City",
                   "Panama City", "Panama City Beach", "Parañaque City", "Park City", "Pasay City", "Peachtree City",
                   "Pearl City", "Pell City", "Phenix City", "Plant City", "Ponca City", "Port Augusta City",
                   "Port Pirie City", "Quad Cities", "Quartzsite City", "Quebec City", "Quezon City", "Quezon City",
                   "Rainbow City", "Rapid City", "Red City", "Redwood City", "Richmond City", "Rio Grande City",
                   "Roxas City", "Royse City", "Salt Lake City", "Salt Lake City", "Samal City", "San Carlos City",
                   "San Carlos City", "San Fernando City", "San Fernando City", "San Fernando City", "San Jose City",
                   "San Jose City", "San Juan City", "San Juan City", "San Pedro City", "Santa Rosa City",
                   "Science City of Munoz", "Shelby City", "Sialkot City", "Silver City", "Sioux City",
                   "South Lake Tahoe City", "South Sioux City", "Studio City", "Suisun City", "Summit Park City",
                   "Sun City", "Sun City Center", "Sun City West", "Sun City West", "Suva City", "Tabaco City",
                   "Tacloban City", "Tagbilaran City", "Taguig City", "Tagum City", "Talisay City", "Tanauan City",
                   "Tarlac City", "Tauranga City", "Tayabas City", "Temple City", "Texas City", "Thomas City",
                   "Tipp City", "Toledo City", "Traverse City", "Trece Martires City", "Tuba City", "Union City",
                   "Universal City", "University City", "Upper Hutt City", "Valencia City", "Valenzuela City",
                   "Vatican City", "Vatican City", "Ventnor City", "Webb City", "Wellington City", "Welwyn Garden City",
                   "West Valley City", "White City", "Yazoo City", "Yuba City", "Zamboanga City"]

    if 'city' in locations:
        for city in locations['city']:
            if 'city' in city:
                if not city in city_cities:
                    city = city.replace("city", "")

            elif 'City' in city:
                if not city in city_cities:
                    city = city.replace("City", "")

            # The pipeline accepts at most one city, so keeping the last (only) value is safe
            locations['city'] = city

        # Convert the city value back to a list if the assignment above left a plain string
        if isinstance(locations['city'], str):
            locations['city'] = [locations['city']]

    return locations

def helper_delete_country_reference(locations):
    """
    If a 'country' reference was captured by mistake inside a city name, remove it from the city name.
    """

    country_city_same = ["djibouti", "guatemala", "mexico", "panama", "san marino", "singapore", "vatican"]

    if 'country' in locations:
        for i, country in enumerate(locations['country']):

            if country.lower() not in country_city_same:
                split_country = country.lower().split()

                if 'city' in locations:
                    for j, city in enumerate(locations['city']):
                        split_city = city.lower().split()

                        for substring in split_country:
                            if substring in split_city:
                                split_city.remove(substring)
                                new_city = ' '.join(split_city)
                                locations['city'][j] = new_city.strip()

    return locations

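
# Illustrative sketch: a country name accidentally captured inside a city entry is removed
# (note that the affected city entry is rewritten in lower case by the join above):
#   helper_delete_country_reference({'country': ['France'], 'city': ['Paris France']})
#   -> {'country': ['France'], 'city': ['paris']}
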
def identify_locations(sentence):
    """
    Identify all the possible country and city references in the given sentence, using the different
    approaches above in a hybrid manner.
    """

    locations = []
    extra_serco_countries = False

    try:
        # This is because there were cases where a city followed by a comma was not understood by the system
        sentence = sentence.replace(",", " x$x ")

        # SERCO also wanted to handle these two country names without the "-" symbol.
        # The only way to do that is by hardcoding them
        if "Timor Leste" in sentence:
            extra_serco_countries = True
            locations.append("Timor Leste")

        if "Guinea Bissau" in sentence:
            extra_serco_countries = True
            locations.append("Guinea Bissau")

        # NER
        locations.append(identify_loc_ner(sentence))

        # geoparsing libraries
        geoparse_list, countries, cities = identify_loc_geoparselibs(sentence)
        locations.append(geoparse_list)

        # flatten the geoparse list
        locations_flat_1 = list(flatten(locations))

        # regex
        locations_flat_1.append(identify_loc_regex(sentence))

        # flatten the regex list
        locations_flat_2 = list(flatten(locations_flat_1))

        # embeddings
        locations_flat_2.append(identify_loc_embeddings(sentence, countries, cities))

        # flatten the embeddings list
        locations_flat_3 = list(flatten(locations_flat_2))

        # Remove duplicates while also taking capitalization into account (e.g. a reference of "italy"
        # should be valid, as should references of "Italy" and "italy").
        # Lowercase the words and get their unique references using set()
        loc_unique = set([loc.lower() for loc in locations_flat_3])

        # Create a new list of locations with initial capitalization, removing duplicates
        loc_capitalization = list(
            set([loc.capitalize() if loc.lower() in loc_unique else loc.lower() for loc in locations_flat_3]))

        # This step checks whether a location is a substring of another one.
        # E.g. for the case of [timor leste, timor], it should remove "timor"
        if extra_serco_countries:
            loc_capitalization_cp = loc_capitalization.copy()
            for i, loc1 in enumerate(loc_capitalization):
                for j, loc2 in enumerate(loc_capitalization):
                    if i != j and loc1 in loc2:
                        loc_capitalization_cp.remove(loc1)
                        break

            loc_capitalization = loc_capitalization_cp

        # validate that each one of the candidate countries/cities is indeed a country/city
        validated_locations = validate_locations(loc_capitalization)

        # create a dictionary with country/city tags and the relevant entries as a result
        loc_dict = {}
        for location, loc_type in validated_locations:
            if loc_type not in loc_dict:
                loc_dict[loc_type] = []
            loc_dict[loc_type].append(location)

        # bring the sentence back to its previous form
        sentence = sentence.replace(" x$x ", ",")

        # cope with cases of repeated country or city references due to geoparsing library issues
        locations_dict = multiple_country_city_identifications_solve(loc_dict)

        if locations_dict is None:
            return (0, "LOCATION", "no_country")
            # return {'city':[], 'country':[]}

        else:
            # conditions for multiple references
            # it is mandatory that a country exists
            if 'country' in locations_dict:

                # if a city exists
                if 'city' in locations_dict:
                    resolved_dict = helper_resolve_cities(sentence, locations_dict)

                    # we accept one country and one city
                    if len(resolved_dict['country']) == 1 and len(resolved_dict['city']) == 1:

                        # capitalize because there may be cases where 'italy' would be returned
                        resolved_dict['country'][0] = resolved_dict['country'][0].capitalize()

                        # there were some cases where the 'x$x' marker was not removed
                        for key, values in resolved_dict.items():
                            for i, value in enumerate(values):
                                if 'x$x' in value:
                                    values[i] = value.replace('x$x', '')

                        delete_city = helper_delete_city_reference(resolved_dict)

                        return helper_delete_country_reference(delete_city)

                    # we can accept an absence of city, but a country is always mandatory
                    elif len(resolved_dict['country']) == 1 and len(resolved_dict['city']) == 0:

                        resolved_dict['country'][0] = resolved_dict['country'][0].capitalize()

                        # there were some cases where the 'x$x' marker was not removed
                        for key, values in resolved_dict.items():
                            for i, value in enumerate(values):
                                if 'x$x' in value:
                                    values[i] = value.replace('x$x', '')

                        delete_city = helper_delete_city_reference(resolved_dict)

                        return helper_delete_country_reference(delete_city)

                    # error if more than one country or city
                    else:
                        return (0, "LOCATION", "more_city_or_country")

                # if a city does not exist
                else:
                    # we only accept one country
                    if len(locations_dict['country']) == 1:

                        locations_dict['country'][0] = locations_dict['country'][0].capitalize()

                        # there were some cases where the 'x$x' marker was not removed
                        for key, values in locations_dict.items():
                            for i, value in enumerate(values):
                                if 'x$x' in value:
                                    values[i] = value.replace('x$x', '')

                        resolved_cities = helper_resolve_cities(sentence, locations_dict)
                        delete_city = helper_delete_city_reference(resolved_cities)

                        help_city = helper_delete_country_reference(delete_city)

                        if not 'city' in help_city:
                            help_city['city'] = [0]

                        return help_city

                    # error if more than one country
                    else:
                        return (0, "LOCATION", "more_country")

            # error if no country is referred to
            else:
                return (0, "LOCATION", "no_country")

    except:
        # handle the exception if any errors occur while identifying a country/city
        return (0, "LOCATION", "unknown_error")
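

# Usage sketch, assuming the spaCy model, the BERT weights and network access for Nominatim are
# available; the exact output may vary with the underlying data and services.
if __name__ == "__main__":
    sample_sentence = "The shipment was sent from Rome, Italy"
    result = identify_locations(sample_sentence)
    # On success a dictionary such as {'country': ['Italy'], 'city': ['Rome']} is expected;
    # on failure an error tuple such as (0, "LOCATION", "no_country") or
    # (0, "LOCATION", "unknown_error") is returned.
    print(result)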