ValadisCERTH committed on
Commit
65f09a7
·
1 Parent(s): 2be1d3d

Create countriesIdentification

Files changed (1)
  1. countriesIdentification +706 -0
countriesIdentification ADDED
@@ -0,0 +1,706 @@
import re

import spacy
import torch
import geonamescache
import pycountry
from geopy.geocoders import Nominatim
from geotext import GeoText
from transformers import BertTokenizer, BertModel

# Load the spaCy model with GloVe embeddings
nlp = spacy.load("en_core_web_lg")

# Load the pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained('bert-base-cased')

# Load valid city names from geonamescache
gc = geonamescache.GeonamesCache()

# There is a bug in geonamescache where some countries also appear as cities (e.g. Albania),
# so we first remove any country reference from the city list

# Get the set of all country names
original_countries = set(country['name'] for country in gc.get_countries().values())

# Get the set of all the original city names
original_cities = set(city['name'] for city in gc.get_cities().values())

# Get the set of all country names, excluding names that also appear as city names
country_names = set(
    country['name'] for country in gc.get_countries().values() if country['name'] not in original_cities)

# We also add these variants explicitly because they were requested by SERCO
country_names.add("Guinea Bissau")
country_names.add("Guinea bissau")
country_names.add("guinea Bissau")
country_names.add("guinea bissau")
country_names.add("Timor Leste")
country_names.add("Timor leste")
country_names.add("timor Leste")
country_names.add("timor leste")
country_names.add("UAE")
country_names.add("uae")
country_names.add("Uae")
country_names.add("Uk")
country_names.add("uK")
country_names.add("uk")
country_names.add("USa")
country_names.add("Usa")
country_names.add("usa")
country_names.add("uSa")
country_names.add("usA")
country_names.add("uSA")
country_names.add("Palestine")

# Get the set of all city names, excluding country names
city_names = set(city['name'] for city in gc.get_cities().values() if city['name'] not in original_countries)

city_names.add("Puebla de sanabria")

def flatten(lst):
    """
    Helper generator that flattens a nested list recursively
    """
    for item in lst:
        if isinstance(item, list):
            yield from flatten(item)
        else:
            yield item

def is_country(reference):
    """
    Check if a given reference is a valid country name
    """
    try:
        # Check if the reference is a valid country name from the first geoparsing library
        if reference in country_names:
            return True

        else:
            # If not, use the pycountry library to verify whether the input is a country
            country = pycountry.countries.search_fuzzy(reference)[0]

            temp_country_names = []

            if country:
                if hasattr(country, 'name') or hasattr(country, 'official_name') or hasattr(country, 'common_name'):

                    if hasattr(country, 'official_name'):
                        temp_country_names.append(country.official_name.lower())
                    if hasattr(country, 'name'):
                        temp_country_names.append(country.name.lower())
                    if hasattr(country, 'common_name'):
                        temp_country_names.append(country.common_name.lower())
                    if any(reference.lower() == elem for elem in temp_country_names):
                        return True

            return False

    except LookupError:
        return False

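# Illustrative usage (not part of the original file; expected values, data-dependent):
#
#   is_country("Italy")  -> True   (exact match in country_names)
#   is_country("UAE")    -> True   (added explicitly above)
#   is_country("Paris")  -> False  (pycountry's fuzzy search raises LookupError, which is caught)
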
def is_city(reference):
    """
    Check if a given reference is a valid city name
    """
    reference = reference.replace("x$x", "").strip()

    # Check if the reference is a valid city name
    if reference in city_names:
        return True

    # Otherwise query the Nominatim (OpenStreetMap) API
    geolocator = Nominatim(user_agent="certh_serco_validate_city_app")
    location = geolocator.geocode(reference, language="en", timeout=10)

    # If nothing was found, the reference is not a city
    if location is None:
        return False

    # If a reference is identified as a 'city', 'town', or 'village', then it is indeed a city
    if location.raw['type'] in ['city', 'town', 'village']:
        return True

    # If a reference is identified as 'administrative' (e.g. an administrative area), we further examine
    # whether the retrieved display name is a single token (suggesting a country) or a series of tokens
    # (suggesting a city); this condition separates some small cities that were identified as administrative areas
    elif location.raw['type'] == 'administrative':

        if len(location.raw['display_name'].split(",")) > 1:
            return True

    return False

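# Illustrative usage (not part of the original file): the function first checks the local
# geonamescache set and only then queries Nominatim, so results for names outside the cache
# depend on the OpenStreetMap response and require network access, e.g.
#
#   is_city("Thessaloniki")  -> True   (present in city_names, no API call needed)
#   is_city("Atlantis")      -> False  (expected; nothing usable returned by Nominatim)
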
def validate_locations(locations):
    """
    Validate that the identified references are indeed a Country or a City
    """
    validated_loc = []

    for location in locations:

        # validate whether it is a country
        if is_country(location):
            validated_loc.append((location, 'country'))

        # validate whether it is a city
        elif is_city(location):
            validated_loc.append((location, 'city'))

        else:
            # Check if the location is a multi-word name
            words = location.split()
            if len(words) > 1:

                # Try to find the country or city name among the words
                for i in range(len(words)):
                    name = ' '.join(words[i:])

                    if is_country(name):
                        validated_loc.append((name, 'country'))
                        break

                    elif is_city(name):
                        validated_loc.append((name, 'city'))
                        break

    return validated_loc

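# Illustrative usage (not part of the original file): only candidates that pass
# is_country/is_city are kept and tagged, e.g. (expected output, data-dependent):
#
#   validate_locations(["Italy", "Rome", "Bananas"])
#   -> [("Italy", "country"), ("Rome", "city")]
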
def identify_loc_ner(sentence):
    """
    Identify all the geopolitical and location entities with the spaCy NER tool
    """
    doc = nlp(sentence)

    ner_locations = []

    # GPE and LOC are the labels for location entities in spaCy
    for ent in doc.ents:
        if ent.label_ in ['GPE', 'LOC']:

            if len(ent.text.split()) > 1:
                ner_locations.append(ent.text)
            else:
                for token in ent:
                    if token.ent_type_ == 'GPE':
                        ner_locations.append(ent.text)
                        break

    return ner_locations

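# Illustrative usage (not part of the original file): the output depends on the spaCy model,
# but for a typical sentence one would expect something like:
#
#   identify_loc_ner("I travelled from Paris to Berlin")  -> ["Paris", "Berlin"]
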
def identify_loc_geoparselibs(sentence):
    """
    Identify cities and countries with 3 different geoparsing libraries
    """
    geoparse_locations = []

    # Geoparsing library 1: geonamescache

    # Load the geonames cache to check if a city name is valid
    gc = geonamescache.GeonamesCache()

    # Get a list of many countries/cities
    countries = gc.get_countries()
    cities = gc.get_cities()

    city_names = [city['name'] for city in cities.values()]
    country_names = [country['name'] for country in countries.values()]

    # If any word sequence in the sentence is one of those countries/cities, record it
    words = sentence.split()
    for i in range(len(words)):
        for j in range(i + 1, len(words) + 1):
            word_seq = ' '.join(words[i:j])
            if word_seq in city_names or word_seq in country_names:
                geoparse_locations.append(word_seq)

    # Geoparsing library 2: similarly with the pycountry library
    for country in pycountry.countries:
        if country.name in sentence:
            geoparse_locations.append(country.name)

    # Geoparsing library 3: similarly with the geotext library
    places = GeoText(sentence)
    cities = list(places.cities)
    countries = list(places.countries)

    if cities:
        geoparse_locations += cities
    if countries:
        geoparse_locations += countries

    return (geoparse_locations, countries, cities)

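# Illustrative usage (not part of the original file): the function returns a tuple of
# (all candidate locations, geotext countries, geotext cities); duplicates across the three
# libraries are expected and are removed later, e.g. the expected shape is roughly:
#
#   identify_loc_geoparselibs("He lives in Lisbon Portugal")
#   -> (["Lisbon", "Portugal", ...], ["Portugal"], ["Lisbon"])
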
def identify_loc_regex(sentence):
    """
    Identify cities and countries with regular expression matching
    """
    regex_locations = []

    # Country and city references can be preceded by 'in', 'from' or 'of'
    pattern = r"\b(in|from|of)\b\s([\w\s]+)"
    additional_refs = re.findall(pattern, sentence)

    for match in additional_refs:
        regex_locations.append(match[1])

    return regex_locations

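# Illustrative usage (not part of the original file): the regex deliberately captures everything
# after the preposition and relies on the later validation steps to filter out non-locations, e.g.
#
#   identify_loc_regex("She arrived from Madrid yesterday")  -> ["Madrid yesterday"]
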
def identify_loc_embeddings(sentence, countries, cities):
    """
    Identify cities and countries by matching the BERT pre-trained embeddings
    """
    embd_locations = []

    # Define a list of country and city names (those are given by the geonamescache library before)
    countries_cities = countries + cities

    # Concatenate multi-word countries and cities into a single string
    multiword_countries = [c.replace(' ', '_') for c in countries if ' ' in c]
    multiword_cities = [c.replace(' ', '_') for c in cities if ' ' in c]
    countries_cities += multiword_countries + multiword_cities

    # Preprocess the input sentence
    tokens = tokenizer.tokenize(sentence)
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

    # Get the BERT embeddings for the input sentence
    with torch.no_grad():
        embeddings = model(input_ids)[0][0]

    # Find the country and city names in the input sentence
    for i in range(len(tokens)):
        token = tokens[i]
        if token in countries_cities:
            embd_locations.append(token)
        else:
            word_vector = embeddings[i]
            similarity_scores = torch.nn.functional.cosine_similarity(word_vector.unsqueeze(0), embeddings)
            similar_tokens = [tokens[j] for j in similarity_scores.argsort(descending=True)[1:6]]
            for word in similar_tokens:
                if word in countries_cities and similarity_scores[tokens.index(word)] > 0.5:
                    embd_locations.append(word)

    # Convert multi-word country and city names back to their original form
    embd_locations = [loc.replace('_', ' ') if '_' in loc else loc for loc in embd_locations]

    return embd_locations

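# Illustrative usage (not part of the original file): because BERT uses WordPiece tokenization,
# only names that survive as a single token can match the provided lists directly; the
# cosine-similarity branch is a best-effort fallback. Expected (model-dependent, may include duplicates):
#
#   identify_loc_embeddings("We met in Berlin", ["Germany"], ["Berlin"])  -> ["Berlin"]
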
def multiple_country_city_identifications_solve(country_city_dict):
    """
    Resolve multiple identifications of the same country or city.
    It checks all the elements of the input dictionary and, if a shorter element exists as a substring of a
    longer element, it deletes the shorter one. In that sense, a dictionary of the sort
    {'city': ['Port moresby', 'Port'], 'country': ['Guinea', 'Papua new guinea']} will be converted into
    {'city': ['Port moresby'], 'country': ['Papua new guinea']}.

    The reason for this function is that such inconsistencies were identified during country/city identification,
    probably related to the geoparsing libraries in use
    """
    try:

        country_flag = False
        city_flag = False

        # To avoid examining elements unnecessarily, we verify that a country and a city exist in the
        # input dictionary and that they have more than one entry (which is the target case for us)
        if 'country' in country_city_dict:
            if len(country_city_dict['country']) > 1:
                country_flag = True

        if 'city' in country_city_dict:
            if len(country_city_dict['city']) > 1:
                city_flag = True

        # First cope with multiple country references
        if country_flag:

            # Sort the countries by length, longest first
            country_city_dict['country'].sort(key=lambda x: len(x), reverse=True)

            # Create a new list of countries, skipping entries that contain an already-kept country
            # (with the longest-first ordering this mainly removes case-insensitive duplicates)
            cleaned_countries = []
            for i in range(len(country_city_dict['country'])):
                is_substring = False
                for j in range(len(cleaned_countries)):
                    if country_city_dict['country'][i].lower().find(cleaned_countries[j].lower()) != -1:
                        is_substring = True
                        break
                if not is_substring:
                    cleaned_countries.append(country_city_dict['country'][i])

            # Replace the original list of countries with the cleaned one
            country_city_dict['country'] = cleaned_countries

            # Create a new list of countries that are not substrings of other countries
            final_countries = []
            for i in range(len(country_city_dict['country'])):
                is_superstring = False
                for j in range(len(country_city_dict['country'])):
                    if i == j:
                        continue
                    if country_city_dict['country'][j].lower().find(country_city_dict['country'][i].lower()) != -1:
                        # If the i-th country is a substring of a different country, skip it
                        is_superstring = True
                        break
                if not is_superstring:
                    final_countries.append(country_city_dict['country'][i])

            # Replace the original list of countries with the final one
            country_city_dict['country'] = final_countries

        # Then cope with multiple city references
        if city_flag:

            # Sort the cities by length, longest first
            country_city_dict['city'].sort(key=lambda x: len(x), reverse=True)

            # Create a new list of cities, skipping entries that contain an already-kept city
            cleaned_cities = []
            for i in range(len(country_city_dict['city'])):
                is_substring = False
                for j in range(len(cleaned_cities)):
                    if country_city_dict['city'][i].lower().find(cleaned_cities[j].lower()) != -1:
                        is_substring = True
                        break
                if not is_substring:
                    cleaned_cities.append(country_city_dict['city'][i])

            # Replace the original list of cities with the cleaned one
            country_city_dict['city'] = cleaned_cities

            # Create a new list of cities that are not substrings of other cities
            final_cities = []
            for i in range(len(country_city_dict['city'])):
                is_superstring = False
                for j in range(len(country_city_dict['city'])):
                    if i == j:
                        continue
                    if country_city_dict['city'][j].lower().find(country_city_dict['city'][i].lower()) != -1:
                        # If the i-th city is a substring of a different city, skip it
                        is_superstring = True
                        break
                if not is_superstring:
                    final_cities.append(country_city_dict['city'][i])

            # Replace the original list of cities with the final one
            country_city_dict['city'] = final_cities

        # Return the final dictionary
        if country_city_dict:
            return country_city_dict

    except Exception:
        return (0, "LOCATION", "unknown_error")

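# Illustrative usage (not part of the original file): the docstring example restated as a call:
#
#   multiple_country_city_identifications_solve(
#       {'city': ['Port moresby', 'Port'], 'country': ['Guinea', 'Papua new guinea']})
#   -> {'city': ['Port moresby'], 'country': ['Papua new guinea']}
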
def helper_resolve_cities(sentence, locations):
    """
    Verify that the captured city does not belong to the captured country name. If so, delete it, unless there is
    also a second reference in the original sentence (which might be the case of a city whose name is a
    substring of a country)
    """
    if 'country' in locations and 'city' in locations:

        # Check if any city names are also present in the corresponding country name
        for country in locations['country']:
            for city in locations['city']:

                if city.lower() in country.lower():
                    # If the city name is found in the country name, check how many times it appears in the sentence
                    city_count = len(re.findall(city, sentence, re.IGNORECASE))
                    if city_count == 1:
                        # If the city appears only once, remove it from the locations dictionary
                        locations['city'] = [c for c in locations['city'] if c != city]

    return locations

def helper_delete_city_reference(locations):
    """
    If the word 'city' was captured by mistake by the system, delete it, unless it belongs to the cities
    that should contain it (e.g. Mexico City)
    """
    city_cities = ["Adamstown City", "Alexander City", "Angeles City", "Antipolo City", "Arizona City", "Arkansas City",
                   "Ashley City", "Atlantic City", "Bacolod City", "Bacoor City", "Bago City", "Baguio City",
                   "Baker City", "Baltimore City", "Batangas City", "Bay City", "Belgrade City", "Belize City",
                   "Benin City", "Big Bear City", "Bossier City", "Boulder City", "Brazil City", "Bridge City",
                   "Brigham City", "Brighton City", "Bristol City", "Buckeye City", "Bullhead City", "Butuan City",
                   "Cabanatuan City", "Calamba City", "Calbayog City", "California City", "Caloocan City",
                   "Calumet City", "Candon City", "Canon City", "Carcar City", "Carson City", "Castries City",
                   "Cathedral City", "Cavite City", "Cebu City", "Cedar City", "Central Falls City", "Century City",
                   "Cestos City", "City Bell", "City Terrace", "City of Balikpapan", "City of Calamba",
                   "City of Gold Coast", "City of Industry", "City of Isabela", "City of Orange", "City of Paranaque",
                   "City of Parramatta", "City of Shoalhaven", "Collier City", "Columbia City", "Commerce City",
                   "Cooper City", "Cotabato City", "Crescent City", "Crescent City North", "Culver City",
                   "Dagupan City", "Dale City", "Dali City", "Daly City", "Danao City", "Dasmariñas City", "Davao City",
                   "De Forest City", "Del City", "Dhaka City", "Dipolog City", "Dodge City", "Dumaguete City",
                   "El Centro City", "Elizabeth City", "Elk City", "Ellicott City", "Emeryville City", "Fernley City",
                   "Florida City", "Forest City", "Forrest City", "Foster City", "Freeport City", "Garden City",
                   "Gdynia City", "General Santos City", "General Trias City", "Gloucester City", "Granite City",
                   "Green City", "Grove City", "Guatemala City", "Haines City", "Haltom City", "Harbor City",
                   "Havre City", "Highland City", "Ho Chi Minh City", "Holiday City", "Horizon City", "Hyderabad City",
                   "Iligan City", "Iloilo City", "Imus City", "Iowa City", "Iriga City", "Isabela City", "Jacinto City",
                   "James City County", "Jefferson City", "Jersey City", "Jhang City", "Jincheng City", "Johnson City",
                   "Junction City", "Kaiyuan City", "Kansas City", "King City", "Kingman City", "Kingston City",
                   "Koror City", "Kowloon City", "Kuwait City", "Lake City", "Lake Havasu City", "Laoag City",
                   "Lapu-Lapu City", "Las Pinas City", "Las Piñas City", "League City", "Legazpi City", "Leisure City",
                   "Lenoir City", "Ligao City", "Lincoln City", "Linyi City", "Lipa City", "Loma Linda City",
                   "Lucena City", "Madrid City", "Makati City", "Malabon City", "Mandaluyong City", "Mandaue City",
                   "Manukau City", "Marawi City", "Marikina City", "Maryland City", "Mason City", "McKee City",
                   "Mexico City", "Mexico City Beach", "Michigan City", "Midwest City", "Mineral City", "Missouri City",
                   "Morehead City", "Morgan City", "Muntinlupa City", "Naga City", "Nagasaki City", "National City",
                   "Navotas City", "Nay Pyi Taw City", "Nevada City", "New City", "New York City", "Norwich City",
                   "Ocean City", "Oil City", "Oklahoma City", "Olongapo City", "Orange City", "Oregon City",
                   "Ozamiz City", "Pagadian City", "Palayan City", "Palm City", "Panabo City", "Panama City",
                   "Panama City", "Panama City Beach", "Parañaque City", "Park City", "Pasay City", "Peachtree City",
                   "Pearl City", "Pell City", "Phenix City", "Plant City", "Ponca City", "Port Augusta City",
                   "Port Pirie City", "Quad Cities", "Quartzsite City", "Quebec City", "Quezon City", "Quezon City",
                   "Rainbow City", "Rapid City", "Red City", "Redwood City", "Richmond City", "Rio Grande City",
                   "Roxas City", "Royse City", "Salt Lake City", "Salt Lake City", "Samal City", "San Carlos City",
                   "San Carlos City", "San Fernando City", "San Fernando City", "San Fernando City", "San Jose City",
                   "San Jose City", "San Juan City", "San Juan City", "San Pedro City", "Santa Rosa City",
                   "Science City of Munoz", "Shelby City", "Sialkot City", "Silver City", "Sioux City",
                   "South Lake Tahoe City", "South Sioux City", "Studio City", "Suisun City", "Summit Park City",
                   "Sun City", "Sun City Center", "Sun City West", "Sun City West", "Suva City", "Tabaco City",
                   "Tacloban City", "Tagbilaran City", "Taguig City", "Tagum City", "Talisay City", "Tanauan City",
                   "Tarlac City", "Tauranga City", "Tayabas City", "Temple City", "Texas City", "Thomas City",
                   "Tipp City", "Toledo City", "Traverse City", "Trece Martires City", "Tuba City", "Union City",
                   "Universal City", "University City", "Upper Hutt City", "Valencia City", "Valenzuela City",
                   "Vatican City", "Vatican City", "Ventnor City", "Webb City", "Wellington City", "Welwyn Garden City",
                   "West Valley City", "White City", "Yazoo City", "Yuba City", "Zamboanga City"]

    if 'city' in locations:
        cleaned_cities = []
        for city in locations['city']:
            # Strip a spurious 'city'/'City' word unless the name legitimately contains it
            if city not in city_cities:
                if 'city' in city:
                    city = city.replace("city", "").strip()
                elif 'City' in city:
                    city = city.replace("City", "").strip()
            cleaned_cities.append(city)

        # Replace the original city values with the cleaned list
        locations['city'] = cleaned_cities

    return locations

def helper_delete_country_reference(locations):
    """
    If a 'country' reference was captured by mistake inside a city name, delete it from that city name
    """
    country_city_same = ["djibouti", "guatemala", "mexico", "panama", "san marino", "singapore", "vatican"]

    if 'country' in locations:
        for i, country in enumerate(locations['country']):

            if country.lower() not in country_city_same:
                split_country = country.lower().split()

                if 'city' in locations:
                    for j, city in enumerate(locations['city']):
                        split_city = city.lower().split()

                        for substring in split_country:
                            if substring in split_city:
                                split_city.remove(substring)
                                new_city = ' '.join(split_city)
                                locations['city'][j] = new_city.strip()

    return locations

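# Illustrative usage (not part of the original file, hypothetical values): a country word that
# leaks into a city name is stripped from the city entry, e.g. the expectation is roughly:
#
#   helper_delete_country_reference({'country': ['Guinea'], 'city': ['guinea conakry']})
#   -> {'country': ['Guinea'], 'city': ['conakry']}
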
def identify_locations(sentence):
    """
    Identify all the possible Country and City references in the given sentence, using different approaches in a hybrid manner
    """
    locations = []
    extra_serco_countries = False

    try:
        # This is needed because there were cases where a city followed by a comma was not understood by the system
        sentence = sentence.replace(",", " x$x ")

        # SERCO wanted to also handle these two cases without the symbol "-". The only way to do that is by hardcoding them
        if "Timor Leste" in sentence:
            extra_serco_countries = True
            locations.append("Timor Leste")

        if "Guinea Bissau" in sentence:
            extra_serco_countries = True
            locations.append("Guinea Bissau")

        # NER
        locations.append(identify_loc_ner(sentence))

        # geoparsing libraries
        geoparse_list, countries, cities = identify_loc_geoparselibs(sentence)
        locations.append(geoparse_list)

        # flatten the list so far
        locations_flat_1 = list(flatten(locations))

        # regex
        locations_flat_1.append(identify_loc_regex(sentence))

        # flatten the list including the regex results
        locations_flat_2 = list(flatten(locations_flat_1))

        # embeddings
        locations_flat_2.append(identify_loc_embeddings(sentence, countries, cities))

        # flatten the list including the embeddings results
        locations_flat_3 = list(flatten(locations_flat_2))

        # Remove duplicates while taking capitalization into account
        # (e.g. references to "italy" and "Italy" should count as the same country)
        # Lowercase the words and get their unique references using set()
        loc_unique = set([loc.lower() for loc in locations_flat_3])

        # Create a new list of locations with initial capitalization, removing duplicates
        loc_capitalization = list(
            set([loc.capitalize() if loc.lower() in loc_unique else loc.lower() for loc in locations_flat_3]))

        # This step checks whether one reference is contained in another. E.g. for [timor leste, timor], it should remove "timor"
        if extra_serco_countries:
            loc_capitalization_cp = loc_capitalization.copy()
            for i, loc1 in enumerate(loc_capitalization):
                for j, loc2 in enumerate(loc_capitalization):
                    if i != j and loc1 in loc2:
                        loc_capitalization_cp.remove(loc1)
                        break

            loc_capitalization = loc_capitalization_cp

        # validate that each one of the candidate countries/cities is indeed a country/city
        validated_locations = validate_locations(loc_capitalization)

        # create a proper dictionary with country/city tags and the relevant entries as a result
        loc_dict = {}
        for location, loc_type in validated_locations:
            if loc_type not in loc_dict:
                loc_dict[loc_type] = []
            loc_dict[loc_type].append(location)

        # bring the sentence back to its previous form
        sentence = sentence.replace(" x$x ", ",")

        # cope with cases of repeated country or city references due to geoparsing library issues
        locations_dict = multiple_country_city_identifications_solve(loc_dict)

        if locations_dict is None:
            return (0, "LOCATION", "no_country")
            # return {'city':[], 'country':[]}

        else:
            # conditions for multiple references
            # it is mandatory that a country exists
            if 'country' in locations_dict:

                # if a city exists
                if 'city' in locations_dict:
                    resolved_dict = helper_resolve_cities(sentence, locations_dict)

                    # we accept one country and one city
                    if len(resolved_dict['country']) == 1 and len(resolved_dict['city']) == 1:

                        # capitalize, because there may be cases where the result would be 'italy'
                        resolved_dict['country'][0] = resolved_dict['country'][0].capitalize()

                        # there were some cases where the 'x$x' marker was not removed
                        for key, values in resolved_dict.items():
                            for i, value in enumerate(values):
                                if 'x$x' in value:
                                    values[i] = value.replace('x$x', '')

                        delete_city = helper_delete_city_reference(resolved_dict)

                        return helper_delete_country_reference(delete_city)

                    # we can accept the absence of a city, but a country is always mandatory
                    elif len(resolved_dict['country']) == 1 and len(resolved_dict['city']) == 0:

                        resolved_dict['country'][0] = resolved_dict['country'][0].capitalize()

                        # there were some cases where the 'x$x' marker was not removed
                        for key, values in resolved_dict.items():
                            for i, value in enumerate(values):
                                if 'x$x' in value:
                                    values[i] = value.replace('x$x', '')

                        delete_city = helper_delete_city_reference(resolved_dict)

                        return helper_delete_country_reference(delete_city)

                    # error if more than one country or city
                    else:
                        return (0, "LOCATION", "more_city_or_country")

                # if a city does not exist
                else:
                    # we only accept one country
                    if len(locations_dict['country']) == 1:

                        locations_dict['country'][0] = locations_dict['country'][0].capitalize()

                        # there were some cases where the 'x$x' marker was not removed
                        for key, values in locations_dict.items():
                            for i, value in enumerate(values):
                                if 'x$x' in value:
                                    values[i] = value.replace('x$x', '')

                        resolved_cities = helper_resolve_cities(sentence, locations_dict)
                        delete_city = helper_delete_city_reference(resolved_cities)

                        help_city = helper_delete_country_reference(delete_city)

                        if not 'city' in help_city:
                            help_city['city'] = [0]

                        return help_city

                    # error if more than one country
                    else:
                        return (0, "LOCATION", "more_country")

            # error if no country is referred to
            else:
                return (0, "LOCATION", "no_country")

    except Exception:
        # handle the exception if any errors occur while identifying a country/city
        return (0, "LOCATION", "unknown_error")
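
# Minimal usage sketch (not part of the original file). The sentence and the expected shape of the
# result are illustrative assumptions; running this requires the spaCy model, the BERT weights and
# network access for Nominatim.
if __name__ == "__main__":
    example_sentence = "The shipment was sent from Rome, Italy last week"
    print(identify_locations(example_sentence))
    # Expected shape on success: {'country': ['Italy'], 'city': ['Rome']}
    # On failure, a tuple such as (0, "LOCATION", "unknown_error") is returned instead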