ValadisCERTH committed on
Commit
581a861
·
1 Parent(s): a7fdc57

Update helper.py

Browse files
Files changed (1) hide show
  1. helper.py +130 -4
helper.py CHANGED
@@ -116,6 +116,7 @@ def validate_locations(locations):
116
  return validated_loc
117
 
118
 
 
119
  def identify_loc_ner(sentence):
120
  """
121
  Identify all the geopolitical and location entities with the spacy tool
@@ -140,6 +141,7 @@ def identify_loc_ner(sentence):
140
  return ner_locations
141
 
142
 
 
143
  def identify_loc_geoparselibs(sentence):
144
  """
145
  Identify cities and countries with 3 different geoparsing libraries
@@ -189,6 +191,7 @@ def identify_loc_geoparselibs(sentence):
189
  return (geoparse_locations, countries, cities)
190
 
191
 
 
192
  def identify_loc_regex(sentence):
193
  """
194
  Identify cities and countries with regular expression matching
@@ -206,6 +209,7 @@ def identify_loc_regex(sentence):
206
  return regex_locations
207
 
208
 
 
209
  def identify_loc_embeddings(sentence, countries, cities):
210
  """
211
  Identify cities and countries with the BERT pre-trained embeddings matching
@@ -248,6 +252,120 @@ def identify_loc_embeddings(sentence, countries, cities):
248
  return embd_locations
249
 
250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  def identify_locations(sentence):
252
  """
253
  Identify all the possible Country and City references in the given sentence, using different approaches in a hybrid manner
@@ -257,6 +375,9 @@ def identify_locations(sentence):
257
 
258
  try:
259
 
 
 
 
260
  # ner
261
  locations.append(identify_loc_ner(sentence))
262
 
@@ -290,12 +411,17 @@ def identify_locations(sentence):
290
  validated_locations = validate_locations(loc_capitalization)
291
 
292
  # create a proper dictionary with country/city tags and the relevant entries as a result
293
- locations_dict = {}
294
  for location, loc_type in validated_locations:
295
- if loc_type not in locations_dict:
296
- locations_dict[loc_type] = []
297
- locations_dict[loc_type].append(location)
 
 
 
298
 
 
 
299
 
300
  # conditions for multiple references
301
  # it is mandatory that a country will exist
 
116
  return validated_loc
117
 
118
 
119
+
120
  def identify_loc_ner(sentence):
121
  """
122
  Identify all the geopolitical and location entities with the spacy tool
 
141
  return ner_locations
142
 
143
 
144
+
145
  def identify_loc_geoparselibs(sentence):
146
  """
147
  Identify cities and countries with 3 different geoparsing libraries
 
191
  return (geoparse_locations, countries, cities)
192
 
193
 
194
+
195
  def identify_loc_regex(sentence):
196
  """
197
  Identify cities and countries with regular expression matching
 
209
  return regex_locations
210
 
211
 
212
+
213
  def identify_loc_embeddings(sentence, countries, cities):
214
  """
215
  Identify cities and countries with the BERT pre-trained embeddings matching
 
252
  return embd_locations
253
 
254
 
255
+
256
def _drop_contained_names(names):
    """
    Return the given names, longest first, without case-insensitive duplicates
    and without any name that appears as a substring of a longer kept name.
    """

    kept = []  # pairs of (original name, lowercased name), longest first

    # sorted() is stable, so names of equal length keep their original order
    for name in sorted(names, key=len, reverse=True):
        lowered = name.lower()

        # Every kept name is at least as long as the current one, so a hit here
        # means the current name is either a duplicate or a fragment of it.
        # Checking against kept names only is enough: a fragment of a dropped
        # name is also a fragment of the kept name that contains the dropped one.
        if any(lowered in kept_lowered for _, kept_lowered in kept):
            continue

        kept.append((name, lowered))

    return [name for name, _ in kept]


def multiple_country_city_identifications_solve(country_city_dict):
    """
    Solve the appearance of multiple identifications of the same country or city.

    For each of the 'country' and 'city' entries of the input dictionary that holds
    more than one element, any element that exists (case-insensitively) as a substring
    of a longer element of the same entry is removed, as are case-insensitive duplicates.
    The remaining elements are ordered by length, longest first. In that sense, a
    dictionary of the sort
    {'city': ['Port moresby', 'Port'], 'country': ['Guinea', 'Papua new guinea']}
    will be converted into
    {'city': ['Port moresby'], 'country': ['Papua new guinea']}.

    The reason for this function is that such inconsistencies were identified during
    country/city identification, probably relevant to the geoparsing libraries in use.

    The input dictionary is updated in place and returned (an empty dictionary is
    returned unchanged). If the input cannot be processed, e.g. because an entry holds
    non-string elements, the tuple (0, "LOCATION", "unknown_error") is returned instead.
    """

    try:

        # entries with a single element cannot contain an iterative reference,
        # so only entries of length more than one are examined
        for loc_type in ("country", "city"):
            if loc_type in country_city_dict and len(country_city_dict[loc_type]) > 1:
                country_city_dict[loc_type] = _drop_contained_names(country_city_dict[loc_type])

        # return the final dictionary
        return country_city_dict

    except Exception:
        return (0, "LOCATION", "unknown_error")
366
+
367
+
368
+
369
  def identify_locations(sentence):
370
  """
371
  Identify all the possible Country and City references in the given sentence, using different approaches in a hybrid manner
 
375
 
376
  try:
377
 
378
+ # # # this is because there were cases where a city followed by a comma was not understood by the system
379
+ sentence = sentence.replace(",", " x$x ")
380
+
381
  # ner
382
  locations.append(identify_loc_ner(sentence))
383
 
 
411
  validated_locations = validate_locations(loc_capitalization)
412
 
413
  # create a proper dictionary with country/city tags and the relevant entries as a result
414
+ loc_dict = {}
415
  for location, loc_type in validated_locations:
416
+ if loc_type not in loc_dict:
417
+ loc_dict[loc_type] = []
418
+ loc_dict[loc_type].append(location)
419
+
420
+ # bring sentence on previous form
421
+ sentence = sentence.replace(" x$x ",",")
422
 
423
+ # cope with cases of iterative country or city reference due to geoparse lib issues
424
+ locations_dict = multiple_country_city_identifications_solve(loc_dict)
425
 
426
  # conditions for multiple references
427
  # it is mandatory that a country will exist