ValadisCERTH committed on
Commit
f220c47
·
1 Parent(s): 8201122

Update helper.py

Browse files
Files changed (1) hide show
  1. helper.py +0 -92
helper.py CHANGED
@@ -248,98 +248,6 @@ def identify_loc_embeddings(sentence, countries, cities):
248
  return embd_locations
249
 
250
 
251
- def identify_locations(sentence):
252
- """
253
- Identify all the possible Country and City references in the given sentence, using different approaches in a hybrid manner
254
- """
255
-
256
- locations = []
257
-
258
- try:
259
-
260
- # ner
261
- locations.append(identify_loc_ner(sentence))
262
-
263
- # geoparse libs
264
- geoparse_list, countries, cities = identify_loc_geoparselibs(sentence)
265
- locations.append(geoparse_list)
266
-
267
- # flatten the geoparse list
268
- locations_flat_1 = list(flatten(locations))
269
-
270
- # regex
271
- locations_flat_1.append(identify_loc_regex(sentence))
272
-
273
- # flatten the regex list
274
- locations_flat_2 = list(flatten(locations))
275
-
276
- # embeddings
277
- locations_flat_2.append(identify_loc_embeddings(sentence, countries, cities))
278
-
279
- # flatten the embeddings list
280
- locations_flat_3 = list(flatten(locations))
281
-
282
- # remove duplicates while also taking under consideration capitalization (e.g. a reference of italy should be valid, while also a reference of Italy and italy)
283
- # Lowercase the words and get their unique references using set()
284
- loc_unique = set([loc.lower() for loc in locations_flat_3])
285
-
286
- # Create a new list of locations with initial capitalization, removing duplicates
287
- loc_capitalization = list(set([loc.capitalize() if loc.lower() in loc_unique else loc.lower() for loc in locations_flat_3]))
288
-
289
- # validate that indeed each one of the countries/cities are indeed countries/cities
290
- validated_locations = validate_locations(loc_capitalization)
291
-
292
- # create a proper dictionary with country/city tags and the relevant entries as a result
293
- locations_dict = {}
294
- for location, loc_type in validated_locations:
295
- if loc_type not in locations_dict:
296
- locations_dict[loc_type] = []
297
- locations_dict[loc_type].append(location)
298
-
299
- # conditions for multiple references
300
- # it is mandatory that a country will exist
301
- if locations_dict['country']:
302
-
303
- # if a city exists
304
- if 'city' in locations_dict:
305
-
306
- # we accept one country and one city
307
- if len(locations_dict['country']) == 1 and len(locations_dict['city']) == 1:
308
-
309
- # capitalize because there may be cases that it will return 'italy'
310
- locations_dict['country'][0] = locations_dict['country'][0].capitalize()
311
- return locations_dict
312
-
313
- # we can accept an absence of city but a country is always mandatory
314
- elif len(locations_dict['country']) == 1 and len(locations_dict['city']) == 0:
315
- locations_dict['country'][0] = locations_dict['country'][0].capitalize()
316
- return locations_dict
317
-
318
- # error if more than one country or city
319
- else:
320
- return (0, "LOCATION", "more_city_or_country")
321
-
322
-
323
- # if a city does not exist
324
- else:
325
- # we only accept for one country
326
- if len(locations_dict['country']) == 1:
327
- locations_dict['country'][0] = locations_dict['country'][0].capitalize()
328
- return locations_dict
329
-
330
- # error if more than one country
331
- else:
332
- return (0, "LOCATION", "more_country")
333
-
334
- # error if no country is referred
335
- else:
336
- return (0, "LOCATION", "no_country")
337
-
338
- except:
339
- # handle the exception if any errors occur while identifying a country/city
340
- return (0, "LOCATION", "unknown_error")
341
-
342
-
343
  def identify_locations(sentence):
344
  """
345
  Identify all the possible Country and City references in the given sentence, using different approaches in a hybrid manner
 
248
  return embd_locations
249
 
250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  def identify_locations(sentence):
252
  """
253
  Identify all the possible Country and City references in the given sentence, using different approaches in a hybrid manner