""" | |
Problem: | |
Nt3awnou's platform collects raw data filled manually by users (people in need). | |
Among this data is the user's localisation. | |
The localisation is a text input that is not standardized: | |
i.e. a user can input a single or multiple locations | |
(either douars/provinces/communes/regions or all combined), | |
in arabic or latin, with misspellings etc. | |
This doesn't help in visualization or in statistics | |
where localisations can be redundant because they were written in different manners. | |
Examples | |
``` | |
دوار تجكَالت | |
ابرداتن ازكور | |
خزامة | |
Tansgharte | |
دوار امندار | |
Douar Essour Tidrara Aghwatim Tahnaouet Al Haouz | |
دوار تكاديرت | |
Douar Essour tidrara- aghouatine- Tahanaout-El Haouz | |
``` | |
Solution: | |
We collected a reference dataset that contains all douar names (arabic and latin) | |
with their corresponding regions, communes and provinces. | |
We developed methods using fuzzy matching and phonetics | |
to map the user's localisation to the closest match in the reference dataset | |
""" | |
import difflib
import math
import re
from typing import Tuple

from pyphonetics import Metaphone, RefinedSoundex

EPICENTER_LOCATION = [31.12210171476489, -8.42945837915193]
CERTAINTY_THRESHOLD = 1
def extract_ngrams(tokens, n):
    """
    A function that returns the list of n-grams (joined with spaces)
    extracted from a list of tokens
    """
    ngrams = []
    if n < 1 or n > len(tokens):
        return ngrams  # return an empty list if n is invalid
    # slide a window of size n over the tokens
    for i in range(len(tokens) - n + 1):
        ngrams.append(' '.join(tokens[i:i + n]))
    return ngrams
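# Illustration (tokens as produced by splitting a localisation string):
# extract_ngrams(['douar', 'essour', 'tidrara'], 2)
# -> ['douar essour', 'essour tidrara']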
def get_phonetics_distance(w1, w2):
    """
    A function that calculates the Levenshtein distance between the phonetic
    representations (Metaphone and Refined Soundex, averaged) of two words,
    plus a small error term so phonetic matches rank below exact matches
    """
    rs = RefinedSoundex()
    mt = Metaphone()
    d1 = mt.distance(w1, w2, metric='levenshtein')
    d2 = rs.distance(w1, w2, metric='levenshtein')
    res = (d1 + d2) / 2 + 0.05
    return res
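# Illustration: spelling variants of the same douar should land close together,
# e.g. get_phonetics_distance('TAHNAOUET', 'TAHANAOUT') yields a small value
# (only the 0.05 error term if both phonetic codes agree), while unrelated
# names such as ('TAHNAOUET', 'AGHWATIM') score noticeably higher.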
def get_top_n_phonetics_matches(
        w: str, ref_words: list, threshold=1, top_n=1) -> list[Tuple]:
    """
    A function that returns the top_n closest words to w from ref_words
    for which distance <= threshold,
    using the phonetic representation
    """
    if not w:
        return []
    distances = {x: get_phonetics_distance(w, x) for x in ref_words}
    selected_words = {x: d for x, d in distances.items() if d <= threshold}
    sorted_d = dict(sorted(selected_words.items(), key=lambda item: item[1]))
    return list(sorted_d.items())[:top_n]
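# Illustration (hypothetical reference list; exact scores depend on pyphonetics):
# get_top_n_phonetics_matches('TANSGHARTE', ['TANSGHART', 'AZARZO'], threshold=2)
# -> [('TANSGHART', ...)]  # the phonetically closest candidate, if under threshold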
def get_geometric_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """
    A function that returns the distance in kilometers between two points
    on earth using the haversine formula
    """
    dlon = math.radians(lon2 - lon1)
    dlat = math.radians(lat2 - lat1)
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon / 2) ** 2
    )
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = 6371 * c  # mean earth radius in km
    return distance
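# Illustration: distance from the epicenter to Marrakech (approx. 31.63, -7.98):
# get_geometric_distance(*EPICENTER_LOCATION, 31.6295, -7.9811)
# -> roughly 70 km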
def are_village_names_similar(village_a: str, village_b: str) -> bool:
    """
    A function that returns True if the two village names
    are similar under strict fuzzy matching (ratio >= 0.90)
    """
    return difflib.SequenceMatcher(None, village_a, village_b).ratio() >= 0.90
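# Illustration:
# are_village_names_similar('TAHANAOUT', 'TAHNAOUT')  -> True  (ratio ~0.94)
# are_village_names_similar('TAHANAOUT', 'AGHWATIM')  -> False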
def get_uncertainty_range(input_dict: dict, threshold: float) -> dict:
    """
    A function that keeps, from a dict of {category: (match, distance)},
    the closest match plus every match within threshold of it
    """
    if len(input_dict) <= 1:
        return input_dict
    # sort by distance
    sorted_items = sorted(input_dict.items(), key=lambda item: item[1][1])
    data = {key: value for key, value in sorted_items}
    keys = list(data.keys())
    min_key = keys[0]
    min_value = data[min_key][1]
    # initialize the result with the closest match
    result = {min_key: data[min_key]}
    for j in range(1, len(keys)):
        key2 = keys[j]
        value2 = data[key2][1]
        # calculate the absolute difference between the distances
        difference = abs(min_value - value2)
        # if the difference is within the threshold, keep the match
        if difference <= threshold:
            result[key2] = data[key2]
        else:
            break
    return result
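# Illustration: with a threshold of 1, matches within 1 of the best distance are kept.
# get_uncertainty_range(
#     {'nom_fr': ('TOUKHRIBINE', 0.6),
#      'commune_fr': ('MZODA', 0.9),
#      'province_fr': ('CHICHAOUA', 2.5)}, 1)
# -> {'nom_fr': ('TOUKHRIBINE', 0.6), 'commune_fr': ('MZODA', 0.9)}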
def match_word(w, ref_dict, select_one_match=False):
    """
    A function that returns the closest match of w from ref_dict
    using the phonetic representation and fuzzy matching
    """
    w = w.strip().upper()
    if len(w) == 0:
        return {}
    closest_ref_w = dict()
    use_phonetics = True
    for category, names in ref_dict.items():
        # check exact matching
        if w in names:
            use_phonetics = False
            closest_ref_w[category] = (w, 0)
            break
        # check textual similarity (fuzzy matching)
        similar_names = [x for x in names if are_village_names_similar(w, x)]
        if similar_names:
            use_phonetics = False
            if select_one_match:
                closest_ref_w[category] = (similar_names[0], 0.01)
            else:
                closest_ref_w[category] = [(x, 0.01) for x in similar_names]
        # if no similar name was found, check phonetic similarity
        else:
            res = get_top_n_phonetics_matches(w, names, threshold=2, top_n=1)
            if res:
                closest_ref_w[category] = res[0]  # closest match
    if closest_ref_w and use_phonetics:
        if not select_one_match:
            closest_ref_w = get_uncertainty_range(closest_ref_w, CERTAINTY_THRESHOLD)
        else:
            k, v = min(closest_ref_w.items(), key=lambda x: x[1][1])
            closest_ref_w = {k: v}
    return closest_ref_w
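# Illustration (toy reference dict; categories are scanned in insertion order
# and an exact match breaks out of the loop early):
# ref = {'nom_fr': ['TOUKHRIBINE', 'AZARZO', 'TANSGHART'],
#        'commune_fr': ['MZODA', 'IMINDOUNIT']}
# match_word('azarzo ', ref)  -> {'nom_fr': ('AZARZO', 0)}  # exact match after normalisation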
def parse_and_map_localisation(text: str, ref_dict: dict, select_one_match: bool = True):
    """
    A function that parses a text containing a user's localisation
    and returns the closest matches per category from ref_dict
    Example:
    input = COMMUNE MZODA : DOUARS : TOUKHRIBIN –TLAKEMT - COMMUNE IMINDOUNITE : DOUAR AZARZO
    output = {'commune_fr': ('IMINDOUNIT', 0.01), 'nom_fr': ('TOUKHRIBINE', 0.01)}
    """
    # remove stop-words (douar, commune, province, ... in Arabic and Latin), punctuation and digits
    toxic = r"\bدوار|مصلى|\(|\)|douars?|communes?|cercles?|provinces?|villes?|regions?|caidate?|and|جماعة|\b|:|-|\d"
    text = re.sub(toxic, '', text.lower())
    regex_pattern = r"\|| |\.|,|/|et |و "
    tokens = re.split(regex_pattern, text.replace('-', ' '))
    filtered_tokens = [s for s in tokens if s.strip() != '']
    ngrams_mapping = {}
    for n in range(1, len(filtered_tokens) + 1):
        # generate the n-grams
        ngrams = extract_ngrams(filtered_tokens, n)
        # init dict with the ngram mapping
        mapping_ngram = {}
        # generate a mapping for the ngram with argmin matches
        for tok in ngrams:
            res = match_word(tok, ref_dict, select_one_match=select_one_match)
            if not res:
                continue
            min_k, min_v = min(res.items(), key=lambda x: x[1][1])
            # if min_k was already matched by a previous ngram, keep the closer match
            if min_k in mapping_ngram:
                saved_match, saved_distance = mapping_ngram[min_k]
                if saved_distance > min_v[1]:
                    mapping_ngram[min_k] = min_v
            else:
                mapping_ngram[min_k] = min_v
        ngrams_mapping[n] = mapping_ngram
    # first squeeze the dict s.t. one match remains per category
    categories = ref_dict.keys()
    result = {}
    for _, inner_dict in ngrams_mapping.items():
        for k in categories:
            # check if the category exists in the inner dictionary
            if k in inner_dict:
                current_match, current_val = inner_dict[k]
                if k in result:
                    previous_match, previous_val = result[k]
                    if current_val < previous_val:
                        result[k] = (current_match, current_val)
                else:
                    result[k] = (current_match, current_val)
    if not result:
        return {}
    # then discard matches with a high distance from the min (0.5 + min distance as threshold)
    thresh = min(result.values(), key=lambda x: x[1])[1] + 0.5
    output = {k: v_d for k, v_d in result.items() if v_d[1] <= thresh}
    return output
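# Minimal end-to-end sketch with a toy reference dict (the real one is loaded
# from the reference dataset; the names below are only illustrative):
if __name__ == '__main__':
    ref = {
        'nom_fr': ['TOUKHRIBINE', 'TLAKEMT', 'AZARZO'],
        'commune_fr': ['MZODA', 'IMINDOUNIT'],
        'province_fr': ['CHICHAOUA', 'AL HAOUZ'],
    }
    sample = "COMMUNE MZODA : DOUARS : TOUKHRIBIN - TLAKEMT"
    print(parse_and_map_localisation(sample, ref))
    # expected: close matches per category, e.g. MZODA (commune) and
    # TOUKHRIBINE (douar); exact scores depend on the phonetic distances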