|
import json |
|
import os |
|
import pickle |
|
import random |
|
|
|
import numpy as np |
|
|
|
from Preprocessing.multilinguality.create_distance_lookups import CacheCreator |
|
from Utility.utils import load_json_from_path |
|
|
|
|
|
class SimilaritySolver:
    """Looks up pairwise distances between languages (identified by ISO codes).

    Distance sources, each a dict of dicts {lang_1: {lang_2: value}}:
      - map distance (normalized by the largest value in the table)
      - tree distance
      - learned distance
      - oracle distance (MSE between language embeddings)
      - ASP, which is a SIMILARITY score and is converted to a distance as 1 - ASP

    Each source can either be passed in directly or loaded from disk (JSON/pickle)
    when ``force_reload`` is set.
    """

    def __init__(self,
                 tree_dist=None,
                 map_dist=None,
                 asp_dict=None,
                 largest_value_map_dist=None,
                 tree_dist_path=None,
                 map_dist_path=None,
                 asp_dict_path=None,
                 iso_to_fullname=None,
                 iso_to_fullname_path=None,
                 learned_dist=None,
                 learned_dist_path=None,
                 oracle_dist=None,
                 oracle_dist_path=None,
                 force_reload=False):
        self.lang_1_to_lang_2_to_tree_dist = tree_dist
        self.lang_1_to_lang_2_to_map_dist = map_dist
        self.largest_value_map_dist = largest_value_map_dist
        self.asp_dict = asp_dict
        self.lang_1_to_lang_2_to_learned_dist = learned_dist
        self.lang_1_to_lang_2_to_oracle_dist = oracle_dist
        self.iso_to_fullname = iso_to_fullname
        iso_to_fullname_path = "iso_to_fullname.json" if not iso_to_fullname_path else iso_to_fullname_path

        if force_reload:
            tree_dist_path = 'lang_1_to_lang_2_to_tree_dist.json' if not tree_dist_path else tree_dist_path
            self.lang_1_to_lang_2_to_tree_dist = load_json_from_path(tree_dist_path)
            map_dist_path = 'lang_1_to_lang_2_to_map_dist.json' if not map_dist_path else map_dist_path
            self.lang_1_to_lang_2_to_map_dist = load_json_from_path(map_dist_path)
            # Map distances are normalized by the largest value found in the table.
            self.largest_value_map_dist = 0.0
            for values in self.lang_1_to_lang_2_to_map_dist.values():
                for value in values.values():
                    self.largest_value_map_dist = max(self.largest_value_map_dist, value)
            # BUGFIX: the fallback previously read `tree_dist_path` here (copy-paste
            # error), silently loading the tree distances as learned distances.
            learned_dist_path = 'lang_1_to_lang_2_to_learned_dist.json' if not learned_dist_path else learned_dist_path
            self.lang_1_to_lang_2_to_learned_dist = load_json_from_path(learned_dist_path)
            oracle_dist_path = 'lang_1_to_lang_2_to_oracle_dist.json' if not oracle_dist_path else oracle_dist_path
            self.lang_1_to_lang_2_to_oracle_dist = load_json_from_path(oracle_dist_path)
            asp_dict_path = "asp_dict.pkl" if not asp_dict_path else asp_dict_path
            with open(asp_dict_path, "rb") as f:
                self.asp_dict = pickle.load(f)
            self.iso_to_fullname = load_json_from_path(iso_to_fullname_path)

            # Drop sign languages from the fullname lookup and write the cleaned
            # mapping back to disk.
            # NOTE(review): the original indentation of this filtering block was
            # ambiguous; it is kept inside the reload branch since it rewrites the
            # cache file that was just loaded — confirm against upstream.
            pop_keys = [iso for iso in self.iso_to_fullname if "Sign Language" in self.iso_to_fullname[iso]]
            for pop_key in pop_keys:
                self.iso_to_fullname.pop(pop_key)
            with open(iso_to_fullname_path, 'w', encoding='utf-8') as f:
                json.dump(self.iso_to_fullname, f, ensure_ascii=False, indent=4)

    def find_closest_combined_distance(self,
                                       lang,
                                       supervised_langs,
                                       combined_distance="average",
                                       k=50,
                                       individual_distances=False,
                                       verbose=False,
                                       excluded_features=None,
                                       find_furthest=False):
        """Find the k closest languages according to a combination of map distance, tree distance, and ASP distance.

        Args:
            lang: ISO code of the query language.
            supervised_langs: iterable of candidate ISO codes (not mutated).
            combined_distance: "average" or "euclidean" combination of the features.
            k: number of results to return.
            individual_distances: also report the per-feature distances.
            verbose: print the results with full language names.
            excluded_features: subset of {"map", "asp", "tree"} to leave out.
            find_furthest: sort descending instead of ascending.

        Returns a dict of dicts (`individual_distances` optional) of the format
        {"supervised_lang_1": {"combined_distance": 5.39,
                               "individual_distances": [<map_dist>, <tree_dist>, <asp_dist>]},
         "supervised_lang_2": {...}, ...}

        Raises:
            ValueError: if combined_distance is not "average" or "euclidean".
        """
        if combined_distance not in ["average", "euclidean"]:
            raise ValueError("distance needs to be `average` or `euclidean`")
        # BUGFIX: default was a mutable list; use a None sentinel instead.
        excluded_features = [] if excluded_features is None else excluded_features
        # BUGFIX: always copy so the caller's set is not mutated by the removals below.
        supervised_langs = set(supervised_langs)
        # "urk" is always excluded as a candidate — TODO confirm rationale (bad data?).
        supervised_langs.discard("urk")
        supervised_langs.discard(lang)  # a language is not a candidate for itself

        combined_dict = {}
        for sup_lang in supervised_langs:
            map_dist = self.get_map_distance(lang, sup_lang)
            tree_dist = self.get_tree_distance(lang, sup_lang)
            asp_score = self.get_asp(lang, sup_lang, self.asp_dict)
            # Skip candidates for which any feature is unavailable.
            if None in {map_dist, tree_dist, asp_score}:
                continue
            asp_dist = 1 - asp_score  # ASP is a similarity, convert to a distance

            dist_list = []
            if "map" not in excluded_features:
                dist_list.append(map_dist)
            if "asp" not in excluded_features:
                dist_list.append(asp_dist)
            if "tree" not in excluded_features:
                dist_list.append(tree_dist)
            dist_array = np.array(dist_list)
            if combined_distance == "euclidean":
                combined = np.sqrt(np.sum(dist_array ** 2))
            else:  # "average" (only other value allowed by the check above)
                combined = np.mean(dist_array)
            combined_dict[sup_lang] = {"combined_distance": combined}
            if individual_distances:
                combined_dict[sup_lang]["individual_distances"] = [map_dist, tree_dist, asp_dist]

        results = dict(sorted(combined_dict.items(), key=lambda x: x[1]["combined_distance"], reverse=find_furthest)[:k])
        if verbose:
            sorted_by = "closest" if not find_furthest else "furthest"
            print(f"{k} {sorted_by} languages to {self.iso_to_fullname[lang]} w.r.t. the combined features are:")
            for result in results:
                try:
                    print(self.iso_to_fullname[result])
                    print(results[result])
                except KeyError:
                    print("Full Name of Language Missing")
        return results

    def find_closest(self, distance_type, lang, supervised_langs, k=50, find_furthest=False, random_seed=42, verbose=False):
        """Find the k nearest languages in terms of a given feature.

        Args:
            distance_type: one of "learned", "map", "tree", "asp", "random", "oracle".
            lang: ISO code of the query language.
            supervised_langs: iterable of candidate ISO codes (not mutated).
            k: number of results to return.
            find_furthest: sort descending instead of ascending.
            random_seed: seed for the "random" baseline.
            verbose: print the results with full language names.

        Returns a dict {language: distance} sorted by distance (for "random", a
        uniform placeholder distance of 0.5 is reported for each sampled language).

        Raises:
            ValueError: if distance_type is not one of the supported types.
        """
        distance_types = ["learned", "map", "tree", "asp", "random", "oracle"]
        if distance_type not in distance_types:
            raise ValueError(f"Invalid distance type '{distance_type}'. Expected one of {distance_types}")
        # BUGFIX: always copy so the caller's set is not mutated by the removals below.
        supervised_langs = set(supervised_langs)
        supervised_langs.discard("urk")  # always excluded — TODO confirm rationale
        supervised_langs.discard(lang)

        if distance_type == "random":
            random.seed(random_seed)
            # BUGFIX: random.sample() requires a sequence (sets rejected since
            # Python 3.11); sorting also makes the draw reproducible per seed.
            random_langs = random.sample(sorted(supervised_langs), k)
            return {rand_lang: 0.5 for rand_lang in random_langs}

        if distance_type == "asp":
            def get_dist(l1, l2):
                # ASP is a similarity — convert to a distance.
                asp_score = self.get_asp(l1, l2, self.asp_dict)
                return None if asp_score is None else 1 - asp_score
        else:
            # All remaining types share the same lookup shape; dispatch on the getter.
            get_dist = {"learned": self.get_learned_distance,
                        "map": self.get_map_distance,
                        "tree": self.get_tree_distance,
                        "oracle": self.get_oracle_distance}[distance_type]

        langs_to_dist = dict()
        for sup_lang in supervised_langs:
            dist = get_dist(lang, sup_lang)
            if dist is not None:
                langs_to_dist[sup_lang] = dist

        results = dict(sorted(langs_to_dist.items(), key=lambda x: x[1], reverse=find_furthest)[:k])
        if verbose:
            sorted_by = "closest" if not find_furthest else "furthest"
            print(f"{k} {sorted_by} languages to {self.iso_to_fullname[lang]} w.r.t. {distance_type} are:")
            for result in results:
                try:
                    print(self.iso_to_fullname[result])
                    print(results[result])
                except KeyError:
                    print("Full Name of Language Missing")
        return results

    @staticmethod
    def _symmetric_lookup(table, lang_1, lang_2):
        """Return table[lang_1][lang_2], falling back to the reversed pair.

        Returns None if neither direction is present. Shared by all the
        get_*_distance methods, whose tables only store each pair once.
        """
        try:
            return table[lang_1][lang_2]
        except KeyError:
            try:
                return table[lang_2][lang_1]
            except KeyError:
                return None

    def get_map_distance(self, lang_1, lang_2):
        """Returns normalized map distance between two languages.
        If no value can be retrieved, returns None."""
        dist = self._symmetric_lookup(self.lang_1_to_lang_2_to_map_dist, lang_1, lang_2)
        if dist is None:
            return None
        # Normalize by the largest distance observed in the table.
        return dist / self.largest_value_map_dist

    def get_tree_distance(self, lang_1, lang_2):
        """Returns normalized tree distance between two languages.
        If no value can be retrieved, returns None."""
        return self._symmetric_lookup(self.lang_1_to_lang_2_to_tree_dist, lang_1, lang_2)

    def get_learned_distance(self, lang_1, lang_2):
        """Returns normalized learned distance between two languages.
        If no value can be retrieved, returns None."""
        return self._symmetric_lookup(self.lang_1_to_lang_2_to_learned_dist, lang_1, lang_2)

    def get_oracle_distance(self, lang_1, lang_2):
        """Returns oracle language embedding distance (MSE) between two languages.
        If no value can be retrieved, returns None."""
        return self._symmetric_lookup(self.lang_1_to_lang_2_to_oracle_dist, lang_1, lang_2)

    def get_asp(self, lang_a, lang_b, path_to_dict):
        """Look up and return the ASP between lang_a and lang_b from (pre-calculated) dictionary at path_to_dict.

        path_to_dict may be the dict itself or a path to a pickled dict; the dict
        maps each language to a score vector whose columns follow the dict's key
        order. Note: This is a SIMILARITY measure, NOT a distance!

        Returns None if either language is missing from the dict (previously an
        unknown lang_b raised an unguarded ValueError from list.index()).
        """
        if isinstance(path_to_dict, dict):
            asp_dict = path_to_dict
        else:
            with open(path_to_dict, "rb") as f:
                asp_dict = pickle.load(f)
        lang_list = list(asp_dict)  # key order defines the score-vector columns
        try:
            lang_b_idx = lang_list.index(lang_b)
            return asp_dict[lang_a][lang_b_idx]
        except (ValueError, KeyError):
            return None
|
|
|
|
|
def load_asp_dict(path_to_dict):
    """If the input is already a dict, return it, else load dict from input path and return the dict."""
    if isinstance(path_to_dict, dict):
        # Already in memory — nothing to load.
        return path_to_dict
    with open(path_to_dict, 'rb') as dictfile:
        return pickle.load(dictfile)
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Build the distance caches first if any of them are missing.
    required_caches = ["lang_1_to_lang_2_to_map_dist.json",
                       "lang_1_to_lang_2_to_tree_dist.json",
                       "lang_1_to_lang_2_to_oracle_dist.json",
                       "lang_1_to_lang_2_to_learned_dist.json",
                       "asp_dict.pkl"]
    if not all(os.path.exists(cache) for cache in required_caches):
        CacheCreator()

    # BUGFIX: without force_reload=True no distance tables are loaded and every
    # find_closest* call below would fail on empty (None) tables.
    ss = SimilaritySolver(force_reload=True)

    # Candidate pool, hoisted out of the calls below (it was duplicated verbatim
    # four times in the original).
    supervised_languages = [
        'eng', 'deu', 'fra', 'spa', 'cmn', 'por', 'pol', 'ita', 'nld', 'ell', 'fin', 'vie', 'rus', 'hun', 'bem', 'swh', 'amh', 'wol', 'mal', 'chv', 'iba', 'jav', 'fon', 'hau', 'lbb', 'kik', 'lin', 'lug', 'luo', 'sxb', 'yor', 'nya', 'loz', 'toi', 'afr', 'arb', 'asm', 'ast', 'azj', 'bel', 'bul', 'ben', 'bos', 'cat',
        'ceb', 'sdh', 'ces', 'cym', 'dan', 'ekk', 'pes', 'fil', 'gle', 'glg', 'guj', 'heb', 'hin', 'hrv', 'hye', 'ind', 'ibo', 'isl', 'kat', 'kam', 'kea', 'kaz', 'khm', 'kan', 'kor', 'ltz', 'lao', 'lit', 'lvs', 'mri', 'mkd', 'xng', 'mar', 'zsm', 'mlt', 'oci', 'ory', 'pan', 'pst', 'ron', 'snd', 'slk', 'slv', 'sna',
        'som', 'srp', 'swe', 'tam', 'tel', 'tgk', 'tur', 'ukr', 'umb', 'urd', 'uzn', 'bhd', 'kfs', 'dgo', 'gbk', 'bgc', 'xnr', 'kfx', 'mjl', 'bfz', 'acf', 'bss', 'inb', 'nca', 'quh', 'wap', 'acr', 'bus', 'dgr', 'maz', 'nch', 'qul', 'tav', 'wmw', 'acu', 'byr', 'dik', 'iou', 'mbb', 'ncj', 'qvc', 'tbc', 'xed', 'agd',
        'bzh', 'djk', 'ipi', 'mbc', 'ncl', 'qve', 'tbg', 'xon', 'agg', 'bzj', 'dop', 'jac', 'mbh', 'ncu', 'qvh', 'tbl', 'xtd', 'agn', 'caa', 'jic', 'mbj', 'ndj', 'qvm', 'tbz', 'xtm', 'agr', 'cab', 'emp', 'jiv', 'mbt', 'nfa', 'qvn', 'tca', 'yaa', 'agu', 'cap', 'jvn', 'mca', 'ngp', 'qvs', 'tcs', 'yad', 'aia', 'car',
        'ese', 'mcb', 'ngu', 'qvw', 'yal', 'cax', 'kaq', 'mcd', 'nhe', 'qvz', 'tee', 'ycn', 'ake', 'cbc', 'far', 'mco', 'qwh', 'yka', 'alp', 'cbi', 'kdc', 'mcp', 'nhu', 'qxh', 'ame', 'cbr', 'gai', 'kde', 'mcq', 'nhw', 'qxn', 'tew', 'yre', 'amf', 'cbs', 'gam', 'kdl', 'mdy', 'nhy', 'qxo', 'tfr', 'yva', 'amk', 'cbt',
        'geb', 'kek', 'med', 'nin', 'rai', 'zaa', 'apb', 'cbu', 'glk', 'ken', 'mee', 'nko', 'rgu', 'zab', 'apr', 'cbv', 'meq', 'tgo', 'zac', 'arl', 'cco', 'gng', 'kje', 'met', 'nlg', 'rop', 'tgp', 'zad', 'grc', 'klv', 'mgh', 'nnq', 'rro', 'zai', 'ata', 'cek', 'gub', 'kmu', 'mib', 'noa', 'ruf', 'tna', 'zam', 'atb',
        'cgc', 'guh', 'kne', 'mie', 'not', 'rug', 'tnk', 'zao', 'atg', 'chf', 'knf', 'mih', 'npl', 'tnn', 'zar', 'awb', 'chz', 'gum', 'knj', 'mil', 'sab', 'tnp', 'zas', 'cjo', 'guo', 'ksr', 'mio', 'obo', 'seh', 'toc', 'zav', 'azg', 'cle', 'gux', 'kue', 'mit', 'omw', 'sey', 'tos', 'zaw', 'azz', 'cme', 'gvc', 'kvn',
        'miz', 'ood', 'sgb', 'tpi', 'zca', 'bao', 'cni', 'gwi', 'kwd', 'mkl', 'shp', 'tpt', 'zga', 'bba', 'cnl', 'gym', 'kwf', 'mkn', 'ote', 'sja', 'trc', 'ziw', 'bbb', 'cnt', 'gyr', 'kwi', 'mop', 'otq', 'snn', 'ttc', 'zlm', 'cof', 'hat', 'kyc', 'mox', 'pab', 'snp', 'tte', 'zos', 'bgt', 'con', 'kyf', 'mpm', 'pad',
        'tue', 'zpc', 'bjr', 'cot', 'kyg', 'mpp', 'soy', 'tuf', 'zpl', 'bjv', 'cpa', 'kyq', 'mpx', 'pao', 'tuo', 'zpm', 'bjz', 'cpb', 'hlt', 'kyz', 'mqb', 'pib', 'spp', 'zpo', 'bkd', 'cpu', 'hns', 'lac', 'mqj', 'pir', 'spy', 'txq', 'zpu', 'blz', 'crn', 'hto', 'lat', 'msy', 'pjt', 'sri', 'txu', 'zpz', 'bmr', 'cso',
        'hub', 'lex', 'mto', 'pls', 'srm', 'udu', 'ztq', 'bmu', 'ctu', 'lgl', 'muy', 'poi', 'srn', 'zty', 'bnp', 'cuc', 'lid', 'mxb', 'stp', 'upv', 'zyp', 'boa', 'cui', 'huu', 'mxq', 'sus', 'ura', 'boj', 'cuk', 'huv', 'llg', 'mxt', 'poy', 'suz', 'urb', 'box', 'cwe', 'hvn', 'prf', 'urt', 'bpr', 'cya', 'ign', 'lww',
        'myk', 'ptu', 'usp', 'bps', 'daa', 'ikk', 'maj', 'myy', 'vid', 'bqc', 'dah', 'nab', 'qub', 'tac', 'bqp', 'ded', 'imo', 'maq', 'nas', 'quf', 'taj', 'vmy']

    ss.find_closest("asp", "aym", supervised_languages, k=5, verbose=True)

    # BUGFIX: was called with the non-existent keyword `distance=`; the parameter
    # is named `combined_distance`, so the original raised a TypeError here.
    ss.find_closest_combined_distance("aym",
                                      supervised_languages,
                                      combined_distance="average",
                                      k=5,
                                      verbose=True)

    ss.find_closest("map", "aym", supervised_languages, k=5)

    ss.find_closest("tree", "aym", supervised_languages, k=10, find_furthest=True)
|
|