import os
import re
import json
import itertools

import pandas as pd
from simplemma import lemmatize
from names_dataset import NameDataset

# Matches an opening bracket of any kind, the shortest run of characters
# after it, and the first closing bracket of any kind. The bracket types are
# not required to match each other, and "." does not cross line breaks.
_BRACKETED_RE = re.compile(r'[\(\{\[].*?[\)\}\]]')


def load_json(path):
    """
    Load JSON data (e.g. gazetteers) from a file.

    :param path: path to the JSON file
    :return: the deserialized content of the file
    """
    # Explicit UTF-8: the platform default encoding is not reliable for
    # non-ASCII (e.g. Czech) entries.
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


def save_json(data, path):
    """
    Save JSON-serializable data (e.g. gazetteers) to a file.

    :param data: the object to serialize
    :param path: path of the file to write
    """
    # json.dump keeps its default ASCII escaping, so the bytes written are
    # the same as before; the encoding is pinned only for determinism.
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)


def merge_gazetteers(*gazetteers):
    """
    Merge multiple gazetteer dictionaries into a single dictionary.

    The values are expected to be sets: a key present in several inputs gets
    the union of its sets. The inputs themselves are not modified.

    :param gazetteers: any number of dicts mapping a key to a set of entries
    :return: a new dict containing every key of the inputs
    """
    merged_gazetteers = {}
    for gaz in gazetteers:
        for key, value_set in gaz.items():
            if key in merged_gazetteers:
                # Key seen before: extend the copy stored in the result.
                merged_gazetteers[key] |= value_set
            else:
                # First occurrence: store a copy so that later unions never
                # touch the caller's own set.
                merged_gazetteers[key] = value_set.copy()
    return merged_gazetteers


####################################################################################################
### PREPROCESSING OF GAZETTEERS ###################################################################
####################################################################################################
def remove_all_brackets(text):
    """
    Remove every bracketed span from ``text``.

    A span starts at ``(``, ``{`` or ``[`` and ends at the first following
    ``)``, ``}`` or ``]``; the brackets are removed together with their
    content. Surrounding whitespace is left untouched.

    :param text: the string to clean
    :return: ``text`` without the bracketed spans
    """
    return _BRACKETED_RE.sub('', text)


def lemmatizing(x):
    """
    Lemmatize a single token with simplemma using the Czech ("cs") model.

    :param x: the token to lemmatize
    :return: the lemma, or an empty string when ``x`` is empty
    """
    if x == "":
        return ""
    return lemmatize(x, lang="cs")


def multi_lemmatizing(x):
    """
    Lemmatize a phrase token by token.

    The phrase is split on single spaces, every token is passed through
    ``lemmatizing`` and the results are joined back with single spaces.

    :param x: the phrase to lemmatize
    :return: the lemmatized phrase without leading/trailing whitespace
    """
    return " ".join(lemmatizing(word) for word in x.split(" ")).strip()


def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
    """
    Invert a ``key -> values`` mapping into a ``value -> key`` mapping.

    When a value occurs under several keys, the key iterated last wins.

    :param dictionary: dict mapping a key to an iterable of values
    :param apply_lemmatizing: if True, the result of ``lemmatizing(value)``
        is registered under the same key as well whenever it differs from
        the value itself
    :return: dict mapping each value (and optionally its lemma) to its key
    """
    reverse_dictionary = {}
    for key, values in dictionary.items():
        for value in values:
            reverse_dictionary[value] = key
            if apply_lemmatizing:
                lemma = lemmatizing(value)
                if lemma != value:
                    reverse_dictionary[lemma] = key
    return reverse_dictionary


def split_gazetteers_for_single_token_match(gazetteers):
    """
    Break every gazetteer entry into its space-separated tokens.

    :param gazetteers: dict mapping a category to an iterable of entries
    :return: a new dict mapping each category to the set of tokens longer
        than two characters found in its entries
    """
    return {
        category: {
            token
            for entry in entries
            for token in entry.split(" ")
            if len(token) > 2
        }
        for category, entries in gazetteers.items()
    }


def preprocess_gazetteers(gazetteers, config):
    """
    Apply the preprocessing steps selected in ``config`` to the gazetteers.

    The steps run in this order:

    1. ``remove_brackets``: strip bracketed spans and surrounding whitespace,
       dropping entries that end up with two characters or fewer.
    2. ``split_person``: add the individual tokens (longer than two
       characters) of the "per" entries to the "per" category.
    3. ``techniq_for_matching == "single"``: replace every entry by its
       tokens; with ``lemmatize`` each token's lemma is added as well.
       Otherwise, with ``lemmatize``, each entry's token-wise lemmatized
       form is added. In both cases lemmatizing keeps only source entries
       longer than two characters.
    4. ``remove_numeric``: drop purely numeric entries.

    The input dictionary and its containers are left unmodified.

    :param gazetteers: dict mapping a category to an iterable (set or list)
        of string entries
    :param config: dict providing the keys "remove_brackets",
        "split_person", "techniq_for_matching", "lemmatize" and
        "remove_numeric"
    :return: a new dict mapping each category to a list of entries
    """
    # Shallow copy: the caller's dict must not be modified. Every step below
    # replaces containers instead of altering them in place.
    gazetteers = dict(gazetteers)

    if config["remove_brackets"]:
        cleaned = {}
        for category, values in gazetteers.items():
            # Clean each value once, then filter on the cleaned form.
            stripped = (remove_all_brackets(value).strip() for value in values)
            cleaned[category] = {value for value in stripped if len(value) > 2}
        gazetteers = cleaned

    # A gazetteer without a "per" category has nothing to split.
    if config["split_person"] and "per" in gazetteers:
        # Build a new set so that list inputs work and the caller's own
        # container stays untouched.
        persons = set(gazetteers["per"])
        persons |= {
            token
            for name in persons
            for token in name.split(" ")
            if len(token) > 2
        }
        gazetteers["per"] = persons

    if config["techniq_for_matching"] == "single":
        gazetteers = split_gazetteers_for_single_token_match(gazetteers)
        if config["lemmatize"]:
            # Entries are single tokens here, so lemmatize them directly.
            gazetteers = {
                category: {
                    form
                    for value in values
                    if len(value) > 2
                    for form in (value, lemmatizing(value))
                }
                for category, values in gazetteers.items()
            }
    elif config["lemmatize"]:
        # Entries may be multi-word phrases: lemmatize token by token.
        gazetteers = {
            category: {
                form
                for value in values
                if len(value) > 2
                for form in (value, multi_lemmatizing(value))
            }
            for category, values in gazetteers.items()
        }

    if config["remove_numeric"]:
        gazetteers = {
            category: {value for value in values if not value.isnumeric()}
            for category, values in gazetteers.items()
        }

    return {category: list(values) for category, values in gazetteers.items()}