import os | |
import re | |
import json | |
import itertools | |
import pandas as pd | |
from simplemma import lemmatize | |
from names_dataset import NameDataset | |
def load_json(path):
    """
    Load a JSON document (e.g. a gazetteer dictionary) from a file.

    The file is always decoded as UTF-8 instead of the platform's locale
    encoding, so non-ASCII entries are read the same way on every system.

    :param path: path to the JSON file
    :return: the deserialized JSON content (a dict for gazetteer files)
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)
def save_json(data, path):
    """
    Serialize data (e.g. a gazetteer dictionary) to a JSON file.

    The document is written with an indentation of four spaces. The file is
    opened explicitly as UTF-8 so the result does not depend on the
    platform's locale encoding.

    :param data: JSON-serializable object to store
    :param path: path to the output file (overwritten if it already exists)
    """
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)
def merge_gazetteers(*gazetteers):
    """
    Combine any number of gazetteer dictionaries into one.

    Each argument maps a key to a set of entries. Keys that occur in several
    inputs end up with the union of their sets.

    Returns:
        dict: A new dictionary holding every key of the inputs; the input
        sets themselves are left unmodified.
    """
    combined = {}
    for gazetteer in gazetteers:
        for label, entries in gazetteer.items():
            if label not in combined:
                # First occurrence: store a copy so later unions never touch
                # the caller's set.
                combined[label] = entries.copy()
                continue
            # Label already present: fold the new entries into the stored set.
            combined[label] |= entries
    return combined
####################################################################################################
### PREPROCESSING OF GAZETTEERS ####################################################################
####################################################################################################
def remove_all_brackets(text):
    """
    Delete every bracketed span from a string.

    A span starts at any of ``(``, ``{``, ``[`` and runs to the nearest
    following ``)``, ``}`` or ``]`` on the same line; the opening and closing
    characters do not have to be of the same kind.

    :param text: input string
    :return: the string with all such spans (brackets included) removed
    """
    bracketed_span = re.compile(r'[\(\{\[].*?[\)\}\]]')
    return bracketed_span.sub('', text)
def lemmatizing(x):
    """
    Lemmatize a single token with simplemma using the Czech ("cs") model.

    :param x: token to lemmatize
    :return: the lemma, or an empty string when the token is empty
    """
    # The empty string is returned as-is without calling the lemmatizer.
    return "" if x == "" else lemmatize(x, lang="cs")
def multi_lemmatizing(x):
    """
    Lemmatize a phrase word by word.

    The phrase is split on single spaces, each piece is passed through
    ``lemmatizing`` and the results are joined with single spaces again.

    :param x: phrase to lemmatize
    :return: the lemmatized phrase with leading/trailing whitespace removed
    """
    lemmas = [lemmatizing(token) for token in x.split(" ")]
    return " ".join(lemmas).strip()
def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
    """
    Invert a ``key -> values`` mapping into a ``value -> key`` lookup.

    When a value appears under several keys, the key processed last wins.

    :param dictionary: dict mapping a key to an iterable of values
    :param apply_lemmatizing: if True, the lemma of each value (as returned
        by ``lemmatizing``) is also mapped to the key whenever it differs
        from the value itself
    :return: dict mapping each value (and optionally its lemma) to its key
    """
    lookup = {}
    for label, entries in dictionary.items():
        for entry in entries:
            lookup[entry] = label
            if not apply_lemmatizing:
                continue
            lemma = lemmatizing(entry)
            if lemma != entry:
                lookup[lemma] = label
    return lookup
def split_gazetteers_for_single_token_match(gazetteers):
    """
    Break every gazetteer entry into its space-separated tokens.

    Tokens of two characters or fewer are discarded.

    :param gazetteers: dict mapping a key to an iterable of entry strings
    :return: a new dict mapping each key to the set of remaining tokens
    """
    single_tokens = {}
    for label, entries in gazetteers.items():
        tokens = set()
        for entry in entries:
            for token in entry.split(" "):
                if len(token) > 2:
                    tokens.add(token)
        single_tokens[label] = tokens
    return single_tokens
def preprocess_gazetteers(gazetteers, config):
    """
    Clean and expand gazetteers according to the switches in ``config``.

    The steps are applied in this order:

    1. ``remove_brackets``: strip bracketed spans from every entry and keep
       only entries whose stripped remainder is longer than two characters.
    2. ``split_person``: add every space-separated token longer than two
       characters of the ``"per"`` entries to the ``"per"`` gazetteer.
    3. ``techniq_for_matching == "single"``: break all entries into single
       tokens; with ``lemmatize`` the lemma of each token is added as well.
       For any other technique, ``lemmatize`` adds the word-by-word
       lemmatized form of each entry. In both cases the lemmatization step
       keeps only entries longer than two characters.
    4. ``remove_numeric``: drop entries for which ``str.isnumeric`` is true.

    The input is copied into sets first, so the caller's dictionary and its
    values are never modified, and list-valued gazetteers (e.g. loaded from
    JSON) are accepted as well as set-valued ones.

    :param gazetteers: dict mapping a label to an iterable of entry strings
    :param config: dict with the keys ``remove_brackets``, ``split_person``,
        ``techniq_for_matching``, ``lemmatize`` and ``remove_numeric``
    :return: a new dict mapping each label to a list of unique entries
        (list order is not defined)
    :raises KeyError: if a required config key is missing, or if
        ``split_person`` is enabled and there is no ``"per"`` gazetteer
    """
    # Work on private set copies: protects the caller's data and guarantees
    # that set operations below work even when the input values are lists.
    gazetteers = {label: set(entries) for label, entries in gazetteers.items()}

    if config["remove_brackets"]:
        cleaned = {}
        for label, entries in gazetteers.items():
            # Strip each entry once, then filter on the stripped form.
            stripped = (remove_all_brackets(entry).strip() for entry in entries)
            cleaned[label] = {entry for entry in stripped if len(entry) > 2}
        gazetteers = cleaned

    if config["split_person"]:
        name_parts = itertools.chain.from_iterable(
            name.split(" ") for name in gazetteers["per"]
        )
        # Materialize the new tokens before updating the set being read.
        extra_tokens = {part for part in name_parts if len(part) > 2}
        gazetteers["per"].update(extra_tokens)

    single_token = config["techniq_for_matching"] == "single"
    if single_token:
        gazetteers = split_gazetteers_for_single_token_match(gazetteers)

    if config["lemmatize"]:
        # Single tokens are lemmatized directly, phrases word by word.
        to_lemma = lemmatizing if single_token else multi_lemmatizing
        expanded = {}
        for label, entries in gazetteers.items():
            with_lemmas = set()
            for entry in entries:
                if len(entry) > 2:
                    with_lemmas.add(entry)
                    with_lemmas.add(to_lemma(entry))
            expanded[label] = with_lemmas
        gazetteers = expanded

    if config["remove_numeric"]:
        gazetteers = {
            label: {entry for entry in entries if not entry.isnumeric()}
            for label, entries in gazetteers.items()
        }

    # Lists are returned so the result is directly JSON-serializable.
    return {label: list(entries) for label, entries in gazetteers.items()}