NerRoB-czech / data_manipulation /creation_gazetteers.py
AlzbetaStrompova
minor changes
75a65be
raw
history blame
4.04 kB
import os
import re
import json
import itertools
import pandas as pd
from simplemma import lemmatize
from names_dataset import NameDataset
def load_json(path):
    """
    Load gazetteers (or any other JSON document) from a file.

    The file is decoded explicitly as UTF-8 instead of the platform's
    default encoding, so entries containing non-ASCII characters are read
    the same way on every system.

    :param path: path to the JSON file
    :return: the deserialized content of the file
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)
def save_json(data, path):
    """
    Serialize ``data`` as JSON and write it to ``path``.

    The output is pretty-printed with an indentation of 4 spaces; an
    existing file at ``path`` is overwritten.

    :param data: JSON-serializable object to store (e.g. a dict of gazetteers)
    :param path: path of the file to write
    """
    with open(path, mode='w') as handle:
        json.dump(data, fp=handle, indent=4)
def merge_gazetteers(*gazetteers):
    """
    Combine any number of gazetteer dictionaries into one.

    Values are expected to behave like sets: they must provide ``copy()``
    and support the in-place union operator ``|=``.

    Returns:
        dict: A new dictionary holding every key of the inputs; values of a
        key that occurs in several inputs are unioned together.
    """
    combined = {}
    for gazetteer in gazetteers:
        for label, entries in gazetteer.items():
            if label not in combined:
                # First occurrence: store a copy so the caller's set is
                # never modified by later unions.
                combined[label] = entries.copy()
                continue
            # Label seen before: extend the accumulated set in place.
            combined[label] |= entries
    return combined
####################################################################################################
### PREPROCESSING OF GAZETTEERS ###################################################################
####################################################################################################
# Shortest span that starts with an opening bracket ( { [ and ends with the
# next closing bracket ) } ] on the same line; bracket types need not match.
_BRACKETED_SPAN = re.compile(r'[\(\{\[].*?[\)\}\]]')


def remove_all_brackets(text):
    """
    Delete every bracketed span (brackets included) from ``text``.

    Whitespace around a removed span is left as it was.

    :param text: string to clean
    :return: ``text`` without the bracketed spans
    """
    return _BRACKETED_SPAN.sub('', text)
def lemmatizing(x):
    """
    Lemmatize a single token with simplemma using the language code "cs".

    :param x: token to lemmatize
    :return: an empty string for empty input, otherwise the value returned
        by ``lemmatize(x, lang="cs")``
    """
    # The empty string is passed through without calling the lemmatizer.
    return lemmatize(x, lang="cs") if x != "" else ""
def multi_lemmatizing(x):
    """
    Lemmatize a phrase token by token.

    The phrase is split on single spaces, each piece goes through
    ``lemmatizing`` and the results are joined back with single spaces.

    :param x: phrase to lemmatize
    :return: the lemmatized phrase with leading/trailing whitespace stripped
    """
    lemmas = [lemmatizing(token) for token in x.split(" ")]
    return " ".join(lemmas).strip()
def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
    """
    Invert a ``key -> values`` mapping into a ``value -> key`` lookup.

    When a value appears under several keys, the key processed last wins.

    :param dictionary: mapping of a key to an iterable of values
    :param apply_lemmatizing: if true, the lemmatized form of every value
        (when it differs from the value itself) is registered under the same key
    :return: dict mapping each value (and optionally its lemma) to its key
    """
    lookup = {}
    for label, entries in dictionary.items():
        for entry in entries:
            lookup[entry] = label
            if not apply_lemmatizing:
                continue
            lemma = lemmatizing(entry)
            if lemma != entry:
                lookup[lemma] = label
    return lookup
def split_gazetteers_for_single_token_match(gazetteers):
    """
    Break every gazetteer entry into its space-separated tokens.

    :param gazetteers: dict mapping a label to an iterable of phrases
    :return: new dict mapping each label to the set of tokens that are
        longer than 2 characters
    """
    tokens_by_label = {}
    for label, phrases in gazetteers.items():
        tokens_by_label[label] = {
            token
            for phrase in phrases
            for token in phrase.split(" ")
            if len(token) > 2
        }
    return tokens_by_label
def preprocess_gazetteers(gazetteers, config):
    """
    Normalize gazetteers according to the switches in ``config``.

    The steps run in this order:

    1. ``remove_brackets``: strip bracketed spans and surrounding whitespace
       from every entry; entries left with 2 or fewer characters are dropped.
    2. ``split_person``: add the space-separated tokens (longer than 2
       characters) of every ``"per"`` entry to the ``"per"`` gazetteer.
       Skipped when there is no ``"per"`` gazetteer.
    3. ``techniq_for_matching == "single"``: replace every gazetteer by the
       set of its single tokens.
    4. ``lemmatize``: keep entries longer than 2 characters and add their
       lemmatized form (token lemmatizer in "single" mode, phrase
       lemmatizer otherwise).
    5. ``remove_numeric``: drop entries for which ``str.isnumeric()`` holds.

    Finally every gazetteer is converted to a list.

    The passed dict is updated in place, except in "single" mode where step 3
    builds a new dict that the later steps work on; always use the return
    value.

    :param gazetteers: dict mapping a label to an iterable of strings
        (sets, or lists such as those read from JSON)
    :param config: dict with the keys ``remove_brackets``, ``split_person``,
        ``techniq_for_matching``, ``lemmatize`` and ``remove_numeric``
    :return: dict mapping each label to a list of preprocessed entries
    """
    if config["remove_brackets"]:
        for label, values in gazetteers.items():
            # Clean every entry once, then filter on the cleaned text.
            stripped = (remove_all_brackets(value).strip() for value in values)
            gazetteers[label] = {value for value in stripped if len(value) > 2}

    if config["split_person"] and "per" in gazetteers:
        # Work on a set copy: values read from JSON are lists, which have no
        # update() method.
        persons = set(gazetteers["per"])
        persons.update(
            token
            for name in gazetteers["per"]
            for token in name.split(" ")
            if len(token) > 2
        )
        gazetteers["per"] = persons

    single_token = config["techniq_for_matching"] == "single"
    if single_token:
        gazetteers = split_gazetteers_for_single_token_match(gazetteers)

    if config["lemmatize"]:
        # Single tokens are lemmatized directly, phrases word by word.
        lemmatizer = lemmatizing if single_token else multi_lemmatizing
        for label, values in gazetteers.items():
            kept = [value for value in values if len(value) > 2]
            gazetteers[label] = set(kept).union(lemmatizer(value) for value in kept)

    if config["remove_numeric"]:
        for label, values in gazetteers.items():
            gazetteers[label] = {value for value in values if not value.isnumeric()}

    for label, values in gazetteers.items():
        gazetteers[label] = list(values)
    return gazetteers