import re

from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex, registry
from spacy.symbols import ORTH


@registry.tokenizers("customize_tokenizer")
def make_customize_tokenizer():
    # spaCy v3 tokenizer factory: the registry entry returns a callable that
    # receives the pipeline and builds the tokenizer.
    def customize_tokenizer(nlp):
        return custom_tokenizer(nlp)

    return customize_tokenizer
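
# A minimal sketch of how the factory is referenced from a spaCy v3 config
# (standard [nlp.tokenizer] schema; adapt to your own config file):
#
#   [nlp.tokenizer]
#   @tokenizers = "customize_tokenizer"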


# Letter class covering German umlauts plus common French/Italian accents.
EXTENDED_LETTER_RANGE = "A-Za-zäöüÄÖÜàòèéìù"
# dd.mm.yyyy dates; fixed width, so the pattern can be used in a lookbehind.
DATE = r"[0-3][0-9]\.[0-1][0-9]\.[1-2][0-9]{3}"
TOP_LEVEL_DOMAINS = "ch|at|de|com|edu|org|gov|net|fr|uk|be|es|pl|it|eu|nl|ba|cz|dk|al|ad|bg|by|fi|gr|ie|li|lu|no|pt|ro|rs|ru|se|si|sk"

# Split a dot glued to the end of a word of 3-29 letters, unless the dot is
# followed by a known top-level domain (to keep URLs intact). Python's re
# module only supports fixed-width lookbehinds, so one pattern is generated
# per word length.
DOT_AFTER_WORD = [
    rf"(?<!www\.)(?<=([a-zA-ZäöüÄÖÜ]){{{i}}})\.(?!({TOP_LEVEL_DOMAINS}))"
    for i in range(3, 30)
]

DOT_AFTER_DATE = rf"(?<=({DATE}))\."
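
# For instance (plain re; the joined pattern mirrors how spaCy compiles the
# rule list):
#
#   >>> import re
#   >>> pat = re.compile("|".join(DOT_AFTER_WORD))
#   >>> bool(pat.search("Haus."))       # trailing dot is split
#   True
#   >>> bool(pat.search("example.ch"))  # TLD keeps the dot attached
#   False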

infix_res = [
    r"[\(\[\]\)]",                                # brackets and parentheses
    r"(?<=\.--)\.",                               # dot following ".--"
    rf"\.(?=[{EXTENDED_LETTER_RANGE}]{{3,20}})",  # dot glued to a following word
    r"'\.\.",                                     # apostrophe + double dot
    *DOT_AFTER_WORD,
    r"[A-Z](?=\. )",                              # capital letter before ". "
    DOT_AFTER_DATE,                               # dot directly after a date
]

# Final dot after a chain of letter-dot pairs (e.g. "u.s.w."), again with one
# fixed-width lookbehind per possible chain length.
LETTER_DOUBLE_ENDING_DOT_VAR_LENGTH = [
    rf"(?<=([{EXTENDED_LETTER_RANGE}]\.){{{i}}})\." for i in range(1, 30)
]

suffix_res = [
    r"(?<=\d)[\.]",                 # dot after a digit
    r"(?<=[\.])[\]\)]",             # closing bracket after a dot
    rf"[\)\]](?=[\(\[\.{EXTENDED_LETTER_RANGE}0-9]+)",
    r"(?<=')\.\.",                  # double dot after an apostrophe
    r"\.\.\.",                      # ellipsis
    *LETTER_DOUBLE_ENDING_DOT_VAR_LENGTH,
    r"(?<=[A-Z])\.",                # dot after a single capital letter
]

# spaCy defaults that are removed or swapped in custom_tokenizer below. The
# *_FIXED variants keep ".." attached (only "..." and longer are split off)
# and no longer treat "-" between digits as a split point.
DOT_DOT_PLUS = r"\.\.+"
DOT_DOT_PLUS_FIXED = r"\.\.\.+"
NUMBER_DASH_NUMBER = r"(?<=[0-9])-(?=[0-9])"
NUMBER_SIGN_NUMBER = r"(?<=[0-9])[+\-\*^](?=[0-9-])"
NUMBER_SIGN_NUMBER_FIXED = r"(?<=[0-9])[+\*^](?=[0-9])"
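
# Quick check of the dot rule with plain re:
#
#   >>> import re
#   >>> bool(re.search(DOT_DOT_PLUS_FIXED, "Warten.."))   # ".." stays attached
#   False
#   >>> bool(re.search(DOT_DOT_PLUS_FIXED, "Warten..."))  # "..." is split off
#   True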


def custom_tokenizer(nlp):
    """Replace nlp's tokenizer with one tuned for abbreviation-heavy text."""
    nlp.tokenizer = Tokenizer(nlp.vocab)

    # Prefixes: spaCy's defaults are kept unchanged.
    prefix_regex = compile_prefix_regex(nlp.Defaults.prefixes)
    nlp.tokenizer.prefix_search = prefix_regex.search

    # Infixes: copy the defaults so the class-level sequence is not mutated,
    # drop the digit-dash-digit rule, and swap the operator rule for the
    # fixed variant before adding the custom patterns.
    infixes = list(nlp.Defaults.infixes)
    if NUMBER_DASH_NUMBER in infixes:
        infixes.remove(NUMBER_DASH_NUMBER)
    if NUMBER_SIGN_NUMBER in infixes:
        infixes.remove(NUMBER_SIGN_NUMBER)
        infixes.append(NUMBER_SIGN_NUMBER_FIXED)
    infixes += infix_res
    infix_regex = compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_regex.finditer

    # Suffixes: replace the greedy ".."-and-longer rule with the fixed
    # variant before adding the custom patterns.
    suffixes = list(nlp.Defaults.suffixes)
    if DOT_DOT_PLUS in suffixes:
        suffixes.remove(DOT_DOT_PLUS)
        suffixes.append(DOT_DOT_PLUS_FIXED)
    suffixes += suffix_res
    suffix_regex = compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search

    # Register abbreviations that must survive tokenization intact.
    for special_case, tokens in special_cases.items():
        nlp.tokenizer.add_special_case(special_case, tokens)

    # Treat a lone "[" as a complete token.
    nlp.tokenizer.token_match = re.compile(r"^\[$").search

    return nlp.tokenizer


# Special cases: abbreviations kept as single tokens, plus "Xyz.." entries
# where the abbreviation's own dot meets a sentence-final dot and the pair
# is split into the abbreviation and a separate ".".
special_cases = {
    "cf.": [{ORTH: "cf."}],
    "etc.": [{ORTH: "etc."}],
    "usw.": [{ORTH: "usw."}],
    "u.s.w.": [{ORTH: "u.s.w."}],
    "u.ä.": [{ORTH: "u.ä."}],
    "Liq..": [{ORTH: "Liq."}, {ORTH: "."}],
    "Cie..": [{ORTH: "Cie."}, {ORTH: "."}],
    "Co..": [{ORTH: "Co."}, {ORTH: "."}],
    "S.à.r.l.": [{ORTH: "S.à.r.l."}],
    "r.l.": [{ORTH: "r.l."}],
    "R.l.": [{ORTH: "R.l."}],
    "g.l.": [{ORTH: "g.l."}],
    "S.c.r.l.": [{ORTH: "S.c.r.l."}],
    "u.a.": [{ORTH: "u.a."}],
    "u.a.m.": [{ORTH: "u.a.m."}],
    "s.à.r.l.": [{ORTH: "s.à.r.l."}],
    "S.a.r.l.": [{ORTH: "S.a.r.l."}],
    "s.a.r.l.": [{ORTH: "s.a.r.l."}],
    "s.àr.l.": [{ORTH: "s.àr.l."}],
    "u.d.g.": [{ORTH: "u.d.g."}],
    "S.a.g.l.": [{ORTH: "S.a.g.l."}],
    "S.r.l.": [{ORTH: "S.r.l."}],
    "S.r.": [{ORTH: "S.r."}],
    "Ltd..": [{ORTH: "Ltd."}, {ORTH: "."}],
    "LTD..": [{ORTH: "LTD."}, {ORTH: "."}],
    "ltd..": [{ORTH: "ltd."}, {ORTH: "."}],
    "Corp..": [{ORTH: "Corp."}, {ORTH: "."}],
    "Inc..": [{ORTH: "Inc."}, {ORTH: "."}],
    "dgl..": [{ORTH: "dgl."}, {ORTH: "."}],
    "ect..": [{ORTH: "ect."}, {ORTH: "."}],
    "co..": [{ORTH: "co."}, {ORTH: "."}],
    "CO..": [{ORTH: "CO."}, {ORTH: "."}],
    "Ing..": [{ORTH: "Ing."}, {ORTH: "."}],
    "HRegV..": [{ORTH: "HRegV."}, {ORTH: "."}],
    "ehf..": [{ORTH: "ehf."}, {ORTH: "."}],
    "Gen..": [{ORTH: "Gen."}, {ORTH: "."}],
    "Var..": [{ORTH: "Var."}, {ORTH: "."}],
    "b.v..": [{ORTH: "b.v."}, {ORTH: "."}],
    "Dr..": [{ORTH: "Dr."}, {ORTH: "."}],
    "Br..": [{ORTH: "Br."}, {ORTH: "."}],
    "iu..": [{ORTH: "iu."}, {ORTH: "."}],
    "Ch..": [{ORTH: "Ch."}, {ORTH: "."}],
    "Inh..": [{ORTH: "Inh."}, {ORTH: "."}],
    "sf..": [{ORTH: "sf."}, {ORTH: "."}],
    "sen..": [{ORTH: "sen."}, {ORTH: "."}],
    "Std..": [{ORTH: "Std."}, {ORTH: "."}],
    "d.o.o..": [{ORTH: "d.o.o."}, {ORTH: "."}],
    "M.Sc..": [{ORTH: "M.Sc."}, {ORTH: "."}],
    "s.a..": [{ORTH: "s.a."}, {ORTH: "."}],
    "ag..": [{ORTH: "ag."}, {ORTH: "."}],
    "Fa..": [{ORTH: "Fa."}, {ORTH: "."}],
    "Ti..": [{ORTH: "Ti."}, {ORTH: "."}],
    "div..": [{ORTH: "div."}, {ORTH: "."}],
    "ä..": [{ORTH: "ä."}, {ORTH: "."}],
    "v.k.s.s..": [{ORTH: "v.k.s.s."}, {ORTH: "."}],
    "ecc..": [{ORTH: "ecc."}, {ORTH: "."}],
    "fed..": [{ORTH: "fed."}, {ORTH: "."}],
    "Psy-K..": [{ORTH: "Psy-K."}, {ORTH: "."}],
    "dipl.fed..": [{ORTH: "dipl.fed."}, {ORTH: "."}],
    "Jr..": [{ORTH: "Jr."}, {ORTH: "."}],
    "succ..": [{ORTH: "succ."}, {ORTH: "."}],
    "méd..": [{ORTH: "méd."}, {ORTH: "."}],
    "ass..": [{ORTH: "ass."}, {ORTH: "."}],
    "env..": [{ORTH: "env."}, {ORTH: "."}],
    "Int..": [{ORTH: "Int."}, {ORTH: "."}],
    "Chr..": [{ORTH: "Chr."}, {ORTH: "."}],
}
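
# A minimal usage sketch (assumes an installed pipeline, e.g. the German
# "de_core_news_sm"; the model name here is illustrative):
#
#   import spacy
#   nlp = spacy.load("de_core_news_sm")
#   custom_tokenizer(nlp)
#   doc = nlp("Die Muster AG (vormals Muster Ltd..) wurde am 01.02.2003 gegründet.")
#   print([t.text for t in doc])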