File size: 914 Bytes
ce9c6ae
 
 
 
 
 
 
 
 
 
 
 
a4e98a4
ce9c6ae
 
 
a4e98a4
 
 
 
 
ce9c6ae
 
 
a4e98a4
 
ce9c6ae
 
 
a4e98a4
 
ce9c6ae
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import spacy
from spacy.tokenizer import Tokenizer


@spacy.registry.tokenizers("custom_tokenizer")
def create_custom_tokenizer():
    """Return the tokenizer factory registered as ``"custom_tokenizer"``.

    The returned callable accepts an ``nlp`` object and builds a
    ``Tokenizer`` on ``nlp.vocab`` whose prefix, infix and suffix patterns
    are ``nlp.Defaults`` extended with the extra patterns defined below.
    """
    # Extra patterns, appended after the language defaults.
    extra_infixes = [r"/", r"-", r",", r":", r"\+"]
    extra_prefixes = [r"-", r"\("]
    extra_suffixes = [r"\)", r"-"]

    def create_tokenizer(nlp):
        """Build the customised ``Tokenizer`` for the given ``nlp`` object."""
        defaults = nlp.Defaults
        all_infixes = defaults.infixes + extra_infixes
        all_prefixes = defaults.prefixes + extra_prefixes
        all_suffixes = defaults.suffixes + extra_suffixes

        # Compile each pattern list and keep only the bound matcher method
        # that the Tokenizer is handed.
        find_prefix = spacy.util.compile_prefix_regex(all_prefixes).search
        iter_infixes = spacy.util.compile_infix_regex(all_infixes).finditer
        find_suffix = spacy.util.compile_suffix_regex(all_suffixes).search

        # NOTE(review): no `rules`, `token_match` or `url_match` arguments
        # are passed here, so presumably the language's tokenizer exceptions
        # are not applied -- confirm this is intended.
        return Tokenizer(
            nlp.vocab,
            prefix_search=find_prefix,
            suffix_search=find_suffix,
            infix_finditer=iter_infixes,
        )

    return create_tokenizer