# Multi-Word Expression tokenizer
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Rob Malouf <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""

Multi-Word Expression Tokenizer



A ``MWETokenizer`` takes a string which has already been divided into tokens and

retokenizes it, merging multi-word expressions into single tokens, using a lexicon

of MWEs:





    >>> from nltk.tokenize import MWETokenizer



    >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])

    >>> tokenizer.add_mwe(('in', 'spite', 'of'))



    >>> tokenizer.tokenize('Testing testing testing one two three'.split())

    ['Testing', 'testing', 'testing', 'one', 'two', 'three']



    >>> tokenizer.tokenize('This is a test in spite'.split())

    ['This', 'is', 'a', 'test', 'in', 'spite']



    >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())

    ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']



"""
from nltk.tokenize.api import TokenizerI
from nltk.util import Trie


class MWETokenizer(TokenizerI):
    """A tokenizer that processes tokenized text and merges multi-word expressions

    into single tokens.

    """

    def __init__(self, mwes=None, separator="_"):
        """Initialize the multi-word tokenizer with a list of expressions and a

        separator



        :type mwes: list(list(str))

        :param mwes: A sequence of multi-word expressions to be merged, where

            each MWE is a sequence of strings.

        :type separator: str

        :param separator: String that should be inserted between words in a multi-word

            expression token. (Default is '_')



        """
        if not mwes:
            mwes = []
        self._mwes = Trie(mwes)
        self._separator = separator

    def add_mwe(self, mwe):
        """Add a multi-word expression to the lexicon (stored as a word trie)



        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts.

        The key True marks the end of a valid MWE.



        :param mwe: The multi-word expression we're adding into the word trie

        :type mwe: tuple(str) or list(str)



        :Example:



        >>> tokenizer = MWETokenizer()

        >>> tokenizer.add_mwe(('a', 'b'))

        >>> tokenizer.add_mwe(('a', 'b', 'c'))

        >>> tokenizer.add_mwe(('a', 'x'))

        >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}

        >>> tokenizer._mwes == expected

        True



        """
        self._mwes.insert(mwe)

    def tokenize(self, text):
        """



        :param text: A list containing tokenized text

        :type text: list(str)

        :return: A list of the tokenized text with multi-words merged together

        :rtype: list(str)



        :Example:



        >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')

        >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())

        ['An', "hors+d'oeuvre", 'tonight,', 'sir?']

        """
        i = 0
        n = len(text)
        result = []

        while i < n:
            if text[i] in self._mwes:
                # possible MWE match: walk the trie as far as the input
                # allows, remembering where the last complete MWE ended
                j = i
                trie = self._mwes
                last_match = -1
                while j < n and text[j] in trie:
                    trie = trie[text[j]]
                    j = j + 1
                    if Trie.LEAF in trie:
                        last_match = j
                else:
                    # this else clause always runs, since the while loop
                    # above never breaks
                    if last_match > -1:
                        # fall back to the end of the last complete MWE
                        j = last_match

                    if Trie.LEAF in trie or last_match > -1:
                        # success!
                        result.append(self._separator.join(text[i:j]))
                        i = j
                    else:
                        # no match, so backtrack
                        result.append(text[i])
                        i += 1
            else:
                result.append(text[i])
                i += 1
        return result
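

# A minimal usage sketch (an illustrative addition, not part of the original
# NLTK module): run this file directly to see MWEs from a small lexicon being
# merged, here with a space as the separator.
if __name__ == "__main__":
    tokenizer = MWETokenizer([("New", "York"), ("Hong", "Kong")], separator=" ")
    print(tokenizer.tokenize("I flew from New York to Hong Kong".split()))
    # prints: ['I', 'flew', 'from', 'New York', 'to', 'Hong Kong']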