# Multi-Word Expression Tokenizer
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Rob Malouf <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Multi-Word Expression Tokenizer
A ``MWETokenizer`` takes a string which has already been divided into tokens and
retokenizes it, merging multi-word expressions into single tokens, using a lexicon
of MWEs:
>>> from nltk.tokenize import MWETokenizer
>>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
>>> tokenizer.add_mwe(('in', 'spite', 'of'))
>>> tokenizer.tokenize('Testing testing testing one two three'.split())
['Testing', 'testing', 'testing', 'one', 'two', 'three']
>>> tokenizer.tokenize('This is a test in spite'.split())
['This', 'is', 'a', 'test', 'in', 'spite']
>>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']
"""
from nltk.tokenize.api import TokenizerI
from nltk.util import Trie


class MWETokenizer(TokenizerI):
"""A tokenizer that processes tokenized text and merges multi-word expressions
into single tokens.
"""
def __init__(self, mwes=None, separator="_"):
"""Initialize the multi-word tokenizer with a list of expressions and a
separator
:type mwes: list(list(str))
:param mwes: A sequence of multi-word expressions to be merged, where
each MWE is a sequence of strings.
:type separator: str
:param separator: String that should be inserted between words in a multi-word
expression token. (Default is '_')
"""
if not mwes:
mwes = []
self._mwes = Trie(mwes)
self._separator = separator
    def add_mwe(self, mwe):
        """Add a multi-word expression to the lexicon (stored as a word trie).

        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts,
        where the key ``True`` (``Trie.LEAF``) marks the end of a valid MWE.

        :param mwe: The multi-word expression we're adding into the word trie
        :type mwe: tuple(str) or list(str)

        :Example:

        >>> tokenizer = MWETokenizer()
        >>> tokenizer.add_mwe(('a', 'b'))
        >>> tokenizer.add_mwe(('a', 'b', 'c'))
        >>> tokenizer.add_mwe(('a', 'x'))
        >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
        >>> tokenizer._mwes == expected
        True
        """
        self._mwes.insert(mwe)

    def tokenize(self, text):
        """
        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A list of the tokenized text with multi-words merged together
        :rtype: list(str)

        :Example:

        >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
        >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
        ['An', "hors+d'oeuvre", 'tonight,', 'sir?']
        """
        i = 0
        n = len(text)
        result = []

        while i < n:
            if text[i] in self._mwes:
                # Possible MWE match: walk the trie as far as the input allows,
                # recording the end of the longest complete MWE seen so far.
                j = i
                trie = self._mwes
                last_match = -1
                while j < n and text[j] in trie:
                    trie = trie[text[j]]
                    j = j + 1
                    if Trie.LEAF in trie:
                        last_match = j
                # The walk may have continued past the last complete MWE into a
                # prefix that never completed; back off to the longest match.
                if last_match > -1:
                    j = last_match
                if Trie.LEAF in trie or last_match > -1:
                    # Success: merge tokens i..j-1 into a single token.
                    result.append(self._separator.join(text[i:j]))
                    i = j
                else:
                    # No complete MWE starts here; keep the token as-is.
                    result.append(text[i])
                    i += 1
            else:
                result.append(text[i])
                i += 1

        return result
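

if __name__ == "__main__":
    # Illustrative sketch, not part of the original module: a quick demo of the
    # greedy longest-match behavior of ``tokenize`` above. The lexicons and
    # sentences are made up for illustration.
    tokenizer = MWETokenizer([("a", "little"), ("a", "little", "bit")])
    # "a little bit" matches the longer of the two overlapping MWEs.
    print(tokenizer.tokenize("give me a little bit of time".split()))
    # ['give', 'me', 'a_little_bit', 'of', 'time']

    tokenizer = MWETokenizer([("a", "little"), ("a", "little", "bit", "more")])
    # The trie walk consumes "a little bit", but no complete MWE ends at "bit",
    # so the tokenizer backs off to the longest complete match, "a_little".
    print(tokenizer.tokenize("a little bit of luck".split()))
    # ['a_little', 'bit', 'of', 'luck']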