# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
#         Steven Bird <[email protected]> (minor additions)
# Contributors: matthewmc, clouds56
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

r"""

NLTK Tokenizer Package



Tokenizers divide strings into lists of substrings.  For example,

tokenizers can be used to find the words and punctuation in a string:



    >>> from nltk.tokenize import word_tokenize

    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me

    ... two of them.\n\nThanks.'''

    >>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE

    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',

    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']



This particular tokenizer requires the Punkt sentence tokenization

models to be installed. NLTK also provides a simpler,

regular-expression based tokenizer, which splits text on whitespace

and punctuation:



    >>> from nltk.tokenize import wordpunct_tokenize

    >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE

    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',

    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']



We can also operate at the level of sentences, using the sentence

tokenizer directly as follows:



    >>> from nltk.tokenize import sent_tokenize, word_tokenize

    >>> sent_tokenize(s)

    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']

    >>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE

    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],

    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]



Caution: when tokenizing a Unicode string, make sure you are not

using an encoded version of the string (it may be necessary to

decode it first, e.g. with ``s.decode("utf8")``.



NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers.  (These methods are implemented as generators.)

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
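
Because the spans have the same semantics as string slices, the original
substrings can be recovered directly from the text; a small illustration
using the same string ``s`` as above:

    >>> spans = WhitespaceTokenizer().span_tokenize(s)
    >>> [s[start:end] for (start, end) in spans] # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy',
    'me', 'two', 'of', 'them.', 'Thanks.']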

There are numerous ways to tokenize text.  If you need more control over
tokenization, see the other methods provided in this package.
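
For example, a custom :class:`.RegexpTokenizer` can be built around a pattern
of your own choosing; the pattern below (one possible choice, not a general
recommendation) keeps currency amounts such as ``$3.88`` as single tokens:

    >>> from nltk.tokenize import RegexpTokenizer
    >>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    >>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']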

For further information, please see Chapter 3 of the NLTK book.
"""

import re

from nltk.data import load
from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
from nltk.tokenize.destructive import NLTKWordTokenizer
from nltk.tokenize.legality_principle import LegalitySyllableTokenizer
from nltk.tokenize.mwe import MWETokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.regexp import (
    BlanklineTokenizer,
    RegexpTokenizer,
    WhitespaceTokenizer,
    WordPunctTokenizer,
    blankline_tokenize,
    regexp_tokenize,
    wordpunct_tokenize,
)
from nltk.tokenize.repp import ReppTokenizer
from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
from nltk.tokenize.simple import (
    LineTokenizer,
    SpaceTokenizer,
    TabTokenizer,
    line_tokenize,
)
from nltk.tokenize.sonority_sequencing import SyllableTokenizer
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


# Standard sentence tokenizer.
def sent_tokenize(text, language="english"):
    """

    Return a sentence-tokenized copy of *text*,

    using NLTK's recommended sentence tokenizer

    (currently :class:`.PunktSentenceTokenizer`

    for the specified language).



    :param text: text to split into sentences

    :param language: the model name in the Punkt corpus
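
    A short usage example (this assumes the Punkt sentence models have been
    downloaded, e.g. with ``nltk.download('punkt')``):

        >>> sent_tokenize("This is a sentence. This is another.")
        ['This is a sentence.', 'This is another.']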

    """
    tokenizer = load(f"tokenizers/punkt/{language}.pickle")
    return tokenizer.tokenize(text)


# Standard word tokenizer.
_treebank_word_tokenizer = NLTKWordTokenizer()


def word_tokenize(text, language="english", preserve_line=False):
    """

    Return a tokenized copy of *text*,

    using NLTK's recommended word tokenizer

    (currently an improved :class:`.TreebankWordTokenizer`

    along with :class:`.PunktSentenceTokenizer`

    for the specified language).



    :param text: text to split into words

    :type text: str

    :param language: the model name in the Punkt corpus

    :type language: str

    :param preserve_line: A flag to decide whether to sentence tokenize the text or not.

    :type preserve_line: bool
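
    A short usage example (this assumes the English Punkt models are
    available locally, e.g. after ``nltk.download('punkt')``):

        >>> word_tokenize("They'll save and invest more.")
        ['They', "'ll", 'save', 'and', 'invest', 'more', '.']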

    """
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [
        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
    ]