# Natural Language Toolkit: Python port of the tok-tok.pl tokenizer.
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Jon Dehdari
# Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
"""
The tok-tok tokenizer is a simple, general tokenizer, where the input has one
sentence per line; hence only the final period is tokenized.

Tok-tok has been tested on, and gives reasonably good results for English,
Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
The input should be in UTF-8 encoding.

Reference:
Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
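
A quick usage example (an illustrative sketch; the tokenizer class is
defined below):

    >>> from nltk.tokenize.toktok import ToktokTokenizer
    >>> ToktokTokenizer().tokenize("Hello, world!")
    ['Hello', ',', 'world', '!']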
"""
import re

from nltk.tokenize.api import TokenizerI


class ToktokTokenizer(TokenizerI):
"""
This is a Python port of the tok-tok.pl from
https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl
>>> toktok = ToktokTokenizer()
>>> text = u'Is 9.5 or 525,600 my favorite number?'
>>> print(toktok.tokenize(text, return_str=True))
Is 9.5 or 525,600 my favorite number ?
>>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
>>> print(toktok.tokenize(text, return_str=True))
The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
>>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
>>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
>>> assert toktok.tokenize(text, return_str=True) == expected
>>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
True
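
    An additional illustrative example: currency symbols are padded and the
    sentence-final period is split off.

    >>> text = u'The prices are $5.50, €3 and £2.'
    >>> print(toktok.tokenize(text, return_str=True))
    The prices are $ 5.50 , € 3 and £ 2 .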
"""
    # Replace non-breaking spaces with normal spaces.
    NON_BREAKING = re.compile("\u00A0"), " "
    # Pad some funky punctuation.
    FUNKY_PUNCT_1 = re.compile(r'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
    # Pad more funky punctuation.
    FUNKY_PUNCT_2 = re.compile(r"([({\[“‘„‚«‹「『])"), r" \1 "
    # Pad en dash and em dash.
    EN_EM_DASHES = re.compile("([–—])"), r" \1 "
    # Replace problematic characters with character references.
    AMPERCENT = re.compile("& "), "&amp; "
    TAB = re.compile("\t"), " &#9; "
    PIPE = re.compile(r"\|"), " | "
    # Pad numbers with commas to keep them from further tokenization.
    COMMA_IN_NUM = re.compile(r"(?<!,)([,،])(?![,\d])"), r" \1 "
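    # (e.g. the comma in "525,600" is kept, but "Hi, there" -> "Hi , there".)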
    # Just pad problematic (often neurotic) hyphen/single quote, etc.
    PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r" \1 "
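    # (e.g. "don't" -> "don ' t".)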
    # Group ` ` stupid quotes ' ' into a single token.
    STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
    STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "
    # Don't tokenize period unless it ends the line and it isn't
    # preceded by another period, e.g.
    # "something ..." -> "something ..."
    # "something." -> "something ."
    FINAL_PERIOD_1 = re.compile(r"(?<!\.)\.$"), r" ."
    # Don't tokenize period unless it ends the line, e.g.
    # " ... stuff." -> "... stuff ."
    FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"
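    # (e.g. a period before a closing quote: 'He left."' -> 'He left . "'.)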
    # Treat continuous commas as fake German, Czech, etc.: „
    MULTI_COMMAS = re.compile(r"(,{2,})"), r" \1 "
    # Treat continuous dashes as fake en-dash, etc.
    MULTI_DASHES = re.compile(r"(-{2,})"), r" \1 "
    # Treat multiple periods as a thing (e.g. ellipsis).
    MULTI_DOTS = re.compile(r"(\.{2,})"), r" \1 "
    # This is the \p{Open_Punctuation} from Perl's perluniprops
    # see https://perldoc.perl.org/perluniprops.html
    OPEN_PUNCT = str(
        "([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d"
        "\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772"
        "\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983"
        "\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993"
        "\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26"
        "\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016"
        "\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39"
        "\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b"
        "\ufe5d\uff08\uff3b\uff5b\uff5f\uff62"
    )
    # This is the \p{Close_Punctuation} from Perl's perluniprops
    CLOSE_PUNCT = str(
        ")]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a"
        "\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6"
        "\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988"
        "\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998"
        "\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009"
        "\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b"
        "\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c"
        "\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e"
        "\uff09\uff3d\uff5d\uff60\uff63"
    )
    # This is the \p{Currency_Symbol} from Perl's perluniprops
    CURRENCY_SYM = str(
        "$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb"
        "\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3"
        "\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab"
        "\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3"
        "\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838"
        "\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6"
    )
    # Pad spaces after opening punctuation.
    OPEN_PUNCT_RE = re.compile(f"([{OPEN_PUNCT}])"), r"\1 "
    # Pad spaces after closing punctuation.
    CLOSE_PUNCT_RE = re.compile(f"([{CLOSE_PUNCT}])"), r"\1 "
    # Pad spaces after currency symbols.
    CURRENCY_SYM_RE = re.compile(f"([{CURRENCY_SYM}])"), r"\1 "
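    # (e.g. "(word" -> "( word" and "$5" -> "$ 5".)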
    # Use for tokenizing URL-unfriendly characters: [:/?#]
    URL_FOE_1 = re.compile(r":(?!//)"), r" : "  # in perl s{:(?!//)}{ : }g;
    URL_FOE_2 = re.compile(r"\?(?!\S)"), r" ? "  # in perl s{\?(?!\S)}{ ? }g;
    # in perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
    URL_FOE_3 = re.compile(r"(:\/\/)[\S+\.\S+\/\S+][\/]"), " / "
    URL_FOE_4 = re.compile(r" /"), r" / "  # s{ /}{ / }g;
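    # (e.g. "time: 12" -> "time : 12", while "https://www.nltk.org" keeps
    # its colon because of the (?!//) lookahead.)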
    # Left/Right strip, i.e. remove leading/trailing spaces.
    # These strip regexes should NOT be used;
    # instead use str.lstrip(), str.rstrip() or str.strip()
    # (They are kept for reference to the original toktok.pl code.)
    LSTRIP = re.compile(r"^ +"), ""
    RSTRIP = re.compile(r"\s+$"), "\n"
    # Merge multiple spaces.
    ONE_SPACE = re.compile(r" {2,}"), " "

    TOKTOK_REGEXES = [
        NON_BREAKING,
        FUNKY_PUNCT_1,
        URL_FOE_1,
        URL_FOE_2,
        URL_FOE_3,
        URL_FOE_4,
        AMPERCENT,
        TAB,
        PIPE,
        OPEN_PUNCT_RE,
        CLOSE_PUNCT_RE,
        MULTI_COMMAS,
        COMMA_IN_NUM,
        FINAL_PERIOD_2,
        PROB_SINGLE_QUOTES,
        STUPID_QUOTES_1,
        STUPID_QUOTES_2,
        CURRENCY_SYM_RE,
        EN_EM_DASHES,
        MULTI_DASHES,
        MULTI_DOTS,
        FINAL_PERIOD_1,
        FINAL_PERIOD_2,
        ONE_SPACE,
    ]

    def tokenize(self, text, return_str=False):
        text = str(text)  # Ensure the input is a string.
        # Apply each (regexp, substitution) pair in order.
        for regexp, substitution in self.TOKTOK_REGEXES:
            text = regexp.sub(substitution, text)
        # Finally, strip leading and trailing spaces.
        text = str(text.strip())
        return text if return_str else text.split()
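

# A minimal usage sketch (illustrative; not part of the original module):
if __name__ == "__main__":
    toktok = ToktokTokenizer()
    print(toktok.tokenize("Hey -- what's up?"))
    # -> ['Hey', '--', 'what', "'", 's', 'up', '?']
    print(toktok.tokenize("Hey -- what's up?", return_str=True))
    # -> Hey -- what ' s up ?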