import unittest
from contextlib import closing
from nltk import data
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
class SnowballTest(unittest.TestCase):
def test_arabic(self):
"""
this unit testing for test the snowball arabic light stemmer
this stemmer deals with prefixes and suffixes
"""
        # Test with ignore_stopwords=True.
ar_stemmer = SnowballStemmer("arabic", True)
assert ar_stemmer.stem("الْعَرَبِــــــيَّة") == "عرب"
assert ar_stemmer.stem("العربية") == "عرب"
assert ar_stemmer.stem("فقالوا") == "قال"
assert ar_stemmer.stem("الطالبات") == "طالب"
assert ar_stemmer.stem("فالطالبات") == "طالب"
assert ar_stemmer.stem("والطالبات") == "طالب"
assert ar_stemmer.stem("الطالبون") == "طالب"
assert ar_stemmer.stem("اللذان") == "اللذان"
assert ar_stemmer.stem("من") == "من"
        # Test with ignore_stopwords=False.
ar_stemmer = SnowballStemmer("arabic", False)
assert ar_stemmer.stem("اللذان") == "اللذ" # this is a stop word
assert ar_stemmer.stem("الطالبات") == "طالب"
assert ar_stemmer.stem("الكلمات") == "كلم"
        # Test creating the Arabic stemmer without passing a value for ignore_stopwords.
ar_stemmer = SnowballStemmer("arabic")
assert ar_stemmer.stem("الْعَرَبِــــــيَّة") == "عرب"
assert ar_stemmer.stem("العربية") == "عرب"
assert ar_stemmer.stem("فقالوا") == "قال"
assert ar_stemmer.stem("الطالبات") == "طالب"
assert ar_stemmer.stem("الكلمات") == "كلم"
def test_russian(self):
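        """Spot-check the Russian Snowball stemmer on a single word."""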
stemmer_russian = SnowballStemmer("russian")
assert stemmer_russian.stem("авантненькая") == "авантненьк"
def test_german(self):
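        """Check German stemming with and without stopword filtering."""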
stemmer_german = SnowballStemmer("german")
stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)
assert stemmer_german.stem("Schr\xe4nke") == "schrank"
assert stemmer_german2.stem("Schr\xe4nke") == "schrank"
assert stemmer_german.stem("keinen") == "kein"
assert stemmer_german2.stem("keinen") == "keinen"
def test_spanish(self):
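        """Spot-check the Spanish Snowball stemmer, including a regression case."""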
stemmer = SnowballStemmer("spanish")
assert stemmer.stem("Visionado") == "vision"
# The word 'algue' was raising an IndexError
assert stemmer.stem("algue") == "algu"
def test_short_strings_bug(self):
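        """Regression check: very short tokens such as "y's" stem without errors."""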
stemmer = SnowballStemmer("english")
assert stemmer.stem("y's") == "y"
class PorterTest(unittest.TestCase):
def _vocabulary(self):
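        """Return the Porter test vocabulary as a list of words."""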
with closing(
data.find("stemmers/porter_test/porter_vocabulary.txt").open(
encoding="utf-8"
)
) as fp:
return fp.read().splitlines()
def _test_against_expected_output(self, stemmer_mode, expected_stems):
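        """Stem every vocabulary word and compare it with the expected stem."""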
stemmer = PorterStemmer(mode=stemmer_mode)
for word, true_stem in zip(self._vocabulary(), expected_stems):
our_stem = stemmer.stem(word)
assert (
our_stem == true_stem
), "{} should stem to {} in {} mode but got {}".format(
word,
true_stem,
stemmer_mode,
our_stem,
)
def test_vocabulary_martin_mode(self):
"""Tests all words from the test vocabulary provided by M Porter
The sample vocabulary and output were sourced from
https://tartarus.org/martin/PorterStemmer/voc.txt and
https://tartarus.org/martin/PorterStemmer/output.txt
and are linked to from the Porter Stemmer algorithm's homepage
at https://tartarus.org/martin/PorterStemmer/
"""
with closing(
data.find("stemmers/porter_test/porter_martin_output.txt").open(
encoding="utf-8"
)
) as fp:
self._test_against_expected_output(
PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines()
)
def test_vocabulary_nltk_mode(self):
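        """Tests the full Porter vocabulary in NLTK_EXTENSIONS mode."""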
with closing(
data.find("stemmers/porter_test/porter_nltk_output.txt").open(
encoding="utf-8"
)
) as fp:
self._test_against_expected_output(
PorterStemmer.NLTK_EXTENSIONS, fp.read().splitlines()
)
def test_vocabulary_original_mode(self):
# The list of stems for this test was generated by taking the
# Martin-blessed stemmer from
# https://tartarus.org/martin/PorterStemmer/c.txt
# and removing all the --DEPARTURE-- sections from it and
# running it against Martin's test vocabulary.
with closing(
data.find("stemmers/porter_test/porter_original_output.txt").open(
encoding="utf-8"
)
) as fp:
self._test_against_expected_output(
PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines()
)
def test_oed_bug(self):
"""Test for bug https://github.com/nltk/nltk/issues/1581
Ensures that 'oed' can be stemmed without throwing an error.
"""
assert PorterStemmer().stem("oed") == "o"
def test_lowercase_option(self):
"""Test for improvement on https://github.com/nltk/nltk/issues/2507
Ensures that stems are lowercased when `to_lowercase=True`
"""
porter = PorterStemmer()
assert porter.stem("On") == "on"
assert porter.stem("I") == "i"
assert porter.stem("I", to_lowercase=False) == "I"
assert porter.stem("Github") == "github"
assert porter.stem("Github", to_lowercase=False) == "Github"