# Tests for the NLTK Snowball and Porter stemmers.
import unittest | |
from contextlib import closing | |
from nltk import data | |
from nltk.stem.porter import PorterStemmer | |
from nltk.stem.snowball import SnowballStemmer | |
class SnowballTest(unittest.TestCase):
    """Unit tests for the Snowball stemmers (Arabic, Russian, German,
    Spanish, English)."""

    def test_arabic(self):
        """
        Test the Snowball Arabic light stemmer, which strips
        prefixes and suffixes from the word.
        """
        # With ignore_stopwords=True, stop words are returned unchanged.
        ar_stemmer = SnowballStemmer("arabic", True)
        self.assertEqual(ar_stemmer.stem("الْعَرَبِــــــيَّة"), "عرب")
        self.assertEqual(ar_stemmer.stem("العربية"), "عرب")
        self.assertEqual(ar_stemmer.stem("فقالوا"), "قال")
        self.assertEqual(ar_stemmer.stem("الطالبات"), "طالب")
        self.assertEqual(ar_stemmer.stem("فالطالبات"), "طالب")
        self.assertEqual(ar_stemmer.stem("والطالبات"), "طالب")
        self.assertEqual(ar_stemmer.stem("الطالبون"), "طالب")
        self.assertEqual(ar_stemmer.stem("اللذان"), "اللذان")
        self.assertEqual(ar_stemmer.stem("من"), "من")
        # With ignore_stopwords=False, stop words are stemmed like any word.
        ar_stemmer = SnowballStemmer("arabic", False)
        self.assertEqual(ar_stemmer.stem("اللذان"), "اللذ")  # this is a stop word
        self.assertEqual(ar_stemmer.stem("الطالبات"), "طالب")
        self.assertEqual(ar_stemmer.stem("الكلمات"), "كلم")
        # Default construction (ignore_stopwords not given) behaves like False
        # for ordinary words.
        ar_stemmer = SnowballStemmer("arabic")
        self.assertEqual(ar_stemmer.stem("الْعَرَبِــــــيَّة"), "عرب")
        self.assertEqual(ar_stemmer.stem("العربية"), "عرب")
        self.assertEqual(ar_stemmer.stem("فقالوا"), "قال")
        self.assertEqual(ar_stemmer.stem("الطالبات"), "طالب")
        self.assertEqual(ar_stemmer.stem("الكلمات"), "كلم")

    def test_russian(self):
        """Russian stemming of a word with a diminutive suffix."""
        stemmer_russian = SnowballStemmer("russian")
        self.assertEqual(stemmer_russian.stem("авантненькая"), "авантненьк")

    def test_german(self):
        """German stemming with and without stop-word filtering."""
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)
        self.assertEqual(stemmer_german.stem("Schr\xe4nke"), "schrank")
        self.assertEqual(stemmer_german2.stem("Schr\xe4nke"), "schrank")
        # "keinen" is a German stop word: stemmed normally by default,
        # left untouched when stop words are ignored.
        self.assertEqual(stemmer_german.stem("keinen"), "kein")
        self.assertEqual(stemmer_german2.stem("keinen"), "keinen")

    def test_spanish(self):
        """Spanish stemming, including a regression case."""
        stemmer = SnowballStemmer("spanish")
        self.assertEqual(stemmer.stem("Visionado"), "vision")
        # The word 'algue' was raising an IndexError
        self.assertEqual(stemmer.stem("algue"), "algu")

    def test_short_strings_bug(self):
        """Very short tokens must not crash the English stemmer."""
        stemmer = SnowballStemmer("english")
        self.assertEqual(stemmer.stem("y's"), "y")
class PorterTest(unittest.TestCase):
    """Unit tests for the Porter stemmer in all three of its modes."""

    def _vocabulary(self):
        """Return Martin Porter's test vocabulary as a list of words."""
        with closing(
            data.find("stemmers/porter_test/porter_vocabulary.txt").open(
                encoding="utf-8"
            )
        ) as fp:
            return fp.read().splitlines()

    def _test_against_expected_output(self, stemmer_mode, expected_stems):
        """Stem every vocabulary word in *stemmer_mode* and compare each
        result against the corresponding entry of *expected_stems*."""
        stemmer = PorterStemmer(mode=stemmer_mode)
        for word, true_stem in zip(self._vocabulary(), expected_stems):
            our_stem = stemmer.stem(word)
            self.assertEqual(
                our_stem,
                true_stem,
                "{} should stem to {} in {} mode but got {}".format(
                    word,
                    true_stem,
                    stemmer_mode,
                    our_stem,
                ),
            )

    def test_vocabulary_martin_mode(self):
        """Tests all words from the test vocabulary provided by M Porter

        The sample vocabulary and output were sourced from
        https://tartarus.org/martin/PorterStemmer/voc.txt and
        https://tartarus.org/martin/PorterStemmer/output.txt
        and are linked to from the Porter Stemmer algorithm's homepage
        at https://tartarus.org/martin/PorterStemmer/
        """
        with closing(
            data.find("stemmers/porter_test/porter_martin_output.txt").open(
                encoding="utf-8"
            )
        ) as fp:
            self._test_against_expected_output(
                PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines()
            )

    def test_vocabulary_nltk_mode(self):
        """Check the NLTK-extensions mode against its reference output."""
        with closing(
            data.find("stemmers/porter_test/porter_nltk_output.txt").open(
                encoding="utf-8"
            )
        ) as fp:
            self._test_against_expected_output(
                PorterStemmer.NLTK_EXTENSIONS, fp.read().splitlines()
            )

    def test_vocabulary_original_mode(self):
        """Check the original-algorithm mode against its reference output."""
        # The list of stems for this test was generated by taking the
        # Martin-blessed stemmer from
        # https://tartarus.org/martin/PorterStemmer/c.txt
        # and removing all the --DEPARTURE-- sections from it and
        # running it against Martin's test vocabulary.
        #
        # NOTE: the original code ran this identical comparison a second
        # time via an unclosed file handle; the duplicate (and its
        # resource leak) has been removed.
        with closing(
            data.find("stemmers/porter_test/porter_original_output.txt").open(
                encoding="utf-8"
            )
        ) as fp:
            self._test_against_expected_output(
                PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines()
            )

    def test_oed_bug(self):
        """Test for bug https://github.com/nltk/nltk/issues/1581

        Ensures that 'oed' can be stemmed without throwing an error.
        """
        self.assertEqual(PorterStemmer().stem("oed"), "o")

    def test_lowercase_option(self):
        """Test for improvement on https://github.com/nltk/nltk/issues/2507

        Ensures that stems are lowercased when `to_lowercase=True`
        """
        porter = PorterStemmer()
        self.assertEqual(porter.stem("On"), "on")
        self.assertEqual(porter.stem("I"), "i")
        self.assertEqual(porter.stem("I", to_lowercase=False), "I")
        self.assertEqual(porter.stem("Github"), "github")
        self.assertEqual(porter.stem("Github", to_lowercase=False), "Github")