File size: 6,347 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import unittest
from contextlib import closing

from nltk import data
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer


class SnowballTest(unittest.TestCase):
    """Spot checks for ``SnowballStemmer`` across several languages."""

    def test_arabic(self):
        """Exercise the Snowball Arabic light stemmer.

        The stemmer deals with prefixes and suffixes.  It is built three
        ways: with ``ignore_stopwords`` passed as ``True``, passed as
        ``False``, and not passed at all.
        """
        # Stemmer created with ignore_stopwords=True.
        stopword_aware = SnowballStemmer("arabic", True)
        for word, stem in (
            ("الْعَرَبِــــــيَّة", "عرب"),
            ("العربية", "عرب"),
            ("فقالوا", "قال"),
            ("الطالبات", "طالب"),
            ("فالطالبات", "طالب"),
            ("والطالبات", "طالب"),
            ("الطالبون", "طالب"),
            # The last two inputs are expected to come back unchanged.
            ("اللذان", "اللذان"),
            ("من", "من"),
        ):
            assert stopword_aware.stem(word) == stem

        # Stemmer created with ignore_stopwords=False.
        stopword_blind = SnowballStemmer("arabic", False)
        for word, stem in (
            ("اللذان", "اللذ"),  # this is a stop word
            ("الطالبات", "طالب"),
            ("الكلمات", "كلم"),
        ):
            assert stopword_blind.stem(word) == stem

        # Stemmer created without giving any value for ignore_stopwords.
        default_stemmer = SnowballStemmer("arabic")
        for word, stem in (
            ("الْعَرَبِــــــيَّة", "عرب"),
            ("العربية", "عرب"),
            ("فقالوا", "قال"),
            ("الطالبات", "طالب"),
            ("الكلمات", "كلم"),
        ):
            assert default_stemmer.stem(word) == stem

    def test_russian(self):
        """Check one Russian word against its expected stem."""
        russian = SnowballStemmer("russian")
        assert russian.stem("авантненькая") == "авантненьк"

    def test_german(self):
        """Compare German stemming with and without ``ignore_stopwords``."""
        plain = SnowballStemmer("german")
        stopword_aware = SnowballStemmer("german", ignore_stopwords=True)

        # Each row: input word, stem from the plain stemmer, stem from the
        # stemmer built with ignore_stopwords=True.
        cases = (
            ("Schr\xe4nke", "schrank", "schrank"),
            ("keinen", "kein", "keinen"),
        )
        for word, plain_stem, stopword_aware_stem in cases:
            assert plain.stem(word) == plain_stem
            assert stopword_aware.stem(word) == stopword_aware_stem

    def test_spanish(self):
        """Check two Spanish words, one of them a former crash case."""
        spanish = SnowballStemmer("spanish")

        assert spanish.stem("Visionado") == "vision"

        # The word 'algue' was raising an IndexError
        assert spanish.stem("algue") == "algu"

    def test_short_strings_bug(self):
        """Check that the very short English input "y's" stems to "y"."""
        english = SnowballStemmer("english")
        assert english.stem("y's") == "y"


class PorterTest(unittest.TestCase):
    """Regression tests for ``PorterStemmer`` in each of its modes."""

    @staticmethod
    def _read_lines(resource):
        """Return the lines of a UTF-8 text resource found via ``data.find``.

        :param resource: resource path handed to ``data.find``.
        :return: list of the resource's lines, without line terminators.

        The stream is closed before returning, including when reading
        raises, so no file handle outlives the call.
        """
        with closing(data.find(resource).open(encoding="utf-8")) as fp:
            return fp.read().splitlines()

    def _vocabulary(self):
        """Return the words of the Porter test vocabulary, one per entry."""
        return self._read_lines("stemmers/porter_test/porter_vocabulary.txt")

    def _test_against_expected_output(self, stemmer_mode, expected_stems):
        """Stem every vocabulary word and compare with ``expected_stems``.

        :param stemmer_mode: value passed as ``mode`` to ``PorterStemmer``.
        :param expected_stems: expected stems, in the same order as the
            words returned by ``_vocabulary``.
        :raises AssertionError: on the first word whose stem differs.
        """
        stemmer = PorterStemmer(mode=stemmer_mode)
        # zip() pairs entries by position and stops at the shorter input.
        for word, true_stem in zip(self._vocabulary(), expected_stems):
            our_stem = stemmer.stem(word)
            assert (
                our_stem == true_stem
            ), "{} should stem to {} in {} mode but got {}".format(
                word,
                true_stem,
                stemmer_mode,
                our_stem,
            )

    def test_vocabulary_martin_mode(self):
        """Tests all words from the test vocabulary provided by M Porter

        The sample vocabulary and output were sourced from
        https://tartarus.org/martin/PorterStemmer/voc.txt and
        https://tartarus.org/martin/PorterStemmer/output.txt
        and are linked to from the Porter Stemmer algorithm's homepage
        at https://tartarus.org/martin/PorterStemmer/
        """
        self._test_against_expected_output(
            PorterStemmer.MARTIN_EXTENSIONS,
            self._read_lines("stemmers/porter_test/porter_martin_output.txt"),
        )

    def test_vocabulary_nltk_mode(self):
        """Check the vocabulary against the stored NLTK-mode output."""
        self._test_against_expected_output(
            PorterStemmer.NLTK_EXTENSIONS,
            self._read_lines("stemmers/porter_test/porter_nltk_output.txt"),
        )

    def test_vocabulary_original_mode(self):
        """Check the vocabulary against the stored original-mode output."""
        # The list of stems for this test was generated by taking the
        # Martin-blessed stemmer from
        # https://tartarus.org/martin/PorterStemmer/c.txt
        # and removing all the --DEPARTURE-- sections from it and
        # running it against Martin's test vocabulary.
        #
        # The comparison is run once; an earlier second pass repeated the
        # same check while leaving its data file unclosed.
        self._test_against_expected_output(
            PorterStemmer.ORIGINAL_ALGORITHM,
            self._read_lines("stemmers/porter_test/porter_original_output.txt"),
        )

    def test_oed_bug(self):
        """Test for bug https://github.com/nltk/nltk/issues/1581

        Ensures that 'oed' can be stemmed without throwing an error.
        """
        assert PorterStemmer().stem("oed") == "o"

    def test_lowercase_option(self):
        """Test for improvement on https://github.com/nltk/nltk/issues/2507

        Ensures that stems are lowercased when ``to_lowercase`` is not
        given, and that case is kept when ``to_lowercase=False`` is passed.
        """
        porter = PorterStemmer()
        assert porter.stem("On") == "on"
        assert porter.stem("I") == "i"
        assert porter.stem("I", to_lowercase=False) == "I"
        assert porter.stem("Github") == "github"
        assert porter.stem("Github", to_lowercase=False) == "Github"