# Tests for the NLTK Snowball and Porter stemmers.
import unittest | |
from contextlib import closing | |
from nltk import data | |
from nltk.stem.porter import PorterStemmer | |
from nltk.stem.snowball import SnowballStemmer | |
class SnowballTest(unittest.TestCase):
    """Unit tests for the Snowball stemmers (Arabic, Russian, German,
    Spanish, English)."""

    def test_arabic(self):
        """
        Test the Snowball Arabic light stemmer, which strips
        prefixes and suffixes from the word.
        """
        # With ignore_stopwords=True, stop words are returned unchanged.
        ar_stemmer = SnowballStemmer("arabic", True)
        self.assertEqual(ar_stemmer.stem("الْعَرَبِــــــيَّة"), "عرب")
        self.assertEqual(ar_stemmer.stem("العربية"), "عرب")
        self.assertEqual(ar_stemmer.stem("فقالوا"), "قال")
        self.assertEqual(ar_stemmer.stem("الطالبات"), "طالب")
        self.assertEqual(ar_stemmer.stem("فالطالبات"), "طالب")
        self.assertEqual(ar_stemmer.stem("والطالبات"), "طالب")
        self.assertEqual(ar_stemmer.stem("الطالبون"), "طالب")
        self.assertEqual(ar_stemmer.stem("اللذان"), "اللذان")
        self.assertEqual(ar_stemmer.stem("من"), "من")
        # With ignore_stopwords=False, stop words are stemmed like any word.
        ar_stemmer = SnowballStemmer("arabic", False)
        self.assertEqual(ar_stemmer.stem("اللذان"), "اللذ")  # this is a stop word
        self.assertEqual(ar_stemmer.stem("الطالبات"), "طالب")
        self.assertEqual(ar_stemmer.stem("الكلمات"), "كلم")
        # Default construction (ignore_stopwords not given) behaves like False
        # for ordinary words.
        ar_stemmer = SnowballStemmer("arabic")
        self.assertEqual(ar_stemmer.stem("الْعَرَبِــــــيَّة"), "عرب")
        self.assertEqual(ar_stemmer.stem("العربية"), "عرب")
        self.assertEqual(ar_stemmer.stem("فقالوا"), "قال")
        self.assertEqual(ar_stemmer.stem("الطالبات"), "طالب")
        self.assertEqual(ar_stemmer.stem("الكلمات"), "كلم")

    def test_russian(self):
        """Russian stemming of a word with a diminutive suffix."""
        stemmer_russian = SnowballStemmer("russian")
        self.assertEqual(stemmer_russian.stem("авантненькая"), "авантненьк")

    def test_german(self):
        """German stemming with and without stop-word filtering."""
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)
        self.assertEqual(stemmer_german.stem("Schr\xe4nke"), "schrank")
        self.assertEqual(stemmer_german2.stem("Schr\xe4nke"), "schrank")
        # "keinen" is a German stop word: stemmed normally by default,
        # left untouched when stop words are ignored.
        self.assertEqual(stemmer_german.stem("keinen"), "kein")
        self.assertEqual(stemmer_german2.stem("keinen"), "keinen")

    def test_spanish(self):
        """Spanish stemming, including a regression case."""
        stemmer = SnowballStemmer("spanish")
        self.assertEqual(stemmer.stem("Visionado"), "vision")
        # The word 'algue' was raising an IndexError
        self.assertEqual(stemmer.stem("algue"), "algu")

    def test_short_strings_bug(self):
        """Very short tokens must not crash the English stemmer."""
        stemmer = SnowballStemmer("english")
        self.assertEqual(stemmer.stem("y's"), "y")
class PorterTest(unittest.TestCase):
    """Unit tests for the Porter stemmer in all three of its modes."""

    def _vocabulary(self):
        """Return Martin Porter's test vocabulary as a list of words."""
        with closing(
            data.find("stemmers/porter_test/porter_vocabulary.txt").open(
                encoding="utf-8"
            )
        ) as fp:
            return fp.read().splitlines()

    def _test_against_expected_output(self, stemmer_mode, expected_stems):
        """Stem every vocabulary word in *stemmer_mode* and compare each
        result against the corresponding entry of *expected_stems*."""
        stemmer = PorterStemmer(mode=stemmer_mode)
        for word, true_stem in zip(self._vocabulary(), expected_stems):
            our_stem = stemmer.stem(word)
            self.assertEqual(
                our_stem,
                true_stem,
                "{} should stem to {} in {} mode but got {}".format(
                    word,
                    true_stem,
                    stemmer_mode,
                    our_stem,
                ),
            )

    def test_vocabulary_martin_mode(self):
        """Tests all words from the test vocabulary provided by M Porter

        The sample vocabulary and output were sourced from
        https://tartarus.org/martin/PorterStemmer/voc.txt and
        https://tartarus.org/martin/PorterStemmer/output.txt
        and are linked to from the Porter Stemmer algorithm's homepage
        at https://tartarus.org/martin/PorterStemmer/
        """
        with closing(
            data.find("stemmers/porter_test/porter_martin_output.txt").open(
                encoding="utf-8"
            )
        ) as fp:
            self._test_against_expected_output(
                PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines()
            )

    def test_vocabulary_nltk_mode(self):
        """Check the NLTK-extensions mode against its reference output."""
        with closing(
            data.find("stemmers/porter_test/porter_nltk_output.txt").open(
                encoding="utf-8"
            )
        ) as fp:
            self._test_against_expected_output(
                PorterStemmer.NLTK_EXTENSIONS, fp.read().splitlines()
            )

    def test_vocabulary_original_mode(self):
        """Check the original-algorithm mode against its reference output."""
        # The list of stems for this test was generated by taking the
        # Martin-blessed stemmer from
        # https://tartarus.org/martin/PorterStemmer/c.txt
        # and removing all the --DEPARTURE-- sections from it and
        # running it against Martin's test vocabulary.
        #
        # NOTE: the original code ran this identical comparison a second
        # time via an unclosed file handle; the duplicate (and its
        # resource leak) has been removed.
        with closing(
            data.find("stemmers/porter_test/porter_original_output.txt").open(
                encoding="utf-8"
            )
        ) as fp:
            self._test_against_expected_output(
                PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines()
            )

    def test_oed_bug(self):
        """Test for bug https://github.com/nltk/nltk/issues/1581

        Ensures that 'oed' can be stemmed without throwing an error.
        """
        self.assertEqual(PorterStemmer().stem("oed"), "o")

    def test_lowercase_option(self):
        """Test for improvement on https://github.com/nltk/nltk/issues/2507

        Ensures that stems are lowercased when `to_lowercase=True`
        """
        porter = PorterStemmer()
        self.assertEqual(porter.stem("On"), "on")
        self.assertEqual(porter.stem("I"), "i")
        self.assertEqual(porter.stem("I", to_lowercase=False), "I")
        self.assertEqual(porter.stem("Github"), "github")
        self.assertEqual(porter.stem("Github", to_lowercase=False), "Github")