# Natural Language Toolkit: RSLP Stemmer
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Tiago Tresoldi <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

# This code is based on the algorithm presented in the paper "A Stemming
# Algorithm for the Portuguese Language" by Viviane Moreira Orengo and
# Christian Huyck, to which I unfortunately had no access. The code is a
# Python version, with some minor modifications of mine, of the description
# presented at https://www.webcitation.org/5NnvdIzOb and of the C source code
# available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html.
# Please note that this stemmer is intended for demonstration and educational
# purposes only. Feel free to write to me with any comments, including about
# the development of a different and/or better stemmer for Portuguese. I also
# suggest using NLTK's mailing list for Portuguese for any discussion.

from nltk.data import load
from nltk.stem.api import StemmerI


class RSLPStemmer(StemmerI):
"""
    A stemmer for Portuguese, based on the RSLP (Removedor de Sufixos da
    Língua Portuguesa) suffix-stripping algorithm.

>>> from nltk.stem import RSLPStemmer
>>> st = RSLPStemmer()
>>> # opening lines of Erico Verissimo's "Música ao Longe"
>>> text = '''
... Clarissa risca com giz no quadro-negro a paisagem que os alunos
... devem copiar . Uma casinha de porta e janela , em cima duma
... coxilha .'''
>>> for token in text.split(): # doctest: +NORMALIZE_WHITESPACE
... print(st.stem(token))
clariss risc com giz no quadro-negr a pais que os alun dev copi .
uma cas de port e janel , em cim dum coxilh .
"""
def __init__(self):
self._model = []
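        # load the seven RSLP rule sets; their indices match the step
        # numbers used by stem() below: 0 plural, 1 feminine, 2 adverb,
        # 3 augmentative, 4 noun, 5 verb, 6 vowel removal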
self._model.append(self.read_rule("step0.pt"))
self._model.append(self.read_rule("step1.pt"))
self._model.append(self.read_rule("step2.pt"))
self._model.append(self.read_rule("step3.pt"))
self._model.append(self.read_rule("step4.pt"))
self._model.append(self.read_rule("step5.pt"))
self._model.append(self.read_rule("step6.pt"))
def read_rule(self, filename):
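        """
        Load one rule file from the NLTK data package and parse it into a
        list of [suffix, min_stem_size, replacement, exceptions] entries.
        """
        # Each non-comment, non-blank line is expected to carry four
        # tab-separated fields, roughly of this shape (an illustrative
        # sketch, not a line copied from the actual data files):
        #     "inho"<TAB>3<TAB>""<TAB>"caminho","carinho"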
rules = load("nltk:stemmers/rslp/" + filename, format="raw").decode("utf8")
lines = rules.split("\n")
lines = [line for line in lines if line != ""] # remove blank lines
lines = [line for line in lines if line[0] != "#"] # remove comments
        # NOTE: a simple but ugly hack to collapse double '\t's, which would
        # otherwise produce empty fields when splitting on '\t'
lines = [line.replace("\t\t", "\t") for line in lines]
# parse rules
rules = []
for line in lines:
rule = []
tokens = line.split("\t")
# text to be searched for at the end of the string
rule.append(tokens[0][1:-1]) # remove quotes
# minimum stem size to perform the replacement
rule.append(int(tokens[1]))
# text to be replaced into
rule.append(tokens[2][1:-1]) # remove quotes
# exceptions to this rule
rule.append([token[1:-1] for token in tokens[3].split(",")])
# append to the results
rules.append(rule)
        return rules

def stem(self, word):
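        """
        Stem a Portuguese word by applying, in order, the plural,
        feminine, augmentative, adverb and noun reductions; verb
        reduction and final vowel removal are attempted only when the
        preceding step left the word unchanged.
        """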
word = word.lower()
        # word ends in 's'? apply the plural reduction rules
        if word.endswith("s"):
            word = self.apply_rule(word, 0)
        # word ends in 'a'? apply the feminine reduction rules
        if word.endswith("a"):
            word = self.apply_rule(word, 1)
# augmentative reduction
word = self.apply_rule(word, 3)
# adverb reduction
word = self.apply_rule(word, 2)
        # noun reduction; if it leaves the word unchanged, fall back to
        # verb reduction and, failing that, to final vowel removal
prev_word = word
word = self.apply_rule(word, 4)
if word == prev_word:
# verb reduction
prev_word = word
word = self.apply_rule(word, 5)
if word == prev_word:
# vowel removal
word = self.apply_rule(word, 6)
        return word

def apply_rule(self, word, rule_index):
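        # each rule is [suffix, min_stem_size, replacement, exceptions],
        # as parsed by read_rule(); the first rule whose suffix matches,
        # whose minimum stem size is respected and which is not blocked
        # by an exception is applied, and no further rules are tried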
rules = self._model[rule_index]
for rule in rules:
suffix_length = len(rule[0])
            if word.endswith(rule[0]):  # if suffix matches
if len(word) >= suffix_length + rule[1]: # if we have minimum size
if word not in rule[3]: # if not an exception
word = word[:-suffix_length] + rule[2]
break
return word
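

# A minimal usage sketch (assumes the RSLP rule files are available
# locally, e.g. after running nltk.download("rslp")):
if __name__ == "__main__":
    stemmer = RSLPStemmer()
    for token in ["alunos", "casinha", "paisagem"]:
        print(token, "->", stemmer.stem(token))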