Spaces:
Running
Running
import hazm | |
import typing | |
normalizer = hazm.Normalizer() | |
sent_tokenizer = hazm.SentenceTokenizer() | |
word_tokenizer = hazm.WordTokenizer() | |
tagger = hazm.POSTagger( | |
model=str("pos_tagger.model") | |
) | |
def preprocess_text(text: str) -> typing.List[typing.List[str]]: | |
"""Split/normalize text into sentences/words with hazm""" | |
text = normalizer.normalize(text) | |
processed_sentences = [] | |
for sentence in sent_tokenizer.tokenize(text): | |
words = word_tokenizer.tokenize(sentence) | |
processed_words = fix_words(words) | |
processed_sentences.append(" ".join(processed_words)) | |
return " ".join(processed_sentences) | |
def fix_words(words: typing.List[str]) -> typing.List[str]: | |
fixed_words = [] | |
for word, pos in tagger.tag(words): | |
if pos[-1] == "Z": | |
if word[-1] != "ِ": | |
if (word[-1] == "ه") and (word[-2] != "ا"): | |
word += "ی" | |
word += "ِ" | |
fixed_words.append(word) | |
return fixed_words |