gyroing committed on
Commit
0830d03
1 Parent(s): 686e3d3

Create Hazm_correction.py

Browse files
Files changed (1) hide show
  1. Hazm_correction.py +37 -0
Hazm_correction.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hazm
2
+ import typing
3
+
4
# Shared hazm components: built once at import time and reused by the
# functions in this module.
normalizer = hazm.Normalizer()
sent_tokenizer = hazm.SentenceTokenizer()
word_tokenizer = hazm.WordTokenizer()

# POS tagger backed by the "pos_tagger.model" file.
# NOTE(review): the path is relative — presumably resolved against the
# current working directory; confirm where the model file is expected to live.
tagger = hazm.POSTagger(model="pos_tagger.model")
11
+
12
def preprocess_text(text: str) -> str:
    """Normalize text, fix its words sentence by sentence, and re-join it.

    The input is normalized with the module-level hazm normalizer, split into
    sentences, and each sentence is split into words. Every sentence's words
    are passed through ``fix_words`` and joined back with single spaces; the
    resulting sentences are then joined with single spaces as well.

    Args:
        text: Raw input text.

    Returns:
        The processed text as one string (not a nested list of words).
    """
    normalized = normalizer.normalize(text)

    processed_sentences = [
        " ".join(fix_words(word_tokenizer.tokenize(sentence)))
        for sentence in sent_tokenizer.tokenize(normalized)
    ]

    return " ".join(processed_sentences)
23
+
24
def fix_words(words: typing.List[str]) -> typing.List[str]:
    """Append a trailing "ِ" mark to words whose POS tag ends in "Z".

    The words are tagged with the module-level ``tagger``. A word is modified
    only when its tag ends in "Z" and the word does not already end in "ِ".
    If such a word ends in "ه" that is not preceded by "ا", the suffix "‌ی"
    is appended first, and then "ِ" is appended.

    Args:
        words: Tokens of a single sentence.

    Returns:
        A new list with one entry per ``(word, tag)`` pair yielded by the
        tagger, modified as described above.
    """
    fixed_words = []

    for word, pos in tagger.tag(words):
        # endswith() instead of [-1] / [-2] indexing: it cannot raise
        # IndexError on an empty tag, an empty word, or a one-character word.
        if word and pos.endswith("Z") and not word.endswith("ِ"):
            if word.endswith("ه") and not word.endswith("اه"):
                word += "‌ی"
            word += "ِ"

        fixed_words.append(word)

    return fixed_words