Spaces:
Build error
Build error
File size: 6,282 Bytes
d984b22 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#Program for script unification of text written in Indic scripts: the text is (optionally) normalized and then transliterated to a common script
#
# @author Anoop Kunchukuttan
#
import sys
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate
from indicnlp import loader
class AggressiveScriptUnifier():
    """Normalize text with per-language normalizers, then transliterate it.

    A normalizer is built for each supported language code with the
    chandra, vowel-ending and nukta options switched on, plus a few
    language-specific options for 'pa', 'or', 'as' and 'ml'.
    ``transform`` applies the normalizer registered for the input
    language and then transliterates the result to ``common_lang``.
    """

    def __init__(self, common_lang='hi', nasals_mode='to_nasal_consonants'):
        """Store the configuration and build the per-language normalizers.

        Args:
            common_lang: language code passed as the transliteration target.
            nasals_mode: value forwarded as ``nasals_mode`` to every
                normalizer created by the factory.
        """
        self.common_lang = common_lang
        self.nasals_mode = nasals_mode
        self.do_normalize_chandras = True
        self.do_normalize_vowel_ending = True
        self.remove_nuktas = True
        self.normalizer_map = {}
        self._init_normalizers()

    def _init_normalizers(self):
        """Populate ``self.normalizer_map`` with one normalizer per language."""
        normalizer_factory = indic_normalize.IndicNormalizerFactory()

        # Keyword arguments shared by every normalizer built here.
        common_params = {
            'nasals_mode': self.nasals_mode,
            'do_normalize_chandras': self.do_normalize_chandras,
            'remove_nuktas': self.remove_nuktas,
            'do_normalize_vowel_ending': self.do_normalize_vowel_ending,
        }

        ## for languages with common parameters
        for lang in ['hi', 'mr', 'sa', 'kK', 'ne', 'sd', 'bn', 'gu', 'ta', 'te', 'kn']:
            self.normalizer_map[lang] = normalizer_factory.get_normalizer(lang, **common_params)

        ## for languages with language specific parameters
        # A list of pairs keeps the registration order of the original code.
        lang_specific_params = [
            ('pa', {'do_canonicalize_addak': True,
                    'do_canonicalize_tippi': True,
                    'do_replace_vowel_bases': True}),
            ('or', {'do_remap_wa': True}),
            ('as', {'do_remap_assamese_chars': True}),
            ('ml', {'do_canonicalize_chillus': True,
                    'do_correct_geminated_T': True}),
        ]
        for lang, extra_params in lang_specific_params:
            self.normalizer_map[lang] = normalizer_factory.get_normalizer(
                lang, **common_params, **extra_params)

    def transform(self, text, lang):
        """Normalize ``text`` and transliterate it from ``lang`` to the common script.

        Args:
            text: input string written in the script of ``lang``.
            lang: language code of the input text.

        Returns:
            The value returned by ``UnicodeIndicTransliterator.transliterate``
            for the (possibly normalized) text.
        """
        # Skip normalization when no normalizer is registered for the
        # language instead of raising KeyError; this matches the behaviour
        # of BasicScriptUnifier.transform.
        if lang in self.normalizer_map:
            text = self.normalizer_map[lang].normalize(text)
        text = unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, lang, self.common_lang)
        return text
class BasicScriptUnifier():
    """Apply default normalization for a language, then transliterate.

    Normalizers are created with only ``nasals_mode`` supplied; every other
    option is left to the factory's defaults.
    """

    def __init__(self, common_lang='hi', nasals_mode='do_nothing'):
        """Record the target language and nasals mode, then build normalizers."""
        self.common_lang = common_lang
        self.nasals_mode = nasals_mode
        self.normalizer_map = {}
        self._init_normalizers()

    def _init_normalizers(self):
        """Register one normalizer per supported language code."""
        factory = indic_normalize.IndicNormalizerFactory()
        supported_langs = [
            'hi', 'mr', 'sa', 'kK', 'ne', 'sd', 'bn', 'gu',
            'ta', 'te', 'kn', 'pa', 'or', 'as', 'ml',
        ]
        for code in supported_langs:
            self.normalizer_map[code] = factory.get_normalizer(
                code, nasals_mode=self.nasals_mode)

    def transform(self, text, lang):
        """Return ``text`` transliterated from ``lang`` to the common language.

        The text is normalized first when a normalizer is registered for
        ``lang``; otherwise it is passed to the transliterator unchanged.
        """
        if lang not in self.normalizer_map:
            prepared = text
        else:
            prepared = self.normalizer_map[lang].normalize(text)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            prepared, lang, self.common_lang)
class NaiveScriptUnifier():
    """Transliterate text to a common language without any normalization."""

    def __init__(self, common_lang='hi'):
        """Remember the language code used as the transliteration target."""
        self.common_lang = common_lang

    def transform(self, text, lang):
        """Return ``text`` transliterated from ``lang`` to ``self.common_lang``."""
        transliterator = unicode_transliterate.UnicodeIndicTransliterator
        return transliterator.transliterate(text, lang, self.common_lang)
if __name__ == '__main__':
    # Command-line driver: unify every line of <infile> and write it to <outfile>.

    loader.load()

    usage = "Usage: python script_unifier <command> <infile> <outfile> <language>"

    if len(sys.argv) <= 4:
        print(usage)
        sys.exit(1)

    command, infile, outfile, language = sys.argv[1:5]

    # Each command maps to a zero-argument callable that builds its unifier.
    # 'moderate' is the aggressive unifier with nasal handling turned off.
    unifier_factories = {
        'aggressive': lambda: AggressiveScriptUnifier(nasals_mode='to_nasal_consonants'),
        'moderate': lambda: AggressiveScriptUnifier(nasals_mode='do_nothing'),
        'basic': BasicScriptUnifier,
        'naive': NaiveScriptUnifier,
    }

    # Reject unknown commands explicitly instead of silently doing nothing,
    # and do so before the output file is created.
    if command not in unifier_factories:
        print(usage)
        print("Unknown command '{}'; expected one of: {}".format(
            command, ', '.join(unifier_factories)))
        sys.exit(1)

    unifier = unifier_factories[command]()

    with open(infile, 'r', encoding='utf-8') as ifile, \
            open(outfile, 'w', encoding='utf-8') as ofile:
        # Iterate the file object directly so the input is streamed line by
        # line rather than read into memory in one go.
        for line in ifile:
            transliterated_line = unifier.transform(line.strip(), language)
            ofile.write(transliterated_line + '\n')
|