|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import sys |
|
from indicnlp.normalize import indic_normalize |
|
from indicnlp.transliterate import unicode_transliterate |
|
from indicnlp import loader |
|
|
|
class AggressiveScriptUnifier():
    """Normalize text with per-language normalizers, then transliterate it.

    Every normalizer is requested from ``IndicNormalizerFactory`` with
    ``do_normalize_chandras``, ``do_normalize_vowel_ending`` and
    ``remove_nuktas`` switched on, plus the ``nasals_mode`` given to the
    constructor.  The languages 'pa', 'or', 'as' and 'ml' additionally
    receive the language-specific options listed in ``_init_normalizers``.
    """

    def __init__(self,common_lang='hi',nasals_mode='to_nasal_consonants'):
        """Store the settings and build one normalizer per known language.

        Args:
            common_lang: language code passed to the transliterator as the
                last argument (presumably the target script -- confirm
                against the transliterator's documentation).
            nasals_mode: forwarded unchanged to every normalizer.
        """
        self.common_lang = common_lang
        self.nasals_mode = nasals_mode
        self.do_normalize_chandras = True
        self.do_normalize_vowel_ending = True
        self.remove_nuktas = True
        self.normalizer_map = {}
        self._init_normalizers()

    def _init_normalizers(self):
        """Populate ``self.normalizer_map`` with a normalizer per language code."""
        normalizer_factory = indic_normalize.IndicNormalizerFactory()

        # Options shared by every language handled by this unifier.
        common_options = {
            'nasals_mode': self.nasals_mode,
            'do_normalize_chandras': self.do_normalize_chandras,
            'remove_nuktas': self.remove_nuktas,
            'do_normalize_vowel_ending': self.do_normalize_vowel_ending,
        }

        # Languages that need nothing beyond the shared options.
        for lang in ['hi','mr','sa','kK','ne','sd','bn','gu','ta','te','kn']:
            self.normalizer_map[lang] = normalizer_factory.get_normalizer(lang, **common_options)

        # Languages whose normalizers take extra, language-specific switches.
        language_specific_options = {
            'pa': {
                'do_canonicalize_addak': True,
                'do_canonicalize_tippi': True,
                'do_replace_vowel_bases': True,
            },
            'or': {'do_remap_wa': True},
            'as': {'do_remap_assamese_chars': True},
            'ml': {
                'do_canonicalize_chillus': True,
                'do_correct_geminated_T': True,
            },
        }
        for lang, extra_options in language_specific_options.items():
            options = dict(common_options)
            options.update(extra_options)
            self.normalizer_map[lang] = normalizer_factory.get_normalizer(lang, **options)

    def transform(self,text,lang):
        """Normalize ``text`` (when possible) and transliterate it.

        Args:
            text: the string to process.
            lang: language code of ``text``.

        Returns:
            The result of ``UnicodeIndicTransliterator.transliterate`` called
            with the (possibly normalized) text, ``lang`` and
            ``self.common_lang``.
        """
        # Same guard as BasicScriptUnifier.transform: a language without a
        # registered normalizer skips normalization instead of raising
        # KeyError, and is handed straight to the transliterator.
        if lang in self.normalizer_map:
            text = self.normalizer_map[lang].normalize(text)

        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, lang, self.common_lang)
|
|
|
class BasicScriptUnifier():
    """Apply default normalization for known languages, then transliterate.

    Normalizers are requested from ``IndicNormalizerFactory`` with only the
    ``nasals_mode`` option set; all other normalizer options keep whatever
    defaults the factory applies.
    """

    def __init__(self,common_lang='hi',nasals_mode='do_nothing'):
        """Remember the settings and build the per-language normalizers.

        Args:
            common_lang: language code passed to the transliterator as the
                last argument (presumably the target script -- confirm
                against the transliterator's documentation).
            nasals_mode: forwarded unchanged to every normalizer.
        """
        self.common_lang = common_lang
        self.nasals_mode = nasals_mode
        self.normalizer_map = {}
        self._init_normalizers()

    def _init_normalizers(self):
        """Fill ``self.normalizer_map`` with one normalizer per language code."""
        factory = indic_normalize.IndicNormalizerFactory()
        language_codes = (
            'hi', 'mr', 'sa', 'kK', 'ne', 'sd', 'bn', 'gu',
            'ta', 'te', 'kn', 'pa', 'or', 'as', 'ml',
        )
        self.normalizer_map.update(
            (code, factory.get_normalizer(code, nasals_mode=self.nasals_mode))
            for code in language_codes
        )

    def transform(self,text,lang):
        """Normalize ``text`` if ``lang`` has a normalizer, then transliterate.

        Args:
            text: the string to process.
            lang: language code of ``text``.

        Returns:
            The result of ``UnicodeIndicTransliterator.transliterate`` called
            with the (possibly normalized) text, ``lang`` and
            ``self.common_lang``.
        """
        try:
            normalizer = self.normalizer_map[lang]
        except KeyError:
            # No normalizer registered for this language: pass text through.
            prepared = text
        else:
            prepared = normalizer.normalize(text)

        transliterator = unicode_transliterate.UnicodeIndicTransliterator
        return transliterator.transliterate(prepared, lang, self.common_lang)
|
|
|
class NaiveScriptUnifier():
    """Transliterate text without performing any normalization first."""

    def __init__(self,common_lang='hi'):
        """Remember the language code handed to the transliterator.

        Args:
            common_lang: language code passed to the transliterator as the
                last argument (presumably the target script -- confirm
                against the transliterator's documentation).
        """
        self.common_lang = common_lang

    def transform(self,text,lang):
        """Return ``text`` transliterated with ``lang`` and ``self.common_lang``.

        Args:
            text: the string to process.
            lang: language code of ``text``.
        """
        transliterator = unicode_transliterate.UnicodeIndicTransliterator
        return transliterator.transliterate(text, lang, self.common_lang)
|
|
|
if __name__ == '__main__':
    # Command-line entry point:
    #     python script_unifier <command> <infile> <outfile> <language>
    # Reads <infile> line by line, applies the unifier selected by <command>
    # to each stripped line, and writes one result per line to <outfile>.

    usage = "Usage: python script_unifier <command> <infile> <outfile> <language>"

    loader.load()

    if len(sys.argv)<=4:
        print(usage)
        sys.exit(1)

    command, in_path, out_path, language = sys.argv[1:5]

    # Each command maps to a zero-argument factory, so a unifier (and its
    # normalizers) is only built for the command that was actually requested.
    unifier_factories = {
        'aggressive': lambda: AggressiveScriptUnifier(nasals_mode='to_nasal_consonants'),
        'moderate': lambda: AggressiveScriptUnifier(nasals_mode='do_nothing'),
        'basic': BasicScriptUnifier,
        'naive': NaiveScriptUnifier,
    }

    # Reject unknown commands explicitly, before the output file is opened,
    # instead of silently doing nothing and exiting with status 0.
    if command not in unifier_factories:
        print("Unknown command '{}'. Expected one of: {}".format(
            command, ', '.join(sorted(unifier_factories))), file=sys.stderr)
        print(usage, file=sys.stderr)
        sys.exit(1)

    unifier = unifier_factories[command]()

    with open(in_path,'r',encoding='utf-8') as ifile:
        with open(out_path,'w',encoding='utf-8') as ofile:
            # Stream the input rather than loading it all with readlines().
            for line in ifile:
                transliterated_line = unifier.transform(line.strip(), language)
                ofile.write(transliterated_line+'\n')
|
|