import sys, string, itertools, re, os

from collections import defaultdict

from indicnlp import common
from indicnlp import langinfo

from indicnlp.script import indic_scripts as isc
from indicnlp.transliterate.sinhala_transliterator import SinhalaDevanagariTransliterator as sdt

import pandas as pd

# offset within the script's Unicode block -> ITRANS representation
OFFSET_TO_ITRANS={}
# ITRANS representation -> list of script offsets it can denote
ITRANS_TO_OFFSET=defaultdict(list)

# alternative ITRANS spellings, normalized to a canonical form before parsing
DUPLICATE_ITRANS_REPRESENTATIONS={}

def init():
    """
    To be called by the library loader; do not call it in your program
    """

    # Load the ITRANS-script offset map shipped with the Indic NLP resources
    itrans_map_fname=os.path.join(common.get_resources_path(),'transliterate','offset_itrans_map.csv')

    itrans_df=pd.read_csv(itrans_map_fname,encoding='utf-8')

    global OFFSET_TO_ITRANS, ITRANS_TO_OFFSET, DUPLICATE_ITRANS_REPRESENTATIONS

    for r in itrans_df.iterrows():
        itrans=r[1]['itrans']
        o=int(r[1]['offset_hex'],base=16)

        OFFSET_TO_ITRANS[o]=itrans

        if langinfo.is_consonant_offset(o):
            # for consonants, index by the ITRANS string without the inherent schwa
            # and pair the consonant offset with the halanta offset (0x4d)
            ITRANS_TO_OFFSET[itrans[:-1]].extend([o,0x4d])
        else:
            ITRANS_TO_OFFSET[itrans].append(o)
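    # Net effect (illustrative, assuming the standard map): a consonant row such as
    # ITRANS 'ka' yields ITRANS_TO_OFFSET['k'] == [0x15, 0x4d] (consonant + halanta),
    # while a vowel's independent and matra forms both accumulate under the same key.
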
    DUPLICATE_ITRANS_REPRESENTATIONS = {
        'A': 'aa',
        'I': 'ii',
        'U': 'uu',
        'RRi': 'R^i',
        'RRI': 'R^I',
        'LLi': 'L^i',
        'LLI': 'L^I',
        'L': 'ld',
        'w': 'v',
        'x': 'kSh',
        'gj': 'j~n',
        'dny': 'j~n',
        '.n': '.m',
        'M': '.m',
        'OM': 'AUM'
    }


class UnicodeIndicTransliterator(object):
    """
    Base class for rule-based transliteration among Indian languages.

    Script pair specific transliterators should derive from this class and override the transliterate() method.
    They can call the superclass transliterate() method to use the common transliteration logic.
    """

    @staticmethod
    def _correct_tamil_mapping(offset):
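        # Map a coordinated-range offset to one that exists in the Tamil script.
        # Worked example (illustrative): offset 0x16 (aspirated 'kha') falls in the
        # plosive range, so subst_char=(0x16-0x15)//5==0 and it is replaced by
        # 0x15 ('ka'), the first member of its varga.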
        # Tamil does not have the aspirated/voiced plosive distinctions of the other
        # scripts: map each plosive in the ka-, ca-, Ta- and ta-vargas to the first
        # (unvoiced, unaspirated) member of its varga, keeping the nasals and 'ja',
        # which Tamil does have.
        if offset>=0x15 and offset<=0x28 and \
                offset!=0x1c and \
                not ( (offset-0x15)%5==0 or (offset-0x15)%5==4 ) :
            subst_char=(offset-0x15)//5
            offset=0x15+5*subst_char

        # the pa-varga lies outside the range above: map 'pha', 'ba' and 'bha' to 'pa'
        if offset in [ 0x2b, 0x2c, 0x2d]:
            offset=0x2a

        # 'sha' is mapped to 'ssa', which Tamil has
        if offset==0x36:
            offset=0x37

        return offset

    @staticmethod
    def transliterate(text,lang1_code,lang2_code):
        """
        Convert text from the source language script (lang1) to the target language script (lang2)

        text: text to transliterate
        lang1_code: language 1 code
        lang2_code: language 2 code
        """
        if lang1_code in langinfo.SCRIPT_RANGES and lang2_code in langinfo.SCRIPT_RANGES:

            # if the source script is Sinhala, convert to Devanagari first and
            # transliterate from Devanagari
            if lang1_code=='si':
                text=sdt.sinhala_to_devanagari(text)
                lang1_code='hi'

            # if the target script is Sinhala, transliterate to Devanagari and
            # convert the result to Sinhala at the end
            org_lang2_code=''
            if lang2_code=='si':
                lang2_code='hi'
                org_lang2_code='si'

            trans_lit_text=[]
            for c in text:
                newc=c
                offset=ord(c)-langinfo.SCRIPT_RANGES[lang1_code][0]
                if offset >=langinfo.COORDINATED_RANGE_START_INCLUSIVE and offset <= langinfo.COORDINATED_RANGE_END_INCLUSIVE:
                    if lang2_code=='ta':
                        # Tamil-specific mapping exceptions
                        offset=UnicodeIndicTransliterator._correct_tamil_mapping(offset)
                    newc=chr(langinfo.SCRIPT_RANGES[lang2_code][0]+offset)

                trans_lit_text.append(newc)

            if org_lang2_code=='si':
                return sdt.devanagari_to_sinhala(''.join(trans_lit_text))

            return ''.join(trans_lit_text)
        else:
            return text


class ItransTransliterator(object):
    """
    Transliterator between Indian scripts and ITRANS
    """

    @staticmethod
    def to_itrans(text,lang_code):
        if lang_code in langinfo.SCRIPT_RANGES:
            if lang_code=='ml':
                # replace Malayalam chillu characters with the corresponding consonant + halanta
                text=text.replace('\u0d7a','\u0d23\u0d4d')
                text=text.replace('\u0d7b','\u0d28\u0d4d')
                text=text.replace('\u0d7c','\u0d30\u0d4d')
                text=text.replace('\u0d7d','\u0d32\u0d4d')
                text=text.replace('\u0d7e','\u0d33\u0d4d')
                text=text.replace('\u0d7f','\u0d15\u0d4d')

            offsets = [ isc.get_offset(c,lang_code) for c in text ]

            itrans_l=[]
            for o in offsets:
                itrans=OFFSET_TO_ITRANS.get(o, chr(langinfo.SCRIPT_RANGES[lang_code][0]+o) )
                if langinfo.is_halanta_offset(o):
                    # a halanta suppresses the inherent schwa: drop the trailing vowel character
                    itrans=''
                    if len(itrans_l)>0:
                        itrans_l.pop()
                elif langinfo.is_vowel_sign_offset(o) and len(itrans_l)>0:
                    # a vowel sign (matra) replaces the inherent schwa of the previous consonant
                    itrans_l.pop()
                itrans_l.extend(itrans)

            return ''.join(itrans_l)

        else:
            return text

    @staticmethod
    def from_itrans(text,lang):
        """
        Convert an ITRANS-romanized string to the script of the language `lang`.

        TODO: Document this method properly
        TODO: A little hack is used to handle schwa: needs to be documented
        TODO: check for robustness
        """

        # bound on the length of ITRANS codes tried during matching
        MAXCODE=4

        # normalize alternative ITRANS spellings to their canonical forms
        for k, v in DUPLICATE_ITRANS_REPRESENTATIONS.items():
            if k in text:
                text=text.replace(k,v)

        start=0
        match=None
        solution=[]

        i=start+1
        while i<=len(text):

            itrans=text[start:i]

            if itrans in ITRANS_TO_OFFSET:
                offs=ITRANS_TO_OFFSET[itrans]

                # a vowel ITRANS code lists both the independent vowel and its matra:
                # use the matra after a consonant (i.e. after a halanta), else the independent vowel
                if len(offs)==2 and \
                    langinfo.is_vowel_offset(offs[0]):

                    if len(solution)>0 and langinfo.is_halanta(solution[-1],lang):
                        offs=[offs[1]]
                    else:
                        offs=[offs[0]]

                c=''.join([ langinfo.offset_to_char(x,lang) for x in offs ])
                match=(i,c)

            elif len(itrans)==1:
                # unknown single character: pass it through unchanged
                match=(i,itrans)
            elif i<len(text) and (i-start)<MAXCODE+1:
                # no match for this substring yet: try a longer one
                i=i+1
                continue
            else:
                # commit the longest match found so far and restart scanning after it
                solution.extend(match[1])

                start=match[0]
                i=start
                match=None

            i=i+1

        # commit any pending match at the end of the input
        if match is not None:
            solution.extend(match[1])

        temp_out=list(''.join(solution))
        rem_indices=[]
        for i in range(len(temp_out)-1):
            # drop the halanta inserted after a consonant when an explicit vowel sign,
            # a nukta, or the placeholder used by the schwa hack (offset 0x7f) follows it
            if langinfo.is_halanta(temp_out[i],lang) and \
               (langinfo.is_vowel_sign(temp_out[i+1],lang) \
                or langinfo.is_nukta(temp_out[i+1],lang) \
                or temp_out[i+1]==langinfo.offset_to_char(0x7f,lang)):
                rem_indices.append(i)

        for i in reversed(rem_indices):
            temp_out.pop(i)

        out=''.join(temp_out)

        # strip the placeholder character used by the schwa hack
        out=out.replace(langinfo.offset_to_char(0x7f,lang),'')

        return out


if __name__ == '__main__':

    if len(sys.argv)<4:
        print("Usage: python unicode_transliterate.py transliterate <infile> <outfile> <src_language> <tgt_language>")
        print("       python unicode_transliterate.py romanize <infile> <outfile> <language>")
        print("       python unicode_transliterate.py indicize <infile> <outfile> <language>")
        sys.exit(1)
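
    # Example invocation (illustrative; file names are placeholders):
    #   python unicode_transliterate.py transliterate input_hi.txt output_ta.txt hi ta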

    if sys.argv[1]=='transliterate':

        src_language=sys.argv[4]
        tgt_language=sys.argv[5]

        with open(sys.argv[2],'r', encoding='utf-8') as ifile:
            with open(sys.argv[3],'w', encoding='utf-8') as ofile:
                for line in ifile.readlines():
                    transliterated_line=UnicodeIndicTransliterator.transliterate(line,src_language,tgt_language)
                    ofile.write(transliterated_line)

    elif sys.argv[1]=='romanize':

        language=sys.argv[4]

        with open(sys.argv[2],'r', encoding='utf-8') as ifile:
            with open(sys.argv[3],'w', encoding='utf-8') as ofile:
                for line in ifile.readlines():
                    transliterated_line=ItransTransliterator.to_itrans(line,language)

                    # post-process the romanization: render 'ph' as 'f'
                    transliterated_line=transliterated_line.replace('ph','f')

                    ofile.write(transliterated_line)

    elif sys.argv[1]=='indicize':

        language=sys.argv[4]

        with open(sys.argv[2],'r', encoding='utf-8') as ifile:
            with open(sys.argv[3],'w', encoding='utf-8') as ofile:
                for line in ifile.readlines():
                    transliterated_line=ItransTransliterator.from_itrans(line,language)
                    ofile.write(transliterated_line)