|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd |
|
import numpy as np |
|
import os |
|
|
|
from indicnlp import common |
|
from indicnlp.common import IndicNlpException |
|
from indicnlp import langinfo as li |
|
|
|
|
|
|
|
|
|
|
|
""" Phonetic data about all languages except Tamil """ |
|
ALL_PHONETIC_DATA=None |
|
|
|
""" Phonetic data for Tamil """ |
|
TAMIL_PHONETIC_DATA=None |
|
|
|
""" Phonetic vector for all languages except Tamil """ |
|
ALL_PHONETIC_VECTORS=None |
|
|
|
""" Phonetic vector for Tamil """ |
|
TAMIL_PHONETIC_VECTORS=None |
|
|
|
""" Length of phonetic vector """ |
|
PHONETIC_VECTOR_LENGTH=38 |
|
|
|
""" Start offset for the phonetic feature vector in the phonetic data vector """ |
|
PHONETIC_VECTOR_START_OFFSET=6 |
|
|
|
|
|
|
|
PV_PROP=['basic_type', |
|
'vowel_length', |
|
'vowel_strength', |
|
'vowel_status', |
|
'consonant_type', |
|
'articulation_place', |
|
'aspiration', |
|
'voicing', |
|
'nasalization', |
|
'vowel_horizontal', |
|
'vowel_vertical', |
|
'vowel_roundness', |
|
] |
|
|
|
|
|
|
|
|
|
|
|
PV_PROP_RANGES={ |
|
'basic_type': [0,6], |
|
'vowel_length': [6,8], |
|
'vowel_strength': [8,11], |
|
'vowel_status': [11,13], |
|
'consonant_type': [13,18], |
|
'articulation_place': [18,23], |
|
'aspiration': [23,25], |
|
'voicing': [25,27], |
|
'nasalization': [27,29], |
|
'vowel_horizontal': [29,32], |
|
'vowel_vertical': [32,36], |
|
'vowel_roundness': [36,38], |
|
} |
|
|
|
|
|
|
|
|
|
|
|
PVIDX_BT_VOWEL=0 |
|
PVIDX_BT_CONSONANT=1 |
|
PVIDX_BT_NUKTA=2 |
|
PVIDX_BT_HALANT=3 |
|
PVIDX_BT_ANUSVAAR=4 |
|
PVIDX_BT_MISC=5 |
|
PVIDX_BT_S=PVIDX_BT_VOWEL |
|
PVIDX_BT_E=PVIDX_BT_MISC+1 |
|
|
|
PVIDX_VSTAT_DEP=12 |
|
|
|
|
|
|
|
|
|
|
|
SCRIPT_OFFSET_START=0 |
|
SCRIPT_OFFSET_RANGE=0x80 |
|
|
|
def init():
    """
    To be called by library loader, do not call it in your program
    """

    global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS, TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    # All phonetic CSV tables live under the 'script' resources directory.
    script_dir = os.path.join(common.get_resources_path(), 'script')

    ALL_PHONETIC_DATA = pd.read_csv(
        os.path.join(script_dir, 'all_script_phonetic_data.csv'), encoding='utf-8')
    TAMIL_PHONETIC_DATA = pd.read_csv(
        os.path.join(script_dir, 'tamil_script_phonetic_data.csv'), encoding='utf-8')

    # Strip the leading metadata columns; keep only the feature bits.
    ALL_PHONETIC_VECTORS = ALL_PHONETIC_DATA.iloc[:, PHONETIC_VECTOR_START_OFFSET:].values
    TAMIL_PHONETIC_VECTORS = TAMIL_PHONETIC_DATA.iloc[:, PHONETIC_VECTOR_START_OFFSET:].values

    # Recompute the vector length from the data actually loaded.
    PHONETIC_VECTOR_LENGTH = ALL_PHONETIC_VECTORS.shape[1]
|
|
|
def is_supported_language(lang):
    """Return True if `lang` has a script range defined in langinfo."""
    # Test membership on the dict directly instead of materializing the
    # key list (`lang in list(d.keys())`) — O(1) instead of O(n).
    return lang in li.SCRIPT_RANGES
|
|
|
def get_offset(c, lang):
    """
    Return the offset of character `c` relative to the start of
    `lang`'s Unicode script block.

    Raises IndicNlpException for unsupported languages.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))
    block_start = li.SCRIPT_RANGES[lang][0]
    return ord(c) - block_start
|
|
|
def offset_to_char(off, lang):
    """
    Applicable to Brahmi derived Indic scripts.

    Return the character at offset `off` within `lang`'s Unicode script block.
    Raises IndicNlpException for unsupported languages.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))
    block_start = li.SCRIPT_RANGES[lang][0]
    return chr(block_start + off)
|
|
|
def is_indiclang_char(c, lang):
    """
    Applicable to Brahmi derived Indic scripts.
    Note that DANDA and DOUBLE_DANDA have the same Unicode codepoint for all Indic scripts.

    Return True if `c` falls inside `lang`'s script block, or is a danda.
    Raises IndicNlpException for unsupported languages.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))

    offset = get_offset(c, lang)
    if SCRIPT_OFFSET_START <= offset < SCRIPT_OFFSET_RANGE:
        return True

    # Dandas sit outside the per-script blocks but still count as in-script.
    codepoint = ord(c)
    return codepoint == li.DANDA or codepoint == li.DOUBLE_DANDA
|
|
|
def in_coordinated_range_offset(c_offset):
    """
    Applicable to Brahmi derived Indic scripts.

    Return True if the script-relative offset lies in the coordinated range.
    """
    return li.COORDINATED_RANGE_START_INCLUSIVE <= c_offset <= li.COORDINATED_RANGE_END_INCLUSIVE
|
|
|
def in_coordinated_range(c, lang):
    """
    Return True if character `c` of language `lang` lies in the coordinated
    offset range. Raises IndicNlpException for unsupported languages.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))
    offset = get_offset(c, lang)
    return in_coordinated_range_offset(offset)
|
|
|
def get_phonetic_info(lang):
    """
    Return the (phonetic_data, phonetic_vectors) pair for `lang`.

    Tamil has its own tables; every other supported language shares one set.
    Raises IndicNlpException for unsupported languages.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))

    if lang == li.LC_TA:
        return (TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS)
    return (ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS)
|
|
|
def invalid_vector():
    """Return the all-zero phonetic vector used for unrepresentable characters."""
    return np.zeros(PHONETIC_VECTOR_LENGTH, dtype=int)
|
|
|
def get_phonetic_feature_vector(c, lang):
    """
    Return the phonetic feature vector for character `c` of language `lang`.

    Returns the all-zero invalid vector when the character falls outside the
    coordinated range or has no valid vector representation in the data table.
    """
    # The original duplicated the body of get_phonetic_feature_vector_offset
    # verbatim; delegate to it instead so the lookup logic lives in one place.
    return get_phonetic_feature_vector_offset(get_offset(c, lang), lang)
|
|
|
def get_phonetic_feature_vector_offset(offset, lang):
    """
    Return the phonetic feature vector for the character at script-relative
    `offset` in language `lang`.

    Returns the all-zero invalid vector when the offset falls outside the
    coordinated range or the row is not marked as a valid representation.
    """
    if not in_coordinated_range_offset(offset):
        return invalid_vector()

    phonetic_data, phonetic_vectors = get_phonetic_info(lang)

    row = phonetic_data.iloc[offset]
    if row['Valid Vector Representation'] == 0:
        return invalid_vector()

    return phonetic_vectors[offset]
|
|
|
|
|
def is_valid(v):
    """A phonetic vector is valid iff at least one feature bit is set."""
    return np.sum(v) > 0
|
|
|
def is_vowel(v):
    """True iff the basic-type slot of `v` marks a vowel."""
    return v[PVIDX_BT_VOWEL] == 1
|
|
|
def is_consonant(v):
    """True iff the basic-type slot of `v` marks a consonant."""
    return v[PVIDX_BT_CONSONANT] == 1
|
|
|
def is_halant(v):
    """True iff the basic-type slot of `v` marks a halant."""
    return v[PVIDX_BT_HALANT] == 1
|
|
|
def is_nukta(v):
    """True iff the basic-type slot of `v` marks a nukta."""
    return v[PVIDX_BT_NUKTA] == 1
|
|
|
def is_anusvaar(v):
    """True iff the basic-type slot of `v` marks an anusvaar."""
    return v[PVIDX_BT_ANUSVAAR] == 1
|
|
|
def is_misc(v):
    """True iff the basic-type slot of `v` marks a miscellaneous character."""
    return v[PVIDX_BT_MISC] == 1
|
|
|
def is_dependent_vowel(v):
    """True iff `v` is a vowel with the dependent-status bit set."""
    return is_vowel(v) and v[PVIDX_VSTAT_DEP] == 1
|
|
|
def is_plosive(v):
    """True iff `v` is a consonant whose first consonant-type bit is set."""
    return is_consonant(v) and get_property_vector(v, 'consonant_type')[0] == 1
|
|
|
|
|
|
|
def or_vectors(v1, v2):
    """
    Element-wise logical OR of two binary feature vectors.

    Returns an int array with 1 wherever either input has a set bit.
    """
    # Vectorized replacement for the original per-element Python loop.
    return np.where((v1 + v2) >= 1, 1, 0)
|
|
|
def xor_vectors(v1, v2):
    """
    Element-wise XOR of two binary feature vectors.

    Returns an int array with 1 wherever the inputs differ.
    """
    # Vectorized replacement for the original per-element Python loop.
    return np.where(v1 != v2, 1, 0)
|
|
|
|
|
|
|
def get_property_vector(v, prop_name):
    """Slice out the sub-vector of `v` holding the bits of property `prop_name`."""
    start, end = PV_PROP_RANGES[prop_name]
    return v[start:end]
|
|
|
def get_property_value(v, prop_name):
    """
    Interpret the bits of property `prop_name` in vector `v` as an unsigned
    binary number (first bit is the most significant) and return it as an int.
    """
    factor_bits = get_property_vector(v, prop_name).tolist()

    # Fold the bits MSB-first with integer arithmetic. The original used a
    # float multiplier (c = c * 2.0) and converted back with int(), which is
    # needless and would lose precision for very wide bit-fields.
    value = 0
    for b in factor_bits:
        value = (value << 1) | int(b)

    return value
|
|
|
def lcsr_indic(srcw, tgtw, slang, tlang):
    """
    Compute the Longest Common Subsequence Ratio (LCSR) between two strings at
    the character level. This works for Indic scripts by comparing characters
    of the two languages via their common script-relative offsets.

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language

    Returns a tuple (lcsr, source length, target length); lengths as floats.
    """
    # Guard: the original raised ZeroDivisionError for two empty strings.
    # Two empty (hence identical) strings have an LCSR of 1.0.
    if not srcw and not tgtw:
        return (1.0, 0.0, 0.0)

    score_mat = np.zeros((len(srcw) + 1, len(tgtw) + 1))

    for si, sc in enumerate(srcw, 1):
        # Source-side lookups are invariant in the inner loop; hoist them.
        so = get_offset(sc, slang)
        s_in_range = in_coordinated_range_offset(so)

        for ti, tc in enumerate(tgtw, 1):
            to = get_offset(tc, tlang)
            t_in_range = in_coordinated_range_offset(to)

            # Characters match when both offsets are in the coordinated range
            # and equal, or when neither is and the raw characters are equal.
            if (s_in_range and t_in_range and so == to) or \
               (not s_in_range and not t_in_range and sc == tc):
                score_mat[si, ti] = score_mat[si - 1, ti - 1] + 1.0
            else:
                score_mat[si, ti] = max(score_mat[si, ti - 1],
                                        score_mat[si - 1, ti])

    return (score_mat[-1, -1] / float(max(len(srcw), len(tgtw))),
            float(len(srcw)), float(len(tgtw)))
|
|
|
def lcsr_any(srcw, tgtw):
    """
    LCSR computation when both languages have the same script: characters are
    compared directly.

    Returns a tuple (lcsr, source length, target length); lengths as floats.
    """
    # Guard: the original raised ZeroDivisionError for two empty strings.
    # Two empty (hence identical) strings have an LCSR of 1.0.
    if not srcw and not tgtw:
        return (1.0, 0.0, 0.0)

    score_mat = np.zeros((len(srcw) + 1, len(tgtw) + 1))

    # Standard LCS dynamic program over the two strings.
    for si, sc in enumerate(srcw, 1):
        for ti, tc in enumerate(tgtw, 1):
            if sc == tc:
                score_mat[si, ti] = score_mat[si - 1, ti - 1] + 1.0
            else:
                score_mat[si, ti] = max(score_mat[si, ti - 1],
                                        score_mat[si - 1, ti])

    return (score_mat[-1, -1] / float(max(len(srcw), len(tgtw))),
            float(len(srcw)), float(len(tgtw)))
|
|
|
def lcsr(srcw, tgtw, slang, tlang):
    """
    Compute the Longest Common Subsequence Ratio (LCSR) between two strings at
    the character level, dispatching to the script-aware comparison when the
    two languages differ and both are supported.

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language

    Returns a tuple (lcsr, source length, target length); lengths as floats.
    """
    # Bug fix: the original swapped the argument lists — it called
    # lcsr_any(srcw, tgtw, slang, tlang) (which takes only 2 arguments) and
    # lcsr_indic(srcw, tgtw) (which needs 4), so both branches raised
    # TypeError.
    if slang == tlang or not is_supported_language(slang) or not is_supported_language(tlang):
        return lcsr_any(srcw, tgtw)
    else:
        return lcsr_indic(srcw, tgtw, slang, tlang)
|
|
|
|
|
|
|
|