import pandas as pd
import numpy as np

from indicnlp import common
from indicnlp.common import IndicNlpException

# Maps between the ARPABET name of a phoneme and its internal integer id
# (both populated by init() from the ARPABET list resource file)
ARPABET_ID_MAP={}
ID_ARPABET_MAP={}
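# Illustrative only: after init(), the two maps are inverses of each other,
# e.g. ARPABET_ID_MAP['AA'] -> 0 and ID_ARPABET_MAP[0] -> 'AA' (the exact ids
# depend on the order of entries in english_arpabet_list.csv).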

##
# Phonetic information about script characters
##

""" Phonetic data for English """
ENGLISH_PHONETIC_DATA=None

""" Phonetic vectors for English """
ENGLISH_PHONETIC_VECTORS=None

""" Length of the phonetic feature vector """
PHONETIC_VECTOR_LENGTH=38

""" Start offset for the phonetic feature vector in the phonetic data vector """
PHONETIC_VECTOR_START_OFFSET=6

## Phonetic properties, in the order in which they occur in the feature vector.
## This list must be kept in sync with the keys of PV_PROP_RANGES.
PV_PROP=['basic_type',
    'vowel_length',
    'vowel_strength',
    'vowel_status',
    'consonant_type',
    'articulation_place',
    'aspiration',
    'voicing',
    'nasalization',
    'vowel_horizontal',
    'vowel_vertical',
    'vowel_roundness',
    ]

## [start,end) index ranges of each property within the phonetic feature vector
PV_PROP_RANGES={
        'basic_type': [0,6],
        'vowel_length': [6,8],
        'vowel_strength': [8,11],
        'vowel_status': [11,13],
        'consonant_type': [13,18],
        'articulation_place': [18,23],
        'aspiration': [23,25],
        'voicing': [25,27],
        'nasalization': [27,29],
        'vowel_horizontal': [29,32],
        'vowel_vertical': [32,36],
        'vowel_roundness': [36,38],
        }
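
# The sketch below is illustrative and not part of the original module: it
# shows how PV_PROP_RANGES can be used to pull one named property out of a
# phonetic feature vector, e.g. one returned by get_phonetic_feature_vector().
def get_property_vector(v, prop_name):
    """Return the slice of feature vector `v` corresponding to `prop_name`."""
    start, end = PV_PROP_RANGES[prop_name]
    return v[start:end]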

##
# Indexes into the phonetic feature vector
##

# One-hot positions of the 'basic_type' property; [PVIDX_BT_S, PVIDX_BT_E) is
# the span of the basic type field within the feature vector
PVIDX_BT_VOWEL=0
PVIDX_BT_CONSONANT=1
PVIDX_BT_NUKTA=2
PVIDX_BT_HALANT=3
PVIDX_BT_ANUSVAAR=4
PVIDX_BT_MISC=5
PVIDX_BT_S=PVIDX_BT_VOWEL
PVIDX_BT_E=PVIDX_BT_MISC+1

# Index of the dependent-vowel flag (within the 'vowel_status' range)
PVIDX_VSTAT_DEP=12
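# Illustrative usage (not part of the original module): the basic-type indexes
# select one-hot positions in a feature vector, so a character encodes a vowel
# when v[PVIDX_BT_VOWEL]==1, e.g.:
#   v = get_phonetic_feature_vector(phoneme_to_enc('AA'), 'en')
#   is_vowel = (v[PVIDX_BT_VOWEL] == 1)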

# Internal codepoint range used to encode ARPABET phonemes as single characters
SCRIPT_RANGE_START=0x0D00
SCRIPT_RANGE_END=0x0D2E

def init():
    """
    To be called by the library loader; do not call it in your program
    """

    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    ENGLISH_PHONETIC_DATA=pd.read_csv(common.get_resources_path()+'/script/english_script_phonetic_data.csv',encoding='utf-8')

    ENGLISH_PHONETIC_VECTORS=ENGLISH_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values

    PHONETIC_VECTOR_LENGTH=ENGLISH_PHONETIC_VECTORS.shape[1]

    # Load the mapping between ARPABET phoneme names and internal ids
    global ARPABET_ID_MAP, ID_ARPABET_MAP

    with open(common.get_resources_path()+'/script/english_arpabet_list.csv','r',encoding='utf-8') as infile:
        for ph_id, name in enumerate(iter(infile)):
            name=name.strip()
            ARPABET_ID_MAP[name]=ph_id
            ID_ARPABET_MAP[ph_id]=name

# Conversions between ARPABET phoneme names, internal offsets and the
# characters used to encode phonemes in the internal script range

def phoneme_to_offset(ph):
    return ARPABET_ID_MAP[ph]

def offset_to_phoneme(ph_id):
    return ID_ARPABET_MAP[ph_id]

def phoneme_to_enc(ph):
    return chr(SCRIPT_RANGE_START+phoneme_to_offset(ph))

def enc_to_phoneme(ph):
    return offset_to_phoneme(enc_to_offset(ph))

def enc_to_offset(c):
    return ord(c)-SCRIPT_RANGE_START
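
# Illustrative round trip (assumes init() has run and 'AA' appears in the
# ARPABET list resource): enc_to_phoneme(phoneme_to_enc('AA')) == 'AA'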

def in_range(offset):
    # `offset` is relative to SCRIPT_RANGE_START (see enc_to_offset), so it is
    # checked against the size of the script range, not the raw codepoint bounds
    return offset>=0 and offset<(SCRIPT_RANGE_END-SCRIPT_RANGE_START)

def get_phonetic_info(lang):
    """ Return the phonetic data table and phonetic vectors (English only) """
    return (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS)

def invalid_vector():
    """ All-zero vector returned for characters without a valid representation """
    return np.array([0]*PHONETIC_VECTOR_LENGTH)

def get_phonetic_feature_vector(p,lang):
    """ Get the phonetic feature vector for the encoded phoneme character `p` """

    offset=enc_to_offset(p)

    if not in_range(offset):
        return invalid_vector()

    phonetic_data, phonetic_vectors= get_phonetic_info(lang)

    if phonetic_data.iloc[offset]['Valid Vector Representation']==0:
        return invalid_vector()

    return phonetic_vectors[offset]
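
if __name__ == '__main__':

    # Minimal usage sketch (illustrative, not part of the original module).
    # Assumes the path to the Indic NLP resources directory is given as the
    # first command-line argument and that common.set_resources_path() behaves
    # as in the standard indicnlp library; normally the library loader calls init().
    import sys

    common.set_resources_path(sys.argv[1])
    init()

    # Encode an ARPABET phoneme into the internal script range and look up its
    # phonetic feature vector ('AA' is assumed to appear in the ARPABET list)
    c = phoneme_to_enc('AA')
    print(get_phonetic_feature_vector(c, 'en'))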