sakharamg
/

NMTKD

Model card Files Files and versions Community

NMTKD / translation /tools /indic_nlp_library /indicnlp /langinfo.py

sakharamg

Uploading all files

158b61b about 2 years ago

raw

history blame contribute delete

11.1 kB

	#
	# Copyright (c) 2013-present, Anoop Kunchukuttan
	# All rights reserved.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.
	#

	## language codes
	# Language-code constant; the same string is a key of SCRIPT_RANGES below.
	LC_TA='ta'

	# Maps a language code to the inclusive [first, last] code points of the
	# Unicode block used for that language. Element 0 is the base that
	# get_offset()/offset_to_char() subtract/add to convert between characters
	# and script-relative offsets.
	SCRIPT_RANGES={
	'pa':[0x0a00,0x0a7f] ,  # Gurmukhi block
	'gu':[0x0a80,0x0aff] ,  # Gujarati block
	'or':[0x0b00,0x0b7f] ,  # Oriya block
	'ta':[0x0b80,0x0bff] ,  # Tamil block
	'te':[0x0c00,0x0c7f] ,  # Telugu block
	'kn':[0x0c80,0x0cff] ,  # Kannada block
	'ml':[0x0d00,0x0d7f] ,  # Malayalam block
	'si':[0x0d80,0x0dff] ,  # Sinhala block
	'hi':[0x0900,0x097f] ,  # Devanagari block (shared by the next five keys)
	'mr':[0x0900,0x097f] ,
	# NOTE(review): mixed-case key 'kK' (also used in IE_LANGUAGES) --
	# presumably Konkani; confirm with callers before normalising it.
	'kK':[0x0900,0x097f] ,
	'sa':[0x0900,0x097f] ,
	'ne':[0x0900,0x097f] ,
	'sd':[0x0900,0x097f] ,
	'bn':[0x0980,0x09ff] ,  # Bengali block (shared with 'as')
	'as':[0x0980,0x09ff] ,
	}

	# The keys of SCRIPT_RANGES split into two disjoint groups.
	# NOTE(review): "IE" presumably stands for Indo-European -- confirm.
	DRAVIDIAN_LANGUAGES=['ta', 'te', 'kn', 'ml',]
	IE_LANGUAGES=['hi', 'mr', 'kK', 'sa', 'ne', 'sd', 'bn', 'as', 'pa', 'gu', 'or', 'si', ]

	# Inclusive [first, last] code-point ranges (Arabic, Arabic Supplement and
	# Arabic Presentation Forms A/B blocks). Not referenced by any function
	# visible in this file.
	URDU_RANGES=[
	[0x0600,0x06ff],
	[0x0750,0x077f],
	[0xfb50,0xfdff],
	[0xfe70,0xfeff],
	]

	# Inclusive bounds of the script-relative offsets accepted by
	# in_coordinated_range().
	COORDINATED_RANGE_START_INCLUSIVE=0
	COORDINATED_RANGE_END_INCLUSIVE=0x6f

	# Inclusive offset bounds for digits; the same values appear as literals in
	# is_number() and is_number_offset().
	NUMERIC_OFFSET_START=0x66
	NUMERIC_OFFSET_END=0x6f

	# Script-relative offsets compared against by is_halanta()/is_aum()/
	# is_nukta() and their *_offset counterparts.
	HALANTA_OFFSET=0x4d
	AUM_OFFSET=0x50
	NUKTA_OFFSET=0x3c

	# Absolute code point (not a script-relative offset).
	RUPEE_SIGN=0x20b9

	# Absolute code points; is_indiclang_char() accepts these two characters
	# regardless of the language passed in.
	DANDA=0x0964
	DOUBLE_DANDA=0x0965

	#TODO: add missing fricatives and approximants
	# Each *_RANGE is an inclusive [first, last] pair of script-relative
	# offsets, tested by the matching is_<place>() and is_<place>_offset()
	# predicates. Together the five ranges cover offsets 0x15..0x2e.
	VELAR_RANGE=[0x15,0x19]
	PALATAL_RANGE=[0x1a,0x1e]
	RETROFLEX_RANGE=[0x1f,0x23]
	DENTAL_RANGE=[0x24,0x29]
	LABIAL_RANGE=[0x2a,0x2e]

	# verify
	# Script-relative consonant offsets grouped by phonetic feature. Each list is
	# consulted by the matching is_<feature>() / is_<feature>_offset() predicate.
	VOICED_LIST=[0x17,0x18,0x1c,0x1d,0x21,0x22,0x26,0x27,0x2c,0x2d]
	UNVOICED_LIST=[0x15,0x16,0x1a,0x1b,0x1f,0x20,0x24,0x25,0x2a,0x2b] #TODO: add sibilants/sonorants
	ASPIRATED_LIST=[0x16,0x18,0x1b,0x1d,0x20,0x22,0x25,0x27,0x2b,0x2d]
	UNASPIRATED_LIST=[0x15,0x17,0x1a,0x1c,0x1f,0x21,0x24,0x26,0x2a,0x2c]
	# The labial nasal is offset 0x2e (letter MA, e.g. U+092E), the last slot of
	# LABIAL_RANGE. Offset 0x2d is BHA, a voiced aspirated stop that belongs to
	# VOICED_LIST and ASPIRATED_LIST above, so it must not be listed as a nasal.
	NASAL_LIST=[0x19,0x1e,0x23,0x28,0x29,0x2e]
	FRICATIVE_LIST=[0x36,0x37,0x38]
	APPROXIMANT_LIST=[0x2f,0x30,0x31,0x32,0x33,0x34,0x35]

	#TODO: ha has to be properly categorized

	def get_offset(c,lang):
	    """
	    Return the position of character `c` relative to the first code point
	    of the range registered for `lang` in SCRIPT_RANGES.
	    Applicable to Brahmi derived Indic scripts. The result is negative when
	    `c` sorts before the start of the range; an unknown `lang` raises
	    KeyError.
	    """
	    code_point=ord(c)
	    range_start=SCRIPT_RANGES[lang][0]
	    return code_point-range_start

	def offset_to_char(c,lang):
	    """
	    Inverse of get_offset(): turn the script-relative offset `c` back into
	    the character of `lang`'s range (looked up in SCRIPT_RANGES).
	    Applicable to Brahmi derived Indic scripts.
	    """
	    range_start=SCRIPT_RANGES[lang][0]
	    code_point=c+range_start
	    return chr(code_point)

	def in_coordinated_range(c_offset):
	    """
	    Tell whether `c_offset` lies between COORDINATED_RANGE_START_INCLUSIVE
	    and COORDINATED_RANGE_END_INCLUSIVE, both ends included.
	    Applicable to Brahmi derived Indic scripts.
	    """
	    return COORDINATED_RANGE_START_INCLUSIVE<=c_offset<=COORDINATED_RANGE_END_INCLUSIVE

	def is_indiclang_char(c,lang):
	    """
	    Tell whether `c` belongs to the script of `lang`: either its offset
	    from the start of the language's range is within 0..0x7f, or it is
	    the DANDA / DOUBLE_DANDA character.
	    Applicable to Brahmi derived Indic scripts.
	    """
	    offset=get_offset(c,lang)
	    if 0<=offset<=0x7f:
	        return True
	    # The two danda code points are accepted whatever the language.
	    if ord(c)==DANDA:
	        return True
	    return ord(c)==DOUBLE_DANDA

	# def is_vowel(c,lang):
	# """
	# Is the character a vowel
	# """
	# o=get_offset(c,lang)
	# return (o>=0x04 and o<=0x14)

	# def is_vowel_sign(c,lang):
	# """
	# Is the character a vowel sign (maatraa)
	# """
	# o=get_offset(c,lang)
	# return (o>=0x3e and o<=0x4c)

	# def is_halanta(c,lang):
	# """
	# Is the character the halanta character
	# """
	# o=get_offset(c,lang)
	# return (o==HALANTA_OFFSET)

	# def is_nukta(c,lang):
	# """
	# Is the character the halanta character
	# """
	# o=get_offset(c,lang)
	# return (o==NUKTA_OFFSET)

	# def is_aum(c,lang):
	# """
	# Is the character a vowel sign (maatraa)
	# """
	# o=get_offset(c,lang)
	# return (o==AUM_OFFSET)

	# def is_consonant(c,lang):
	# """
	# Is the character a consonant
	# """
	# o=get_offset(c,lang)
	# return (o>=0x15 and o<=0x39)

	# def is_velar(c,lang):
	# """
	# Is the character a velar
	# """
	# o=get_offset(c,lang)
	# return (o>=VELAR_RANGE[0] and o<=VELAR_RANGE[1])

	# def is_palatal(c,lang):
	# """
	# Is the character a palatal
	# """
	# o=get_offset(c,lang)
	# return (o>=PALATAL_RANGE[0] and o<=PALATAL_RANGE[1])

	# def is_retroflex(c,lang):
	# """
	# Is the character a retroflex
	# """
	# o=get_offset(c,lang)
	# return (o>=RETROFLEX_RANGE[0] and o<=RETROFLEX_RANGE[1])

	# def is_dental(c,lang):
	# """
	# Is the character a dental
	# """
	# o=get_offset(c,lang)
	# return (o>=DENTAL_RANGE[0] and o<=DENTAL_RANGE[1])

	# def is_labial(c,lang):
	# """
	# Is the character a labial
	# """
	# o=get_offset(c,lang)
	# return (o>=LABIAL_RANGE[0] and o<=LABIAL_RANGE[1])

	# def is_voiced(c,lang):
	# """
	# Is the character a voiced consonant
	# """
	# o=get_offset(c,lang)
	# return o in VOICED_LIST

	# def is_unvoiced(c,lang):
	# """
	# Is the character a unvoiced consonant
	# """
	# o=get_offset(c,lang)
	# return o in UNVOICED_LIST

	# def is_aspirated(c,lang):
	# """
	# Is the character a aspirated consonant
	# """
	# o=get_offset(c,lang)
	# return o in ASPIRATED_LIST

	# def is_unaspirated(c,lang):
	# """
	# Is the character a unaspirated consonant
	# """
	# o=get_offset(c,lang)
	# return o in UNASPIRATED_LIST

	# def is_nasal(c,lang):
	# """
	# Is the character a nasal consonant
	# """
	# o=get_offset(c,lang)
	# return o in NASAL_LIST

	# def is_fricative(c,lang):
	# """
	# Is the character a fricative consonant
	# """
	# o=get_offset(c,lang)
	# return o in FRICATIVE_LIST

	# def is_approximant(c,lang):
	# """
	# Is the character an approximant consonant
	# """
	# o=get_offset(c,lang)
	# return o in APPROXIMANT_LIST

	# def is_number(c,lang):
	# """
	# Is the character a number
	# """
	# o=get_offset(c,lang)
	# return (o>=0x66 and o<=0x6f)


	def is_vowel(c,lang):
	    """
	    Tell whether `c` is a vowel of `lang`'s script, i.e. whether its
	    script-relative offset is within 0x04..0x14 (inclusive).
	    """
	    offset=get_offset(c,lang)
	    return 0x04<=offset<=0x14

	def is_vowel_sign(c,lang):
	    """
	    Tell whether `c` is a vowel sign (maatraa) of `lang`'s script, i.e.
	    whether its script-relative offset is within 0x3e..0x4c (inclusive).
	    """
	    offset=get_offset(c,lang)
	    return 0x3e<=offset<=0x4c

	def is_halanta(c,lang):
	    """
	    Tell whether `c` is the halanta of `lang`'s script (its
	    script-relative offset equals HALANTA_OFFSET).
	    """
	    return get_offset(c,lang)==HALANTA_OFFSET

	def is_nukta(c,lang):
	    """
	    Tell whether `c` is the nukta of `lang`'s script (its
	    script-relative offset equals NUKTA_OFFSET).
	    """
	    return get_offset(c,lang)==NUKTA_OFFSET

	def is_aum(c,lang):
	    """
	    Tell whether `c` is the aum character of `lang`'s script (its
	    script-relative offset equals AUM_OFFSET).
	    """
	    return get_offset(c,lang)==AUM_OFFSET

	def is_consonant(c,lang):
	    """
	    Tell whether `c` is a consonant of `lang`'s script, i.e. whether its
	    script-relative offset is within 0x15..0x39 (inclusive).
	    """
	    offset=get_offset(c,lang)
	    return 0x15<=offset<=0x39

	def is_velar(c,lang):
	    """
	    Tell whether `c` is a velar: its script-relative offset falls inside
	    VELAR_RANGE (both ends included).
	    """
	    offset=get_offset(c,lang)
	    return VELAR_RANGE[0]<=offset<=VELAR_RANGE[1]

	def is_palatal(c,lang):
	    """
	    Tell whether `c` is a palatal: its script-relative offset falls inside
	    PALATAL_RANGE (both ends included).
	    """
	    offset=get_offset(c,lang)
	    return PALATAL_RANGE[0]<=offset<=PALATAL_RANGE[1]

	def is_retroflex(c,lang):
	    """
	    Tell whether `c` is a retroflex: its script-relative offset falls
	    inside RETROFLEX_RANGE (both ends included).
	    """
	    offset=get_offset(c,lang)
	    return RETROFLEX_RANGE[0]<=offset<=RETROFLEX_RANGE[1]

	def is_dental(c,lang):
	    """
	    Tell whether `c` is a dental: its script-relative offset falls inside
	    DENTAL_RANGE (both ends included).
	    """
	    offset=get_offset(c,lang)
	    return DENTAL_RANGE[0]<=offset<=DENTAL_RANGE[1]

	def is_labial(c,lang):
	    """
	    Tell whether `c` is a labial: its script-relative offset falls inside
	    LABIAL_RANGE (both ends included).
	    """
	    offset=get_offset(c,lang)
	    return LABIAL_RANGE[0]<=offset<=LABIAL_RANGE[1]

	def is_voiced(c,lang):
	    """
	    Tell whether `c` is a voiced consonant: its script-relative offset is
	    listed in VOICED_LIST.
	    """
	    return get_offset(c,lang) in VOICED_LIST

	def is_unvoiced(c,lang):
	    """
	    Tell whether `c` is an unvoiced consonant: its script-relative offset
	    is listed in UNVOICED_LIST.
	    """
	    return get_offset(c,lang) in UNVOICED_LIST

	def is_aspirated(c,lang):
	    """
	    Tell whether `c` is an aspirated consonant: its script-relative offset
	    is listed in ASPIRATED_LIST.
	    """
	    return get_offset(c,lang) in ASPIRATED_LIST

	def is_unaspirated(c,lang):
	    """
	    Tell whether `c` is an unaspirated consonant: its script-relative
	    offset is listed in UNASPIRATED_LIST.
	    """
	    return get_offset(c,lang) in UNASPIRATED_LIST

	def is_nasal(c,lang):
	    """
	    Tell whether `c` is a nasal consonant: its script-relative offset is
	    listed in NASAL_LIST.
	    """
	    return get_offset(c,lang) in NASAL_LIST

	def is_fricative(c,lang):
	    """
	    Tell whether `c` is a fricative consonant: its script-relative offset
	    is listed in FRICATIVE_LIST.
	    """
	    return get_offset(c,lang) in FRICATIVE_LIST

	def is_approximant(c,lang):
	    """
	    Tell whether `c` is an approximant consonant: its script-relative
	    offset is listed in APPROXIMANT_LIST.
	    """
	    return get_offset(c,lang) in APPROXIMANT_LIST

	def is_number(c,lang):
	    """
	    Tell whether `c` is a digit of `lang`'s script, i.e. whether its
	    script-relative offset is within 0x66..0x6f (inclusive).
	    """
	    offset=get_offset(c,lang)
	    return 0x66<=offset<=0x6f


	##################################################

	def is_vowel_offset(c_offset):
	    """
	    Tell whether the script-relative offset denotes a vowel, i.e. lies
	    within 0x04..0x14 (inclusive).
	    """
	    return 0x04<=c_offset<=0x14

	def is_vowel_sign_offset(c_offset):
	    """
	    Tell whether the script-relative offset denotes a vowel sign
	    (maatraa), i.e. lies within 0x3e..0x4c (inclusive).
	    """
	    return 0x3e<=c_offset<=0x4c

	def is_halanta_offset(c_offset):
	    """
	    Tell whether the script-relative offset is the halanta offset
	    (HALANTA_OFFSET).
	    """
	    matches=c_offset==HALANTA_OFFSET
	    return matches

	def is_nukta_offset(c_offset):
	    """
	    Tell whether the script-relative offset is the nukta offset
	    (NUKTA_OFFSET).
	    """
	    matches=c_offset==NUKTA_OFFSET
	    return matches

	def is_aum_offset(c_offset):
	    """
	    Tell whether the script-relative offset is the aum offset
	    (AUM_OFFSET).
	    """
	    matches=c_offset==AUM_OFFSET
	    return matches

	def is_consonant_offset(c_offset):
	    """
	    Tell whether the script-relative offset denotes a consonant, i.e.
	    lies within 0x15..0x39 (inclusive).
	    """
	    return 0x15<=c_offset<=0x39

	def is_velar_offset(c_offset):
	    """
	    Tell whether the script-relative offset falls inside VELAR_RANGE
	    (both ends included).
	    """
	    return VELAR_RANGE[0]<=c_offset<=VELAR_RANGE[1]

	def is_palatal_offset(c_offset):
	    """
	    Tell whether the script-relative offset falls inside PALATAL_RANGE
	    (both ends included).
	    """
	    return PALATAL_RANGE[0]<=c_offset<=PALATAL_RANGE[1]

	def is_retroflex_offset(c_offset):
	    """
	    Tell whether the script-relative offset falls inside RETROFLEX_RANGE
	    (both ends included).
	    """
	    return RETROFLEX_RANGE[0]<=c_offset<=RETROFLEX_RANGE[1]

	def is_dental_offset(c_offset):
	    """
	    Tell whether the script-relative offset falls inside DENTAL_RANGE
	    (both ends included).
	    """
	    return DENTAL_RANGE[0]<=c_offset<=DENTAL_RANGE[1]

	def is_labial_offset(c_offset):
	    """
	    Tell whether the script-relative offset falls inside LABIAL_RANGE
	    (both ends included).
	    """
	    return LABIAL_RANGE[0]<=c_offset<=LABIAL_RANGE[1]

	def is_voiced_offset(c_offset):
	    """
	    Tell whether the script-relative offset is listed in VOICED_LIST
	    (voiced consonants).
	    """
	    listed=c_offset in VOICED_LIST
	    return listed

	def is_unvoiced_offset(c_offset):
	    """
	    Tell whether the script-relative offset is listed in UNVOICED_LIST
	    (unvoiced consonants).
	    """
	    listed=c_offset in UNVOICED_LIST
	    return listed

	def is_aspirated_offset(c_offset):
	    """
	    Tell whether the script-relative offset is listed in ASPIRATED_LIST
	    (aspirated consonants).
	    """
	    listed=c_offset in ASPIRATED_LIST
	    return listed

	def is_unaspirated_offset(c_offset):
	    """
	    Tell whether the script-relative offset is listed in UNASPIRATED_LIST
	    (unaspirated consonants).
	    """
	    listed=c_offset in UNASPIRATED_LIST
	    return listed

	def is_nasal_offset(c_offset):
	    """
	    Tell whether the script-relative offset is listed in NASAL_LIST
	    (nasal consonants).
	    """
	    listed=c_offset in NASAL_LIST
	    return listed

	def is_fricative_offset(c_offset):
	    """
	    Tell whether the script-relative offset is listed in FRICATIVE_LIST
	    (fricative consonants).
	    """
	    listed=c_offset in FRICATIVE_LIST
	    return listed

	def is_approximant_offset(c_offset):
	    """
	    Tell whether the script-relative offset is listed in
	    APPROXIMANT_LIST (approximant consonants).
	    """
	    listed=c_offset in APPROXIMANT_LIST
	    return listed

	def is_number_offset(c_offset):
	    """
	    Tell whether the script-relative offset denotes a digit, i.e. lies
	    within 0x66..0x6f (inclusive).
	    """
	    return 0x66<=c_offset<=0x6f