#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
import codecs
import os
import re
import sys
import morfessor
from functools import lru_cache
from indicnlp import langinfo
from indicnlp import common
from indicnlp.tokenize import indic_tokenize
# Unsupervised Morphological Analyser for Indian languages.
#
# @author Anoop Kunchukuttan
#
class MorphAnalyzerI(object):
    """
    Interface for a morphological analyzer.
    """

    def morph_analyze(self, word):
        """Analyze a single word into its component morphemes."""
        raise NotImplementedError

    def morph_analyze_document(self, tokens):
        """Analyze a sequence of tokens into component morphemes."""
        raise NotImplementedError
class UnsupervisedMorphAnalyzer(MorphAnalyzerI):
    """
    Unsupervised morphological analyser built using Morfessor 2.0
    """
    def __init__(self, lang, add_marker=False):
        self.lang = lang
        self.add_marker = add_marker
        # Load the pre-trained Morfessor model for this language.
        io = morfessor.MorfessorIO()
        self._morfessor_model = io.read_any_model(
            os.path.join(common.INDIC_RESOURCES_PATH, 'morph', 'morfessor',
                         '{}.model'.format(lang)))
        # Regex matching words written entirely in the language's script.
        self._script_range_pat = r'^[{}-{}]+$'.format(
            chr(langinfo.SCRIPT_RANGES[lang][0]),
            chr(langinfo.SCRIPT_RANGES[lang][1]))
        self._script_check_re = re.compile(self._script_range_pat)
    def _contains_number(self, text):
        if self.lang in langinfo.SCRIPT_RANGES:
            for c in text:
                offset = ord(c) - langinfo.SCRIPT_RANGES[self.lang][0]
                if langinfo.NUMERIC_OFFSET_START <= offset <= langinfo.NUMERIC_OFFSET_END:
                    return True
        return False
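    # Digit detection above relies on the fact that in each Indic script
    # block the digits occupy offsets 0x66-0x6F from the block start (e.g.
    # Devanagari '०' is U+0966, offset 0x66 from U+0900); these bounds are
    # what NUMERIC_OFFSET_START/NUMERIC_OFFSET_END encode.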
    def _morphanalysis_needed(self, word):
        # Analyze only words written entirely in the target script and
        # containing no digits.
        return self._script_check_re.match(word) and not self._contains_number(word)
    @lru_cache(maxsize=16384)
    def morph_analyze(self, word):
        """
        Morph-analyzes a single word.

        @param word: input word (string)
        @return list of component morphemes
        """
        m_list = []
        if self._morphanalysis_needed(word):
            val = self._morfessor_model.viterbi_segment(word)
            m_list = val[0]
            if self.add_marker:
                # Mark the first morpheme as the root (_R_) and the
                # remaining morphemes as suffixes (_S_).
                m_list = ['{}_S_'.format(m) if i > 0 else '{}_R_'.format(m)
                          for i, m in enumerate(m_list)]
        else:
            if self.add_marker:
                # Words outside the script (or containing digits) are left
                # unsegmented and marked _E_.
                word = '{}_E_'.format(word)
            m_list = [word]
        return m_list
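    # Illustrative marker behavior (the segmentation itself depends on the
    # trained model): if a word is split into morphemes m1, m2, m3, then
    # add_marker=True yields ['m1_R_', 'm2_S_', 'm3_S_'].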
    def morph_analyze_document(self, tokens):
        """
        Morph-analyzes a document represented as a sequence of tokens.

        Each token is analyzed independently; the result is the
        concatenated list of morphemes constituting the document.

        @param tokens: sequence of word tokens
        @return list of morpheme segments for the document
        """
        out_tokens = []
        for token in tokens:
            morphs = self.morph_analyze(token)
            out_tokens.extend(morphs)
        return out_tokens
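    # For example, if morph_analyze maps 'w1' -> ['m1', 'm2'] and
    # 'w2' -> ['m3'] (hypothetical segmentations), then
    # morph_analyze_document(['w1', 'w2']) returns ['m1', 'm2', 'm3'].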
if __name__ == '__main__':

    if len(sys.argv) < 5:
        print("Usage: python unsupervised_morph.py <infile> <outfile> <language> <indic_resources_path> [<add_marker>]")
        sys.exit(1)

    language = sys.argv[3]
    common.INDIC_RESOURCES_PATH = sys.argv[4]

    add_marker = False
    if len(sys.argv) == 6:
        add_marker = (sys.argv[5] == 'True')

    print('Loading morph analyser for ' + language)
    analyzer = UnsupervisedMorphAnalyzer(language, add_marker)
    print('Loaded morph analyser for ' + language)
    with codecs.open(sys.argv[1], 'r', 'utf-8') as ifile:
        with codecs.open(sys.argv[2], 'w', 'utf-8') as ofile:
            for line in ifile:
                line = line.strip()
                tokens = indic_tokenize.trivial_tokenize(line)
                morph_tokens = analyzer.morph_analyze_document(tokens)
                ofile.write(' '.join(morph_tokens))
                ofile.write('\n')
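# Example invocation (paths and language code are placeholders):
#   python unsupervised_morph.py input.txt segmented.txt hi /path/to/indic_nlp_resources True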