import os
import tempfile

from nltk.parse.dependencygraph import DependencyGraph
from nltk.parse.malt import MaltParser

from .stemmer import FindStems
from .postagger import POSTagger
from .tokenizer import Tokenizer
from .normalizer import Normalizer


class MyMaltParser(MaltParser):
    """MaltParser front end that parses already POS-tagged sentences with a pre-trained model."""

    def __init__(self, parser_dirname, model_filename, tagger, stemmer):
        """
        An interface for parsing with the Malt Parser.

        :param parser_dirname: The path to the maltparser directory that
            contains the maltparser-1.x.jar
        :type parser_dirname: str
        :param model_filename: The name of the pre-trained model with .mco file
            extension. If provided, training will not be required.
            (see http://www.maltparser.org/mco/mco.html and
            see http://www.patful.com/chalk/node/185)
        :type model_filename: str
        :param tagger: The tagger used to POS tag the raw string before
            formatting to CONLL format. It should behave like `nltk.pos_tag`
        :type tagger: function
        :param stemmer: An object exposing ``convert_to_stem(word, tag)`` that
            returns the stem of the word. When falsy, every stem is written
            as ``'_'``.
        :type stemmer: object
        """
        # NOTE(review): the base-class __init__ is not called here; presumably
        # to bypass its own jar/model discovery -- confirm.
        self.working_dir = parser_dirname
        self.mco = model_filename
        self.pos_tagger = tagger
        self._malt_bin = os.path.join(parser_dirname, 'maltparser-1.9.2.jar')
        self.stemmer = stemmer.convert_to_stem if stemmer else lambda w, t: '_'

    def _write_conll_input(self, sentences, handle):
        """Write ``sentences`` to the binary file ``handle`` as UTF-8 CoNLL rows.

        Each sentence is followed by one empty line.
        """
        for sentence in sentences:
            for i, (word, tag) in enumerate(sentence, start=1):
                # Blank tokens are written as the CoNLL placeholder.
                word = word.strip() or '_'
                fields = [
                    str(i),
                    word.replace(' ', '_'),
                    self.stemmer(word, tag).replace(' ', '_'),
                    tag,
                    tag,
                    '_',
                    '0',
                    'ROOT',
                    '_',
                    '_',
                    # Joined as a field, so each row ends with "\t\n"; kept
                    # as-is so the bytes handed to MaltParser do not change.
                    '\n',
                ]
                handle.write('\t'.join(fields).encode('utf8'))
            handle.write('\n'.encode('utf8'))

    def parse_tagged_sent(self, sentences, verbose=False, top_relation_label='null'):
        """
        Parse POS-tagged sentences by running MaltParser as a ``java`` subprocess.

        The sentences are written to a temporary CoNLL file, MaltParser is run
        in ``parse`` mode with the configured model, and its output file is
        read back. Both temporary files are removed afterwards, also when
        parsing fails.

        :param sentences: Iterable of sentences, each an iterable of
            ``(word, tag)`` pairs
        :param verbose: Forwarded to ``self._execute``
        :type verbose: bool
        :param top_relation_label: Accepted for interface compatibility;
            currently not used by this method
        :type top_relation_label: str
        :return: One ``DependencyGraph`` per blank-line separated chunk of the
            parser output
        :rtype: list(DependencyGraph)
        :raises Exception: If the MaltParser process exits with a non-zero code
        """
        tmp_dir = tempfile.gettempdir()
        temp_files = []
        try:
            input_file = tempfile.NamedTemporaryFile(
                prefix='malt_input.conll', dir=tmp_dir, delete=False)
            temp_files.append(input_file)
            output_file = tempfile.NamedTemporaryFile(
                prefix='malt_output.conll', dir=tmp_dir, delete=False)
            temp_files.append(output_file)
            # Only the path is needed; release the handle before the external
            # process writes to the same file.
            output_file.close()

            with input_file:
                self._write_conll_input(sentences, input_file)

            cmd = ['java', '-jar', self._malt_bin,
                   '-w', self.working_dir,
                   '-c', self.mco,
                   '-i', input_file.name,
                   '-o', output_file.name,
                   '-m', 'parse']
            if self._execute(cmd, verbose) != 0:
                raise Exception("MaltParser parsing failed: %s" % (' '.join(cmd)))

            with open(output_file.name, encoding='utf-8') as infile:
                content = infile.read().strip().split('\n\n')
            return [DependencyGraph(sent) for sent in content]
        finally:
            # Temp files are created with delete=False, so remove them
            # explicitly on every exit path.
            for handle in temp_files:
                handle.close()
                try:
                    os.remove(handle.name)
                except OSError:
                    # Best-effort cleanup: a failed delete must not mask the
                    # parse result or an in-flight exception.
                    pass


class DependencyParser:
    """Pipeline that tokenizes, POS-tags and dependency-parses sentences."""

    def __init__(self, _normalizer=None, _tokenizer=None, _stemmer=None, _tagger=None):
        """
        Build the pipeline, creating a default for every component not supplied.

        :param _normalizer: Normalizer instance; defaults to ``Normalizer()``
        :param _tokenizer: Tokenizer instance; defaults to ``Tokenizer()``
        :param _stemmer: Stemmer instance; defaults to ``FindStems()``
        :param _tagger: Callable that tags a list of tokens; defaults to
            ``POSTagger(tagging_model="wapiti").parse``
        """
        # Directory of this module, with a trailing slash.
        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"

        self.my_normalizer = Normalizer() if _normalizer is None else _normalizer
        self.my_tokenizer = Tokenizer() if _tokenizer is None else _tokenizer
        self.my_stemmer = FindStems() if _stemmer is None else _stemmer
        self.my_tagger = (
            POSTagger(tagging_model="wapiti").parse if _tagger is None else _tagger
        )

        self.parser = MyMaltParser(
            parser_dirname=self.dir_path + 'resource/dependency_parser',
            model_filename='total_dep_parser.mco',
            tagger=self.my_tagger,
            stemmer=self.my_stemmer)

    def make_trainable_corpus(self, in_file, out_file):
        """
        Re-tag a tab-separated CoNLL corpus with this pipeline's tagger.

        Sentences are separated by blank lines. For every row, the tag the
        tagger assigns to the token in column 1 replaces columns 3 and 4.

        :param in_file: Path of the corpus to read (UTF-8)
        :type in_file: str
        :param out_file: Path the re-tagged corpus is written to (UTF-8)
        :type out_file: str
        :return: The re-tagged corpus text that was written
        :rtype: str
        """
        tagger = self.my_tagger
        with open(in_file, 'r', encoding='utf-8') as infile:
            sentences = infile.read().strip().split('\n\n')

        for i, sent in enumerate(sentences):
            if not sent:
                continue
            rows = [line.split('\t') for line in sent.split('\n')]
            tagged_sent = tagger([row[1] for row in rows])
            tags = [pair[1] for pair in tagged_sent]
            for j, row in enumerate(rows):
                row[3] = tags[j]
                row[4] = tags[j]
            sentences[i] = '\n'.join('\t'.join(row) for row in rows)

        content = '\n\n'.join(sentences)
        with open(out_file, 'w', encoding='utf-8') as outfile:
            outfile.write(content)
        return content

    def parse_sents(self, sents, verbose=False):
        """
        Tokenize, tag and dependency-parse raw sentences.

        :param sents: Iterable of sentence strings
        :param verbose: Forwarded to the underlying parser
        :type verbose: bool
        :return: The result of ``MyMaltParser.parse_tagged_sent`` for the
            tagged sentences
        """
        tagger = self.my_tagger
        tagged_sents = [
            tagger(self.my_tokenizer.tokenize_words(sent)) for sent in sents
        ]
        return self.parser.parse_tagged_sent(tagged_sents, verbose)