# -*- coding: utf-8 -*-
"""Extract candidate parameter phrases from a sentence using NLTK chunking."""

import re

import Levenshtein
import nltk

from App.bin import constants

# NOTE(review): this grammar is reproduced exactly as found in the file. Its
# chunk rules contain quantifiers ('+', '?') with no <TAG> patterns in front
# of them, so it looks like the angle-bracketed POS tags were stripped at some
# point (presumably by an HTML-stripping step). nltk.RegexpParser is unlikely
# to accept it in this form -- restore the intended tag patterns from the
# upstream file before relying on extract_parameters(). TODO confirm.
_PARAMETER_GRAMMAR = """PARAMETER:{+
?+} {+} """


def _compile_word_pattern(asset_name):
    """Build a whole-word alternation regex from one asset file.

    The file ``constants.ASSETS + asset_name`` is read as one entry per line
    and the entries are joined into a single word-boundary-delimited
    alternation.

    Blank lines are dropped: an empty alternative matches at every word
    boundary, which would make the pattern match any text at all.

    Returns:
        The compiled pattern, or None when the file has no usable entries.

    Raises:
        IOError/OSError: if the asset file cannot be opened.
        re.error: if the joined entries do not form a valid regex.
    """
    with open(constants.ASSETS + asset_name, 'r') as handle:
        entries = [line for line in handle.read().splitlines() if line.strip()]
    if not entries:
        return None
    # Entries are inserted unescaped, exactly as before, so an entry in the
    # asset file may itself be a regex fragment.
    return re.compile(r'(\b(?:%s)\b)' % '|'.join(entries))


class ParameterExtractor(object):
    """Find noun-phrase style "parameter" candidates in a single sentence."""

    def __init__(self, sentence):
        """Store the raw sentence text to analyse."""
        self.sentence = sentence

    def clean_parameter(self, parameter):
        """Drop a trailing single-letter token and surrounding whitespace.

        A final whitespace character followed by exactly one ASCII letter at
        the end of ``parameter`` is removed, then the result is stripped.
        """
        trimmed = re.sub(r'\s[a-zA-Z]$', r'', parameter)
        return trimmed.strip()

    def extract_parameters(self):
        """Return the distinct parameter phrases found in ``self.sentence``.

        The sentence is only analysed when it contains at least one entry of
        the ``parameter_core`` asset. It is then tokenised, POS-tagged and
        chunked with the PARAMETER grammar; a chunk is kept when it contains
        a core word and no entry of the ``exclude_from_parameters`` asset.

        Returns:
            A list of unique phrases in order of first appearance; empty when
            the sentence is empty or contains no core word.
        """
        sentence = self.sentence
        if not sentence:
            return []

        core_pattern = _compile_word_pattern("parameter_core")
        excluded_pattern = _compile_word_pattern("exclude_from_parameters")

        # Without a core vocabulary nothing can qualify as a parameter.
        if core_pattern is None or not core_pattern.search(sentence):
            return []

        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        tree = nltk.RegexpParser(_PARAMETER_GRAMMAR).parse(tagged_tokens)

        parameters_list = []
        seen = set()
        for subtree in tree.subtrees():
            if subtree.label() != 'PARAMETER':
                continue
            candidate = " ".join(word for word, _tag in subtree.leaves())
            if not core_pattern.search(candidate):
                continue
            # An empty exclusion list excludes nothing.
            if excluded_pattern is not None and excluded_pattern.search(candidate):
                continue
            # NOTE: clean_parameter() is deliberately not applied here; the
            # call was already disabled in the original code.
            if candidate not in seen:
                seen.add(candidate)
                parameters_list.append(candidate)
        return parameters_list