Spaces:
Build error
Build error
# -*- coding: utf-8 -*- | |
import re | |
import nltk | |
import Levenshtein | |
from App.bin import constants | |
class ParameterExtractor(object):
    """Extract candidate parameter phrases from a single sentence.

    A candidate is a noun chunk (see the grammar in ``extract_parameters``)
    that contains at least one word listed in the ``parameter_core`` asset
    file and none of the words listed in ``exclude_from_parameters``.
    """

    def __init__(self, sentence):
        """Store the sentence that ``extract_parameters`` will analyse."""
        self.sentence = sentence

    @staticmethod
    def _read_lines(path):
        """Return the lines of the text file at *path* without line endings."""
        with open(path, 'r') as handle:
            return handle.read().splitlines()

    @staticmethod
    def _compile_word_pattern(words):
        """Compile a whole-word alternation regex from *words*.

        Blank entries are dropped: an empty alternative would make the
        pattern match at every word boundary, which silently turns the
        keyword list into "match everything" (or the exclusion list into
        "exclude everything").

        Entries are inserted verbatim, not escaped, so an entry may itself
        be a regular-expression fragment.

        Returns:
            A compiled pattern, or ``None`` when no usable word remains.
        """
        usable = [word for word in words if word.strip()]
        if not usable:
            return None
        return re.compile(r'(\b(?:%s)\b)' % '|'.join(usable))

    def clean_parameter(self, parameter):
        """Drop a trailing single-letter token and surrounding whitespace.

        For example ``"flow rate q"`` becomes ``"flow rate"``.
        """
        line = re.sub(r'\s[a-zA-Z]$', r'', parameter)
        return line.strip()

    def extract_parameters(self):
        """Return the distinct parameter phrases found in the sentence.

        Returns:
            A list of phrases, in order of first appearance, without
            duplicates. The list is empty when the sentence contains no
            core parameter word or when the core word list is empty.

        Raises:
            IOError/OSError: if either asset file cannot be read.
        """
        sentence = self.sentence

        # Both asset files are read up front so a missing file is reported
        # regardless of what the sentence contains.
        match_word = self._compile_word_pattern(
            self._read_lines(constants.ASSETS + "parameter_core"))
        match_not_included_word = self._compile_word_pattern(
            self._read_lines(constants.ASSETS + "exclude_from_parameters"))

        # Cheap pre-filter: skip tokenising/tagging when no core word occurs.
        if match_word is None or not match_word.search(sentence):
            return []

        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

        # NOTE(review): the second rule uses <NN*> while the first uses
        # <NN.*>; <NN*> presumably does not cover NNS/NNP tags - confirm
        # whether <NN.*> was intended.
        grammar = """PARAMETER:{<NN>+<IN><DT>?<NN.*>+}
                               {<NN*>+}
                  """
        tree = nltk.RegexpParser(grammar).parse(tagged_words)

        parameters_list = []
        seen = set()
        for subtree in tree.subtrees():
            if subtree.label() != 'PARAMETER':
                continue
            candidate = " ".join(word for word, tag in subtree.leaves())
            if not match_word.search(candidate):
                continue
            # An empty exclusion list excludes nothing.
            if (match_not_included_word is not None
                    and match_not_included_word.search(candidate)):
                continue
            # De-duplicate while keeping first-seen order so the result is
            # deterministic across runs.
            if candidate not in seen:
                seen.add(candidate)
                parameters_list.append(candidate)
        return parameters_list