File size: 2,031 Bytes
22738ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# -*- coding: utf-8 -*-

import re
import nltk
import Levenshtein
from App.bin import constants

class ParameterExtractor(object):
    """Extract parameter-like noun phrases from a single sentence.

    Candidate phrases are noun chunks produced by an NLTK ``RegexpParser``.
    A candidate is kept when it contains at least one entry of the
    ``parameter_core`` asset file and no entry of the
    ``exclude_from_parameters`` asset file.
    """

    def __init__(self, sentence):
        """Store the sentence to analyse.

        :param sentence: sentence text; it is searched with ``re`` and
            tokenized with ``nltk.word_tokenize`` in ``extract_parameters``.
        """
        self.sentence = sentence

    def clean_parameter(self, parameter):
        """Return *parameter* without a trailing one-letter token.

        A whitespace character followed by a single ASCII letter at the very
        end of the string is removed, then surrounding whitespace is stripped.

        :param parameter: candidate phrase as a string.
        :returns: the cleaned string.
        """
        line = re.sub(r'\s[a-zA-Z]$', r'', parameter)
        return line.strip()

    @staticmethod
    def _read_word_list(path):
        """Return the lines of the text file at *path* (without line ends)."""
        with open(path, 'r') as handle:
            return handle.read().splitlines()

    @staticmethod
    def _compile_word_pattern(words):
        """Compile *words* into one whole-word alternation regex.

        Blank or whitespace-only entries are dropped: an empty alternative
        would make ``\\b(?:)\\b`` match at every word boundary, so an empty
        file (or a stray blank line) would match any text.

        Entries are inserted into the pattern unescaped, exactly as before,
        so they are interpreted as regular-expression fragments.

        :param words: iterable of strings.
        :returns: a compiled pattern, or ``None`` when no usable entry
            remains (meaning "nothing can match").
        """
        usable = [word for word in words if word and word.strip()]
        if not usable:
            return None
        return re.compile(r'(\b(?:%s)\b)' % '|'.join(usable))

    def extract_parameters(self):
        """Return the parameter phrases found in ``self.sentence``.

        Both asset files are read on every call, from
        ``constants.ASSETS + "parameter_core"`` and
        ``constants.ASSETS + "exclude_from_parameters"``.

        :returns: list of unique phrases (space-joined tokens), in order of
            first appearance in the parse tree; empty when the sentence
            contains no core word or when the core word list is empty.
        :raises IOError: (``OSError`` on Python 3) if an asset file cannot
            be opened.
        """
        core_pattern = self._compile_word_pattern(
            self._read_word_list(constants.ASSETS + "parameter_core"))
        excluded_pattern = self._compile_word_pattern(
            self._read_word_list(constants.ASSETS + "exclude_from_parameters"))

        # Cheap pre-check: skip tokenizing/tagging when the sentence holds
        # no core word at all (or when there are no core words to look for).
        if core_pattern is None or not core_pattern.search(self.sentence):
            return []

        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(self.sentence))
        # NOTE(review): in an NLTK tag pattern ``<NN*>`` is a regex over the
        # tag, so it presumably matches only ``NN`` (not ``NNS``/``NNP``);
        # ``<NN.*>`` may have been intended -- confirm before changing.
        grammar = """PARAMETER:{<NN>+<IN><DT>?<NN.*>+}

                                {<NN*>+}

                        """
        tree = nltk.RegexpParser(grammar).parse(tagged_tokens)

        parameters_list = []
        seen = set()
        for subtree in tree.subtrees():
            if subtree.label() != 'PARAMETER':
                continue
            candidate = " ".join(word for word, tag in subtree.leaves())
            if candidate in seen:
                continue
            if not core_pattern.search(candidate):
                continue
            # No exclusion pattern means nothing is excluded.
            if excluded_pattern is not None and excluded_pattern.search(candidate):
                continue
            # clean_parameter() is deliberately not applied here; the call
            # was disabled in the original implementation.
            seen.add(candidate)
            parameters_list.append(candidate)

        return parameters_list