Spaces:

xin
/

PatentSolver

Build error

App Files Files Community

PatentSolver / App /bin /ParameterExtractor.py

xin

initial commit

22738ca about 3 years ago

raw

history blame contribute delete

2.03 kB

	# -*- coding: utf-8 -*-

	import re
	import nltk
	import Levenshtein
	from App.bin import constants

	class ParameterExtractor(object):
	    """Pick parameter-like noun phrases out of a single sentence.

	    The sentence is POS-tagged and chunked with NLTK; a chunk is kept when
	    it contains a word from the "parameter_core" asset list and no word
	    from the "exclude_from_parameters" asset list.
	    """

	    # Chunk grammar handed to nltk.RegexpParser. NLTK strips every grammar
	    # line, so only the rule text matters, not its indentation.
	    # NOTE(review): the second rule is kept as written (<NN*>); if it was
	    # meant to accept every noun tag (NNS, NNP, ...) the usual NLTK
	    # spelling is <NN.*> -- confirm before changing.
	    _PARAMETER_GRAMMAR = (
	        "PARAMETER:{<NN>+<IN><DT>?<NN.*>+}\n"
	        "{<NN*>+}\n"
	    )

	    def __init__(self, sentence):
	        """Store the sentence that extract_parameters() will analyse.

	        Args:
	            sentence: text of one sentence (a str).
	        """
	        self.sentence = sentence

	    def clean_parameter(self, parameter):
	        """Return *parameter* without a dangling one-letter last word.

	        A final whitespace character followed by a single ASCII letter is
	        removed, then surrounding whitespace is stripped.

	        Args:
	            parameter: candidate phrase (a str).

	        Returns:
	            The cleaned phrase.
	        """
	        line = re.sub(r'\s[a-zA-Z]$', r'', parameter)
	        return line.strip()

	    @staticmethod
	    def _compile_word_pattern(words):
	        """Compile a whole-word regex matching any entry of *words*.

	        Entries are inserted into the pattern as they are, so an entry may
	        itself be a regular-expression fragment. Blank entries are ignored.

	        Args:
	            words: iterable of str.

	        Returns:
	            A compiled pattern, or None when no usable entry is left.
	        """
	        # A blank entry would add an empty alternative, and an empty
	        # alternative matches at every word boundary.
	        alternatives = [word for word in words if word.strip()]
	        if not alternatives:
	            return None
	        # Join with a plain '|' (alternation). A backslash in front of the
	        # pipe makes it a literal pipe character, which turns the whole
	        # list into one long literal instead of a set of choices.
	        return re.compile(r'(\b(?:%s)\b)' % '|'.join(alternatives))

	    @staticmethod
	    def _load_word_pattern(asset_name):
	        """Read one entry per line from an asset file and compile them.

	        Args:
	            asset_name: file name appended to constants.ASSETS.

	        Returns:
	            Whatever _compile_word_pattern() returns for the file's lines.
	        """
	        with open(constants.ASSETS + asset_name, 'r') as handle:
	            entries = handle.read().splitlines()
	        return ParameterExtractor._compile_word_pattern(entries)

	    def extract_parameters(self):
	        """Return the parameter phrases found in the stored sentence.

	        Both asset files are read on every call. Nothing is tagged or
	        parsed unless the sentence contains at least one core word.

	        Returns:
	            A list of unique phrases (str) in order of first appearance;
	            empty when the sentence holds no core word.
	        """
	        core_pattern = self._load_word_pattern("parameter_core")
	        excluded_pattern = self._load_word_pattern("exclude_from_parameters")

	        sentence = self.sentence
	        if core_pattern is None or not core_pattern.search(sentence):
	            return []

	        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))
	        tree = nltk.RegexpParser(self._PARAMETER_GRAMMAR).parse(tagged_words)

	        candidates = []
	        for subtree in tree.subtrees():
	            if subtree.label() != 'PARAMETER':
	                continue
	            candidate = " ".join(word for word, _tag in subtree.leaves())
	            if not core_pattern.search(candidate):
	                continue
	            if excluded_pattern is not None and excluded_pattern.search(candidate):
	                continue
	            # clean_parameter() is deliberately not applied here; the
	            # original code had that call disabled.
	            candidates.append(candidate)

	        # dict.fromkeys drops duplicates while keeping first-seen order, so
	        # the result is stable from run to run.
	        return list(dict.fromkeys(candidates))