Spaces:

xin
/

PatentSolver

Build error

App Files Files Community

PatentSolver / App /bin /ClassifierWithIncr.py

xin

initial commit

22738ca about 3 years ago

raw

history blame

6.7 kB

	# -- coding: utf-8 --
	"""
	basic_sentiment_analysis
	~~~~~~~~~~~~~~~~~~~~~~~~

	This module contains the code and examples described in
	http://fjavieralba.com/basic-sentiment-analysis-with-python.html

	"""

	from pprint import pprint
	import nltk
	import yaml
	import sys
	import os
	import re
	from App.bin.constants import ASSETS


	class Splitter(object):
	    """Turn a paragraph of raw text into a list of tokenized sentences."""

	    def __init__(self):
	        # Sentence boundary model loaded from NLTK's Punkt English resource,
	        # plus a Treebank-style tokenizer for splitting sentences into words.
	        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
	        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

	    def split(self, text):
	        """
	        Split ``text`` into sentences, then each sentence into word tokens.

	        input format: a paragraph of text
	        output format: a list with one entry per sentence, each entry being
	        the list of that sentence's tokens.
	        e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
	        """
	        tokenized_sentences = []
	        for raw_sentence in self.nltk_splitter.tokenize(text):
	            words = self.nltk_tokenizer.tokenize(raw_sentence)
	            tokenized_sentences.append(words)
	        return tokenized_sentences


	class POSTagger(object):
	    """Attach part-of-speech tags (via ``nltk.pos_tag``) to tokenized sentences."""

	    def __init__(self):
	        pass

	    def pos_tag(self, sentences):
	        """
	        Tag every token of every sentence with its part of speech.

	        input format: list of lists of words
	        e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
	        output format: list of lists of tagged tokens. Each tagged token is a
	        ``(form, lemma, tags)`` triple. No lemmatization is performed here:
	        the lemma slot is a copy of the form, and ``tags`` is a one-element
	        list holding the POS tag returned by NLTK.
	        e.g: [[('this', 'this', ['DT']), ('is', 'is', ['VBZ']), ...], ...]
	        """
	        # First pass: let NLTK produce (word, postag) pairs per sentence.
	        nltk_tagged = []
	        for sentence in sentences:
	            nltk_tagged.append(nltk.pos_tag(sentence))

	        # Second pass: reshape each pair into the (form, lemma, [tags]) triple
	        # that the rest of the pipeline consumes.
	        adapted = []
	        for tagged_sentence in nltk_tagged:
	            triples = [(token, token, [label]) for (token, label) in tagged_sentence]
	            adapted.append(triples)
	        return adapted


	class DictionaryTagger(object):
	    """
	    Tag tokens, or runs of consecutive tokens, using YAML dictionaries.

	    Attributes:
	        dictionary: merged mapping ``expression -> tags`` built from all the
	            files. Lookups use the lower-cased expression, so only keys that
	            are already lower case can match.
	        max_key_size: length in characters of the longest key. It is used as
	            an upper bound on how many tokens a candidate expression spans.
	    """

	    def __init__(self, dictionary_paths):
	        """
	        Load and merge the dictionaries found at ``dictionary_paths``.

	        Each file is presumably a YAML mapping from an expression to a list
	        of tags (values are merged with ``list.extend``) -- verify against
	        the dictionary files. When the same key appears in several files its
	        tag lists are concatenated in file order.

	        Raises:
	            OSError: if one of the paths cannot be opened.
	        """
	        self.dictionary = {}
	        self.max_key_size = 0
	        for path in dictionary_paths:
	            # ``with`` closes the handle deterministically. The previous
	            # ``map(lambda x: x.close(), files)`` is lazy on Python 3 and
	            # never ran, so every file handle leaked.
	            with open(path, 'r', encoding='utf-8') as dict_file:
	                curr_dict = yaml.safe_load(dict_file)
	            # An empty YAML document loads as None; treat it as "no entries".
	            if not curr_dict:
	                continue
	            for key in curr_dict:
	                if key in self.dictionary:
	                    self.dictionary[key].extend(curr_dict[key])
	                else:
	                    self.dictionary[key] = curr_dict[key]
	                    self.max_key_size = max(self.max_key_size, len(key))

	    def tag(self, postagged_sentences):
	        """Apply :meth:`tag_sentence` to every sentence and return the list of results."""
	        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

	    def tag_sentence(self, sentence, tag_with_lemmas=False):
	        """
	        Tag one sentence of ``(form, lemma, tags)`` tokens.

	        the result is only one tagging of all the possible ones.
	        The resulting tagging is determined by these two priority rules:
	        - longest matches have higher priority
	        - search is made from left to right

	        Args:
	            sentence: sequence of ``(form, lemma, tags)`` triples.
	            tag_with_lemmas: look expressions up by their lemmas instead of
	                their surface forms.

	        Returns:
	            A new list of triples. A matched run of tokens is replaced by a
	            single triple whose form and lemma are the space-joined,
	            lower-cased forms and lemmas, and whose tags come from the
	            dictionary. A single-token match also keeps the token's previous
	            tags, appended after the dictionary tags. Unmatched tokens are
	            passed through unchanged.
	        """
	        tag_sentence = []
	        N = len(sentence)
	        # With no dictionary keys there is no size bound; fall back to the
	        # whole sentence. This is kept local so that tagging a sentence never
	        # alters the tagger's state.
	        window = self.max_key_size or N
	        i = 0
	        while i < N:
	            tagged = False
	            # Longest candidate first, shrinking from the right.
	            # min() keeps the window inside the sentence.
	            for j in range(min(i + window, N), i, -1):
	                span = sentence[i:j]
	                expression_form = ' '.join([word[0] for word in span]).lower()
	                expression_lemma = ' '.join([word[1] for word in span]).lower()
	                literal = expression_lemma if tag_with_lemmas else expression_form
	                if literal in self.dictionary:
	                    # Copy so the dictionary's own tag list is never mutated.
	                    taggings = list(self.dictionary[literal])
	                    if j - i == 1:
	                        # if the tagged literal is a single token, conserve
	                        # its previous taggings
	                        taggings.extend(sentence[i][2])
	                    tag_sentence.append((expression_form, expression_lemma, taggings))
	                    i = j  # resume right after the matched expression
	                    tagged = True
	                    break
	            if not tagged:
	                tag_sentence.append(sentence[i])
	                i += 1
	        return tag_sentence

	class ClassifyWithIncr_it(object):
	    """
	    Dictionary-based polarity classifier with incrementer/decrementer/inverter support.

	    Tokens tagged ``'positive'`` or ``'negative'`` contribute +1 / -1, and the
	    tags ``'inc'``, ``'dec'`` and ``'inv'`` on a token modify the score of the
	    token that immediately follows it.
	    """

	    def __init__(self):
	        # Debug trace kept as-is so the program's console output is unchanged.
	        print("printing")

	    def value_of(self, sentiment):
	        """Return 1 for ``'positive'``, -1 for ``'negative'`` and 0 for any other tag."""
	        if sentiment == 'positive':
	            return 1
	        if sentiment == 'negative':
	            return -1
	        return 0

	    def sentence_score(self, sentence_tokens, previous_token, acum_score):
	        """
	        Add the score of ``sentence_tokens`` to ``acum_score`` and return the total.

	        Args:
	            sentence_tokens: sequence of ``(form, lemma, tags)`` triples.
	            previous_token: the triple preceding the first token, or ``None``
	                when the sentence starts here.
	            acum_score: score accumulated so far.

	        Returns:
	            ``acum_score`` plus the score of each token. A token's score is the
	            sum of :meth:`value_of` over its tags, doubled when the previous
	            token carries ``'inc'``, otherwise halved when it carries
	            ``'dec'``, otherwise negated when it carries ``'inv'``.
	        """
	        if not sentence_tokens:
	            return acum_score
	        # Plain loop instead of one recursive call per token: same left-to-right
	        # accumulation, but no recursion-depth limit on long sentences and no
	        # repeated copying of the remaining tokens.
	        for current_token in sentence_tokens:
	            token_score = sum([self.value_of(tag) for tag in current_token[2]])
	            if previous_token is not None:
	                previous_tags = previous_token[2]
	                if 'inc' in previous_tags:
	                    token_score *= 2.0
	                elif 'dec' in previous_tags:
	                    token_score /= 2.0
	                elif 'inv' in previous_tags:
	                    token_score *= -1.0
	            acum_score = acum_score + token_score
	            previous_token = current_token
	        return acum_score

	    def sentiment_score(self, review):
	        """Return the sum of :meth:`sentence_score` over every sentence of ``review``."""
	        return sum([self.sentence_score(sentence, None, 0.0) for sentence in review])

	    def main(self, sentence):
	        """
	        Classify ``sentence`` (raw text) and return its polarity label.

	        The text is split into sentences and tokens, POS-tagged, tagged with the
	        five dictionaries under ``ASSETS + "dicts/"``, and scored. Progress and
	        the numeric score are printed to stdout.

	        Returns:
	            ``"problem"`` for a negative score, ``"partialSolution"`` for a
	            positive score, and ``"neutre"`` when the score is zero.
	        """
	        splitter = Splitter()
	        postagger = POSTagger()
	        dictionary_paths = [
	            ASSETS + "dicts/positive.yml",
	            ASSETS + "dicts/negative.yml",
	            ASSETS + "dicts/inc.yml",
	            ASSETS + "dicts/dec.yml",
	            ASSETS + "dicts/inv.yml",
	        ]
	        dicttagger = DictionaryTagger(dictionary_paths)

	        splitted_sentences = splitter.split(sentence)
	        pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
	        dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)

	        print("Classification...")

	        result = self.sentiment_score(dict_tagged_sentences)
	        print(result)
	        if result < 0:
	            polarity = "problem"
	        elif result > 0:
	            polarity = "partialSolution"
	        else:
	            polarity = "neutre"
	        return polarity

	if __name__ == '__main__':
	    # Manual smoke test: classify one sample sentence and print its polarity.
	    sample_text = """this/these can be annoying"""
	    print(ClassifyWithIncr_it().main(sample_text))