# Source: PatentSolver / App / bin / ClassifierWithIncr.py - "initial commit" 22738ca (xin), 6.7 kB
# -*- coding: utf-8 -*-
"""
basic_sentiment_analysis
~~~~~~~~~~~~~~~~~~~~~~~~
This module contains the code and examples described in
http://fjavieralba.com/basic-sentiment-analysis-with-python.html
"""
from pprint import pprint
import nltk
import yaml
import sys
import os
import re
from App.bin.constants import ASSETS
class Splitter(object):
    """Cut a paragraph into sentences, then each sentence into word tokens."""

    def __init__(self):
        # Sentence segmentation uses the English Punkt resource shipped with
        # NLTK data; word segmentation uses the Treebank tokenizer.
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        Tokenize ``text`` sentence by sentence.

        input format: a paragraph of text
        output format: a list with one entry per sentence, each entry being
        the list of word tokens of that sentence.
        e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        word_lists = []
        for sentence_text in self.nltk_splitter.tokenize(text):
            word_lists.append(self.nltk_tokenizer.tokenize(sentence_text))
        return word_lists
class POSTagger(object):
    """Attach a part-of-speech tag to every token using ``nltk.pos_tag``."""

    def __init__(self):
        # Nothing to set up: tagging is delegated to nltk at call time.
        pass

    def pos_tag(self, sentences):
        """
        Tag every word of every sentence.

        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged token is a
        ``(form, lemma, tags)`` triple where ``tags`` is a one-element list
        holding the tag returned by ``nltk.pos_tag``.
        No lemmatization is performed here: the ``lemma`` slot simply repeats
        the word form.
            e.g.: [[('this', 'this', ['DT']), ('sentence', 'sentence', ['NN'])]]
        """
        tagged_sentences = []
        for sentence in sentences:
            tagged_sentences.append(
                [(word, word, [postag]) for word, postag in nltk.pos_tag(sentence)]
            )
        return tagged_sentences
class DictionaryTagger(object):
    """Tag tokens (or multi-token expressions) found in YAML dictionaries.

    Each dictionary file is expected to map an expression (lower-case words
    separated by single spaces) to a list of tags. Dictionaries are merged:
    an expression present in several files receives the tags of all of them.
    """

    def __init__(self, dictionary_paths):
        """Load and merge the YAML dictionaries listed in ``dictionary_paths``.

        Raises whatever ``open`` / ``yaml.safe_load`` raise for a missing or
        malformed file.
        """
        self.dictionary = {}
        # Upper bound on the number of tokens to look ahead when matching.
        # It is the length in characters of the longest key, which is always
        # >= its number of tokens.
        self.max_key_size = 0
        for path in dictionary_paths:
            # The context manager guarantees the handle is closed; a lazy
            # ``map(close, files)`` is never executed on Python 3.
            with open(path, 'r', encoding='utf-8') as dict_file:
                # An empty YAML document loads as None: treat it as "no entries".
                curr_dict = yaml.safe_load(dict_file) or {}
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    self.dictionary[key] = curr_dict[key]
                self.max_key_size = max(self.max_key_size, len(key))

    def tag(self, postagged_sentences):
        """Apply :meth:`tag_sentence` to every sentence and return the list."""
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """
        Tag one sentence of ``(form, lemma, tags)`` tokens.

        The result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right

        Matching is done on the lower-cased, space-joined forms (or lemmas
        when ``tag_with_lemmas`` is true). A matched span is replaced by a
        single ``(form, lemma, tags)`` token carrying the dictionary tags;
        a single-token match additionally keeps the token's previous tags.
        Unmatched tokens are copied through unchanged.
        """
        tagged_sentence = []
        n_tokens = len(sentence)
        # With no known key size, fall back to the whole sentence as window.
        # This is kept local so tagging never alters the tagger's state.
        window = self.max_key_size if self.max_key_size > 0 else n_tokens
        i = 0
        while i < n_tokens:
            # Try the longest candidate span first, shrinking towards one token.
            for j in range(min(i + window, n_tokens), i, -1):
                span = sentence[i:j]
                expression_form = ' '.join(token[0] for token in span).lower()
                expression_lemma = ' '.join(token[1] for token in span).lower()
                literal = expression_lemma if tag_with_lemmas else expression_form
                if literal in self.dictionary:
                    # Copy so the dictionary entry itself is never extended.
                    taggings = list(self.dictionary[literal])
                    if j - i == 1:
                        # Single token: conserve its previous taggings.
                        taggings.extend(sentence[i][2])
                    tagged_sentence.append(
                        (expression_form, expression_lemma, taggings)
                    )
                    i = j  # resume right after the matched span
                    break
            else:
                # No span starting at i matched: keep the token as is.
                tagged_sentence.append(sentence[i])
                i += 1
        return tagged_sentence
class ClassifyWithIncr_it(object):
    """Dictionary-based polarity classifier with incrementers/decrementers.

    Tokens tagged 'positive' / 'negative' contribute +1 / -1; a token whose
    predecessor is tagged 'inc', 'dec' or 'inv' has its contribution doubled,
    halved or negated respectively.
    """

    def __init__(self):
        # NOTE(review): looks like leftover debug output; kept so that
        # constructing the classifier behaves exactly as before.
        print("printing")

    def value_of(self, sentiment):
        """Return 1 for 'positive', -1 for 'negative' and 0 for any other tag."""
        if sentiment == 'positive':
            return 1
        if sentiment == 'negative':
            return -1
        return 0

    def sentence_score(self, sentence_tokens, previous_token, acum_score):
        """Return ``acum_score`` plus the score of ``sentence_tokens``.

        ``sentence_tokens`` is a sequence of ``(form, lemma, tags)`` tokens.
        ``previous_token`` is the token preceding the first one (or None);
        its tags may modify the first token's score.

        Implemented as a loop: the former one-call-per-token recursion raised
        RecursionError on long sentences and copied the list at each step.
        """
        # ``or ()`` keeps the old behaviour of returning acum_score for None.
        for current_token in sentence_tokens or ():
            tags = current_token[2]
            token_score = sum(self.value_of(tag) for tag in tags)
            if previous_token is not None:
                previous_tags = previous_token[2]
                # Only the first applicable modifier is used: inc, dec, inv.
                if 'inc' in previous_tags:
                    token_score *= 2.0
                elif 'dec' in previous_tags:
                    token_score /= 2.0
                elif 'inv' in previous_tags:
                    token_score *= -1.0
            acum_score = acum_score + token_score
            previous_token = current_token
        return acum_score

    def sentiment_score(self, review):
        """Return the sum of the scores of every sentence in ``review``."""
        return sum(self.sentence_score(sentence, None, 0.0) for sentence in review)

    def main(self, sentence):
        """Classify ``sentence`` (raw text) and return its polarity label.

        The text is split, POS-tagged and dictionary-tagged, then scored.
        Returns "problem" for a negative score, "partialSolution" for a
        positive one and "neutre" for zero. The score is printed to stdout.
        """
        splitter = Splitter()
        postagger = POSTagger()

        # Dictionary order: positive, negative, incrementers, decrementers,
        # inverters.
        dictionary_names = ("positive", "negative", "inc", "dec", "inv")
        dictionary_paths = [ASSETS + "dicts/" + name + ".yml" for name in dictionary_names]
        dicttagger = DictionaryTagger(dictionary_paths)

        splitted_sentences = splitter.split(sentence)
        pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
        dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)

        print("Classification...")
        result = self.sentiment_score(dict_tagged_sentences)
        print(result)

        if result < 0:
            polarity = "problem"
        elif result > 0:
            polarity = "partialSolution"
        else:
            polarity = "neutre"
        return polarity
if __name__ == '__main__':
    # Manual smoke run: classify one sample sentence and show its polarity.
    sample_text = """this/these can be annoying"""
    classifier = ClassifyWithIncr_it()
    polarity = classifier.main(sample_text)
    print(polarity)