Spaces:
Build error
Build error
# -*- coding: utf-8 -*- | |
""" | |
basic_sentiment_analysis
~~~~~~~~~~~~~~~~~~~~~~~~
This module contains the code and examples described in
http://fjavieralba.com/basic-sentiment-analysis-with-python.html
""" | |
from pprint import pprint | |
import nltk | |
import yaml | |
import sys | |
import os | |
import re | |
from App.bin.constants import ASSETS | |
class Splitter(object):
    """Cut raw text into sentences and each sentence into word tokens."""

    def __init__(self):
        # Sentence boundaries come from the pickled English Punkt model;
        # words inside a sentence come from the Treebank tokenizer.
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        Tokenize a paragraph.

        input format: a paragraph of text
        output format: one list of words per detected sentence,
        e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        word_lists = []
        for sentence in self.nltk_splitter.tokenize(text):
            word_lists.append(self.nltk_tokenizer.tokenize(sentence))
        return word_lists
class POSTagger(object):
    """Attach part-of-speech tags to already tokenized sentences."""

    def __init__(self):
        """Nothing to set up: tagging is delegated to ``nltk.pos_tag``."""

    def pos_tag(self, sentences):
        """
        Tag every word of every sentence.

        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged token is a
            ``(form, lemma, [tags])`` triple. No lemmatisation is performed
            here: the lemma slot simply repeats the word form, and the tag
            list holds the single tag returned by ``nltk.pos_tag``.
            e.g.: [[('this', 'this', ['DT']), ('is', 'is', ['VBZ'])], ...]
        """
        tagged_sentences = []
        for sentence in sentences:
            # Reshape nltk's (word, tag) pairs into (form, lemma, [tags]).
            tagged_sentences.append(
                [(token, token, [label]) for token, label in nltk.pos_tag(sentence)]
            )
        return tagged_sentences
class DictionaryTagger(object):
    """
    Tag token sequences with labels looked up in YAML dictionaries.

    Each dictionary maps an expression (one or more space-separated words)
    to a list of tags. Dictionaries are merged: when the same expression
    appears in several files, its tag lists are concatenated.
    """

    def __init__(self, dictionary_paths):
        """
        Load and merge the dictionaries.

        dictionary_paths: iterable of paths to YAML files, each holding a
            mapping of expression -> list of tags.
        """
        self.dictionary = {}
        # Character length of the longest key. It is used by tag_sentence as
        # an upper bound on how many tokens a match may span (a key can never
        # contain more words than characters).
        self.max_key_size = 0
        for path in dictionary_paths:
            # `with` closes each file as soon as it is parsed. The previous
            # `map(lambda x: x.close(), files)` is lazy under Python 3 and was
            # never consumed, so the handles were never closed.
            with open(path, 'r') as dict_file:
                # An empty YAML document loads as None; treat it as a
                # dictionary with no entries instead of failing on iteration.
                entries = yaml.safe_load(dict_file) or {}
            for key, tags in entries.items():
                if key in self.dictionary:
                    self.dictionary[key].extend(tags)
                else:
                    self.dictionary[key] = tags
                self.max_key_size = max(self.max_key_size, len(key))

    def tag(self, postagged_sentences):
        """Apply tag_sentence to every sentence and return the list of results."""
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """
        Tag one sentence of ``(form, lemma, [tags])`` tokens.

        The result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right

        sentence: list of ``(form, lemma, [tags])`` tokens.
        tag_with_lemmas: look up the joined lemmas instead of the joined
            forms when True.

        Returns a new list. A matched span is replaced by a single
        ``(form, lemma, tags)`` token whose form and lemma are the
        lower-cased, space-joined span and whose tags come from the
        dictionary; a single-token match additionally keeps the token's
        previous tags. Unmatched tokens are passed through unchanged.
        """
        tagged_sentence = []
        total = len(sentence)
        # With no usable key length, fall back to the sentence length.
        # This is kept local so that tagging never rewrites instance state.
        window = self.max_key_size or total
        i = 0
        while i < total:
            match_end = None
            # Try the longest candidate span first, shrinking from the right.
            for j in range(min(i + window, total), i, -1):
                span = sentence[i:j]
                expression_form = ' '.join(token[0] for token in span).lower()
                expression_lemma = ' '.join(token[1] for token in span).lower()
                literal = expression_lemma if tag_with_lemmas else expression_form
                if literal in self.dictionary:
                    # Copy so the dictionary's own tag list is never mutated.
                    taggings = list(self.dictionary[literal])
                    if j - i == 1:
                        # A single-token match conserves its previous taggings.
                        taggings.extend(sentence[i][2])
                    tagged_sentence.append((expression_form, expression_lemma, taggings))
                    match_end = j
                    break
            if match_end is None:
                tagged_sentence.append(sentence[i])
                i += 1
            else:
                i = match_end
        return tagged_sentence
class ClassifyWithIncr_it(object):
    """
    Score text with a sentiment dictionary and map the score to a polarity.

    Tokens tagged 'positive' / 'negative' contribute +1 / -1. The tags of
    the immediately preceding token can modify a token's score: 'inc'
    doubles it, 'dec' halves it and 'inv' flips its sign.
    """

    def __init__(self):
        print("printing")

    def value_of(self, sentiment):
        """Return 1 for 'positive', -1 for 'negative' and 0 for any other tag."""
        if sentiment == 'positive':
            return 1
        if sentiment == 'negative':
            return -1
        return 0

    def sentence_score(self, sentence_tokens, previous_token, acum_score):
        """
        Add the score of every token in ``sentence_tokens`` to ``acum_score``.

        sentence_tokens: sequence of ``(form, lemma, [tags])`` tokens.
        previous_token: token preceding the first one, or None; its tags
            may modify the first token's score.
        acum_score: score accumulated so far.

        Returns the accumulated score. Implemented as a loop rather than one
        recursive call per token, so long sentences cannot exhaust the
        interpreter's recursion limit and no per-token list copies are made.
        """
        if not sentence_tokens:
            return acum_score
        for current_token in sentence_tokens:
            token_score = sum(self.value_of(tag) for tag in current_token[2])
            if previous_token is not None:
                previous_tags = previous_token[2]
                # Only one modifier applies, with priority inc > dec > inv.
                if 'inc' in previous_tags:
                    token_score *= 2.0
                elif 'dec' in previous_tags:
                    token_score /= 2.0
                elif 'inv' in previous_tags:
                    token_score *= -1.0
            acum_score = acum_score + token_score
            previous_token = current_token
        return acum_score

    def sentiment_score(self, review):
        """Return the sum of the sentence scores of ``review`` (a list of tagged sentences)."""
        return sum(self.sentence_score(sentence, None, 0.0) for sentence in review)

    def main(self, sentence):
        """
        Classify a piece of text.

        The text is split into sentences and words, POS-tagged, tagged with
        the positive / negative / inc / dec / inv dictionaries found under
        ``ASSETS``, and scored. The score is printed.

        Returns "problem" for a negative score, "partialSolution" for a
        positive score and "neutre" for a score of zero.
        """
        splitter = Splitter()
        postagger = POSTagger()
        # ASSETS is concatenated as-is, so it must already end with a path
        # separator.
        dictionary_files = [
            ASSETS + "dicts/positive.yml",
            ASSETS + "dicts/negative.yml",
            ASSETS + "dicts/inc.yml",
            ASSETS + "dicts/dec.yml",
            ASSETS + "dicts/inv.yml",
        ]
        dicttagger = DictionaryTagger(dictionary_files)

        splitted_sentences = splitter.split(sentence)
        pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
        dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)

        print("Classification...")
        result = self.sentiment_score(dict_tagged_sentences)
        print(result)

        if result < 0:
            polarity = "problem"
        elif result > 0:
            polarity = "partialSolution"
        else:
            polarity = "neutre"
        return polarity
if __name__ == '__main__':
    # Manual smoke run: classify one sample phrase and print its polarity.
    sample_text = """this/these can be annoying"""
    classifier = ClassifyWithIncr_it()
    polarity = classifier.main(sample_text)
    print(polarity)