Spaces:
Build error
Build error
File size: 6,704 Bytes
22738ca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
# -*- coding: utf-8 -*-
"""
basic_sentiment_analysis
~~~~~~~~~~~~~~~~~~~~~~~~
This module contains the code and examples described in
http://fjavieralba.com/basic-sentiment-analysis-with-python.html
"""
from pprint import pprint
import nltk
import yaml
import sys
import os
import re
from App.bin.constants import ASSETS
class Splitter(object):
def __init__(self):
self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()
def split(self, text):
"""
input format: a paragraph of text
output format: a list of lists of words.
e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
"""
sentences = self.nltk_splitter.tokenize(text)
tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
return tokenized_sentences
class POSTagger(object):
def __init__(self):
pass
def pos_tag(self, sentences):
"""
input format: list of lists of words
e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
output format: list of lists of tagged tokens. Each tagged tokens has a
form, a lemma, and a list of tags
e.g: [[('this', 'this', ['DT']), ('is', 'be', ['VB']), ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])],
[('this', 'this', ['DT']), ('is', 'be', ['VB']), ('another', 'another', ['DT']), ('one', 'one', ['CARD'])]]
"""
pos = [nltk.pos_tag(sentence) for sentence in sentences]
# adapt format
pos = [[(word, word, [postag]) for (word, postag) in sentence] for sentence in pos]
return pos
class DictionaryTagger(object):
def __init__(self, dictionary_paths):
files = [open(path, 'r') for path in dictionary_paths]
dictionaries = [yaml.safe_load(dict_file) for dict_file in files]
map(lambda x: x.close(), files)
self.dictionary = {}
self.max_key_size = 0
for curr_dict in dictionaries:
for key in curr_dict:
if key in self.dictionary:
self.dictionary[key].extend(curr_dict[key])
else:
self.dictionary[key] = curr_dict[key]
self.max_key_size = max(self.max_key_size, len(key))
def tag(self, postagged_sentences):
return [self.tag_sentence(sentence) for sentence in postagged_sentences]
def tag_sentence(self, sentence, tag_with_lemmas=False):
"""
the result is only one tagging of all the possible ones.
The resulting tagging is determined by these two priority rules:
- longest matches have higher priority
- search is made from left to right
"""
tag_sentence = []
N = len(sentence)
if self.max_key_size == 0:
self.max_key_size = N
i = 0
while (i < N):
j = min(i + self.max_key_size, N) # avoid overflow
tagged = False
while (j > i):
expression_form = ' '.join([word[0] for word in sentence[i:j]]).lower()
expression_lemma = ' '.join([word[1] for word in sentence[i:j]]).lower()
if tag_with_lemmas:
literal = expression_lemma
else:
literal = expression_form
if literal in self.dictionary:
# self.logger.debug("found: %s" % literal)
is_single_token = j - i == 1
original_position = i
i = j
taggings = [tag for tag in self.dictionary[literal]]
tagged_expression = (expression_form, expression_lemma, taggings)
if is_single_token: # if the tagged literal is a single token, conserve its previous taggings:
original_token_tagging = sentence[original_position][2]
tagged_expression[2].extend(original_token_tagging)
tag_sentence.append(tagged_expression)
tagged = True
else:
j = j - 1
if not tagged:
tag_sentence.append(sentence[i])
i += 1
return tag_sentence
class ClassifyWithIncr_it(object):
def __init__(self):
print("printing")
def value_of(self,sentiment):
if sentiment == 'positive': return 1
if sentiment == 'negative': return -1
return 0
def sentence_score(self, sentence_tokens, previous_token, acum_score):
if not sentence_tokens:
return acum_score
else:
current_token = sentence_tokens[0]
tags = current_token[2]
token_score = sum([self.value_of(tag) for tag in tags])
if previous_token is not None:
previous_tags = previous_token[2]
if 'inc' in previous_tags:
token_score *= 2.0
elif 'dec' in previous_tags:
token_score /= 2.0
elif 'inv' in previous_tags:
token_score *= -1.0
return self.sentence_score(sentence_tokens[1:], current_token, acum_score + token_score)
def sentiment_score(self,review):
return sum([self.sentence_score(sentence, None, 0.0) for sentence in review])
def main(self,sentence):
splitter = Splitter()
postagger = POSTagger()
pos=ASSETS+"dicts/positive.yml"
neg= ASSETS+"dicts/negative.yml"
inc=ASSETS+"dicts/inc.yml"
dec=ASSETS+"dicts/dec.yml"
inv=ASSETS+"dicts/inv.yml"
dicttagger = DictionaryTagger([pos, neg,
inc, dec, inv])
splitted_sentences = splitter.split(sentence)
pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
print("Classification...")
result = self.sentiment_score(dict_tagged_sentences)
print (result)
if result < 0:
polarity = "problem"
elif result > 0:
polarity ="partialSolution"
else:
polarity = "neutre"
return polarity
if __name__ == '__main__':
text = """this/these can be annoying"""
test = ClassifyWithIncr_it()
print(test.main(text))
|