Spaces:

xin
/

PatentSolver

Build error

File size: 6,704 Bytes

22738ca

# -*- coding: utf-8 -*-
"""

basic_sentiment_analysis

~~~~~~~~~~~~~~~~~~~~~~~~



This module contains the code and examples described in 

http://fjavieralba.com/basic-sentiment-analysis-with-python.html



"""

from pprint import pprint
import nltk
import yaml
import sys
import os
import re
from App.bin.constants import ASSETS


class Splitter(object):
    """Break raw text into sentences, then each sentence into word tokens.

    Sentence boundaries come from the NLTK resource
    ``tokenizers/punkt/english.pickle``; words come from NLTK's
    ``TreebankWordTokenizer``.
    """

    def __init__(self):
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """Tokenize *text* sentence by sentence.

        :param text: a paragraph of text.
        :return: a list with one entry per sentence, each entry being the
            list of that sentence's tokens, e.g.
            ``[['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]``
        """
        word_lists = []
        for sentence in self.nltk_splitter.tokenize(text):
            word_lists.append(self.nltk_tokenizer.tokenize(sentence))
        return word_lists


class POSTagger(object):
    """Attach part-of-speech tags to already-tokenized sentences."""

    def __init__(self):
        pass

    def pos_tag(self, sentences):
        """Tag every sentence with ``nltk.pos_tag`` and reshape the result.

        :param sentences: list of lists of words, e.g.
            ``[['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]``
        :return: list of lists of ``(form, lemma, [tag])`` triples, one triple
            per word. No lemmatization is performed: the lemma slot simply
            repeats the word form, and the tag list holds the single tag
            returned by ``nltk.pos_tag`` for that word.
        """
        tagged_sentences = []
        for words in sentences:
            # adapt (word, tag) pairs to the (form, lemma, tags) layout
            triples = [(form, form, [label]) for (form, label) in nltk.pos_tag(words)]
            tagged_sentences.append(triples)
        return tagged_sentences


class DictionaryTagger(object):
    """Tag token sequences with labels looked up in YAML dictionaries.

    Every dictionary file is expected to map an expression (a string of one
    or more space-separated words) to a list of tags. All files are merged
    into ``self.dictionary``; when the same key appears in several files the
    tag lists are concatenated in file order.
    """

    def __init__(self, dictionary_paths):
        """Load and merge the YAML files listed in *dictionary_paths*.

        :param dictionary_paths: iterable of file paths readable by ``open``.
        :raises OSError: if one of the files cannot be opened.
        """
        self.dictionary = {}
        # Length (in characters) of the longest key. Used by tag_sentence as
        # an upper bound on how many tokens a match may span.
        self.max_key_size = 0
        for path in dictionary_paths:
            # ``with`` closes the handle deterministically. The previous
            # ``map(lambda x: x.close(), files)`` is lazy on Python 3 and was
            # never consumed, so no file was ever closed.
            with open(path, 'r') as dict_file:
                # safe_load returns None for an empty document; treat that
                # as an empty dictionary instead of crashing on iteration.
                curr_dict = yaml.safe_load(dict_file) or {}
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    self.dictionary[key] = curr_dict[key]
                    self.max_key_size = max(self.max_key_size, len(key))

    def tag(self, postagged_sentences):
        """Apply :meth:`tag_sentence` to each sentence and return the list."""
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """Return *sentence* with dictionary tags merged in.

        The result is only one tagging of all the possible ones, determined
        by these two priority rules:

            - longest matches have higher priority
            - search is made from left to right

        :param sentence: list of ``(form, lemma, tags)`` triples.
        :param tag_with_lemmas: look up the joined lemmas instead of the
            joined forms.
        :return: a new list of triples. A matched span is collapsed into one
            triple carrying a copy of the dictionary tags; a single-token
            match additionally keeps the token's previous tags after them.
            Unmatched tokens are passed through unchanged.
        """
        tag_sentence = []
        N = len(sentence)
        # Fall back to the sentence length when no key size is known, but
        # keep it local: writing it back to self.max_key_size (as the code
        # used to do) leaked the first sentence's length into later calls.
        window = self.max_key_size if self.max_key_size else N
        i = 0
        while i < N:
            tagged = False
            j = min(i + window, N)  # avoid overflow
            # try the longest candidate span first, then shrink it
            while j > i:
                span = sentence[i:j]
                expression_form = ' '.join(word[0] for word in span).lower()
                expression_lemma = ' '.join(word[1] for word in span).lower()
                literal = expression_lemma if tag_with_lemmas else expression_form
                if literal in self.dictionary:
                    # copy, so extending below never touches the dictionary
                    taggings = list(self.dictionary[literal])
                    if j - i == 1:
                        # single token: conserve its previous taggings
                        taggings.extend(sentence[i][2])
                    tag_sentence.append((expression_form, expression_lemma, taggings))
                    i = j  # resume scanning right after the matched span
                    tagged = True
                    break
                j -= 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence

class ClassifyWithIncr_it(object):
    """Score dictionary-tagged text and map the score to a polarity label.

    'positive' tags count +1 and 'negative' tags count -1. A token's score
    is modified by the tags of the token immediately before it: 'inc'
    doubles it, 'dec' halves it, 'inv' negates it.
    """

    def __init__(self):
        print("printing")

    def value_of(self, sentiment):
        """Return 1 for 'positive', -1 for 'negative' and 0 for any other tag."""
        if sentiment == 'positive':
            return 1
        if sentiment == 'negative':
            return -1
        return 0

    def sentence_score(self, sentence_tokens, previous_token, acum_score):
        """Add the score of every token in *sentence_tokens* to *acum_score*.

        Implemented as a loop: the earlier recursive version used one stack
        frame and one list slice per token, so a sentence of about a
        thousand tokens raised ``RecursionError``.

        :param sentence_tokens: list of ``(form, lemma, tags)`` triples.
        :param previous_token: the triple preceding the first token, or
            ``None`` when there is none.
        :param acum_score: score accumulated so far.
        :return: *acum_score* plus the (possibly modified) token scores.
        """
        if not sentence_tokens:
            return acum_score
        for current_token in sentence_tokens:
            token_score = sum(self.value_of(tag) for tag in current_token[2])
            if previous_token is not None:
                previous_tags = previous_token[2]
                # only the first applicable modifier is used: inc, dec, inv
                if 'inc' in previous_tags:
                    token_score *= 2.0
                elif 'dec' in previous_tags:
                    token_score /= 2.0
                elif 'inv' in previous_tags:
                    token_score *= -1.0
            acum_score = acum_score + token_score
            previous_token = current_token
        return acum_score

    def sentiment_score(self, review):
        """Return the sum of the sentence scores of *review*.

        :param review: list of sentences, each a list of tagged triples.
        """
        return sum(self.sentence_score(sentence, None, 0.0) for sentence in review)

    def main(self, sentence):
        """Classify *sentence* (raw text) and return its polarity label.

        The text is split, POS-tagged and tagged with the five YAML
        dictionaries found under ``ASSETS + "dicts/"``. Progress and the
        numeric score are printed to stdout.

        :return: ``"problem"`` for a negative score, ``"partialSolution"``
            for a positive score, ``"neutre"`` for a score of zero.
        """
        splitter = Splitter()
        postagger = POSTagger()
        pos = ASSETS + "dicts/positive.yml"
        neg = ASSETS + "dicts/negative.yml"
        inc = ASSETS + "dicts/inc.yml"
        dec = ASSETS + "dicts/dec.yml"
        inv = ASSETS + "dicts/inv.yml"
        dicttagger = DictionaryTagger([pos, neg, inc, dec, inv])

        splitted_sentences = splitter.split(sentence)
        pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
        dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)

        print("Classification...")

        result = self.sentiment_score(dict_tagged_sentences)
        print(result)
        if result < 0:
            polarity = "problem"
        elif result > 0:
            polarity = "partialSolution"
        else:
            polarity = "neutre"
        return polarity

if __name__ == '__main__':
    # Manual smoke run: classify one sample sentence and print its polarity.
    sample = "this/these can be annoying"
    classifier = ClassifyWithIncr_it()
    print(classifier.main(sample))