PatentSolver / App /bin /SentenceClassifier.py
xin's picture
initial commit
22738ca
raw
history blame
1.9 kB
# -*- coding: utf-8 -*-
import nltk
from App.bin import constants
class SentenceClassifier(object):
    """Label a sentence with a Naive Bayes model trained from asset files.

    Training data is read from two files located under ``constants.ASSETS``:
    ``trainingsNegative`` and ``trainingsPositive``. Every non-blank line is
    expected to have the form ``<text>:<label>``. The value returned by
    :meth:`classifySentence` is one of the label strings found in those files.
    """

    # Tokens shorter than this many characters are ignored as features.
    _MIN_TOKEN_LENGTH = 3

    def __init__(self, sentence):
        """Remember the sentence to classify.

        Args:
            sentence: Raw text that :meth:`classifySentence` will label.
        """
        self.sentence = sentence
        print("Classification....")

    @staticmethod
    def _parse_labelled_line(line):
        """Split one training line into a ``(text, label)`` pair.

        The label is whatever follows the LAST colon, so text that itself
        contains colons (e.g. ``"ratio 2:1 ...:label"``) is parsed correctly
        instead of yielding a tuple with more than two elements.

        Args:
            line: One raw line read from a training file.

        Returns:
            ``(text, label)``, or ``None`` when the line is blank.

        Raises:
            ValueError: If a non-blank line contains no ``:`` separator.
        """
        stripped = line.strip()
        if not stripped:
            return None
        text, separator, label = stripped.rpartition(':')
        if not separator:
            raise ValueError(
                "Training line has no ':' label separator: %r" % stripped)
        return text, label

    @classmethod
    def _load_labelled(cls, file_name):
        """Read every ``(text, label)`` pair from a file under the assets dir.

        Args:
            file_name: Name appended to ``constants.ASSETS`` to build the path.

        Returns:
            List of ``(text, label)`` tuples; blank lines are skipped.
        """
        # NOTE(review): the file is opened with the platform default encoding,
        # as before -- confirm the encoding of the training assets.
        with open(constants.ASSETS + file_name) as handle:
            parsed = (cls._parse_labelled_line(line) for line in handle)
            return [pair for pair in parsed if pair is not None]

    @classmethod
    def _tokenize(cls, text):
        """Tokenize ``text`` with NLTK, lowercase it and drop short tokens."""
        return [
            token.lower()
            for token in nltk.word_tokenize(text)
            if len(token) >= cls._MIN_TOKEN_LENGTH
        ]

    def classifySentence(self):
        """Train the classifier and return the label predicted for the sentence.

        Returns:
            The label assigned to ``self.sentence`` by the trained classifier.

        Raises:
            ValueError: If a training line has no ``:`` separator.
        """
        # NOTE(review): the training files are re-read and the model is
        # re-trained on every call; consider caching if this becomes a hot path.
        problems = self._load_labelled("trainingsNegative")
        solutions = self._load_labelled("trainingsPositive")

        labelled = [
            (self._tokenize(text), label)
            for text, label in solutions + problems
        ]

        # Every distinct token seen in training becomes one boolean feature.
        vocabulary = nltk.FreqDist(
            token for tokens, _ in labelled for token in tokens
        ).keys()

        def extract_features(tokens):
            """Map a token list to ``{'contains(word)': bool}`` features."""
            present = set(tokens)
            return {
                'contains(%s)' % word: (word in present)
                for word in vocabulary
            }

        training_set = nltk.classify.apply_features(extract_features, labelled)
        classifier = nltk.NaiveBayesClassifier.train(training_set)

        # Tokenize the input exactly like the training text; a plain
        # ``str.split()`` keeps case and attached punctuation, so those words
        # would never match any ``contains(...)`` feature.
        return classifier.classify(
            extract_features(self._tokenize(self.sentence)))