|
"""# MODEL BUILDING""" |
|
|
|
import numpy as np |
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
|
|
import nltk |
|
from nltk.corpus import stopwords |
|
nltk.download('stopwords') |
|
nltk.download('punkt') |
|
|
|
def remove_stopword(text):
    """Return ``text`` with English stopwords removed, keeping the word 'not'.

    The text is tokenized with ``nltk.word_tokenize`` and every token found in
    NLTK's English stopword list is dropped. The comparison is case-sensitive:
    tokens are matched exactly as produced by the tokenizer, without
    lowercasing.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens joined with single spaces.
    """
    # A set gives O(1) membership tests; a list would be scanned once per token.
    stopword = set(nltk.corpus.stopwords.words('english'))

    # 'not' is deliberately kept in the text (presumably because negation
    # matters downstream -- confirm). discard() does not raise if it is absent.
    stopword.discard('not')

    kept = [w for w in nltk.word_tokenize(text) if w not in stopword]
    return ' '.join(kept)
|
|
|
|
|
# Read the training table from 'train-cleaned.csv'; the path is relative, so it
# is resolved against the current working directory.
data = pd.read_csv('train-cleaned.csv')

# Notebook-style bare expression: shows the frame as cell output and has no
# effect when this file is run as a plain script.
data
|
|
|
import nltk |
|
def punc_clean(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``; all
    other characters (letters, digits, whitespace, non-ASCII symbols) are kept
    in their original order.

    Args:
        text: The string to clean.

    Returns:
        A new string containing ``text`` without punctuation characters.
    """
    import string as st

    # str.translate deletes all listed characters in a single pass instead of
    # testing the characters one at a time in a Python-level loop.
    return text.translate(str.maketrans('', '', st.punctuation))
|
# Strip punctuation from every 'Extracted text' entry and store the result in
# the column whose name is the empty string.
# NOTE(review): nothing visible in this file reads the '' column afterwards --
# the vectorizer below is fitted on the raw 'Extracted text' column. Confirm
# whether the cleaned text was meant to be used instead.
data[''] = data['Extracted text'].apply(punc_clean)
|
|
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
|
# TF-IDF features over unigrams and bigrams; min_df=1 keeps every term that
# occurs in at least one document.
vectr = TfidfVectorizer(ngram_range=(1, 2), min_df=1)

# Learn the vocabulary/IDF weights and build the document-term matrix in one
# pass. This yields the same matrix as fit() followed by transform() on the
# same data, without analysing the corpus twice. `vectr` is left fitted, so it
# can still transform new text later.
vect_X = vectr.fit_transform(data['Extracted text'])
|
|
|
|
|
|
|
from sklearn.svm import SVC |
|
from sklearn.linear_model import LogisticRegression |
|
from sklearn.ensemble import VotingClassifier |
|
|
|
# Base learners. probability=True makes the SVC expose predict_proba, which
# soft voting needs.
svm_classifier = SVC(kernel='linear', probability=True)
logistic_classifier = LogisticRegression()

# Soft voting averages the predicted class probabilities of both estimators.
# With only two estimators, hard (majority) voting ties every time they
# disagree, and such ties are resolved by class-label sort order rather than
# by how confident either model is -- so the smaller label would always win.
model = VotingClassifier(
    estimators=[
        ('svm', svm_classifier),
        ('logistic', logistic_classifier),
    ],
    voting='soft',
)

# fit() returns the fitted ensemble itself, so `clf` and `model` refer to the
# same object.
clf = model.fit(vect_X, data['saliency'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|