# -*- coding: utf-8 -*-
"""Kaltstart.195

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1W1WPsxSyG7efWOHMRIcMxuKWq0GDFdtG
"""

# Notebook-only shell magic, kept as a comment so this file parses as plain
# Python. Outside a notebook, install the dependency from a shell instead:
#     pip install nltk
# !pip install nltk

import string

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# Fetch the NLTK stop-word lists required by the preprocessing step.
nltk.download('stopwords')

# Read the e-mail dataset from the current working directory.
df = pd.read_csv("spam_ham_dataset.csv")

# Replace CRLF line breaks with a space so each message is one line of text.
df['text'] = df['text'].map(lambda body: body.replace('\r\n', ' '))

# Notebook cell outputs: preview the frame, then summarise its columns.
df

df.info()

# Build the cleaned training corpus: one normalised string per e-mail in `df`.
stemmer = PorterStemmer()
corpus = []

stopwords_set = set(stopwords.words('english'))

# The punctuation-stripping table does not change between messages, so it is
# built once rather than on every iteration.
punctuation_table = str.maketrans('', '', string.punctuation)

# Iterate over every message in the column. Indexing a fixed row here would
# fill the corpus with copies of one e-mail and make the features useless.
for raw_text in df['text']:
  # Lower-case, strip punctuation and split on whitespace.
  tokens = raw_text.lower().translate(punctuation_table).split()
  # Drop English stop words, then reduce each remaining word to its stem.
  stems = [stemmer.stem(word) for word in tokens if word not in stopwords_set]
  text = ' '.join(stems)
  corpus.append(text)

# Turn the cleaned corpus into a dense document-term count matrix.
vectorizer = CountVectorizer()
term_counts = vectorizer.fit_transform(corpus)
X = term_counts.toarray()

# Target vector taken from the `label_num` column.
y = df['label_num']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit a random forest using every available core (fit() returns the estimator).
clf = RandomForestClassifier(n_jobs=-1).fit(X_train, y_train)

# Notebook cell output: mean accuracy on the held-out split.
clf.score(X_test, y_test)

# Classify one e-mail from the dataset and compare against its stored label.

# Row position of the sample e-mail; used both to fetch its text and to look
# up its label below, so the two can never drift apart.
email_index = 16
email_to_classify = df.text.values[email_index]

# Apply the same cleaning as the corpus loop: lower-case, strip punctuation,
# split on whitespace, drop stop words and stem.
email_text = email_to_classify.lower().translate(str.maketrans('', '', string.punctuation)).split()
# Stem this e-mail's own tokens. Iterating over any other variable here would
# discard the tokenised message and classify unrelated content.
email_text = [stemmer.stem(word) for word in email_text if word not in stopwords_set]
email_text = ' '.join(email_text)

email_corpus = [email_text]

# transform() rather than fit_transform(): reuse the vocabulary the vectorizer
# was fitted with so the columns line up with what the classifier was trained on.
X_email = vectorizer.transform(email_corpus)

# Notebook cell output: predicted label for the sample e-mail.
clf.predict(X_email)

# Notebook cell output: label stored in the dataset for the same row.
df.label_num.iloc[email_index]