# File size: 3,361 Bytes
# efb9e6f
# -*- coding: utf-8 -*-
"""First_Text_Classification.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1sdLss09e3OxYVoeK3oBA6qrUSj_iOxp-
<h3 align = "center">Importing Libraries</h3>
"""
import numpy as np
import pandas as pd
"""<h3 align = "center">Importing Dataset</h3>"""
# Read the SMS spam dataset; ISO-8859-1 handles the non-UTF-8 bytes present
# in the raw file.
data = pd.read_csv("spam.csv", encoding="ISO-8859-1")

"""<h3 align = "center">Preliminary Data Checks</h3>"""
# NOTE: bare expressions such as `data.head()` only render inside a notebook;
# wrap them in print() so this exported script actually shows the output.
print(data.head())
print(data.isnull().sum())
print(data.shape)
print(data['v1'].value_counts())
data.info()  # info() writes to stdout itself, no print() needed

"""<h3 align = "center">Putting the Length of Characters of each row in a column.</h3>"""
# Character count of each message, stored in the leftover "Unnamed: 2"
# column (name kept for compatibility with the original notebook).
data["Unnamed: 2"] = data["v2"].str.len()

"""<h3 align = "center">Visualising Length of Characters for each category!</h3>"""
"""<h5>It is evident from the above plot that spam texts are usually longer in length!</h5>
<h3 align = "center">Defining Variables</h3>
"""
# X: raw message text; y: label column (value_counts above shows its classes).
X = data["v2"]
y = data["v1"]
"""<h3 align = "center">Train Test Split</h3>"""
from sklearn.model_selection import train_test_split

# Hold out 33% of the messages for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

"""<h3 align = "center">Vectorizing Words into Matrix</h3>"""
# TfidfVectorizer is equivalent to CountVectorizer followed by
# TfidfTransformer, so the original's separate count/transform pass (whose
# result was immediately discarded and recomputed) is dropped here.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
print(X_train.shape)
print(X_train_tfidf.shape)
"""<h3 align = "center">Creating Model</h3>"""
from sklearn.svm import LinearSVC

# Standalone classifier fit on the pre-computed TF-IDF matrix.
# NOTE: this model is superseded by the pipeline below, which refits the
# same vectorizer + classifier combination end to end on the raw text.
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

"""<h3 align = "center">Creating Pipeline</h3>"""
from sklearn.pipeline import Pipeline

# The pipeline accepts raw strings directly at predict time, which is what
# the UI callback below needs.
text_clf = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC())])
text_clf.fit(X_train, y_train)
predictions = text_clf.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

"""<h3 align = "center">Accuracy Score</h3>"""
print(accuracy_score(y_test, predictions))

"""<h3 align = "center">Predictions </h3>"""
# Bare .predict() expressions display nothing when run as a script; print them.
print(text_clf.predict(["Hi how are you doing today?"]))
print(text_clf.predict(["Congratulations! You are selected for a free vouchar worth $500"]))
"""<h3 align = "center">Creating User Interface!</h3>"""
import gradio as gr
def first_nlp_spam_detector(text):
    """Classify a single message with the trained pipeline.

    Parameters
    ----------
    text : str
        Raw message text to classify.

    Returns
    -------
    str
        A human-readable verdict: legitimate ("ham") or spam.
    """
    # predict() expects an iterable of documents, so wrap the single text
    # (the original built a throwaway list named `list`, shadowing the builtin).
    label = text_clf.predict([text])[0]
    if label == 'ham':
        return "Your Text is a Legitimate One!"
    return "Beware of such text messages, It's a Spam! "
# Build the Gradio UI around the classifier callback.
interface = gr.Interface(
    first_nlp_spam_detector,
    inputs=gr.Textbox(lines=2, placeholder="Enter your Text Here.....!", show_label=False),
    outputs=gr.Label(value="Predicting the Text Classification..!"),
    description="Predicting Text Legitimacy!",
)

# Sanity-check the callback once before serving the UI (the original
# discarded this result; print it so the script shows the verdict).
print(first_nlp_spam_detector("Congratulations! You are selected for a free vouchar worth $500"))

interface.launch()