import os
os.system('pip install nltk')
os.system('pip install scikit-learn')
os.system('pip install seaborn')
os.system('pip install wget')

# fetch the labelled comments dataset once; Streamlit re-runs this script on
# every interaction, so skip the download if the file is already present
if not os.path.exists('toxicity.csv'):
    import wget
    wget.download('https://raw.githubusercontent.com/surge-ai/copilot-toxicity/main/toxicity.csv')

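# launch this app locally with: streamlit run <path to this script>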
import streamlit as st
import pandas as pd
import numpy as np
import pickle
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
# preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# modeling
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report

# NLTK corpora needed for tokenization, stopword removal, and lemmatization
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

st.title("Toxic Comment Detection App")
st.write('\n\n')


def clean_text(text):
    # strip punctuation, drop English stopwords, and lemmatize the remaining tokens
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_output = [lemmatizer.lemmatize(word) for word in stopwords_removed]

    return ' '.join(lemmatized_output)
    
    
def classifier_evaluation(y_pred, y_test):
    # plot a confusion matrix and print a classification report inside the app
    fig, ax = plt.subplots()
    cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    sns.heatmap(cm, annot=True, cmap='Blues', ax=ax)
    st.write("Confusion Matrix:")
    st.write(fig)
    st.text('Model Report:\n ' + classification_report(y_test, y_pred))


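# load the downloaded dataset; the code below expects 'text' and 'is_toxic' columns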
df = pd.read_csv('toxicity.csv')

def clean_text_2(text):
    # make text lowercase
    text = text.lower()
    # remove text within parentheses
    text = re.sub(r'\(.*?\)', '', text)
    # remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # turn newlines into spaces
    text = re.sub(r'\n', ' ', text)
    # remove quotes
    text = re.sub(r'"+', '', text)
    # strip punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    return text

df['clean_text'] = df['text'].apply(clean_text_2)

text_df = df[['clean_text', 'is_toxic']].copy()

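# encode the labels as integers: 'Toxic' -> 1, 'Not Toxic' -> 0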
text_df['is_toxic'] = text_df['is_toxic'].replace('Toxic', 1)
text_df['is_toxic'] = text_df['is_toxic'].replace('Not Toxic', 0)

data = text_df['clean_text']
target = text_df['is_toxic']

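# tokenize each cleaned comment and drop English stopwords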
stop_words = set(stopwords.words('english'))
def process_text(text):
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    return stopwords_removed
    
# applying the above function to our data/features 
processed_data = list(map(process_text, data))

# creating a list with all lemmatized outputs
lemmatizer = WordNetLemmatizer() 
lemmatized_output = []

for listy in processed_data:
    lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
    lemmatized_output.append(lemmed)

X_lem = lemmatized_output
y_lem = target

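# hold out 20% of the lemmatized comments for evaluation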
X_train, X_test, y_train, y_test = train_test_split(X_lem, y_lem, test_size=0.20, random_state=15)

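# TF-IDF features over unigrams and bigrams, excluding stopwords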
tfidf = TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 2))

tfidf_data_train = tfidf.fit_transform(X_train)
tfidf_data_test = tfidf.transform(X_test)

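# fit a Multinomial Naive Bayes model and report its performance when the user asks for it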
if st.checkbox('Evaluate The Binary Classification Model (Toxic, Non-Toxic)'):
    bayes = MultinomialNB(alpha=0.01)
    bayes.fit(tfidf_data_train, y_train)
    bayes_test_preds = bayes.predict(tfidf_data_test)
    classifier_evaluation(bayes_test_preds, y_test)

    st.write("""##### Try it out yourself!""")
    binary_text = st.text_area("Classify Using The Binary Model:", "Enter Text")
    binary_text = clean_text(binary_text)
    
    if st.checkbox('Apply Binary Model'):
        binary_model = Pipeline([('vectorizer', tfidf), ('classifier', bayes)])

        result = binary_model.predict([binary_text])

        if int(result[0]) == 1:
            result_text = "Toxic"
        else:
            result_text = "Not Toxic"

        st.write(" ##### Result: ", result_text)