import os
os.system('pip install nltk')
os.system('pip install scikit-learn')
os.system('pip install seaborn')
os.system('pip install wget')

# fetch the labelled comments dataset once; Streamlit re-runs this script on
# every interaction, so skip the download if the file is already present
if not os.path.exists('toxicity.csv'):
    import wget
    wget.download('https://raw.githubusercontent.com/surge-ai/copilot-toxicity/main/toxicity.csv')

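# launch this app locally with: streamlit run <path to this script>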
import streamlit as st
import pandas as pd
import numpy as np
import pickle
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
# preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# modeling
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report

# NLTK corpora needed for tokenization, stopword removal, and lemmatization
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

st.title("Toxic Comment Detection App")
st.write('\n\n')


def clean_text(text):
    # strip punctuation, drop English stopwords, and lemmatize the remaining tokens
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_output = [lemmatizer.lemmatize(word) for word in stopwords_removed]

    return ' '.join(lemmatized_output)
    
    
def classifier_evaluation(y_pred, y_test):
    # plot a confusion matrix and print a classification report inside the app
    fig, ax = plt.subplots()
    cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    sns.heatmap(cm, annot=True, cmap='Blues', ax=ax)
    st.write("Confusion Matrix:")
    st.write(fig)
    st.text('Model Report:\n ' + classification_report(y_test, y_pred))


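# load the downloaded dataset; the code below expects 'text' and 'is_toxic' columns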
df = pd.read_csv('toxicity.csv')

def clean_text_2(text):
    # make text lowercase
    text = text.lower()
    # remove text within parentheses
    text = re.sub(r'\(.*?\)', '', text)
    # remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # turn newlines into spaces
    text = re.sub(r'\n', ' ', text)
    # remove quotes
    text = re.sub(r'"+', '', text)
    # strip punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    return text

df['clean_text'] = df['text'].apply(clean_text_2)

text_df = df[['clean_text', 'is_toxic']].copy()

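# encode the labels as integers: 'Toxic' -> 1, 'Not Toxic' -> 0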
text_df['is_toxic'] = text_df['is_toxic'].replace('Toxic', 1)
text_df['is_toxic'] = text_df['is_toxic'].replace('Not Toxic', 0)

data = text_df['clean_text']
target = text_df['is_toxic']

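# tokenize each cleaned comment and drop English stopwords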
stop_words = set(stopwords.words('english'))
def process_text(text):
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    return stopwords_removed
    
# applying the above function to our data/features 
processed_data = list(map(process_text, data))

# creating a list with all lemmatized outputs
lemmatizer = WordNetLemmatizer() 
lemmatized_output = []

for listy in processed_data:
    lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
    lemmatized_output.append(lemmed)

X_lem = lemmatized_output
y_lem = target

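# hold out 20% of the lemmatized comments for evaluation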
X_train, X_test, y_train, y_test = train_test_split(X_lem, y_lem, test_size=0.20, random_state=15)

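# TF-IDF features over unigrams and bigrams, excluding stopwords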
tfidf = TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 2))

tfidf_data_train = tfidf.fit_transform(X_train)
tfidf_data_test = tfidf.transform(X_test)

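# fit a Multinomial Naive Bayes model and report its performance when the user asks for it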
if st.checkbox('Evaluate The Binary Classification Model (Toxic, Non-Toxic)'):
    bayes = MultinomialNB(alpha=0.01)
    bayes.fit(tfidf_data_train, y_train)
    bayes_test_preds = bayes.predict(tfidf_data_test)
    classifier_evaluation(bayes_test_preds, y_test)

    st.write("""##### Try it out yourself!""")
    binary_text = st.text_area("Classify Using The Binary Model:", "Enter Text")
    binary_text = clean_text(binary_text)
    
    if st.checkbox('Apply Binary Model'):
        binary_model = Pipeline([('vectorizer', tfidf), ('classifier', bayes)])

        result = binary_model.predict([binary_text])

        if int(result[0]) == 1:
            result_text = "Toxic"
        else:
            result_text = "Not Toxic"

        st.write(" ##### Result: ", result_text)