import os

# install dependencies at runtime (the PyPI name for sklearn is "scikit-learn")
os.system('pip install nltk')
os.system('pip install scikit-learn')
os.system('pip install wget')

# download the Surge AI toxicity dataset once (Streamlit reruns this script on every interaction)
import wget
if not os.path.exists('toxicity.csv'):
    wget.download('https://raw.githubusercontent.com/surge-ai/copilot-toxicity/main/toxicity.csv')
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# modeling
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# corpora required by word_tokenize, the stopword list and the lemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
st.title("Toxic Comment Detection App")
st.write('\n\n')
def clean_text(text):
    # strip punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # tokenize and drop English stopwords
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    # lemmatize the remaining tokens and rebuild a single string
    lemmatizer = WordNetLemmatizer()
    lemmatized_output = [lemmatizer.lemmatize(word) for word in stopwords_removed]
    return ' '.join(lemmatized_output)
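# illustrative example (not from the original app): clean_text('The dogs are barking!') -> 'dog barking'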
def classifier_evaluation(y_pred, y_test):
    # crosstab with predictions as rows and true labels as columns
    fig, ax = plt.subplots()
    cm = pd.crosstab(y_pred, y_test, rownames=['Predicted'], colnames=['Actual'])
    sns.heatmap(cm, annot=True, cmap='Blues', ax=ax)
    st.write("Confusion Matrix:")
    st.pyplot(fig)
    st.text('Model Report:\n ' + classification_report(y_test, y_pred))
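# toxicity.csv holds the raw comments in a 'text' column and an 'is_toxic' label ('Toxic' / 'Not Toxic')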
df = pd.read_csv('toxicity.csv')
def clean_text_2(text):
    # make text lowercase
    text = text.lower()
    # remove text within parentheses
    text = re.sub(r'\(.*?\)', '', text)
    # remove words that contain numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # collapse runs of whitespace (including newlines) into a single space
    text = re.sub(r'\s+', ' ', text)
    # remove any quotes
    text = re.sub(r'\"+', '', text)
    # strip punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text
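# illustrative example (not from the original app): clean_text_2('He said (quietly) "I have 2 dogs!"') -> 'he said i have dogs'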
df['clean_text'] = df['text'].apply(clean_text_2)
text_df = df[['clean_text', 'is_toxic']].copy()
text_df['is_toxic'] = text_df['is_toxic'].replace({'Toxic': 1, 'Not Toxic': 0})
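# features: the cleaned comment text; target: binary toxicity labels (1 = toxic, 0 = not toxic)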
data = text_df['clean_text']
target = text_df['is_toxic']
stop_words = set(stopwords.words('english'))
def process_text(text):
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    return stopwords_removed
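# illustrative example (not from the original app): process_text('the cats are running') -> ['cats', 'running']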
# applying the above function to our data/features
processed_data = list(map(process_text, data))
# creating a list with all lemmatized outputs
lemmatizer = WordNetLemmatizer()
lemmatized_output = []
for listy in processed_data:
    lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
    lemmatized_output.append(lemmed)
X_lem = lemmatized_output
y_lem = target
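# hold out 20% of the data for evaluation; the fixed random_state keeps the split reproducible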
X_train, X_test, y_train, y_test = train_test_split(X_lem, y_lem, test_size=0.20, random_state=15)
tfidf = TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 2))
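# fit the vectorizer on the training split only, so nothing from the test split leaks into the features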
tfidf_data_train = tfidf.fit_transform(X_train)
tfidf_data_test = tfidf.transform(X_test)
# train the Naive Bayes classifier up front so the "Apply Binary Model" section
# below can use it even when the evaluation checkbox is left unchecked
bayes = MultinomialNB(alpha=.01)
bayes.fit(tfidf_data_train, y_train)

if st.checkbox('Evaluate The Binary Classification Model (Toxic, Non-Toxic)'):
    bayes_test_preds = bayes.predict(tfidf_data_test)
    classifier_evaluation(bayes_test_preds, y_test)
st.write("""##### Try it out yourself!""")
binary_text = st.text_area("Classify Using The Binary Model:", "Enter Text")
binary_text = clean_text(binary_text)
if st.checkbox('Apply Binary Model'):
binary_model = Pipeline([('vectorizer', tfidf), ('classifier', bayes)])
result = binary_model.predict([binary_text])
if result.astype(int) == 1:
result_text = "Toxic"
else:
result_text = "Not Toxic"
st.write(" ##### Result: ", result_text)