import os

# install dependencies and fetch the Surge AI toxicity dataset
os.system('pip install nltk')
os.system('pip install scikit-learn')
os.system('pip install seaborn')
os.system('pip install wget')
os.system("wget 'https://raw.githubusercontent.com/surge-ai/copilot-toxicity/main/toxicity.csv'")
import streamlit as st
import pandas as pd
import numpy as np
import pickle
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# modeling
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report

# NLTK data needed by the tokenizer, stopword list, and lemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
st.title("Toxic Comment Detection App ") | |
st.write('\n\n') | |
def clean_text(text):
    # strip punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # tokenize, lowercase, and drop stopwords
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    # lemmatize each remaining token and rejoin into a single string
    lemmatizer = WordNetLemmatizer()
    lemmatized_output = [lemmatizer.lemmatize(word) for word in stopwords_removed]
    return ' '.join(lemmatized_output)
def classifier_evaluation(y_pred, y_test):
    fig, ax = plt.subplots()
    # rows are predictions, columns are actual labels
    confusion = pd.crosstab(y_pred, y_test, rownames=['Predicted'], colnames=['Actual'])
    sns.heatmap(confusion, annot=True, cmap='Blues', ax=ax)
    st.write("Confusion Matrix:")
    st.write(fig)
    st.text('Model Report:\n ' + classification_report(y_test, y_pred))
df = pd.read_csv('toxicity.csv')

def clean_text_2(text):
    # make text lowercase
    text = text.lower()
    # remove text within parentheses
    text = re.sub(r'\(.*?\)', '', text)
    # remove words that contain numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # collapse repeated whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # replace newlines with a space
    text = re.sub(r'\n', ' ', text)
    # remove quotes
    text = re.sub(r'"+', '', text)
    # strip punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text
df['clean_text'] = df['text'].apply(clean_text_2)

# encode the binary target: 1 = Toxic, 0 = Not Toxic
text_df = df[['clean_text', 'is_toxic']].copy()
text_df['is_toxic'] = text_df['is_toxic'].replace('Toxic', 1)
text_df['is_toxic'] = text_df['is_toxic'].replace('Not Toxic', 0)

data = text_df['clean_text']
target = text_df['is_toxic']
stop_words = set(stopwords.words('english'))

def process_text(text):
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    return stopwords_removed

# applying the above function to our data/features
processed_data = list(map(process_text, data))

# creating a list with all lemmatized outputs
lemmatizer = WordNetLemmatizer()
lemmatized_output = []
for listy in processed_data:
    lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
    lemmatized_output.append(lemmed)
X_lem = lemmatized_output
y_lem = target

X_train, X_test, y_train, y_test = train_test_split(X_lem, y_lem, test_size=0.20, random_state=15)

# TF-IDF over unigrams and bigrams (stop words passed as a list, as sklearn documents)
tfidf = TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 2))
tfidf_data_train = tfidf.fit_transform(X_train)
tfidf_data_test = tfidf.transform(X_test)
# fit the Naive Bayes model up front so both the evaluation and the live demo can use it
bayes = MultinomialNB(alpha=.01)
bayes.fit(tfidf_data_train, y_train)

if st.checkbox('Evaluate The Binary Classification Model (Toxic, Non-Toxic)'):
    bayes_test_preds = bayes.predict(tfidf_data_test)
    classifier_evaluation(bayes_test_preds, y_test)

st.write("""##### Try it out yourself!""")
binary_text = st.text_area("Classify Using The Binary Model:", "Enter Text")
binary_text = clean_text(binary_text)

if st.checkbox('Apply Binary Model'):
    # chain the fitted vectorizer and classifier so raw text can be classified directly
    binary_model = Pipeline([('vectorizer', tfidf), ('classifier', bayes)])
    result = binary_model.predict([binary_text])
    if int(result[0]) == 1:
        result_text = "Toxic"
    else:
        result_text = "Not Toxic"
    st.write(" ##### Result: ", result_text)