import os
os.system('pip install nltk')
os.system('pip install scikit-learn')
os.system('pip install wget')

# download the Surge AI toxicity dataset once; Streamlit reruns this script on every interaction
import wget
if not os.path.exists('toxicity.csv'):
    wget.download('https://raw.githubusercontent.com/surge-ai/copilot-toxicity/main/toxicity.csv')
import streamlit as st
import pandas as pd
import numpy as np
import pickle
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# modeling
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report

# NLTK resources needed for tokenization, stopword removal, and lemmatization
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
st.title("Toxic Comment Detection App")
st.write('\n\n')
def clean_text(text):
    # remove punctuation, drop stopwords, and lemmatize the remaining tokens
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_output = [lemmatizer.lemmatize(word) for word in stopwords_removed]
    return ' '.join(lemmatized_output)
def classifier_evaluation(y_pred, y_test):
    # display a confusion-matrix heatmap and a classification report in the app
    fig, ax = plt.subplots()
    cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    sns.heatmap(cm, annot=True, cmap='Blues', ax=ax)
    st.write("Confusion Matrix:")
    st.write(fig)
    st.text('Model Report:\n ' + classification_report(y_test, y_pred))
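# load the Surge AI toxicity dataset downloaded above (a 'text' column and an 'is_toxic' label)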
df = pd.read_csv('toxicity.csv')
def clean_text_2(text):
    # make text lowercase
    text = text.lower()
    # remove text within parentheses
    text = re.sub(r'\(.*?\)', '', text)
    # remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # replace any remaining newlines with a space
    text = re.sub(r'\n', ' ', text)
    # remove quotes
    text = re.sub(r'\"+', '', text)
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text
df['clean_text'] = df['text'].apply(clean_text_2)
text_df = df[['clean_text', 'is_toxic']].copy()
text_df['is_toxic'] = text_df['is_toxic'].replace('Toxic', 1)
text_df['is_toxic'] = text_df['is_toxic'].replace('Not Toxic', 0)
data = text_df['clean_text']
target = text_df['is_toxic']
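# tokenize the cleaned comments, remove stopwords, and lemmatize before vectorizing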
stop_words = set(stopwords.words('english'))
def process_text(text):
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    return stopwords_removed
# applying the above function to our data/features
processed_data = list(map(process_text, data))
# creating a list with all lemmatized outputs
lemmatizer = WordNetLemmatizer()
lemmatized_output = []
for listy in processed_data:
    lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
    lemmatized_output.append(lemmed)
X_lem = lemmatized_output
y_lem = target
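# hold out 20% of the lemmatized comments for evaluation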
X_train, X_test, y_train, y_test = train_test_split(X_lem, y_lem, test_size=0.20, random_state=15)
tfidf = TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 2))
tfidf_data_train = tfidf.fit_transform(X_train)
tfidf_data_test = tfidf.transform(X_test)
# fit the binary Naive Bayes model on the TF-IDF features so it is available
# both for the evaluation checkbox and for classifying user-entered text below
bayes = MultinomialNB(alpha=.01)
bayes.fit(tfidf_data_train, y_train)

if st.checkbox('Evaluate The Binary Classification Model (Toxic, Non-Toxic)'):
    bayes_test_preds = bayes.predict(tfidf_data_test)
    classifier_evaluation(bayes_test_preds, y_test)
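# let the user classify their own text with the fitted model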
st.write("""##### Try it out yourself!""")
binary_text = st.text_area("Classify Using The Binary Model:", "Enter Text")
binary_text = clean_text(binary_text)
if st.checkbox('Apply Binary Model'):
    binary_model = Pipeline([('vectorizer', tfidf), ('classifier', bayes)])
    result = binary_model.predict([binary_text])
    if int(result[0]) == 1:
        result_text = "Toxic"
    else:
        result_text = "Not Toxic"
    st.write(" ##### Result: ", result_text)