# Toxic Comment Detection App (Streamlit)
# Launch with: streamlit run app.py  (filename assumed)
# Dependencies: streamlit, pandas, numpy, matplotlib, seaborn, nltk, scikit-learn
# Dataset: https://raw.githubusercontent.com/surge-ai/copilot-toxicity/main/toxicity.csv
#   (download it as toxicity.csv next to this script before running)

import re
import string

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# modeling
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# one-time NLTK resource downloads (no-ops if already present)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

st.title("Toxic Comment Detection App")
st.write('\n\n')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Strip punctuation, remove stopwords, and lemmatize a single string."""
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    lemmatized_output = [lemmatizer.lemmatize(word) for word in stopwords_removed]
    return ' '.join(lemmatized_output)


def classifier_evaluation(y_pred, y_test):
    """Render a confusion-matrix heatmap and a classification report in the app."""
    fig, ax = plt.subplots()
    cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    sns.heatmap(cm, annot=True, cmap='Blues', ax=ax)
    st.write("Confusion Matrix:")
    st.write(fig)
    st.text('Model Report:\n' + classification_report(y_test, y_pred))


df = pd.read_csv('toxicity.csv')


def clean_text_2(text):
    """Normalize raw comments before tokenizing."""
    # make text lowercase
    text = text.lower()
    # remove text within parentheses
    text = re.sub(r'\(.*?\)', '', text)
    # remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # turn newlines into spaces
    text = re.sub(r'\n', ' ', text)
    # remove quotes
    text = re.sub(r'\"+', '', text)
    # strip punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text


df['clean_text'] = df['text'].apply(clean_text_2)

text_df = df[['clean_text', 'is_toxic']].copy()
text_df['is_toxic'] = text_df['is_toxic'].replace({'Toxic': 1, 'Not Toxic': 0})

data = text_df['clean_text']
target = text_df['is_toxic']


def process_text(text):
    """Tokenize a document and drop stopwords."""
    tokens = nltk.word_tokenize(text)
    return [token.lower() for token in tokens if token.lower() not in stop_words]


# tokenize and filter every document, then lemmatize and rejoin into strings
processed_data = list(map(process_text, data))
lemmatized_output = [' '.join(lemmatizer.lemmatize(w) for w in tokens) for tokens in processed_data]

X_lem = lemmatized_output
y_lem = target

X_train, X_test, y_train, y_test = train_test_split(X_lem, y_lem, test_size=0.20, random_state=15)

# unigram + bigram TF-IDF features
tfidf = TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 2))
tfidf_data_train = tfidf.fit_transform(X_train)
tfidf_data_test = tfidf.transform(X_test)

# fit the model up front so both checkboxes below work independently
bayes = MultinomialNB(alpha=0.01)
bayes.fit(tfidf_data_train, y_train)

if st.checkbox('Evaluate The Binary Classification Model (Toxic, Non-Toxic)'):
    bayes_test_preds = bayes.predict(tfidf_data_test)
    classifier_evaluation(bayes_test_preds, y_test)

st.write("""##### Try it out yourself!""")
binary_text = st.text_area("Classify Using The Binary Model:", "Enter Text")
binary_text = clean_text(binary_text)

if st.checkbox('Apply Binary Model'):
    # reuse the fitted vectorizer and classifier as a single pipeline
    binary_model = Pipeline([('vectorizer', tfidf), ('classifier', bayes)])
    result = binary_model.predict([binary_text])
    result_text = "Toxic" if result[0] == 1 else "Not Toxic"
    st.write("##### Result: ", result_text)