import os

# install dependencies at runtime (the PyPI name for sklearn is "scikit-learn")
os.system('pip install nltk')
os.system('pip install scikit-learn')
os.system('pip install wget')

# download the Surge AI toxicity dataset once (Streamlit reruns this script on every interaction)
import wget
if not os.path.exists('toxicity.csv'):
    wget.download('https://raw.githubusercontent.com/surge-ai/copilot-toxicity/main/toxicity.csv')
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# modeling
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# corpora required by word_tokenize, the stopword list and the lemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
st.title("Toxic Comment Detection App")
st.write('\n\n')
def clean_text(text):
    # strip punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # tokenize and drop English stopwords
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    # lemmatize the remaining tokens and rebuild a single string
    lemmatizer = WordNetLemmatizer()
    lemmatized_output = [lemmatizer.lemmatize(word) for word in stopwords_removed]
    return ' '.join(lemmatized_output)
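# illustrative example (not from the original app): clean_text('The dogs are barking!') -> 'dog barking'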
def classifier_evaluation(y_pred, y_test):
    # crosstab with predictions as rows and true labels as columns
    fig, ax = plt.subplots()
    cm = pd.crosstab(y_pred, y_test, rownames=['Predicted'], colnames=['Actual'])
    sns.heatmap(cm, annot=True, cmap='Blues', ax=ax)
    st.write("Confusion Matrix:")
    st.pyplot(fig)
    st.text('Model Report:\n ' + classification_report(y_test, y_pred))
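# toxicity.csv holds the raw comments in a 'text' column and an 'is_toxic' label ('Toxic' / 'Not Toxic')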
df = pd.read_csv('toxicity.csv')
def clean_text_2(text):
    # make text lowercase
    text = text.lower()
    # remove text within parentheses
    text = re.sub(r'\(.*?\)', '', text)
    # remove words that contain numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # collapse runs of whitespace (including newlines) into a single space
    text = re.sub(r'\s+', ' ', text)
    # remove any quotes
    text = re.sub(r'\"+', '', text)
    # strip punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text
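# illustrative example (not from the original app): clean_text_2('He said (quietly) "I have 2 dogs!"') -> 'he said i have dogs'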
df['clean_text'] = df['text'].apply(clean_text_2)
text_df = df[['clean_text', 'is_toxic']].copy()
text_df['is_toxic'] = text_df['is_toxic'].replace({'Toxic': 1, 'Not Toxic': 0})
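# features: the cleaned comment text; target: binary toxicity labels (1 = toxic, 0 = not toxic)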
data = text_df['clean_text']
target = text_df['is_toxic']
stop_words = set(stopwords.words('english'))
def process_text(text):
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
    return stopwords_removed
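# illustrative example (not from the original app): process_text('the cats are running') -> ['cats', 'running']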
# applying the above function to our data/features
processed_data = list(map(process_text, data))
# creating a list with all lemmatized outputs
lemmatizer = WordNetLemmatizer()
lemmatized_output = []
for listy in processed_data:
    lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
    lemmatized_output.append(lemmed)
X_lem = lemmatized_output
y_lem = target
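# hold out 20% of the data for evaluation; the fixed random_state keeps the split reproducible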
X_train, X_test, y_train, y_test = train_test_split(X_lem, y_lem, test_size=0.20, random_state=15)
tfidf = TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 2))
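# fit the vectorizer on the training split only, so nothing from the test split leaks into the features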
tfidf_data_train = tfidf.fit_transform(X_train)
tfidf_data_test = tfidf.transform(X_test)
# train the Naive Bayes classifier up front so the "Apply Binary Model" section
# below can use it even when the evaluation checkbox is left unchecked
bayes = MultinomialNB(alpha=.01)
bayes.fit(tfidf_data_train, y_train)

if st.checkbox('Evaluate The Binary Classification Model (Toxic, Non-Toxic)'):
    bayes_test_preds = bayes.predict(tfidf_data_test)
    classifier_evaluation(bayes_test_preds, y_test)
st.write("""##### Try it out yourself!""")
binary_text = st.text_area("Classify Using The Binary Model:", "Enter Text")
binary_text = clean_text(binary_text)
if st.checkbox('Apply Binary Model'):
binary_model = Pipeline([('vectorizer', tfidf), ('classifier', bayes)])
result = binary_model.predict([binary_text])
if result.astype(int) == 1:
result_text = "Toxic"
else:
result_text = "Not Toxic"
st.write(" ##### Result: ", result_text)