azizbarank committed on
Commit ceaa4fc · 1 Parent(s): 5f617ed

Update app.py

Files changed (1)
  1. app.py +59 -113
app.py CHANGED
@@ -3,14 +3,11 @@ os.system('pip install nltk')
 os.system('pip install sklearn')
 os.system('pip install wget')
 
-!wget 'https://raw.githubusercontent.com/surge-ai/copilot-toxicity/main/toxicity.csv'
-
+# importing relevant python packages
 import streamlit as st
 import pandas as pd
 import numpy as np
 import pickle
-import itertools
-import matplotlib.pyplot as plt
 from PIL import Image
 # preprocessing
 import re
@@ -19,114 +16,63 @@ import nltk
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics import confusion_matrix
 # modeling
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.metrics import confusion_matrix
-st.title("Toxic Comment Detection App ")
-st.write('\n\n')
-
-
-def clean_text(text):
-    import re
-    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
-    stop_words = set(stopwords.words('english'))
-    tokens = nltk.word_tokenize(text)
-    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
-    lemmatizer = WordNetLemmatizer()
-    lemmatized_output = []
-    for word in stopwords_removed:
-        lemmatized_output.append(lemmatizer.lemmatize(word))
-
-    return text
-
-
-def classifier_evaluation(y_pred, y_test):
-    fig, ax = plt.subplots()
-    confusion_matrix = pd.crosstab(y_pred, y_test, rownames=['Actual'], colnames=['Predicted'])
-    sns.heatmap(confusion_matrix, annot=True, cmap = 'Blues')
-    st.write("Confusion Matrix:")
-    st.write(fig)
-    st.text('Model Report:\n ' + classification_report(y_pred, y_test))
-
-
-df = pd.read_csv('toxicity.csv')
-
-def clean_text_2(text):
-    # make text lowercase
-    text = text.lower()
-    # removing text within parentheses
-    text = re.sub('\(.*?\)', '', text)
-    # removing numbers
-    text = re.sub('\w*\d\w*', '', text)
-    # if there's more than 1 whitespace, then make it just 1
-    text = re.sub('\s+', ' ', text)
-    # if there's a new line, then make it a whitespace
-    text = re.sub('\n', ' ', text)
-    # removing any quotes
-    text = re.sub('\"+', '', text)
-    # getting rid of punctuations
-    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
-
-    return text
-
-clean = lambda x: clean_text_2(x)
-
-df['clean_text'] = df['text'].apply(clean)
-
-text_df = df[['clean_text', 'is_toxic']].copy()
-
-text_df['is_toxic'] = text_df['is_toxic'].replace('Toxic', 1)
-text_df['is_toxic'] = text_df['is_toxic'].replace('Not Toxic', 0)
-
-data = text_df['clean_text']
-target = text_df['is_toxic']
-
-stop_words = set(stopwords.words('english'))
-def process_text(text):
-    tokens = nltk.word_tokenize(text)
-    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
-    return stopwords_removed
-
-# applying the above function to our data/features
-processed_data = list(map(process_text, data))
-
-# creating a list with all lemmatized outputs
-lemmatizer = WordNetLemmatizer()
-lemmatized_output = []
-
-for listy in processed_data:
-    lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
-    lemmatized_output.append(lemmed)
-
-X_lem = lemmatized_output
-y_lem = target
-
-X_train, X_test, y_train, y_test = train_test_split(X_lem, y_lem, test_size=0.20, random_state=15)
-
-tfidf = TfidfVectorizer(stop_words= stop_words, ngram_range=(1,2))
-
-tfidf_data_train = tfidf.fit_transform(X_train)
-tfidf_data_test = tfidf.transform(X_test)
-
-if st.checkbox('Evaluate The Binary Classification Model (Toxic, Non-Toxic)'):
-    bayes = MultinomialNB(alpha = .01)
-    bayes.fit(tfidf_data_train, y_train)
-    bayes_test_preds = bayes.predict(tfidf_data_test)
-    classifier_evaluation(bayes_test_preds, y_test)
-
-st.write("""##### Try it out yourself!""")
-binary_text = st.text_area("Classify Using The Binary Model:", "Enter Text")
-binary_text = clean_text(binary_text)
-
-if st.checkbox('Apply Binary Model'):
-    binary_model = Pipeline([('vectorizer', tfidf), ('classifier', bayes)])
-
-    result = binary_model.predict([binary_text])
-
-    if result.astype(int) == 1:
-        result_text = "Toxic"
+from sklearn import svm
+# sentiment analysis
+
+
+# creating page sections
+site_header = st.container()
+business_context = st.container()
+data_desc = st.container()
+performance = st.container()
+tweet_input = st.container()
+model_results = st.container()
+sentiment_analysis = st.container()
+contact = st.container()
+
+with site_header:
+    st.title('Toxic Comment Detection')
+
+
+with tweet_input:
+    st.header('Is Your Tweet Considered Hate Speech?')
+    st.write("""*Please note that this prediction is based on how the model was trained, so it may not be an accurate representation.*""")
+    # user input here
+    user_text = st.text_input('Enter Tweet', max_chars=280) # setting input as user_text
+
+with model_results:
+    st.subheader('Prediction:')
+    if user_text:
+        # processing user_text
+        # removing punctuation
+        user_text = re.sub('[%s]' % re.escape(string.punctuation), '', user_text)
+        # tokenizing
+        stop_words = set(stopwords.words('english'))
+        tokens = nltk.word_tokenize(user_text)
+        # removing stop words
+        stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
+        # taking root word
+        lemmatizer = WordNetLemmatizer()
+        lemmatized_output = []
+        for word in stopwords_removed:
+            lemmatized_output.append(lemmatizer.lemmatize(word))
+
+        # instantiating count vectorizor
+        tfidf = TfidfVectorizer(stop_words=stop_words)
+        X_train = pickle.load(open('X_train.pickle', 'rb'))
+        X_test = lemmatized_output
+        X_train_count = tfidf.fit_transform(X_train)
+        X_test_count = tfidf.transform(X_test)
+
+        # loading in model
+        final_model = pickle.load(open('final_bayes.pickle', 'rb'))
+
+        # apply model to make predictions
+        prediction = final_model.predict(X_test_count[0])
+
+        if prediction == 0:
+            st.subheader('**Not Hate Speech**')
         else:
-            result_text = "Not Toxic"
-
-        st.write(" ##### Result: ", result_text)
+            st.subheader('**Hate Speech**')
+        st.text('')
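
For reference, the preprocessing that the new model_results block applies to user_text amounts to the standalone helper below. This is a minimal sketch rather than the committed code: it assumes that import string, import nltk and the NLTK data downloads are handled in the unchanged lines of app.py, and the helper name preprocess is illustrative.

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# one-off downloads of the corpora the preprocessing relies on
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


def preprocess(text):
    # strip punctuation, mirroring the re.sub call in the diff
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # tokenize and drop English stop words
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    kept = [token.lower() for token in tokens if token.lower() not in stop_words]
    # reduce the remaining tokens to their lemmas
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in kept]


print(preprocess('These are some truly hateful words!'))  # ['truly', 'hateful', 'word']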
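
The removed training code fitted MultinomialNB(alpha=.01) on TF-IDF features, which is presumably what final_bayes.pickle now holds. The toy sketch below shows that pattern end to end, with the lemmatized tokens rejoined into a single document before being transformed by the already-fitted vectorizer; all data and names here are illustrative, not the committed code.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# toy training data: 0 = not hate speech, 1 = hate speech
train_texts = ['you are wonderful', 'have a great day', 'you are awful', 'i hate you']
train_labels = [0, 0, 1, 1]

# fit the vectorizer once, on the training corpus only
tfidf = TfidfVectorizer(stop_words='english')
X_train_count = tfidf.fit_transform(train_texts)

model = MultinomialNB(alpha=0.01)
model.fit(X_train_count, train_labels)

# at prediction time, join the lemmatized tokens back into one document and
# transform it with the already-fitted vectorizer
lemmatized_output = ['truly', 'awful']
features = tfidf.transform([' '.join(lemmatized_output)])
print(model.predict(features)[0])  # 1 -> hate speech for this toy input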
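
The commit also drops the in-app dataset download and training in favour of the pickled artefacts X_train.pickle and final_bayes.pickle. An offline script of roughly this shape, adapted from the removed code, could produce artefacts of that form; the CSV URL and the column names come from the removed lines, everything else is an assumption.

import pickle
import re
import string

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# dataset the old app fetched with wget at start-up
df = pd.read_csv('https://raw.githubusercontent.com/surge-ai/copilot-toxicity/main/toxicity.csv')

# binary labels as in the removed code: Toxic -> 1, Not Toxic -> 0
df['label'] = df['is_toxic'].replace({'Toxic': 1, 'Not Toxic': 0})

# lowercase and strip punctuation (a simplified version of clean_text_2)
df['clean_text'] = df['text'].str.lower().apply(
    lambda t: re.sub('[%s]' % re.escape(string.punctuation), '', t))

X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], test_size=0.20, random_state=15)

# fit the vectorizer and the classifier offline
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_count = tfidf.fit_transform(X_train)

final_model = MultinomialNB(alpha=0.01)
final_model.fit(X_train_count, y_train)

# artefacts the new app.py expects to find next to it
pickle.dump(list(X_train), open('X_train.pickle', 'wb'))
pickle.dump(final_model, open('final_bayes.pickle', 'wb'))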