Spaces: Runtime error
Commit 1785380
Parent(s): 01d568a
Update app.py
app.py CHANGED
@@ -6,6 +6,8 @@ import streamlit as st
 import pandas as pd
 import numpy as np
 import pickle
+import itertools
+import matplotlib.pyplot as plt
 from PIL import Image
 # preprocessing
 import re
@@ -13,82 +15,116 @@ import string
 import nltk
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
-from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import confusion_matrix, classification_report
 # modeling
-from sklearn import
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**Human content moderation exploits people by consistently traumatizing and underpaying them.** In 2019, an [article](https://www.theverge.com/2019/6/19/18681845/facebook-moderator-interviews-video-trauma-ptsd-cognizant-tampa) on The Verge exposed the extensive list of horrific working conditions that employees faced at Cognizant, which was Facebook’s primary moderation contractor. Unfortunately, **every major tech company**, including **Twitter**, uses human moderators to some extent, both domestically and overseas.
-
-
-
-
-
-with data_desc:
-    understanding, venn = st.columns(2)
-    with understanding:
-        st.text('')
-        st.write("""
-        The **data** for this project was sourced from a Cornell University [study](https://github.com/t-davidson/hate-speech-and-offensive-language) titled *Automated Hate Speech Detection and the Problem of Offensive Language*.
-
-
-        Each tweet's label was voted on by crowdsourced annotators and determined by majority rule.
-        """)
-
-
-
-st.write("""*Please note that this prediction is based on how the model was trained, so it may not be an accurate representation.*""")
-# user input here
-user_text = st.text_input('Enter Tweet', max_chars=280)  # setting input as user_text
-
-with model_results:
-    st.subheader('Prediction:')
-    if user_text:
-        # processing user_text
-        # removing punctuation
-        user_text = re.sub('[%s]' % re.escape(string.punctuation), '', user_text)
-        # tokenizing
-        stop_words = set(stopwords.words('english'))
-        tokens = nltk.word_tokenize(user_text)
-        # removing stop words
-        stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
-        # taking the root of each word
-        lemmatizer = WordNetLemmatizer()
-        lemmatized_output = []
-        for word in stopwords_removed:
-            lemmatized_output.append(lemmatizer.lemmatize(word))
-
-        # instantiating the count vectorizer
-        count = CountVectorizer(stop_words=stop_words)
-        X_train = pickle.load(open(r"C:\Users\User\Downloads\X_train", 'rb'))
-        X_test = lemmatized_output
-        X_train_count = count.fit_transform(X_train)
-        X_test_count = count.transform(X_test)
-
-        # loading in the model
-        final_model = pickle.load(open(r"C:\Users\User\Downloads\bayes", 'rb'))
-
-        # applying the model to make a prediction
-        prediction = final_model.predict(X_test_count[0])
-
-        if prediction == 0:
-            st.subheader('**Not Hate Speech**')
+from sklearn.naive_bayes import MultinomialNB
+# additional imports needed by the evaluation and pipeline code below
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+import seaborn as sns
+
+st.title("Toxic Comment Detection App")
+st.write('\n\n')
+
+
+def clean_text(text):
+    # remove punctuation, drop stop words, and lemmatize the remaining tokens
+    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
+    stop_words = set(stopwords.words('english'))
+    tokens = nltk.word_tokenize(text)
+    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
+    lemmatizer = WordNetLemmatizer()
+    lemmatized_output = []
+    for word in stopwords_removed:
+        lemmatized_output.append(lemmatizer.lemmatize(word))
+
+    return ' '.join(lemmatized_output)
+
+
+def classifier_evaluation(y_pred, y_test):
+    # show a confusion matrix heatmap and a text classification report in the app
+    fig, ax = plt.subplots()
+    cm = pd.crosstab(y_pred, y_test, rownames=['Predicted'], colnames=['Actual'])
+    sns.heatmap(cm, annot=True, cmap='Blues')
+    st.write("Confusion Matrix:")
+    st.write(fig)
+    st.text('Model Report:\n ' + classification_report(y_pred, y_test))
+
+
+df = pd.read_csv(r"C:\Users\User\Downloads\toxicity.csv")
+
+def clean_text_2(text):
+    # make text lowercase
+    text = text.lower()
+    # removing text within parentheses
+    text = re.sub(r'\(.*?\)', '', text)
+    # removing numbers
+    text = re.sub(r'\w*\d\w*', '', text)
+    # if there's more than 1 whitespace, then make it just 1
+    text = re.sub(r'\s+', ' ', text)
+    # if there's a new line, then make it a whitespace
+    text = re.sub(r'\n', ' ', text)
+    # removing any quotes
+    text = re.sub(r'\"+', '', text)
+    # getting rid of punctuation
+    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
+
+    return text
+
+clean = lambda x: clean_text_2(x)
+
+df['clean_text'] = df['text'].apply(clean)
+
+text_df = df[['clean_text', 'is_toxic']].copy()
+
+text_df['is_toxic'] = text_df['is_toxic'].replace('Toxic', 1)
+text_df['is_toxic'] = text_df['is_toxic'].replace('Not Toxic', 0)
+
+data = text_df['clean_text']
+target = text_df['is_toxic']
+
+stop_words = set(stopwords.words('english'))
+def process_text(text):
+    tokens = nltk.word_tokenize(text)
+    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
+    return stopwords_removed
+
+# applying the above function to our data/features
+processed_data = list(map(process_text, data))
+
+# creating a list with all lemmatized outputs
+lemmatizer = WordNetLemmatizer()
+lemmatized_output = []
+
+for listy in processed_data:
+    lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
+    lemmatized_output.append(lemmed)
+
+X_lem = lemmatized_output
+y_lem = target
+
+X_train, X_test, y_train, y_test = train_test_split(X_lem, y_lem, test_size=0.20, random_state=15)
+
+tfidf = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 2))
+
+tfidf_data_train = tfidf.fit_transform(X_train)
+tfidf_data_test = tfidf.transform(X_test)
+
+if st.checkbox('Evaluate The Binary Classification Model (Toxic, Non-Toxic)'):
+    bayes = MultinomialNB(alpha=0.01)
+    bayes.fit(tfidf_data_train, y_train)
+    bayes_test_preds = bayes.predict(tfidf_data_test)
+    classifier_evaluation(bayes_test_preds, y_test)
+
+st.write("""##### Try it out yourself!""")
+binary_text = st.text_area("Classify Using The Binary Model:", "Enter Text")
+binary_text = clean_text(binary_text)
+
+if st.checkbox('Apply Binary Model'):
+    # note: `bayes` is only defined once the evaluation checkbox above has been run
+    binary_model = Pipeline([('vectorizer', tfidf), ('classifier', bayes)])
+
+    result = binary_model.predict([binary_text])
+
+    if int(result[0]) == 1:
+        result_text = "Toxic"
     else:
-
-
+        result_text = "Not Toxic"
+
+    st.write(" ##### Result: ", result_text)