Commit 959b9ce (parent: ef82fc2): Create app.py
app.py (added):
import streamlit as st
import pandas as pd
import numpy as np
import pickle
from PIL import Image
# preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
# modeling
from sklearn import svm
# sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
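# assumed setup step: the NLTK corpora used below must be available at
# runtime ('stopwords', 'punkt', 'wordnet'); each call is a no-op once cached
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')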
# creating page sections
site_header = st.container()
business_context = st.container()
data_desc = st.container()
performance = st.container()
tweet_input = st.container()
model_results = st.container()
sentiment_analysis = st.container()
contact = st.container()
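# note: performance, sentiment_analysis and contact are declared above but
# never filled in this version of the file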
with site_header:
    st.title('Twitter Hate Speech Detection')
    st.write("""
    This project aims to **automate content moderation** to identify hate speech using **machine learning binary classification algorithms.**

    Baseline models included Random Forest, Naive Bayes, Logistic Regression and Support Vector Machine (SVM). The final model was a **Logistic Regression** model that used Count Vectorization for feature engineering. It produced an F1 score of 0.3958 and a Recall (TPR) of 0.624.
    """)
with business_context:
    st.header('The Problem of Content Moderation')
    st.write("""
    **Human content moderation exploits people by consistently traumatizing and underpaying them.** In 2019, an [article](https://www.theverge.com/2019/6/19/18681845/facebook-moderator-interviews-video-trauma-ptsd-cognizant-tampa) on The Verge exposed the extensive list of horrific working conditions that employees faced at Cognizant, which was Facebook’s primary moderation contractor. Unfortunately, **every major tech company**, including **Twitter**, uses human moderators to some extent, both domestically and overseas.

    Hate speech is defined as **abusive or threatening speech that expresses prejudice against a particular group, especially on the basis of race, religion or sexual orientation.** Usually, the difference between hate speech and offensive language comes down to subtle context or diction.
    """)
with data_desc:
    understanding, venn = st.columns(2)
    with understanding:
        st.text('')
        st.write("""
        The **data** for this project was sourced from a Cornell University [study](https://github.com/t-davidson/hate-speech-and-offensive-language) titled *Automated Hate Speech Detection and the Problem of Offensive Language*.

        The `.csv` file has **24,802 rows**, where **6% of the tweets were labeled as "Hate Speech".**
        Each tweet's label was voted on by crowdsourced annotators and determined by majority rule.
        """)
with tweet_input:
    st.header('Is Your Tweet Considered Hate Speech?')
    st.write("""*Please note that this prediction is based on how the model was trained, so it may not be an accurate representation.*""")
    # user input here
    user_text = st.text_input('Enter Tweet', max_chars=280)  # setting input as user_text
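# st.text_input returns an empty string until the user enters something,
# so the prediction section below only runs once there is actual input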
with model_results:
    st.subheader('Prediction:')
    if user_text:
        # processing user_text
        # removing punctuation
        user_text = re.sub('[%s]' % re.escape(string.punctuation), '', user_text)
        # tokenizing
        stop_words = set(stopwords.words('english'))
        tokens = nltk.word_tokenize(user_text)
        # removing stop words
        stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
        # taking root word
        lemmatizer = WordNetLemmatizer()
        lemmatized_output = []
        for word in stopwords_removed:
            lemmatized_output.append(lemmatizer.lemmatize(word))
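        # worked example with the default NLTK data: "You are all wonderful people!"
        # -> punctuation stripped -> tokens ['You', 'are', 'all', 'wonderful', 'people']
        # -> stopwords removed -> ['wonderful', 'people']
        # -> lemmatized -> ['wonderful', 'people']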
        # instantiating count vectorizer
        count = CountVectorizer(stop_words=stop_words)
        with open('X_train', 'rb') as f:
            X_train = pickle.load(f)
        # join the lemmatized tokens back into a single document so the
        # vectorizer treats the whole tweet as one sample
        X_test = [' '.join(lemmatized_output)]
        X_train_count = count.fit_transform(X_train)
        X_test_count = count.transform(X_test)

        # loading in model
        with open('bayes', 'rb') as f:
            final_model = pickle.load(f)

        # apply model to make a prediction for the single input document
        prediction = final_model.predict(X_test_count)[0]

        if prediction == 0:
            st.subheader('**Not Hate Speech**')
        else:
            st.subheader('**Hate Speech**')
        st.text('')
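
For reference, app.py loads two pickled artifacts that this commit does not create: `X_train` (the raw training texts the CountVectorizer is re-fit on at inference time) and `bayes` (the fitted classifier). The sketch below is one assumed way to produce them; the dataset filename, column names, and model construction are guesses based on the Davidson et al. study the app cites, not taken from this repo.

# train_artifacts.py - hypothetical companion script, not part of this commit.
# Assumes the Davidson et al. labeled_data.csv with a 'tweet' text column and
# a 'class' label column (0 = hate speech, 1 = offensive, 2 = neither).
import pickle
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

nltk.download('stopwords')

df = pd.read_csv('labeled_data.csv')  # assumed filename and layout
# collapse to the binary target app.py displays (0 = not hate, 1 = hate)
y = (df['class'] == 0).astype(int)
X_train = df['tweet'].tolist()

# use the same stop-word set app.py uses, so re-fitting the vectorizer on the
# pickled X_train at inference time reproduces an identical vocabulary
stop_words = set(stopwords.words('english'))
count = CountVectorizer(stop_words=stop_words)
X_train_count = count.fit_transform(X_train)

model = LogisticRegression(max_iter=1000)  # the stated final model
model.fit(X_train_count, y)

# persist under the exact filenames app.py opens
with open('X_train', 'wb') as f:
    pickle.dump(X_train, f)
with open('bayes', 'wb') as f:  # app.py loads the model from a file named 'bayes'
    pickle.dump(model, f)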