import streamlit as st
import pandas as pd
import numpy as np
import pickle
from PIL import Image
# preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
# modeling
from sklearn import svm
# sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
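# downloading the NLTK resources the preprocessing below relies on
# (quiet no-ops if they are already installed)
nltk.download('stopwords', quiet=True)   # for stopwords.words('english')
nltk.download('punkt', quiet=True)       # for nltk.word_tokenize
nltk.download('wordnet', quiet=True)     # for WordNetLemmatizer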
# creating page sections
site_header = st.container()
business_context = st.container()
data_desc = st.container()
performance = st.container()
tweet_input = st.container()
model_results = st.container()
sentiment_analysis = st.container()
contact = st.container()
with site_header:
    st.title('Twitter Hate Speech Detection')
    st.write("""
    This project aims to **automate content moderation** by identifying hate speech using **machine learning binary classification algorithms.**

    Baseline models included Random Forest, Naive Bayes, Logistic Regression, and Support Vector Machine (SVM). The final model was a **Logistic Regression** model that used Count Vectorization for feature engineering. It produced an F1 score of 0.3958 and a Recall (TPR) of 0.624.
    """)
with business_context:
    st.header('The Problem of Content Moderation')
    st.write("""
    **Human content moderation exploits people by consistently traumatizing and underpaying them.** In 2019, an [article](https://www.theverge.com/2019/6/19/18681845/facebook-moderator-interviews-video-trauma-ptsd-cognizant-tampa) on The Verge exposed the extensive list of horrific working conditions that employees faced at Cognizant, which was Facebook’s primary moderation contractor. Unfortunately, **every major tech company**, including **Twitter**, uses human moderators to some extent, both domestically and overseas.

    Hate speech is defined as **abusive or threatening speech that expresses prejudice against a particular group, especially on the basis of race, religion, or sexual orientation.** Usually, the difference between hate speech and offensive language comes down to subtle context or diction.
    """)
with data_desc:
    understanding, venn = st.columns(2)
    with understanding:
        st.text('')
        st.write("""
        The **data** for this project was sourced from a Cornell University [study](https://github.com/t-davidson/hate-speech-and-offensive-language) titled *Automated Hate Speech Detection and the Problem of Offensive Language*.

        The `.csv` file has **24,802 rows**, of which **6% of the tweets were labeled as "Hate Speech."**

        Each tweet's label was crowdsourced and determined by majority vote.
        """)
with tweet_input:
    st.header('Is Your Tweet Considered Hate Speech?')
    st.write("""*Please note that this prediction is based on how the model was trained, so it may not be an accurate representation.*""")
    # user input here
    user_text = st.text_input('Enter Tweet', max_chars=280)  # setting input as user_text
with model_results:
    st.subheader('Prediction:')
    if user_text:
        # processing user_text: removing punctuation (kept in a separate variable
        # so the raw tweet is still available for sentiment analysis below)
        clean_text = re.sub('[%s]' % re.escape(string.punctuation), '', user_text)
        # tokenizing
        stop_words = set(stopwords.words('english'))
        tokens = nltk.word_tokenize(clean_text)
        # removing stop words (case-insensitive)
        stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
        # reducing each word to its lemma
        lemmatizer = WordNetLemmatizer()
        lemmatized_output = []
        for word in stopwords_removed:
            lemmatized_output.append(lemmatizer.lemmatize(word))
        # instantiating count vectorizer (sklearn expects a list, not a set)
        count = CountVectorizer(stop_words=list(stop_words))
        X_train = pickle.load(open('X_train', 'rb'))
        # the processed tweet must be one document, not one document per token
        X_test = [' '.join(lemmatized_output)]
        X_train_count = count.fit_transform(X_train)
        X_test_count = count.transform(X_test)
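        # NOTE: refitting the vectorizer on every request repeats work and only
        # reproduces the training vocabulary if 'X_train' matches the data used
        # at training time. The usual pattern is to pickle the fitted vectorizer
        # alongside the model and load it here instead (the filename
        # 'count_vectorizer' is hypothetical):
        #     count = pickle.load(open('count_vectorizer', 'rb'))
        #     X_test_count = count.transform(X_test)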
        # loading in model
        final_model = pickle.load(open('bayes', 'rb'))
        # applying the model to the single processed tweet
        prediction = final_model.predict(X_test_count)
        if prediction[0] == 0:
            st.subheader('**Not Hate Speech**')
        else:
            st.subheader('**Hate Speech**')
        st.text('')
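# The sentiment_analysis container above is never filled in this file even
# though SentimentIntensityAnalyzer is imported; a minimal sketch of what that
# section could look like, run on the raw tweet (VADER uses punctuation and
# capitalization as cues, so the unprocessed user_text is the right input):
with sentiment_analysis:
    if user_text:
        st.header('Sentiment Analysis with VADER')
        analyzer = SentimentIntensityAnalyzer()
        # polarity_scores returns 'neg', 'neu', 'pos', and a 'compound'
        # score normalized to [-1, 1]
        sentiment_dict = analyzer.polarity_scores(user_text)
        st.write(sentiment_dict)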