azizbarank's picture
Update app.py
01d568a
raw
history blame
3.9 kB
import os
# Install runtime dependencies before the imports below need them.
# The PyPI distribution that provides the `sklearn` module is named
# `scikit-learn`; the `sklearn` name on PyPI is a deprecated placeholder
# whose installation fails, so it must not be used here.
os.system('pip install nltk')
os.system('pip install scikit-learn')
import streamlit as st
import pandas as pd
import numpy as np
import pickle
from PIL import Image
# preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
# modeling
from sklearn import svm
# Page layout: one Streamlit container per section, created in the order
# listed below. Each section is filled in further down the script.
(
    site_header,
    business_context,
    data_desc,
    performance,
    tweet_input,
    model_results,
    sentiment_analysis,
    contact,
) = (st.container() for _ in range(8))
# Page title, written into the header section.
site_header.title('Twitter Hate Speech Detection')
# Background section: why automated moderation matters and how hate speech
# is defined for the purposes of this app.
business_context.header('The Problem of Content Moderation')
business_context.write("""
**Human content moderation exploits people by consistently traumatizing and underpaying them.** In 2019, an [article](https://www.theverge.com/2019/6/19/18681845/facebook-moderator-interviews-video-trauma-ptsd-cognizant-tampa) on The Verge exposed the extensive list of horrific working conditions that employees faced at Cognizant, which was Facebook’s primary moderation contractor. Unfortunately, **every major tech company**, including **Twitter**, uses human moderators to some extent, both domestically and overseas.
Hate speech is defined as **abusive or threatening speech that expresses prejudice against a particular group, especially on the basis of race, religion or sexual orientation.** Usually, the difference between hate speech and offensive language comes down to subtle context or diction.
""")
# Dataset description. The section is split into two columns; only the
# left one (`understanding`) receives content in this part of the script.
understanding, venn = data_desc.columns(2)
understanding.text('')  # empty text element used as a vertical spacer
understanding.write("""
The **data** for this project was sourced from a Cornell University [study](https://github.com/t-davidson/hate-speech-and-offensive-language) titled *Automated Hate Speech Detection and the Problem of Offensive Language*.
The `.csv` file has **24,802 rows** where **6% of the tweets were labeled as "Hate Speech".**
Each tweet's label was voted on by crowdsource and determined by majority rules.
""")
# Input section: heading, disclaimer, and the text box for the tweet.
tweet_input.header('Is Your Tweet Considered Hate Speech?')
tweet_input.write("""*Please note that this prediction is based on how the model was trained, so it may not be an accurate representation.*""")
# The entered tweet (capped at 280 characters) is kept in `user_text`
# for the prediction step that follows.
user_text = tweet_input.text_input('Enter Tweet', max_chars=280)
# Locations of the pickled training corpus and the pickled classifier.
# These must be raw strings: in an ordinary literal "\U" begins a
# \UXXXXXXXX unicode escape (a SyntaxError for "\Users") and "\b" is a
# backspace character, so the un-prefixed paths never pointed at the files.
# NOTE(review): both paths are absolute and machine-specific — consider
# shipping the files next to the app and using relative paths.
X_TRAIN_PATH = r"C:\Users\User\Downloads\X_train"
MODEL_PATH = r"C:\Users\User\Downloads\bayes"

# Prediction section: preprocess the entered tweet, vectorize it with a
# vocabulary fitted on the training corpus, and show the model's verdict.
with model_results:
    st.subheader('Prediction:')
    if user_text:
        # processing user_text
        # removing punctuation
        user_text = re.sub('[%s]' % re.escape(string.punctuation), '', user_text)
        # tokenizing
        # NOTE(review): the NLTK stopwords corpus, tokenizer models and
        # WordNet data must already be installed for the calls below.
        stop_words = set(stopwords.words('english'))
        tokens = nltk.word_tokenize(user_text)
        # lower-casing and removing stop words
        lowered_tokens = (token.lower() for token in tokens)
        stopwords_removed = [token for token in lowered_tokens if token not in stop_words]
        # taking root word
        lemmatizer = WordNetLemmatizer()
        lemmatized_output = [lemmatizer.lemmatize(word) for word in stopwords_removed]

        # instantiating count vectorizer
        # scikit-learn validates `stop_words` as 'english', a list, or None,
        # so the set is converted to a (sorted, hence deterministic) list.
        count = CountVectorizer(stop_words=sorted(stop_words))
        # Only unpickle files you trust: pickle.load can execute arbitrary code.
        with open(X_TRAIN_PATH, 'rb') as train_file:
            X_train = pickle.load(train_file)
        # The vectorizer treats every list element as a separate document,
        # so the tokens are joined back into ONE document. Passing the token
        # list directly made the model see only the first word of the tweet
        # and raised an error when every token was a stop word.
        # Presumably X_train holds one preprocessed string per tweet — verify.
        X_test = [' '.join(lemmatized_output)]
        X_train_count = count.fit_transform(X_train)
        X_test_count = count.transform(X_test)

        # loading in model
        with open(MODEL_PATH, 'rb') as model_file:
            final_model = pickle.load(model_file)

        # apply model to make predictions (a single row -> a single label)
        prediction = final_model.predict(X_test_count)
        if prediction[0] == 0:
            st.subheader('**Not Hate Speech**')
        else:
            st.subheader('**Hate Speech**')
        st.text('')