Commit 959b9ce (parent: ef82fc2): Create app.py
app.py (added):
import streamlit as st
import pandas as pd
import numpy as np
import pickle
from PIL import Image
# preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
# modeling
from sklearn import svm
# sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
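# assumed setup step: the NLTK corpora used below must be available at
# runtime ('stopwords', 'punkt', 'wordnet'); each call is a no-op once cached
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')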
# creating page sections
site_header = st.container()
business_context = st.container()
data_desc = st.container()
performance = st.container()
tweet_input = st.container()
model_results = st.container()
sentiment_analysis = st.container()
contact = st.container()
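# note: performance, sentiment_analysis and contact are declared above but
# never filled in this version of the file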
with site_header:
    st.title('Twitter Hate Speech Detection')
    st.write("""
    This project aims to **automate content moderation** to identify hate speech using **machine learning binary classification algorithms.**

    Baseline models included Random Forest, Naive Bayes, Logistic Regression and Support Vector Machine (SVM). The final model was a **Logistic Regression** model that used Count Vectorization for feature engineering. It produced an F1 score of 0.3958 and a Recall (TPR) of 0.624.
    """)
with business_context:
    st.header('The Problem of Content Moderation')
    st.write("""
    **Human content moderation exploits people by consistently traumatizing and underpaying them.** In 2019, an [article](https://www.theverge.com/2019/6/19/18681845/facebook-moderator-interviews-video-trauma-ptsd-cognizant-tampa) on The Verge exposed the extensive list of horrific working conditions that employees faced at Cognizant, which was Facebook’s primary moderation contractor. Unfortunately, **every major tech company**, including **Twitter**, uses human moderators to some extent, both domestically and overseas.

    Hate speech is defined as **abusive or threatening speech that expresses prejudice against a particular group, especially on the basis of race, religion or sexual orientation.** Usually, the difference between hate speech and offensive language comes down to subtle context or diction.
    """)
with data_desc:
    understanding, venn = st.columns(2)
    with understanding:
        st.text('')
        st.write("""
        The **data** for this project was sourced from a Cornell University [study](https://github.com/t-davidson/hate-speech-and-offensive-language) titled *Automated Hate Speech Detection and the Problem of Offensive Language*.

        The `.csv` file has **24,802 rows**, where **6% of the tweets were labeled as "Hate Speech".**
        Each tweet's label was voted on by crowdsourced annotators and determined by majority rule.
        """)
with tweet_input:
    st.header('Is Your Tweet Considered Hate Speech?')
    st.write("""*Please note that this prediction is based on how the model was trained, so it may not be an accurate representation.*""")
    # user input here
    user_text = st.text_input('Enter Tweet', max_chars=280)  # setting input as user_text
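# st.text_input returns an empty string until the user enters something,
# so the prediction section below only runs once there is actual input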
with model_results:
    st.subheader('Prediction:')
    if user_text:
        # processing user_text
        # removing punctuation
        user_text = re.sub('[%s]' % re.escape(string.punctuation), '', user_text)
        # tokenizing
        stop_words = set(stopwords.words('english'))
        tokens = nltk.word_tokenize(user_text)
        # removing stop words
        stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
        # taking root word
        lemmatizer = WordNetLemmatizer()
        lemmatized_output = []
        for word in stopwords_removed:
            lemmatized_output.append(lemmatizer.lemmatize(word))
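        # worked example with the default NLTK data: "You are all wonderful people!"
        # -> punctuation stripped -> tokens ['You', 'are', 'all', 'wonderful', 'people']
        # -> stopwords removed -> ['wonderful', 'people']
        # -> lemmatized -> ['wonderful', 'people']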
        # instantiating count vectorizer
        count = CountVectorizer(stop_words=stop_words)
        with open('X_train', 'rb') as f:
            X_train = pickle.load(f)
        # join the lemmatized tokens back into a single document so the
        # vectorizer treats the whole tweet as one sample
        X_test = [' '.join(lemmatized_output)]
        X_train_count = count.fit_transform(X_train)
        X_test_count = count.transform(X_test)

        # loading in model
        with open('bayes', 'rb') as f:
            final_model = pickle.load(f)

        # apply model to make a prediction for the single input document
        prediction = final_model.predict(X_test_count)[0]

        if prediction == 0:
            st.subheader('**Not Hate Speech**')
        else:
            st.subheader('**Hate Speech**')
        st.text('')
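
For reference, app.py loads two pickled artifacts that this commit does not create: `X_train` (the raw training texts the CountVectorizer is re-fit on at inference time) and `bayes` (the fitted classifier). The sketch below is one assumed way to produce them; the dataset filename, column names, and model construction are guesses based on the Davidson et al. study the app cites, not taken from this repo.

# train_artifacts.py - hypothetical companion script, not part of this commit.
# Assumes the Davidson et al. labeled_data.csv with a 'tweet' text column and
# a 'class' label column (0 = hate speech, 1 = offensive, 2 = neither).
import pickle
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

nltk.download('stopwords')

df = pd.read_csv('labeled_data.csv')  # assumed filename and layout
# collapse to the binary target app.py displays (0 = not hate, 1 = hate)
y = (df['class'] == 0).astype(int)
X_train = df['tweet'].tolist()

# use the same stop-word set app.py uses, so re-fitting the vectorizer on the
# pickled X_train at inference time reproduces an identical vocabulary
stop_words = set(stopwords.words('english'))
count = CountVectorizer(stop_words=stop_words)
X_train_count = count.fit_transform(X_train)

model = LogisticRegression(max_iter=1000)  # the stated final model
model.fit(X_train_count, y)

# persist under the exact filenames app.py opens
with open('X_train', 'wb') as f:
    pickle.dump(X_train, f)
with open('bayes', 'wb') as f:  # app.py loads the model from a file named 'bayes'
    pickle.dump(model, f)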