Spaces: Runtime error
Commit 1785380
Parent(s): 01d568a
Update app.py
app.py CHANGED
@@ -6,6 +6,8 @@ import streamlit as st
 import pandas as pd
 import numpy as np
 import pickle
+import itertools
+import matplotlib.pyplot as plt
 from PIL import Image
 # preprocessing
 import re
@@ -13,82 +15,116 @@ import string
 import nltk
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
-from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import confusion_matrix, classification_report
 # modeling
-from sklearn import
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**Human content moderation exploits people by consistently traumatizing and underpaying them.** In 2019, an [article](https://www.theverge.com/2019/6/19/18681845/facebook-moderator-interviews-video-trauma-ptsd-cognizant-tampa) on The Verge exposed the extensive list of horrific working conditions that employees faced at Cognizant, which was Facebook’s primary moderation contractor. Unfortunately, **every major tech company**, including **Twitter**, uses human moderators to some extent, both domestically and overseas.
-
-
-
-
-
-with data_desc:
-    understanding, venn = st.columns(2)
-    with understanding:
-        st.text('')
-        st.write("""
-        The **data** for this project was sourced from a Cornell University [study](https://github.com/t-davidson/hate-speech-and-offensive-language) titled *Automated Hate Speech Detection and the Problem of Offensive Language*.
-
-
-        Each tweet's label was voted on by crowdsourced annotators and determined by majority rule.
-        """)
-
-
-
-st.write("""*Please note that this prediction is based on how the model was trained, so it may not be an accurate representation.*""")
-# user input here
-user_text = st.text_input('Enter Tweet', max_chars=280)  # setting input as user_text
-
-with model_results:
-    st.subheader('Prediction:')
-    if user_text:
-        # processing user_text
-        # removing punctuation
-        user_text = re.sub('[%s]' % re.escape(string.punctuation), '', user_text)
-        # tokenizing
-        stop_words = set(stopwords.words('english'))
-        tokens = nltk.word_tokenize(user_text)
-        # removing stop words
-        stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
-        # taking the root of each word
-        lemmatizer = WordNetLemmatizer()
-        lemmatized_output = []
-        for word in stopwords_removed:
-            lemmatized_output.append(lemmatizer.lemmatize(word))
-
-        # instantiating the count vectorizer
-        count = CountVectorizer(stop_words=stop_words)
-        X_train = pickle.load(open(r"C:\Users\User\Downloads\X_train", 'rb'))
-        X_test = lemmatized_output
-        X_train_count = count.fit_transform(X_train)
-        X_test_count = count.transform(X_test)
-
-        # loading in the model
-        final_model = pickle.load(open(r"C:\Users\User\Downloads\bayes", 'rb'))
-
-        # applying the model to make a prediction
-        prediction = final_model.predict(X_test_count[0])
-
-        if prediction == 0:
-            st.subheader('**Not Hate Speech**')
+from sklearn.naive_bayes import MultinomialNB
+# additional imports needed by the evaluation and pipeline code below
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+import seaborn as sns
+
+st.title("Toxic Comment Detection App")
+st.write('\n\n')
+
+
+def clean_text(text):
+    # remove punctuation, drop stop words, and lemmatize the remaining tokens
+    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
+    stop_words = set(stopwords.words('english'))
+    tokens = nltk.word_tokenize(text)
+    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
+    lemmatizer = WordNetLemmatizer()
+    lemmatized_output = []
+    for word in stopwords_removed:
+        lemmatized_output.append(lemmatizer.lemmatize(word))
+
+    return ' '.join(lemmatized_output)
+
+
+def classifier_evaluation(y_pred, y_test):
+    # show a confusion matrix heatmap and a text classification report in the app
+    fig, ax = plt.subplots()
+    cm = pd.crosstab(y_pred, y_test, rownames=['Predicted'], colnames=['Actual'])
+    sns.heatmap(cm, annot=True, cmap='Blues')
+    st.write("Confusion Matrix:")
+    st.write(fig)
+    st.text('Model Report:\n ' + classification_report(y_pred, y_test))
+
+
+df = pd.read_csv(r"C:\Users\User\Downloads\toxicity.csv")
+
+def clean_text_2(text):
+    # make text lowercase
+    text = text.lower()
+    # removing text within parentheses
+    text = re.sub(r'\(.*?\)', '', text)
+    # removing numbers
+    text = re.sub(r'\w*\d\w*', '', text)
+    # if there's more than 1 whitespace, then make it just 1
+    text = re.sub(r'\s+', ' ', text)
+    # if there's a new line, then make it a whitespace
+    text = re.sub(r'\n', ' ', text)
+    # removing any quotes
+    text = re.sub(r'\"+', '', text)
+    # getting rid of punctuation
+    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
+
+    return text
+
+clean = lambda x: clean_text_2(x)
+
+df['clean_text'] = df['text'].apply(clean)
+
+text_df = df[['clean_text', 'is_toxic']].copy()
+
+text_df['is_toxic'] = text_df['is_toxic'].replace('Toxic', 1)
+text_df['is_toxic'] = text_df['is_toxic'].replace('Not Toxic', 0)
+
+data = text_df['clean_text']
+target = text_df['is_toxic']
+
+stop_words = set(stopwords.words('english'))
+def process_text(text):
+    tokens = nltk.word_tokenize(text)
+    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
+    return stopwords_removed
+
+# applying the above function to our data/features
+processed_data = list(map(process_text, data))
+
+# creating a list with all lemmatized outputs
+lemmatizer = WordNetLemmatizer()
+lemmatized_output = []
+
+for listy in processed_data:
+    lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
+    lemmatized_output.append(lemmed)
+
+X_lem = lemmatized_output
+y_lem = target
+
+X_train, X_test, y_train, y_test = train_test_split(X_lem, y_lem, test_size=0.20, random_state=15)
+
+tfidf = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 2))
+
+tfidf_data_train = tfidf.fit_transform(X_train)
+tfidf_data_test = tfidf.transform(X_test)
+
+if st.checkbox('Evaluate The Binary Classification Model (Toxic, Non-Toxic)'):
+    bayes = MultinomialNB(alpha=0.01)
+    bayes.fit(tfidf_data_train, y_train)
+    bayes_test_preds = bayes.predict(tfidf_data_test)
+    classifier_evaluation(bayes_test_preds, y_test)
+
+st.write("""##### Try it out yourself!""")
+binary_text = st.text_area("Classify Using The Binary Model:", "Enter Text")
+binary_text = clean_text(binary_text)
+
+if st.checkbox('Apply Binary Model'):
+    # note: `bayes` is only defined once the evaluation checkbox above has been run
+    binary_model = Pipeline([('vectorizer', tfidf), ('classifier', bayes)])
+
+    result = binary_model.predict([binary_text])
+
+    if int(result[0]) == 1:
+        result_text = "Toxic"
     else:
-
-
+        result_text = "Not Toxic"
+
+    st.write(" ##### Result: ", result_text)