Spaces:
Runtime error
Runtime error
Commit
·
ceaa4fc
1
Parent(s):
5f617ed
Update app.py
Browse files
app.py
CHANGED
@@ -3,14 +3,11 @@ os.system('pip install nltk')
|
|
3 |
os.system('pip install sklearn')
|
4 |
os.system('pip install wget')
|
5 |
|
6 |
-
|
7 |
-
|
8 |
import streamlit as st
|
9 |
import pandas as pd
|
10 |
import numpy as np
|
11 |
import pickle
|
12 |
-
import itertools
|
13 |
-
import matplotlib.pyplot as plt
|
14 |
from PIL import Image
|
15 |
# preprocessing
|
16 |
import re
|
@@ -19,114 +16,63 @@ import nltk
|
|
19 |
from nltk.corpus import stopwords
|
20 |
from nltk.stem import WordNetLemmatizer
|
21 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
22 |
-
from sklearn.metrics import confusion_matrix
|
23 |
# modeling
|
24 |
-
from sklearn
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
st.
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
text_df['is_toxic'] = text_df['is_toxic'].replace('Not Toxic', 0)
|
81 |
-
|
82 |
-
data = text_df['clean_text']
|
83 |
-
target = text_df['is_toxic']
|
84 |
-
|
85 |
-
stop_words = set(stopwords.words('english'))
|
86 |
-
def process_text(text):
|
87 |
-
tokens = nltk.word_tokenize(text)
|
88 |
-
stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
|
89 |
-
return stopwords_removed
|
90 |
-
|
91 |
-
# applying the above function to our data/features
|
92 |
-
processed_data = list(map(process_text, data))
|
93 |
-
|
94 |
-
# creating a list with all lemmatized outputs
|
95 |
-
lemmatizer = WordNetLemmatizer()
|
96 |
-
lemmatized_output = []
|
97 |
-
|
98 |
-
for listy in processed_data:
|
99 |
-
lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
|
100 |
-
lemmatized_output.append(lemmed)
|
101 |
-
|
102 |
-
X_lem = lemmatized_output
|
103 |
-
y_lem = target
|
104 |
-
|
105 |
-
X_train, X_test, y_train, y_test = train_test_split(X_lem, y_lem, test_size=0.20, random_state=15)
|
106 |
-
|
107 |
-
tfidf = TfidfVectorizer(stop_words= stop_words, ngram_range=(1,2))
|
108 |
-
|
109 |
-
tfidf_data_train = tfidf.fit_transform(X_train)
|
110 |
-
tfidf_data_test = tfidf.transform(X_test)
|
111 |
-
|
112 |
-
if st.checkbox('Evaluate The Binary Classification Model (Toxic, Non-Toxic)'):
|
113 |
-
bayes = MultinomialNB(alpha = .01)
|
114 |
-
bayes.fit(tfidf_data_train, y_train)
|
115 |
-
bayes_test_preds = bayes.predict(tfidf_data_test)
|
116 |
-
classifier_evaluation(bayes_test_preds, y_test)
|
117 |
-
|
118 |
-
st.write("""##### Try it out yourself!""")
|
119 |
-
binary_text = st.text_area("Classify Using The Binary Model:", "Enter Text")
|
120 |
-
binary_text = clean_text(binary_text)
|
121 |
-
|
122 |
-
if st.checkbox('Apply Binary Model'):
|
123 |
-
binary_model = Pipeline([('vectorizer', tfidf), ('classifier', bayes)])
|
124 |
-
|
125 |
-
result = binary_model.predict([binary_text])
|
126 |
-
|
127 |
-
if result.astype(int) == 1:
|
128 |
-
result_text = "Toxic"
|
129 |
else:
|
130 |
-
|
131 |
-
|
132 |
-
st.write(" ##### Result: ", result_text)
|
|
|
3 |
os.system('pip install sklearn')
|
4 |
os.system('pip install wget')
|
5 |
|
6 |
+
# importing relevant python packages
|
|
|
7 |
import streamlit as st
|
8 |
import pandas as pd
|
9 |
import numpy as np
|
10 |
import pickle
|
|
|
|
|
11 |
from PIL import Image
|
12 |
# preprocessing
|
13 |
import re
|
|
|
16 |
from nltk.corpus import stopwords
|
17 |
from nltk.stem import WordNetLemmatizer
|
18 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
19 |
# modeling
|
20 |
+
from sklearn import svm
|
21 |
+
# sentiment analysis
|
22 |
+
|
23 |
+
|
24 |
+
# creating page sections
|
25 |
+
site_header = st.container()
|
26 |
+
business_context = st.container()
|
27 |
+
data_desc = st.container()
|
28 |
+
performance = st.container()
|
29 |
+
tweet_input = st.container()
|
30 |
+
model_results = st.container()
|
31 |
+
sentiment_analysis = st.container()
|
32 |
+
contact = st.container()
|
33 |
+
|
34 |
+
with site_header:
|
35 |
+
st.title('Toxic Comment Detection')
|
36 |
+
|
37 |
+
|
38 |
+
with tweet_input:
|
39 |
+
st.header('Is Your Tweet Considered Hate Speech?')
|
40 |
+
st.write("""*Please note that this prediction is based on how the model was trained, so it may not be an accurate representation.*""")
|
41 |
+
# user input here
|
42 |
+
user_text = st.text_input('Enter Tweet', max_chars=280) # setting input as user_text
|
43 |
+
|
44 |
+
with model_results:
|
45 |
+
st.subheader('Prediction:')
|
46 |
+
if user_text:
|
47 |
+
# processing user_text
|
48 |
+
# removing punctuation
|
49 |
+
user_text = re.sub('[%s]' % re.escape(string.punctuation), '', user_text)
|
50 |
+
# tokenizing
|
51 |
+
stop_words = set(stopwords.words('english'))
|
52 |
+
tokens = nltk.word_tokenize(user_text)
|
53 |
+
# removing stop words
|
54 |
+
stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
|
55 |
+
# taking root word
|
56 |
+
lemmatizer = WordNetLemmatizer()
|
57 |
+
lemmatized_output = []
|
58 |
+
for word in stopwords_removed:
|
59 |
+
lemmatized_output.append(lemmatizer.lemmatize(word))
|
60 |
+
|
61 |
+
# instantiating count vectorizor
|
62 |
+
tfidf = TfidfVectorizer(stop_words=stop_words)
|
63 |
+
X_train = pickle.load(open('X_train.pickle', 'rb'))
|
64 |
+
X_test = lemmatized_output
|
65 |
+
X_train_count = tfidf.fit_transform(X_train)
|
66 |
+
X_test_count = tfidf.transform(X_test)
|
67 |
+
|
68 |
+
# loading in model
|
69 |
+
final_model = pickle.load(open('final_bayes.pickle', 'rb'))
|
70 |
+
|
71 |
+
# apply model to make predictions
|
72 |
+
prediction = final_model.predict(X_test_count[0])
|
73 |
+
|
74 |
+
if prediction == 0:
|
75 |
+
st.subheader('**Not Hate Speech**')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
else:
|
77 |
+
st.subheader('**Hate Speech**')
|
78 |
+
st.text('')
|
|