azizbarank committed
Commit 1785380 · 1 Parent(s): 01d568a

Update app.py

Files changed (1)
  1. app.py +108 -72
app.py CHANGED
@@ -6,6 +6,8 @@ import streamlit as st
 import pandas as pd
 import numpy as np
 import pickle
+import itertools
+import matplotlib.pyplot as plt
 from PIL import Image
 # preprocessing
 import re
@@ -13,82 +15,116 @@ import string
 import nltk
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
-from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import confusion_matrix
 # modeling
-from sklearn import svm
-
-# creating page sections
-site_header = st.container()
-business_context = st.container()
-data_desc = st.container()
-performance = st.container()
-tweet_input = st.container()
-model_results = st.container()
-sentiment_analysis = st.container()
-contact = st.container()
-
-with site_header:
-    st.title('Twitter Hate Speech Detection')
-
-with business_context:
-    st.header('The Problem of Content Moderation')
-    st.write("""
-
-    **Human content moderation exploits people by consistently traumatizing and underpaying them.** In 2019, an [article](https://www.theverge.com/2019/6/19/18681845/facebook-moderator-interviews-video-trauma-ptsd-cognizant-tampa) on The Verge exposed the extensive list of horrific working conditions that employees faced at Cognizant, which was Facebook’s primary moderation contractor. Unfortunately, **every major tech company**, including **Twitter**, uses human moderators to some extent, both domestically and overseas.
-
-    Hate speech is defined as **abusive or threatening speech that expresses prejudice against a particular group, especially on the basis of race, religion or sexual orientation.** Usually, the difference between hate speech and offensive language comes down to subtle context or diction.
-
-    """)
-
-with data_desc:
-    understanding, venn = st.columns(2)
-    with understanding:
-        st.text('')
-        st.write("""
-        The **data** for this project was sourced from a Cornell University [study](https://github.com/t-davidson/hate-speech-and-offensive-language) titled *Automated Hate Speech Detection and the Problem of Offensive Language*.
-
-        The `.csv` file has **24,802 rows** where **6% of the tweets were labeled as "Hate Speech".**
-        Each tweet's label was voted on by crowdsource and determined by majority rules.
-        """)
-
-with tweet_input:
-    st.header('Is Your Tweet Considered Hate Speech?')
-    st.write("""*Please note that this prediction is based on how the model was trained, so it may not be an accurate representation.*""")
-    # user input here
-    user_text = st.text_input('Enter Tweet', max_chars=280) # setting input as user_text
-
-with model_results:
-    st.subheader('Prediction:')
-    if user_text:
-        # processing user_text
-        # removing punctuation
-        user_text = re.sub('[%s]' % re.escape(string.punctuation), '', user_text)
-        # tokenizing
-        stop_words = set(stopwords.words('english'))
-        tokens = nltk.word_tokenize(user_text)
-        # removing stop words
-        stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
-        # taking root word
-        lemmatizer = WordNetLemmatizer()
-        lemmatized_output = []
-        for word in stopwords_removed:
-            lemmatized_output.append(lemmatizer.lemmatize(word))
-
-        # instantiating count vectorizor
-        count = CountVectorizer(stop_words=stop_words)
-        X_train = pickle.load(open("C:\Users\User\Downloads\X_train", 'rb'))
-        X_test = lemmatized_output
-        X_train_count = count.fit_transform(X_train)
-        X_test_count = count.transform(X_test)
-
-        # loading in model
-        final_model = pickle.load(open("C:\Users\User\Downloads\bayes", 'rb'))
-
-        # apply model to make predictions
-        prediction = final_model.predict(X_test_count[0])
-
-        if prediction == 0:
-            st.subheader('**Not Hate Speech**')
-        else:
-            st.subheader('**Hate Speech**')
-        st.text('')
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+from sklearn.metrics import classification_report
+import seaborn as sns  # used for the confusion-matrix heatmap below
+
+st.title("Toxic Comment Detection App")
+st.write('\n\n')
+
+
+def clean_text(text):
+    # strip punctuation, tokenize, drop stop words, then lemmatize
+    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
+    stop_words = set(stopwords.words('english'))
+    tokens = nltk.word_tokenize(text)
+    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
+    lemmatizer = WordNetLemmatizer()
+    lemmatized_output = []
+    for word in stopwords_removed:
+        lemmatized_output.append(lemmatizer.lemmatize(word))
+
+    # return the cleaned, lemmatized string rather than the raw input text
+    return ' '.join(lemmatized_output)
+
+
+def classifier_evaluation(y_pred, y_test):
+    # draw a confusion-matrix heatmap and print a classification report
+    fig, ax = plt.subplots()
+    cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
+    sns.heatmap(cm, annot=True, cmap='Blues')
+    st.write("Confusion Matrix:")
+    st.write(fig)
+    st.text('Model Report:\n ' + classification_report(y_test, y_pred))
+
+
+# raw string so the Windows path's backslashes are not treated as escape sequences
+df = pd.read_csv(r"C:\Users\User\Downloads\toxicity.csv")
+
+def clean_text_2(text):
+    # make text lowercase
+    text = text.lower()
+    # removing text within parentheses
+    text = re.sub('\(.*?\)', '', text)
+    # removing numbers
+    text = re.sub('\w*\d\w*', '', text)
+    # if there's more than 1 whitespace, then make it just 1
+    text = re.sub('\s+', ' ', text)
+    # if there's a new line, then make it a whitespace
+    text = re.sub('\n', ' ', text)
+    # removing any quotes
+    text = re.sub('\"+', '', text)
+    # getting rid of punctuation
+    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
+
+    return text
+
+clean = lambda x: clean_text_2(x)
+
+df['clean_text'] = df['text'].apply(clean)
+
+text_df = df[['clean_text', 'is_toxic']].copy()
+
+text_df['is_toxic'] = text_df['is_toxic'].replace('Toxic', 1)
+text_df['is_toxic'] = text_df['is_toxic'].replace('Not Toxic', 0)
+
+data = text_df['clean_text']
+target = text_df['is_toxic']
+
+stop_words = set(stopwords.words('english'))
+def process_text(text):
+    tokens = nltk.word_tokenize(text)
+    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
+    return stopwords_removed
+
+# applying the above function to our data/features
+processed_data = list(map(process_text, data))
+
+# creating a list with all lemmatized outputs
+lemmatizer = WordNetLemmatizer()
+lemmatized_output = []
+
+for listy in processed_data:
+    lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
+    lemmatized_output.append(lemmed)
+
+X_lem = lemmatized_output
+y_lem = target
+
+X_train, X_test, y_train, y_test = train_test_split(X_lem, y_lem, test_size=0.20, random_state=15)
+
+tfidf = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 2))
+
+tfidf_data_train = tfidf.fit_transform(X_train)
+tfidf_data_test = tfidf.transform(X_test)
+
+# fit the classifier up front so it is available whichever checkbox the user selects
+bayes = MultinomialNB(alpha=.01)
+bayes.fit(tfidf_data_train, y_train)
+
+if st.checkbox('Evaluate The Binary Classification Model (Toxic, Non-Toxic)'):
+    bayes_test_preds = bayes.predict(tfidf_data_test)
+    classifier_evaluation(bayes_test_preds, y_test)
+
+st.write("""##### Try it out yourself!""")
+binary_text = st.text_area("Classify Using The Binary Model:", "Enter Text")
+binary_text = clean_text(binary_text)
+
+if st.checkbox('Apply Binary Model'):
+    binary_model = Pipeline([('vectorizer', tfidf), ('classifier', bayes)])
+
+    result = binary_model.predict([binary_text])
+
+    if result.astype(int) == 1:
+        result_text = "Toxic"
+    else:
+        result_text = "Not Toxic"
+
+    st.write(" ##### Result: ", result_text)