azizbarank committed on
Commit ceaa4fc · 1 Parent(s): 5f617ed

Update app.py

Files changed (1)
  1. app.py +59 -113
app.py CHANGED
@@ -3,14 +3,11 @@ os.system('pip install nltk')
 os.system('pip install sklearn')
 os.system('pip install wget')
 
-!wget 'https://raw.githubusercontent.com/surge-ai/copilot-toxicity/main/toxicity.csv'
-
+# importing relevant python packages
 import streamlit as st
 import pandas as pd
 import numpy as np
 import pickle
-import itertools
-import matplotlib.pyplot as plt
 from PIL import Image
 # preprocessing
 import re
@@ -19,114 +16,63 @@ import nltk
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics import confusion_matrix
 # modeling
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.metrics import confusion_matrix
-st.title("Toxic Comment Detection App ")
-st.write('\n\n')
-
-
-def clean_text(text):
-    import re
-    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
-    stop_words = set(stopwords.words('english'))
-    tokens = nltk.word_tokenize(text)
-    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
-    lemmatizer = WordNetLemmatizer()
-    lemmatized_output = []
-    for word in stopwords_removed:
-        lemmatized_output.append(lemmatizer.lemmatize(word))
-
-    return text
-
-
-def classifier_evaluation(y_pred, y_test):
-    fig, ax = plt.subplots()
-    confusion_matrix = pd.crosstab(y_pred, y_test, rownames=['Actual'], colnames=['Predicted'])
-    sns.heatmap(confusion_matrix, annot=True, cmap = 'Blues')
-    st.write("Confusion Matrix:")
-    st.write(fig)
-    st.text('Model Report:\n ' + classification_report(y_pred, y_test))
-
-
-df = pd.read_csv('toxicity.csv')
-
-def clean_text_2(text):
-    # make text lowercase
-    text = text.lower()
-    # removing text within parentheses
-    text = re.sub('\(.*?\)', '', text)
-    # removing numbers
-    text = re.sub('\w*\d\w*', '', text)
-    # if there's more than 1 whitespace, then make it just 1
-    text = re.sub('\s+', ' ', text)
-    # if there's a new line, then make it a whitespace
-    text = re.sub('\n', ' ', text)
-    # removing any quotes
-    text = re.sub('\"+', '', text)
-    # getting rid of punctuations
-    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
-
-    return text
-
-clean = lambda x: clean_text_2(x)
-
-df['clean_text'] = df['text'].apply(clean)
-
-text_df = df[['clean_text', 'is_toxic']].copy()
-
-text_df['is_toxic'] = text_df['is_toxic'].replace('Toxic', 1)
-text_df['is_toxic'] = text_df['is_toxic'].replace('Not Toxic', 0)
-
-data = text_df['clean_text']
-target = text_df['is_toxic']
-
-stop_words = set(stopwords.words('english'))
-def process_text(text):
-    tokens = nltk.word_tokenize(text)
-    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
-    return stopwords_removed
-
-# applying the above function to our data/features
-processed_data = list(map(process_text, data))
-
-# creating a list with all lemmatized outputs
-lemmatizer = WordNetLemmatizer()
-lemmatized_output = []
-
-for listy in processed_data:
-    lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
-    lemmatized_output.append(lemmed)
-
-X_lem = lemmatized_output
-y_lem = target
-
-X_train, X_test, y_train, y_test = train_test_split(X_lem, y_lem, test_size=0.20, random_state=15)
-
-tfidf = TfidfVectorizer(stop_words= stop_words, ngram_range=(1,2))
-
-tfidf_data_train = tfidf.fit_transform(X_train)
-tfidf_data_test = tfidf.transform(X_test)
-
-if st.checkbox('Evaluate The Binary Classification Model (Toxic, Non-Toxic)'):
-    bayes = MultinomialNB(alpha = .01)
-    bayes.fit(tfidf_data_train, y_train)
-    bayes_test_preds = bayes.predict(tfidf_data_test)
-    classifier_evaluation(bayes_test_preds, y_test)
-
-st.write("""##### Try it out yourself!""")
-binary_text = st.text_area("Classify Using The Binary Model:", "Enter Text")
-binary_text = clean_text(binary_text)
-
-if st.checkbox('Apply Binary Model'):
-    binary_model = Pipeline([('vectorizer', tfidf), ('classifier', bayes)])
-
-    result = binary_model.predict([binary_text])
-
-    if result.astype(int) == 1:
-        result_text = "Toxic"
+from sklearn import svm
+# sentiment analysis
+
+
+# creating page sections
+site_header = st.container()
+business_context = st.container()
+data_desc = st.container()
+performance = st.container()
+tweet_input = st.container()
+model_results = st.container()
+sentiment_analysis = st.container()
+contact = st.container()
+
+with site_header:
+    st.title('Toxic Comment Detection')
+
+
+with tweet_input:
+    st.header('Is Your Tweet Considered Hate Speech?')
+    st.write("""*Please note that this prediction is based on how the model was trained, so it may not be an accurate representation.*""")
+    # user input here
+    user_text = st.text_input('Enter Tweet', max_chars=280) # setting input as user_text
+
+with model_results:
+    st.subheader('Prediction:')
+    if user_text:
+        # processing user_text
+        # removing punctuation
+        user_text = re.sub('[%s]' % re.escape(string.punctuation), '', user_text)
+        # tokenizing
+        stop_words = set(stopwords.words('english'))
+        tokens = nltk.word_tokenize(user_text)
+        # removing stop words
+        stopwords_removed = [token.lower() for token in tokens if token.lower() not in stop_words]
+        # taking root word
+        lemmatizer = WordNetLemmatizer()
+        lemmatized_output = []
+        for word in stopwords_removed:
+            lemmatized_output.append(lemmatizer.lemmatize(word))
+
+        # instantiating count vectorizor
+        tfidf = TfidfVectorizer(stop_words=stop_words)
+        X_train = pickle.load(open('X_train.pickle', 'rb'))
+        X_test = lemmatized_output
+        X_train_count = tfidf.fit_transform(X_train)
+        X_test_count = tfidf.transform(X_test)
+
+        # loading in model
+        final_model = pickle.load(open('final_bayes.pickle', 'rb'))
+
+        # apply model to make predictions
+        prediction = final_model.predict(X_test_count[0])
+
+        if prediction == 0:
+            st.subheader('**Not Hate Speech**')
         else:
-            result_text = "Not Toxic"
-
-        st.write(" ##### Result: ", result_text)
+            st.subheader('**Hate Speech**')
+        st.text('')
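
For reference, the preprocessing that the new model_results block applies to user_text amounts to the standalone helper below. This is a minimal sketch rather than the committed code: it assumes that import string, import nltk and the NLTK data downloads are handled in the unchanged lines of app.py, and the helper name preprocess is illustrative.

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# one-off downloads of the corpora the preprocessing relies on
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


def preprocess(text):
    # strip punctuation, mirroring the re.sub call in the diff
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # tokenize and drop English stop words
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    kept = [token.lower() for token in tokens if token.lower() not in stop_words]
    # reduce the remaining tokens to their lemmas
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in kept]


print(preprocess('These are some truly hateful words!'))  # ['truly', 'hateful', 'word']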
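
The removed training code fitted MultinomialNB(alpha=.01) on TF-IDF features, which is presumably what final_bayes.pickle now holds. The toy sketch below shows that pattern end to end, with the lemmatized tokens rejoined into a single document before being transformed by the already-fitted vectorizer; all data and names here are illustrative, not the committed code.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# toy training data: 0 = not hate speech, 1 = hate speech
train_texts = ['you are wonderful', 'have a great day', 'you are awful', 'i hate you']
train_labels = [0, 0, 1, 1]

# fit the vectorizer once, on the training corpus only
tfidf = TfidfVectorizer(stop_words='english')
X_train_count = tfidf.fit_transform(train_texts)

model = MultinomialNB(alpha=0.01)
model.fit(X_train_count, train_labels)

# at prediction time, join the lemmatized tokens back into one document and
# transform it with the already-fitted vectorizer
lemmatized_output = ['truly', 'awful']
features = tfidf.transform([' '.join(lemmatized_output)])
print(model.predict(features)[0])  # 1 -> hate speech for this toy input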
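
The commit also drops the in-app dataset download and training in favour of the pickled artefacts X_train.pickle and final_bayes.pickle. An offline script of roughly this shape, adapted from the removed code, could produce artefacts of that form; the CSV URL and the column names come from the removed lines, everything else is an assumption.

import pickle
import re
import string

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# dataset the old app fetched with wget at start-up
df = pd.read_csv('https://raw.githubusercontent.com/surge-ai/copilot-toxicity/main/toxicity.csv')

# binary labels as in the removed code: Toxic -> 1, Not Toxic -> 0
df['label'] = df['is_toxic'].replace({'Toxic': 1, 'Not Toxic': 0})

# lowercase and strip punctuation (a simplified version of clean_text_2)
df['clean_text'] = df['text'].str.lower().apply(
    lambda t: re.sub('[%s]' % re.escape(string.punctuation), '', t))

X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], test_size=0.20, random_state=15)

# fit the vectorizer and the classifier offline
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_count = tfidf.fit_transform(X_train)

final_model = MultinomialNB(alpha=0.01)
final_model.fit(X_train_count, y_train)

# artefacts the new app.py expects to find next to it
pickle.dump(list(X_train), open('X_train.pickle', 'wb'))
pickle.dump(final_model, open('final_bayes.pickle', 'wb'))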