Jayesh13 committed
Commit 7e8bf02
Parent: aa4e53a

Upload 3 files

Files changed (3)
  1. APP.py +108 -0
  2. tox_model.pkl +3 -0
  3. train.csv.zip +3 -0
APP.py ADDED
@@ -0,0 +1,108 @@
+ import streamlit as st
+ import pickle
+ import re
+ import html
+ import string
+ import pandas as pd
+ import nltk
+ from nltk.stem.porter import PorterStemmer
+ from nltk.stem import WordNetLemmatizer
+ from nltk.tokenize import word_tokenize
+ from nltk.corpus import stopwords
+ from tensorflow.keras.preprocessing.text import Tokenizer
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+ stop_words = stopwords.words('english')
+
+ def remove_special_chars(text):
+     """Undo common HTML escape fragments and collapse repeated spaces."""
+     re1 = re.compile(r' +')
+     x1 = text.lower().replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
+         'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
+         '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
+         ' @-@ ', '-').replace('\\', ' \\ ')
+     return re1.sub(' ', html.unescape(x1))
+
+ def to_lowercase(text):
+     return text.lower()
+
+ def remove_punctuation(text):
+     """Remove punctuation characters from a string."""
+     translator = str.maketrans('', '', string.punctuation)
+     return text.translate(translator)
+
+ def replace_numbers(text):
+     """Remove all integer occurrences from the string."""
+     return re.sub(r'\d+', '', text)
+
+ def remove_whitespaces(text):
+     return text.strip()
+
+ def remove_stopwords(words, stop_words):
+     return [word for word in words if word not in stop_words]
+
+ def stem_words(words):
+     """Stem each word in the token list."""
+     stemmer = PorterStemmer()
+     return [stemmer.stem(word) for word in words]
+
+ def lemmatize_words(words):
+     """Lemmatize each word in the token list (default noun forms)."""
+     lemmatizer = WordNetLemmatizer()
+     return [lemmatizer.lemmatize(word) for word in words]
+
+ def lemmatize_verbs(words):
+     """Lemmatize verbs and join the tokens back into a single string."""
+     lemmatizer = WordNetLemmatizer()
+     return ' '.join([lemmatizer.lemmatize(word, pos='v') for word in words])
+
+ def text2words(text):
+     return word_tokenize(text)
+
+ def clean_text(text):
+     text = remove_special_chars(text)
+     text = remove_punctuation(text)
+     text = to_lowercase(text)
+     text = replace_numbers(text)
+     words = text2words(text)
+     words = remove_stopwords(words, stop_words)
+     # words = stem_words(words)  # either stem or lemmatize
+     words = lemmatize_words(words)
+     return lemmatize_verbs(words)  # already returns a space-joined string
+
+ # Raw strings avoid '\U...' escape errors in Windows paths.
+ df = pd.read_csv(r'C:\Users\HP\Documents\Model_deployment\train.csv.zip')
+ df['comment_text'] = df['comment_text'].apply(clean_text)
+
+ model = pickle.load(open(r'C:\Users\HP\Documents\Model_deployment\tox_model.pkl', 'rb'))
+
+ st.title('Toxic comment classification')
+ user_input = st.text_area('Enter your comment')
+
+ # Refit the tokenizer on the cleaned training comments so the word
+ # index matches what the model saw at training time.
+ tok = Tokenizer(num_words=1000, oov_token='UNK')
+ tok.fit_on_texts(df['comment_text'])
+
+ if user_input:
+     # st.text_area returns a plain string, so clean it directly;
+     # texts_to_sequences expects a list of texts.
+     cleaned = clean_text(user_input)
+     x_test = tok.texts_to_sequences([cleaned])
+     input_text = pad_sequences(x_test,
+                                maxlen=50,
+                                truncating='post',
+                                padding='post')
+     out = model.predict(input_text)
+     st.json(out.tolist())  # numpy arrays are not JSON-serializable
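Note: the cleaning pipeline above depends on three NLTK resources (the punkt tokenizer models for word_tokenize, the stopwords corpus, and the WordNet data for the lemmatizer). A minimal one-time setup sketch, assuming a fresh environment:

    import nltk

    # One-time downloads; each call is a no-op if the resource is already present.
    nltk.download('punkt')      # tokenizer models used by word_tokenize
    nltk.download('stopwords')  # English stopword list
    nltk.download('wordnet')    # lexical database behind WordNetLemmatizer

Without these, the first call to clean_text raises an NLTK LookupError.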
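model.predict returns a raw probability array. Assuming the model was trained on the six standard Jigsaw toxic-comment labels (an assumption; the label set and order are not visible in this commit), a small helper could make the output readable:

    import numpy as np

    # Hypothetical label order, assumed to match the training columns;
    # verify against the training code before relying on it.
    LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    def decode_prediction(probs):
        """Map a 1-D probability vector to a {label: score} dict."""
        probs = np.asarray(probs).ravel()
        return {label: float(p) for label, p in zip(LABELS, probs)}

Calling st.json(decode_prediction(out)) would then show named scores instead of a bare list.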
tox_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e679960774a127bdcb1670399b77ad59fa944fed043249c20b1c20ee10ae66a2
+ size 113453577
train.csv.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59046551e4723d37993933a629d9de4bef9dd5b3adb9ed6b41ac7932ffae2eb1
+ size 27619914
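Both files above are Git LFS pointer files rather than the binaries themselves; the oid and size identify the real objects stored on the LFS server. In a local clone, the usual way to materialize them is:

    git lfs install   # one-time per machine
    git lfs pull      # fetches tox_model.pkl and train.csv.zip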