import re
import html
import string

import streamlit as st
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# NLTK resources used below; download them once if they are missing:
# nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')
stop_words = stopwords.words('english')


def remove_special_chars(text):
    """Strip HTML entities, escape sequences and other text artefacts."""
    re1 = re.compile(r' +')
    x1 = text.lower().replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
        ' @-@ ', '-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x1))


def to_lowercase(text):
    return text.lower()


def remove_punctuation(text):
    """Remove punctuation characters from the text."""
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)


def replace_numbers(text):
    """Remove all integer occurrences from the text."""
    return re.sub(r'\d+', '', text)


def remove_whitespaces(text):
    return text.strip()


def remove_stopwords(words, stop_words):
    """Drop stopwords from a list of tokenized words."""
    return [word for word in words if word not in stop_words]


def stem_words(words):
    """Stem a list of tokenized words."""
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in words]


def lemmatize_words(words):
    """Lemmatize a list of tokenized words."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]


def lemmatize_verbs(words):
    """Lemmatize verbs and join the tokens back into a single string."""
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word, pos='v') for word in words])


def text2words(text):
    return word_tokenize(text)


def clean_text(text):
    text = remove_special_chars(text)
    text = remove_punctuation(text)
    text = to_lowercase(text)
    text = replace_numbers(text)
    words = text2words(text)
    words = remove_stopwords(words, stop_words)
    # words = stem_words(words)  # either stem or lemmatize
    words = lemmatize_words(words)
    return lemmatize_verbs(words)  # already a space-joined string


# Preprocessing applied to the training data:
# df = pd.read_csv('train.csv.zip')
# df['comment_text'] = df['comment_text'].apply(clean_text)

# Assumes tox_model.h5 is a Keras model saved with model.save(); if it was
# pickled instead, load it with pickle.load(open('tox_model.h5', 'rb')).
model = load_model('tox_model.h5')

st.title('Toxic comment classification')
comment = st.text_area('Enter your comment')

# The tokenizer must share the vocabulary learned at training time, e.g. via
# tok.fit_on_texts(df['comment_text']); otherwise every token maps to 'UNK'.
tok = Tokenizer(num_words=1000, oov_token='UNK')

if comment:
    cleaned = clean_text(comment)
    x_test = tok.texts_to_sequences([cleaned])
    input_text = pad_sequences(x_test, maxlen=50, truncating='post', padding='post')
    out = model.predict(input_text)
    st.json(out.tolist())