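"""Streamlit app: SPAM message detection with a saved GRU model.

The input message is normalized (chat-word expansion, lowercasing,
abbreviation expansion, punctuation removal, stopword removal and
lemmatization) and then classified as spam (1) or not spam (0).
"""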
import ast
import re

import nltk
import numpy as np
import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import load_model

# One-time download of the NLTK resources used below
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('punkt')

# Load the chat-word dictionary from chatwords.txt (a Python dict literal)
with open('chatwords.txt') as f:
    chatwords = ast.literal_eval(f.read())

# Load the abbreviation dictionary from abbreviation.txt (same format)
with open('abbreviation.txt') as abb:
    abbreviation = ast.literal_eval(abb.read())
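# Both dictionaries are assumed to map tokens to their expansions,
# e.g. (illustrative only): {"IDK": "I do not know", "BRB": "be right back"}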

# Stopword list and lemmatizer used during preprocessing
stop_words = set(stopwords.words('english'))
lem = WordNetLemmatizer()

# Load the trained GRU model (Keras SavedModel directory)
final_gru = load_model('model_gru')
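# Note: preprocessed text is passed to predict() below as plain strings, so
# the saved model is assumed to handle its own vectorization (e.g. an
# embedded TextVectorization layer); otherwise tokenization and padding
# would be needed before calling predict().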

# Preprocessing helpers
def check_chatwords(text):
    """Replace known chat words (matched case-insensitively) with their expansions."""
    temp = []
    for chat in text.split():
        if chat.upper() in chatwords:
            temp.append(chatwords[chat.upper()])
        else:
            temp.append(chat)
    return " ".join(temp)

def lower(text):
    """Lowercase the whole message."""
    return text.lower()

def check_abbr(text):
    """Expand abbreviations found in the abbreviation dictionary."""
    temp2 = []
    for abbr in text.split():
        if abbr in abbreviation:
            temp2.append(abbreviation[abbr])
        else:
            temp2.append(abbr)
    return " ".join(temp2)

def check_punctuation(text):
    """Drop bracketed segments, keep letters only, and collapse whitespace."""
    data = re.sub(r"\[[^]]*\]", " ", text)  # remove [bracketed] segments first
    data = re.sub(r"[^a-zA-Z]", " ", data)  # keep letters only (also strips \n)
    return " ".join(data.split())           # collapse repeated whitespace

def token_stopwords_lemma(text):
    """Tokenize, remove stopwords, and lemmatize the remaining tokens."""
    tokens = word_tokenize(text)
    kept = [word for word in tokens if word not in stop_words]
    return " ".join(lem.lemmatize(word) for word in kept)
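# Illustrative end-to-end example (actual output depends on the contents of
# the dictionary files; the FYI mapping below is an assumption):
#   "FYI I am running late"
#     -> check_chatwords                  -> "for your information I am running late"
#     -> lower ... check_punctuation      -> "for your information i am running late"
#     -> token_stopwords_lemma            -> "information running late"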

st.title("SPAM Message Detection")

message = st.text_input('Please input your message here (in English):')
st.write('Message:', message)

# Wrap the single message in a one-row dataframe for the preprocessing pipeline
df_inf = [message]
df_inf1 = pd.DataFrame({'message': df_inf})

# Apply the cleaning pipeline step by step
df_inf1['message'] = df_inf1['message'].apply(check_chatwords)
df_inf1['message'] = df_inf1['message'].apply(lower)
df_inf1['message'] = df_inf1['message'].apply(check_abbr)
df_inf1['message'] = df_inf1['message'].apply(check_punctuation)
df_inf1['message'] = df_inf1['message'].apply(token_stopwords_lemma)

# Predict only when the button is clicked, so the model does not run on every rerun
if st.button('Predict'):
    # Keras expects an array of strings, so pass the column as a NumPy array
    y_pred_inf = final_gru.predict(df_inf1['message'].to_numpy())
    y_pred_inf = np.where(y_pred_inf >= 0.5, 1, 0)

    # Pair the raw message with its predicted label (0 = not spam, 1 = spam)
    pred_df = pd.DataFrame(y_pred_inf, columns=['label'])
    df_inf2 = pd.DataFrame(df_inf, columns=['message'])
    df_combined = pd.concat([df_inf2, pred_df], axis=1)
    st.write(df_combined)

    if y_pred_inf[0][0] == 0:
        st.success("Your message is not spam.")
    else:
        st.error("Your message is spam.")
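# To launch locally (assuming this file is saved as app.py):
#   streamlit run app.py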