## Imports
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gradio as gr

## Load Data
dataset = pd.read_csv('./SPAMtextmessage.csv')
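# The CSV is assumed to contain two columns: 'Category' ('ham' or 'spam')
# and 'Message' (the raw text), as used throughout the preprocessing below.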

## Data Preprocessing 
# Convert ham to 0 and spam to 1
dataset['Category'] = dataset['Category'].map({'ham': 0, 'spam': 1})
sentences = dataset['Message'].tolist()
labels = dataset['Category'].tolist()
# Separate out the sentences and labels into training and test sets
training_size = int(len(sentences) * 0.8)
# Sentence variables
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
# Labels variables
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]
# Make labels into numpy arrays for use with the network later
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

## Text Preprocessing
vocab_size = 1000      # keep only the 1,000 most frequent words
embedding_dim = 16     # dimensionality of the learned word embeddings
max_length = 100       # pad/truncate every message to 100 tokens
trunc_type = 'post'    # cut tokens from the end of long messages
padding_type = 'post'  # add padding zeros at the end of short messages
oov_tok = "<OOV>"      # placeholder token for out-of-vocabulary words
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences,maxlen=max_length, padding=padding_type,
                       truncating=trunc_type)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)
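# Optional sanity check (illustrative): with 'post' padding/truncation, every
# row should be exactly max_length tokens wide, zeros filling short messages.
assert padded.shape[1] == max_length
assert testing_padded.shape[1] == max_length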

## Modeling 
# Learning rate is set to 0.01 in the Adam optimizer below
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(20, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', metrics=['accuracy'],
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.01))
model.fit(padded, training_labels_final, batch_size=128, epochs=50,
          validation_data=(testing_padded, testing_labels_final))
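
# Optional held-out evaluation (a quick sketch): with metrics=['accuracy']
# configured above, model.evaluate returns [loss, accuracy] on the test set.
test_loss, test_acc = model.evaluate(testing_padded, testing_labels_final, verbose=0)
print(f"Test loss: {test_loss:.4f}, test accuracy: {test_acc:.4f}")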

## Gradio App
def spam_detection(message):
    # Preprocess the input message
    sequence = tokenizer.texts_to_sequences([message])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    
    # Make prediction
    prediction = model.predict(padded_sequence)[0, 0]
    
    # Return the result
    return "Spam" if prediction >= 0.5 else "Not Spam"

# Gradio Interface
iface = gr.Interface(
    fn=spam_detection,
    inputs=gr.Textbox(label="Enter a message:"),
    outputs="text",
    live=True,
    theme="huggingface",
    title="Spam Message Detection",
    description="A demo app for learning purposes. Detects spam messages with 98% accuracy based on the dataset."
)

# Launch the app
iface.launch()
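
# When running in a notebook or behind a firewall, a temporary public link
# can be requested instead: iface.launch(share=True)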