import json

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Hyperparameters
train_frac = 0.75  # fraction of the data used for training; the rest is held out for validation
vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

# Sample sentences (not used in the pipeline below)
sentences = [
    'Wow this AI is astonishing',
    'This is the worst AI',
    'This is the best AI',
    'I am the best AI',
    'It is very astonishing that we can train a model on any data we have',
]

headlines = []
is_sarcastic = []
article_link = []

# Load the sarcasm-headlines dataset (assumed here to be a single JSON array)
with open('Sarcasm_Headlines_Dataset.json', 'r') as f:
    data = json.load(f)
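
# Note: some distributions of this dataset are newline-delimited JSON (one
# object per line) rather than a single array. If json.load() raises a
# JSONDecodeError, a per-line parse like this works instead:
#
#     with open('Sarcasm_Headlines_Dataset.json', 'r') as f:
#         data = [json.loads(line) for line in f]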

# Flatten the JSON records into parallel lists
for item in data:
    headlines.append(item['headline'])
    is_sarcastic.append(item['is_sarcastic'])
    article_link.append(item['article_link'])

# Split headlines/labels into training and validation portions
split = int(len(headlines) * train_frac)
train_data = headlines[:split]
train_result = is_sarcastic[:split]
test_data = headlines[split:]
test_result = is_sarcastic[split:]

# Fit the tokenizer on the training headlines only, so the validation data
# stays unseen; words outside the top vocab_size map to oov_tok
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_data)

word_index = tokenizer.word_index
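
# Optional helper (not in the original script): invert the word index so
# integer sequences can be decoded back to (approximate) text for inspection.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_sequence(seq):
    # Index 0 is reserved for padding and has no word_index entry
    return ' '.join(reverse_word_index.get(i, '?') for i in seq if i != 0)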

# Convert the headlines to padded integer sequences
train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)
training_padded = pad_sequences(
    train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
testing_padded = pad_sequences(
    test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# pad_sequences already returns NumPy arrays; the label lists still need
# converting for model.fit()
training_labels = np.array(train_result)
testing_labels = np.array(test_result)

# Embedding -> global average pooling -> small dense classifier with a
# sigmoid output for binary (sarcastic / not sarcastic) prediction
model = Sequential([
    tf.keras.layers.Embedding(
        vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()

num_epochs = 30
history = model.fit(training_padded, training_labels, epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels), verbose=2)
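
# Optional (assumes matplotlib is installed): plot training vs. validation
# accuracy and loss from the History object to check for overfitting.
import matplotlib.pyplot as plt

for metric in ('accuracy', 'loss'):
    plt.figure()
    plt.plot(history.history[metric], label='train')
    plt.plot(history.history['val_' + metric], label='validation')
    plt.xlabel('epoch')
    plt.ylabel(metric)
    plt.legend()
plt.show()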

# Try the trained model on a few unseen headlines
sentence = ["granny starting to fear spiders in the garden might be real",
            "game of thrones season finale showing this sunday night",
            "Central Valley Coalition Suing the EPA Over Clean Air Failures"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length,
                       padding=padding_type, truncating=trunc_type)
predictions = model.predict(padded)
print(predictions)
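
# The sigmoid output is the predicted probability of sarcasm; a plain 0.5
# cutoff (an illustrative choice, not from the original script) turns each
# score into a label.
for text, prob in zip(sentence, predictions):
    label = 'sarcastic' if prob[0] > 0.5 else 'not sarcastic'
    print(f'{prob[0]:.3f}  {label}  {text}')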