import json

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Hyperparameters and preprocessing settings
test_div = 0.75        # fraction of the data used for training; the rest is held out for validation
vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = ""           # out-of-vocabulary token (commonly set to "<OOV>")

# Toy sentences (not used further in this script)
sentences = [
    'Wow this AI is astonishing',
    'This is the worst AI',
    'This is the best AI',
    'I am the best AI',
    'It is very astonishing that we can train a model on any data we have',
]

# Load the sarcasm headlines dataset
headlines = []
is_sarcastic = []
article_link = []

with open('Sarcasm_Headlines_Dataset.json', 'r') as f:
    data = json.load(f)

for item in data:
    headlines.append(item['headline'])
    is_sarcastic.append(item['is_sarcastic'])
    article_link.append(item['article_link'])

# Split into training and test sets
train_data = headlines[:int(len(headlines) * test_div)]
train_result = is_sarcastic[:int(len(is_sarcastic) * test_div)]
test_data = headlines[int(len(headlines) * test_div):]
test_result = is_sarcastic[int(len(is_sarcastic) * test_div):]

# Fit the tokenizer on the training headlines only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_data)
word_index = tokenizer.word_index

# Convert headlines to padded integer sequences
train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

train_padded = pad_sequences(train_sequences, maxlen=max_length,
                             padding=padding_type, truncating=trunc_type)
test_padded = pad_sequences(test_sequences, maxlen=max_length,
                            padding=padding_type, truncating=trunc_type)

training_padded = np.array(train_padded)
training_labels = np.array(train_result)
testing_padded = np.array(test_padded)
testing_labels = np.array(test_result)

# Embedding -> average pooling -> dense classifier with a sigmoid output
model = Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

num_epochs = 30
history = model.fit(training_padded, training_labels,
                    epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels),
                    verbose=2)

# Try the trained model on a few unseen headlines
sentence = [
    "granny starting to fear spiders in the garden might be real",
    "game of thrones season finale showing this sunday night",
    "Central Valley Coalition Suing the EPA Over Clean Air Failures",
]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length,
                       padding=padding_type, truncating=trunc_type)
print(model.predict(padded))
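
# A minimal follow-up sketch (not part of the original script, assumptions noted inline):
# the model emits one sigmoid probability per headline, so applying a threshold turns the
# raw predictions into sarcastic / not-sarcastic labels. The 0.5 cutoff below is an
# assumed default, not a tuned value.
predictions = model.predict(padded)              # shape (num_headlines, 1)
for text, prob in zip(sentence, predictions[:, 0]):
    label = "sarcastic" if prob >= 0.5 else "not sarcastic"
    print(f"{prob:.3f}  {label}  |  {text}")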