"""IMDB Dataset module for sentiment analysis."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from data.util import OOV_CHAR
from data.util import pad_sentence
from data.util import START_CHAR
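
# `pad_sentence` comes from data.util and is not shown here. From the
# docstring below ("longer sentences get cut, shorter ones padded"), its
# presumed contract is roughly:
#
#   def pad_sentence(sentence, length):
#     sentence = list(sentence[:length])                # cut long sentences
#     return sentence + [0] * (length - len(sentence))  # pad short ones
#
# The actual implementation (including the pad token it uses) may differ.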

# IMDB sentiment classification is binary: 0 (negative) or 1 (positive).
NUM_CLASS = 2


def load(vocabulary_size, sentence_length):
  """Returns training and evaluation input for the IMDB dataset.

  Args:
    vocabulary_size: The number of the most frequent tokens
      to be used from the corpus.
    sentence_length: The number of words in each sentence.
      Longer sentences get cut, shorter ones padded.

  Raises:
    ValueError: if the dataset cannot be loaded with the given arguments.

  Returns:
    A tuple of length 4, for training and evaluation data,
    each being a numpy array.
  """
  # Keras downloads and caches the pre-tokenized IMDB reviews; words are
  # indexed by frequency. `index_from=OOV_CHAR + 1` reserves the low
  # indices for special tokens, so real words start right after OOV_CHAR.
  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
      path="imdb.npz",
      num_words=vocabulary_size,
      skip_top=0,
      maxlen=None,
      seed=113,
      start_char=START_CHAR,
      oov_char=OOV_CHAR,
      index_from=OOV_CHAR + 1)

  # Cut or pad every sentence to a fixed length so the examples stack
  # into a single (num_examples, sentence_length) array.
  x_train_processed = []
  for sen in x_train:
    sen = pad_sentence(sen, sentence_length)
    x_train_processed.append(np.array(sen))
  x_train_processed = np.array(x_train_processed)

  x_test_processed = []
  for sen in x_test:
    sen = pad_sentence(sen, sentence_length)
    x_test_processed.append(np.array(sen))
  x_test_processed = np.array(x_test_processed)

  # Convert the integer labels to one-hot vectors via an identity matrix.
  return x_train_processed, np.eye(NUM_CLASS)[y_train], \
      x_test_processed, np.eye(NUM_CLASS)[y_test]
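
# A minimal usage sketch, illustrative only: the vocabulary and sentence
# sizes below are assumed values, not prescribed by this module. Features
# come back as int arrays of shape (num_examples, sentence_length) and
# labels as one-hot arrays of shape (num_examples, NUM_CLASS).
if __name__ == "__main__":
  x_train, y_train, x_test, y_test = load(
      vocabulary_size=10000, sentence_length=200)
  print(x_train.shape, y_train.shape)  # e.g. (25000, 200) (25000, 2)
  print(x_test.shape, y_test.shape)    # e.g. (25000, 200) (25000, 2)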