|
"""IMDB Dataset module for sentiment analysis.""" |
|
|
|
from __future__ import absolute_import |
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
import numpy as np |
|
import tensorflow as tf |
|
|
|
from data.util import OOV_CHAR |
|
from data.util import pad_sentence |
|
from data.util import START_CHAR |
|
|
|
# Number of sentiment classes (negative/positive); also the width of the
# one-hot label rows produced by np.eye(NUM_CLASS) in load().
NUM_CLASS = 2
|
|
|
|
|
def _pad_batch(sentences, sentence_length):
  """Pads/truncates each sentence to `sentence_length` and stacks them.

  Relies on data.util.pad_sentence for the per-sentence length handling
  (presumably truncate-long / pad-short — confirm against data.util).

  Args:
    sentences: An iterable of token-id sequences.
    sentence_length: Target length for every sentence.

  Returns:
    A numpy array of shape (num_sentences, sentence_length).
  """
  return np.array(
      [np.array(pad_sentence(sen, sentence_length)) for sen in sentences])


def load(vocabulary_size, sentence_length):
  """Returns training and evaluation input for the IMDB dataset.

  Args:
    vocabulary_size: The number of the most frequent tokens
      to be used from the corpus.
    sentence_length: The number of words in each sentence.
      Longer sentences get cut, shorter ones padded.

  Returns:
    A tuple (x_train, y_train, x_test, y_test) of numpy arrays, where the
    labels are one-hot encoded with NUM_CLASS columns.
  """
  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
      path="imdb.npz",
      num_words=vocabulary_size,
      skip_top=0,
      maxlen=None,
      seed=113,  # Keras default shuffle seed, kept for reproducibility.
      start_char=START_CHAR,
      oov_char=OOV_CHAR,
      # Real word indices must start above the reserved start/OOV markers.
      index_from=OOV_CHAR + 1)

  x_train_processed = _pad_batch(x_train, sentence_length)
  x_test_processed = _pad_batch(x_test, sentence_length)

  # np.eye(NUM_CLASS)[y] turns integer labels into one-hot rows.
  return (x_train_processed, np.eye(NUM_CLASS)[y_train],
          x_test_processed, np.eye(NUM_CLASS)[y_test])
|
|