File size: 1,433 Bytes
97b6013 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
"""Dataset module for sentiment analysis.
Currently only the imdb dataset is available.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import data.imdb as imdb
# Identifier of the imdb dataset. load() and get_num_class() compare their
# `dataset` argument against this value and reject anything else.
DATASET_IMDB = "imdb"
def load(dataset, vocabulary_size, sentence_length):
    """Returns training and evaluation input.

    Dispatches to the loader of the requested dataset; the arguments are
    forwarded unchanged to ``imdb.load``.

    Args:
        dataset: Dataset to be trained and evaluated.
            Currently only imdb is supported.
        vocabulary_size: The number of the most frequent tokens
            to be used from the corpus.
        sentence_length: The number of words in each sentence.
            Longer sentences get cut, shorter ones padded.

    Raises:
        ValueError: if the dataset value is not valid. This includes
            non-string values such as None.

    Returns:
        A tuple of length 4, for training sentences, labels,
        evaluation sentences, and evaluation labels,
        each being a numpy array.
    """
    if dataset == DATASET_IMDB:
        return imdb.load(vocabulary_size, sentence_length)
    # Use %-formatting instead of "+" concatenation: concatenating a
    # non-string dataset (e.g. None) would raise TypeError and mask the
    # documented ValueError. The message is unchanged for string inputs.
    raise ValueError("unsupported dataset: %s" % (dataset,))
def get_num_class(dataset):
    """Returns an integer for the number of label classes.

    Args:
        dataset: Dataset to be trained and evaluated.
            Currently only imdb is supported.

    Raises:
        ValueError: if the dataset value is not valid. This includes
            non-string values such as None.

    Returns:
        int: The number of label classes, read from ``imdb.NUM_CLASS``.
    """
    if dataset == DATASET_IMDB:
        return imdb.NUM_CLASS
    # Use %-formatting instead of "+" concatenation: concatenating a
    # non-string dataset (e.g. None) would raise TypeError and mask the
    # documented ValueError. The message is unchanged for string inputs.
    raise ValueError("unsupported dataset: %s" % (dataset,))
|