|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Classes for storing hyperparameters, data locations, etc.""" |
|
|
|
from __future__ import absolute_import |
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
import json |
|
from os.path import join |
|
import tensorflow as tf |
|
|
|
|
|
class Config(object):
    """Stores everything needed to train a model: hyperparameters, data
    locations, and derived file paths.

    All settings are plain attributes with defaults; any of them can be
    overridden by keyword argument (unknown keys raise ValueError).
    Derived paths (model dir, checkpoints, vocab/embedding pickles, etc.)
    are computed after overrides are applied, so they reflect the final
    ``data_dir`` / ``model_name`` / ``pretrained_embeddings`` values.
    """

    def __init__(self, **kwargs):
        # --- general ---
        self.data_dir = './data'  # top-level directory for data and models
        self.model_name = 'default_model'

        # --- mode / tasks ---
        self.mode = 'train'  # 'train' or 'eval' (dev_set derived below)
        self.task_names = ['chunk']

        self.is_semisup = True  # semi-supervised (CVT) vs. purely supervised
        self.for_preprocessing = False

        # --- embeddings ---
        self.pretrained_embeddings = 'glove.6B.300d.txt'
        self.word_embedding_size = 300

        # --- model architecture ---
        self.use_chars = True
        self.char_embedding_size = 50
        self.char_cnn_filter_widths = [2, 3, 4]
        self.char_cnn_n_filters = 100
        self.unidirectional_sizes = [1024]
        self.bidirectional_sizes = [512]
        self.projection_size = 512

        # dependency-parsing head
        self.depparse_projection_size = 128

        # --- labels ---
        self.label_encoding = 'BIOES'
        self.label_smoothing = 0.1

        # --- optimization ---
        self.lr = 0.5
        self.momentum = 0.9
        self.grad_clip = 1.0
        self.warm_up_steps = 5000.0  # float: used in fractional lr scaling
        self.lr_decay = 0.005

        # exponential moving average of weights
        self.ema_decay = 0.998
        self.ema_test = True     # evaluate with EMA weights
        self.ema_teacher = False  # teacher predictions from EMA weights

        # --- dropout (keep probabilities, not drop rates) ---
        self.labeled_keep_prob = 0.5
        self.unlabeled_keep_prob = 0.8

        # --- batching / bucketing ---
        self.max_sentence_length = 100
        self.max_word_length = 20
        self.train_batch_size = 64
        self.test_batch_size = 64
        self.buckets = [(0, 15), (15, 40), (40, 1000)]  # by sentence length

        # --- logging / evaluation cadence (in steps) ---
        self.print_every = 25
        self.eval_dev_every = 500
        self.eval_train_every = 2000
        self.save_model_every = 1000

        # percent of the training set to use (for data-ablation experiments)
        self.train_set_percent = 100

        # Apply keyword overrides; reject unknown keys so typos fail loudly.
        # NOTE: .items() (not the Python 2-only .iteritems()) keeps this
        # consistent with the __future__ imports at the top of the file.
        for k, v in kwargs.items():
            if k not in self.__dict__:
                raise ValueError("Unknown argument", k)
            setattr(self, k, v)

        # Evaluate on the dev set while training, on the test set otherwise.
        self.dev_set = self.mode == "train"

        # --- derived paths: raw data ---
        self.raw_data_topdir = join(self.data_dir, 'raw_data')
        self.unsupervised_data = join(
            self.raw_data_topdir,
            'unlabeled_data',
            '1-billion-word-language-modeling-benchmark-r13output',
            'training-monolingual.tokenized.shuffled')
        self.pretrained_embeddings_file = join(
            self.raw_data_topdir, 'pretrained_embeddings',
            self.pretrained_embeddings)

        # --- derived paths: preprocessed data ---
        self.preprocessed_data_topdir = join(self.data_dir, 'preprocessed_data')
        # e.g. 'glove.6B.300d.txt' -> subdir 'glove.6B.300d'
        self.embeddings_dir = join(self.preprocessed_data_topdir,
                                   self.pretrained_embeddings.rsplit('.', 1)[0])
        self.word_vocabulary = join(self.embeddings_dir, 'word_vocabulary.pkl')
        self.word_embeddings = join(self.embeddings_dir, 'word_embeddings.pkl')

        # --- derived paths: model outputs ---
        self.model_dir = join(self.data_dir, "models", self.model_name)
        self.checkpoints_dir = join(self.model_dir, 'checkpoints')
        self.checkpoint = join(self.checkpoints_dir, 'checkpoint.ckpt')
        self.best_model_checkpoints_dir = join(
            self.model_dir, 'best_model_checkpoints')
        self.best_model_checkpoint = join(
            self.best_model_checkpoints_dir, 'checkpoint.ckpt')
        self.progress = join(self.checkpoints_dir, 'progress.pkl')
        self.summaries_dir = join(self.model_dir, 'summaries')
        self.history_file = join(self.model_dir, 'history.pkl')

    def write(self):
        """Serialize this config as pretty-printed JSON into the model dir.

        Uses tf.gfile so model_dir may be a GCS path as well as a local one
        (NOTE(review): tf.gfile is the TF1 API; under TF2 this would be
        tf.io.gfile — confirm the target TF version before changing).
        """
        tf.gfile.MakeDirs(self.model_dir)
        with open(join(self.model_dir, 'config.json'), 'w') as f:
            f.write(json.dumps(self.__dict__, sort_keys=True, indent=4,
                               separators=(',', ': ')))
|
|
|
|