|
|
|
|
|
|
|
|
|
|
|
|
|
"""Train feed-forward neural network LM with NPLM tool. |
|
|
|
The resulting model can be used in Moses as feature function NeuralLM. |
|
""" |
|
|
|
from __future__ import print_function, unicode_literals |
|
|
|
import logging |
|
import argparse |
|
import subprocess |
|
import sys |
|
import os |
|
import codecs |
|
|
|
|
|
sys.path.append(os.path.join(sys.path[0], 'bilingual-lm')) |
|
import train_nplm |
|
import averageNullEmbedding |
|
|
|
|
|
# Log everything (DEBUG and up) with timestamped messages.
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
# Command-line interface.  Most defaults are supplied via
# parser.set_defaults() below, which is what the '%(default)s'
# placeholders in the help strings resolve against.
parser = argparse.ArgumentParser()
# --- paths and input data ---
parser.add_argument(
    "--working-dir", dest="working_dir", metavar="PATH")
parser.add_argument(
    "--corpus", '-text', dest="corpus_stem", metavar="PATH",
    help="Input file.")
parser.add_argument(
    "--nplm-home", dest="nplm_home", metavar="PATH", required=True,
    help="Location of NPLM.")
# --- model/training hyperparameters (forwarded to NPLM) ---
parser.add_argument(
    "--epochs", dest="epochs", type=int, metavar="INT",
    help="Number of training epochs (default: %(default)s).")
parser.add_argument(
    "--order", dest="order", type=int, metavar="INT",
    help="N-gram order of language model (default: %(default)s).")
parser.add_argument(
    "--minibatch-size", dest="minibatch_size", type=int, metavar="INT",
    help="Minibatch size (default: %(default)s).")
parser.add_argument(
    "--noise", dest="noise", type=int, metavar="INT",
    help="Number of noise samples for NCE (default: %(default)s).")
parser.add_argument(
    "--hidden", dest="hidden", type=int, metavar="INT",
    help=(
        "Size of hidden layer (0 for single hidden layer) "
        "(default: %(default)s)"))
parser.add_argument(
    "--input-embedding", dest="input_embedding", type=int, metavar="INT",
    help="Size of input embedding layer (default: %(default)s).")
parser.add_argument(
    "--output-embedding", dest="output_embedding", type=int, metavar="INT",
    help="Size of output embedding layer (default: %(default)s).")
parser.add_argument(
    "--threads", "-t", dest="threads", type=int, metavar="INT",
    help="Number of threads (default: %(default)s).")
# --- output locations ---
parser.add_argument(
    "--output-model", dest="output_model", metavar="PATH",
    help="Name of output model (default: %(default)s).")
parser.add_argument(
    "--output-dir", dest="output_dir", metavar="PATH",
    help="Output directory (default: same as working-dir).")
parser.add_argument(
    "--config-options-file", dest="config_options_file", metavar="PATH")
parser.add_argument(
    "--log-file", dest="log_file", metavar="PATH",
    help="Log file to write to (default: %(default)s).")
parser.add_argument(
    "--validation-corpus", dest="validation_corpus", metavar="PATH",
    help="Validation file (default: %(default)s).")
parser.add_argument(
    "--activation-function", dest="activation_fn",
    choices=['identity', 'rectifier', 'tanh', 'hardtanh'],
    help="Activation function (default: %(default)s).")
parser.add_argument(
    "--learning-rate", dest="learning_rate", type=float, metavar="FLOAT",
    help="Learning rate (default: %(default)s).")
parser.add_argument(
    "--words-file", dest="words_file", metavar="PATH",
    help="Output vocabulary file (default: %(default)s).")
parser.add_argument(
    "--vocab-size", dest="vocab_size", type=int, metavar="INT",
    help="Vocabulary size (default: %(default)s).")
parser.add_argument(
    "--mmap", dest="mmap", action="store_true",
    help="Use memory-mapped file (for lower memory consumption).")
# --- dropout / passthrough settings (appended to the nplm command) ---
parser.add_argument(
    "--dropout", dest="dropout", action="store",
    help="Pass dropout to nplm")
parser.add_argument(
    "--input-dropout", dest="input_dropout", action="store",
    help="Pass input dropout to nplm")
parser.add_argument(
    "--extra-settings", dest="extra_settings",
    help="Extra settings for nplm")
parser.add_argument(
    "--train-host", dest="train_host",
    help="Execute nplm training on this host, via ssh")

# NOTE(review): nplm_home is declared required=True above, so the
# hard-coded personal-path default below can never take effect —
# confirm whether the requirement or the default should be dropped.
parser.set_defaults(
    working_dir="working",
    corpus_stem="train",
    nplm_home="/home/bhaddow/tools/nplm",
    epochs=2,
    order=5,
    minibatch_size=1000,
    noise=100,
    hidden=0,
    input_embedding=150,
    output_embedding=750,
    threads=4,
    output_model="train",
    output_dir=None,
    config_options_file="config",
    log_file="log",
    validation_corpus=None,
    activation_fn="rectifier",
    learning_rate=1,
    words_file='vocab',
    vocab_size=500000)
|
|
|
|
|
def main(options):
    """Prepare data and train an NPLM feed-forward LM.

    Pipeline: numberize the training corpus with NPLM's prepareNeuralLM
    (writing the vocabulary file), optionally build a memory-mapped copy
    of the numberized data, numberize the validation corpus against the
    same vocabulary, run train_nplm.main(), then post-process the model
    with averageNullEmbedding.

    ``options`` (an argparse Namespace) is mutated in place: derived
    settings (ngram_size, validation_file, input/output vocab settings)
    are attached for train_nplm to consume.

    Raises:
        subprocess.CalledProcessError: if n-gram extraction fails.
        Exception: if mmap creation or validation preparation fails.
    """
    # train_nplm expects the attribute name 'ngram_size'.
    options.ngram_size = options.order

    if options.output_dir is None:
        options.output_dir = options.working_dir

    if not os.path.exists(options.working_dir):
        os.makedirs(options.working_dir)
    if not os.path.exists(options.output_dir):
        os.makedirs(options.output_dir)

    numberized_file = os.path.basename(options.corpus_stem) + '.numberized'
    vocab_file = os.path.join(options.working_dir, options.words_file)
    train_file = numberized_file
    if options.mmap:
        train_file += '.mmap'

    # Numberize the training corpus and write the vocabulary file.
    extraction_cmd = []
    if options.train_host:
        # Run remotely via ssh; paths must be valid on the remote host.
        extraction_cmd = ["ssh", options.train_host]
    extraction_cmd += [
        os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
        '--train_text', options.corpus_stem,
        '--ngramize', '1',
        '--ngram_size', str(options.ngram_size),
        '--vocab_size', str(options.vocab_size),
        '--write_words_file', vocab_file,
        '--train_file', os.path.join(options.working_dir, numberized_file)
    ]

    sys.stderr.write('extracting n-grams\n')
    sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
    subprocess.check_call(extraction_cmd)

    # Dropout requires NPLM to know the vocabulary index of "<null>".
    null_id = None
    if options.dropout or options.input_dropout:
        # Read the vocab as UTF-8 text.  (The previous code called
        # str.decode() on each line, which raises AttributeError on
        # Python 3; codecs.open works on both 2 and 3.)
        with codecs.open(vocab_file, encoding='utf8') as vfh:
            for i, line in enumerate(vfh):
                if line.rstrip('\n') == "<null>":
                    null_id = i
                    break
        if null_id is None:
            sys.stderr.write(
                "WARN: could not identify null token, "
                "cannot enable dropout\n")
        else:
            if not options.extra_settings:
                options.extra_settings = ""
            options.extra_settings += " --null_index %d " % null_id
            if options.dropout:
                options.extra_settings += " --dropout %s " % options.dropout
            if options.input_dropout:
                options.extra_settings += (
                    " --input_dropout %s " % options.input_dropout)

    if options.mmap:
        # Remove any stale mmap file before regenerating it.
        try:
            os.remove(os.path.join(options.working_dir, train_file))
        except OSError:
            pass  # best effort: the file may simply not exist yet
        mmap_cmd = []
        if options.train_host:
            mmap_cmd = ["ssh", options.train_host]
        mmap_cmd += [
            os.path.join(options.nplm_home, 'src', 'createMmap'),
            '--input_file',
            os.path.join(options.working_dir, numberized_file),
            '--output_file',
            os.path.join(options.working_dir, train_file)
        ]
        sys.stderr.write('creating memory-mapped file\n')
        sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n')
        ret = subprocess.call(mmap_cmd)
        if ret:
            raise Exception("creating memory-mapped file failed")

    if options.validation_corpus:
        # Numberize the validation corpus using the *training* vocabulary
        # (--words_file reads the vocab instead of writing it).
        extraction_cmd = []
        if options.train_host:
            extraction_cmd = ["ssh", options.train_host]
        extraction_cmd += [
            os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
            '--train_text', options.validation_corpus,
            '--ngramize', '1',
            '--ngram_size', str(options.ngram_size),
            '--vocab_size', str(options.vocab_size),
            '--words_file', vocab_file,
            '--train_file', os.path.join(
                options.working_dir,
                os.path.basename(options.validation_corpus) + '.numberized')
        ]

        sys.stderr.write('extracting n-grams (validation file)\n')
        sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
        ret = subprocess.call(extraction_cmd)
        if ret:
            raise Exception("preparing neural LM failed")
        # NOTE(review): the stem deliberately omits '.numberized';
        # presumably train_nplm appends the suffix itself — confirm.
        options.validation_file = os.path.join(
            options.working_dir, os.path.basename(options.validation_corpus))
    else:
        options.validation_file = None

    # NPLM uses one shared vocabulary for the input and output layers.
    options.input_words_file = vocab_file
    options.output_words_file = vocab_file
    options.input_vocab_size = options.vocab_size
    options.output_vocab_size = options.vocab_size

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    # Post-process the trained model with averageNullEmbedding.  Prefer
    # the '.best' model if train_nplm wrote one, otherwise fall back to
    # the model from the final epoch.
    sys.stderr.write('averaging null words\n')
    output_model_file = os.path.join(
        options.output_dir,
        options.output_model + '.model.nplm.best')
    if not os.path.exists(output_model_file):
        output_model_file = os.path.join(
            options.output_dir,
            options.output_model + '.model.nplm.' + str(options.epochs))
    average_options = averageNullEmbedding.parser.parse_args([
        '-i', output_model_file,
        '-o', os.path.join(
            options.output_dir, options.output_model + '.model.nplm'),
        '-t', os.path.join(options.working_dir, numberized_file),
        '-p', os.path.join(options.nplm_home, 'python'),
    ])
    averageNullEmbedding.main(average_options)
|
|
|
|
|
if __name__ == "__main__":
    if sys.version_info < (3, 0):
        # Python 2: wrap the standard streams so unicode text is
        # transparently encoded/decoded as UTF-8.
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    # Parse sys.argv exactly once (the previous code re-parsed it three
    # times); unknown arguments are tolerated with a warning.
    options, unknown_args = parser.parse_known_args()
    if unknown_args:
        sys.stderr.write(
            "Warning: unknown arguments: {0}\n".format(unknown_args))
    main(options)
|
|