# NMTKD/translation/OpenNMT-py/tools/extract_embeddings.py
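"""Extract word embeddings from a trained OpenNMT-py model checkpoint.

The script loads the checkpoint, rebuilds the model, and writes the encoder
(source) and decoder (target) embedding tables to src_embeddings.txt and
tgt_embeddings.txt in the chosen output directory, one token per line.

Example invocation (the checkpoint path is illustrative):

    python tools/extract_embeddings.py -model model_step_10000.pt \
        -output_dir embeddings -gpu 0
"""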
import argparse
import torch
import onmt
import onmt.model_builder
from onmt.utils.parse import ArgumentParser
import onmt.opts
from onmt.utils.misc import use_gpu
from onmt.utils.logging import init_logger, logger

parser = argparse.ArgumentParser(description='extract_embeddings.py')
parser.add_argument('-model', required=True,
                    help='Path to model .pt file')
parser.add_argument('-output_dir', default='.',
                    help='Path to output the embeddings')
parser.add_argument('-gpu', type=int, default=-1,
                    help='Device to run on')

def write_embeddings(filename, vocab, embeddings):
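    """Write one line per token: the token followed by its embedding values,
    separated by spaces."""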
    with open(filename, 'wb') as file:
        for i in range(min(len(embeddings), len(vocab.itos))):
            line = vocab.itos[i].encode("utf-8")
            for j in range(len(embeddings[0])):
                line = line + (" %5f" % (embeddings[i][j])).encode("utf-8")
            file.write(line + b"\n")

def main():
    # Dummy parser used only to collect the default model options.
    dummy_parser = argparse.ArgumentParser(description='train.py')
    onmt.opts.model_opts(dummy_parser)
    dummy_opt = dummy_parser.parse_known_args([])[0]

    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    # Load the checkpoint on the CPU; the model is moved to the GPU later
    # if -gpu is set.
    checkpoint = torch.load(opt.model,
                            map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']

    fields = checkpoint['vocab']
    src_dict = fields['src'].base_field.vocab  # assumes src is text
    tgt_dict = fields['tgt'].base_field.vocab

    # Add in default model arguments, possibly added since training.
    for arg in dummy_opt.__dict__:
        if arg not in model_opt:
            model_opt.__dict__[arg] = dummy_opt.__dict__[arg]

    # build_base_model expects updated and validated opts.
    ArgumentParser.update_model_opts(model_opt)
    ArgumentParser.validate_model_opts(model_opt)

    model = onmt.model_builder.build_base_model(
        model_opt, fields, use_gpu(opt), checkpoint)
    encoder = model.encoder  # no encoder for LM task
    decoder = model.decoder

    encoder_embeddings = encoder.embeddings.word_lut.weight.data.tolist()
    decoder_embeddings = decoder.embeddings.word_lut.weight.data.tolist()

logger.info("Writing source embeddings")
write_embeddings(opt.output_dir + "/src_embeddings.txt", src_dict,
encoder_embeddings)
logger.info("Writing target embeddings")
write_embeddings(opt.output_dir + "/tgt_embeddings.txt", tgt_dict,
decoder_embeddings)
logger.info('... done.')


if __name__ == "__main__":
    init_logger('extract_embeddings.log')
    main()