"""Generate samples from the MaskGAN. |
|
|
|
Launch command: |
|
python generate_samples.py |
|
--data_dir=/tmp/data/imdb --data_set=imdb |
|
--batch_size=256 --sequence_length=20 --base_directory=/tmp/imdb |
|
--hparams="gen_rnn_size=650,dis_rnn_size=650,gen_num_layers=2, |
|
gen_vd_keep_prob=1.0" --generator_model=seq2seq_vd |
|
--discriminator_model=seq2seq_vd --is_present_rate=0.5 |
|
--maskgan_ckpt=/tmp/model.ckpt-45494 |
|
--seq2seq_share_embedding=True --dis_share_embedding=True |
|
--attention_option=luong --mask_strategy=contiguous --baseline_method=critic |
|
--number_epochs=4 |
|
""" |
|
|
|
from __future__ import absolute_import |
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
from functools import partial |
|
import os |
|
|
|
|
|
import numpy as np |
|
from six.moves import xrange |
|
import tensorflow as tf |
|
|
|
import train_mask_gan |
|
from data import imdb_loader |
|
from data import ptb_loader |
|
|
|
|
|
from model_utils import helper |
|
from model_utils import model_utils |
|
|
|
SAMPLE_TRAIN = 'TRAIN'
SAMPLE_VALIDATION = 'VALIDATION'

tf.app.flags.DEFINE_enum('sample_mode', 'TRAIN',
                         [SAMPLE_TRAIN, SAMPLE_VALIDATION],
                         'Dataset to sample from.')
tf.app.flags.DEFINE_string('output_path', '/tmp', 'Model output directory.')
tf.app.flags.DEFINE_boolean(
    'output_masked_logs', False,
    'Whether to show the masking (imputed words are prefixed with *) for '
    'human evaluation.')
tf.app.flags.DEFINE_integer('number_epochs', 1,
                            'The number of epochs to produce.')

FLAGS = tf.app.flags.FLAGS
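
# NOTE: Additional flags used in this file (e.g. --data_dir, --data_set,
# --batch_size, --sequence_length, --is_present_rate, --master, --task) are
# not defined here; they come from the imported modules, chiefly
# train_mask_gan and the data loaders.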
|
|
def get_iterator(data):
  """Return the data iterator."""
  if FLAGS.data_set == 'ptb':
    iterator = ptb_loader.ptb_iterator(data, FLAGS.batch_size,
                                       FLAGS.sequence_length,
                                       FLAGS.epoch_size_override)
  elif FLAGS.data_set == 'imdb':
    iterator = imdb_loader.imdb_iterator(data, FLAGS.batch_size,
                                         FLAGS.sequence_length)
  else:
    raise NotImplementedError('Unknown data_set: %s.' % FLAGS.data_set)
  return iterator
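
# Worked example of the masking notation produced below (hypothetical
# vocabulary): with id_to_word = {7: 'the', 9: 'movie'},
# arr = np.array([[7, 9]]) and p = np.array([[1, 0]]),
# convert_to_human_readable(id_to_word, arr, p, 1) returns ['the *movie'];
# the '*' prefix marks a position that was masked out (p == 0) and imputed
# by the generator.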
|
|
def convert_to_human_readable(id_to_word, arr, p, max_num_to_print):
  """Convert a np.array of indices into words using the id_to_word dictionary.

  Returns at most max_num_to_print results.
  """
  assert arr.ndim == 2

  samples = []
  for sequence_id in xrange(min(len(arr), max_num_to_print)):
    sample = []
    for i, index in enumerate(arr[sequence_id, :]):
      if p[sequence_id, i] == 1:
        sample.append(str(id_to_word[index]))
      else:
        # Mark words that were masked out (not present) with a '*' prefix.
        sample.append('*' + str(id_to_word[index]))
    buffer_str = ' '.join(sample)
    samples.append(buffer_str)
  return samples
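
# The two log writers below differ only in whether the mask is shown:
# write_masked_log marks imputed words with '*' via the local
# convert_to_human_readable, while write_unmasked_log delegates to
# helper.convert_to_human_readable and writes the plain text. generate_logs
# selects between them based on --output_masked_logs.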
|
|
def write_unmasked_log(log, id_to_word, sequence_eval):
  """Helper function for logging evaluated sequences without mask."""
  indices_arr = np.asarray(sequence_eval)
  samples = helper.convert_to_human_readable(id_to_word, indices_arr,
                                             FLAGS.batch_size)
  for sample in samples:
    log.write(sample + '\n')
  log.flush()
  return samples
|
|
def write_masked_log(log, id_to_word, sequence_eval, present_eval):
  """Helper function for logging evaluated sequences with the mask shown."""
  indices_arr = np.asarray(sequence_eval)
  samples = convert_to_human_readable(id_to_word, indices_arr, present_eval,
                                      FLAGS.batch_size)
  for sample in samples:
    log.write(sample + '\n')
  log.flush()
  return samples
|
|
def generate_logs(sess, model, log, id_to_word, feed):
  """Impute sequences using the model for a particular feed and write them
  to the logs.
  """
  # Impute the sequences.
  [p, inputs_eval, sequence_eval] = sess.run(
      [model.present, model.inputs, model.fake_sequence], feed_dict=feed)

  # Prepend the first input token, which is not part of the generated
  # sequence, so the logged text starts from the true beginning.
  first_token = np.expand_dims(inputs_eval[:, 0], axis=1)
  sequence_eval = np.concatenate((first_token, sequence_eval), axis=1)

  # Extend the mask with a column of ones: the prepended first token is
  # always present, never imputed.
  p = np.concatenate((np.ones((FLAGS.batch_size, 1)), p), axis=1)

  if FLAGS.output_masked_logs:
    samples = write_masked_log(log, id_to_word, sequence_eval, p)
  else:
    samples = write_unmasked_log(log, id_to_word, sequence_eval)
  return samples
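
# generate_samples builds the eval-mode MaskGAN graph and relies on the
# Supervisor's init_fn (assembled from model_utils.retrieve_init_savers) to
# restore pretrained weights, e.g. the checkpoint passed via --maskgan_ckpt
# in the launch command above.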
|
|
def generate_samples(hparams, data, id_to_word, log_dir, output_file):
  """Generate samples.

  Args:
    hparams: Hyperparameters for the MaskGAN.
    data: Data to evaluate.
    id_to_word: Dictionary of indices to words.
    log_dir: Log directory.
    output_file: Output file for the samples.
  """
  is_training = False

  # Fix the random seed for reproducible sampling.
  np.random.seed(0)

  with tf.Graph().as_default():
    # Construct the model.
    model = train_mask_gan.create_MaskGAN(hparams, is_training)

    # Retrieve the initialization savers and build the init function.
    init_savers = model_utils.retrieve_init_savers(hparams)
    init_fn = partial(model_utils.init_fn, init_savers)

    is_chief = FLAGS.task == 0

    # The Supervisor handles initialization, checkpointing and recovery.
    sv = tf.train.Supervisor(
        logdir=log_dir,
        is_chief=is_chief,
        saver=model.saver,
        global_step=model.global_step,
        recovery_wait_secs=30,
        summary_op=None,
        init_fn=init_fn)

    with sv.managed_session(
        FLAGS.master, start_standard_services=False) as sess:

      # Fetch the initial RNN states once; they are threaded through the
      # epoch so the generator stays stateful across batches.
      [gen_initial_state_eval, fake_gen_initial_state_eval] = sess.run(
          [model.eval_initial_state, model.fake_gen_initial_state])

      for n in xrange(FLAGS.number_epochs):
        print('Epoch number: %d' % n)

        iterator = get_iterator(data)
        for x, y, _ in iterator:
          if FLAGS.eval_language_model:
            is_present_rate = 0.
          else:
            is_present_rate = FLAGS.is_present_rate
          tf.logging.info(
              'Evaluating on is_present_rate=%.3f.' % is_present_rate)

          model_utils.assign_percent_real(sess, model.percent_real_update,
                                          model.new_rate, is_present_rate)

          # Generate the mask of which tokens are present vs. imputed.
          p = model_utils.generate_mask()

          eval_feed = {model.inputs: x, model.targets: y, model.present: p}

          if FLAGS.data_set == 'ptb':
            # PTB batches are contiguous text, so carry the recurrent state
            # across batches for both the evaluation and fake generators.
            for i, (c, h) in enumerate(model.eval_initial_state):
              eval_feed[c] = gen_initial_state_eval[i].c
              eval_feed[h] = gen_initial_state_eval[i].h

            for i, (c, h) in enumerate(model.fake_gen_initial_state):
              eval_feed[c] = fake_gen_initial_state_eval[i].c
              eval_feed[h] = fake_gen_initial_state_eval[i].h

          [gen_initial_state_eval, fake_gen_initial_state_eval, _] = sess.run(
              [
                  model.eval_final_state, model.fake_gen_final_state,
                  model.global_step
              ],
              feed_dict=eval_feed)

          generate_logs(sess, model, output_file, id_to_word, eval_feed)

      output_file.close()
      print('Closing output_file.')
      return
|
|
def main(_):
  hparams = train_mask_gan.create_hparams()
  log_dir = FLAGS.base_directory

  tf.gfile.MakeDirs(FLAGS.output_path)
  output_file = tf.gfile.GFile(
      os.path.join(FLAGS.output_path, 'reviews.txt'), mode='w')

  # Load the data.
  if FLAGS.data_set == 'ptb':
    raw_data = ptb_loader.ptb_raw_data(FLAGS.data_dir)
    train_data, valid_data, _, _ = raw_data
  elif FLAGS.data_set == 'imdb':
    raw_data = imdb_loader.imdb_raw_data(FLAGS.data_dir)
    train_data, valid_data = raw_data
  else:
    raise NotImplementedError

  # Select the split to sample from.
  if FLAGS.sample_mode == SAMPLE_TRAIN:
    data_set = train_data
  elif FLAGS.sample_mode == SAMPLE_VALIDATION:
    data_set = valid_data
  else:
    raise NotImplementedError

  # Build the vocabulary and invert it to an id -> word mapping.
  if FLAGS.data_set == 'ptb':
    word_to_id = ptb_loader.build_vocab(
        os.path.join(FLAGS.data_dir, 'ptb.train.txt'))
  elif FLAGS.data_set == 'imdb':
    word_to_id = imdb_loader.build_vocab(
        os.path.join(FLAGS.data_dir, 'vocab.txt'))
  id_to_word = {v: k for k, v in word_to_id.items()}

  FLAGS.vocab_size = len(id_to_word)
  print('Vocab size: %d' % FLAGS.vocab_size)

  generate_samples(hparams, data_set, id_to_word, log_dir, output_file)
|
|
if __name__ == '__main__':
  tf.app.run()