'''
Unsupervised fine-tuning of UniRep on evolutionary data, "evo-tuning"
'''
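# Example invocation (paths are illustrative, not fixed by this script):
#   python unirep_evotune.py alignments/family.a2m weights/unirep/evotuned \
#       --initial_weights_dir weights/unirep/global --num_steps 10000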
import argparse
import os
import pathlib
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from unirep import babbler1900 as babbler
import utils
#os.environ["CUDA_VISIBLE_DEVICES"] = ''
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('seqs_fasta_path', type=pathlib.Path)
    parser.add_argument('save_weights_dir', type=pathlib.Path)
    parser.add_argument('--initial_weights_dir', type=pathlib.Path,
                        default="weights/unirep/global")
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--max_seq_len', type=int, default=500)
    parser.add_argument('--num_steps', type=int, default=10000)
    parser.add_argument('--learning_rate', type=float, default=0.00001)
    args = parser.parse_args()

    # Set seeds
    tf.set_random_seed(0)
    np.random.seed(0)

    print("Num GPUs Available: ",
          len(tf.config.experimental.list_physical_devices('GPU')))
    # Load the pre-trained model. The global UniRep weights can be synced from
    # the public S3 bucket with:
    #   aws s3 sync --no-sign-request --quiet s3://unirep-public/1900_weights/ 1900_weights/
    # Instantiate the mLSTM babbler, pointing it at the stored weights.
    b = babbler(batch_size=args.batch_size, model_path=args.initial_weights_dir)
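    # (babbler1900 is UniRep's 1900-hidden-unit mLSTM; model_path is expected
    # to be a directory containing its weight files, the global UniRep weights
    # by default.)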
    # Load seqs from fasta.
    seqs_all = utils.read_fasta(args.seqs_fasta_path)
    seqs = dict()
    seqs['train'], seqs['val'] = train_test_split(seqs_all, test_size=0.2)
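    # Hold out 20% of the evolutionary sequences purely to monitor held-out
    # loss during tuning; no early stopping is applied below.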
    bucket_ops = {
        'train': None,
        'val': None,
    }
    for mode in ['train', 'val']:
        prefix = str(args.seqs_fasta_path).replace('.a2m', '')
        formatted_seqs_path = prefix + f'_{mode}_formatted.txt'
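        # The .a2m alignment marks gaps/insertions with '-' and '.', which are
        # not in UniRep's amino-acid vocabulary; they are mapped to the
        # unknown-residue token 'X' below before formatting.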
        with open(formatted_seqs_path, "w") as destination:
            for seq in seqs[mode]:
                seq = seq.upper().replace('-', 'X')
                seq = seq.replace('.', 'X')
                if b.is_valid_seq(seq, max_len=args.max_seq_len):
                    formatted = ",".join(map(str, b.format_seq(seq)))
                    destination.write(formatted)
                    destination.write('\n')
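        # bucket_batch_pad (from the UniRep babbler) builds a queue-based input
        # op over the formatted file: sequences are grouped into length buckets
        # between `lower` and `upper` at `interval` granularity and padded
        # within each bucket, so each sess.run() on the op yields one padded
        # integer batch. (Description based on the UniRep reference code.)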
        bucket_ops[mode] = b.bucket_batch_pad(
            formatted_seqs_path, lower=100, upper=args.max_seq_len, interval=50)
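    # get_babbler_ops exposes the pieces needed for tuning: logits, the
    # sequence cross-entropy loss, and placeholders for inputs, targets, batch
    # size, and the initial mLSTM state. minimize() without a var_list
    # fine-tunes every trainable variable against that loss.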
    logits, seqloss, x_ph, y_ph, batch_size_ph, initial_state_ph = (
        b.get_babbler_ops())
    optimizer = tf.train.AdamOptimizer(args.learning_rate)
    tuning_op = optimizer.minimize(seqloss)
    args.save_weights_dir.mkdir(parents=True, exist_ok=True)

    train_loss = np.zeros(args.num_steps)
    val_loss = np.zeros(args.num_steps)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.graph.finalize()
        for i in range(args.num_steps):
            print(f"Step {i}")
            batch_train = sess.run(bucket_ops['train'])
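            # Teacher forcing / next-token objective: inputs are the batch
            # tokens minus the final position and targets are the same tokens
            # shifted left by one, so seqloss measures next-amino-acid
            # prediction. The mLSTM state is reset to zero for every batch.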
            train_loss[i], _ = sess.run(
                [seqloss, tuning_op],
                feed_dict={
                    x_ph: batch_train[:, :-1],
                    y_ph: batch_train[:, 1:],
                    batch_size_ph: args.batch_size,
                    initial_state_ph: b._zero_state,
                },
            )
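            # Evaluate one held-out batch without the tuning op, so the
            # validation data never updates the weights.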
            batch_val = sess.run(bucket_ops['val'])
            val_loss[i] = sess.run(
                seqloss,
                feed_dict={
                    x_ph: batch_val[:, :-1],
                    y_ph: batch_val[:, 1:],
                    batch_size_ph: args.batch_size,
                    initial_state_ph: b._zero_state,
                },
            )
            print("Step {0}: {1} (train), {2} (val)".format(
                i, train_loss[i], val_loss[i]))
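            # Checkpoint every 1000 steps into a '_<N>k' subdirectory of
            # save_weights_dir. Note that np.savetxt writes plain text even
            # though the trajectory files carry a .npy extension.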
            # Save periodically
            if i % 1000 == 0 and i > 0:
                suffix = f'_{int(i / 1000)}k'
                savedir = os.path.join(args.save_weights_dir, suffix)
                pathlib.Path(savedir).mkdir(exist_ok=True)
                # Save weights
                b.dump_weights(sess, dir_name=savedir)
                # Save loss trajectories
                np.savetxt(
                    os.path.join(args.save_weights_dir, 'loss_trajectory_train.npy'),
                    train_loss)
                np.savetxt(
                    os.path.join(args.save_weights_dir, 'loss_trajectory_val.npy'),
                    val_loss)
        # Save final weights
        b.dump_weights(sess, dir_name=args.save_weights_dir)
        # Save loss trajectories
        np.savetxt(
            os.path.join(args.save_weights_dir, 'loss_trajectory_train.npy'),
            train_loss)
        np.savetxt(
            os.path.join(args.save_weights_dir, 'loss_trajectory_val.npy'),
            val_loss)
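        # Downstream scripts can reload the evo-tuned model the same way the
        # global weights are loaded above, e.g. (illustrative):
        #   b_tuned = babbler(batch_size=..., model_path=args.save_weights_dir)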


if __name__ == "__main__":
    main()