# BioM3 / Stage1_source/PL_wrapper.py
# Author: Niksa Praljak
# Commit: 0655b48 -- "BioM3-PenCL push with no weights"
# pytorch functions
import torch
from torch import nn, optim
from torch.nn import functional as F
import torch.distributed as dist
# PL functions
import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
# misc functions
import itertools
import matplotlib.pyplot as plt
import numpy as np
import sys
from tqdm import tqdm
import time
# other learning packages
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# our packages
import Stage1_source.helper_funcs as helper_tools
import Stage1_source.preprocess as prep
import Stage1_source.model as mod
######################
# Default PL wrapper #
######################
class PL_PEN_CL(pl.LightningModule):
    """Lightning wrapper for the PenCL dual encoder (text <-> protein).

    Trains a CLIP-style contrastive alignment between biomedical text
    annotations and protein sequences in a shared joint latent space.
    Joint embeddings are all-gathered across ranks so the contrastive
    loss is computed over the global batch.
    """

    def __init__(
            self,
            args: any,
            model: nn.Module,
            text_tokenizer: any,
            sequence_tokenizer: any
    ):
        super().__init__()
        # run configuration (learning rates, weight decay, ...)
        self.script_args = args
        # dual-encoder model exposing compute_loss()
        self.model = model
        # tokenizers (kept for downstream masking / decoding utilities)
        self.text_tokenizer = text_tokenizer
        self.sequence_tokenizer = sequence_tokenizer
        # validation trackers for joint latents (cleared every epoch)
        self.val_text_joint_latents = []
        self.val_seq_joint_latents = []
        # prediction trackers for joint latents
        self.predict_text_joint_latents = []
        self.predict_seq_joint_latents = []

    def forward(
            self,
            x_t: torch.Tensor,
            x_s: torch.Tensor
    ) -> (
            torch.Tensor,
            torch.Tensor
    ):
        """Embed a text batch and a sequence batch into the joint space.

        Returns:
            (z_t, z_s): joint-space latents for the text and sequence batches.
        """
        outputs = self.model(
            x_t=x_t,
            x_s=x_s
        )
        return (
            outputs['text_joint_latent'],
            outputs['seq_joint_latent'],
        )

    def training_step(
            self,
            batch: torch.Tensor,
            batch_idx: any,
    ) -> dict:
        """Run one contrastive training step on a (text, protein) batch pair."""
        if isinstance(batch, list):
            # unpack the paired modalities
            text_batch, protein_batch = batch
            # forward pass into the joint latent space
            z_t, z_s = self(
                x_t=text_batch,
                x_s=protein_batch
            )
            # synchronize ranks, then gather embeddings from every GPU so the
            # loss sees the global batch; sync_grads keeps gradients flowing
            dist.barrier()
            z_t_all = self.all_gather(z_t, sync_grads=True)
            dist.barrier()
            z_s_all = self.all_gather(z_s, sync_grads=True)
            # flatten (world_size, local_batch, dim) -> (global_batch, dim)
            z_t_all = z_t_all.view(-1, z_t.shape[-1])
            z_s_all = z_s_all.view(-1, z_s.shape[-1])
            # CLIP-style contrastive loss + pairwise similarity logits
            loss, logits = self.model.compute_loss(
                protein_embeddings=z_s_all,
                text_embeddings=z_t_all
            )
            self.log('train_loss', loss, prog_bar=True, on_step=True, on_epoch=True, sync_dist=True)
            # retrieval metrics for both directions (text->seq, seq->text)
            metric_dict = self.performance_metrics(logits=logits)
            for key, value in metric_dict.items():
                self.log('train_' + key, value, prog_bar='f1' in key, on_step=True, on_epoch=True, sync_dist=True)
            if batch_idx == 0:
                # one-off GPU memory snapshot at the start of each epoch
                gpu_memory_usage = helper_tools.print_gpu_initialization()
                self.log('gpu_memory_usage', gpu_memory_usage, sync_dist=True)
            return {'loss': loss}

    def validation_step(
            self,
            batch: list,
            batch_idx: any
    ) -> dict:
        """Run one contrastive validation step and stash the joint latents."""
        if isinstance(batch, list):
            text_batch, protein_batch = batch
            # forward pass into the joint latent space
            z_t, z_s = self(
                x_t=text_batch,
                x_s=protein_batch
            )
            # gather embeddings from all ranks and flatten to (global_batch, dim)
            dist.barrier()
            z_t_all = self.all_gather(z_t, sync_grads=True).view(-1, z_t.shape[-1])
            dist.barrier()
            z_s_all = self.all_gather(z_s, sync_grads=True).view(-1, z_s.shape[-1])
            # contrastive loss over the gathered embeddings
            loss, logits = self.model.compute_loss(
                protein_embeddings=z_s_all,
                text_embeddings=z_t_all
            )
            self.log('valid_loss', loss, prog_bar=True, sync_dist=True)
            # compute validation retrieval metrics on CPU
            metric_dict = self.performance_metrics(logits=logits.detach().cpu())
            for key, value in metric_dict.items():
                self.log('valid_' + key, value, prog_bar='f1' in key, sync_dist=True)
            # collect joint embeddings for the epoch-end collapse diagnostics
            self.val_text_joint_latents.append(z_t_all.detach().cpu())
            self.val_seq_joint_latents.append(z_s_all.detach().cpu())
            return {'valid_loss': loss}

    def on_validation_epoch_end(self):
        """Diagnose dimensionality collapse on this epoch's validation latents."""
        # aggregate latents collected by validation_step
        val_z_t_joint = torch.cat(self.val_text_joint_latents, dim=0)
        val_z_s_joint = torch.cat(self.val_seq_joint_latents, dim=0)
        # singular-value spectra of both embedding sets
        text_log_sigma_k, S_text = self.compute_singular(val_z_t_joint.detach().cpu())
        protein_log_sigma_k, S_protein = self.compute_singular(val_z_s_joint.detach().cpu())
        # persist spectrum plots to TensorBoard for collapse tracking
        self.save_png_to_tensorboard(
            data=text_log_sigma_k.numpy(),
            title='text',
        )
        self.save_png_to_tensorboard(
            data=protein_log_sigma_k.numpy(),
            title='protein'
        )
        # free memory for the next epoch
        self.val_text_joint_latents.clear()
        self.val_seq_joint_latents.clear()
        # effective rank (RankMe) of each modality's embeddings
        erank_text = self.compute_effective_rank(sigma_ks=S_text)
        erank_protein = self.compute_effective_rank(sigma_ks=S_protein)
        self.log('valid_erank_text', erank_text, sync_dist=True)
        self.log('valid_erank_protein', erank_protein, sync_dist=True)

    def configure_optimizers(self,):
        """Build AdamW with per-component learning rates (encoders vs heads)."""
        params = [
            {"params": self.model.protein_encoder.parameters(), "lr": self.script_args.protein_encoder_lr},
            {"params": self.model.text_encoder.parameters(), "lr": self.script_args.text_encoder_lr},
            {"params": itertools.chain(
                self.model.protein_projection.parameters(),
                self.model.text_projection.parameters()
            ),
                "lr": self.script_args.head_lr,
                "weight_decay": self.script_args.weight_decay}
        ]
        optimizer = torch.optim.AdamW(params, weight_decay=self.script_args.weight_decay)
        return {
            "optimizer": optimizer,
        }

    @torch.no_grad()
    def compute_class_metrics(
            self,
            outputs: torch.Tensor,
            targets: torch.Tensor,
            source: str
    ) -> dict:
        """Compute accuracy/precision/recall/F1 for one retrieval direction.

        Keys of the returned dict are prefixed with `source` (e.g. 'text_f1').
        """
        outputs_np = outputs.numpy()
        targets_np = targets.numpy()
        accuracy = accuracy_score(targets_np, outputs_np.round())
        precision = precision_score(targets_np, outputs_np.round(), average='micro')
        recall = recall_score(targets_np, outputs_np.round(), average='micro')
        f1 = f1_score(targets_np, outputs_np.round(), average='micro')
        return {
            f'{source}_accuracy': accuracy,
            f'{source}_precision': precision,
            f'{source}_recall': recall,
            f'{source}_f1': f1
        }

    @torch.no_grad()
    def performance_metrics(self, logits: torch.Tensor) -> dict:
        """Compute retrieval metrics from the (text x seq) similarity logits.

        Rows index text captions, columns index sequences; the correct match
        for row i is column i, so the targets are arange(batch).
        """
        logits = logits.cpu().float()
        # softmax over each direction of the similarity matrix
        p_text = F.softmax(logits, dim=-1)    # prob. of a text matching each sequence
        p_seq = F.softmax(logits.T, dim=-1)   # prob. of a sequence matching each text
        p_tot = (p_seq + p_text) / 2          # symmetrized probability
        # predicted matches per direction
        y_pred_text = torch.argmax(p_text, dim=-1)
        y_pred_seq = torch.argmax(p_seq, dim=-1)
        y_pred = torch.argmax(p_tot, dim=-1)
        y_true = torch.arange(y_pred_text.shape[0])
        # per-direction classification metrics
        text_metrics = self.compute_class_metrics(
            outputs=y_pred_text,
            targets=y_true,
            source='text'
        )
        seq_metrics = self.compute_class_metrics(
            outputs=y_pred_seq,
            targets=y_true,
            source='seq'
        )
        total_metrics = self.compute_class_metrics(
            outputs=y_pred,
            targets=y_true,
            source='total'
        )
        # merge all three metric dicts
        combined_dict = {}
        combined_dict.update(text_metrics)
        combined_dict.update(seq_metrics)
        combined_dict.update(total_metrics)
        return combined_dict

    @torch.no_grad()
    def compute_singular(self, inputs: torch.Tensor) -> (
            torch.Tensor,
            torch.Tensor
    ):
        """Singular-value spectrum of the batch covariance matrix.

        Used to track dimensionality collapse of the joint embeddings.
        inputs: (batch_size, emb_dim).
        Returns (log-singular-values sorted descending, raw singular values).
        """
        # center embeddings over the batch dimension
        norm_inputs = inputs - torch.mean(inputs, dim=0)
        # covariance matrix, (emb_dim, emb_dim), computed in one matmul.
        # BUGFIX: the previous loop divided by norm_vector.shape[0], which is
        # always 1 (a single row); the correct normalizer is the sample count.
        C = (norm_inputs.T @ norm_inputs) / norm_inputs.shape[0]
        _, S, _ = torch.linalg.svd(C, full_matrices=False)
        log_sigma_k, _ = torch.sort(torch.log(S), descending=True)
        return (
            log_sigma_k,
            S
        )

    def compute_effective_rank(self, sigma_ks: torch.Tensor) -> torch.Tensor:
        """Effective rank (RankMe) of a singular-value spectrum.

        references:
            - Roy et al. The effective rank: a measure of effective dimensionality
            - Garrido et al. RankMe: Assessing the Downstream Performance of
              Pretrained Self-Supervised Representations by their Rank.
        """
        # sort the singular values (descending)
        sigma_ks, _ = torch.sort(sigma_ks, descending=True)
        # L1 norm of the singular values
        l1_norm_sigma = torch.norm(sigma_ks, p=1)
        # normalized singular-value distribution (eps keeps log finite)
        p_k = sigma_ks / l1_norm_sigma + torch.finfo(torch.float).eps
        # Shannon entropy of the distribution
        entropy = - torch.sum(p_k * torch.log(p_k))
        # effective rank = exp(entropy)
        erank = torch.exp(entropy)
        return erank

    def save_png_to_tensorboard(
            self,
            data: np.single,
            title: str,
            x_axis_label: str = 'Singular Value Rank Index',
            y_axis_label: str = 'Log of singular values',
    ):
        """Render the log-singular-value spectrum and log it to TensorBoard."""
        current_epoch = self.trainer.current_epoch
        fig, ax = plt.subplots(dpi=300)
        ax.plot(data)
        ax.set_xlabel(x_axis_label)
        ax.set_ylabel(y_axis_label)
        ax.set_title(title)
        ax.set_ylim([-25, 3])
        self.logger.experiment.add_figure(f'{title}_SingularValues_{current_epoch}', fig, current_epoch)
        # BUGFIX: close the figure so matplotlib does not accumulate one
        # open figure per validation epoch (memory leak).
        plt.close(fig)

    def predict_step(
            self,
            batch: torch.Tensor,
            batch_idx: torch.Tensor,
            dataloder_idx: bool = False  # NOTE(review): Lightning passes dataloader_idx positionally; name/type kept for compatibility
    ) -> (
            torch.Tensor,
            torch.Tensor
    ):
        """Embed a batch and accumulate the joint latents for epoch-end concat."""
        if isinstance(batch, list):
            text_batch, protein_batch = batch
            outputs = self(
                x_t=text_batch,
                x_s=protein_batch,
            )
            z_t_joint, z_p_joint = outputs
            self.predict_text_joint_latents.append(z_t_joint.detach().cpu())
            self.predict_seq_joint_latents.append(z_p_joint.detach().cpu())
            return outputs

    def on_predict_epoch_end(self, outputs=None):
        """Concatenate accumulated prediction latents into single tensors.

        NOTE(review): this rebinds the list attributes to tensors, so a second
        predict run on the same module instance would fail to append.
        """
        self.predict_text_joint_latents = torch.cat(self.predict_text_joint_latents).cpu()
        self.predict_seq_joint_latents = torch.cat(self.predict_seq_joint_latents).cpu()
##########################
# Masked-task PL wrapper #
##########################
class mask_PL_PEN_CL(pl.LightningModule):
    """Lightning wrapper for PenCL with auxiliary masked-language objectives.

    Extends the contrastive text <-> protein alignment with BERT-style
    masked-LM losses for both the biomedical-text expert and the protein
    expert; the total loss is alignment + text MLM + sequence MLM.
    """

    def __init__(
            self,
            args: any,
            model: nn.Module,
            text_tokenizer: any,
            sequence_tokenizer: any
    ):
        super().__init__()
        # run configuration (learning rates, weight decay, ...)
        self.script_args = args
        # dual-encoder model exposing compute_loss() and compute_masked_lang_loss()
        self.model = model
        # tokenizers provide the mask-token ids for the MLM losses
        self.text_tokenizer = text_tokenizer
        self.sequence_tokenizer = sequence_tokenizer
        # validation trackers for joint latents (cleared every epoch)
        self.val_text_joint_latents = []
        self.val_seq_joint_latents = []
        # prediction trackers for joint latents
        self.predict_text_joint_latents = []
        self.predict_seq_joint_latents = []

    def forward(
            self,
            x_t: torch.Tensor,
            x_s: torch.Tensor,
            compute_masked_logits: bool = False
    ) -> (
            torch.Tensor,
            torch.Tensor
    ):
        """Run the dual encoder.

        With compute_masked_logits=True, return masked-LM logits for both
        modalities; otherwise return joint-space latents (z_t, z_s).
        """
        outputs = self.model(
            x_t=x_t,
            x_s=x_s,
            compute_masked_logits=compute_masked_logits
        )
        if compute_masked_logits:
            # logits for the masked-language objectives
            return (
                outputs['text_masked_logits'],
                outputs['protein_masked_logits']
            )
        else:
            # latent embeddings in the joint space
            return (
                outputs['text_joint_latent'],
                outputs['seq_joint_latent'],
            )

    def training_step(
            self,
            batch: torch.Tensor,
            batch_idx: any,
    ) -> dict:
        """One step of contrastive + masked-LM training."""
        if isinstance(batch, list):
            # unpack unmasked inputs and their masked counterparts
            text_batch, protein_batch, text_mask_batch, protein_mask_batch = batch
            # joint-space forward pass on the unmasked inputs
            z_t, z_s = self(
                x_t=text_batch,
                x_s=protein_batch,
                compute_masked_logits=False
            )
            # gather embeddings from all ranks (gradients preserved)
            dist.barrier()
            z_t_all = self.all_gather(z_t, sync_grads=True)
            dist.barrier()
            z_s_all = self.all_gather(z_s, sync_grads=True)
            # flatten (world_size, local_batch, dim) -> (global_batch, dim)
            z_t_all = z_t_all.view(-1, z_t.shape[-1])
            z_s_all = z_s_all.view(-1, z_s.shape[-1])
            # contrastive alignment loss over the global batch
            loss_align, logits = self.model.compute_loss(
                protein_embeddings=z_s_all,
                text_embeddings=z_t_all
            )
            # masked-LM logits for both modalities
            logits_t_mask, logits_s_mask = self(
                x_t=text_mask_batch,
                x_s=protein_mask_batch,
                compute_masked_logits=True
            )
            # masked-LM loss for the biomedical-text expert
            loss_text_mask = self.model.compute_masked_lang_loss(
                logits_masked=logits_t_mask,
                targets=text_batch,
                targets_masked=text_mask_batch,
                mask_token_id=self.text_tokenizer.mask_token_id
            )
            # masked-LM loss for the protein expert
            loss_sequence_mask = self.model.compute_masked_lang_loss(
                logits_masked=logits_s_mask,
                targets=protein_batch,
                targets_masked=protein_mask_batch,
                mask_token_id=self.sequence_tokenizer.mask_idx
            )
            # total objective
            loss = loss_align + loss_text_mask + loss_sequence_mask
            # log individual and total losses
            self.log('train_loss', loss, prog_bar=True, on_step=True, on_epoch=True, sync_dist=True)
            self.log('train_loss_align', loss_align, prog_bar=True, on_step=True, on_epoch=True, sync_dist=True)
            self.log('train_loss_text_mask', loss_text_mask, prog_bar=False, on_step=True, on_epoch=True, sync_dist=True)
            self.log('train_loss_seq_mask', loss_sequence_mask, prog_bar=False, on_step=True, on_epoch=True, sync_dist=True)
            # retrieval metrics from the alignment logits
            metric_dict = self.performance_metrics(logits=logits)
            for key, value in metric_dict.items():
                self.log('train_' + key, value, prog_bar='f1' in key, on_step=True, on_epoch=True, sync_dist=True)
            if batch_idx == 0:
                # one-off GPU memory snapshot at the start of each epoch
                gpu_memory_usage = helper_tools.print_gpu_initialization()
                self.log('gpu_memory_usage', gpu_memory_usage, sync_dist=True)
            return {'loss': loss}

    def validation_step(
            self,
            batch: list,
            batch_idx: any
    ) -> dict:
        """One step of contrastive + masked-LM validation."""
        if isinstance(batch, list):
            text_batch, protein_batch, text_mask_batch, protein_mask_batch = batch
            # joint-space forward pass (compute_masked_logits defaults to False)
            z_t, z_s = self(
                x_t=text_batch,
                x_s=protein_batch
            )
            # gather embeddings from all ranks and flatten
            dist.barrier()
            z_t_all = self.all_gather(z_t, sync_grads=True).view(-1, z_t.shape[-1])
            dist.barrier()
            z_s_all = self.all_gather(z_s, sync_grads=True).view(-1, z_s.shape[-1])
            # contrastive alignment loss
            loss_align, logits = self.model.compute_loss(
                protein_embeddings=z_s_all,
                text_embeddings=z_t_all
            )
            # masked-LM logits and losses for both modalities
            logits_t_mask, logits_s_mask = self(
                x_t=text_mask_batch,
                x_s=protein_mask_batch,
                compute_masked_logits=True
            )
            loss_text_mask = self.model.compute_masked_lang_loss(
                logits_masked=logits_t_mask,
                targets=text_batch,
                targets_masked=text_mask_batch,
                mask_token_id=self.text_tokenizer.mask_token_id
            )
            loss_sequence_mask = self.model.compute_masked_lang_loss(
                logits_masked=logits_s_mask,
                targets=protein_batch,
                targets_masked=protein_mask_batch,
                mask_token_id=self.sequence_tokenizer.mask_idx
            )
            # total objective
            loss = loss_align + loss_text_mask + loss_sequence_mask
            # log individual and total losses
            self.log('valid_loss', loss, prog_bar=True, sync_dist=True)
            self.log('valid_loss_align', loss_align, prog_bar=True, sync_dist=True)
            self.log('valid_loss_text_mask', loss_text_mask, prog_bar=False, sync_dist=True)
            self.log('valid_loss_seq_mask', loss_sequence_mask, prog_bar=False, sync_dist=True)
            # compute validation retrieval metrics on CPU
            metric_dict = self.performance_metrics(logits=logits.detach().cpu())
            for key, value in metric_dict.items():
                self.log('valid_' + key, value, prog_bar='f1' in key, sync_dist=True)
            # collect joint embeddings (cleared at epoch end)
            self.val_text_joint_latents.append(z_t_all.detach().cpu())
            self.val_seq_joint_latents.append(z_s_all.detach().cpu())
            return {'valid_loss': loss}

    def on_validation_epoch_end(self):
        """Release the latents collected during validation.

        The singular-value / effective-rank collapse diagnostics present in
        PL_PEN_CL are intentionally disabled for this wrapper; only the
        memory cleanup remains.
        """
        self.val_text_joint_latents.clear()
        self.val_seq_joint_latents.clear()

    def configure_optimizers(self,):
        """Build AdamW with per-component learning rates (encoders vs heads)."""
        params = [
            {"params": self.model.protein_encoder.parameters(), "lr": self.script_args.protein_encoder_lr},
            {"params": self.model.text_encoder.parameters(), "lr": self.script_args.text_encoder_lr},
            {"params": itertools.chain(
                self.model.protein_projection.parameters(),
                self.model.text_projection.parameters()
            ),
                "lr": self.script_args.head_lr,
                "weight_decay": self.script_args.weight_decay}
        ]
        optimizer = torch.optim.AdamW(params, weight_decay=self.script_args.weight_decay)
        return {
            "optimizer": optimizer,
        }

    @torch.no_grad()
    def compute_class_metrics(
            self,
            outputs: torch.Tensor,
            targets: torch.Tensor,
            source: str
    ) -> dict:
        """Compute accuracy/precision/recall/F1 for one retrieval direction.

        Keys of the returned dict are prefixed with `source` (e.g. 'seq_f1').
        """
        outputs_np = outputs.numpy()
        targets_np = targets.numpy()
        accuracy = accuracy_score(targets_np, outputs_np.round())
        precision = precision_score(targets_np, outputs_np.round(), average='micro')
        recall = recall_score(targets_np, outputs_np.round(), average='micro')
        f1 = f1_score(targets_np, outputs_np.round(), average='micro')
        return {
            f'{source}_accuracy': accuracy,
            f'{source}_precision': precision,
            f'{source}_recall': recall,
            f'{source}_f1': f1
        }

    @torch.no_grad()
    def performance_metrics(self, logits: torch.Tensor) -> dict:
        """Compute retrieval metrics from the (text x seq) similarity logits.

        The correct match for row i is column i, so targets are arange(batch).
        """
        logits = logits.cpu().float()
        # softmax over each direction of the similarity matrix
        p_text = F.softmax(logits, dim=-1)    # prob. of a text matching each sequence
        p_seq = F.softmax(logits.T, dim=-1)   # prob. of a sequence matching each text
        p_tot = (p_seq + p_text) / 2          # symmetrized probability
        # predicted matches per direction
        y_pred_text = torch.argmax(p_text, dim=-1)
        y_pred_seq = torch.argmax(p_seq, dim=-1)
        y_pred = torch.argmax(p_tot, dim=-1)
        y_true = torch.arange(y_pred_text.shape[0])
        # per-direction classification metrics
        text_metrics = self.compute_class_metrics(
            outputs=y_pred_text,
            targets=y_true,
            source='text'
        )
        seq_metrics = self.compute_class_metrics(
            outputs=y_pred_seq,
            targets=y_true,
            source='seq'
        )
        total_metrics = self.compute_class_metrics(
            outputs=y_pred,
            targets=y_true,
            source='total'
        )
        # merge all three metric dicts
        combined_dict = {}
        combined_dict.update(text_metrics)
        combined_dict.update(seq_metrics)
        combined_dict.update(total_metrics)
        return combined_dict

    @torch.no_grad()
    def compute_singular(self, inputs: torch.Tensor) -> (
            torch.Tensor,
            torch.Tensor
    ):
        """Singular-value spectrum of the batch covariance matrix.

        Used to track dimensionality collapse of the joint embeddings.
        inputs: (batch_size, emb_dim).
        Returns (log-singular-values sorted descending, raw singular values).
        """
        # center embeddings over the batch dimension
        norm_inputs = inputs - torch.mean(inputs, dim=0)
        # covariance matrix, (emb_dim, emb_dim), in one matmul.
        # BUGFIX: the previous loop divided by norm_vector.shape[0], which is
        # always 1 (a single row); the correct normalizer is the sample count.
        C = (norm_inputs.T @ norm_inputs) / norm_inputs.shape[0]
        _, S, _ = torch.linalg.svd(C, full_matrices=False)
        log_sigma_k, _ = torch.sort(torch.log(S), descending=True)
        return (
            log_sigma_k,
            S
        )

    def compute_effective_rank(self, sigma_ks: torch.Tensor) -> torch.Tensor:
        """Effective rank (RankMe) of a singular-value spectrum.

        references:
            - Roy et al. The effective rank: a measure of effective dimensionality
            - Garrido et al. RankMe: Assessing the Downstream Performance of
              Pretrained Self-Supervised Representations by their Rank.
        """
        # sort the singular values (descending)
        sigma_ks, _ = torch.sort(sigma_ks, descending=True)
        # L1 norm of the singular values
        l1_norm_sigma = torch.norm(sigma_ks, p=1)
        # normalized singular-value distribution (eps keeps log finite)
        p_k = sigma_ks / l1_norm_sigma + torch.finfo(torch.float).eps
        # Shannon entropy of the distribution
        entropy = - torch.sum(p_k * torch.log(p_k))
        # effective rank = exp(entropy)
        erank = torch.exp(entropy)
        return erank

    def save_png_to_tensorboard(
            self,
            data: np.single,
            title: str,
            x_axis_label: str = 'Singular Value Rank Index',
            y_axis_label: str = 'Log of singular values',
    ):
        """Render the log-singular-value spectrum and log it to TensorBoard."""
        current_epoch = self.trainer.current_epoch
        fig, ax = plt.subplots(dpi=300)
        ax.plot(data)
        ax.set_xlabel(x_axis_label)
        ax.set_ylabel(y_axis_label)
        ax.set_title(title)
        ax.set_ylim([-25, 3])
        self.logger.experiment.add_figure(f'{title}_SingularValues_{current_epoch}', fig, current_epoch)
        # BUGFIX: close the figure so matplotlib does not accumulate one
        # open figure per call (memory leak).
        plt.close(fig)

    def predict_step(
            self,
            batch: torch.Tensor,
            batch_idx: torch.Tensor,
            dataloder_idx: bool = False  # NOTE(review): Lightning passes dataloader_idx positionally; name/type kept for compatibility
    ) -> (
            torch.Tensor,
            torch.Tensor
    ):
        """Embed a batch and accumulate the joint latents for epoch-end concat."""
        if isinstance(batch, list):
            text_batch, protein_batch = batch
            outputs = self(
                x_t=text_batch,
                x_s=protein_batch,
                compute_masked_logits=False
            )
            z_t_joint, z_p_joint = outputs
            self.predict_text_joint_latents.append(z_t_joint.detach().cpu())
            self.predict_seq_joint_latents.append(z_p_joint.detach().cpu())
            return outputs

    def on_predict_epoch_end(self, outputs=None):
        """Concatenate accumulated prediction latents into single tensors.

        NOTE(review): this rebinds the list attributes to tensors, so a second
        predict run on the same module instance would fail to append.
        """
        self.predict_text_joint_latents = torch.cat(self.predict_text_joint_latents).cpu()
        self.predict_seq_joint_latents = torch.cat(self.predict_seq_joint_latents).cpu()
########################
# Pfam-task PL wrapper #
########################
class pfam_PL_PEN_CL(pl.LightningModule):
def __init__(
self,
args: any,
model: nn.Module,
text_tokenizer: any,
sequence_tokenizer: any
):
super().__init__()
# arguments
self.script_args = args
# model components
self.model = model
# tokenizers
self.text_tokenizer = text_tokenizer
self.sequence_tokenizer = sequence_tokenizer
# validation tracker for outputs
self.val_text_joint_latents = []
self.val_seq_joint_latents = []
# predictions...
self.predict_text_joint_latents = []
self.predict_seq_joint_latents = []
def forward(
self,
x_t: torch.Tensor,
x_p: torch.Tensor,
compute_masked_logits: bool=False
) -> (
torch.Tensor,
torch.Tensor,
torch.Tensor
):
outputs = self.model(
x_t=x_t,
x_s=x_p,
compute_masked_logits=compute_masked_logits
)
if compute_masked_logits:
# forward pass for computing logits for masked language objective
return (
outputs['text_masked_logits'],
outputs['protein_masked_logits']
)
else:
# forward pass for computing latent embeddings in the joint space
return (
outputs['text_joint_latent'],
outputs['seq_joint_latent'],
)
def on_train_batch_start(self, batch, batch_idx):
self.batch_start_time = time.time()
def on_train_batch_end(self, outputs, batch, batch_idx):
batch_end_time = time.time()
batch_time = batch_end_time - self.batch_start_time
#print(f'Rank={dist.get_rank()}: time to process batch is {batch_time}')
#self.log(f'batch_time_rank_{dist.get_rank()}', batch_time, on_step=True, on_epoch=False)
def training_step(self, batch: torch.Tensor, batch_idx: any) -> dict:
"""
Execute a single training step.
Given a batch of data, this function processes both Swiss-Prot and Pfam data through the model, computes
various loss values including inter-modal, intra-modal, and masked language model losses for both text
and protein sequences. This function also computes and logs various metrics and GPU memory usage.
Parameters:
- batch: The input data batch. This can include multiple types of data.
- batch_idx: Index of the current batch.
Steps:
1. Split the data into Swiss-Prot and Pfam batches, if the batch is a list.
2. Forward pass the Swiss-Prot data through the model.
3. Synchronize and gather embeddings from all GPUs.
4. Forward pass the Pfam data through the model.
5. Synchronize and gather Pfam embeddings from all GPUs.
6. Concatenate Swiss-Prot and Pfam embeddings.
7. Compute inter-modal and intra-modal loss values.
8. Compute masked language model logits for the concatenated batch.
9. Compute masked language loss for both text and protein sequences.
10. Compute and log the total loss and individual loss components.
11. Compute and log performance metrics.
12. Log GPU memory usage at the start of training.
Returns:
- Dictionary containing the total loss value.
Note:
This function is intended to be used within a distributed (multi-GPU) training context, as evident
from the use of barriers and gathering operations. It's designed to handle batches that contain both
Swiss-Prot and Pfam data, both being biological datasets used in multi-modal protein embeddings.
The function utilizes both inter-modal (between modalities) and intra-modal (within the same modality)
contrastive losses, as well as masked language modeling objectives similar to BERT's MLM objective.
"""
# Check if the batch is a list and split data if so.
if isinstance(batch, list):
text_batch, protein_batch, text_mask_batch, protein_mask_batch, \
pfam_text_batch, pfam_protein_batch, pfam_text_mask_batch, pfam_protein_mask_batch, \
bool_pfam_vector = batch
#print(f'rank={dist.get_rank()}: text size {text_batch.shape}')
#start_time_forward_pass = time.time()
# Forward pass with Swiss-Prot data.
z_t_swiss, z_p_swiss = self(
x_t=text_batch,
x_p=protein_batch,
compute_masked_logits=False
)
# Timer end and log
#end_time_forward_pass = time.time()
#print(f"Rank={dist.get_rank()}: Time taken for Swiss-Prot forward pass: {end_time_forward_pass - start_time_forward_pass} seconds.")
# Ensure all GPUs are synchronized.
dist.barrier()
# Forward pass with Pfam data.
z_t_pfam, z_p_pfam = self(
x_t=pfam_text_batch,
x_p=pfam_protein_batch,
compute_masked_logits=False
)
dist.barrier()
#Gather tensors from all GPUs.
z_t_swiss_all = self.all_gather(z_t_swiss, sync_grads=True)
dist.barrier()
z_p_swiss_all = self.all_gather(z_p_swiss, sync_grads=True)
# Reshape the embeddings.
z_t_swiss_all = z_t_swiss_all.view(-1, z_t_swiss.shape[-1])
z_p_swiss_all = z_p_swiss_all.view(-1, z_p_swiss.shape[-1])
# Gather tensors from all GPUs.
z_t_pfam_all = self.all_gather(z_t_pfam, sync_grads=True)
dist.barrier()
z_p_pfam_all = self.all_gather(z_p_pfam, sync_grads=True)
# Reshape the embeddings.
z_t_pfam_all = z_t_pfam_all.view(-1, z_t_pfam.shape[-1])
z_p_pfam_all = z_p_pfam_all.view(-1, z_p_pfam.shape[-1])
# Concatenate Swiss-Prot and Pfam embeddings.
z_t_all = torch.cat((z_t_swiss_all, z_t_pfam_all), dim=0)
z_p_all = torch.cat((z_p_swiss_all, z_p_pfam_all), dim=0)
# Timer start
#start_time_loss_computation = time.time()
# Compute inter-modal loss.
loss_align, logits = self.model.compute_inter_loss(
protein_embeddings=z_p_all,
text_embeddings=z_t_all,
batch_size=z_p_all.shape[0] // 2
)
# Timer end and log
#end_time_loss_computation = time.time()
#print(f"Rank={dist.get_rank()}: Time taken for loss computation: {end_time_loss_computation - start_time_loss_computation} seconds.")
# Compute intra-modal loss.
loss_intra, cosine_similarity = self.model.compute_intra_loss(
protein_embeddings=z_p_all,
batch_size=z_p_all.shape[0] // 2
)
# Concatenate batches for masked language modeling.
all_text_batch = torch.cat((text_batch, pfam_text_batch), dim=0)
all_protein_batch = torch.cat((protein_batch, pfam_protein_batch), dim=0)
all_text_mask_batch = torch.cat((text_mask_batch, pfam_text_mask_batch), dim=0)
all_protein_mask_batch = torch.cat((protein_mask_batch, pfam_protein_mask_batch), dim=0)
#TODO: timer start
#start_time_mask_comp = time.time()
# Compute masked language model logits.
logits_t_mask, logits_s_mask = self(
x_t=all_text_mask_batch,
x_p=all_protein_mask_batch,
compute_masked_logits=True
)
#end_time_mask_comp = time.time()
#print(f"Rank={dist.get_rank()}: Time taken for mask predictions: {end_time_mask_comp - start_time_mask_comp} seconds.")
# Compute masked language model loss for text data.
loss_text_mask = self.model.compute_masked_lang_loss(
logits_masked=logits_t_mask,
targets=all_text_batch,
targets_masked=all_text_mask_batch,
mask_token_id=self.text_tokenizer.mask_token_id
)
# Compute masked language model loss for protein data.
loss_sequence_mask = self.model.compute_masked_lang_loss(
logits_masked=logits_s_mask,
targets=all_protein_batch,
targets_masked=all_protein_mask_batch,
mask_token_id=self.sequence_tokenizer.mask_idx
)
if self.script_args.dataset_type == 'pfam':
# Aggregate all computed losses.
loss = loss_align + loss_intra + loss_text_mask + loss_sequence_mask
elif self.script_args.dataset_type == 'pfam_ablated':
# Aggregate all losses besides PFC.
loss = loss_align + loss_text_mask + loss_sequence_mask
else:
# Add an assertion here
assert self.script_args.dataset_type in ['pfam', 'pfam_ablated'], "Unexpected dataset_type value"
sys.stderr.write("Unexpected dataset_type value\n")
sys.exit(1)
# Log the individual and total loss values.
self.log('train_loss', loss, prog_bar=True, on_step=True, on_epoch=True, sync_dist=True)
self.log('train_loss_align', loss_align, prog_bar=True, on_step=True, on_epoch=True, sync_dist=True)
self.log('train_loss_intra', loss_intra, prog_bar=True, on_step=True, on_epoch=True, sync_dist=True)
self.log('train_loss_text_mask', loss_text_mask, prog_bar=False, on_step=True, on_epoch=True, sync_dist=True)
self.log('train_loss_seq_mask', loss_sequence_mask, prog_bar=False, on_step=True, on_epoch=True, sync_dist=True)
# Compute and log additional performance metrics.
metric_dict = self.performance_metrics(logits=logits)
for key in metric_dict:
values = metric_dict[key]
final_key = 'train_' + key
self.log(final_key, metric_dict[key], prog_bar=True if 'f1' in key else False, on_step=True, on_epoch=True, sync_dist=True)
# Log GPU memory usage at the beginning of the training.
if batch_idx == 0:
gpu_memory_usage = helper_tools.print_gpu_initialization()
self.log(f'gpu_memory_usage', gpu_memory_usage, sync_dist=True)
# log CPU memory
memory_usage = helper_tools.print_memory_usage()
self.log(f'memory_usage', memory_usage, sync_dist=True)
return {'loss': loss}
def validation_step(
self,
batch: torch.Tensor,
batch_idx: any,
) -> dict:
"""
`validation_step()`: Validates a single batch of data and computes loss and performance metrics.
Parameters:
- `self`: Reference to the current instance of the model or module.
- `batch`: Input data, which might contain text and protein sequences, their corresponding masks, and additional data from both Swiss-Prot and Pfam datasets.
- `batch_idx`: Identifier for the current batch.
Functionality:
1. Extracts and processes data from the given batch.
2. Computes embeddings for Swiss-Prot and Pfam datasets.
3. Concatenates these embeddings to form a unified representation.
4. Computes various loss values: inter-modal, intra-modal, and masked language losses for both biomedical texts and protein sequences.
5. Logs the computed loss values and other performance metrics, highlighting metrics such as F1-score.
6. Collects and appends the joint embeddings of the batch for potential future use.
Returns:
- A dictionary with the total validation loss for the current batch.
"""
if isinstance(batch, list):
# split the data
text_batch, protein_batch, text_mask_batch, protein_mask_batch, \
pfam_text_batch, pfam_protein_batch, pfam_text_mask_batch, pfam_protein_mask_batch, \
bool_pfam_vector = batch
# forward pass over the swiss-prot data
z_t_swiss, z_p_swiss = self(
x_t=text_batch,
x_p=protein_batch,
compute_masked_logits=False
)
dist.barrier() # wait till all GPUs catch up...
# gather all tensors
z_t_swiss_all = self.all_gather(z_t_swiss, sync_grads=True)
dist.barrier()
z_p_swiss_all = self.all_gather(z_p_swiss, sync_grads=True)
# stack the embeddings
z_t_swiss_all = z_t_swiss_all.view(-1, z_t_swiss.shape[-1])
z_p_swiss_all = z_p_swiss_all.view(-1, z_p_swiss.shape[-1])
# foward pass over the pfam data
z_t_pfam, z_p_pfam = self(
x_t=pfam_text_batch,
x_p=pfam_protein_batch,
compute_masked_logits=False
)
dist.barrier() # wait till all GPUs catch up...
# gather all tensors
z_t_pfam_all = self.all_gather(z_t_pfam, sync_grads=True)
dist.barrier()
z_p_pfam_all = self.all_gather(z_p_pfam, sync_grads=True)
# stack the embeddings
z_t_pfam_all = z_t_pfam_all.view(-1, z_t_pfam.shape[-1])
z_p_pfam_all = z_p_pfam_all.view(-1, z_p_pfam.shape[-1])
# concatenate swiss-prot <> pfam embeddings
z_t_all = torch.cat((z_t_swiss_all, z_t_pfam_all), dim=0)
z_p_all = torch.cat((z_p_swiss_all, z_p_pfam_all), dim=0)
# compute inter-modal loss values
loss_align, logits = self.model.compute_inter_loss(
protein_embeddings=z_p_all,
text_embeddings=z_t_all,
batch_size=z_p_all.shape[0] // 2
)
# compute intra-modal loss values
loss_intra, cosine_similarity = self.model.compute_intra_loss(
protein_embeddings=z_p_all,
batch_size=z_p_all.shape[0] // 2
)
# concatenate batch samples
all_text_batch = torch.cat((text_batch, pfam_text_batch), dim=0)
all_protein_batch = torch.cat((protein_batch, pfam_protein_batch), dim=0)
all_text_mask_batch = torch.cat((text_mask_batch, pfam_text_mask_batch), dim=0)
all_protein_mask_batch = torch.cat((protein_mask_batch, pfam_protein_mask_batch), dim=0)
# compute mask language model logits
logits_t_mask, logits_s_mask = self(
x_t=all_text_mask_batch,
x_p=all_protein_mask_batch,
compute_masked_logits=True
)
# compute mask language loss for biomedical expert model
loss_text_mask = self.model.compute_masked_lang_loss(
logits_masked=logits_t_mask,
targets=all_text_batch,
targets_masked=all_text_mask_batch,
mask_token_id=self.text_tokenizer.mask_token_id
)
# compute mask language loss for protein expert model
loss_sequence_mask = self.model.compute_masked_lang_loss(
logits_masked=logits_s_mask,
targets=all_protein_batch,
targets_masked=all_protein_mask_batch,
mask_token_id=self.sequence_tokenizer.mask_idx
)
# total loss
#loss = loss_align + loss_intra + loss_text_mask + loss_sequence_mask
if self.script_args.dataset_type == 'pfam':
# Aggregate all computed losses.
loss = loss_align + loss_intra + loss_text_mask + loss_sequence_mask
elif self.script_args.dataset_type == 'pfam_ablated':
# Aggregate all losses besides PFC.
loss = loss_align + loss_text_mask + loss_sequence_mask
else:
# Add an assertion here
assert self.script_args.dataset_type in ['pfam', 'pfam_ablated'], "Unexpected dataset_type value"
sys.stderr.write("Unexpected dataset_type value\n")
sys.exit(1)
# track loss ...
self.log('valid_loss', loss, prog_bar=True, on_step=True, on_epoch=True, sync_dist=True)
self.log('valid_loss_align', loss_align, prog_bar=True, on_step=True, on_epoch=True, sync_dist=True)
self.log('valid_loss_intra', loss_intra, prog_bar=True, on_step=True, on_epoch=True, sync_dist=True)
self.log('valid_loss_text_mask', loss_text_mask, prog_bar=False, on_step=True, on_epoch=True, sync_dist=True)
self.log('valid_loss_seq_mask', loss_sequence_mask, prog_bar=False, on_step=True, on_epoch=True, sync_dist=True)
# log CPU memory
memory_usage = helper_tools.print_memory_usage()
self.log(f'memory_usage', memory_usage, sync_dist=True)
# track metrics
metric_dict = self.performance_metrics(logits=logits.detach().cpu())
for key in metric_dict:
values = metric_dict[key]
final_key = 'valid_' + key
self.log(final_key, metric_dict[key], prog_bar=True if 'f1' in key else False, on_step=True, on_epoch=True, sync_dist=True)
# collect joint embedding
#self.val_text_joint_latents.append(z_t_all.detach().cpu())
#self.val_seq_joint_latents.append(z_p_all.detach().cpu())
return {'valid_loss': loss}
# def on_validation_epoch_end(self):
# print('Enter validation end of epoch analysis...')
#
# # collect and aggregate outputs from all validation steps
# val_z_t_joint = torch.cat(self.val_text_joint_latents, dim=0)
# val_z_s_joint = torch.cat(self.val_seq_joint_latents, dim=0)
#
# # compute singular values
# print('Compute singular values...')
# text_log_sigma_k, S_text = self.compute_singular(val_z_t_joint.detach().cpu())
# protein_log_sigma_k, S_protein = self.compute_singular(val_z_s_joint.detach().cpu())
#
# # save image pngs for tracking dimensionality collapse
# self.save_png_to_tensorboard(
# data=text_log_sigma_k.numpy(),
# title='text',
# )
# self.save_png_to_tensorboard(
# data=protein_log_sigma_k.numpy(),
# title='protein'
# )
#
# # free memory
# self.val_text_joint_latents.clear()
# self.val_seq_joint_latents.clear()
#
#
# # compute effective rank (RankME):
# print('Compute eranks')
# erank_text = self.compute_effective_rank(sigma_ks=S_text)
# erank_protein = self.compute_effective_rank(sigma_ks=S_protein)
#
# # log erank metrics
# self.log('valid_erank_text', erank_text, sync_dist=True)
# self.log('valid_erank_protein', erank_protein, sync_dist=True)
def configure_optimizers(self,):
params = [
{"params": self.model.protein_encoder.parameters(), "lr": self.script_args.protein_encoder_lr},
{"params": self.model.text_encoder.parameters(), "lr": self.script_args.text_encoder_lr},
{"params": itertools.chain(
self.model.protein_projection.parameters(),
self.model.text_projection.parameters()
),
"lr": self.script_args.head_lr,
"weight_decay": self.script_args.weight_decay}
]
optimizer = torch.optim.AdamW(params, weight_decay=self.script_args.weight_decay)
return {
"optimizer": optimizer,
}
@torch.no_grad()
def compute_class_metrics(
self,
outputs: torch.Tensor,
targets: torch.Tensor,
source: str
) -> dict:
# convert torch tensors to numpy array
outputs_np = outputs.numpy()
targets_np = targets.numpy()
# compute the metrics
accuracy = accuracy_score(targets_np, outputs_np.round())
precision = precision_score(targets_np, outputs_np.round(), average='micro')
recall = recall_score(targets_np, outputs_np.round(), average='micro')
f1 = f1_score(targets_np, outputs_np.round(), average='micro')
return {
f'{source}_accuracy': accuracy,
f'{source}_precision': precision,
f'{source}_recall': recall,
f'{source}_f1': f1
}
@torch.no_grad()
def performance_metrics(self, logits: torch.Tensor) -> tuple:
logits = logits.cpu().float()
# get probs
p_text = F.softmax(logits, dim=-1) # prob of a given text captions aligning well with seq. pairs
p_seq = F.softmax(logits.T, dim=-1) # prob of a given seq aligning well with text pairs
p_tot = (p_seq + p_text) / 2 # total prob
# get class labels
y_pred_text = torch.argmax(p_text, dim=-1)
y_pred_seq = torch.argmax(p_seq, dim=-1)
y_pred = torch.argmax(p_tot, dim=-1)
y_true = torch.arange(y_pred_text.shape[0])
# compute class metrics
text_metrics = self.compute_class_metrics(
outputs=y_pred_text,
targets=y_true,
source='text'
)
seq_metrics = self.compute_class_metrics(
outputs=y_pred_seq,
targets=y_true,
source='seq'
)
total_metrics = self.compute_class_metrics(
outputs=y_pred,
targets=y_true,
source='total'
)
# combine dicts into one
combined_dict = {}
combined_dict.update(text_metrics)
combined_dict.update(seq_metrics)
combined_dict.update(total_metrics)
return combined_dict
@torch.no_grad()
def compute_singular(self, inputs: torch.Tensor) -> (
torch.Tensor,
torch.Tensor
):
# goal of this function: track for dimensionality collapse
# inputs dim: (batch_size, emb_dim)
mean_inputs = torch.mean(inputs, dim=0) # average over batch dimension
norm_inputs = inputs - mean_inputs # normalize vectors
# compute correlation matrix #TODO: double check work...
C = torch.zeros((norm_inputs.shape[-1], norm_inputs.shape[-1]))
for sample_idx in range(norm_inputs.shape[0]):
norm_vector = norm_inputs[sample_idx, :].unsqueeze(0)
C += norm_vector.T @ norm_vector
C *= 1/norm_vector.shape[0]
_, S, _ = torch.linalg.svd(C, full_matrices=False)
# return singular value indexes
log_sigma_k, _ = torch.sort(torch.log(S), descending=True)
return (
log_sigma_k,
S
)
def compute_effective_rank(self, sigma_ks: torch.Tensor) -> torch.Tensor:
"""
references:
- Roy et al. The effective rank: a measure of effective dimensionality
- Garrido et al. RankMe: Assessing the Downstream Performnace of Pretrained SS Reps by their Rank.
"""
# sort the singular values
sigma_ks, _ = torch.sort(sigma_ks, descending=True)
# copute L1 norm for sing values.
l1_norm_sigma = torch.norm(sigma_ks, p=1)
# compute singular value distribution
p_k = sigma_ks / l1_norm_sigma + torch.finfo(torch.float).eps
# compute Shannon entropy
entropy = - torch.sum(p_k * torch.log(p_k))
# get effective rank (RankME):
erank = torch.exp(entropy)
return erank
def save_png_to_tensorboard(
self,
data: np.single,
title: str,
x_axis_label: str='Singular Value Rank Index',
y_axis_label: str='Log of singular values',
):
current_epoch = self.trainer.current_epoch
# Plot the line
fig, ax = plt.subplots(dpi=300)
ax.plot(data)
ax.set_xlabel(x_axis_label)
ax.set_ylabel(y_axis_label)
ax.set_title(title)
ax.set_ylim([-25,3])
# Log the plot in TensorBoard
self.logger.experiment.add_figure(f'{title}_SingularValues_{current_epoch}', fig, current_epoch)
# Close the figure to free up memory
plt.close(fig)
def predict_step(
self,
batch: torch.Tensor,
batch_idx: torch.Tensor,
dataloder_idx: bool=False
) -> (
torch.Tensor,
torch.Tensor
):
if isinstance(batch, list):
# mean loss
text_batch, protein_batch = batch
outputs = self(
x_t=text_batch,
x_p=protein_batch,
compute_masked_logits=False
)
z_t_joint, z_p_joint = outputs
self.predict_text_joint_latents.append(z_t_joint.detach().cpu())
self.predict_seq_joint_latents.append(z_p_joint.detach().cpu())
return outputs
def on_predict_epoch_end(self, outputs=None):
self.predict_text_joint_latents = torch.cat(self.predict_text_joint_latents).cpu()
self.predict_seq_joint_latents = torch.cat(self.predict_seq_joint_latents).cpu()
##########################
# Facilitator PL wrapper #
##########################
class PL_Facilitator(pl.LightningModule):
    """
    PL wrapper around the Facilitator MLP, which maps text joint embeddings
    (z_t) into the protein joint embedding space (z_t_to_p).
    """

    def __init__(
            self,
            args: any
    ):
        super().__init__()
        # arguments
        self.args = args
        # model: MLP with emb_dim -> hid_dim -> emb_dim
        self.model = mod.Facilitator(
                in_dim=self.args.emb_dim,
                hid_dim=self.args.hid_dim,
                out_dim=self.args.emb_dim,
                dropout=self.args.dropout
        )
        # per-batch prediction outputs; concatenated in on_predict_epoch_end
        self.text_to_protein_joint_embeddings = []

    def forward(
            self,
            z_t: torch.Tensor,
    ) -> torch.Tensor:
        """Reconfigure z_t to z_p (additional alignment)."""
        z_t_to_p = self.model(z_t)
        return z_t_to_p

    def _shared_step(self, batch: torch.Tensor, stage: str) -> dict:
        """
        Shared train/valid logic: forward pass, loss, and logging.

        Raises:
            TypeError: if the batch is not a [text, protein] list.
                (The original code only unpacked list batches and then
                crashed on an undefined name for anything else.)
        """
        if not isinstance(batch, list):
            raise TypeError(
                f"Expected batch to be a [text, protein] list, got {type(batch).__name__}"
            )
        text_embeddings, protein_embeddings = batch
        # forward pass with the model
        z_t_to_p = self(z_t=text_embeddings)
        # compute loss against the target protein embeddings
        loss = self.model.compute_loss(
                output=z_t_to_p,
                target=protein_embeddings,
                loss_option=self.args.loss_type
        )
        # log the total loss under '<stage>_loss'
        self.log(f'{stage}_loss', loss, prog_bar=True, on_step=True, on_epoch=True, sync_dist=True)
        return {'loss': loss}

    def training_step(self, batch: torch.Tensor, batch_id: any) -> dict:
        """One optimization step; logs 'train_loss'."""
        return self._shared_step(batch, 'train')

    def validation_step(self, batch: torch.Tensor, batch_id: any) -> dict:
        """One validation step; logs 'valid_loss'."""
        return self._shared_step(batch, 'valid')

    def configure_optimizers(self,):
        """AdamW over all Facilitator parameters."""
        optimizer = torch.optim.AdamW(
                self.model.parameters(),
                lr=self.args.lr,
                weight_decay=self.args.weight_decay
        )
        return {
            "optimizer": optimizer
        }

    def predict_step(self, batch: torch.Tensor, batch_idx: int, dataloader_idx: int = None) -> torch.Tensor:
        """
        Single prediction (inference) step: transform text embeddings and
        cache the CPU result for on_predict_epoch_end.
        """
        # Only the text embeddings are needed for prediction; a paired batch
        # carries protein embeddings in the second slot, which are ignored.
        if isinstance(batch, list):
            text_embeddings, _ = batch
        else:
            text_embeddings = batch
        # forward pass to get transformed text embeddings (z_t_to_p)
        z_t_to_p = self(z_t=text_embeddings)
        self.text_to_protein_joint_embeddings.append(z_t_to_p.detach().cpu())
        return z_t_to_p

    def on_predict_epoch_end(self, outputs=None):
        """Collapse all cached prediction batches into one CPU tensor."""
        self.text_to_protein_joint_embeddings = torch.cat(self.text_to_protein_joint_embeddings).cpu()