|
import argparse |
|
import yaml |
|
from argparse import Namespace |
|
import json |
|
import pandas as pd |
|
import torch |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
import Stage1_source.preprocess as prep |
|
import Stage1_source.model as mod |
|
import Stage1_source.PL_wrapper as PL_wrap |
|
|
|
|
|
def load_json_config(json_path):
    """Read a JSON configuration file and return its contents as a dict.

    Args:
        json_path: Filesystem path to the JSON config (e.g. stage1_config.json).

    Returns:
        dict parsed from the file.
    """
    with open(json_path, "r") as handle:
        return json.load(handle)
|
|
|
|
|
def convert_to_namespace(config_dict):
    """Recursively convert a (possibly nested) dict into an argparse.Namespace.

    Fixes a defect in the original: it reassigned converted sub-dicts back
    into the caller's dict (``config_dict[key] = ...``), mutating the input
    as a side effect. This version builds a fresh mapping and leaves the
    caller's dict untouched.

    Args:
        config_dict: Configuration mapping; nested dicts become nested
            Namespaces.

    Returns:
        argparse.Namespace mirroring ``config_dict``'s structure.
    """
    converted = {
        key: convert_to_namespace(value) if isinstance(value, dict) else value
        for key, value in config_dict.items()
    }
    return Namespace(**converted)
|
|
|
|
|
def prepare_model(config_args, model_path) -> nn.Module:
    """Instantiate the Stage-1 model and restore its pre-trained weights.

    Args:
        config_args: Namespace of model hyperparameters (from the JSON config).
        model_path: Path to the serialized state dict (pytorch_model.bin).

    Returns:
        The model in eval mode with weights loaded on CPU.
    """
    net = mod.pfam_PEN_CL(args=config_args)
    # NOTE(review): torch.load unpickles arbitrary objects — only load
    # checkpoints from a trusted source.
    state_dict = torch.load(model_path, map_location="cpu")
    net.load_state_dict(state_dict)
    net.eval()
    print("Model loaded successfully with weights!")
    return net
|
|
|
|
|
def load_test_dataset(config_args):
    """Build a tiny hard-coded two-protein DataFrame and wrap it in the
    project's text/sequence pairing dataset.

    Args:
        config_args: Namespace of dataset/tokenizer settings.

    Returns:
        prep.TextSeqPairing_Dataset over the two demo examples.
    """
    accessions = ['A0A009IHW8', 'A0A023I7E1']
    sequences = [
        "MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENARIQSKL...",
        "MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIFPEIKHP..."
    ]
    captions = [
        "PROTEIN NAME: 2' cyclic ADP-D-ribose synthase AbTIR...",
        "PROTEIN NAME: Glucan endo-1,3-beta-D-glucosidase 1..."
    ]
    labels = ["['PF13676']", "['PF17652','PF03639']"]
    test_df = pd.DataFrame({
        'primary_Accession': accessions,
        'protein_sequence': sequences,
        '[final]text_caption': captions,
        'pfam_label': labels,
    })
    return prep.TextSeqPairing_Dataset(args=config_args, df=test_df)
|
|
|
|
|
def parse_arguments():
    """Parse the command-line options for the Stage-1 inference script.

    Returns:
        argparse.Namespace with json_path, model_path and output_path,
        all required string arguments.
    """
    cli = argparse.ArgumentParser(description="BioM3 Inference Script (Stage 1)")
    options = (
        ('--json_path', "Path to the JSON configuration file (stage1_config.json)"),
        ('--model_path', "Path to the pre-trained model weights (pytorch_model.bin)"),
        ('--output_path', "Path to save output embeddings"),
    )
    for flag, message in options:
        cli.add_argument(flag, type=str, required=True, help=message)
    return cli.parse_args()
|
|
|
|
|
def compute_homology_matrix(z_p_tensor):
    """Pairwise cosine similarities between protein latent vectors.

    Each row of ``z_p_tensor`` is L2-normalized, so the Gram matrix of the
    normalized rows is exactly the cosine-similarity ("homology") matrix.

    Args:
        z_p_tensor: 2-D tensor, one latent vector per row.

    Returns:
        Square tensor of cosine similarities (ones on the diagonal for
        non-zero rows).
    """
    unit_rows = F.normalize(z_p_tensor, p=2, dim=1)
    return unit_rows @ unit_rows.T
|
|
|
|
|
|
|
if __name__ == '__main__':

    # CLI paths: config JSON, checkpoint, and output file for embeddings.
    cli_args = parse_arguments()

    # Load the JSON config and expose it as nested Namespaces.
    config_args = convert_to_namespace(load_json_config(cli_args.json_path))

    # Model (eval mode, CPU weights) and the two-example demo dataset.
    model = prepare_model(config_args=config_args, model_path=cli_args.model_path)
    test_dataset = load_test_dataset(config_args)

    z_t_list, z_p_list = [], []
    text_list, protein_list = [], []

    # Embed every example; no gradients needed for inference.
    with torch.no_grad():
        for idx in range(len(test_dataset)):
            x_t, x_p = test_dataset[idx]
            outputs = model(x_t, x_p, compute_masked_logits=False)
            z_t_list.append(outputs['text_joint_latent'])
            z_p_list.append(outputs['seq_joint_latent'])
            # Keep the raw strings alongside their embeddings.
            text_list.append(test_dataset.text_captions_list[idx])
            protein_list.append(test_dataset.protein_sequence_list[idx])

    # Stack per-example latents into (num_examples, dim) tensors.
    z_t_tensor = torch.vstack(z_t_list)
    z_p_tensor = torch.vstack(z_p_list)

    embedding_dict = {
        'sequence': protein_list,
        'text_prompts': text_list,
        'z_t': z_t_tensor,
        'z_p': z_p_tensor
    }

    # Protein-vs-text score matrix and its two softmax normalizations:
    # dim=0 normalizes over proteins (per text), dim=1 over texts (per protein).
    dot_product_scores = torch.matmul(z_p_tensor, z_t_tensor.T)
    protein_given_text_probs = F.softmax(dot_product_scores, dim=0)
    text_given_protein_probs = F.softmax(dot_product_scores, dim=1)

    # Per-vector L2 magnitudes, reported for sanity-checking the latents.
    z_p_magnitude = torch.norm(z_p_tensor, dim=1)
    z_t_magnitude = torch.norm(z_t_tensor, dim=1)

    # Cosine similarity between protein latents.
    homology_matrix = compute_homology_matrix(z_p_tensor)

    print("\n=== Inference Results ===")
    print(f"Shape of z_p (protein latent): {z_p_tensor.shape}")
    print(f"Shape of z_t (text latent): {z_t_tensor.shape}")
    print(f"\nMagnitudes of z_p vectors: {z_p_magnitude}")
    print(f"Magnitudes of z_t vectors: {z_t_magnitude}")

    print("\n=== Dot Product Scores Matrix ===")
    print(dot_product_scores)

    print("\n=== Normalized Probabilities ===")
    print("Protein-Normalized Probabilities (Softmax across Proteins for each Text):")
    print(protein_given_text_probs)

    print("\nText-Normalized Probabilities (Softmax across Texts for each Protein):")
    print(text_given_protein_probs)

    print("\n=== Homology Matrix (Dot Product of Normalized z_p) ===")
    print(homology_matrix)

    # Persist sequences, prompts and both embedding tensors in one file.
    torch.save(embedding_dict, cli_args.output_path)
|