isoformer-anonymous committed
Commit bed38a1 • 1 Parent(s): 12d0972
Upload Isoformer

Files changed:
- config.json +4 -0
- isoformer_config.py +111 -0
- modeling_isoformer.py +168 -0
- pytorch_model.bin +1 -1
config.json  CHANGED
@@ -2,6 +2,10 @@
   "architectures": [
     "Isoformer"
   ],
+  "auto_map": {
+    "AutoConfig": "isoformer_config.IsoformerConfig",
+    "AutoModel": "modeling_isoformer.Isoformer"
+  },
   "enformer_attn_dim_key": 64,
   "enformer_attn_dropout": 0.05,
   "enformer_depth": 11,
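The new "auto_map" entries let the custom classes be resolved through the transformers Auto API when trust_remote_code is enabled. A minimal sketch, assuming a repository id (the id below is illustrative, not taken from this diff):

from transformers import AutoConfig, AutoModel

repo_id = "isoformer-anonymous/Isoformer"  # illustrative repo id
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)  # resolves isoformer_config.IsoformerConfig
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)    # resolves modeling_isoformer.Isoformer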
isoformer_config.py  ADDED
@@ -0,0 +1,111 @@
from transformers import PretrainedConfig

class IsoformerConfig(PretrainedConfig):
    model_type = "isoformer"

    def __init__(
        self,
        esm_vocab_size=None,
        esm_mask_token_id=None,
        esm_pad_token_id=None,
        esm_hidden_size=768,
        esm_num_hidden_layers=12,
        esm_num_attention_heads=12,
        esm_intermediate_size=3072,
        esm_hidden_dropout_prob=0.1,
        esm_attention_probs_dropout_prob=0.1,
        esm_max_position_embeddings=1026,
        esm_position_embedding_type="absolute",
        esm_use_cache=True,
        esm_emb_layer_norm_before=None,
        esm_token_dropout=False,
        esm_add_bias_fnn=True,
        esm_tie_word_embeddings=0,
        nt_vocab_size=None,
        nt_mask_token_id=None,
        nt_pad_token_id=None,
        nt_hidden_size=768,
        nt_num_hidden_layers=12,
        nt_num_attention_heads=12,
        nt_intermediate_size=3072,
        nt_hidden_dropout_prob=0.1,
        nt_attention_probs_dropout_prob=0.1,
        nt_max_position_embeddings=1026,
        nt_position_embedding_type="absolute",
        nt_use_cache=True,
        nt_emb_layer_norm_before=None,
        nt_token_dropout=False,
        nt_add_bias_fnn=True,
        nt_tie_word_embeddings=0,
        enformer_dim=1536,
        enformer_depth=11,
        enformer_heads=8,
        enformer_output_heads=0,
        enformer_target_length=896,
        enformer_attn_dim_key=64,
        enformer_dropout_rate=0.4,
        enformer_attn_dropout=0.05,
        enformer_pos_dropout=0.01,
        enformer_use_checkpointing=False,
        enformer_use_convnext=False,
        enformer_num_downsamples=7,  # genetic sequence is downsampled 2 ** 7 == 128x in default Enformer - can be changed for higher resolution
        enformer_dim_divisible_by=128,
        enformer_use_tf_gamma=False,
        num_heads_omics_cross_attention=8,
        num_tokens_per_seq_nuctf=2048,
        num_tokens_per_seq_nuctf_rna=2048,
        num_protein_tokens_per_seq=2048,
        **kwargs,
    ):
        self.esm_vocab_size = esm_vocab_size
        self.esm_mask_token_id = esm_mask_token_id
        self.esm_pad_token_id = esm_pad_token_id
        self.esm_hidden_size = esm_hidden_size
        self.esm_num_hidden_layers = esm_num_hidden_layers
        self.esm_num_attention_heads = esm_num_attention_heads
        self.esm_intermediate_size = esm_intermediate_size
        self.esm_max_position_embeddings = esm_max_position_embeddings
        self.esm_token_dropout = esm_token_dropout
        self.esm_emb_layer_norm_before = esm_emb_layer_norm_before
        self.esm_attention_probs_dropout_prob = esm_attention_probs_dropout_prob
        self.esm_hidden_dropout_prob = esm_hidden_dropout_prob
        self.esm_use_cache = esm_use_cache
        self.esm_add_bias_fnn = esm_add_bias_fnn
        self.esm_position_embedding_type = esm_position_embedding_type
        self.esm_tie_word_embeddings = esm_tie_word_embeddings
        self.nt_vocab_size = nt_vocab_size
        self.nt_mask_token_id = nt_mask_token_id
        self.nt_pad_token_id = nt_pad_token_id
        self.nt_hidden_size = nt_hidden_size
        self.nt_num_hidden_layers = nt_num_hidden_layers
        self.nt_num_attention_heads = nt_num_attention_heads
        self.nt_intermediate_size = nt_intermediate_size
        self.nt_max_position_embeddings = nt_max_position_embeddings
        self.nt_token_dropout = nt_token_dropout
        self.nt_emb_layer_norm_before = nt_emb_layer_norm_before
        self.nt_attention_probs_dropout_prob = nt_attention_probs_dropout_prob
        self.nt_hidden_dropout_prob = nt_hidden_dropout_prob
        self.nt_use_cache = nt_use_cache
        self.nt_add_bias_fnn = nt_add_bias_fnn
        self.nt_position_embedding_type = nt_position_embedding_type
        self.nt_tie_word_embeddings = nt_tie_word_embeddings
        self.enformer_dim = enformer_dim
        self.enformer_depth = enformer_depth
        self.enformer_heads = enformer_heads
        self.enformer_output_heads = enformer_output_heads
        self.enformer_target_length = enformer_target_length
        self.enformer_attn_dim_key = enformer_attn_dim_key
        self.enformer_dropout_rate = enformer_dropout_rate
        self.enformer_attn_dropout = enformer_attn_dropout
        self.enformer_pos_dropout = enformer_pos_dropout
        self.enformer_use_checkpointing = enformer_use_checkpointing
        self.enformer_use_convnext = enformer_use_convnext
        self.enformer_num_downsamples = enformer_num_downsamples
        self.enformer_dim_divisible_by = enformer_dim_divisible_by
        self.enformer_use_tf_gamma = enformer_use_tf_gamma
        self.num_heads_omics_cross_attention = num_heads_omics_cross_attention
        self.num_tokens_per_seq_nuctf = num_tokens_per_seq_nuctf
        self.num_tokens_per_seq_nuctf_rna = num_tokens_per_seq_nuctf_rna
        self.num_protein_tokens_per_seq = num_protein_tokens_per_seq

        super().__init__(**kwargs)
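isoformer_config.py collects the per-encoder hyperparameters (esm_* for the protein encoder, nt_* for the RNA encoder, enformer_* for the DNA encoder) in a single PretrainedConfig. A minimal usage sketch; the vocabulary sizes below are placeholders, not values taken from this commit:

from isoformer_config import IsoformerConfig

config = IsoformerConfig(
    esm_vocab_size=33,   # placeholder protein vocabulary size
    nt_vocab_size=4105,  # placeholder RNA vocabulary size
)
config.save_pretrained("./isoformer-demo")  # writes a config.json with model_type "isoformer"
reloaded = IsoformerConfig.from_pretrained("./isoformer-demo")
assert reloaded.enformer_target_length == 896  # defaults round-trip through serialization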
modeling_isoformer.py  ADDED
@@ -0,0 +1,168 @@
from transformers import PreTrainedModel
# from genomics_research.biobrain_p2.huggingface.modeling_enformer import Enformer
from genomics_research.biobrain_p2.huggingface.modeling_esm import NTForMaskedLM, MultiHeadAttention
from genomics_research.biobrain_p2.huggingface.isoformer_config import IsoformerConfig
# from genomics_research.biobrain_p2.huggingface.enformer_config import EnformerConfig
from genomics_research.biobrain_p2.huggingface.esm_config import NTConfig
from genomics_research.biobrain_p2.huggingface.modeling_esm_original import EsmForMaskedLM
from transformers.models.esm.configuration_esm import EsmConfig
from enformer_pytorch import Enformer, str_to_one_hot, EnformerConfig
import torch
from torch import nn

class Isoformer(PreTrainedModel):
    config_class = IsoformerConfig

    def __init__(self, config):
        super().__init__(config)

        self.esm_config = EsmConfig(
            vocab_size=config.esm_vocab_size,
            mask_token_id=config.esm_mask_token_id,
            pad_token_id=config.esm_pad_token_id,
            hidden_size=config.esm_hidden_size,
            num_hidden_layers=config.esm_num_hidden_layers,
            num_attention_heads=config.esm_num_attention_heads,
            intermediate_size=config.esm_intermediate_size,
            max_position_embeddings=config.esm_max_position_embeddings,
            token_dropout=config.esm_token_dropout,
            emb_layer_norm_before=config.esm_emb_layer_norm_before,
            attention_probs_dropout_prob=0.0,
            hidden_dropout_prob=0.0,
            use_cache=False,
            add_bias_fnn=config.esm_add_bias_fnn,
            position_embedding_type="rotary",
            tie_word_embeddings=False,
        )

        self.nt_config = NTConfig(
            vocab_size=config.nt_vocab_size,
            mask_token_id=config.nt_mask_token_id,
            pad_token_id=config.nt_pad_token_id,
            hidden_size=config.nt_hidden_size,
            num_hidden_layers=config.nt_num_hidden_layers,
            num_attention_heads=config.nt_num_attention_heads,
            intermediate_size=config.nt_intermediate_size,
            max_position_embeddings=config.nt_max_position_embeddings,
            token_dropout=config.nt_token_dropout,
            emb_layer_norm_before=config.nt_emb_layer_norm_before,
            attention_probs_dropout_prob=0.0,
            hidden_dropout_prob=0.0,
            use_cache=False,
            add_bias_fnn=config.nt_add_bias_fnn,
            position_embedding_type="rotary",
            tie_word_embeddings=False,
        )
        self.config = config

        # self.enformer_config = EnformerConfig(
        #     dim=config.enformer_dim,
        #     depth=config.enformer_depth,
        #     heads=config.enformer_heads,
        #     output_heads=dict(
        #         human=1,
        #         mouse=1  # TODO CHANGE
        #     ),
        #     target_length=config.enformer_target_length,  # 896,
        #     attn_dim_key=config.enformer_attn_dim_key,
        #     dropout_rate=0.4,
        #     attn_dropout=0.05,
        #     pos_dropout=0.01,
        #     use_checkpointing=config.enformer_use_checkpointing,
        #     use_convnext=config.enformer_use_convnext,
        #     num_downsamples=config.enformer_num_downsamples,
        #     # genetic sequence is downsampled 2 ** 7 == 128x in default Enformer - can be changed for higher resolution
        #     dim_divisible_by=config.enformer_dim_divisible_by,
        #     use_tf_gamma=False,
        # )

        self.esm_model = EsmForMaskedLM(self.esm_config)  # protein encoder
        self.nt_model = NTForMaskedLM(self.nt_config)  # rna encoder
        # self.enformer_model = Enformer(self.enformer_config)  # dna encoder
        self.enformer_model = Enformer.from_pretrained("EleutherAI/enformer-official-rough")

        self.cross_attention_layer_rna = MultiHeadAttention(
            config=EsmConfig(
                num_attention_heads=config.num_heads_omics_cross_attention,
                attention_head_size=3072 // config.num_heads_omics_cross_attention,
                hidden_size=3072,
                attention_probs_dropout_prob=0,
                max_position_embeddings=0
            ),
            omics_of_interest_size=3072,
            other_omic_size=768
        )
        self.cross_attention_layer_protein = MultiHeadAttention(
            config=EsmConfig(
                num_attention_heads=config.num_heads_omics_cross_attention,
                attention_head_size=3072 // config.num_heads_omics_cross_attention,
                hidden_size=3072,
                attention_probs_dropout_prob=0,
                max_position_embeddings=0
            ),
            omics_of_interest_size=3072,
            other_omic_size=640
        )

        self.head_layer_1 = nn.Linear(3072, 2 * 3072)
        self.head_layer_2 = nn.Linear(2 * 3072, 30)

    def forward(
        self,
        tensor_dna,
        tensor_rna,
        tensor_protein,
        attention_mask_dna,
        attention_mask_rna,
        attention_mask_protein
    ):
        tensor_dna = tensor_dna[:, 1:]  # remove CLS
        dna_embedding = self.enformer_model(
            tensor_dna,
            return_only_embeddings=True
            # attention_mask=attention_mask_dna,
            # encoder_attention_mask=attention_mask_dna,
            # output_hidden_states=True
        )
        protein_embedding = self.esm_model(
            tensor_protein,
            attention_mask=attention_mask_protein,
            encoder_attention_mask=attention_mask_protein,
            output_hidden_states=True
        )
        rna_embedding = self.nt_model(
            tensor_rna,
            attention_mask=attention_mask_rna,
            encoder_attention_mask=attention_mask_rna,
            output_hidden_states=True
        )

        encoder_attention_mask = torch.unsqueeze(torch.unsqueeze(tensor_rna != 1, 0), 0).repeat(1, 1, dna_embedding.shape[1], 1)
        rna_to_dna = self.cross_attention_layer_rna.forward(
            hidden_states=dna_embedding,
            encoder_hidden_states=rna_embedding["hidden_states"][-1],
            encoder_attention_mask=encoder_attention_mask
        )

        final_dna_embeddings = self.cross_attention_layer_protein.forward(
            hidden_states=rna_to_dna["embeddings"],
            encoder_hidden_states=protein_embedding["hidden_states"][-1],
        )["embeddings"]

        sequence_mask = torch.zeros(final_dna_embeddings.shape[1])
        sequence_mask[self.config.pool_window_start:self.config.pool_window_end] = 1
        x = torch.sum(torch.einsum('ijk,j->ijk', final_dna_embeddings, sequence_mask), axis=1) / torch.sum(sequence_mask)
        x = self.head_layer_1(x)
        x = torch.nn.functional.softplus(x)
        x = self.head_layer_2(x)

        return {
            "gene_expression_predictions": x,
            "rna_to_dna": rna_to_dna,
            "final_embeddings": final_dna_embeddings,
            "dna_embedding": dna_embedding,
            "rna_embedding": rna_embedding,
            "protein_embedding": protein_embedding
        }
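The end of forward() pools the cross-attended DNA embeddings over a window given by config.pool_window_start / config.pool_window_end (these are read from the config but are not explicit IsoformerConfig.__init__ arguments, so they would arrive through **kwargs from config.json), then applies the two-layer softplus head. A standalone sketch of just that readout, with placeholder shapes and window bounds:

import torch
from torch import nn

batch, target_length, dim = 2, 896, 3072           # placeholder shapes
final_dna_embeddings = torch.randn(batch, target_length, dim)
pool_window_start, pool_window_end = 440, 456      # placeholder window bounds

# mask positions outside the window, then average over the window
sequence_mask = torch.zeros(target_length)
sequence_mask[pool_window_start:pool_window_end] = 1
pooled = torch.sum(torch.einsum('ijk,j->ijk', final_dna_embeddings, sequence_mask), axis=1) / torch.sum(sequence_mask)

head_layer_1 = nn.Linear(dim, 2 * dim)
head_layer_2 = nn.Linear(2 * dim, 30)
x = head_layer_2(torch.nn.functional.softplus(head_layer_1(pooled)))
print(x.shape)  # torch.Size([2, 30]) -> one 30-dim gene expression prediction per sequence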
pytorch_model.bin  CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:862bbda2ad7efe88014c38c046c517728295f2a038080c00071570dd20c9c7ac
 size 2803153818
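The updated LFS pointer pins the checkpoint to a specific sha256 digest. A quick integrity check against a locally downloaded copy (the local file path is an assumption):

import hashlib

expected = "862bbda2ad7efe88014c38c046c517728295f2a038080c00071570dd20c9c7ac"
h = hashlib.sha256()
with open("pytorch_model.bin", "rb") as f:          # path to your downloaded checkpoint
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == expected, "checkpoint does not match the LFS pointer"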