isoformer-anonymous
committed on
Commit
•
8b8a26d
1
Parent(s):
dae1abd
Create README.md
Browse files
README.md
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
|
5 |
+
# Load the tokenizer and the model from the Hugging Face Hub.
# NOTE: trust_remote_code=True lets transformers execute the custom Python
# code shipped in the model repository; only enable it for repositories
# you trust.
tokenizer = AutoTokenizer.from_pretrained("isoformer-anonymous/Isoformer", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("isoformer-anonymous/Isoformer", trust_remote_code=True)
|
8 |
+
|
9 |
+
# Toy protein and RNA inputs: a short motif repeated 9 times, one sequence each.
protein_sequences = ["RSRSRSRSRSRSRSRSRSRSRL" * 9]
rna_sequences = ["ATTCCGGTTTTCA" * 9]

# Number of characters in the randomly generated DNA sequence.
sequence_length = 196_608
# Fixed seed so the random DNA sequence is identical on every run.
rng = np.random.default_rng(seed=0)
# Draw each position uniformly from the alphabet A/T/C/G/N and join the
# draws into a single string.
dna_sequences = ["".join(rng.choice(list("ATCGN"), size=(sequence_length,)))]
|
14 |
+
|
15 |
+
# Tokenize the DNA, RNA and protein sequences in a single call.
torch_tokens = tokenizer(
    dna_input=dna_sequences,
    rna_input=rna_sequences,
    protein_input=protein_sequences,
)
# The tokenizer output is indexed by position here: entry 0 is used for DNA,
# entry 1 for RNA and entry 2 for protein. Each entry's "input_ids" is
# converted to a torch tensor.
dna_torch_tokens = torch.tensor(torch_tokens[0]["input_ids"])
rna_torch_tokens = torch.tensor(torch_tokens[1]["input_ids"])
protein_torch_tokens = torch.tensor(torch_tokens[2]["input_ids"])
|
21 |
+
|
22 |
+
# Inference only: disable autograd so no computation graph is retained for
# the very long DNA input.
with torch.no_grad():
    # Call the module itself rather than model.forward(...) so that any
    # hooks registered on the module are run.
    torch_output = model(
        tensor_dna=dna_torch_tokens,
        tensor_rna=rna_torch_tokens,
        tensor_protein=protein_torch_tokens,
        # NOTE(review): 1 is presumably the padding token id of the RNA and
        # protein tokenizers, so the masks ignore padded positions -- confirm
        # against the tokenizer configuration.
        attention_mask_rna=rna_torch_tokens != 1,
        attention_mask_protein=protein_torch_tokens != 1,
    )

# Report the two outputs read from the model's result.
print(f"Gene expression predictions: {torch_output['gene_expression_predictions']}")
print(f"Final DNA embedding: {torch_output['final_dna_embeddings']}")
|