Update README.md
Browse files
README.md
CHANGED
@@ -38,12 +38,35 @@ The 4-track model requires only a one-hot encoded sequence of your mRNA. This re
|
|
38 |
Here is example code
|
39 |
```
|
40 |
# Sequence for short mRNA
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
# One hot encode function
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
# Load Orthrus
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
46 |
# Generate embedding
|
|
|
|
|
|
|
47 |
```
|
48 |
|
49 |
#### 6-Track Model (Recommended)
|
@@ -64,14 +87,31 @@ chmod +x starter_build.sh
|
|
64 |
|
65 |
We can now generate six track encodings for any transcript!
|
66 |
```
|
67 |
-
# import six hot encoding function
|
68 |
-
|
69 |
# import Genome, Interval, instantiate Genome
|
|
|
|
|
|
|
|
|
70 |
|
71 |
# Load Orthrus 6 track
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
73 |
# Generate embedding
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
```
|
76 |
|
77 |
-
Alternatively, this information can be extracted from
|
|
|
38 |
Here is example code
|
39 |
```
|
40 |
# Sequence for short mRNA
|
41 |
+
> seq=(
|
42 |
+
'TCATCTGGATTATACATATTTCGCAATGAAAGAGAGGAAGAAAAGGAAGCAGCAAAATATGTGGAGGCCCA'
|
43 |
+
'ACAAAAGAGACTAGAAGCCTTATTCACTAAAATTCAGGAGGAATTTGAAGAACATGAAGTTACTTCCTCC'
|
44 |
+
'ACTGAAGTCTTGAACCCCCCAAAGTCATCCATGAGGGTTGGAATCAACTTCTGAAAACACAACAAAACCA'
|
45 |
+
'TATTTACCATCACGTGCACTAACAAGACAGCAAGTTCGTGCTTTGCAAGATGGTGCAGAGCTTTATGAAG'
|
46 |
+
'CAGTGAAGAATGCAGCAGACCCAGCTTACCTTGAGGGTTATTTCAGTGAAGAGCAGTTAAGAGCCTTGAA'
|
47 |
+
'TAATCACAGGCAAATGTTGAATGATAAGAAACAAGCTCAGATCCAGTTGGAAATTAGGAAGGCCATGGAA'
|
48 |
+
'TCTGCTGAACAAAAGGAACAAGGTTTATCAAGGGATGTCACAACCGTGTGGAAGTTGCGTATTGTAAGCTATTC'
|
49 |
+
)
|
50 |
# One hot encode function
|
51 |
+
> oh = seq_to_oh(seq)
|
52 |
+
> one_hot = seq_to_oh(seq)
|
53 |
+
> one_hot = one_hot.T
|
54 |
+
> torch_one_hot = torch.tensor(one_hot, dtype=torch.float32)
|
55 |
+
> torch_one_hot = torch_one_hot.unsqueeze(0)
|
56 |
+
> print(torch_one_hot.shape)
|
57 |
+
> torch_one_hot = torch_one_hot.to(device='cuda')
|
58 |
+
> lengths = torch.tensor([torch_one_hot.shape[2]]).to(device='cuda')
|
59 |
# Load Orthrus
|
60 |
+
> run_name="orthrus_base_4_track"
|
61 |
+
> checkpoint="epoch=18-step=20000.ckpt"
|
62 |
+
> model_repository="./models"
|
63 |
+
> model = load_model(f"{model_repository}/{run_name}", checkpoint_name=checkpoint)
|
64 |
+
> model = model.to(torch.device('cuda'))
|
65 |
+
> print(model)
|
66 |
# Generate embedding
|
67 |
+
> reps = model.representation(torch_one_hot, lengths)
|
68 |
+
> print(reps.shape)
|
69 |
+
# torch.Size([1, 256])
|
70 |
```
|
71 |
|
72 |
#### 6-Track Model (Recommended)
|
|
|
87 |
|
88 |
We can now generate six track encodings for any transcript!
|
89 |
```
|
|
|
|
|
90 |
# import Genome, Interval, instantiate Genome
|
91 |
+
> genome = Genome("gencode.v29")
|
92 |
+
> interval = Interval("chr7", "+", 117120016, 117120201, genome)
|
93 |
+
> genome.dna(interval)
|
94 |
+
# CTCTTATGCTCGGGTGATCC
|
95 |
|
96 |
# Load Orthrus 6 track
|
97 |
+
> run_name="orthrus_large_6_track"
|
98 |
+
> checkpoint="epoch=22-step=20000.ckpt"
|
99 |
+
> model_repository="./models"
|
100 |
+
> model = load_model(f"{model_repository}/{run_name}", checkpoint_name=checkpoint)
|
101 |
+
> model = model.to(torch.device('cuda'))
|
102 |
+
> print(model)
|
103 |
# Generate embedding
|
104 |
+
> transcripts = find_transcript_by_gene_name(genome, 'BCL2L1')
|
105 |
+
> print(transcripts)
|
106 |
+
> t = transcripts[0]
|
107 |
+
> sixt = create_six_track_encoding(t)
|
108 |
+
> sixt = torch.tensor(sixt, dtype=torch.float32)
|
109 |
+
> sixt = sixt.unsqueeze(0)
|
110 |
+
> sixt = sixt.to(device='cuda')
|
111 |
+
> lengths = torch.tensor([sixt.shape[2]]).to(device='cuda')
|
112 |
+
> embedding = model.representation(sixt, lengths)
|
113 |
+
> print(embedding.shape)
|
114 |
+
# torch.Size([1, 512])
|
115 |
```
|
116 |
|
117 |
+
Alternatively, this information can be extracted from genePred files available for download from the UCSC Genome Browser [here](https://genome.ucsc.edu/cgi-bin/hgTables).
|