Commit 16bd580
Parent(s): 6af3edb
work towards gen emb

Files changed:
- .gitignore (+2, -1)
- app.py (+9, -3)
- ps4_data/get_embeddings.py (+110, -0)
.gitignore CHANGED
@@ -1,2 +1,3 @@
 .DS_Store
-.idea/
+.idea/
+ps4_data/__pycache__/
app.py CHANGED
@@ -1,11 +1,17 @@
 import gradio as gr
 from ps4_models.classifiers import *
+from ps4_data.get_embeddings import generate_embedings


-def pred(
+def pred(residue_seq):
+    generate_embedings(residue_seq)
     model = PS4_Mega()
-    return "Hello " +
+    return "Hello " + residue_seq + "!!"


-iface = gr.Interface(fn=pred,
+iface = gr.Interface(fn=pred, title="Protein Secondary Structure Prediction with PS4-Mega",
+                     inputs="text", outputs="text", examples=[
+                         ["HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA"],
+                         ["AHKLFIGGLPNYLNDDQVKELLTSFGPLKAFNLVKDSATGLSKGYAFCEYVDINVTDQAIAGLNGMQLGDKKLLVQRASVGAKNA"]
+                     ])
 iface.launch()
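A design note on the new pred(): both generate_embedings() (which loads the ProtT5 encoder) and PS4_Mega() run inside the request handler, so the heavy objects are rebuilt on every call. A common Gradio pattern is to construct them once at module load and only run inference per request; the following is a hedged sketch of that restructuring, not something this commit contains:

import gradio as gr
from ps4_models.classifiers import *
from ps4_data.get_embeddings import generate_embedings

# Hypothetical restructuring (not in this commit): build the classifier once at
# import time so each request only pays for embedding generation and inference.
model = PS4_Mega()

def pred(residue_seq):
    generate_embedings(residue_seq)  # still loads ProtT5 per call; could be cached the same way
    return "Hello " + residue_seq + "!!"

iface = gr.Interface(fn=pred, inputs="text", outputs="text")
iface.launch()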
ps4_data/get_embeddings.py ADDED
@@ -0,0 +1,110 @@
+from transformers import T5EncoderModel, T5Tokenizer
+import torch
+import numpy as np
+import time
+import os
+
+
+def generate_embedings(input_seq, output_path=None):
+
+    # Create directories
+    protT5_path = "ps4_data/data/protT5"
+    # where to store the embeddings
+    per_residue_path = "ps4_data/data/protT5/output/per_residue_embeddings" if output_path is None else output_path
+    for dir_path in [protT5_path, per_residue_path]:
+        __create_dir(dir_path)
+
+    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+    print("Using {}".format(device))
+
+    # Load the encoder part of ProtT5-XL-U50 in half-precision (recommended)
+    model, tokenizer = __get_T5_model(device)
+
+    # Load fasta.
+    all_seqs = {"0": input_seq}
+
+    chunk_size = 1000
+
+    # Compute embeddings and/or secondary structure predictions
+    for i in range(0, len(all_seqs), chunk_size):
+        keys = list(all_seqs.keys())[i: chunk_size + i]
+        seqs = {k: all_seqs[k] for k in keys}
+        results = __get_embeddings(model, tokenizer, seqs, device)
+
+        # Store per-residue embeddings
+        __save_embeddings(results["residue_embs"], per_residue_path + f"{i}.npz")
+
+
+def __get_T5_model(device):
+
+    model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc")
+    model = model.to(device)  # move model to GPU
+    model = model.eval()  # set model to evaluation mode
+    tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)
+
+    return model, tokenizer
+
+
+def __save_embeddings(emb_dict, out_path):
+    np.savez_compressed(out_path, **emb_dict)
+
+
+def __get_embeddings(model, tokenizer, seqs, device, per_residue=True,
+                     max_residues=4000, max_seq_len=1000, max_batch=100):
+
+    results = {"residue_embs": dict(),
+               "protein_embs": dict(),
+               "sec_structs": dict()
+               }
+
+    # sort sequences according to length (reduces unnecessary padding --> speeds up embedding)
+    seq_dict = sorted(seqs.items(), key=lambda kv: len(seqs[kv[0]]), reverse=True)
+    start = time.time()
+    batch = list()
+    for seq_idx, (pdb_id, seq) in enumerate(seq_dict, 1):
+        seq = seq
+        seq_len = len(seq)
+        seq = ' '.join(list(seq))
+        batch.append((pdb_id, seq, seq_len))
+
+        # count residues in current batch and add the last sequence length to
+        # avoid that batches with (n_res_batch > max_residues) get processed
+        n_res_batch = sum([s_len for _, _, s_len in batch]) + seq_len
+        if len(batch) >= max_batch or n_res_batch >= max_residues or seq_idx == len(seq_dict) or seq_len > max_seq_len:
+            pdb_ids, seqs, seq_lens = zip(*batch)
+            batch = list()
+
+            # add_special_tokens adds extra token at the end of each sequence
+            token_encoding = tokenizer.batch_encode_plus(seqs, add_special_tokens=True, padding="longest")
+            input_ids = torch.tensor(token_encoding['input_ids']).to(device)
+            attention_mask = torch.tensor(token_encoding['attention_mask']).to(device)
+
+            try:
+                with torch.no_grad():
+                    # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
+                    embedding_repr = model(input_ids, attention_mask=attention_mask)
+            except RuntimeError:
+                print("RuntimeError during embedding for {} (L={})".format(pdb_id, seq_len))
+                continue
+
+            for batch_idx, identifier in enumerate(pdb_ids):  # for each protein in the current mini-batch
+                s_len = seq_lens[batch_idx]
+                # slice off padding --> batch-size x seq_len x embedding_dim
+                emb = embedding_repr.last_hidden_state[batch_idx, :s_len]
+                if per_residue:  # store per-residue embeddings (Lx1024)
+                    results["residue_embs"][identifier] = emb.detach().cpu().numpy().squeeze()
+    print("emb_count:", len(results["residue_embs"]))
+
+    passed_time = time.time() - start
+    avg_time = passed_time / len(results["residue_embs"]) if per_residue else passed_time / len(results["protein_embs"])
+    print('\n############# EMBEDDING STATS #############')
+    print('Total number of per-residue embeddings: {}'.format(len(results["residue_embs"])))
+    print("Time for generating embeddings: {:.1f}[m] ({:.3f}[s/protein])".format(
+        passed_time / 60, avg_time))
+    print('\n############# END #############')
+    return results
+
+
+def __create_dir(path):
+    if not os.path.exists(path):
+        os.makedirs(path)
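For reference, a hedged usage sketch of the new module: generate_embedings() writes the per-residue ProtT5 embeddings for the single input sequence to a compressed NumPy archive, which can be read back with np.load. The output path and the "0" key follow the defaults in the code above (output_path=None and the single-entry all_seqs dict); the ProtT5 weights are downloaded from the Hugging Face Hub on first use.

import numpy as np
from ps4_data.get_embeddings import generate_embedings

seq = "HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA"
generate_embedings(seq)

# With the defaults, the first (and only) chunk is written as
# ps4_data/data/protT5/output/per_residue_embeddings0.npz, keyed by "0".
archive = np.load("ps4_data/data/protT5/output/per_residue_embeddings0.npz")
emb = archive["0"]
print(emb.shape)  # expected (len(seq), 1024) per-residue ProtT5 embeddings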