Add application file

app.py
ADDED
@@ -0,0 +1,290 @@
# -*- coding: utf-8 -*-
# @Time : 2021/8/17 23:08
# @Author : Cheng Ge
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from numpy import linalg as la
import gradio as gr
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model

def pad_to_length(input_data: list, pad_token, max_length: int) -> list:
    """Right-pad input_data with pad_token up to max_length."""
    assert len(input_data) <= max_length
    return input_data + [pad_token] * (max_length - len(input_data))

def TransDict_from_list(groups):
    """Map each amino acid to the digit of its reduced-alphabet group."""
    tar_list = ['0', '1', '2', '3', '4', '5', '6']
    result = {}
    index = 0
    for group in groups:
        g_members = sorted(group)  # alphabetically sorted group members
        for c in g_members:
            result[c] = tar_list[index]  # every member maps to its group's digit
        index = index + 1
    return result
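# For example, with the groups used below (['AGV', 'ILFPO', 'YMTS', 'HNQW',
# 'RK', 'DEZ', 'CU']) this yields 'A'/'G'/'V' -> '0', 'F'/'I'/'L'/'O'/'P' -> '1',
# ..., 'C'/'U' -> '6', so the protein alphabet collapses to 7 symbols.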

def get_3_protein_trids():
    """Enumerate all 3-mers over the 7-symbol reduced alphabet (7**3 = 343)."""
    nucle_com = []
    chars = ['0', '1', '2', '3', '4', '5', '6']
    base = len(chars)
    end = len(chars) ** 3
    for i in range(0, end):
        n = i
        ch0 = chars[n % base]
        n = n // base
        ch1 = chars[n % base]
        n = n // base
        ch2 = chars[n % base]
        nucle_com.append(ch0 + ch1 + ch2)
    return nucle_com
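# The i-th entry is i written in base 7, least-significant digit first,
# e.g. index 0 -> '000', 1 -> '100', 7 -> '010', 342 -> '666'.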

def translate_sequence(seq, TranslationDict):
    '''
    Given (seq) - a string/sequence to translate,
    translates it into the reduced alphabet, using the translation dict
    provided by the TransDict_from_list() method.
    Returns the string/sequence in the new, reduced alphabet.
    Remember - in Python strings are immutable, so a new string is returned.
    '''
    from_list = []
    to_list = []
    for k, v in TranslationDict.items():
        from_list.append(k)
        to_list.append(v)
    # str.maketrans expects two plain strings of equal length, not list reprs
    TRANS_seq = seq.translate(str.maketrans(''.join(from_list), ''.join(to_list)))
    return TRANS_seq

def get_4_nucleotide_composition(tris, seq, pythoncount=True):
    """3-mer composition feature; this app uses the SVD branch (pythoncount=False)."""
    seq_len = len(seq)
    k = len(tris[0])
    if pythoncount:
        # Plain normalized k-mer counts, one value per entry of tris
        tri_feature = [float(seq.count(val)) / seq_len for val in tris]
    else:
        # Positional occurrence matrix: rows are k-mers, columns are positions
        note_feature = [[0 for cols in range(seq_len - k + 1)] for rows in range(len(tris))]
        for x in range(seq_len + 1 - k):
            kmer = seq[x:x + k]
            if kmer in tris:
                ind = tris.index(kmer)
                note_feature[ind][x] = note_feature[ind][x] + 1
        # Compress the positional matrix with SVD into a fixed-length vector
        u, s, v = la.svd(note_feature)
        tri_feature = [0] * len(tris)
        for i in range(len(s)):
            tri_feature = tri_feature + u[i] * s[i] / seq_len
    return tri_feature
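# Dimensions: with 343 tri-mers and a translated sequence of length L,
# note_feature is 343 x (L - 2), u is 343 x 343, and each u[i] * s[i] term is a
# 343-vector, so tri_feature always comes out 343-dimensional regardless of L.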

def BPF(seq_temp):
    """Binary profile feature: one-hot encode the first 7 residues (7 x 24 = 168 values)."""
    src_vocab = {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8,
                 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15,
                 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, 'O': 21, 'U': 22,
                 'Z': 23, 'X': 24}  # 'X': 24, consistent with the vocabulary in ACP_DL
    Seq1 = [src_vocab[c] for c in seq_temp]
    seq = pad_to_length(Seq1, 0, 7)
    fea = []
    for token in seq:
        tem_vec = [0] * 24
        if token > 0:  # token 0 is padding and stays an all-zero vector
            tem_vec[token - 1] = 1
        fea = fea + tem_vec
    return fea
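# Example: BPF('ACD') returns 168 values, one-hot rows for A, C and D followed
# by four all-zero padding rows.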

def transfer_label_from_prob(proba):
    """Threshold predicted probabilities at 0.5."""
    label = [1 if val >= 0.5 else 0 for val in proba]
    return label

def prepare_feature(file):
    """Read sequences from the uploaded file and build BPF and k-mer features."""
    files = file.name
    protein_seq_dict = {}
    protein_index = 1
    with open(files, 'r') as fp:
        for line in fp:
            seq = line.strip()
            if not seq or seq.startswith('>'):  # skip blank lines and FASTA headers
                continue
            protein_seq_dict[protein_index] = seq
            protein_index = protein_index + 1

    groups = ['AGV', 'ILFPO', 'YMTS', 'HNQW', 'RK', 'DEZ', 'CU']
    group_dict = TransDict_from_list(groups)
    protein_tris = get_3_protein_trids()
    bpf = []
    kmer = []
    sequence = []

    for i in protein_seq_dict:
        protein_seq = translate_sequence(protein_seq_dict[i], group_dict)
        # BPF only looks at the first 7 residues of the raw sequence
        bpf_feature = BPF(protein_seq_dict[i][0:7])
        protein_tri_fea = get_4_nucleotide_composition(protein_tris, protein_seq, pythoncount=False)

        bpf.append(bpf_feature)
        kmer.append(protein_tri_fea)
        sequence.append(protein_seq_dict[i])
    return np.array(bpf), np.array(kmer), np.array(sequence)

class TransformerBlock(layers.Layer):
    """Standard post-norm Transformer encoder block."""

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of learned token embeddings and learned position embeddings."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

def ACP_DL(file):
    data_dim = 511        # 168 BPF values + 343 k-mer values
    timesteps = 1
    len_seq_max = 18
    bpf, kmer, sequence = prepare_feature(file)

    # Integer-encode each raw sequence, truncated/padded to len_seq_max tokens
    src_vocab = {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8,
                 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15,
                 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, 'O': 21, 'U': 22,
                 'Z': 23, 'X': 24}
    Seq2 = []
    for m in sequence:
        Seq1 = [src_vocab[c] for c in m]
        Seq1 = Seq1[0:len_seq_max]
        seq = pad_to_length(Seq1, 0, len_seq_max)
        Seq2.append(seq)
    Seq2 = np.array(Seq2)

    X = np.concatenate((bpf, kmer), axis=1)
    X = np.reshape(X, (len(X), timesteps, data_dim))
    test1 = np.array(X)
    test2 = np.array(Seq2)

    # Transformer branch over the integer-encoded sequence
    Transformer_input = tf.keras.Input(shape=(len_seq_max,))
    embedding_layer = TokenAndPositionEmbedding(len_seq_max, 25, 32)
    x = embedding_layer(Transformer_input)
    transformer_block = TransformerBlock(32, 8, 32)
    x = transformer_block(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(20, activation="relu")(x)
    x = layers.Dropout(0.1)(x)
    Transformer_output = layers.Dense(256, activation="relu")(x)

    # LSTM branch over the 511-dimensional handcrafted feature vector
    lstm_input = tf.keras.Input(shape=(1, 511), name="lstm_input")
    x = layers.LSTM(128, return_sequences=False)(lstm_input)
    lstm_output = layers.Dense(1, activation="relu")(x)

    # Fuse both branches and classify
    output = layers.concatenate([Transformer_output, lstm_output])
    outputss = layers.Dense(1, activation="sigmoid")(output)
    model = Model(
        inputs={'Transformer_input': Transformer_input, 'lstm_input': lstm_input},
        outputs=outputss,
    )

    model.load_weights("AMP_818.h5")
    # Feed the inputs with the same dict structure the model was built with
    proba = model.predict({'Transformer_input': test2, 'lstm_input': test1}).flatten()
    proba0 = (1 - proba) * 100
    proba1 = proba * 100
    labels = transfer_label_from_prob(proba)

    # 'w' rather than 'a' so repeated runs do not append to stale results
    with open('output.txt', 'w') as f:
        for i in range(len(labels)):
            if labels[i] == 0:
                print(sequence[i], "Non-AMP", "%.3f%%" % proba0[i], file=f)
            else:
                print(sequence[i], "AMP", "%.3f%%" % proba1[i], file=f)
    return 'output.txt'


iface = gr.Interface(fn=ACP_DL,
                     inputs=[gr.File(label="input fasta")],
                     outputs=gr.File(label="download txt"))
iface.launch()
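
For a quick check outside the Gradio UI, ACP_DL can be driven directly. A minimal sketch, assuming the pretrained AMP_818.h5 weights sit next to app.py; the file name and peptide below are illustrative only:

# Hypothetical local smoke test; demo_input.txt and the peptide are made up.
from types import SimpleNamespace

with open("demo_input.txt", "w") as fh:
    fh.write("GLFDIVKKVVGALGSL\n")  # one sequence per line; FASTA headers are skipped

result_path = ACP_DL(SimpleNamespace(name="demo_input.txt"))  # mimics gr.File's .name attribute
print(open(result_path).read())

On a Space, the dependencies these imports pull in (gradio, tensorflow, numpy) must also be listed in requirements.txt, or the app will fail at startup.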