oucgc1996 committed
Commit
0242109
1 Parent(s): 8fed8a8

Add application file

Files changed (1)
  1. app.py +290 -0
app.py ADDED
@@ -0,0 +1,290 @@
+ # -*- coding: utf-8 -*-
+ # @Time : 2021/8/17 23:08
+ # @Author : Cheng Ge
+ import warnings
+ warnings.filterwarnings('ignore')
+ 
+ import gradio as gr
+ import numpy as np
+ from numpy import linalg as la
+ import tensorflow as tf
+ from tensorflow import keras
+ from tensorflow.keras import layers
+ from tensorflow.keras.models import Model
+ 
+ def pad_to_length(input_data: list, pad_token, max_length: int) -> list:
+     """Right-pad input_data with pad_token up to max_length."""
+     assert len(input_data) <= max_length
+     result = input_data[:]
+     for i in range(max_length - len(result)):
+         result.append(pad_token)
+     return result
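+ # Illustrative example (not in the original file):
+ #   pad_to_length([3, 1, 4], 0, 7) -> [3, 1, 4, 0, 0, 0, 0]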
+ def TransDict_from_list(groups):
+     """Map every residue to the index digit ('0'-'6') of its group."""
+     tar_list = ['0', '1', '2', '3', '4', '5', '6']
+     result = {}
+     index = 0
+     for group in groups:
+         g_members = sorted(group)  # alphabetically sorted residues in this group
+         for c in g_members:
+             result[c] = str(tar_list[index])  # residue -> group index digit
+         index = index + 1
+     return result
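+ # With the groups used below, ['AGV', 'ILFPO', 'YMTS', 'HNQW', 'RK', 'DEZ', 'CU'],
+ # this yields {'A': '0', 'G': '0', 'V': '0', 'F': '1', 'I': '1', ..., 'C': '6', 'U': '6'}:
+ # a 7-letter reduced alphabet over the digits '0'-'6'.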
+ def get_3_protein_trids():
+     """Enumerate all 7**3 = 343 tri-grams over the reduced alphabet '0'-'6'."""
+     nucle_com = []
+     chars = ['0', '1', '2', '3', '4', '5', '6']
+     base = len(chars)
+     end = len(chars) ** 3
+     for i in range(0, end):
+         n = i
+         ch0 = chars[n % base]
+         n = n // base
+         ch1 = chars[n % base]
+         n = n // base
+         ch2 = chars[n % base]
+         nucle_com.append(ch0 + ch1 + ch2)
+     return nucle_com
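+ # Enumerates i = 0..342 in base 7, least-significant digit first:
+ # '000', '100', '200', ..., '566', '666'.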
+ def translate_sequence(seq, TranslationDict):
+     '''
+     Given a string/sequence to translate, translate it into the reduced
+     alphabet using the translation dict provided by TransDict_from_list().
+     Returns the string/sequence in the new, reduced alphabet.
+     Remember: in Python, strings are immutable.
+     '''
+     # str.maketrans accepts a dict of single-character keys directly.
+     TRANS_seq = seq.translate(str.maketrans(TranslationDict))
+     return TRANS_seq
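+ # Illustrative example, computed from the groups defined in prepare_feature():
+ #   translate_sequence('ACDK', group_dict) -> '0654'
+ #   (A is in group 0, C in group 6, D in group 5, K in group 4)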
+ def get_4_nucleotide_composition(tris, seq, pythoncount=True):
+     seq_len = len(seq)
+     tri_feature = [0] * len(tris)
+     k = len(tris[0])
+     note_feature = [[0 for cols in range(len(seq) - k + 1)] for rows in range(len(tris))]
+     if pythoncount:
+         # Plain k-mer frequencies.
+         tri_feature = [float(seq.count(val)) / seq_len for val in tris]
+     else:
+         # Occurrence matrix: one row per tri-gram, one column per window position.
+         for x in range(len(seq) + 1 - k):
+             kmer = seq[x:x + k]
+             if kmer in tris:
+                 ind = tris.index(kmer)
+                 note_feature[ind][x] = note_feature[ind][x] + 1
+         # Collapse the positional dimension with an SVD of the occurrence matrix.
+         u, s, v = la.svd(note_feature)
+         for i in range(len(s)):
+             tri_feature = tri_feature + u[i] * s[i] / seq_len
+     return tri_feature
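+ # Derived from the code above: note_feature is the 343 x (L-2) occurrence
+ # matrix of a length-L translated sequence; after the SVD the feature is
+ # sum_i s[i] * u[i] / L (u[i] being the i-th row of the left singular matrix),
+ # giving a fixed 343-dim vector regardless of sequence length.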
+ def BPF(seq_temp):
+     """Binary profile feature: one-hot encode (up to) the first 7 residues."""
+     sequences = seq_temp
+     # 'X' shares code 23 with 'Z' in this encoding.
+     src_vocab = {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8,
+                  'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15,
+                  'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, 'O': 21, 'U': 22,
+                  'Z': 23, 'X': 23}
+     Seq1 = []
+     for i in range(len(sequences)):
+         Seq1.append(src_vocab[sequences[i]])
+     seq = pad_to_length(Seq1, 0, 7)
+     fea = []
+     for code in seq:
+         # 24-dim one-hot; the pad token 0 maps to the all-zero vector.
+         tem_vec = [0] * 24
+         if code > 0:
+             tem_vec[code - 1] = 1
+         fea = fea + tem_vec
+     return fea
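+ # BPF output length is fixed: 7 positions x 24 channels = 168 dims.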
+ def transfer_label_from_prob(proba):
+     label = [1 if val >= 0.5 else 0 for val in proba]
+     return label
+ def prepare_feature(file):
+     files = file.name
+     protein_seq_dict = {}
+     protein_index = 1
+     with open(files, 'r') as fp:
+         for line in fp:
+             seq = line.strip()
+             if not seq or seq.startswith('>'):
+                 # Skip blank lines and FASTA header lines.
+                 continue
+             protein_seq_dict[protein_index] = seq
+             protein_index = protein_index + 1
+ 
+     groups = ['AGV', 'ILFPO', 'YMTS', 'HNQW', 'RK', 'DEZ', 'CU']
+     group_dict = TransDict_from_list(groups)
+     protein_tris = get_3_protein_trids()
+     bpf = []
+     kmer = []
+     sequence = []
+ 
+     for i in protein_seq_dict:
+         protein_seq = translate_sequence(protein_seq_dict[i], group_dict)
+         if len(protein_seq_dict[i]) > 7:
+             aaa = protein_seq_dict[i][0:7]
+             bpf_feature = BPF(aaa)
+         else:
+             bpf_feature = BPF(protein_seq_dict[i])
+         protein_tri_fea = get_4_nucleotide_composition(protein_tris, protein_seq, pythoncount=False)
+ 
+         bpf.append(bpf_feature)
+         kmer.append(protein_tri_fea)
+         sequence.append(protein_seq_dict[i])
+     return np.array(bpf), np.array(kmer), np.array(sequence)
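+ # Per sequence this yields a 168-dim BPF vector and a 343-dim tri-gram vector;
+ # ACP_DL below concatenates them into the 511-dim feature fed to the LSTM branch.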
+ 
+ class TransformerBlock(layers.Layer):
+     def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
+         super(TransformerBlock, self).__init__()
+         self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
+         self.ffn = keras.Sequential(
+             [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
+         )
+         self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
+         self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
+         self.dropout1 = layers.Dropout(rate)
+         self.dropout2 = layers.Dropout(rate)
+ 
+     def call(self, inputs, training=None):
+         attn_output = self.att(inputs, inputs)
+         attn_output = self.dropout1(attn_output, training=training)
+         out1 = self.layernorm1(inputs + attn_output)
+         ffn_output = self.ffn(out1)
+         ffn_output = self.dropout2(ffn_output, training=training)
+         return self.layernorm2(out1 + ffn_output)
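+ # A standard post-LayerNorm Transformer encoder block (self-attention with a
+ # residual connection, then a two-layer feed-forward network with a residual);
+ # it apparently mirrors the Keras "Text classification with Transformer" example.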
+ class TokenAndPositionEmbedding(layers.Layer):
+     def __init__(self, maxlen, vocab_size, embed_dim):
+         super(TokenAndPositionEmbedding, self).__init__()
+         self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
+         self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
+ 
+     def call(self, x):
+         # Learned positional embeddings added to the token embeddings.
+         maxlen = tf.shape(x)[-1]
+         positions = tf.range(start=0, limit=maxlen, delta=1)
+         positions = self.pos_emb(positions)
+         x = self.token_emb(x)
+         return x + positions
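+ # Instantiated below with vocab_size=25 (24 residue codes + the 0 pad token)
+ # and embed_dim=32, on sequences truncated/padded to len_seq_max=18 tokens.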
+ 
+ def ACP_DL(file):
+     data_dim = 511   # 168-dim BPF + 343-dim tri-gram feature
+     timesteps = 1
+     len_seq_max = 18
+     bpf, kmer, sequence = prepare_feature(file)
+     src_vocab = {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8,
+                  'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15,
+                  'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, 'O': 21, 'U': 22,
+                  'Z': 23, 'X': 24}
+     Seq2 = []
+ 
+     for m in sequence:
+         # Integer-encode each residue, truncate to len_seq_max, pad with 0.
+         Seq1 = [src_vocab[subq] for subq in m]
+         if len(Seq1) > len_seq_max:
+             Seq1 = Seq1[0:len_seq_max]
+         seq = pad_to_length(Seq1, 0, len_seq_max)
+         Seq2.append(seq)
+     Seq2 = np.array(Seq2)
+ 
+     X = np.concatenate((bpf, kmer), axis=1)
+     X = np.reshape(X, (len(X), timesteps, data_dim))
+     test1 = np.array(X)
+     test2 = np.array(Seq2)
+ 
+     # Branch 1: Transformer encoder over the integer-encoded sequence.
+     Transformer_input = tf.keras.Input(shape=(len_seq_max,), name="Transformer_input")
+     embedding_layer = TokenAndPositionEmbedding(len_seq_max, 25, 32)
+     x = embedding_layer(Transformer_input)
+     transformer_block = TransformerBlock(32, 8, 32)
+     x = transformer_block(x)
+     x = layers.GlobalAveragePooling1D()(x)
+     x = layers.Dropout(0.1)(x)
+     x = layers.Dense(20, activation="relu")(x)
+     x = layers.Dropout(0.1)(x)
+     Transformer_output = layers.Dense(256, activation="relu")(x)
+ 
+     # Branch 2: LSTM over the 511-dim handcrafted feature vector.
+     lstm_input = tf.keras.Input(shape=(1, 511), name="lstm_input")
+     x = layers.LSTM(128, return_sequences=False)(lstm_input)
+     lstm_output = layers.Dense(1, activation="relu")(x)
+     output = layers.concatenate([Transformer_output, lstm_output])
+     outputss = layers.Dense(1, activation="sigmoid")(output)
+     model = Model(
+         inputs={'Transformer_input': Transformer_input, 'lstm_input': lstm_input},
+         outputs=outputss,
+     )
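+     # The two branch outputs (256 + 1 dims) are concatenated and mapped to a
+     # single sigmoid probability; the pretrained weights are loaded from
+     # AMP_818.h5 just below.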
+ 
+     model.load_weights(filepath="AMP_818.h5", by_name=False, skip_mismatch=False, options=None)
+     # Feed each branch by input name rather than relying on list ordering.
+     proba = model.predict({'Transformer_input': test2, 'lstm_input': test1}).ravel()
+     proba0 = (1 - proba) * 100
+     proba1 = proba * 100
+     label = transfer_label_from_prob(proba)
+ 
+     # Overwrite (rather than append to) the output file on each request.
+     with open('output.txt', 'w') as f:
+         for i in range(len(label)):
+             if label[i] == 0:
+                 print(sequence[i], "Non-AMP", "%.3f%%" % proba0[i], file=f)
+             else:
+                 print(sequence[i], "AMP", "%.3f%%" % proba1[i], file=f)
+     return 'output.txt'
+ 
+ 
+ iface = gr.Interface(fn=ACP_DL,
+                      inputs=[gr.File(label="input fasta")],
+                      outputs=gr.File(label="download txt"))
+ iface.launch()
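+ # Expected input: a plain-text/FASTA file with one amino-acid sequence per line;
+ # for each sequence the app writes "<sequence> <AMP|Non-AMP> <confidence%>" to
+ # output.txt and returns it for download.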