# -*- coding: utf-8 -*-
# @Time : 2021/8/17 23:08
# @Author : Cheng Ge
import warnings
warnings.filterwarnings('ignore')
import gradio as gr
import numpy as np
from numpy import linalg as la
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
def pad_to_length(input_data: list, pad_token, max_length: int) -> list:
assert len(input_data) <= max_length
result = input_data[:]
for i in range(max_length - len(result)):
result.append(pad_token)
return result
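# Illustrative usage (made-up values): pad_to_length([1, 2, 3], 0, 5) -> [1, 2, 3, 0, 0]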
def TransDict_from_list(groups):
    '''Map every residue in each group to that group's index ('0'..'6').'''
    tar_list = ['0', '1', '2', '3', '4', '5', '6']
    result = {}
    index = 0
    for group in groups:
        g_members = sorted(group)  # alphabetically sorted residues of this group
        for c in g_members:
            result[c] = str(tar_list[index])  # each member maps to the group's index
        index = index + 1
    return result
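# For the groups used in prepare_feature() below this yields, e.g.,
# {'A': '0', 'G': '0', 'V': '0', 'F': '1', 'I': '1', ..., 'C': '6', 'U': '6'}.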
def get_3_protein_trids():
    '''Enumerate all 7**3 = 343 possible 3-mers over the reduced alphabet '0'..'6'.'''
    nucle_com = []
    chars = ['0', '1', '2', '3', '4', '5', '6']
    base = len(chars)
    end = len(chars) ** 3
    for i in range(0, end):
        n = i
        ch0 = chars[n % base]  # least-significant digit first
        n = n // base
        ch1 = chars[n % base]
        n = n // base
        ch2 = chars[n % base]
        nucle_com.append(ch0 + ch1 + ch2)
    return nucle_com
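# Returns ['000', '100', '200', ..., '566', '666'] -- 343 entries in total.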
def translate_sequence(seq, TranslationDict):
    '''
    Given (seq) - a string/sequence to translate,
    translates it into the reduced alphabet, using the translation dict
    produced by TransDict_from_list().
    Returns the string/sequence in the new, reduced alphabet.
    Remember - in Python strings are immutable.
    '''
    # str.maketrans accepts a dict of single-character keys directly.
    TRANS_seq = seq.translate(str.maketrans(TranslationDict))
    return TRANS_seq
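# Example with the 7-group dict built in prepare_feature():
#   translate_sequence('MKAC', group_dict) -> '2406'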
def get_4_nucleotide_composition(tris, seq, pythoncount=True):
    seq_len = len(seq)
    k = len(tris[0])
    # Occurrence matrix: one row per k-mer, one column per position in seq.
    note_feature = [[0 for cols in range(len(seq) - k + 1)] for rows in range(len(tris))]
    if pythoncount:
        # Plain normalized k-mer counts.
        tri_feature = []
        for val in tris:
            num = seq.count(val)
            tri_feature.append(float(num) / seq_len)
    else:
        tri_feature = [0] * len(tris)
        for x in range(len(seq) + 1 - k):
            kmer = seq[x:x + k]
            if kmer in tris:
                ind = tris.index(kmer)
                note_feature[ind][x] = note_feature[ind][x] + 1
        # Compress the positional occurrence matrix with SVD: sum the rows of U
        # weighted by the singular values, normalized by sequence length.
        u, s, v = la.svd(note_feature)
        for i in range(len(s)):
            tri_feature = tri_feature + u[i] * s[i] / seq_len
    return tri_feature
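# With pythoncount=False (the mode used by prepare_feature() below), the result
# is a 343-dim vector: an SVD-compressed profile of 3-mer positions in seq.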
def BPF(seq_temp):
    '''Binary profile feature: one-hot encode up to 7 residues (24 dims each).'''
    src_vocab = {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8,
                 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15,
                 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, 'O': 21, 'U': 22,
                 'Z': 23, 'X': 23}  # 'X' shares code 23 with 'Z' in this encoding
    Seq1 = [src_vocab[c] for c in seq_temp]
    seq = pad_to_length(Seq1, 0, 7)
    fea = []
    for code in seq:
        tem_vec = [0] * 24
        if code > 0:  # code 0 is padding and stays an all-zero vector
            tem_vec[code - 1] = 1
        fea = fea + tem_vec
    return fea
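# Example: BPF('MKAC') returns a flat list of 7 * 24 = 168 values -- four
# one-hot vectors for M, K, A, C followed by three all-zero padding vectors.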
def transfer_label_from_prob(proba):
label = [1 if val >= 0.5 else 0 for val in proba]
return label
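# e.g. transfer_label_from_prob([0.91, 0.23, 0.50]) -> [1, 0, 1]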
def prepare_feature(file):
    files = file.name
    protein_seq_dict = {}
    protein_index = 1
    with open(files, 'r') as fp:
        for line in fp:
            seq = line.strip()
            if not seq or seq.startswith('>'):  # skip blank lines and FASTA headers
                continue
            protein_seq_dict[protein_index] = seq
            protein_index = protein_index + 1
    groups = ['AGV', 'ILFPO', 'YMTS', 'HNQW', 'RK', 'DEZ', 'CU']
    group_dict = TransDict_from_list(groups)
    protein_tris = get_3_protein_trids()
    bpf = []
    kmer = []
    sequence = []
    for i in protein_seq_dict:
        protein_seq = translate_sequence(protein_seq_dict[i], group_dict)
        if len(protein_seq_dict[i]) > 7:
            bpf_feature = BPF(protein_seq_dict[i][0:7])  # first 7 residues only
        else:
            bpf_feature = BPF(protein_seq_dict[i])
        protein_tri_fea = get_4_nucleotide_composition(protein_tris, protein_seq, pythoncount=False)
        bpf.append(bpf_feature)
        kmer.append(protein_tri_fea)
        sequence.append(protein_seq_dict[i])
    return np.array(bpf), np.array(kmer), np.array(sequence)
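# For N input peptides this returns arrays of shape (N, 168), (N, 343) and (N,):
# binary-profile features, 3-mer SVD features, and the raw sequences.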
class TransformerBlock(layers.Layer):
def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
super(TransformerBlock, self).__init__()
self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
self.ffn = keras.Sequential(
[layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim), ]
)
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = layers.Dropout(rate)
self.dropout2 = layers.Dropout(rate)
    def call(self, inputs, training=None):
attn_output = self.att(inputs, inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
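# Shape-preserving: a (batch, seq_len, embed_dim) tensor goes in and comes out,
# e.g. TransformerBlock(32, 8, 32) maps (None, 18, 32) -> (None, 18, 32).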
class TokenAndPositionEmbedding(layers.Layer):
def __init__(self, maxlen, vocab_size, embed_dim):
super(TokenAndPositionEmbedding, self).__init__()
self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
positions = tf.range(start=0, limit=maxlen, delta=1)
positions = self.pos_emb(positions)
x = self.token_emb(x)
return x + positions
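# Maps integer token ids (batch, maxlen) to summed token + position embeddings,
# e.g. (None, 18) -> (None, 18, 32) with the sizes used in ACP_DL() below.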
def ACP_DL(file):
data_dim = 511
timesteps = 1
len_seq_max = 18
bpf, kmer, sequence = prepare_feature(file)
    Seq2 = []
    # Note: unlike BPF(), this vocabulary gives 'X' its own code (24).
    src_vocab = {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8,
                 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15,
                 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, 'O': 21, 'U': 22,
                 'Z': 23, 'X': 24}
    for m in sequence:
        Seq1 = [src_vocab[c] for c in m]
        if len(Seq1) > len_seq_max:
            Seq1 = Seq1[0:len_seq_max]  # truncate long peptides to 18 residues
        seq = pad_to_length(Seq1, 0, len_seq_max)
        Seq2.append(seq)
    Seq2 = np.array(Seq2)
    # 511 features per peptide: 168 binary-profile + 343 k-mer SVD features.
    X = np.concatenate((bpf, kmer), axis=1)
    X = np.reshape(X, (len(X), timesteps, data_dim))
    test1 = np.array(X)
    test2 = np.array(Seq2)
Transformer_input = tf.keras.Input(shape=(len_seq_max,))
embedding_layer = TokenAndPositionEmbedding(len_seq_max, 25, 32)
x = embedding_layer(Transformer_input)
transformer_block = TransformerBlock(32, 8, 32)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
Transformer_output = layers.Dense(256, activation="relu")(x)
lstm_input = tf.keras.Input(shape=(1, 511), name="lstm_input")
x = layers.LSTM(128, return_sequences=False)(lstm_input)
lstm_output = layers.Dense(1, activation="relu")(x)
output = layers.concatenate([Transformer_output, lstm_output])
outputss = layers.Dense(1, activation="sigmoid")(output)
model = Model(
inputs={'Transformer_input': Transformer_input, 'lstm_input': lstm_input},
outputs=outputss,
)
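    # Two-branch model: an 18-token Transformer encoder over the raw sequence
    # and a single-step LSTM over the 511 hand-crafted features, concatenated
    # into one sigmoid output.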
    model.load_weights("AMP_818.h5")
    proba = model.predict([test2, test1]).flatten()
    proba0 = (1 - proba) * 100  # confidence of the Non-AMP call
    proba1 = proba * 100        # confidence of the AMP call
    label = transfer_label_from_prob(proba)
    with open('output.txt', 'w') as f:
        for i in range(len(label)):
            if label[i] == 0:
                print(sequence[i], "Non-AMP", "%.3f%%" % proba0[i], file=f)
            else:
                print(sequence[i], "AMP", "%.3f%%" % proba1[i], file=f)
    return 'output.txt'
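# Expected input: a plain-text or FASTA file with one peptide per line; header
# lines starting with '>' and blank lines are ignored. Each peptide is written
# to output.txt as "<sequence> AMP|Non-AMP <confidence%>".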
iface = gr.Interface(fn=ACP_DL,
                     inputs=[gr.File(label="input fasta")],
                     outputs=gr.File(label="download txt"))
iface.launch()