# -*- coding: utf-8 -*-
# @Time : 2021/8/17 23:08
# @Author : Cheng Ge
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from numpy import linalg as la
import gradio as gr
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
def pad_to_length(input_data: list, pad_token, max_length: int) -> list:
    """Right-pad a copy of input_data with pad_token up to max_length."""
    assert len(input_data) <= max_length
    result = input_data[:]
    result.extend([pad_token] * (max_length - len(result)))
    return result
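# Example: pad_to_length([3, 1, 4], 0, 5) -> [3, 1, 4, 0, 0]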
def TransDict_from_list(groups):
    """Map every amino acid to the digit ('0'-'6') of the group that contains it."""
    tar_list = ['0', '1', '2', '3', '4', '5', '6']
    result = {}
    for index, group in enumerate(groups):
        for c in sorted(group):  # alphabetically sorted group members
            result[c] = tar_list[index]  # K:V map: residue -> group index digit
    return result
def get_3_protein_trids():
    """Enumerate all 3-mers over the reduced 7-letter alphabet."""
    nucle_com = []
    chars = ['0', '1', '2', '3', '4', '5', '6']
    base = len(chars)
    end = len(chars) ** 3
    for i in range(0, end):
        n = i
        ch0 = chars[n % base]
        n = n // base  # integer division keeps the index arithmetic exact
        ch1 = chars[n % base]
        n = n // base
        ch2 = chars[n % base]
        nucle_com.append(ch0 + ch1 + ch2)
    return nucle_com
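# This enumerates all 7 ** 3 = 343 three-character k-mers over the reduced
# alphabet, in the order '000', '100', '200', ..., '666'.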
def translate_sequence(seq, TranslationDict):
    '''
    Translate a string/sequence (seq) into the reduced alphabet, using the
    translation dict produced by TransDict_from_list().
    Returns the string/sequence in the new, reduced alphabet.
    Remember - in Python strings are immutable.
    '''
    # str.maketrans accepts a dict of single-character keys directly.
    return seq.translate(str.maketrans(TranslationDict))
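# Example, using the group dictionary built in prepare_feature() below:
# translate_sequence('MKV', group_dict) -> '240'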
def get_4_nucleotide_composition(tris, seq, pythoncount=True):
    """k-mer features for a reduced-alphabet sequence.

    pythoncount=True: normalized k-mer counts.
    pythoncount=False: an SVD-based profile of the per-position k-mer
    occurrence matrix, as used by prepare_feature() below.
    """
    seq_len = len(seq)
    k = len(tris[0])
    if pythoncount:
        tri_feature = [float(seq.count(val)) / seq_len for val in tris]
    else:
        # Occurrence matrix: one row per k-mer, one column per position.
        note_feature = [[0 for _ in range(seq_len - k + 1)] for _ in range(len(tris))]
        for x in range(seq_len + 1 - k):
            kmer = seq[x:x + k]
            if kmer in tris:
                ind = tris.index(kmer)
                note_feature[ind][x] += 1
        u, s, v = la.svd(note_feature)
        # Sum the left singular vectors weighted by their singular values.
        tri_feature = np.zeros(len(tris))
        for i in range(len(s)):
            tri_feature = tri_feature + u[i] * s[i] / seq_len
    return tri_feature
def BPF(seq_temp):
    """Binary profile feature: one-hot encoding of the first peptide residues.

    The sequence is mapped through src_vocab, right-padded to length 7 with
    the 0 token, and each position becomes a 24-dimensional one-hot vector
    (all zeros for padding).
    """
    src_vocab = {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7,
                 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14,
                 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20,
                 'O': 21, 'U': 22, 'Z': 23, 'X': 24}
    Seq1 = [src_vocab[c] for c in seq_temp]
    seq = pad_to_length(Seq1, 0, 7)
    fea = []
    for idx in seq:
        tem_vec = [0] * 24
        if idx > 0:
            tem_vec[idx - 1] = 1  # token 0 (padding) stays all zeros
        fea = fea + tem_vec
    return fea
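# Example: BPF('ACD') returns a 168-entry list (7 positions * 24 channels)
# whose only ones sit at flat indices 0, 25 and 50.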
def transfer_label_from_prob(proba):
    # Threshold the sigmoid outputs at 0.5 to obtain binary labels.
    label = [1 if val >= 0.5 else 0 for val in proba]
    return label
def prepare_feature(file):
    files = file.name
    protein_seq_dict = {}
    protein_index = 1
    with open(files, 'r') as fp:
        for line in fp:
            seq = line.strip()
            # Skip FASTA header lines and blank lines; every remaining
            # line is treated as one peptide sequence.
            if not seq or seq.startswith('>'):
                continue
            protein_seq_dict[protein_index] = seq
            protein_index = protein_index + 1
    groups = ['AGV', 'ILFPO', 'YMTS', 'HNQW', 'RK', 'DEZ', 'CU']
    group_dict = TransDict_from_list(groups)
    protein_tris = get_3_protein_trids()
    bpf = []
    kmer = []
    sequence = []
    for i in protein_seq_dict:
        protein_seq = translate_sequence(protein_seq_dict[i], group_dict)
        # BPF only looks at the first 7 residues; slicing handles short sequences.
        bpf_feature = BPF(protein_seq_dict[i][0:7])
        protein_tri_fea = get_4_nucleotide_composition(protein_tris, protein_seq, pythoncount=False)
        bpf.append(bpf_feature)
        kmer.append(protein_tri_fea)
        sequence.append(protein_seq_dict[i])
    return np.array(bpf), np.array(kmer), np.array(sequence)
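# Each sequence thus contributes 168 BPF values and 343 k-mer values;
# ACP_DL() below concatenates them into a 511-dimensional vector.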
class TransformerBlock(layers.Layer):
    """Post-norm Transformer encoder block: multi-head attention + feed-forward."""
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + norm
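# The block is shape-preserving: with the settings used in ACP_DL() below,
# a (batch, 18, 32) input comes back as a (batch, 18, 32) output.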
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding."""
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
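# With maxlen=18, vocab_size=25 and embed_dim=32 as used in ACP_DL() below,
# an integer tensor of shape (batch, 18) becomes a (batch, 18, 32) embedding.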
def ACP_DL(file):
    data_dim = 511      # 168 BPF values + 343 k-mer values per sequence
    timesteps = 1
    len_seq_max = 18    # the Transformer branch sees at most 18 residues
    bpf, kmer, sequence = prepare_feature(file)
    src_vocab = {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7,
                 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14,
                 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20,
                 'O': 21, 'U': 22, 'Z': 23, 'X': 24}
    Seq2 = []
    for m in sequence:
        Seq1 = [src_vocab[c] for c in m]
        Seq1 = Seq1[0:len_seq_max]                 # truncate long sequences
        seq = pad_to_length(Seq1, 0, len_seq_max)  # right-pad short ones
        Seq2.append(seq)
    Seq2 = np.array(Seq2)
    X = np.concatenate((bpf, kmer), axis=1)
    X = np.reshape(X, (len(X), timesteps, data_dim))
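    # At this point Seq2 has shape (N, 18) for the Transformer branch and
    # X has shape (N, 1, 511) for the LSTM branch.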
    test1 = np.array(X)
    test2 = np.array(Seq2)
    # Transformer branch: token/position embedding -> encoder block -> pooled MLP head.
    Transformer_input = tf.keras.Input(shape=(len_seq_max,), name="Transformer_input")
    embedding_layer = TokenAndPositionEmbedding(len_seq_max, 25, 32)
    x = embedding_layer(Transformer_input)
    transformer_block = TransformerBlock(32, 8, 32)
    x = transformer_block(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(20, activation="relu")(x)
    x = layers.Dropout(0.1)(x)
    Transformer_output = layers.Dense(256, activation="relu")(x)
    # LSTM branch over the 511-dimensional hand-crafted feature vector.
    lstm_input = tf.keras.Input(shape=(1, 511), name="lstm_input")
    x = layers.LSTM(128, return_sequences=False)(lstm_input)
    lstm_output = layers.Dense(1, activation="relu")(x)
    output = layers.concatenate([Transformer_output, lstm_output])
    outputss = layers.Dense(1, activation="sigmoid")(output)
    # Inputs are given as a list so they line up with model.predict([test2, test1]).
    model = Model(inputs=[Transformer_input, lstm_input], outputs=outputss)
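    # The merged head concatenates 256 Transformer features with the single
    # LSTM output, so the final sigmoid sees a 257-dimensional vector.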
    model.load_weights("AMP_818.h5")
    proba = model.predict([test2, test1]).ravel()
    proba0 = (1 - proba) * 100   # confidence of the Non-AMP call
    proba1 = proba * 100         # confidence of the AMP call
    labels = transfer_label_from_prob(proba)
    # Overwrite (rather than append to) the report on every request.
    with open('output.txt', 'w') as f:
        for i in range(len(labels)):
            if labels[i] == 0:
                print(sequence[i], "Non-AMP", "%.3f%%" % proba0[i], file=f)
            else:
                print(sequence[i], "AMP", "%.3f%%" % proba1[i], file=f)
    return 'output.txt'
iface = gr.Interface(fn=ACP_DL,
                     inputs=[gr.File(label="input fasta")],
                     outputs=gr.File(label="download txt"))
iface.launch()