# -*- coding: utf-8 -*-
# @Time : 2021/8/17 23:08
# @Author : Cheng Ge
import warnings
warnings.filterwarnings('ignore')

import gradio as gr
import numpy as np
from numpy import linalg as la
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model


def pad_to_length(input_data: list, pad_token, max_length: int) -> list:
    """Right-pad a list with pad_token up to max_length."""
    assert len(input_data) <= max_length
    result = input_data[:]
    for _ in range(max_length - len(result)):
        result.append(pad_token)
    return result


def TransDict_from_list(groups):
    """Build a translation dict mapping each amino acid to the digit
    ('0'-'6') of the group it belongs to."""
    tar_list = ['0', '1', '2', '3', '4', '5', '6']
    result = {}
    for index, group in enumerate(groups):
        for c in sorted(group):          # alphabetically sorted group members
            result[c] = tar_list[index]  # every member maps to its group's digit
    return result


def get_3_protein_trids():
    """Enumerate all 7**3 = 343 tri-grams over the reduced alphabet '0'-'6'."""
    nucle_com = []
    chars = ['0', '1', '2', '3', '4', '5', '6']
    base = len(chars)
    end = len(chars) ** 3
    for i in range(end):
        n = i
        ch0 = chars[n % base]
        n = n // base
        ch1 = chars[n % base]
        n = n // base
        ch2 = chars[n % base]
        nucle_com.append(ch0 + ch1 + ch2)
    return nucle_com


def translate_sequence(seq, TranslationDict):
    """Translate a protein string into the reduced 7-letter alphabet, using a
    translation dict provided by TransDict_from_list(). Strings are immutable
    in Python, so a new string is returned."""
    return seq.translate(str.maketrans(TranslationDict))


def get_4_nucleotide_composition(tris, seq, pythoncount=True):
    """Tri-gram composition of a reduced-alphabet sequence. With
    pythoncount=True this is plain k-mer frequency counting; otherwise a
    positional occurrence matrix is built and compressed via SVD."""
    seq_len = len(seq)
    k = len(tris[0])
    tri_feature = [0] * len(tris)
    if pythoncount:
        for ind, val in enumerate(tris):
            tri_feature[ind] = float(seq.count(val)) / seq_len
    else:
        note_feature = [[0 for cols in range(seq_len - k + 1)]
                        for rows in range(len(tris))]
        for x in range(seq_len + 1 - k):
            kmer = seq[x:x + k]
            if kmer in tris:
                ind = tris.index(kmer)
                note_feature[ind][x] += 1
        u, s, v = la.svd(note_feature)
        # accumulate the scaled singular vectors into one 343-dim feature
        for i in range(len(s)):
            tri_feature = tri_feature + u[i] * s[i] / seq_len
    return tri_feature


# Integer codes for the 20 standard plus non-canonical residues. The original
# source mapped both 'Z' and 'X' to 23 here; 'X' is set to 24 so the encoding
# agrees with the vocabulary used in ACP_DL below.
SRC_VOCAB = {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8,
             'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15,
             'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, 'O': 21, 'U': 22,
             'Z': 23, 'X': 24}


def BPF(seq_temp):
    """Binary profile feature: one-hot encode the first 7 residues (24 bits
    each, all-zero for the padding code 0), giving a 7 * 24 = 168-dim list."""
    Seq1 = [SRC_VOCAB[c] for c in seq_temp]
    seq = pad_to_length(Seq1, 0, 7)
    fea = []
    for code in seq:
        tem_vec = [0] * 24
        if code > 0:
            tem_vec[code - 1] = 1
        fea = fea + tem_vec
    return fea
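
# A minimal sketch (not part of the original pipeline) showing how the feature
# helpers above compose; the peptide is an arbitrary illustration. The expected
# dimensions follow from BPF (7 * 24 = 168) and the 7**3 = 343 tri-grams, i.e.
# the 511-dim vector that ACP_DL feeds to the LSTM branch.
def _demo_feature_dims():
    group_dict = TransDict_from_list(['AGV', 'ILFPO', 'YMTS', 'HNQW', 'RK', 'DEZ', 'CU'])
    tris = get_3_protein_trids()
    peptide = 'GLFDIVKKVV'                               # hypothetical example
    reduced = translate_sequence(peptide, group_dict)    # reduced 7-letter alphabet
    bpf_vec = BPF(peptide[:7])                           # binary profile, first 7 residues
    kmer_vec = get_4_nucleotide_composition(tris, reduced, pythoncount=False)
    return len(bpf_vec), len(kmer_vec)                   # -> (168, 343)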

def transfer_label_from_prob(proba):
    """Threshold predicted probabilities at 0.5 into 0/1 labels."""
    return [1 if val >= 0.5 else 0 for val in proba]


def prepare_feature(file):
    """Read one raw peptide sequence per line and return the BPF features,
    the tri-gram/SVD k-mer features, and the original sequences."""
    protein_seq_dict = {}
    protein_index = 1
    with open(file.name, 'r') as fp:
        for line in fp:
            seq = line.strip()
            if not seq:                  # skip blank lines
                continue
            protein_seq_dict[protein_index] = seq
            protein_index += 1

    groups = ['AGV', 'ILFPO', 'YMTS', 'HNQW', 'RK', 'DEZ', 'CU']
    group_dict = TransDict_from_list(groups)
    protein_tris = get_3_protein_trids()
    bpf = []
    kmer = []
    sequence = []
    for i in protein_seq_dict:
        protein_seq = translate_sequence(protein_seq_dict[i], group_dict)
        bpf_feature = BPF(protein_seq_dict[i][:7])   # BPF uses at most 7 residues
        protein_tri_fea = get_4_nucleotide_composition(protein_tris, protein_seq,
                                                       pythoncount=False)
        bpf.append(bpf_feature)
        kmer.append(protein_tri_fea)
        sequence.append(protein_seq_dict[i])
    return np.array(bpf), np.array(kmer), np.array(sequence)


class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"),
             layers.Dense(embed_dim), ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
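
# A quick, optional shape check (illustrative only, never called by the app):
# the embedding lifts an integer-encoded batch (batch, 18) to (batch, 18, 32),
# and the TransformerBlock preserves that shape, so GlobalAveragePooling1D in
# ACP_DL below reduces it to (batch, 32).
def _demo_transformer_shapes():
    dummy = tf.zeros((2, 18), dtype=tf.int32)   # two padded length-18 peptides
    emb = TokenAndPositionEmbedding(maxlen=18, vocab_size=25, embed_dim=32)
    out = TransformerBlock(embed_dim=32, num_heads=8, ff_dim=32)(emb(dummy))
    return out.shape                            # TensorShape([2, 18, 32])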

def ACP_DL(file):
    data_dim = 511       # 168-dim BPF + 343-dim tri-gram feature
    timesteps = 1
    len_seq_max = 18
    bpf, kmer, sequence = prepare_feature(file)

    # integer-encode each sequence (truncated/padded to 18) for the Transformer
    Seq2 = []
    for m in sequence:
        Seq1 = [SRC_VOCAB[c] for c in m]
        if len(Seq1) > len_seq_max:
            Seq1 = Seq1[:len_seq_max]
        Seq2.append(pad_to_length(Seq1, 0, len_seq_max))
    Seq2 = np.array(Seq2)

    X = np.concatenate((bpf, kmer), axis=1)
    X = np.reshape(X, (len(X), timesteps, data_dim))
    test1 = np.array(X)
    test2 = np.array(Seq2)

    # Transformer branch over the integer-encoded peptide
    Transformer_input = tf.keras.Input(shape=(len_seq_max,), name="Transformer_input")
    embedding_layer = TokenAndPositionEmbedding(len_seq_max, 25, 32)
    x = embedding_layer(Transformer_input)
    transformer_block = TransformerBlock(32, 8, 32)
    x = transformer_block(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(20, activation="relu")(x)
    x = layers.Dropout(0.1)(x)
    Transformer_output = layers.Dense(256, activation="relu")(x)

    # LSTM branch over the 511-dim handcrafted features
    lstm_input = tf.keras.Input(shape=(1, 511), name="lstm_input")
    x = layers.LSTM(128, return_sequences=False)(lstm_input)
    lstm_output = layers.Dense(1, activation="relu")(x)

    output = layers.concatenate([Transformer_output, lstm_output])
    outputss = layers.Dense(1, activation="sigmoid")(output)
    model = Model(
        inputs={'Transformer_input': Transformer_input, 'lstm_input': lstm_input},
        outputs=outputss,
    )
    model.load_weights("AMP_818.h5")

    proba = model.predict([test2, test1]).ravel()   # flatten (N, 1) to (N,)
    proba0 = (1 - proba) * 100
    proba1 = proba * 100
    labels = transfer_label_from_prob(proba)
    with open('output.txt', 'w') as f:              # fresh file per prediction run
        for i in range(len(labels)):
            if labels[i] == 0:
                print(sequence[i], "Non-AMP", "%.3f%%" % proba0[i], file=f)
            else:
                print(sequence[i], "AMP", "%.3f%%" % proba1[i], file=f)
    return 'output.txt'


iface = gr.Interface(fn=ACP_DL,
                     inputs=[gr.File(label="input fasta")],
                     outputs=gr.File(label="download txt"))

if __name__ == '__main__':
    iface.launch()
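
# Expected upload format (inferred from prepare_feature): the file is read
# line by line and every line is treated as a raw peptide sequence, e.g. a
# plain-text file containing
#
#     GLFDIVKKVVGALG
#     FLPLIGRVLSGIL
#
# (both peptides are hypothetical examples). FASTA header lines ('>...') are
# not skipped and would raise a KeyError inside BPF. A headless invocation,
# wrapping a path in an object that exposes `.name` the way gradio's File
# component does, might look like:
#
#     class _FileStub:
#         def __init__(self, name):
#             self.name = name
#
#     ACP_DL(_FileStub('peptides.txt'))   # predictions written to output.txt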