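"""DeepACE Gradio app.

Scores peptide/protein sequences as "ACE" or "non-ACE" using an ESM-2 backbone
followed by a small MLP classification head, and writes the per-sequence
predictions to output.csv.
"""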
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import warnings
from collections import OrderedDict

import gradio as gr
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, set_seed

warnings.filterwarnings('ignore')
set_seed(4)
device = "cpu"
model_checkpoint = "esm2_t30_150M_UR50D"


class MyModel(nn.Module):
    """ESM-2 backbone with a small MLP head for binary ACE / non-ACE classification."""

    def __init__(self):
        super().__init__()
        # The ESM-2 sequence-classification head is repurposed to emit a
        # 320-dimensional feature vector that feeds the MLP below.
        self.bert = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=320)
        self.bn1 = nn.BatchNorm1d(256)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(64)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(320, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.output_layer = nn.Linear(64, 2)
        self.dropout = nn.Dropout(0)

    def forward(self, x):
        # The backbone is run without gradient tracking (used as a feature extractor).
        with torch.no_grad():
            bert_output = self.bert(input_ids=x['input_ids'].to(device),
                                    attention_mask=x['attention_mask'].to(device))
        output_feature = self.dropout(bert_output["logits"])
        output_feature = self.relu(self.bn1(self.fc1(output_feature)))
        output_feature = self.relu(self.bn2(self.fc2(output_feature)))
        output_feature = self.relu(self.bn3(self.fc3(output_feature)))
        output_feature = self.output_layer(output_feature)
        return torch.softmax(output_feature, dim=1)


def Kmers_funct(seq, num):
    """Split each sequence in `seq` into overlapping k-mers of length `num`.

    Only the k-mer list of the last sequence is returned, so the function is
    intended to be called with a single-element list.
    """
    for i in range(len(seq)):
        a = seq[i]
        l = []
        for index in range(len(a)):
            t = a[index:index + num]
            if len(t) == num:
                l.append(t)
    return l
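
# Illustrative usage (example input, not part of the original script):
# Kmers_funct(["PEPTIDE"], 3) -> ['PEP', 'EPT', 'PTI', 'TID', 'IDE']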


def ACE(file):
    test_seq = file
    all = []
    seq_len = len(test_seq)
    if seq_len > 30:
        # Long inputs are split into overlapping k-mers (k = 2..10) and each
        # fragment is scored separately.
        for j in range(2, 11):
            X = Kmers_funct([test_seq], j)
            all.extend(X)
    else:
        all.append(test_seq)

    # Load the trained classifier and the matching tokenizer.
    model = MyModel()
    model.load_state_dict(torch.load("best_model.pth", map_location=torch.device('cpu')), strict=False)
    model = model.to(device)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    max_len = 30
    seq_all = []
    output_all = []
    probability_all = []
    for seq in tqdm(all):
        test_data = tokenizer(seq, max_length=max_len, padding="max_length",
                              truncation=True, return_tensors='pt')
        out_probability = []
        with torch.no_grad():
            predict = model(test_data)
            out_probability.extend(np.max(np.array(predict.cpu()), axis=1).tolist())
            test_argmax = np.argmax(predict.cpu(), axis=1).tolist()
        id2str = {0: "non-ACE", 1: "ACE"}
        output = id2str[test_argmax[0]]
        probability = out_probability[0]
        seq_all.append(seq)
        output_all.append(output)
        probability_all.append(probability)

    # Write one row per scored sequence/fragment.
    summary = OrderedDict()
    summary['Seq'] = seq_all
    summary['Class'] = output_all
    summary['Probability'] = probability_all
    summary_df = pd.DataFrame(summary)
    summary_df.to_csv('output.csv', index=False)

    # For long inputs the per-fragment results are reported only in the CSV.
    if seq_len > 30:
        out_text = "None"
        out_prob = "None"
    else:
        out_text = output
        out_prob = probability
    return 'output.csv', out_text, out_prob
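
# The function returns (csv_path, class_label, probability) for short inputs,
# e.g. ('output.csv', 'ACE', 0.93) with illustrative values only; for inputs
# longer than 30 residues it returns ('output.csv', 'None', 'None') and the
# per-fragment results are found in the CSV.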
with open("ACE.md", "r") as f:
description = f.read()
iface = gr.Interface(fn=ACE,
title="🏹DeepACE",
inputs=gr.Textbox(show_label=False, placeholder="Enter peptide or protein", lines=4),
outputs= ["file",gr.Textbox(show_label=False, placeholder="class", lines=1),gr.Textbox(show_label=False, placeholder="probability", lines=1)],
description=description)
iface.launch() |