|
import torch |
|
import torch.nn as nn |
|
from transformers import AutoTokenizer,AutoModelForSequenceClassification,AutoConfig |
|
import numpy as np |
|
import pandas as pd |
|
import re |
|
from Bio.Seq import Seq |
|
from collections import OrderedDict |
|
from transformers import set_seed |
|
import random |
|
import gradio as gr |
|
|
|
def setup_seed(seed): |
|
set_seed(seed) |
|
torch.manual_seed(seed) |
|
torch.cuda.manual_seed_all(seed) |
|
np.random.seed(seed) |
|
random.seed(seed) |
|
torch.backends.cudnn.deterministic = True |
|
setup_seed(4) |
|
|
|
device = "cpu" |
|
model_checkpoint = "facebook/esm2_t6_8M_UR50D" |
|
|
|
config = AutoConfig.from_pretrained(model_checkpoint) |
|
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) |
|
|
|
|
|
def conotoxinfinder(files): |
|
fr=open(files, 'r') |
|
seqs = [] |
|
for line in fr: |
|
if not line.startswith('>'): |
|
line = line.replace('\n','') |
|
line = line.replace(' ','') |
|
if line.islower(): |
|
seqs.append(str((Seq(line).translate()))) |
|
else: |
|
seqs.append(line) |
|
|
|
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=1) |
|
model.load_state_dict(torch.load("best_model.pth", map_location=torch.device('cpu')), strict=False) |
|
model = model.to(device) |
|
|
|
value_all = [] |
|
for i in seqs: |
|
tokenizer_test = tokenizer(i, return_tensors='pt').to(device) |
|
with torch.no_grad(): |
|
value = model(**tokenizer_test) |
|
value_all.append(np.exp(value["logits"][0].item())) |
|
|
|
summary = OrderedDict() |
|
summary['Seq'] = seqs |
|
summary['Value'] = value_all |
|
summary_df = pd.DataFrame(summary) |
|
summary_df.to_csv('output.csv', index=False) |
|
return 'output.csv' |
|
|
|
with open("conotoxinfinder.md", "r") as f: |
|
description = f.read() |
|
iface = gr.Interface(fn=conotoxinfinder, |
|
title="ConotoxinFinder α7 regression", |
|
inputs=["file" |
|
], |
|
outputs= "file", |
|
description=description |
|
) |
|
iface.launch() |