import torch
import torch.nn as nn
from transformers import AutoModelWithLMHead, AutoTokenizer
import os
from tqdm import tqdm
import pandas as pd
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score, roc_auc_score
import math
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold, train_test_split
adj_max = 80    # not referenced in this snippet
fps_len = 167   # fingerprint bit-vector length (output width of the pretrained encoder)
max_len = 120   # maximum SMILES length stored by pretrain_dataset
device = torch.device('cpu')              # inference runs on CPU throughout this script
cuda_available = device.type == 'cuda'    # guards the .cuda() calls in chembert_encoder.forward
model_path = 'model/'
class chembert_encoder(nn.Module):
    """Wraps ChemBERTa and maps the first-token LM-head logits to a fingerprint-sized vector."""
    def __init__(self, output_dim=fps_len, dropout=0.5):
        super(chembert_encoder, self).__init__()
        # AutoModelWithLMHead is deprecated in recent transformers releases;
        # AutoModelForMaskedLM is the newer equivalent for this checkpoint.
        self.bert = AutoModelWithLMHead.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
        self.tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
        self.dropout = nn.Dropout(dropout)
        self.w = nn.Linear(767, output_dim)  # 767 matches the width of the logits slice taken in forward()

    def forward(self, x):
        # x is a list of SMILES strings; tokenize with dynamic padding to the longest item.
        input_feat = self.tokenizer.batch_encode_plus(x, max_length=512,
                                                      padding='longest',
                                                      truncation=True,
                                                      return_tensors='pt',
                                                      return_attention_mask=True,
                                                      return_token_type_ids=True)
        if cuda_available:
            input_feat['attention_mask'] = input_feat['attention_mask'].cuda()
            input_feat['input_ids'] = input_feat['input_ids'].cuda()
        # Use the LM-head logits at the first token position as the sequence representation.
        outputs = self.bert(input_feat['input_ids'],
                            attention_mask=input_feat['attention_mask'],
                            output_hidden_states=None).logits[:, 0, :]
        return self.w(self.dropout(outputs))
class pretrain_dataset(Dataset):
    """Pretraining dataset: yields (SMILES, fingerprint bit tensor, ChEMBL id)."""
    def __init__(self, dataframe, max_len=max_len):
        super(pretrain_dataset, self).__init__()
        self.len = len(dataframe)
        self.dataframe = dataframe
        self.max_len = max_len

    def __getitem__(self, idx):
        sml = self.dataframe.canonical_smiles[idx]
        chem_id = self.dataframe.chembl_id[idx]
        s = self.dataframe.fps[idx]          # fingerprint stored as a string of '0'/'1' characters
        s = list(s)
        adj = torch.tensor([int(b) for b in s])
        return sml, adj, chem_id

    def __len__(self):
        return self.len
class jak_dataset(Dataset):
    def __init__(self, dataframe):
        super(jak_dataset, self).__init__()
        self.len = len(dataframe)
        self.dataframe = dataframe

    def __getitem__(self, idx):
        sml = self.dataframe.Smiles[idx]
        y = 1 if self.dataframe.Activity[idx] == 1 else 0
        return sml, y

    def __len__(self):
        return self.len
class chembert(nn.Module):
    """Binary classifier head on top of the pretrained chembert_encoder."""
    def __init__(self, load_path='model/chem_bert_encoder_pretrain_9.pt',
                 last_layer_size=fps_len, output_size=2, dropout=0.5):
        super(chembert, self).__init__()
        self.last_layer_size = last_layer_size
        self.output_size = output_size
        self.pretrained = chembert_encoder()
        self.pretrained.load_state_dict(torch.load(load_path, map_location=device))
        self.w = nn.Linear(self.last_layer_size, self.output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w(self.dropout(self.pretrained(x)))
def chembert_predict(enzyme, smi):
    """Score a single SMILES string with the fine-tuned ChemBERT model for the given enzyme."""
    ml = 'chembert'
    known_drugs = [smi]
    file_path = 'model/' + ml + '_' + enzyme + '.pt'
    model = chembert()
    # The optimizer and class-weight table are carried over from the training script;
    # neither is used at inference time.
    optimizer = optim.AdamW(params=model.parameters(), lr=1e-5, weight_decay=1e-2)
    model.load_state_dict(torch.load(file_path, map_location=torch.device('cpu')))
    weight_dict = {1: torch.tensor([3.0, 1.0]), 2: torch.tensor([2.0, 1.0]), 3: torch.tensor([2.0, 1.0]),
                   4: torch.tensor([2.0, 1.0])}
    params = {'batch_size': 16, 'shuffle': False, 'drop_last': False, 'num_workers': 0}
    model.eval()
    known_df = pd.DataFrame(known_drugs)
    known_df.columns = ['Smiles']
    known_df['Activity'] = 0  # dummy label; jak_dataset expects an Activity column
    known_data = jak_dataset(known_df)
    known_loader = DataLoader(known_data, **params)
    with torch.no_grad():
        for idx, (X, y_true) in tqdm(enumerate(known_loader), total=len(known_loader)):
            output = model(list(X))
            a, y_pred = torch.max(output, 1)
            y_prob = torch.softmax(output, 1)[:, 1].tolist()
            # A single SMILES fits in one batch, so returning inside the loop is sufficient here.
            return y_prob, y_pred
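# --- Usage sketch (not part of the original script) ---------------------------
# A minimal, hedged example of calling chembert_predict. The enzyme name ('jak1')
# and the SMILES string (aspirin, purely illustrative) are assumptions; the call
# only succeeds if the matching checkpoints exist under model/
# ('chembert_jak1.pt' and 'chem_bert_encoder_pretrain_9.pt').
if __name__ == '__main__':
    example_smiles = 'CC(=O)Oc1ccccc1C(=O)O'  # aspirin, used only as a placeholder input
    probs, preds = chembert_predict('jak1', example_smiles)
    # probs: list of probabilities for the active class (softmax column 1)
    # preds: tensor of argmax class labels for the batch
    print(f'P(active) = {probs[0]:.3f}, predicted class = {preds[0].item()}')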