Upload app.py
app.py
ADDED
@@ -0,0 +1,281 @@
import VolumeMaker
import utils
import numpy as np
import random
import torch
import torch.nn as nn
import pandas as pd
import shutil
import subprocess
import gradio as gr
from transformers import AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
device = torch.device("cpu")
import os
join = os.path.join
from transformers import AutoTokenizer
import torch.nn.functional as F
from rdkit import Chem
from rdkit.Chem import AllChem
from collections import OrderedDict
from tqdm import tqdm
import time

model_checkpoint = "facebook/esm2_t6_8M_UR50D"
pdb_path = "structure"
# seq_path = "test3.csv"
temp_path = "temp"

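# Fix all random seeds so runs are reproducible.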
def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
setup_seed(4)

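# Hyperparameters: Morgan fingerprint radius and bit width, hidden size of
# the attention blocks, and the number of output classes.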
batch_size = 1
num_labels = 2
radius = 2
n_features = 1024
hid_dim = 300
n_heads = 1
dropout = 0

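# Tokenizer matching the ESM-2 checkpoint used as the sequence encoder.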
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

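# Dataset yielding (peptide sequence, structure point-cloud feature) pairs;
# the point clouds are computed once, up front, from the predicted PDB files.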
class MyDataset(Dataset):
    def __init__(self, dict_data) -> None:
        super(MyDataset, self).__init__()
        self.data = dict_data
        self.structure = pdb_structure(dict_data['structure'])

    def __getitem__(self, index):
        return self.data['text'][index], self.structure[index]

    def __len__(self):
        return len(self.data['text'])

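# Collate a batch: stack structure features, compute Morgan fingerprints, and
# tokenize the sequences (the +2 leaves room for the tokenizer's special tokens).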
def collate_fn(batch):
    data = [item[0] for item in batch]
    structure = torch.tensor([item[1].tolist() for item in batch]).to(device)
    max_len = max([len(b[0]) for b in batch]) + 2
    fingerprint = torch.tensor(peptides_to_fingerprint_matrix(data, radius, n_features), dtype=float).to(device)
    pt_batch = tokenizer(data, padding=True, truncation=True, max_length=max_len, return_tensors='pt')
    return {'input_ids': pt_batch['input_ids'].to(device),
            'attention_mask': pt_batch['attention_mask'].to(device)}, structure, fingerprint

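# Scaled dot-product attention over per-sample feature vectors of size hid_dim.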
class AttentionBlock(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.n_heads = n_heads

        assert hid_dim % n_heads == 0

        self.f_q = nn.Linear(hid_dim, hid_dim)
        self.f_k = nn.Linear(hid_dim, hid_dim)
        self.f_v = nn.Linear(hid_dim, hid_dim)

        self.fc = nn.Linear(hid_dim, hid_dim)

        self.do = nn.Dropout(dropout)

        # keep the scale on the same (CPU) device as the rest of the model
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim // n_heads])).to(device)

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]

        Q = self.f_q(query)
        K = self.f_k(key)
        V = self.f_v(value)

        Q = Q.view(batch_size, self.n_heads, self.hid_dim // self.n_heads).unsqueeze(3)
        K_T = K.view(batch_size, self.n_heads, self.hid_dim // self.n_heads).unsqueeze(3).transpose(2, 3)
        V = V.view(batch_size, self.n_heads, self.hid_dim // self.n_heads).unsqueeze(3)

        energy = torch.matmul(Q, K_T) / self.scale

        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = self.do(F.softmax(energy, dim=-1))

        weighter_matrix = torch.matmul(attention, V)

        weighter_matrix = weighter_matrix.permute(0, 2, 1, 3).contiguous()

        weighter_matrix = weighter_matrix.view(batch_size, self.n_heads * (self.hid_dim // self.n_heads))

        weighter_matrix = self.do(self.fc(weighter_matrix))

        return weighter_matrix

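# Fuse the three modalities: structure enriches the fingerprint via
# cross-attention, then self-attention, then attention over the sequence features.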
class CrossAttentionBlock(nn.Module):
    def __init__(self):
        super(CrossAttentionBlock, self).__init__()
        self.att = AttentionBlock(hid_dim=hid_dim, n_heads=n_heads, dropout=0.1)

    def forward(self, structure_feature, fingerprint_feature, sequence_feature):
        # cross-attention for compound information enrichment
        fingerprint_feature = fingerprint_feature + self.att(fingerprint_feature, structure_feature, structure_feature)
        # self-attention
        fingerprint_feature = self.att(fingerprint_feature, fingerprint_feature, fingerprint_feature)
        # cross-attention for interaction
        output = self.att(fingerprint_feature, sequence_feature, sequence_feature)
        return output

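# Build each peptide as an RDKit molecule and encode it as a Morgan
# fingerprint bit vector of length n_features.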
def peptides_to_fingerprint_matrix(peptides, radius=radius, n_features=n_features):
    n_peptides = len(peptides)
    features = np.zeros((n_peptides, n_features))
    for i, peptide in enumerate(peptides):
        mol = Chem.MolFromSequence(peptide)
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_features)
        fp_array = np.zeros((1,))
        AllChem.DataStructs.ConvertToNumpyArray(fp, fp_array)
        features[i, :] = fp_array
    return features

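# Classifier combining three branches: an ESM-2 sequence encoder (run under
# no_grad), a BiLSTM over the fingerprint, and a BiLSTM over the surface point
# cloud, fused by the cross-attention block and an MLP head.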
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=hid_dim)
        self.bn1 = nn.BatchNorm1d(256)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(64)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(300, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc_fingerprint = nn.Linear(1024, hid_dim)
        self.fc_structure = nn.Linear(1500, hid_dim)
        self.fingerprint_lstm = nn.LSTM(bidirectional=True, num_layers=2, input_size=1024, hidden_size=1024 // 2, batch_first=True)
        self.structure_lstm = nn.LSTM(bidirectional=True, num_layers=2, input_size=500, hidden_size=500 // 2, batch_first=True)
        self.output_layer = nn.Linear(64, num_labels)
        self.dropout = nn.Dropout(0)
        self.CAB = CrossAttentionBlock()

    def forward(self, structure, x, fingerprint):
        fingerprint = torch.unsqueeze(fingerprint, 2).float()
        structure = structure.permute(0, 2, 1)
        fingerprint = fingerprint.permute(0, 2, 1)
        with torch.no_grad():
            bert_output = self.bert(input_ids=x['input_ids'].to(device), attention_mask=x['attention_mask'].to(device))
        sequence_feature = self.dropout(bert_output["logits"])
        structure = structure.to(device)
        fingerprint_feature, _ = self.fingerprint_lstm(fingerprint)
        structure_feature, _ = self.structure_lstm(structure)
        fingerprint_feature = fingerprint_feature.flatten(start_dim=1)
        structure_feature = structure_feature.flatten(start_dim=1)
        fingerprint_feature = self.fc_fingerprint(fingerprint_feature)
        structure_feature = self.fc_structure(structure_feature)
        output_feature = self.CAB(structure_feature, fingerprint_feature, sequence_feature)
        output_feature = self.dropout(self.relu(self.bn1(self.fc1(output_feature))))
        output_feature = self.dropout(self.relu(self.bn2(self.fc2(output_feature))))
        output_feature = self.dropout(self.relu(self.bn3(self.fc3(output_feature))))
        output_feature = self.dropout(self.output_layer(output_feature))
        return torch.softmax(output_feature, dim=1)


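# Convert each predicted PDB file into a surface point cloud with VolumeMaker,
# staging the files in per-structure temp folders that are removed afterwards.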
def pdb_structure(Structure_index):
    created_folders = []
    SurfacePoitCloud_all = []
    for index in Structure_index:
        structure_folder = join(temp_path, str(index))
        os.makedirs(structure_folder, exist_ok=True)
        created_folders.append(structure_folder)
        pdb_file = join(pdb_path, f"{index}.pdb")
        if os.path.exists(pdb_file):
            shutil.copy2(pdb_file, structure_folder)
        else:
            print(f"PDB file not found for structure {index}")
        coords, atname, pdbname, pdb_num = utils.parsePDB(structure_folder)
        atoms_channel = utils.atomlistToChannels(atname)
        radius = utils.atomlistToRadius(atname)
        PointCloudSurfaceObject = VolumeMaker.PointCloudSurface(device=device)
        coords = coords.to(device)
        radius = radius.to(device)
        atoms_channel = atoms_channel.to(device)
        SurfacePoitCloud = PointCloudSurfaceObject(coords, radius)
        feature = SurfacePoitCloud.view(pdb_num, -1, 3).cpu()
        SurfacePoitCloud_all.append(feature)
    SurfacePoitCloud_all_tensor = torch.squeeze(torch.stack(SurfacePoitCloud_all), dim=1)
    for folder in created_folders:
        shutil.rmtree(folder)
    return SurfacePoitCloud_all_tensor

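# End-to-end pipeline for the Gradio app: fold the input sequence with the
# ESM Atlas API, featurize it, run the classifier, and write output.csv.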
def ACE(file):
    if not os.path.exists(pdb_path):
        os.makedirs(pdb_path)
    else:
        shutil.rmtree(pdb_path)
        os.makedirs(pdb_path)
    # df = pd.read_csv(seq_path)
    # test_sequences = df["Seq"].tolist()
    # test_Structure_index = df["Structure_index"].tolist()

    test_sequences = [file]
    test_Structure_index = [f"structure_{i}" for i in range(len(test_sequences))]

    test_dict = {"text": test_sequences, 'structure': test_Structure_index}
    print("=================================Structure prediction========================")
    for i in tqdm(range(0, len(test_sequences))):
        while True:
            command = ["curl", "-X", "POST", "-k", "--data", f"{test_sequences[i]}", "https://api.esmatlas.com/foldSequence/v1/pdb/"]
            result = subprocess.run(command, capture_output=True, text=True)
            with open(os.path.join(pdb_path, f'{test_Structure_index[i]}.pdb'), 'w') as f:
                f.write(result.stdout)
            stats = os.stat(os.path.join(pdb_path, f'{test_Structure_index[i]}.pdb'))
            if stats.st_size < 1024:
                print(f"Download for {test_Structure_index[i]} failed due to empty file. Retrying...")
                time.sleep(20)
                continue
            else:
                break
    test_data = MyDataset(test_dict)
    test_dataloader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

    # Load the trained model
    model = MyModel()
    model.load_state_dict(torch.load("best_model.pth", map_location=torch.device('cpu')), strict=False)
    model = model.to(device)

    # Predict
    model.eval()
    with torch.no_grad():
        probability_all = []
        Target_all = []
        print("=================================Start prediction========================")
        for index, (batch, structure_fea, fingerprint) in enumerate(test_dataloader):
            batchs = {k: v for k, v in batch.items()}
            outputs = model(structure_fea, batchs, fingerprint)
            probability = outputs[0].tolist()
            train_argmax = np.argmax(outputs.cpu().detach().numpy(), axis=1)
            for j in range(0, len(train_argmax)):
                output = train_argmax[j]
                if output == 0:
                    Target = "low"
                    probability = probability[0]
                elif output == 1:
                    Target = "high"
                    probability = probability[1]
                print(Target, probability)
                probability_all.append(probability)
                Target_all.append(Target)
    summary = OrderedDict()
    summary['Seq'] = test_sequences
    summary['Target'] = Target_all
    summary['Probability'] = probability_all
    summary_df = pd.DataFrame(summary)
    summary_df.to_csv('output.csv', index=False)
    if len(test_sequences) > 1:
        out_text = "Please download csv"
        out_prob = "Please download csv"
    else:
        out_text = Target
        out_prob = probability
    return 'output.csv', out_text, out_prob

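# Gradio UI: one text box in; the CSV file, predicted class, and probability out.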
iface = gr.Interface(fn=ACE,
                     inputs="text",
                     outputs=["file", "text", "text"])
iface.launch()