osbm committed
Commit 1a3cfaf · 1 Parent(s): 0b7b562
Files changed (6):
  1. models.py +392 -0
  2. new_dataloader.py +349 -0
  3. requirements.txt +8 -0
  4. trainer.py +892 -0
  5. training_data.py +50 -0
  6. utils.py +462 -0
models.py ADDED
@@ -0,0 +1,392 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from layers import TransformerEncoder, TransformerDecoder
5
+
6
+ class Generator(nn.Module):
7
+ """Generator network."""
8
+ def __init__(self, z_dim, act, vertexes, edges, nodes, dropout, dim, depth, heads, mlp_ratio, submodel):
9
+ super(Generator, self).__init__()
10
+
11
+ self.submodel = submodel
12
+ self.vertexes = vertexes
13
+ self.edges = edges
14
+ self.nodes = nodes
15
+ self.depth = depth
16
+ self.dim = dim
17
+ self.heads = heads
18
+ self.mlp_ratio = mlp_ratio
19
+
20
+ self.dropout = dropout
21
+ self.z_dim = z_dim
22
+
23
+ if act == "relu":
24
+ act = nn.ReLU()
25
+ elif act == "leaky":
26
+ act = nn.LeakyReLU()
27
+ elif act == "sigmoid":
28
+ act = nn.Sigmoid()
29
+ elif act == "tanh":
30
+ act = nn.Tanh()
31
+ self.features = vertexes * vertexes * edges + vertexes * nodes
32
+ self.transformer_dim = vertexes * vertexes * dim + vertexes * dim
33
+ self.pos_enc_dim = 5
34
+ #self.pos_enc = nn.Linear(self.pos_enc_dim, self.dim)
35
+
36
+ self.node_layers = nn.Sequential(nn.Linear(nodes, 64), act, nn.Linear(64,dim), act, nn.Dropout(self.dropout))
37
+ self.edge_layers = nn.Sequential(nn.Linear(edges, 64), act, nn.Linear(64,dim), act, nn.Dropout(self.dropout))
38
+
39
+ self.TransformerEncoder = TransformerEncoder(dim=self.dim, depth=self.depth, heads=self.heads, act = act,
40
+ mlp_ratio=self.mlp_ratio, drop_rate=self.dropout)
41
+
42
+ self.readout_e = nn.Linear(self.dim, edges)
43
+ self.readout_n = nn.Linear(self.dim, nodes)
44
+ self.softmax = nn.Softmax(dim = -1)
45
+
46
+ def _generate_square_subsequent_mask(self, sz):
47
+ mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
48
+ mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
49
+ return mask
50
+
51
+ def laplacian_positional_enc(self, adj):
52
+
53
+ A = adj
54
+ D = torch.diag(torch.count_nonzero(A, dim=-1))
55
+ L = torch.eye(A.shape[0], device=A.device) - D * A * D
56
+
57
+ EigVal, EigVec = torch.linalg.eig(L)
58
+
59
+ idx = torch.argsort(torch.real(EigVal))
60
+ EigVal, EigVec = EigVal[idx], torch.real(EigVec[:,idx])
61
+ pos_enc = EigVec[:,1:self.pos_enc_dim + 1]
62
+
63
+ return pos_enc
64
+
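For reference, a self-contained sketch of Laplacian positional encoding using the standard symmetric-normalized Laplacian L = I - D^(-1/2) A D^(-1/2). The toy 4-node cycle graph and the choice of two encoding dimensions below are illustrative only and are not taken from this commit; note that the method above is currently unused, since its call sites in forward() are commented out.

import torch

A = torch.tensor([[0., 1., 0., 1.],
                  [1., 0., 1., 0.],
                  [0., 1., 0., 1.],
                  [1., 0., 1., 0.]])              # toy 4-node cycle graph
deg = A.sum(dim=-1)
D_inv_sqrt = torch.diag(deg.clamp(min=1).rsqrt())
L = torch.eye(A.shape[0]) - D_inv_sqrt @ A @ D_inv_sqrt

eigval, eigvec = torch.linalg.eigh(L)             # symmetric matrix, so the spectrum is real
pos_enc = eigvec[:, 1:3]                          # skip the trivial eigenvector, keep 2 dimensions
print(pos_enc.shape)                              # torch.Size([4, 2])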
65
+ def forward(self, z_e, z_n):
66
+ b, n, c = z_n.shape
67
+ _, _, _ , d = z_e.shape
68
+ #random_mask_e = torch.randint(low=0,high=2,size=(b,n,n,d)).to(z_e.device).float()
69
+ #random_mask_n = torch.randint(low=0,high=2,size=(b,n,c)).to(z_n.device).float()
70
+ #z_e = F.relu(z_e - random_mask_e)
71
+ #z_n = F.relu(z_n - random_mask_n)
72
+
73
+ #mask = self._generate_square_subsequent_mask(self.vertexes).to(z_e.device)
74
+
75
+ node = self.node_layers(z_n)
76
+
77
+ edge = self.edge_layers(z_e)
78
+
79
+ edge = (edge + edge.permute(0,2,1,3))/2
80
+
81
+ #lap = [self.laplacian_positional_enc(torch.max(x,-1)[1]) for x in edge]
82
+
83
+ #lap = torch.stack(lap).to(node.device)
84
+
85
+ #pos_enc = self.pos_enc(lap)
86
+
87
+ #node = node + pos_enc
88
+
89
+ node, edge = self.TransformerEncoder(node,edge)
90
+
91
+ node_sample = self.softmax(self.readout_n(node))
92
+
93
+ edge_sample = self.softmax(self.readout_e(edge))
94
+
95
+ return node, edge, node_sample, edge_sample
96
+
97
+
98
+
99
+ class Generator2(nn.Module):
100
+ def __init__(self, dim, dec_dim, depth, heads, mlp_ratio, drop_rate, drugs_m_dim, drugs_b_dim, submodel):
101
+ super().__init__()
102
+ self.submodel = submodel
103
+ self.depth = depth
104
+ self.dim = dim
105
+ self.mlp_ratio = mlp_ratio
106
+ self.heads = heads
107
+ self.dropout_rate = drop_rate
108
+ self.drugs_m_dim = drugs_m_dim
109
+ self.drugs_b_dim = drugs_b_dim
110
+
111
+ self.pos_enc_dim = 5
112
+
113
+
114
+ if self.submodel == "Prot":
115
+ self.prot_n = torch.nn.Linear(3822, 45) ## exact dimension of protein features
116
+ self.prot_e = torch.nn.Linear(298116, 2025) ## exact dimension of protein features
117
+
118
+ self.protn_dim = torch.nn.Linear(1, dec_dim)
119
+ self.prote_dim = torch.nn.Linear(1, dec_dim)
120
+
121
+
122
+ self.mol_nodes = nn.Linear(dim, dec_dim)
123
+ self.mol_edges = nn.Linear(dim, dec_dim)
124
+
125
+ self.drug_nodes = nn.Linear(self.drugs_m_dim, dec_dim)
126
+ self.drug_edges = nn.Linear(self.drugs_b_dim, dec_dim)
127
+
128
+ self.TransformerDecoder = TransformerDecoder(dec_dim, depth, heads, mlp_ratio, drop_rate=self.dropout_rate)
129
+
130
+ self.nodes_output_layer = nn.Linear(dec_dim, self.drugs_m_dim)
131
+ self.edges_output_layer = nn.Linear(dec_dim, self.drugs_b_dim)
132
+ self.softmax = nn.Softmax(dim=-1)
133
+
134
+ def laplacian_positional_enc(self, adj):
135
+
136
+ A = adj
137
+ D = torch.diag(torch.count_nonzero(A, dim=-1))
138
+ L = torch.eye(A.shape[0], device=A.device) - D * A * D
139
+
140
+ EigVal, EigVec = torch.linalg.eig(L)
141
+
142
+ idx = torch.argsort(torch.real(EigVal))
143
+ EigVal, EigVec = EigVal[idx], torch.real(EigVec[:,idx])
144
+ pos_enc = EigVec[:,1:self.pos_enc_dim + 1]
145
+
146
+ return pos_enc
147
+
148
+ def _generate_square_subsequent_mask(self, sz):
149
+ mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
150
+ mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
151
+ return mask
152
+
153
+ def forward(self, edges_logits, nodes_logits ,akt1_adj,akt1_annot):
154
+
155
+ edges_logits = self.mol_edges(edges_logits)
156
+ nodes_logits = self.mol_nodes(nodes_logits)
157
+
158
+ if self.submodel != "Prot":
159
+ akt1_annot = self.drug_nodes(akt1_annot)
160
+ akt1_adj = self.drug_edges(akt1_adj)
161
+
162
+ else:
163
+ akt1_adj = self.prote_dim(self.prot_e(akt1_adj).view(1,45,45,1))
164
+ akt1_annot = self.protn_dim(self.prot_n(akt1_annot).view(1,45,1))
165
+
166
+
167
+ #lap = [self.laplacian_positional_enc(torch.max(x,-1)[1]) for x in drug_e]
168
+ #lap = torch.stack(lap).to(drug_e.device)
169
+ #pos_enc = self.pos_enc(lap)
170
+ #drug_n = drug_n + pos_enc
171
+
172
+ nodes_logits,akt1_annot, edges_logits, akt1_adj = self.TransformerDecoder(nodes_logits,akt1_annot,edges_logits,akt1_adj)
173
+
174
+ edges_logits = self.edges_output_layer(edges_logits)
175
+ nodes_logits = self.nodes_output_layer(nodes_logits)
176
+
177
+ edges_logits = self.softmax(edges_logits)
178
+ nodes_logits = self.softmax(nodes_logits)
179
+
180
+ return edges_logits, nodes_logits
181
+
182
+
183
+ class simple_disc(nn.Module):
184
+ def __init__(self, act, m_dim, vertexes, b_dim):
185
+ super().__init__()
186
+ if act == "relu":
187
+ act = nn.ReLU()
188
+ elif act == "leaky":
189
+ act = nn.LeakyReLU()
190
+ elif act == "sigmoid":
191
+ act = nn.Sigmoid()
192
+ elif act == "tanh":
193
+ act = nn.Tanh()
194
+ features = vertexes * m_dim + vertexes * vertexes * b_dim
195
+
196
+ self.predictor = nn.Sequential(nn.Linear(features,256), act, nn.Linear(256,128), act, nn.Linear(128,64), act,
197
+ nn.Linear(64,32), act, nn.Linear(32,16), act,
198
+ nn.Linear(16,1))
199
+
200
+ def forward(self, x):
201
+
202
+ prediction = self.predictor(x)
203
+
204
+ #prediction = F.softmax(prediction,dim=-1)
205
+
206
+ return prediction
207
+
208
+ """class Discriminator(nn.Module):
209
+
210
+ def __init__(self,deg,agg,sca,pna_in_ch,pna_out_ch,edge_dim,towers,pre_lay,post_lay,pna_layer_num, graph_add):
211
+ super(Discriminator, self).__init__()
212
+ self.degree = deg
213
+ self.aggregators = agg
214
+ self.scalers = sca
215
+ self.pna_in_channels = pna_in_ch
216
+ self.pna_out_channels = pna_out_ch
217
+ self.edge_dimension = edge_dim
218
+ self.towers = towers
219
+ self.pre_layers_num = pre_lay
220
+ self.post_layers_num = post_lay
221
+ self.pna_layer_num = pna_layer_num
222
+ self.graph_add = graph_add
223
+ self.PNA_layer = PNA(deg=self.degree, agg =self.aggregators,sca = self.scalers,
224
+ pna_in_ch= self.pna_in_channels, pna_out_ch = self.pna_out_channels, edge_dim = self.edge_dimension,
225
+ towers = self.towers, pre_lay = self.pre_layers_num, post_lay = self.post_layers_num,
226
+ pna_layer_num = self.pna_layer_num, graph_add = self.graph_add)
227
+
228
+ def forward(self, x, edge_index, edge_attr, batch, activation=None):
229
+
230
+ h = self.PNA_layer(x, edge_index, edge_attr, batch)
231
+
232
+ h = activation(h) if activation is not None else h
233
+
234
+ return h"""
235
+
236
+ """class Discriminator2(nn.Module):
237
+
238
+ def __init__(self,deg,agg,sca,pna_in_ch,pna_out_ch,edge_dim,towers,pre_lay,post_lay,pna_layer_num, graph_add):
239
+ super(Discriminator2, self).__init__()
240
+ self.degree = deg
241
+ self.aggregators = agg
242
+ self.scalers = sca
243
+ self.pna_in_channels = pna_in_ch
244
+ self.pna_out_channels = pna_out_ch
245
+ self.edge_dimension = edge_dim
246
+ self.towers = towers
247
+ self.pre_layers_num = pre_lay
248
+ self.post_layers_num = post_lay
249
+ self.pna_layer_num = pna_layer_num
250
+ self.graph_add = graph_add
251
+ self.PNA_layer = PNA(deg=self.degree, agg =self.aggregators,sca = self.scalers,
252
+ pna_in_ch= self.pna_in_channels, pna_out_ch = self.pna_out_channels, edge_dim = self.edge_dimension,
253
+ towers = self.towers, pre_lay = self.pre_layers_num, post_lay = self.post_layers_num,
254
+ pna_layer_num = self.pna_layer_num, graph_add = self.graph_add)
255
+
256
+ def forward(self, x, edge_index, edge_attr, batch, activation=None):
257
+
258
+ h = self.PNA_layer(x, edge_index, edge_attr, batch)
259
+
260
+ h = activation(h) if activation is not None else h
261
+
262
+ return h"""
263
+
264
+
265
+ """class Discriminator_old(nn.Module):
266
+
267
+ def __init__(self, conv_dim, m_dim, b_dim, dropout, gcn_depth):
268
+ super(Discriminator_old, self).__init__()
269
+
270
+ graph_conv_dim, aux_dim, linear_dim = conv_dim
271
+
272
+ # discriminator
273
+ self.gcn_layer = GraphConvolution(m_dim, graph_conv_dim, b_dim, dropout,gcn_depth)
274
+ self.agg_layer = GraphAggregation(graph_conv_dim[-1], aux_dim, m_dim, dropout)
275
+
276
+ # multi dense layer
277
+ layers = []
278
+ for c0, c1 in zip([aux_dim]+linear_dim[:-1], linear_dim):
279
+ layers.append(nn.Linear(c0,c1))
280
+ layers.append(nn.Dropout(dropout))
281
+ self.linear_layer = nn.Sequential(*layers)
282
+
283
+ self.output_layer = nn.Linear(linear_dim[-1], 1)
284
+
285
+ def forward(self, adj, hidden, node, activation=None):
286
+
287
+ adj = adj[:,:,:,1:].permute(0,3,1,2)
288
+
289
+ annotations = torch.cat((hidden, node), -1) if hidden is not None else node
290
+
291
+ h = self.gcn_layer(annotations, adj)
292
+ annotations = torch.cat((h, hidden, node) if hidden is not None\
293
+ else (h, node), -1)
294
+
295
+ h = self.agg_layer(annotations, torch.tanh)
296
+ h = self.linear_layer(h)
297
+
298
+ # Need to implement batch discriminator #
299
+ #########################################
300
+
301
+ output = self.output_layer(h)
302
+ output = activation(output) if activation is not None else output
303
+
304
+ return output, h"""
305
+
306
+ """class Discriminator_old2(nn.Module):
307
+
308
+ def __init__(self, conv_dim, m_dim, b_dim, dropout, gcn_depth):
309
+ super(Discriminator_old2, self).__init__()
310
+
311
+ graph_conv_dim, aux_dim, linear_dim = conv_dim
312
+
313
+ # discriminator
314
+ self.gcn_layer = GraphConvolution(m_dim, graph_conv_dim, b_dim, dropout, gcn_depth)
315
+ self.agg_layer = GraphAggregation(graph_conv_dim[-1], aux_dim, m_dim, dropout)
316
+
317
+ # multi dense layer
318
+ layers = []
319
+ for c0, c1 in zip([aux_dim]+linear_dim[:-1], linear_dim):
320
+ layers.append(nn.Linear(c0,c1))
321
+ layers.append(nn.Dropout(dropout))
322
+ self.linear_layer = nn.Sequential(*layers)
323
+
324
+ self.output_layer = nn.Linear(linear_dim[-1], 1)
325
+
326
+ def forward(self, adj, hidden, node, activation=None):
327
+
328
+ adj = adj[:,:,:,1:].permute(0,3,1,2)
329
+
330
+ annotations = torch.cat((hidden, node), -1) if hidden is not None else node
331
+
332
+ h = self.gcn_layer(annotations, adj)
333
+ annotations = torch.cat((h, hidden, node) if hidden is not None\
334
+ else (h, node), -1)
335
+
336
+ h = self.agg_layer(annotations, torch.tanh)
337
+ h = self.linear_layer(h)
338
+
339
+ # Need to implement batch discriminator #
340
+ #########################################
341
+
342
+ output = self.output_layer(h)
343
+ output = activation(output) if activation is not None else output
344
+
345
+ return output, h"""
346
+
347
+ """class Discriminator3(nn.Module):
348
+
349
+ def __init__(self,in_ch):
350
+ super(Discriminator3, self).__init__()
351
+ self.dim = in_ch
352
+
353
+
354
+ self.TraConv_layer = TransformerConv(in_channels = self.dim,out_channels = self.dim//4,edge_dim = self.dim)
355
+ self.mlp = torch.nn.Sequential(torch.nn.Tanh(), torch.nn.Linear(self.dim//4,1))
356
+ def forward(self, x, edge_index, edge_attr, batch, activation=None):
357
+
358
+ h = self.TraConv_layer(x, edge_index, edge_attr)
359
+ h = global_add_pool(h,batch)
360
+ h = self.mlp(h)
361
+ h = activation(h) if activation is not None else h
362
+
363
+ return h"""
364
+
365
+
366
+ """class PNA_Net(nn.Module):
367
+ def __init__(self,deg):
368
+ super().__init__()
369
+
370
+
371
+
372
+ self.convs = nn.ModuleList()
373
+
374
+ self.lin = nn.Linear(5, 128)
375
+ for _ in range(1):
376
+ conv = DenseGCNConv(128, 128, improved=False, bias=True)
377
+ self.convs.append(conv)
378
+
379
+ self.agg_layer = GraphAggregation(128, 128, 0, dropout=0.1)
380
+ self.mlp = nn.Sequential(nn.Linear(128, 64), nn.Tanh(), nn.Linear(64, 32), nn.Tanh(),
381
+ nn.Linear(32, 1))
382
+
383
+ def forward(self, x, adj,mask=None):
384
+ x = self.lin(x)
385
+
386
+ for conv in self.convs:
387
+ x = F.relu(conv(x, adj,mask=None))
388
+
389
+ x = self.agg_layer(x,torch.tanh)
390
+
391
+ return self.mlp(x) """
392
+
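A minimal, self-contained sketch of the input convention behind simple_disc: the discriminator consumes one flattened vector per graph whose size matches features = vertexes * m_dim + vertexes * vertexes * b_dim, i.e. the node matrix and edge tensor concatenated after flattening. Only torch is required; the layer widths and tensor sizes below are illustrative, not the values used in training.

import torch
import torch.nn as nn

vertexes, m_dim, b_dim = 9, 5, 5                  # illustrative sizes
features = vertexes * m_dim + vertexes * vertexes * b_dim

# Same flattened-input convention as simple_disc.forward, with a shorter MLP.
disc = nn.Sequential(nn.Linear(features, 256), nn.ReLU(),
                     nn.Linear(256, 64), nn.ReLU(),
                     nn.Linear(64, 1))

nodes = torch.rand(2, vertexes, m_dim)            # batch of 2 dummy node matrices
edges = torch.rand(2, vertexes, vertexes, b_dim)  # batch of 2 dummy edge tensors
x = torch.cat((nodes.reshape(2, -1), edges.reshape(2, -1)), dim=-1)
print(disc(x).shape)                              # torch.Size([2, 1]): one critic score per graph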
new_dataloader.py ADDED
@@ -0,0 +1,349 @@
1
+ import pickle
2
+ import os.path as osp
3
+ import re
4
+
5
+ import torch
6
+ import numpy as np
+ import pandas as pd
7
+ from tqdm import tqdm
8
+ from rdkit import Chem
9
+ from rdkit import RDLogger
10
+ from torch_geometric.data import (Data, InMemoryDataset)
11
+
12
+ RDLogger.DisableLog('rdApp.*')
13
+ class DruggenDataset(InMemoryDataset):
14
+
15
+ def __init__(self, root, dataset_file, raw_files, max_atom, features, transform=None, pre_transform=None, pre_filter=None):
16
+ self.dataset_name = dataset_file.split(".")[0]
17
+ self.dataset_file = dataset_file
18
+ self.raw_files = raw_files
19
+ self.max_atom = max_atom
20
+ self.features = features
21
+
22
+ super().__init__(root, transform, pre_transform, pre_filter)
23
+ self.data, self.slices = torch.load(osp.join(root, dataset_file))
24
+
25
+
26
+ @property
27
+ def raw_file_names(self):
28
+ return self.raw_files
29
+
30
+ @property
31
+ def processed_file_names(self):
32
+ '''
33
+ Return the processed file names. If these files are not present, they are created automatically by the process() function of this class.
34
+ '''
35
+ return self.dataset_file
36
+
37
+ def _generate_encoders_decoders(self, data):
38
+ """
39
+ Generates the encoders and decoders for the atoms and bonds.
40
+ """
41
+ self.data = data
42
+ print('Creating atoms encoder and decoder..')
43
+
44
+ atom_labels = set()
45
+ # bond_labels = set()
46
+ self.max_atom_size_in_data = 0
47
+
48
+ for smile in data:
49
+ mol = Chem.MolFromSmiles(smile)
50
+ atom_labels.update([atom.GetAtomicNum() for atom in mol.GetAtoms()])
51
+ # bond_labels.update([bond.GetBondType() for bond in mol.GetBonds()])
52
+ self.max_atom_size_in_data = max(self.max_atom_size_in_data, mol.GetNumAtoms())
53
+ atom_labels.update([0]) # add PAD symbol (for unknown atoms)
54
+ atom_labels = sorted(atom_labels) # turn set into list and sort it
55
+
56
+ # atom_labels = sorted(set([atom.GetAtomicNum() for mol in self.data for atom in mol.GetAtoms()] + [0]))
57
+ self.atom_encoder_m = {l: i for i, l in enumerate(atom_labels)}
58
+ self.atom_decoder_m = {i: l for i, l in enumerate(atom_labels)}
59
+ self.atom_num_types = len(atom_labels)
60
+ print(f'Created atoms encoder and decoder with {self.atom_num_types - 1} atom types and 1 PAD symbol!')
61
+ print("atom_labels", atom_labels)
62
+ print('Creating bonds encoder and decoder..')
63
+ # bond_labels = [Chem.rdchem.BondType.ZERO] + list(sorted(set(bond.GetBondType()
64
+ # for mol in self.data
65
+ # for bond in mol.GetBonds())))
66
+ bond_labels = [
67
+ Chem.rdchem.BondType.ZERO,
68
+ Chem.rdchem.BondType.SINGLE,
69
+ Chem.rdchem.BondType.DOUBLE,
70
+ Chem.rdchem.BondType.TRIPLE,
71
+ Chem.rdchem.BondType.AROMATIC,
72
+ ]
73
+
74
+ print("bond labels", bond_labels)
75
+ self.bond_encoder_m = {l: i for i, l in enumerate(bond_labels)}
76
+ self.bond_decoder_m = {i: l for i, l in enumerate(bond_labels)}
77
+ self.bond_num_types = len(bond_labels)
78
+ print(f'Created bonds encoder and decoder with {self.bond_num_types - 1} bond types and 1 PAD symbol!')
79
+ #dataset_names = str(self.dataset_name)
80
+ with open("DrugGEN/data/encoders/" +"atom_" + self.dataset_name + ".pkl","wb") as atom_encoders:
81
+ pickle.dump(self.atom_encoder_m,atom_encoders)
82
+
83
+
84
+ with open("DrugGEN/data/decoders/" +"atom_" + self.dataset_name + ".pkl","wb") as atom_decoders:
85
+ pickle.dump(self.atom_decoder_m,atom_decoders)
86
+
87
+
88
+ with open("DrugGEN/data/encoders/" +"bond_" + self.dataset_name + ".pkl","wb") as bond_encoders:
89
+ pickle.dump(self.bond_encoder_m,bond_encoders)
90
+
91
+
92
+ with open("DrugGEN/data/decoders/" +"bond_" + self.dataset_name + ".pkl","wb") as bond_decoders:
93
+ pickle.dump(self.bond_decoder_m,bond_decoders)
94
+
95
+
96
+
97
+ def generate_adjacency_matrix(self, mol, connected=True, max_length=None):
98
+ """
99
+ Generates the adjacency matrix for a molecule.
100
+
101
+ Args:
102
+ mol (Molecule): The molecule object.
103
+ connected (bool): Whether to check for connectivity in the molecule. Defaults to True.
104
+ max_length (int): The maximum length of the adjacency matrix. Defaults to the number of atoms in the molecule.
105
+
106
+ Returns:
107
+ numpy.ndarray or None: The adjacency matrix if connected and all atoms have a degree greater than 0,
108
+ otherwise None.
109
+ """
110
+ max_length = max_length if max_length is not None else mol.GetNumAtoms()
111
+
112
+ A = np.zeros(shape=(max_length, max_length))
113
+
114
+ begin, end = [b.GetBeginAtomIdx() for b in mol.GetBonds()], [b.GetEndAtomIdx() for b in mol.GetBonds()]
115
+ bond_type = [self.bond_encoder_m[b.GetBondType()] for b in mol.GetBonds()]
116
+
117
+ A[begin, end] = bond_type
118
+ A[end, begin] = bond_type
119
+
120
+ degree = np.sum(A[:mol.GetNumAtoms(), :mol.GetNumAtoms()], axis=-1)
121
+
122
+ return A if connected and (degree > 0).all() else None
123
+
124
+ def generate_node_features(self, mol, max_length=None):
125
+ """
126
+ Generates the node features for a molecule.
127
+
128
+ Args:
129
+ mol (Molecule): The molecule object.
130
+ max_length (int): The maximum length of the node features. Defaults to the number of atoms in the molecule.
131
+
132
+ Returns:
133
+ numpy.ndarray: The node features matrix.
134
+ """
135
+ max_length = max_length if max_length is not None else mol.GetNumAtoms()
136
+
137
+ return np.array([self.atom_encoder_m[atom.GetAtomicNum()] for atom in mol.GetAtoms()] + [0] * (
138
+ max_length - mol.GetNumAtoms()))
139
+
140
+ def generate_additional_features(self, mol, max_length=None):
141
+ """
142
+ Generates additional features for a molecule.
143
+
144
+ Args:
145
+ mol (Molecule): The molecule object.
146
+ max_length (int): The maximum length of the additional features. Defaults to the number of atoms in the molecule.
147
+
148
+ Returns:
149
+ numpy.ndarray: The additional features matrix.
150
+ """
151
+ max_length = max_length if max_length is not None else mol.GetNumAtoms()
152
+
153
+ features = np.array([[*[a.GetDegree() == i for i in range(5)],
154
+ *[a.GetExplicitValence() == i for i in range(9)],
155
+ *[int(a.GetHybridization()) == i for i in range(1, 7)],
156
+ *[a.GetImplicitValence() == i for i in range(9)],
157
+ a.GetIsAromatic(),
158
+ a.GetNoImplicit(),
159
+ *[a.GetNumExplicitHs() == i for i in range(5)],
160
+ *[a.GetNumImplicitHs() == i for i in range(5)],
161
+ *[a.GetNumRadicalElectrons() == i for i in range(5)],
162
+ a.IsInRing(),
163
+ *[a.IsInRingSize(i) for i in range(2, 9)]] for a in mol.GetAtoms()], dtype=np.int32)
164
+
165
+ return np.vstack((features, np.zeros((max_length - features.shape[0], features.shape[1]))))
166
+
167
+ def decoder_load(self, dictionary_name):
168
+ with open("DrugGEN/data/decoders/" + dictionary_name + "_" + self.dataset_name + '.pkl', 'rb') as f:
169
+ return pickle.load(f)
170
+
171
+ def drugs_decoder_load(self, dictionary_name):
172
+ with open("DrugGEN/data/decoders/" + dictionary_name +'.pkl', 'rb') as f:
173
+ return pickle.load(f)
174
+
175
+ def matrices2mol(self, node_labels, edge_labels, strict=True):
176
+ mol = Chem.RWMol()
177
+ RDLogger.DisableLog('rdApp.*')
178
+ atom_decoders = self.decoder_load("atom")
179
+ bond_decoders = self.decoder_load("bond")
180
+
181
+ for node_label in node_labels:
182
+ mol.AddAtom(Chem.Atom(atom_decoders[node_label]))
183
+
184
+ for start, end in zip(*np.nonzero(edge_labels)):
185
+ if start > end:
186
+ mol.AddBond(int(start), int(end), bond_decoders[edge_labels[start, end]])
187
+ mol = self.correct_mol(mol)
188
+ if strict:
189
+ try:
190
+
191
+ Chem.SanitizeMol(mol)
192
+ except:
193
+ mol = None
194
+
195
+ return mol
196
+
197
+ def drug_decoder_load(self, dictionary_name):
198
+
199
+ ''' Loading the atom and bond decoders '''
200
+
201
+ with open("DrugGEN/data/decoders/" + dictionary_name +"_" + "akt_train" +'.pkl', 'rb') as f:
202
+
203
+ return pickle.load(f)
204
+ def matrices2mol_drugs(self, node_labels, edge_labels, strict=True):
205
+ mol = Chem.RWMol()
206
+ RDLogger.DisableLog('rdApp.*')
207
+ atom_decoders = self.drug_decoder_load("atom")
208
+ bond_decoders = self.drug_decoder_load("bond")
209
+
210
+ for node_label in node_labels:
211
+
212
+ mol.AddAtom(Chem.Atom(atom_decoders[node_label]))
213
+
214
+ for start, end in zip(*np.nonzero(edge_labels)):
215
+ if start > end:
216
+ mol.AddBond(int(start), int(end), bond_decoders[edge_labels[start, end]])
217
+ mol = self.correct_mol(mol)
218
+ if strict:
219
+ try:
220
+ Chem.SanitizeMol(mol)
221
+ except:
222
+ mol = None
223
+
224
+ return mol
225
+ def check_valency(self,mol):
226
+ """
227
+ Checks that no atoms in the mol have exceeded their possible
228
+ valency
229
+ :return: True if no valency issues, False otherwise
230
+ """
231
+ try:
232
+ Chem.SanitizeMol(mol, sanitizeOps=Chem.SanitizeFlags.SANITIZE_PROPERTIES)
233
+ return True, None
234
+ except ValueError as e:
235
+ e = str(e)
236
+ p = e.find('#')
237
+ e_sub = e[p:]
238
+ atomid_valence = list(map(int, re.findall(r'\d+', e_sub)))
239
+ return False, atomid_valence
240
+
241
+
242
+ def correct_mol(self,x):
243
+ # xsm = Chem.MolToSmiles(x, isomericSmiles=True)
244
+ mol = x
245
+ while True:
246
+ flag, atomid_valence = self.check_valency(mol)
247
+ if flag:
248
+ break
249
+ else:
250
+ assert len (atomid_valence) == 2
251
+ idx = atomid_valence[0]
252
+ v = atomid_valence[1]
253
+ queue = []
254
+ for b in mol.GetAtomWithIdx(idx).GetBonds():
255
+ queue.append(
256
+ (b.GetIdx(), int(b.GetBondType()), b.GetBeginAtomIdx(), b.GetEndAtomIdx())
257
+ )
258
+ queue.sort(key=lambda tup: tup[1], reverse=True)
259
+ if len(queue) > 0:
260
+ start = queue[0][2]
261
+ end = queue[0][3]
262
+ t = queue[0][1] - 1
263
+ mol.RemoveBond(start, end)
264
+
265
+ #if t >= 1:
266
+
267
+ #mol.AddBond(start, end, self.decoder_load('bond_decoders')[t])
268
+ # if '.' in Chem.MolToSmiles(mol, isomericSmiles=True):
269
+ # mol.AddBond(start, end, self.decoder_load('bond_decoders')[t])
270
+ # print(tt)
271
+ # print(Chem.MolToSmiles(mol, isomericSmiles=True))
272
+
273
+ return mol
274
+
275
+
276
+
277
+ def label2onehot(self, labels, dim):
278
+
279
+ """Convert label indices to one-hot vectors."""
280
+
281
+ out = torch.zeros(list(labels.size())+[dim])
282
+ out.scatter_(len(out.size())-1,labels.unsqueeze(-1),1.)
283
+
284
+ return out.float()
285
+
286
+ def process(self, size= None):
287
+ '''
288
+ Process the dataset. This function only runs if the file named in processed_file_names is not already present in the data folder.
289
+ '''
290
+ # mols = [Chem.MolFromSmiles(line) for line in open(self.raw_files, 'r').readlines()]
291
+ # mols = list(filter(lambda x: x.GetNumAtoms() <= self.max_atom, mols))
292
+ # mols = mols[:size] # i
293
+ # indices = range(len(mols))
294
+
295
+ smiles = pd.read_csv(self.raw_files, header=None)[0].tolist()
296
+ self._generate_encoders_decoders(smiles)
297
+
298
+ # pbar.set_description(f'Processing chembl dataset')
299
+ # max_length = max(mol.GetNumAtoms() for mol in mols)
300
+ data_list = []
301
+ max_length = min(self.max_atom_size_in_data, self.max_atom)
302
+ self.m_dim = len(self.atom_decoder_m)
303
+ # for idx in indices:
304
+ for smile in tqdm(smiles, desc='Processing chembl dataset', total=len(smiles)):
305
+ # mol = mols[idx]
306
+
307
+ mol = Chem.MolFromSmiles(smile)
308
+
309
+ # filter by max atom size
310
+ if mol.GetNumAtoms() > max_length:
311
+ continue
312
+
313
+ A = self.generate_adjacency_matrix(mol, connected=True, max_length=max_length)
314
+ if A is not None:
315
+
316
+
317
+ x = torch.from_numpy(self.generate_node_features(mol, max_length=max_length)).to(torch.long).view(1, -1)
318
+
319
+ x = self.label2onehot(x,self.m_dim).squeeze()
320
+ if self.features:
321
+ f = torch.from_numpy(self.generate_additional_features(mol, max_length=max_length)).to(torch.long).view(x.shape[0], -1)
322
+ x = torch.concat((x,f), dim=-1)
323
+
324
+ adjacency = torch.from_numpy(A)
325
+
326
+ edge_index = adjacency.nonzero(as_tuple=False).t().contiguous()
327
+ edge_attr = adjacency[edge_index[0], edge_index[1]].to(torch.long)
328
+
329
+ data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
330
+
331
+ if self.pre_filter is not None and not self.pre_filter(data):
332
+ continue
333
+
334
+ if self.pre_transform is not None:
335
+ data = self.pre_transform(data)
336
+
337
+ data_list.append(data)
338
+ # pbar.update(1)
339
+
340
+ # pbar.close()
341
+
342
+ torch.save(self.collate(data_list), osp.join(self.processed_dir, self.dataset_file))
343
+
344
+
345
+
346
+
347
+ if __name__ == '__main__':
348
+ data = DruggenDataset("DrugGEN/data")
349
+
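To illustrate what process() stores per molecule, here is a standalone sketch that builds the dense bond-type adjacency matrix for one SMILES and converts it to the sparse edge_index / edge_attr pair used above. The SMILES string, the padding size, and the helper name are illustrative only; rdkit, numpy, and torch are the only requirements.

import numpy as np
import torch
from rdkit import Chem

# Same bond ordering as the bond_labels list in _generate_encoders_decoders.
BOND_ENCODER = {Chem.rdchem.BondType.ZERO: 0, Chem.rdchem.BondType.SINGLE: 1,
                Chem.rdchem.BondType.DOUBLE: 2, Chem.rdchem.BondType.TRIPLE: 3,
                Chem.rdchem.BondType.AROMATIC: 4}

def dense_adjacency(smiles, max_length):
    mol = Chem.MolFromSmiles(smiles)
    A = np.zeros((max_length, max_length))
    for b in mol.GetBonds():
        i, j = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
        A[i, j] = A[j, i] = BOND_ENCODER[b.GetBondType()]
    return A

A = torch.from_numpy(dense_adjacency("c1ccccc1O", max_length=9))   # phenol, padded to 9 atoms
edge_index = A.nonzero(as_tuple=False).t().contiguous()            # 2 x num_edges, as in process()
edge_attr = A[edge_index[0], edge_index[1]].to(torch.long)         # bond-type label per edge
print(edge_index.shape, edge_attr.shape)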
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch
+ rdkit-pypi
+ tqdm
+ numpy
+ seaborn
+ matplotlib
+ pandas
+ torch_geometric
trainer.py ADDED
@@ -0,0 +1,892 @@
1
+ import os
2
+ import time
3
+ import torch.nn
4
+ import torch
5
+
6
+ from utils import *
7
+ from models import Generator, Generator2, simple_disc
8
+ import torch_geometric.utils as geoutils
9
+ #import #wandb
10
+ import re
11
+ from torch_geometric.loader import DataLoader
12
+ from new_dataloader import DruggenDataset
13
+ import torch.utils.data
14
+ from rdkit import Chem, RDLogger
15
+ import pickle
16
+ from rdkit.Chem.Scaffolds import MurckoScaffold
17
+ torch.set_num_threads(5)
18
+ RDLogger.DisableLog('rdApp.*')
19
+ from loss import discriminator_loss, generator_loss, discriminator2_loss, generator2_loss
20
+ from training_data import load_data
21
+ import random
22
+
23
+
24
+ class Trainer(object):
25
+
26
+ """Trainer for training and testing DrugGEN."""
27
+
28
+ def __init__(self, config):
29
+
30
+ self.device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
31
+ """Initialize configurations."""
32
+ self.submodel = config.submodel
33
+ self.inference_model = config.inference_model
34
+ # Data loader.
35
+ self.raw_file = config.raw_file # SMILES containing text file for first dataset.
36
+ # Write the full path to file.
37
+
38
+ self.drug_raw_file = config.drug_raw_file # SMILES containing text file for second dataset.
39
+ # Write the full path to file.
40
+
41
+
42
+ self.dataset_file = config.dataset_file # Dataset file name for the first GAN.
43
+ # Contains large number of molecules.
44
+
45
+ self.drugs_dataset_file = config.drug_dataset_file # Drug dataset file name for the second GAN.
46
+ # Contains drug molecules only. (In this case AKT1 inhibitors.)
47
+
48
+ self.inf_raw_file = config.inf_raw_file # SMILES containing text file for first dataset.
49
+ # Write the full path to file.
50
+
51
+ self.inf_drug_raw_file = config.inf_drug_raw_file # SMILES containing text file for second dataset.
52
+ # Write the full path to file.
53
+
54
+
55
+ self.inf_dataset_file = config.inf_dataset_file # Dataset file name for the first GAN.
56
+ # Contains large number of molecules.
57
+
58
+ self.inf_drugs_dataset_file = config.inf_drug_dataset_file # Drug dataset file name for the second GAN.
59
+ # Contains drug molecules only. (In this case AKT1 inhibitors.)
60
+
61
+ self.mol_data_dir = config.mol_data_dir # Directory where the dataset files are stored.
62
+
63
+ self.drug_data_dir = config.drug_data_dir # Directory where the drug dataset files are stored.
64
+
65
+ self.dataset_name = self.dataset_file.split(".")[0]
66
+ self.drugs_name = self.drugs_dataset_file.split(".")[0]
67
+
68
+ self.max_atom = config.max_atom # Model is based on one-shot generation.
69
+ # Max atom number for molecules must be specified.
70
+
71
+ self.features = config.features # Whether to use additional node features (Boolean); False uses atom types only.
72
+ # Additional node features can be added. Please check new_dataloader.py Line 102.
73
+
74
+
75
+ self.batch_size = config.batch_size # Batch size for training.
76
+
77
+ self.dataset = DruggenDataset(self.mol_data_dir,
78
+ self.dataset_file,
79
+ self.raw_file,
80
+ self.max_atom,
81
+ self.features) # Dataset for the first GAN. Custom dataset class from PyG parent class.
82
+ # Can create any molecular graph dataset given smiles string.
83
+ # Nonisomeric SMILES are suggested but not necessary.
84
+ # Uses sparse matrix representation for graphs,
85
+ # For computational and speed efficiency.
86
+
87
+ self.loader = DataLoader(self.dataset,
88
+ shuffle=True,
89
+ batch_size=self.batch_size,
90
+ drop_last=True) # PyG dataloader for the first GAN.
91
+
92
+ self.drugs = DruggenDataset(self.drug_data_dir,
93
+ self.drugs_dataset_file,
94
+ self.drug_raw_file,
95
+ self.max_atom,
96
+ self.features) # Dataset for the second GAN. Custom dataset class from PyG parent class.
97
+ # Can create any molecular graph dataset given smiles string.
98
+ # Nonisomeric SMILES are suggested but not necessary.
99
+ # Uses sparse matrix representation for graphs,
100
+ # For computational and speed efficiency.
101
+
102
+ self.drugs_loader = DataLoader(self.drugs,
103
+ shuffle=True,
104
+ batch_size=self.batch_size,
105
+ drop_last=True) # PyG dataloader for the second GAN.
106
+
107
+ # Atom and bond type dimensions for the construction of the model.
108
+
109
+ self.atom_decoders = self.decoder_load("atom") # Atom type decoders for first GAN.
110
+ # eg. 0:0, 1:6 (C), 2:7 (N), 3:8 (O), 4:9 (F)
111
+
112
+ self.bond_decoders = self.decoder_load("bond") # Bond type decoders for first GAN.
113
+ # eg. 0: (no-bond), 1: (single), 2: (double), 3: (triple), 4: (aromatic)
114
+
115
+ self.m_dim = len(self.atom_decoders) if not self.features else int(self.loader.dataset[0].x.shape[1]) # Atom type dimension.
116
+
117
+ self.b_dim = len(self.bond_decoders) # Bond type dimension.
118
+
119
+ self.vertexes = int(self.loader.dataset[0].x.shape[0]) # Number of nodes in the graph.
120
+
121
+ self.drugs_atom_decoders = self.drug_decoder_load("atom") # Atom type decoders for second GAN.
122
+ # eg. 0:0, 1:6 (C), 2:7 (N), 3:8 (O), 4:9 (F)
123
+
124
+ self.drugs_bond_decoders = self.drug_decoder_load("bond") # Bond type decoders for second GAN.
125
+ # eg. 0: (no-bond), 1: (single), 2: (double), 3: (triple), 4: (aromatic)
126
+
127
+ self.drugs_m_dim = len(self.drugs_atom_decoders) if not self.features else int(self.drugs_loader.dataset[0].x.shape[1]) # Atom type dimension.
128
+
129
+ self.drugs_b_dim = len(self.drugs_bond_decoders) # Bond type dimension.
130
+
131
+ self.drug_vertexes = int(self.drugs_loader.dataset[0].x.shape[0]) # Number of nodes in the graph.
132
+
133
+ # Transformer and Convolution configurations.
134
+
135
+ self.act = config.act
136
+
137
+ self.z_dim = config.z_dim
138
+
139
+ self.lambda_gp = config.lambda_gp
140
+
141
+ self.dim = config.dim
142
+
143
+ self.depth = config.depth
144
+
145
+ self.heads = config.heads
146
+
147
+ self.mlp_ratio = config.mlp_ratio
148
+
149
+ self.dec_depth = config.dec_depth
150
+
151
+ self.dec_heads = config.dec_heads
152
+
153
+ self.dec_dim = config.dec_dim
154
+
155
+ self.dis_select = config.dis_select
156
+
157
+ """self.la = config.la
158
+ self.la2 = config.la2
159
+ self.gcn_depth = config.gcn_depth
160
+ self.g_conv_dim = config.g_conv_dim
161
+ self.d_conv_dim = config.d_conv_dim"""
162
+ """# PNA config
163
+
164
+ self.agg = config.aggregators
165
+ self.sca = config.scalers
166
+ self.pna_in_ch = config.pna_in_ch
167
+ self.pna_out_ch = config.pna_out_ch
168
+ self.edge_dim = config.edge_dim
169
+ self.towers = config.towers
170
+ self.pre_lay = config.pre_lay
171
+ self.post_lay = config.post_lay
172
+ self.pna_layer_num = config.pna_layer_num
173
+ self.graph_add = config.graph_add"""
174
+
175
+ # Training configurations.
176
+
177
+ self.epoch = config.epoch
178
+
179
+ self.g_lr = config.g_lr
180
+
181
+ self.d_lr = config.d_lr
182
+
183
+ self.g2_lr = config.g2_lr
184
+
185
+ self.d2_lr = config.d2_lr
186
+
187
+ self.dropout = config.dropout
188
+
189
+ self.dec_dropout = config.dec_dropout
190
+
191
+ self.n_critic = config.n_critic
192
+
193
+ self.beta1 = config.beta1
194
+
195
+ self.beta2 = config.beta2
196
+
197
+ self.resume_iters = config.resume_iters
198
+
199
+ self.warm_up_steps = config.warm_up_steps
200
+
201
+ # Test configurations.
202
+
203
+ self.num_test_epoch = config.num_test_epoch
204
+
205
+ self.test_iters = config.test_iters
206
+
207
+ self.inference_sample_num = config.inference_sample_num
208
+
209
+ # Directories.
210
+
211
+ self.log_dir = config.log_dir
212
+ self.sample_dir = config.sample_dir
213
+ self.model_save_dir = config.model_save_dir
214
+ self.result_dir = config.result_dir
215
+
216
+ # Step size.
217
+
218
+ self.log_step = config.log_sample_step
219
+ self.clipping_value = config.clipping_value
220
+ # Miscellaneous.
221
+
222
+ self.mode = config.mode
223
+
224
+ self.noise_strength_0 = torch.nn.Parameter(torch.zeros([]))
225
+ self.noise_strength_1 = torch.nn.Parameter(torch.zeros([]))
226
+ self.noise_strength_2 = torch.nn.Parameter(torch.zeros([]))
227
+ self.noise_strength_3 = torch.nn.Parameter(torch.zeros([]))
228
+
229
+ self.init_type = config.init_type
230
+ self.build_model()
231
+
232
+
233
+
234
+ def build_model(self):
235
+ """Create generators and discriminators."""
236
+
237
+ ''' Generator is based on Transformer Encoder:
238
+
239
+ @ g_conv_dim: Dimensions for first MLP layers before Transformer Encoder
240
+ @ vertexes: maximum length of generated molecules (atom length)
241
+ @ b_dim: number of bond types
242
+ @ m_dim: number of atom types (or number of features used)
243
+ @ dropout: dropout possibility
244
+ @ dim: Hidden dimension of Transformer Encoder
245
+ @ depth: Transformer layer number
246
+ @ heads: Number of multihead-attention heads
247
+ @ mlp_ratio: Read-out layer dimension of Transformer
248
+ @ drop_rate: deprecated
249
+ @ tra_conv: Whether module creates output for TransformerConv discriminator
250
+ '''
251
+
252
+ self.G = Generator(self.z_dim,
253
+ self.act,
254
+ self.vertexes,
255
+ self.b_dim,
256
+ self.m_dim,
257
+ self.dropout,
258
+ dim=self.dim,
259
+ depth=self.depth,
260
+ heads=self.heads,
261
+ mlp_ratio=self.mlp_ratio,
262
+ submodel = self.submodel)
263
+
264
+ self.G2 = Generator2(self.dim,
265
+ self.dec_dim,
266
+ self.dec_depth,
267
+ self.dec_heads,
268
+ self.mlp_ratio,
269
+ self.dec_dropout,
270
+ self.drugs_m_dim,
271
+ self.drugs_b_dim,
272
+ self.submodel)
273
+
274
+
275
+
276
+ ''' Discriminator implementation with PNA:
277
+
278
+ @ deg: Degree distribution based on used data. (Created with _genDegree() function)
279
+ @ agg: aggregators used in PNA
280
+ @ sca: scalers used in PNA
281
+ @ pna_in_ch: First PNA hidden dimension
282
+ @ pna_out_ch: Last PNA hidden dimension
283
+ @ edge_dim: Edge hidden dimension
284
+ @ towers: Number of towers (Splitting the hidden dimension to multiple parallel processes)
285
+ @ pre_lay: Pre-transformation layer
286
+ @ post_lay: Post-transformation layer
287
+ @ pna_layer_num: number of PNA layers
288
+ @ graph_add: global pooling layer selection
289
+ '''
290
+
291
+
292
+ ''' Discriminator implementation with Graph Convolution:
293
+
294
+ @ d_conv_dim: convolution dimensions for GCN
295
+ @ m_dim: number of atom types (or number of features used)
296
+ @ b_dim: number of bond types
297
+ @ dropout: dropout possibility
298
+ '''
299
+
300
+ ''' Discriminator implementation with MLP:
301
+
302
+ @ act: Activation function for MLP
303
+ @ m_dim: number of atom types (or number of features used)
304
+ @ b_dim: number of bond types
305
+ @ dropout: dropout possibility
306
+ @ vertexes: maximum length of generated molecules (molecule length)
307
+ '''
308
+
309
+ #self.D = Discriminator_old(self.d_conv_dim, self.m_dim , self.b_dim, self.dropout, self.gcn_depth)
310
+ self.D2 = simple_disc("tanh", self.drugs_m_dim, self.drug_vertexes, self.drugs_b_dim)
311
+ self.D = simple_disc("tanh", self.m_dim, self.vertexes, self.b_dim)
312
+ self.V = simple_disc("tanh", self.m_dim, self.vertexes, self.b_dim)
313
+ self.V2 = simple_disc("tanh", self.drugs_m_dim, self.drug_vertexes, self.drugs_b_dim)
314
+
315
+ ''' Optimizers for G1, G2, D1, and D2:
316
+
317
+ Adam Optimizer is used and different beta1 and beta2s are used for GAN1 and GAN2
318
+ '''
319
+
320
+ self.g_optimizer = torch.optim.AdamW(self.G.parameters(), self.g_lr, [self.beta1, self.beta2])
321
+ self.g2_optimizer = torch.optim.AdamW(self.G2.parameters(), self.g2_lr, [self.beta1, self.beta2])
322
+
323
+ self.d_optimizer = torch.optim.AdamW(self.D.parameters(), self.d_lr, [self.beta1, self.beta2])
324
+ self.d2_optimizer = torch.optim.AdamW(self.D2.parameters(), self.d2_lr, [self.beta1, self.beta2])
325
+
326
+
327
+
328
+ self.v_optimizer = torch.optim.AdamW(self.V.parameters(), self.d_lr, [self.beta1, self.beta2])
329
+ self.v2_optimizer = torch.optim.AdamW(self.V2.parameters(), self.d2_lr, [self.beta1, self.beta2])
330
+ ''' Learning rate scheduler:
331
+
332
+ Changes learning rate based on loss.
333
+ '''
334
+
335
+ #self.scheduler_g = ReduceLROnPlateau(self.g_optimizer, mode='min', factor=0.5, patience=10, min_lr=0.00001)
336
+
337
+
338
+ #self.scheduler_d = ReduceLROnPlateau(self.d_optimizer, mode='min', factor=0.5, patience=10, min_lr=0.00001)
339
+
340
+ #self.scheduler_v = ReduceLROnPlateau(self.v_optimizer, mode='min', factor=0.5, patience=10, min_lr=0.00001)
341
+ #self.scheduler_g2 = ReduceLROnPlateau(self.g2_optimizer, mode='min', factor=0.5, patience=10, min_lr=0.00001)
342
+ #self.scheduler_d2 = ReduceLROnPlateau(self.d2_optimizer, mode='min', factor=0.5, patience=10, min_lr=0.00001)
343
+ #self.scheduler_v2 = ReduceLROnPlateau(self.v2_optimizer, mode='min', factor=0.5, patience=10, min_lr=0.00001)
344
+ self.print_network(self.G, 'G')
345
+ self.print_network(self.D, 'D')
346
+
347
+ self.print_network(self.G2, 'G2')
348
+ self.print_network(self.D2, 'D2')
349
+
350
+ self.G.to(self.device)
351
+ self.D.to(self.device)
352
+
353
+ self.V.to(self.device)
354
+ self.V2.to(self.device)
355
+ self.G2.to(self.device)
356
+ self.D2.to(self.device)
357
+
358
+ #self.V2.to(self.device)
359
+ #self.modules_of_the_model = (self.G, self.D, self.G2, self.D2)
360
+ """for p in self.G.parameters():
361
+ if p.dim() > 1:
362
+ if self.init_type == 'uniform':
363
+ torch.nn.init.xavier_uniform_(p)
364
+ elif self.init_type == 'normal':
365
+ torch.nn.init.xavier_normal_(p)
366
+ elif self.init_type == 'random_normal':
367
+ torch.nn.init.normal_(p, 0.0, 0.02)
368
+ for p in self.G2.parameters():
369
+ if p.dim() > 1:
370
+ if self.init_type == 'uniform':
371
+ torch.nn.init.xavier_uniform_(p)
372
+ elif self.init_type == 'normal':
373
+ torch.nn.init.xavier_normal_(p)
374
+ elif self.init_type == 'random_normal':
375
+ torch.nn.init.normal_(p, 0.0, 0.02)
376
+ if self.dis_select == "conv":
377
+ for p in self.D.parameters():
378
+ if p.dim() > 1:
379
+ if self.init_type == 'uniform':
380
+ torch.nn.init.xavier_uniform_(p)
381
+ elif self.init_type == 'normal':
382
+ torch.nn.init.xavier_normal_(p)
383
+ elif self.init_type == 'random_normal':
384
+ torch.nn.init.normal_(p, 0.0, 0.02)
385
+
386
+ if self.dis_select == "conv":
387
+ for p in self.D2.parameters():
388
+ if p.dim() > 1:
389
+ if self.init_type == 'uniform':
390
+ torch.nn.init.xavier_uniform_(p)
391
+ elif self.init_type == 'normal':
392
+ torch.nn.init.xavier_normal_(p)
393
+ elif self.init_type == 'random_normal':
394
+ torch.nn.init.normal_(p, 0.0, 0.02)"""
395
+
396
+
397
+ def decoder_load(self, dictionary_name):
398
+
399
+ ''' Loading the atom and bond decoders'''
400
+
401
+ with open("DrugGEN/data/decoders/" + dictionary_name + "_" + self.dataset_name + '.pkl', 'rb') as f:
402
+
403
+ return pickle.load(f)
404
+
405
+ def drug_decoder_load(self, dictionary_name):
406
+
407
+ ''' Loading the atom and bond decoders'''
408
+
409
+ with open("DrugGEN/data/decoders/" + dictionary_name +"_" + self.drugs_name +'.pkl', 'rb') as f:
410
+
411
+ return pickle.load(f)
412
+
413
+ def print_network(self, model, name):
414
+
415
+ """Print out the network information."""
416
+
417
+ num_params = 0
418
+ for p in model.parameters():
419
+ num_params += p.numel()
420
+ print(model)
421
+ print(name)
422
+ print("The number of parameters: {}".format(num_params))
423
+
424
+
425
+ def restore_model(self, epoch, iteration, model_directory):
426
+
427
+ """Restore the trained generator and discriminator."""
428
+
429
+ print('Loading the trained models from epoch / iteration {}-{}...'.format(epoch, iteration))
430
+
431
+ G_path = os.path.join(model_directory, '{}-{}-G.ckpt'.format(epoch, iteration))
432
+ #D_path = os.path.join(model_directory, '{}-{}-D.ckpt'.format(epoch, iteration))
433
+
434
+ self.G.load_state_dict(torch.load(G_path, map_location=lambda storage, loc: storage))
435
+ #self.D.load_state_dict(torch.load(D_path, map_location=lambda storage, loc: storage))
436
+
437
+
438
+ G2_path = os.path.join(model_directory, '{}-{}-G2.ckpt'.format(epoch, iteration))
439
+ #D2_path = os.path.join(model_directory, '{}-{}-D2.ckpt'.format(epoch, iteration))
440
+
441
+ self.G2.load_state_dict(torch.load(G2_path, map_location=lambda storage, loc: storage))
442
+ #self.D2.load_state_dict(torch.load(D2_path, map_location=lambda storage, loc: storage))
443
+
444
+
445
+ def save_model(self, model_directory, idx,i):
446
+ G_path = os.path.join(model_directory, '{}-{}-G.ckpt'.format(idx+1,i+1))
447
+ D_path = os.path.join(model_directory, '{}-{}-D.ckpt'.format(idx+1,i+1))
448
+ torch.save(self.G.state_dict(), G_path)
449
+ torch.save(self.D.state_dict(), D_path)
450
+
451
+ if self.submodel != "NoTarget" and self.submodel != "CrossLoss":
452
+ G2_path = os.path.join(model_directory, '{}-{}-G2.ckpt'.format(idx+1,i+1))
453
+ D2_path = os.path.join(model_directory, '{}-{}-D2.ckpt'.format(idx+1,i+1))
454
+
455
+ torch.save(self.G2.state_dict(), G2_path)
456
+ torch.save(self.D2.state_dict(), D2_path)
457
+
458
+ def reset_grad(self):
459
+
460
+ """Reset the gradient buffers."""
461
+
462
+ self.g_optimizer.zero_grad()
463
+ self.v_optimizer.zero_grad()
464
+ self.g2_optimizer.zero_grad()
465
+ self.v2_optimizer.zero_grad()
466
+
467
+ self.d_optimizer.zero_grad()
468
+ self.d2_optimizer.zero_grad()
469
+
470
+ def gradient_penalty(self, y, x):
471
+
472
+ """Compute gradient penalty: (L2_norm(dy/dx) - 1)**2."""
473
+
474
+ weight = torch.ones(y.size(),requires_grad=False).to(self.device)
475
+ dydx = torch.autograd.grad(outputs=y,
476
+ inputs=x,
477
+ grad_outputs=weight,
478
+ retain_graph=True,
479
+ create_graph=True,
480
+ only_inputs=True)[0]
481
+
482
+ dydx = dydx.view(dydx.size(0), -1)
483
+ gradient_penalty = ((dydx.norm(2, dim=1) - 1) ** 2).mean()
484
+
485
+ return gradient_penalty
486
+
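To make the call convention of gradient_penalty concrete, here is a toy WGAN-GP computation with a stand-in critic and random interpolates. The critic, tensor sizes, and lambda value are placeholders, not the models or settings used elsewhere in this file.

import torch
import torch.nn as nn

critic = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 1))   # stand-in discriminator

real = torch.rand(8, 16)
fake = torch.rand(8, 16)
eps = torch.rand(8, 1)
x_hat = (eps * real + (1 - eps) * fake).requires_grad_(True)   # interpolates need grad enabled

scores = critic(x_hat)
grads = torch.autograd.grad(outputs=scores, inputs=x_hat,
                            grad_outputs=torch.ones_like(scores),
                            create_graph=True, retain_graph=True, only_inputs=True)[0]
penalty = ((grads.view(grads.size(0), -1).norm(2, dim=1) - 1) ** 2).mean()
loss_term = 10.0 * penalty        # lambda_gp = 10 is a common WGAN-GP choice, not read from config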
487
+ def train(self):
488
+
489
+ ''' Training Script starts from here'''
490
+
491
+ #wandb.config = {'beta2': 0.999}
492
+ #wandb.init(project="DrugGEN2", entity="atabeyunlu")
493
+
494
+ # Defining sampling paths and creating logger
495
+
496
+ self.arguments = "{}_glr{}_dlr{}_g2lr{}_d2lr{}_dim{}_depth{}_heads{}_decdepth{}_decheads{}_ncritic{}_batch{}_epoch{}_warmup{}_dataset{}_dropout{}".format(self.submodel,self.g_lr,self.d_lr,self.g2_lr,self.d2_lr,self.dim,self.depth,self.heads,self.dec_depth,self.dec_heads,self.n_critic,self.batch_size,self.epoch,self.warm_up_steps,self.dataset_name,self.dropout)
497
+
498
+ self.model_directory= os.path.join(self.model_save_dir,self.arguments)
499
+ self.sample_directory=os.path.join(self.sample_dir,self.arguments)
500
+ self.log_path = os.path.join(self.log_dir, "{}.txt".format(self.arguments))
501
+ if not os.path.exists(self.model_directory):
502
+ os.makedirs(self.model_directory)
503
+ if not os.path.exists(self.sample_directory):
504
+ os.makedirs(self.sample_directory)
505
+
506
+ # Learning rate cache for decaying.
507
+
508
+
509
+ # protein data
510
+ full_smiles = [line for line in open("DrugGEN/data/chembl_train.smi", 'r').read().splitlines()]
511
+ drug_smiles = [line for line in open("DrugGEN/data/akt_train.smi", 'r').read().splitlines()]
512
+
513
+ drug_mols = [Chem.MolFromSmiles(smi) for smi in drug_smiles]
514
+ drug_scaf = [MurckoScaffold.GetScaffoldForMol(x) for x in drug_mols]
515
+ fps_r = [Chem.RDKFingerprint(x) for x in drug_scaf]
516
+
517
+ akt1_human_adj = torch.load("DrugGEN/data/akt/AKT1_human_adj.pt").reshape(1,-1).to(self.device).float()
518
+ akt1_human_annot = torch.load("DrugGEN/data/akt/AKT1_human_annot.pt").reshape(1,-1).to(self.device).float()
519
+
520
+ # Start training.
521
+
522
+ print('Start training...')
523
+ self.start_time = time.time()
524
+ for idx in range(self.epoch):
525
+
526
+ # =================================================================================== #
527
+ # 1. Preprocess input data #
528
+ # =================================================================================== #
529
+
530
+ # Load the data
531
+
532
+ dataloader_iterator = iter(self.drugs_loader)
533
+
534
+ for i, data in enumerate(self.loader):
535
+ try:
536
+ drugs = next(dataloader_iterator)
537
+ except StopIteration:
538
+ dataloader_iterator = iter(self.drugs_loader)
539
+ drugs = next(dataloader_iterator)
540
+
541
+ # Preprocess both dataset
542
+
543
+ bulk_data = load_data(data,
544
+ drugs,
545
+ self.batch_size,
546
+ self.device,
547
+ self.b_dim,
548
+ self.m_dim,
549
+ self.drugs_b_dim,
550
+ self.drugs_m_dim,
551
+ self.z_dim,
552
+ self.vertexes)
553
+
554
+ drug_graphs, real_graphs, a_tensor, x_tensor, drugs_a_tensor, drugs_x_tensor, z, z_edge, z_node = bulk_data
555
+
556
+ if self.submodel == "CrossLoss":
557
+ GAN1_input_e = drugs_a_tensor
558
+ GAN1_input_x = drugs_x_tensor
559
+ GAN1_disc_e = a_tensor
560
+ GAN1_disc_x = x_tensor
561
+ elif self.submodel == "Ligand":
562
+ GAN1_input_e = a_tensor
563
+ GAN1_input_x = x_tensor
564
+ GAN1_disc_e = a_tensor
565
+ GAN1_disc_x = x_tensor
566
+ GAN2_input_e = drugs_a_tensor
567
+ GAN2_input_x = drugs_x_tensor
568
+ GAN2_disc_e = drugs_a_tensor
569
+ GAN2_disc_x = drugs_x_tensor
570
+ elif self.submodel == "Prot":
571
+ GAN1_input_e = a_tensor
572
+ GAN1_input_x = x_tensor
573
+ GAN1_disc_e = a_tensor
574
+ GAN1_disc_x = x_tensor
575
+ GAN2_input_e = akt1_human_adj
576
+ GAN2_input_x = akt1_human_annot
577
+ GAN2_disc_e = drugs_a_tensor
578
+ GAN2_disc_x = drugs_x_tensor
579
+ elif self.submodel == "RL":
580
+ GAN1_input_e = z_edge
581
+ GAN1_input_x = z_node
582
+ GAN1_disc_e = a_tensor
583
+ GAN1_disc_x = x_tensor
584
+ GAN2_input_e = drugs_a_tensor
585
+ GAN2_input_x = drugs_x_tensor
586
+ GAN2_disc_e = drugs_a_tensor
587
+ GAN2_disc_x = drugs_x_tensor
588
+ elif self.submodel == "NoTarget":
589
+ GAN1_input_e = z_edge
590
+ GAN1_input_x = z_node
591
+ GAN1_disc_e = a_tensor
592
+ GAN1_disc_x = x_tensor
593
+
594
+ # =================================================================================== #
595
+ # 2. Train the discriminator #
596
+ # =================================================================================== #
597
+ loss = {}
598
+ self.reset_grad()
599
+
600
+ # Compute discriminator loss.
601
+
602
+ node, edge, d_loss = discriminator_loss(self.G,
603
+ self.D,
604
+ real_graphs,
605
+ GAN1_disc_e,
606
+ GAN1_disc_x,
607
+ self.batch_size,
608
+ self.device,
609
+ self.gradient_penalty,
610
+ self.lambda_gp,
611
+ GAN1_input_e,
612
+ GAN1_input_x)
613
+
614
+ d_total = d_loss
615
+ if self.submodel != "NoTarget" and self.submodel != "CrossLoss":
616
+ d2_loss = discriminator2_loss(self.G2,
617
+ self.D2,
618
+ drug_graphs,
619
+ edge,
620
+ node,
621
+ self.batch_size,
622
+ self.device,
623
+ self.gradient_penalty,
624
+ self.lambda_gp,
625
+ GAN2_input_e,
626
+ GAN2_input_x)
627
+ d_total = d_loss + d2_loss
628
+
629
+ loss["d_total"] = d_total.item()
630
+ d_total.backward()
631
+ self.d_optimizer.step()
632
+ if self.submodel != "NoTarget" and self.submodel != "CrossLoss":
633
+ self.d2_optimizer.step()
634
+ self.reset_grad()
635
+ generator_output = generator_loss(self.G,
636
+ self.D,
637
+ self.V,
638
+ GAN1_input_e,
639
+ GAN1_input_x,
640
+ self.batch_size,
641
+ sim_reward,
642
+ self.dataset.matrices2mol_drugs,
643
+ fps_r,
644
+ self.submodel)
645
+
646
+ g_loss, fake_mol, g_edges_hat_sample, g_nodes_hat_sample, node, edge = generator_output
647
+
648
+ self.reset_grad()
649
+ g_total = g_loss
650
+ if self.submodel != "NoTarget" and self.submodel != "CrossLoss":
651
+ output = generator2_loss(self.G2,
652
+ self.D2,
653
+ self.V2,
654
+ edge,
655
+ node,
656
+ self.batch_size,
657
+ sim_reward,
658
+ self.dataset.matrices2mol_drugs,
659
+ fps_r,
660
+ GAN2_input_e,
661
+ GAN2_input_x,
662
+ self.submodel)
663
+
664
+ g2_loss, fake_mol_g, dr_g_edges_hat_sample, dr_g_nodes_hat_sample = output
665
+
666
+ g_total = g_loss + g2_loss
667
+
668
+ loss["g_total"] = g_total.item()
669
+ g_total.backward()
670
+ self.g_optimizer.step()
671
+ if self.submodel != "NoTarget" and self.submodel != "CrossLoss":
672
+ self.g2_optimizer.step()
673
+
674
+ if self.submodel == "RL":
675
+ self.v_optimizer.step()
676
+ self.v2_optimizer.step()
677
+
678
+
679
+ if (i+1) % self.log_step == 0:
680
+
681
+ logging(self.log_path, self.start_time, fake_mol, full_smiles, i, idx, loss, 1,self.sample_directory)
682
+ mol_sample(self.sample_directory,"GAN1",fake_mol, g_edges_hat_sample.detach(), g_nodes_hat_sample.detach(), idx, i)
683
+ if self.submodel != "NoTarget" and self.submodel != "CrossLoss":
684
+ logging(self.log_path, self.start_time, fake_mol_g, drug_smiles, i, idx, loss, 2,self.sample_directory)
685
+ mol_sample(self.sample_directory,"GAN2",fake_mol_g, dr_g_edges_hat_sample.detach(), dr_g_nodes_hat_sample.detach(), idx, i)
686
+
687
+
688
+ if (idx+1) % 10 == 0:
689
+ self.save_model(self.model_directory,idx,i)
690
+ print("model saved at epoch {} and iteration {}".format(idx,i))
691
+
692
+
693
+
694
+ def inference(self):
695
+
696
+ # Load the trained generator.
697
+ self.G.to(self.device)
698
+ #self.D.to(self.device)
699
+ self.G2.to(self.device)
700
+ #self.D2.to(self.device)
701
+
702
+ G_path = os.path.join(self.inference_model, '{}-G.ckpt'.format(self.submodel))
703
+ self.G.load_state_dict(torch.load(G_path, map_location=lambda storage, loc: storage))
704
+ G2_path = os.path.join(self.inference_model, '{}-G2.ckpt'.format(self.submodel))
705
+ self.G2.load_state_dict(torch.load(G2_path, map_location=lambda storage, loc: storage))
706
+
707
+
708
+ drug_smiles = [line for line in open("DrugGEN/data/akt_test.smi", 'r').read().splitlines()]
709
+
710
+ drug_mols = [Chem.MolFromSmiles(smi) for smi in drug_smiles]
711
+ drug_scaf = [MurckoScaffold.GetScaffoldForMol(x) for x in drug_mols]
712
+ fps_r = [Chem.RDKFingerprint(x) for x in drug_scaf]
713
+
714
+ akt1_human_adj = torch.load("DrugGEN/data/akt/AKT1_human_adj.pt").reshape(1,-1).to(self.device).float()
715
+ akt1_human_annot = torch.load("DrugGEN/data/akt/AKT1_human_annot.pt").reshape(1,-1).to(self.device).float()
716
+
717
+ self.G.eval()
718
+ #self.D.eval()
719
+ self.G2.eval()
720
+ #self.D2.eval()
721
+
722
+ self.inf_batch_size =256
723
+ self.inf_dataset = DruggenDataset(self.mol_data_dir,
724
+ self.inf_dataset_file,
725
+ self.inf_raw_file,
726
+ self.max_atom,
727
+ self.features) # Dataset for the first GAN. Custom dataset class from PyG parent class.
728
+ # Can create any molecular graph dataset given smiles string.
729
+ # Nonisomeric SMILES are suggested but not necessary.
730
+ # Uses sparse matrix representation for graphs,
731
+ # For computational and speed efficiency.
732
+
733
+ self.inf_loader = DataLoader(self.inf_dataset,
734
+ shuffle=True,
735
+ batch_size=self.inf_batch_size,
736
+ drop_last=True) # PyG dataloader for the first GAN.
737
+
738
+ self.inf_drugs = DruggenDataset(self.drug_data_dir,
739
+ self.inf_drugs_dataset_file,
740
+ self.inf_drug_raw_file,
741
+ self.max_atom,
742
+ self.features) # Dataset for the second GAN. Custom dataset class from PyG parent class.
743
+ # Can create any molecular graph dataset given smiles string.
744
+ # Non-isomeric SMILES are suggested but not required.
745
+ # Uses a sparse matrix representation for graphs
746
+ # for computational and speed efficiency.
747
+
748
+ self.inf_drugs_loader = DataLoader(self.inf_drugs,
749
+ shuffle=True,
750
+ batch_size=self.inf_batch_size,
751
+ drop_last=True) # PyG dataloader for the second GAN.
752
+ start_time = time.time()
753
+ #metric_calc_mol = []
754
+ metric_calc_dr = []
755
+ date = time.time()
756
+ if not os.path.exists("DrugGEN/experiments/inference/{}".format(self.submodel)):
757
+ os.makedirs("DrugGEN/experiments/inference/{}".format(self.submodel))
758
+ with torch.inference_mode():
759
+
760
+ dataloader_iterator = iter(self.inf_drugs_loader)
761
+
762
+ for i, data in enumerate(self.inf_loader):
763
+ try:
764
+ drugs = next(dataloader_iterator)
765
+ except StopIteration:
766
+ dataloader_iterator = iter(self.inf_drugs_loader)
767
+ drugs = next(dataloader_iterator)
768
+
769
+ # Preprocess both datasets
770
+
771
+ bulk_data = load_data(data,
772
+ drugs,
773
+ self.inf_batch_size,
774
+ self.device,
775
+ self.b_dim,
776
+ self.m_dim,
777
+ self.drugs_b_dim,
778
+ self.drugs_m_dim,
779
+ self.z_dim,
780
+ self.vertexes)
781
+
782
+ drug_graphs, real_graphs, a_tensor, x_tensor, drugs_a_tensor, drugs_x_tensor, z, z_edge, z_node = bulk_data
783
+
784
+ if self.submodel == "CrossLoss":
785
+ GAN1_input_e = a_tensor
786
+ GAN1_input_x = x_tensor
787
+ GAN1_disc_e = drugs_a_tensor
788
+ GAN1_disc_x = drugs_x_tensor
789
+ GAN2_input_e = drugs_a_tensor
790
+ GAN2_input_x = drugs_x_tensor
791
+ GAN2_disc_e = a_tensor
792
+ GAN2_disc_x = x_tensor
793
+ elif self.submodel == "Ligand":
794
+ GAN1_input_e = a_tensor
795
+ GAN1_input_x = x_tensor
796
+ GAN1_disc_e = a_tensor
797
+ GAN1_disc_x = x_tensor
798
+ GAN2_input_e = drugs_a_tensor
799
+ GAN2_input_x = drugs_x_tensor
800
+ GAN2_disc_e = drugs_a_tensor
801
+ GAN2_disc_x = drugs_x_tensor
802
+ elif self.submodel == "Prot":
803
+ GAN1_input_e = a_tensor
804
+ GAN1_input_x = x_tensor
805
+ GAN1_disc_e = a_tensor
806
+ GAN1_disc_x = x_tensor
807
+ GAN2_input_e = akt1_human_adj
808
+ GAN2_input_x = akt1_human_annot
809
+ GAN2_disc_e = drugs_a_tensor
810
+ GAN2_disc_x = drugs_x_tensor
811
+ elif self.submodel == "RL":
812
+ GAN1_input_e = z_edge
813
+ GAN1_input_x = z_node
814
+ GAN1_disc_e = a_tensor
815
+ GAN1_disc_x = x_tensor
816
+ GAN2_input_e = drugs_a_tensor
817
+ GAN2_input_x = drugs_x_tensor
818
+ GAN2_disc_e = drugs_a_tensor
819
+ GAN2_disc_x = drugs_x_tensor
820
+ elif self.submodel == "NoTarget":
821
+ GAN1_input_e = z_edge
822
+ GAN1_input_x = z_node
823
+ GAN1_disc_e = a_tensor
824
+ GAN1_disc_x = x_tensor
825
+ # =================================================================================== #
826
+ # 2. GAN1 Inference #
827
+ # =================================================================================== #
828
+ generator_output = generator_loss(self.G,
829
+ self.D,
830
+ self.V,
831
+ GAN1_input_e,
832
+ GAN1_input_x,
833
+ self.inf_batch_size,
834
+ sim_reward,
835
+ self.dataset.matrices2mol_drugs,
836
+ fps_r,
837
+ self.submodel)
838
+
839
+ _, fake_mol, _, _, node, edge = generator_output
840
+
841
+ # =================================================================================== #
842
+ # 3. GAN2 Inference #
843
+ # =================================================================================== #
844
+
845
+ output = generator2_loss(self.G2,
846
+ self.D2,
847
+ self.V2,
848
+ edge,
849
+ node,
850
+ self.inf_batch_size,
851
+ sim_reward,
852
+ self.dataset.matrices2mol_drugs,
853
+ fps_r,
854
+ GAN2_input_e,
855
+ GAN2_input_x,
856
+ self.submodel)
857
+
858
+ _, fake_mol_g, _, _ = output
859
+
860
+ inference_drugs = [Chem.MolToSmiles(line) for line in fake_mol_g if line is not None]
861
+
862
+
863
+
864
+ #inference_smiles = [Chem.MolToSmiles(line) for line in fake_mol]
865
+
866
+
867
+
868
+ print("molecule batch {} inferred".format(i))
869
+
870
+ with open("DrugGEN/experiments/inference/{}/inference_drugs.txt".format(self.submodel), "a") as f:
871
+ for molecules in inference_drugs:
872
+
873
+ f.write(molecules)
874
+ f.write("\n")
875
+ metric_calc_dr.append(molecules)
876
+
877
+
878
+
879
+ if i == 120:
880
+ break
881
+
882
+ et = time.time() - start_time
883
+
884
+ print("Inference took {:.2f} seconds.".format(et))
885
+
886
+ print("Metrics calculation started using MOSES.")
887
+
888
+ print("Validity: ", fraction_valid(metric_calc_dr), "\n")
889
+ print("Uniqueness: ", fraction_unique(metric_calc_dr), "\n")
890
+ print("Novelty: ", novelty(metric_calc_dr, drug_smiles), "\n")
891
+
892
+ print("Metrics calculation completed.")
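The metrics printed above come from the MOSES-style helpers defined in utils.py further down. A minimal offline sketch of the same calculation, re-reading the SMILES file written during inference (the file paths are the ones used in the code above; the submodel name is only an example):

    # Recompute the inference metrics from the saved SMILES file (sketch).
    from utils import fraction_valid, fraction_unique, novelty

    submodel = "CrossLoss"  # example; use whichever submodel was run

    with open("DrugGEN/experiments/inference/{}/inference_drugs.txt".format(submodel)) as f:
        generated = f.read().splitlines()

    reference = open("DrugGEN/data/akt_test.smi").read().splitlines()

    print("Validity: ", fraction_valid(generated))
    print("Uniqueness: ", fraction_unique(generated))
    print("Novelty: ", novelty(generated, reference))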
training_data.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch
2
+ import torch_geometric.utils as geoutils
3
+ from utils import *
4
+
5
+ def load_data(data, drugs, batch_size, device, b_dim, m_dim, drugs_b_dim, drugs_m_dim,z_dim,vertexes):
6
+
7
+ z = sample_z(batch_size, z_dim) # (batch_size, z_dim)
8
+
9
+ z = torch.from_numpy(z).to(device).float().requires_grad_(True)
10
+ data = data.to(device)
11
+ drugs = drugs.to(device)
12
+ z_e = sample_z_edge(batch_size,vertexes,b_dim) # (batch_size, vertexes, vertexes, b_dim)
13
+ z_n = sample_z_node(batch_size,vertexes,m_dim) # (batch_size, vertexes, m_dim)
14
+ z_edge = torch.from_numpy(z_e).to(device).float().requires_grad_(True) # Edge noise. (batch_size, vertexes, vertexes, b_dim)
15
+ z_node = torch.from_numpy(z_n).to(device).float().requires_grad_(True) # Node noise. (batch_size, vertexes, m_dim)
16
+ a = geoutils.to_dense_adj(edge_index = data.edge_index,batch=data.batch,edge_attr=data.edge_attr, max_num_nodes=int(data.batch.shape[0]/batch_size))
17
+ x = data.x.view(batch_size,int(data.batch.shape[0]/batch_size),-1)
18
+
19
+ a_tensor = label2onehot(a, b_dim, device)
20
+ #x_tensor = label2onehot(x, m_dim)
21
+ x_tensor = x
22
+
23
+ a_tensor = a_tensor #+ torch.randn([a_tensor.size(0), a_tensor.size(1), a_tensor.size(2),1], device=a_tensor.device) * noise_strength_0
24
+ x_tensor = x_tensor #+ torch.randn([x_tensor.size(0), x_tensor.size(1),1], device=x_tensor.device) * noise_strength_1
25
+
26
+ drugs_a = geoutils.to_dense_adj(edge_index = drugs.edge_index,batch=drugs.batch,edge_attr=drugs.edge_attr, max_num_nodes=int(drugs.batch.shape[0]/batch_size))
27
+
28
+ drugs_x = drugs.x.view(batch_size,int(drugs.batch.shape[0]/batch_size),-1)
29
+
30
+ drugs_a = drugs_a.to(device).long()
31
+ drugs_x = drugs_x.to(device)
32
+ drugs_a_tensor = label2onehot(drugs_a, drugs_b_dim,device).float()
33
+ drugs_x_tensor = drugs_x
34
+
35
+ drugs_a_tensor = drugs_a_tensor #+ torch.randn([drugs_a_tensor.size(0), drugs_a_tensor.size(1), drugs_a_tensor.size(2),1], device=drugs_a_tensor.device) * noise_strength_2
36
+ drugs_x_tensor = drugs_x_tensor #+ torch.randn([drugs_x_tensor.size(0), drugs_x_tensor.size(1),1], device=drugs_x_tensor.device) * noise_strength_3
37
+ #prot_n = akt1_human_annot[None,:].to(device).float()
38
+ #prot_e = akt1_human_adj[None,None,:].view(1,546,546,1).to(device).float()
39
+
40
+
41
+
42
+ a_tensor_vec = a_tensor.reshape(batch_size,-1)
43
+ x_tensor_vec = x_tensor.reshape(batch_size,-1)
44
+ real_graphs = torch.concat((x_tensor_vec,a_tensor_vec),dim=-1)
45
+
46
+ a_drug_vec = drugs_a_tensor.reshape(batch_size,-1)
47
+ x_drug_vec = drugs_x_tensor.reshape(batch_size,-1)
48
+ drug_graphs = torch.concat((x_drug_vec,a_drug_vec),dim=-1)
49
+
50
+ return drug_graphs, real_graphs, a_tensor, x_tensor, drugs_a_tensor, drugs_x_tensor, z, z_edge, z_node
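For orientation: the final reshape/concat above flattens each dense graph into a single vector of length vertexes*m_dim + vertexes*vertexes*b_dim, node annotations first and the one-hot adjacency second. A small standalone sketch of that step with made-up sizes (the real values come from the dataset):

    import torch

    # Toy stand-ins for vertexes, m_dim (atom types) and b_dim (bond types).
    batch_size, vertexes, m_dim, b_dim = 2, 9, 5, 4

    x_tensor = torch.rand(batch_size, vertexes, m_dim)            # node annotations
    a_tensor = torch.rand(batch_size, vertexes, vertexes, b_dim)  # one-hot-style adjacency

    x_vec = x_tensor.reshape(batch_size, -1)                      # (2, 45)
    a_vec = a_tensor.reshape(batch_size, -1)                      # (2, 324)
    real_graphs = torch.concat((x_vec, a_vec), dim=-1)

    print(real_graphs.shape)  # torch.Size([2, 369])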
utils.py ADDED
@@ -0,0 +1,462 @@
1
+ from statistics import mean
2
+ from rdkit import DataStructs
3
+ from rdkit import Chem
4
+ from rdkit.Chem import AllChem
5
+ from rdkit.Chem import Draw
6
+ import os
7
+ import numpy as np
8
+ import seaborn as sns
9
+ import matplotlib.pyplot as plt
10
+ from matplotlib.lines import Line2D
11
+ from rdkit import RDLogger
12
+ import torch
13
+ from rdkit.Chem.Scaffolds import MurckoScaffold
14
+ import math
15
+ import time
16
+ import datetime
17
+ import re
18
+ RDLogger.DisableLog('rdApp.*')
19
+ import warnings
20
+ from multiprocessing import Pool
21
+ class Metrics(object):
22
+
23
+ @staticmethod
24
+ def valid(x):
25
+ return x is not None and Chem.MolToSmiles(x) != ''
26
+
27
+ @staticmethod
28
+ def tanimoto_sim_1v2(data1, data2):
29
+ min_len = data1.size if data1.size < data2.size else data2.size
30
+ sims = []
31
+ for i in range(min_len):
32
+ sim = DataStructs.FingerprintSimilarity(data1[i], data2[i])
33
+ sims.append(sim)
34
+ mean_sim = mean(sims)
35
+ return mean_sim
36
+
37
+ @staticmethod
38
+ def mol_length(x):
39
+ if x is not None:
40
+ return len([char for char in max(Chem.MolToSmiles(x).split(sep =".")).upper() if char.isalpha()])
41
+ else:
42
+ return 0
43
+
44
+ @staticmethod
45
+ def max_component(data, max_len):
46
+
47
+ return (np.array(list(map(Metrics.mol_length, data)), dtype=np.float32)/max_len).mean()
48
+
49
+ def sim_reward(mol_gen, fps_r):
50
+
51
+ gen_scaf = []
52
+
53
+ for x in mol_gen:
54
+ if x is not None:
55
+ try:
56
+
57
+ gen_scaf.append(MurckoScaffold.GetScaffoldForMol(x))
58
+ except:
59
+ pass
60
+
61
+ if len(gen_scaf) == 0:
62
+
63
+ rew = 1
64
+ else:
65
+ fps = [Chem.RDKFingerprint(x) for x in gen_scaf]
66
+
67
+
68
+ fps = np.array(fps)
69
+ fps_r = np.array(fps_r)
70
+
71
+ rew = average_agg_tanimoto(fps_r, fps)
72
+ if math.isnan(rew):
73
+ rew = 1
74
+
75
+ return rew ## change this to penalty
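sim_reward scores a generated batch by the scaffold similarity of its molecules to the reference fingerprints fps_r, falling back to 1 when nothing valid was generated. A hedged usage sketch, assuming this module is importable as utils and using placeholder SMILES rather than real reference inhibitors:

    from rdkit import Chem
    from rdkit.Chem.Scaffolds import MurckoScaffold
    from utils import sim_reward

    # Reference scaffold fingerprints, built the same way inference() builds fps_r.
    ref_mols = [Chem.MolFromSmiles(s) for s in ["CC(=O)Oc1ccccc1C(=O)O", "c1ccc2[nH]ccc2c1"]]
    fps_r = [Chem.RDKFingerprint(MurckoScaffold.GetScaffoldForMol(m)) for m in ref_mols]

    # A toy "generated" batch of RDKit mols; None entries are skipped inside sim_reward.
    generated = [Chem.MolFromSmiles(s) for s in ["c1ccccc1CCN", "c1ccncc1C"]] + [None]

    print(sim_reward(generated, fps_r))  # scalar scaffold-similarity score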
76
+
77
+ ##########################################
78
+ ##########################################
79
+ ##########################################
80
+
81
+ def mols2grid_image(mols,path):
82
+ mols = [e if e is not None else Chem.RWMol() for e in mols]
83
+
84
+ for i in range(len(mols)):
85
+ if Metrics.valid(mols[i]):
86
+ #if Chem.MolToSmiles(mols[i]) != '':
87
+ AllChem.Compute2DCoords(mols[i])
88
+ Draw.MolToFile(mols[i], os.path.join(path,"{}.png".format(i+1)), size=(1200,1200))
89
+ else:
90
+ continue
91
+
92
+ def save_smiles_matrices(mols,edges_hard, nodes_hard,path,data_source = None):
93
+ mols = [e if e is not None else Chem.RWMol() for e in mols]
94
+
95
+ for i in range(len(mols)):
96
+ if Metrics.valid(mols[i]):
97
+ #m0= all_scores_for_print(mols[i], data_source, norm=False)
98
+ #if Chem.MolToSmiles(mols[i]) != '':
99
+ save_path = os.path.join(path,"{}.txt".format(i+1))
100
+ with open(save_path, "a") as f:
101
+ np.savetxt(f, edges_hard[i].cpu().numpy(), header="edge matrix:\n",fmt='%1.2f')
102
+ f.write("\n")
103
+ np.savetxt(f, nodes_hard[i].cpu().numpy(), header="node matrix:\n", footer="\nsmiles:",fmt='%1.2f')
104
+ f.write("\n")
105
+ #f.write(m0)
106
+ f.write("\n")
107
+
108
+
109
+ print(Chem.MolToSmiles(mols[i]), file=open(save_path,"a"))
110
+ else:
111
+ continue
112
+
113
+ ##########################################
114
+ ##########################################
115
+ ##########################################
116
+
117
+ def dense_to_sparse_with_attr(adj):
118
+ ###
119
+ assert adj.dim() >= 2 and adj.dim() <= 3
120
+ assert adj.size(-1) == adj.size(-2)
121
+
122
+ index = adj.nonzero(as_tuple=True)
123
+ edge_attr = adj[index]
124
+
125
+ if len(index) == 3:
126
+ batch = index[0] * adj.size(-1)
127
+ index = (batch + index[1], batch + index[2])
128
+ #index = torch.stack(index, dim=0)
129
+ return index, edge_attr
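dense_to_sparse_with_attr returns the nonzero indices of a (batched) dense adjacency together with their attribute values, shifting node indices by batch_id * num_nodes so graphs in a batch do not collide. A quick sketch using the function as defined above:

    import torch

    # Two 3-node toy graphs with integer bond labels (0 = no bond).
    adj = torch.tensor([
        [[0, 1, 0],
         [1, 0, 2],
         [0, 2, 0]],
        [[0, 3, 0],
         [3, 0, 0],
         [0, 0, 0]],
    ])

    index, edge_attr = dense_to_sparse_with_attr(adj)
    print(index)      # row/col tensors using node ids 0..2 for graph 0 and 3..5 for graph 1
    print(edge_attr)  # tensor([1, 1, 2, 2, 3, 3])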
130
+
131
+
132
+ def label2onehot(labels, dim, device):
133
+
134
+ """Convert label indices to one-hot vectors."""
135
+
136
+ out = torch.zeros(list(labels.size())+[dim]).to(device)
137
+ out.scatter_(len(out.size())-1,labels.unsqueeze(-1),1.)
138
+
139
+ return out.float()
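label2onehot appends a one-hot dimension to an integer label tensor; this is how the dense bond-type matrices become (vertexes, vertexes, b_dim) tensors in load_data. A quick sketch using the function as defined above:

    import torch

    bond_labels = torch.tensor([[0, 1], [2, 0]])  # bond types for a 2-node toy graph
    onehot = label2onehot(bond_labels, dim=3, device="cpu")

    print(onehot.shape)  # torch.Size([2, 2, 3])
    print(onehot[0, 1])  # tensor([0., 1., 0.]) -> bond type 1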
140
+
141
+
142
+ def sample_z_node(batch_size, vertexes, nodes):
143
+
144
+ ''' Random noise for nodes logits. '''
145
+
146
+ return np.random.normal(0,1, size=(batch_size,vertexes, nodes)) # 128, 9, 5
147
+
148
+
149
+ def sample_z_edge(batch_size, vertexes, edges):
150
+
151
+ ''' Random noise for edges logits. '''
152
+
153
+ return np.random.normal(0,1, size=(batch_size, vertexes, vertexes, edges)) # 128, 9, 9, 5
154
+
155
+ def sample_z( batch_size, z_dim):
156
+
157
+ ''' Random noise. '''
158
+
159
+ return np.random.normal(0,1, size=(batch_size,z_dim)) # (batch_size, z_dim)
160
+
161
+
162
+ def mol_sample(sample_directory, model_name, mol, edges, nodes, idx, i):
163
+ sample_path = os.path.join(sample_directory,"{}-{}_{}-epoch_iteration".format(model_name,idx+1, i+1))
164
+
165
+ if not os.path.exists(sample_path):
166
+ os.makedirs(sample_path)
167
+
168
+ mols2grid_image(mol,sample_path)
169
+
170
+ save_smiles_matrices(mol,edges.detach(), nodes.detach(), sample_path)
171
+
172
+ if len(os.listdir(sample_path)) == 0:
173
+ os.rmdir(sample_path)
174
+
175
+ print("Valid molecules are saved.")
176
+ print("Valid matrices and SMILES are saved.")
177
+
178
+
179
+
180
+
181
+
182
+ def logging(log_path, start_time, mols, train_smiles, i,idx, loss,model_num, save_path):
183
+
184
+ gen_smiles = []
185
+ for line in mols:
186
+ if line is not None:
187
+ gen_smiles.append(Chem.MolToSmiles(line))
188
+ elif line is None:
189
+ gen_smiles.append(None)
190
+
191
+ #gen_smiles_saves = [None if x is None else re.sub('\*', '', x) for x in gen_smiles]
192
+ #gen_smiles_saves = [None if x is None else re.sub('\.', '', x) for x in gen_smiles_saves]
193
+ gen_smiles_saves = [None if x is None else max(x.split('.'), key=len) for x in gen_smiles]
194
+
195
+ sample_save_dir = os.path.join(save_path, "samples-GAN{}.txt".format(model_num))
196
+ with open(sample_save_dir, "a") as f:
197
+ for idxs in range(len(gen_smiles_saves)):
198
+ if gen_smiles_saves[idxs] is not None:
199
+
200
+ f.write(gen_smiles_saves[idxs])
201
+ f.write("\n")
202
+
203
+ k = len(set(gen_smiles_saves) - {None})
204
+
205
+
206
+ et = time.time() - start_time
207
+ et = str(datetime.timedelta(seconds=et))[:-7]
208
+ log = "Elapsed [{}], Epoch/Iteration [{}/{}] for GAN{}".format(et, idx, i+1, model_num)
209
+
210
+ # Log update
211
+ #m0 = get_all_metrics(gen = gen_smiles, train = train_smiles, batch_size=batch_size, k = valid_mol_num, device=self.device)
212
+ valid = fraction_valid(gen_smiles_saves)
213
+ unique = fraction_unique(gen_smiles_saves, k, check_validity=False)
214
+ novel = novelty(gen_smiles_saves, train_smiles)
215
+
216
+ #qed = [QED(mol) for mol in mols if mol is not None]
217
+ #sa = [SA(mol) for mol in mols if mol is not None]
218
+ #logp = [logP(mol) for mol in mols if mol is not None]
219
+
220
+ #IntDiv = internal_diversity(gen_smiles)
221
+ #m0= all_scores_val(fake_mol, mols, full_mols, full_smiles, vert, norm=True) # 'mols' is output of Fake Reward
222
+ #m1 =all_scores_chem(fake_mol, mols, vert, norm=True)
223
+ #m0.update(m1)
224
+
225
+ #maxlen = MolecularMetrics.max_component(mols, 45)
226
+
227
+ #m0 = {k: np.array(v).mean() for k, v in m0.items()}
228
+ #loss.update(m0)
229
+ loss.update({'Valid': valid})
230
+ loss.update({'Unique@{}'.format(k): unique})
231
+ loss.update({'Novel': novel})
232
+ #loss.update({'QED': statistics.mean(qed)})
233
+ #loss.update({'SA': statistics.mean(sa)})
234
+ #loss.update({'LogP': statistics.mean(logp)})
235
+ #loss.update({'IntDiv': IntDiv})
236
+
237
+ #wandb.log({"maxlen": maxlen})
238
+
239
+ for tag, value in loss.items():
240
+
241
+ log += ", {}: {:.4f}".format(tag, value)
242
+ with open(log_path, "a") as f:
243
+ f.write(log)
244
+ f.write("\n")
245
+ print(log)
246
+ print("\n")
247
+
248
+
249
+
250
+ def plot_attn(dataset_name, heads,attn_w, model, iter, epoch):
251
+
252
+ cols = 4
253
+ rows = int(heads/cols)
254
+
255
+ fig, axes = plt.subplots( rows,cols, figsize = (30, 14))
256
+ axes = axes.flat
257
+ attentions_pos = attn_w[0]
258
+ attentions_pos = attentions_pos.cpu().detach().numpy()
259
+ for i,att in enumerate(attentions_pos):
260
+
261
+ #im = axes[i].imshow(att, cmap='gray')
262
+ sns.heatmap(att,vmin = 0, vmax = 1,ax = axes[i])
263
+ axes[i].set_title(f'head - {i} ')
264
+ axes[i].set_ylabel('layers')
265
+ pltsavedir = "/home/atabey/attn/second"
266
+ plt.savefig(os.path.join(pltsavedir, "attn" + model + "_" + dataset_name + "_" + str(iter) + "_" + str(epoch) + ".png"), dpi= 500,bbox_inches='tight')
267
+
268
+
269
+ def plot_grad_flow(named_parameters, model, iter, epoch):
270
+
271
+ # Based on https://discuss.pytorch.org/t/check-gradient-flow-in-network/15063/10
272
+ '''Plots the gradients flowing through different layers in the net during training.
273
+ Can be used for checking for possible gradient vanishing / exploding problems.
274
+
275
+ Usage: Plug this function in Trainer class after loss.backwards() as
276
+ "plot_grad_flow(self.model.named_parameters())" to visualize the gradient flow'''
277
+ ave_grads = []
278
+ max_grads= []
279
+ layers = []
280
+ for n, p in named_parameters:
281
+ if(p.requires_grad) and ("bias" not in n):
282
+ print(p.grad,n)
283
+ layers.append(n)
284
+ ave_grads.append(p.grad.abs().mean().cpu())
285
+ max_grads.append(p.grad.abs().max().cpu())
286
+ plt.bar(np.arange(len(max_grads)), max_grads, alpha=0.1, lw=1, color="c")
287
+ plt.bar(np.arange(len(max_grads)), ave_grads, alpha=0.1, lw=1, color="b")
288
+ plt.hlines(0, 0, len(ave_grads)+1, lw=2, color="k" )
289
+ plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
290
+ plt.xlim(left=0, right=len(ave_grads))
291
+ plt.ylim(bottom = -0.001, top=1) # zoom in on the lower gradient regions
292
+ plt.xlabel("Layers")
293
+ plt.ylabel("average gradient")
294
+ plt.title("Gradient flow")
295
+ plt.grid(True)
296
+ plt.legend([Line2D([0], [0], color="c", lw=4),
297
+ Line2D([0], [0], color="b", lw=4),
298
+ Line2D([0], [0], color="k", lw=4)], ['max-gradient', 'mean-gradient', 'zero-gradient'])
299
+ pltsavedir = "/home/atabey/gradients/tryout"
300
+ plt.savefig(os.path.join(pltsavedir, "weights_" + model + "_" + str(iter) + "_" + str(epoch) + ".png"), dpi= 500,bbox_inches='tight')
301
+
302
+ """
303
+ def _genDegree():
304
+
305
+ ''' Generates the Degree distribution tensor for PNA, should be used everytime a different
306
+ dataset is used.
307
+ Can be called without arguments and saves the tensor for later use. If tensor was created
308
+ before, it just loads the degree tensor.
309
+ '''
310
+
311
+ degree_path = os.path.join(self.degree_dir, self.dataset_name + '-degree.pt')
312
+ if not os.path.exists(degree_path):
313
+
314
+
315
+ max_degree = -1
316
+ for data in self.dataset:
317
+ d = geoutils.degree(data.edge_index[1], num_nodes=data.num_nodes, dtype=torch.long)
318
+ max_degree = max(max_degree, int(d.max()))
319
+
320
+ # Compute the in-degree histogram tensor
321
+ deg = torch.zeros(max_degree + 1, dtype=torch.long)
322
+ for data in self.dataset:
323
+ d = geoutils.degree(data.edge_index[1], num_nodes=data.num_nodes, dtype=torch.long)
324
+ deg += torch.bincount(d, minlength=deg.numel())
325
+ torch.save(deg, 'DrugGEN/data/' + self.dataset_name + '-degree.pt')
326
+ else:
327
+ deg = torch.load(degree_path, map_location=lambda storage, loc: storage)
328
+
329
+ return deg
330
+ """
331
+ def get_mol(smiles_or_mol):
332
+ '''
333
+ Loads SMILES/molecule into RDKit's object
334
+ '''
335
+ if isinstance(smiles_or_mol, str):
336
+ if len(smiles_or_mol) == 0:
337
+ return None
338
+ mol = Chem.MolFromSmiles(smiles_or_mol)
339
+ if mol is None:
340
+ return None
341
+ try:
342
+ Chem.SanitizeMol(mol)
343
+ except ValueError:
344
+ return None
345
+ return mol
346
+ return smiles_or_mol
347
+
348
+ def mapper(n_jobs):
349
+ '''
350
+ Returns function for map call.
351
+ If n_jobs == 1, will use standard map
352
+ If n_jobs > 1, will use multiprocessing pool
353
+ If n_jobs is a pool object, will return its map function
354
+ '''
355
+ if n_jobs == 1:
356
+ def _mapper(*args, **kwargs):
357
+ return list(map(*args, **kwargs))
358
+
359
+ return _mapper
360
+ if isinstance(n_jobs, int):
361
+ pool = Pool(n_jobs)
362
+
363
+ def _mapper(*args, **kwargs):
364
+ try:
365
+ result = pool.map(*args, **kwargs)
366
+ finally:
367
+ pool.terminate()
368
+ return result
369
+
370
+ return _mapper
371
+ return n_jobs.map
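mapper hides whether work is done with the built-in map, a temporary multiprocessing pool, or a caller-supplied pool, which is why the metric helpers below accept a plain n_jobs integer. A short sketch of the single-process path, assuming mapper and get_mol are in scope:

    smiles = ["CCO", "c1ccccc1", "not_a_smiles"]
    mols = mapper(1)(get_mol, smiles)
    print([m is not None for m in mols])  # [True, True, False]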
372
+ def remove_invalid(gen, canonize=True, n_jobs=1):
373
+ """
374
+ Removes invalid molecules from the dataset
375
+ """
376
+ if not canonize:
377
+ mols = mapper(n_jobs)(get_mol, gen)
378
+ return [gen_ for gen_, mol in zip(gen, mols) if mol is not None]
379
+ return [x for x in mapper(n_jobs)(canonic_smiles, gen) if
380
+ x is not None]
381
+ def fraction_valid(gen, n_jobs=1):
382
+ """
383
+ Computes the fraction of valid molecules
384
+ Parameters:
385
+ gen: list of SMILES
386
+ n_jobs: number of threads for calculation
387
+ """
388
+ gen = mapper(n_jobs)(get_mol, gen)
389
+ return 1 - gen.count(None) / len(gen)
390
+ def canonic_smiles(smiles_or_mol):
391
+ mol = get_mol(smiles_or_mol)
392
+ if mol is None:
393
+ return None
394
+ return Chem.MolToSmiles(mol)
395
+ def fraction_unique(gen, k=None, n_jobs=1, check_validity=True):
396
+ """
397
+ Computes the fraction of unique molecules
398
+ Parameters:
399
+ gen: list of SMILES
400
+ k: compute unique@k
401
+ n_jobs: number of threads for calculation
402
+ check_validity: raises ValueError if invalid molecules are present
403
+ """
404
+ if k is not None:
405
+ if len(gen) < k:
406
+ warnings.warn(
407
+ "Can't compute unique@{}.".format(k) +
408
+ " gen contains only {} molecules".format(len(gen))
409
+ )
410
+ gen = gen[:k]
411
+ canonic = set(mapper(n_jobs)(canonic_smiles, gen))
412
+ if None in canonic and check_validity:
413
+ raise ValueError("Invalid molecule passed to unique@k")
414
+ return 0 if len(gen) == 0 else len(canonic) / len(gen)
415
+
416
+ def novelty(gen, train, n_jobs=1):
417
+ gen_smiles = mapper(n_jobs)(canonic_smiles, gen)
418
+ gen_smiles_set = set(gen_smiles) - {None}
419
+ train_set = set(train)
420
+ return 0 if len(gen_smiles_set) == 0 else len(gen_smiles_set - train_set) / len(gen_smiles_set)
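Taken together, these helpers mirror the MOSES definitions: validity is the fraction of parseable SMILES, uniqueness the fraction of distinct canonical SMILES, and novelty the fraction of unique generated molecules not present in the training set. A toy check with hand-worked values, assuming the functions above are in scope:

    gen = ["CCO", "CCO", "c1ccccc1", "not_a_smiles"]
    train = ["CCO"]

    valid_gen = remove_invalid(gen)      # drops "not_a_smiles"
    print(fraction_valid(gen))           # 3/4 = 0.75
    print(fraction_unique(valid_gen))    # 2 canonical SMILES out of 3 -> 0.67
    print(novelty(valid_gen, train))     # only c1ccccc1 is new -> 0.5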
421
+
422
+
423
+
424
+ def average_agg_tanimoto(stock_vecs, gen_vecs,
425
+ batch_size=5000, agg='max',
426
+ device='cpu', p=1):
427
+ """
428
+ For each molecule in gen_vecs finds closest molecule in stock_vecs.
429
+ Returns average tanimoto score for between these molecules
430
+
431
+ Parameters:
432
+ stock_vecs: numpy array <n_vectors x dim>
433
+ gen_vecs: numpy array <n_vectors' x dim>
434
+ agg: max or mean
435
+ p: power for averaging: (mean x^p)^(1/p)
436
+ """
437
+ assert agg in ['max', 'mean'], "Can aggregate only max or mean"
438
+ agg_tanimoto = np.zeros(len(gen_vecs))
439
+ total = np.zeros(len(gen_vecs))
440
+ for j in range(0, stock_vecs.shape[0], batch_size):
441
+ x_stock = torch.tensor(stock_vecs[j:j + batch_size]).to(device).float()
442
+ for i in range(0, gen_vecs.shape[0], batch_size):
443
+
444
+ y_gen = torch.tensor(gen_vecs[i:i + batch_size]).to(device).float()
445
+ y_gen = y_gen.transpose(0, 1)
446
+ tp = torch.mm(x_stock, y_gen)
447
+ jac = (tp / (x_stock.sum(1, keepdim=True) +
448
+ y_gen.sum(0, keepdim=True) - tp)).cpu().numpy()
449
+ jac[np.isnan(jac)] = 1
450
+ if p != 1:
451
+ jac = jac**p
452
+ if agg == 'max':
453
+ agg_tanimoto[i:i + y_gen.shape[1]] = np.maximum(
454
+ agg_tanimoto[i:i + y_gen.shape[1]], jac.max(0))
455
+ elif agg == 'mean':
456
+ agg_tanimoto[i:i + y_gen.shape[1]] += jac.sum(0)
457
+ total[i:i + y_gen.shape[1]] += jac.shape[0]
458
+ if agg == 'mean':
459
+ agg_tanimoto /= total
460
+ if p != 1:
461
+ agg_tanimoto = (agg_tanimoto)**(1/p)
462
+ return np.mean(agg_tanimoto)
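average_agg_tanimoto treats each row as a binary fingerprint and, with agg='max', reports the mean over generated fingerprints of their best Tanimoto match in the reference set; sim_reward above relies on exactly this. A minimal sketch with tiny hand-made fingerprints:

    import numpy as np

    stock = np.array([[1, 1, 0, 0],
                      [0, 0, 1, 1]])   # reference fingerprints
    gen = np.array([[1, 1, 0, 0],      # identical to stock[0] -> Tanimoto 1.0
                    [1, 0, 0, 1]])     # best match against either row is 1/3

    print(average_agg_tanimoto(stock, gen, agg='max'))  # (1.0 + 1/3) / 2 = 0.67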