DeepSEQreen_fast_build

Running on CPU Upgrade

File size: 10,109 Bytes

c0ec7e6

import collections
from importlib import resources
import os
import re
from typing import Optional, List

import numpy as np
from transformers import BertTokenizer

SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""
# Regex used to split a SMILES string into tokens. The whole pattern is a
# single capturing group; its alternatives are tried from left to right:
#   \[[^\]]+]                        anything inside square brackets (brackets included)
#   Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p   element symbols ("Br?" matches B or Br, "Cl?" matches C or Cl)
#   \(|\)                            parentheses
#   \.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}
#                                    various symbols (">>?" matches > or >>; "%" needs two digits after it)
#   [0-9]                            a single digit


def sequence_to_kmers(sequence, k=3):
    """Split a string into its overlapping substrings of length ``k``.

    A window of width ``k`` is slid over ``sequence`` one character at a
    time, so consecutive k-mers overlap by ``k - 1`` characters.

    Parameters:
        sequence (string)
        k (int), default 3
    Returns:
        List of k-mer strings in order of appearance. The list is empty
        when ``sequence`` is shorter than ``k``.
    """
    window_count = len(sequence) - k + 1
    kmers = []
    for start in range(window_count):
        kmers.append(sequence[start:start + k])
    return kmers


def sequence_to_word_embedding(sequence, model, dim=100):
    """Embed a sequence as a matrix with one word vector per k-mer.

    The sequence is split into overlapping k-mers with
    ``sequence_to_kmers`` (using its default ``k``), and every k-mer is
    looked up in ``model.wv``.

    Parameters:
        sequence (string)
        model: object whose ``wv`` attribute is indexable by a k-mer string
            and raises ``KeyError`` for unknown k-mers (presumably a trained
            gensim Word2Vec model -- confirm against the caller).
        dim (int), default 100: length of each word vector; must match the
            vectors stored in ``model.wv``.
    Returns:
        numpy array of shape (num_kmers, dim). Rows of k-mers that are not
        in the model stay all-zero.
    """
    kmers = sequence_to_kmers(sequence)
    vec = np.zeros((len(kmers), dim))
    for row, word in enumerate(kmers):
        try:
            vec[row] = model.wv[word]
        except KeyError:
            # Out-of-vocabulary k-mer: deliberately keep the zero row
            # instead of failing the whole sequence.
            pass
    return vec


def sequence_to_token_ids(sequence, tokenizer):
    """Encode ``sequence`` with ``tokenizer`` and return the ids as an array.

    Parameters:
        sequence: input passed unchanged to ``tokenizer.encode``.
        tokenizer: any object exposing an ``encode(sequence)`` method.
    Returns:
        numpy array built from whatever ``tokenizer.encode`` returns; no
        padding or truncation is applied here.
    """
    return np.array(tokenizer.encode(sequence))


# def sequence_to_token_ids(sequence, tokenizer, max_length: int):
#     token_ids = tokenizer.encode(sequence)
#     length = min(max_length, len(token_ids))
#
#     token_ids_padded = np.zeros(max_length, dtype='int')
#     token_ids_padded[:length] = token_ids[:length]
#
#     return token_ids_padded


class SmilesTokenizer(BertTokenizer):
    """
    Adapted from https://github.com/deepchem/deepchem/.

    Creates the SmilesTokenizer class. The tokenizer heavily inherits from the BertTokenizer
    implementation found in Huggingface's transformers library. It splits SMILES strings into
    tokens with the SMILES tokenization regex developed by Schwaller et al. and maps every
    token to an id through a one-token-per-line vocabulary file.

    Please see https://github.com/huggingface/transformers
    and https://github.com/rxn4chemistry/rxnfp for more details.

    Examples
    --------
    >>> tokenizer = SmilesTokenizer(vocab_path, regex_pattern)
    >>> print(tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O"))
    [12, 16, 16, 17, 22, 19, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 16, 17, 22, 19, 18, 19, 13]


    References
    ----------
    .. [1] Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H; Kreutter, David;
        Laino, Teodoro; et al. (2019): Mapping the Space of Chemical Reactions using Attention-Based Neural
        Networks. ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3

    Note
    ----
    This class requires huggingface's transformers and tokenizers libraries to be installed.
    """

    def __init__(
            self,
            vocab_file: str = 'resources/vocabs/smiles.txt',
            regex_pattern: str = SMI_REGEX_PATTERN,
            **kwargs):
        """Constructs a SmilesTokenizer.

        Parameters
        ----------
        vocab_file: str
            Path to a vocabulary file with one SMILES token per line.
            Defaults to 'resources/vocabs/smiles.txt'; a relative path is
            resolved against the current working directory.
        regex_pattern: str
            Regular expression used to split a SMILES string into tokens.
            Defaults to ``SMI_REGEX_PATTERN``.
        **kwargs
            Forwarded unchanged to the ``BertTokenizer`` constructor.

        Raises
        ------
        ValueError
            If ``vocab_file`` is not an existing file.
        """

        # Validate the path before the parent constructor touches it, so
        # that a missing file is reported with this class's own message.
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocab file at path '{}'.".format(vocab_file))

        super().__init__(vocab_file, **kwargs)

        # Replace the vocabulary and the basic tokenizer set up by the parent
        # with the SMILES-specific ones.
        self.vocab = load_vocab(vocab_file)
        unused_indexes = [
            i for i, token in enumerate(self.vocab)
            if token.startswith("[unused")
        ]
        # 0 when the vocabulary contains no "[unused..." placeholder at all.
        self.highest_unused_index = max(unused_indexes, default=0)
        self.ids_to_tokens = collections.OrderedDict(
            (ids, tok) for tok, ids in self.vocab.items())
        self.basic_tokenizer = BasicSmilesTokenizer(regex_pattern=regex_pattern)

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary."""
        return len(self.vocab)

    @property
    def vocab_list(self):
        """Vocabulary tokens as a list, in vocabulary-file order."""
        return list(self.vocab)

    def _tokenize(self, text: str, max_seq_length: int = 512, **kwargs):
        """Tokenize a string into a list of tokens.

        Parameters
        ----------
        text: str
            Input string sequence to be tokenized.
        max_seq_length: int, default 512
            Maximum sequence length. Two positions are held back (the
            [CLS] X [SEP] format used by this class adds two tokens), so at
            most ``max_seq_length - 2`` tokens are returned.

        Returns
        -------
        List[str]
            The leading tokens of ``text``, truncated to the allowed length.
        """

        # Clamp at 0: a negative bound would be read as "drop tokens from
        # the end" by the slice instead of "no room for tokens".
        max_len_single_sentence = max(max_seq_length - 2, 0)
        return self.basic_tokenizer.tokenize(text)[:max_len_single_sentence]

    def _convert_token_to_id(self, token: str):
        """Converts a token (str/unicode) in an id using the vocab.

        Tokens missing from the vocabulary map to the id of ``self.unk_token``
        (``None`` if that token is not in the vocabulary either).

        Parameters
        ----------
        token: str
            String token from a larger sequence to be converted to a numerical id.
        """

        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index: int):
        """Converts an index (integer) in a token (string/unicode) using the vocab.

        Unknown indexes map to ``self.unk_token``.

        Parameters
        ----------
        index: int
            Integer index to be converted back to a string-based token as part of a larger sequence.
        """

        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]):
        """Converts a sequence of tokens (string) in a single string.

        Tokens are joined with single spaces; a " ##" continuation marker is
        removed so the marked token is glued to the previous one.

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.

        Returns
        -------
        out_string: str
            Single string from combined tokens.
        """

        out_string: str = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def add_special_tokens_ids_single_sequence(self,
                                               token_ids: List[Optional[int]]):
        """Adds special tokens to a sequence for sequence classification tasks.

        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------
        token_ids: list[int]
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.

        Returns
        -------
        list
            New list ``[cls_token_id] + token_ids + [sep_token_id]``.
        """

        return [self.cls_token_id] + token_ids + [self.sep_token_id]

    def add_special_tokens_single_sequence(self, tokens: List[str]):
        """Adds special tokens to the a sequence for sequence classification tasks.
        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.

        Returns
        -------
        list
            New list ``[cls_token] + tokens + [sep_token]``.
        """
        return [self.cls_token] + tokens + [self.sep_token]

    def add_special_tokens_ids_sequence_pair(
            self, token_ids_0: List[Optional[int]],
            token_ids_1: List[Optional[int]]) -> List[Optional[int]]:
        """Adds special tokens to a sequence pair for sequence classification tasks.
        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]

        Parameters
        ----------
        token_ids_0: List[int]
            List of ids for the first string sequence in the sequence pair (A).
        token_ids_1: List[int]
            List of ids for the second string sequence in the sequence pair (B).
        """

        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        return cls + token_ids_0 + sep + token_ids_1 + sep

    def add_padding_tokens(self,
                           token_ids: List[Optional[int]],
                           length: int,
                           right: bool = True) -> List[Optional[int]]:
        """Adds padding tokens to return a sequence of the requested length.
        By default padding tokens are added to the right of the sequence.

        Parameters
        ----------
        token_ids: list[optional[int]]
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
        length: int
            Target length. When ``token_ids`` is already at least this long,
            no padding is added and the ids are returned untruncated.
        right: bool, default True
            Pad on the right when True, on the left otherwise.

        Returns
        -------
        List[int]
        """
        # A non-positive repeat count yields an empty padding list.
        padding = [self.pad_token_id] * (length - len(token_ids))

        if right:
            return token_ids + padding
        else:
            return padding + token_ids


class BasicSmilesTokenizer(object):
    """
    Adapted from https://github.com/deepchem/deepchem/.
    Regex-only SMILES tokenizer, using a pattern developed by Schwaller et. al.
    Use it when SMILES tokenization is needed without depending on the
    transformers library by HuggingFace.

    Examples
    --------
    >>> tokenizer = BasicSmilesTokenizer()
    >>> print(tokenizer.tokenize("CC(=O)OC1=CC=CC=C1C(=O)O"))
    ['C', 'C', '(', '=', 'O', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1', 'C', '(', '=', 'O', ')', 'O']


    References
    ----------
    .. [1] Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
        Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
        ACS Central Science 2019 5 (9), 1572-1583 DOI: 10.1021/acscentsci.9b00576
    """

    def __init__(self, regex_pattern: str = SMI_REGEX_PATTERN):
        """Constructs a BasicSMILESTokenizer.

        Parameters
        ----------
        regex_pattern: string
            SMILES token regex; it is compiled once here and reused by
            every ``tokenize`` call.
        """
        self.regex_pattern = regex_pattern
        self.regex = re.compile(regex_pattern)

    def tokenize(self, text):
        """Basic Tokenization of a SMILES.

        Returns every non-overlapping match of the compiled pattern in
        ``text``, in order; text the pattern does not match is skipped.
        """
        return self.regex.findall(text)


def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file into an ordered mapping.

    Each line, with its trailing newline removed, becomes a key whose value
    is the zero-based line number. If a token occurs more than once, the
    index of its last occurrence is kept.
    """
    with open(vocab_file, "r", encoding="utf-8") as handle:
        return collections.OrderedDict(
            (line.rstrip("\n"), position)
            for position, line in enumerate(handle)
        )