Spaces:

saicharan2804
/

AtomwiseTokenizer

Sleeping

File size: 2,359 Bytes

5a4be66
1cd9d39
5a4be66
 
 
 
1cd9d39
5a4be66
 
 
 
 
 
1cd9d39
 
5a4be66
1cd9d39
 
 
5a4be66
1cd9d39
5a4be66

def atomwise_tokenizer(smi, exclusive_tokens=None):
    """
    Tokenize a SMILES string at atom level and assign an integer ID to each token.

    Tokenization rules:
    - 'Br' and 'Cl' are kept as single two-character tokens.
    - Anything enclosed in square brackets (e.g. '[C@@H]', '[nH]') is one token.
    - Ring-closure labels of the form '%NN' are one token; single digits are
      separate tokens.
    - Characters that do not match any rule are silently skipped.

    Parameters:
    - smi (str): SMILES string to tokenize.
    - exclusive_tokens (collection of str, optional): Bracketed symbols to keep.
      When given and non-empty, every bracketed token not in it is replaced by
      '[UNK]'. When None or empty, no replacement happens.

    Returns:
    - tuple: (tokens, token_ids). `tokens` is the list of atom-level tokens.
      `token_ids` has the same length; each ID is the index of the token's first
      appearance among the distinct tokens of this call (0, 1, 2, ...), so IDs
      are only meaningful within a single call, not across calls.
    """
    import re

    # Raw string so the regex escapes reach `re` untouched; the non-raw form
    # relied on invalid string escapes, which newer Python versions warn about.
    pattern = (
        r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|/"
        r"|:|~|@|\?|>|\*|\$|%[0-9]{2}|[0-9])"
    )
    # With a single capturing group, findall already returns a list of strings.
    tokens = re.compile(pattern).findall(smi)

    # Replace bracketed tokens that are not explicitly allowed with '[UNK]'.
    if exclusive_tokens:
        allowed = frozenset(exclusive_tokens)  # one-time build, O(1) lookups
        tokens = [
            '[UNK]' if tok.startswith('[') and tok not in allowed else tok
            for tok in tokens
        ]

    # IDs follow the order in which distinct tokens first appear.
    token_to_id = {}
    token_ids = []
    for tok in tokens:
        # setdefault's default is evaluated before insertion, so a new token
        # receives the current dictionary size as its ID.
        token_ids.append(token_to_id.setdefault(tok, len(token_to_id)))

    return tokens, token_ids



# def atomwise_tokenizer(smi, exclusive_tokens = None):
#     """
#     Tokenize a SMILES molecule at atom-level:
#         (1) 'Br' and 'Cl' are two-character tokens
#         (2) Symbols with bracket are considered as tokens

#     exclusive_tokens: A list of specific bracketed symbols you want to keep, e.g., ['[C@@H]', '[nH]'].
#     Other bracketed symbols will be replaced by '[UNK]'. Default is `None`.
#     """
#     import re
#     pattern =  "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
#     regex = re.compile(pattern)
#     tokens = [token for token in regex.findall(smi)]

#     if exclusive_tokens:
#         for i, tok in enumerate(tokens):
#             if tok.startswith('['):
#                 if tok not in exclusive_tokens:
#                     tokens[i] = '[UNK]'
#     return tokens