Spaces:
Sleeping
Sleeping
File size: 2,359 Bytes
5a4be66 1cd9d39 5a4be66 1cd9d39 5a4be66 1cd9d39 5a4be66 1cd9d39 5a4be66 1cd9d39 5a4be66 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
def atomwise_tokenizer(smi, exclusive_tokens=None):
    """
    Tokenize a SMILES string at atom level and assign each token an integer ID.

    Tokenization rules:
    - 'Br' and 'Cl' are kept as single two-character tokens.
    - Anything enclosed in square brackets (e.g. '[C@@H]') is a single token.
    - '%' followed by two digits (e.g. '%12') is a single token; any other
      digit is a token on its own.
    - Characters the pattern does not recognize are silently skipped.

    Parameters:
    - smi (str): SMILES string to tokenize.
    - exclusive_tokens (list of str, optional): Bracketed symbols to keep,
      e.g. ['[C@@H]', '[nH]']. Every other bracketed token is replaced by
      '[UNK]'. `None` or an empty collection disables the replacement.

    Returns:
    - tuple: (tokens, token_ids), where tokens is the list of atom-level
      tokens and token_ids is a parallel list of integer IDs. IDs are local
      to this call: they start at 0 and are handed out in order of first
      appearance, so equal tokens share an ID.
    """
    import re

    # Raw string: the backslashes below are regex escapes, not Python string
    # escapes. A plain string literal would contain invalid escape sequences
    # such as "\[" and "\(", which newer Python versions warn about.
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    tokens = re.findall(pattern, smi)

    # Replace bracketed tokens that the caller did not ask to keep.
    # Truthiness check (not `is not None`) is deliberate: an empty collection
    # behaves like None and leaves every token untouched.
    if exclusive_tokens:
        tokens = [
            tok if tok in exclusive_tokens or not tok.startswith('[') else '[UNK]'
            for tok in tokens
        ]

    # Assign IDs in order of first appearance. `len(token_to_id)` is evaluated
    # before `setdefault` inserts, so a new token gets the next free ID.
    token_to_id = {}
    token_ids = []
    for token in tokens:
        token_ids.append(token_to_id.setdefault(token, len(token_to_id)))

    return tokens, token_ids
# def atomwise_tokenizer(smi, exclusive_tokens = None):
# """
# Tokenize a SMILES molecule at atom-level:
# (1) 'Br' and 'Cl' are two-character tokens
# (2) Symbols with bracket are considered as tokens
# exclusive_tokens: A list of specific bracketed symbols you want to keep, e.g., ['[C@@H]', '[nH]'].
# Other bracketed symbols will be replaced by '[UNK]'. Default is `None`.
# """
# import re
# pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
# regex = re.compile(pattern)
# tokens = [token for token in regex.findall(smi)]
# if exclusive_tokens:
# for i, tok in enumerate(tokens):
# if tok.startswith('['):
# if tok not in exclusive_tokens:
# tokens[i] = '[UNK]'
# return tokens
|