|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
To build a regex tokenizer model, run this script with one of the commands shown below. |
|
The script will create: |
|
|
|
.vocab file - with learned vocabulary |
|
.model file - with provided regex |
|
To build vocabulary from text files: |
|
|
|
python -- scripts/nlp_language_modeling/build_regex_tokenizer.py \ |
|
--regex '\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]' \ |
|
--input_type text \ |
|
--output_file regex_tokenizer -- \ |
|
data_file1.txt data_file2.txt |
|
|
|
To build vocabulary from CSV files ("smiles" column): |
|
|
|
python -- scripts/nlp_language_modeling/build_regex_tokenizer.py \ |
|
--regex '\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]' \ |
|
--input_type csv \ |
|
--input_csv_col smiles \ |
|
--output_file regex_tokenizer -- \ |
|
data_file1.csv data_file2.csv |
|
""" |
|
import argparse |
|
|
|
from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer |
|
from nemo.utils import logging |
|
|
|
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line argument parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` accepting one or more positional input
        files plus the ``--regex``, ``--output_file``, ``--input_type`` and
        ``--input_csv_col`` options.
    """
    parser = argparse.ArgumentParser(
        description="Builds vocabulary from regex tokenizer. Outputs .model (regular expression) and .vocab (learned vocabulary)",
    )
    parser.add_argument(
        'input_files', type=str, nargs='+', help='Input text/csv file',
    )
    parser.add_argument(
        '--regex', type=str, required=True, help='Regular expression to split text',
    )
    parser.add_argument(
        '--output_file',
        type=str,
        required=True,
        help='Output base file name. Two files will be created: .vocab (learned vocabulary), .model (the regex)',
    )
    parser.add_argument(
        '--input_type',
        type=str,
        required=False,
        choices=["text", "csv"],
        default="text",
        help='Type of input file: text, csv',
    )
    parser.add_argument(
        '--input_csv_col', type=str, required=False, default="smiles", help='Column of data in CSV file',
    )
    return parser


def main() -> None:
    """Parse the command line, build the vocabulary and save the tokenizer.

    Every input file is passed to a single ``RegExTokenizer`` created from the
    ``--regex`` argument; the tokenizer is then saved under ``--output_file``
    (with ``.model`` appended when the name does not already end with it).

    Raises:
        ValueError: if ``--input_type`` is neither ``"text"`` nor ``"csv"``.
    """
    args = build_arg_parser().parse_args()

    tokenizer = RegExTokenizer(regex=args.regex)

    # Each input file is handed to the same tokenizer instance.
    for input_file in args.input_files:
        if args.input_type == "csv":
            tokenizer.build_vocab_from_csv(data_csv_file=input_file, col=args.input_csv_col)
        elif args.input_type == "text":
            tokenizer.build_vocab_from_text(data_text_file=input_file)
        else:
            # argparse `choices` already rejects other values; kept as a
            # defensive guard in case the choices list is extended later.
            raise ValueError(f"Unknown input_type = {args.input_type}")

    # The tokenizer is saved under a name ending in ".model"; add the suffix
    # when the user supplied only a base name.
    if not args.output_file.endswith(".model"):
        args.output_file += ".model"
        logging.info("Adding .model to output file")

    tokenizer.save_tokenizer(args.output_file)


if __name__ == "__main__":
    main()
|
|