# crystal-technologies's picture
# Upload 1287 files
# 2d8da09
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
To build a regex tokenizer model, use one of the commands below.
The script will create:
    .vocab file - with the learned vocabulary
    .model file - with the provided regex
To build vocabulary from text files:
python -- scripts/nlp_language_modeling/build_regex_tokenizer.py \
--regex '\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]' \
--input_type text \
--output_file regex_tokenizer -- \
data_file1.txt data_file2.txt
To build vocabulary from CSV files ("smiles" column):
python -- scripts/nlp_language_modeling/build_regex_tokenizer.py \
--regex '\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]' \
--input_type csv \
--input_csv_col smiles \
--output_file regex_tokenizer -- \
data_file1.csv data_file2.csv
"""
import argparse
from nemo.collections.common.tokenizers.regex_tokenizer import RegExTokenizer
from nemo.utils import logging
def build_arg_parser():
    """Create the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: parser accepting the positional ``input_files``
        (one or more paths) and the options ``--regex`` (required),
        ``--output_file`` (required), ``--input_type`` ("text" or "csv",
        default "text") and ``--input_csv_col`` (default "smiles").
    """
    parser = argparse.ArgumentParser(
        description="Builds vocabulary from regex tokenizer. Outputs .model (regular expression) and .vocab (learned vocabulary)",
    )
    parser.add_argument(
        'input_files', type=str, nargs='+', help='Input text/csv file',
    )
    parser.add_argument(
        '--regex', type=str, required=True, help='Regular expression to split text',
    )
    parser.add_argument(
        '--output_file',
        type=str,
        required=True,
        help='Output base file name. Two files will be created: .vocab (learned vocabulary), .model (the regex)',
    )
    parser.add_argument(
        '--input_type',
        type=str,
        required=False,
        choices=["text", "csv"],
        default="text",
        help='Type of input file: text, csv',
    )
    parser.add_argument(
        '--input_csv_col', type=str, required=False, default="smiles", help='Column of data in CSV file',
    )
    return parser


def main(argv=None):
    """Build a regex-tokenizer vocabulary from the input files and save it.

    Args:
        argv: optional list of command-line argument strings. When ``None``,
            argparse falls back to ``sys.argv[1:]``.

    Raises:
        ValueError: if ``input_type`` is neither "csv" nor "text". argparse's
            ``choices`` already rejects other values; this is a defensive guard.
    """
    args = build_arg_parser().parse_args(argv)

    tokenizer = RegExTokenizer(regex=args.regex)

    # build vocabulary from all files; every file is fed to the same tokenizer instance
    for input_file in args.input_files:
        if args.input_type == "csv":
            tokenizer.build_vocab_from_csv(data_csv_file=input_file, col=args.input_csv_col)
        elif args.input_type == "text":
            tokenizer.build_vocab_from_text(data_text_file=input_file)
        else:
            raise ValueError(f"Unknown input_type = {args.input_type}")

    # save model: the path handed to the tokenizer always carries the .model suffix
    output_file = args.output_file
    if not output_file.endswith(".model"):
        output_file += ".model"
        logging.info("Adding .model to output file")

    tokenizer.save_tokenizer(output_file)


if __name__ == "__main__":
    main()