"""Command-line entry point that runs a model's preprocessing over the configured datasets."""

import argparse
import json
import os

import _jsonnet
import tqdm

# NOTE(review): `datasets`, `models` and `vocab` are not referenced directly
# below; presumably they are imported so their classes get registered with
# `registry` -- confirm before removing.
from seq2struct import datasets
from seq2struct import models
from seq2struct.utils import registry
from seq2struct.utils import vocab


class Preprocessor:
    """Feed every item of every configured dataset section to the model's preprocessor."""

    def __init__(self, config):
        """Store *config* and instantiate the `Preproc` class of the configured model.

        Args:
            config: Mapping with at least a ``'model'`` entry (used both to
                look up the model in the registry and to instantiate its
                ``Preproc``) and a ``'data'`` entry (read by `preprocess`).
        """
        self.config = config
        model_config = config['model']
        preproc_cls = registry.lookup('model', model_config).Preproc
        self.model_preproc = registry.instantiate(preproc_cls, model_config)

    def preprocess(self):
        """Validate and add all items section by section, then save the result.

        Previously collected items are cleared first. An item is added only
        when `validate_item` reports a truthy first value; the second value it
        returns is forwarded to `add_item` unchanged.
        """
        preproc = self.model_preproc
        preproc.clear_items()

        for section in self.config['data']:
            dataset = registry.construct(
                'dataset', self.config['data'][section])
            progress = tqdm.tqdm(dataset, desc=section, dynamic_ncols=True)
            for item in progress:
                accepted, info = preproc.validate_item(item, section)
                if not accepted:
                    continue
                preproc.add_item(item, section, info)

        preproc.save()


def add_parser():
    """Parse the command line and return the resulting namespace.

    Recognised options are ``--config`` (required) and ``--config-args``
    (optional).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--config', required=True)
    cli.add_argument('--config-args')
    return cli.parse_args()


def main(args):
    """Evaluate the Jsonnet config named by *args* and run preprocessing.

    When ``args.config_args`` is truthy it is passed to Jsonnet as the
    top-level argument ``args`` (as code); otherwise the file is evaluated
    without top-level arguments.
    """
    jsonnet_kwargs = {}
    if args.config_args:
        jsonnet_kwargs['tla_codes'] = {'args': args.config_args}

    evaluated = _jsonnet.evaluate_file(args.config, **jsonnet_kwargs)
    config = json.loads(evaluated)

    Preprocessor(config).preprocess()


if __name__ == '__main__':
    args = add_parser()
    main(args)