import json
from pathlib import Path
from typing import Optional

import spacy
from spacy.language import Language
from spacy.training import Example


def make_training_doc(nlp: Language, data: list):
    """
    Convert raw training data into spaCy ``Example`` objects usable for training.

    parameters:
        nlp: spaCy pipeline; only its ``make_doc`` method is used here
        data: iterable of ``(text, annotations)`` pairs, where ``annotations``
              is passed unchanged to ``Example.from_dict``
    returns:
        training_data: list of ``Example`` objects, one per input pair
    """
    return [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in data
    ]


def load_model(model: Optional[str] = None):
    """
    Load the model indicated by ``model``, or create a blank English pipeline.

    parameters:
        model: name or path of the spaCy model to load; ``None`` creates a
               blank ``'en'`` pipeline instead
    returns:
        nlp: spaCy pipeline object
        optimizer: the optimizer to be used in training
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print(f"Loaded model '{model}'")
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
        # NOTE(review): spaCy v3 documents `initialize` as the successor of
        # `begin_training`; the original call is kept to avoid changing
        # behaviour -- confirm against the pinned spaCy version.
        optimizer = nlp.begin_training()
    return nlp, optimizer


def save_model(model: Language, output_dir: Optional[str]):
    """
    Save the model to ``output_dir``; do nothing when ``output_dir`` is None.

    parameters:
        model: spaCy pipeline to serialise with ``to_disk``
        output_dir: destination directory; created (including any missing
                    parent directories) when it does not exist yet
    """
    if output_dir is None:
        return None

    output_dir = Path(output_dir)
    # parents=True lets nested output paths work; exist_ok=True avoids a
    # separate exists() check and the race that comes with it.
    output_dir.mkdir(parents=True, exist_ok=True)
    model.to_disk(output_dir)
    print("Saved model to", output_dir)
    return None


def _read_json(path):
    """Read and decode the JSON document stored at ``path`` (UTF-8)."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def load_data(args: dict):
    """
    Load training data, evaluation data as well as the entities dictionary.

    Entities are taken from the training file under ``args['ent_key']`` when
    present; otherwise they are read from ``args['ent_dir']`` under the same
    key.

    parameters:
        args: dict, configuration from the config file. Keys read here:
              'train_dir', 'ent_key', 'ent_dir', 'eval_dir'
    returns:
        train_dict, entities_dict, eval_dict
        (``eval_dict`` is None when ``args['eval_dir']`` is None)
    raises:
        AssertionError: when 'train_dir' is None, or when no entities are
                        found in the training data and 'ent_dir' is None
    """
    # Raised explicitly (rather than via `assert`) so the check survives
    # `python -O`; the exception type is kept for existing callers.
    if args['train_dir'] is None:
        raise AssertionError('indicate path for training directory')

    # Load the training data
    train_dict = _read_json(args['train_dir'])
    print('Loaded Training Data')

    try:
        entities_dict = train_dict[args['ent_key']]
    except KeyError:
        entities_dict = None
        print('No classes for entities found in data loaded. '
              'Proceed to check in ent_dir')
    else:
        print('Loaded Entities from Training Data')

    # Load entities from the dedicated file only when the training data did
    # not provide them.
    if entities_dict is None:
        if args['ent_dir'] is None:
            raise AssertionError(
                'No entities found from training_dir & ent_dir')
        entities_dict = _read_json(args['ent_dir'])[args['ent_key']]
        print('Loaded Entities from ent_dir')

    # Load eval data
    eval_dict = None
    if args['eval_dir'] is not None:
        eval_dict = _read_json(args['eval_dir'])
        print('Loaded Evaluating Data')

    return train_dict, entities_dict, eval_dict