# ner-analyzer / src / model_utils.py
# Utilities for preparing spaCy training examples, loading/saving spaCy
# models, and loading training/evaluation data from a config dict.
import json
from pathlib import Path
import spacy
from spacy.training import Example
def make_training_doc(nlp, data: list) -> list:
    """
    Convert raw annotated data into spaCy Example objects usable for training.

    parameters:
        nlp: spaCy Language object (only its tokenizer is used, via make_doc)
        data: iterable of (text, annotations) pairs, where annotations is the
              dict format accepted by spacy.training.Example.from_dict
    returns:
        training_data: list of spacy.training.Example
    """
    # Tokenize each text with the model's tokenizer, then pair it with its
    # gold annotations as a training Example.
    return [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in data
    ]
def load_model(model: str=None):
    """
    Load a named spaCy model, or create a blank English pipeline.

    parameters:
        model: name (or path) of the spaCy model to load; None means start
               from a blank 'en' pipeline instead
    returns:
        nlp: the spaCy Language object
        optimizer: optimizer for training — resumed for an existing model,
                   freshly initialized for a blank one
    """
    # Guard clause: no model name means we start from scratch.
    if model is None:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
        return nlp, nlp.begin_training()
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
    return nlp, nlp.resume_training()
def save_model(model, output_dir: str):
    """
    Persist a trained spaCy model to disk.

    parameters:
        model: spaCy Language object (anything exposing to_disk)
        output_dir: directory path to write the model into; created
                    (including missing parents) if it does not exist.
                    If None, nothing is saved.
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested path like "out/models/v1" works even
            # when intermediate directories are missing.
            output_dir.mkdir(parents=True)
        model.to_disk(output_dir)
        print("Saved model to", output_dir)
def load_data(args):
    """
    Load training data, evaluation data, and the entities dictionary.

    parameters:
        args: dict-like config with keys 'train_dir' (required, path to a
              JSON file), 'ent_key' (key under which entity classes are
              stored), 'ent_dir' (optional path to a separate entities JSON
              file), and 'eval_dir' (optional path to an evaluation JSON file)
    returns:
        (train_dict, entities_dict, eval_dict); eval_dict is None when no
        evaluation file is configured
    """
    assert args['train_dir'] is not None, 'indicate path for training directory'
    # Load the training data
    with open(args['train_dir']) as f:
        train_dict = json.load(f)
    print('Loaded Training Data')
    # Entities may ship inside the training file under ent_key...
    try:
        entities_dict = train_dict[args['ent_key']]
        print('Loaded Entities from Training Data')
    except KeyError:
        entities_dict = None
        print('No classes for entities found in data loaded. Proceed to check in ent_dir')
    # ...otherwise fall back to the dedicated entities file.
    if args['ent_dir'] is not None and entities_dict is None:
        with open(args['ent_dir']) as f:
            entities_dict = json.load(f)[args['ent_key']]
        print('Loaded Entities from ent_dir')
    elif args['ent_dir'] is None and entities_dict is None:
        # Neither source provided entities: fail loudly.
        assert entities_dict is not None, 'No entities found from training_dir & ent_dir'
    # Evaluation data is optional.
    eval_dict = None
    if args['eval_dir'] is not None:
        with open(args['eval_dir']) as f:
            eval_dict = json.load(f)
        print('Loaded Evaluating Data')
    return train_dict, entities_dict, eval_dict