File size: 3,007 Bytes
f5e3fa7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import json
from pathlib import Path

import spacy
from spacy.training import Example

def make_training_doc(nlp: "spacy.language.Language", data: list):
    """
    Convert raw training pairs into spaCy Example objects usable for training.

    parameters:
        nlp: loaded spaCy model; its tokenizer builds the Doc objects
        data: iterable of (text, annotations) pairs, where annotations is the
              dict format accepted by Example.from_dict

    returns:
        training_data: list of spaCy Example objects
    """
    training_data = []
    for text, annotations in data:
        # Tokenize with the model's own tokenizer so token alignment
        # matches what the model will see during training.
        doc = nlp.make_doc(text)
        training_data.append(Example.from_dict(doc, annotations))

    return training_data


def load_model(model: "str | None" = None):
    """
    Load the spaCy model named by `model`, or create a blank English model.

    parameters:
        model: name or path of the spaCy model to load; when None, a blank
               'en' pipeline is created instead

    returns:
        nlp: spaCy model object
        optimizer: the optimizer to be used in training
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
        # resume_training keeps the existing weights instead of resetting them
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
        optimizer = nlp.begin_training()

    return nlp, optimizer


def save_model(model: "spacy.language.Language", output_dir: "str | None"):
    """
    Serialize the model to output_dir, creating the directory if needed.

    parameters:
        model: spaCy model (anything exposing `to_disk`)
        output_dir: destination path; when None, nothing is saved

    returns:
        None
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents=True handles nested paths; exist_ok avoids a race with
        # the exists() check the original code relied on.
        output_dir.mkdir(parents=True, exist_ok=True)
        model.to_disk(output_dir)
        print("Saved model to", output_dir)

    return None


def load_data(args):
    """
    Load training data, evaluation data, and the entities dictionary.

    parameters:
        args: dict, configuration from the config file; reads the keys
              'train_dir', 'ent_key', 'ent_dir', and 'eval_dir'

    returns:
        train_dict, entities_dict, eval_dict
        (eval_dict is None when no eval_dir is configured)

    raises:
        AssertionError: if train_dir is missing, or no entities are found
                        in either the training data or ent_dir
    """
    assert args['train_dir'] is not None, 'indicate path for training directory'

    # Load the training data
    with open(args['train_dir']) as f:
        train_dict = json.load(f)
        print('Loaded Training Data')

    # The entity classes may ship inside the training file under ent_key.
    try:
        entities_dict = train_dict[args['ent_key']]
        print('Loaded Entities from Training Data')
    except KeyError:
        entities_dict = None
        print('No classes for entities found in data loaded. Proceed to check in ent_dir')

    # Fall back to the dedicated entities file when the training data had none.
    if entities_dict is None:
        assert args['ent_dir'] is not None, 'No entities found from training_dir & ent_dir'
        with open(args['ent_dir']) as f:
            entities_dict = json.load(f)[args['ent_key']]
            print('Loaded Entities from ent_dir')

    # Evaluation data is optional.
    if args['eval_dir'] is None:
        return train_dict, entities_dict, None

    with open(args['eval_dir']) as f:
        eval_dict = json.load(f)
        print('Loaded Evaluating Data')

    return train_dict, entities_dict, eval_dict