import random

import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy.scorer import Scorer
from tqdm import tqdm

from src.model_utils import make_training_doc, load_model
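
# Both training functions below expect data in the standard spaCy NER tuple
# format: (text, {"entities": [(start, end, label), ...]}). The sample below
# is purely illustrative (placeholder texts and offsets, not project data).
EXAMPLE_TRAIN_DATA = [
    ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
    ("Revolut opened an office in Berlin", {"entities": [(0, 7, "ORG"), (28, 34, "GPE")]}),
]
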
def train_transformer(config: dict, train_data: list, components: list, n_iter: int,
                      batch_size=compounding(4.0, 32.0, 1.001), entities: list = None,
                      eval_data: list = None) -> tuple:
""" | |
Finetune a transformer model or resume training from a fine-tuned model. | |
Parameters: | |
config: dict, configuration parameters | |
train_data: list, contain training data | |
components: list, list of components to be trained | |
iter: int, number of iterations to train | |
batch_size: int, batch size to be used for training | |
entities: list of entities to be trained on for NER | |
eval_data: list, containing evaluation data | |
Returns: | |
nlp : spacy transformer | |
losses: list of the losses at every iteration | |
""" | |
    if config['dir'] is not None:
        # resume training from a previously fine-tuned pipeline on disk
        nlp = spacy.load(config['dir'])
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank("en")  # empty English pipeline
        nlp.add_pipe("transformer", config=config['config'])
        for component in components:
            nlp.add_pipe(component)
        # add entity labels to the NER component before initialization
        if ('ner' in components) and (entities is not None):
            ner = nlp.get_pipe('ner')
            for label in entities:
                ner.add_label(label)
        nlp.initialize()  # required before training a fresh pipeline
        optimizer = nlp.create_optimizer()
    # convert the raw tuples into spaCy training examples
    train_data_doc = list(make_training_doc(nlp, train_data))
    all_losses = []
    for itn in tqdm(range(1, n_iter + 1)):
        print("Starting iteration " + str(itn))
        # shuffle the converted examples (shuffling the raw list would not
        # affect the already-built training examples)
        random.shuffle(train_data_doc)
        losses = {}
        batches = minibatch(train_data_doc, size=batch_size)
        for batch in batches:
            nlp.update(list(batch), sgd=optimizer, drop=0.2, losses=losses)
        scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
        print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".format(
            itn, losses, scores['ents_r'], scores['ents_p'], scores['ents_f']))
        all_losses.append([losses[component] for component in components])
    return nlp, all_losses
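
# A minimal usage sketch for train_transformer (illustrative, not project
# code): the model name, path, and config structure below are assumptions
# following the spacy-transformers pipe-config convention.
#
# config = {
#     "dir": None,  # or a path such as "models/finetuned" to resume training
#     "config": {"model": {"@architectures": "spacy-transformers.TransformerModel.v3",
#                          "name": "roberta-base"}},
# }
# nlp, losses = train_transformer(config, EXAMPLE_TRAIN_DATA, components=["ner"],
#                                 n_iter=10, entities=["ORG", "GPE"])
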
def train_spacy(model: str, train_data: list, components: list, n_iter: int,
                batch_size=compounding(4.0, 32.0, 1.001), entities: list = None,
                eval_data: list = None) -> tuple:
""" | |
Finetune a spacy model or resume training from a fine-tuned model. | |
Parameters: | |
model: str, name of spacy model | |
train_data: list, contain training data | |
components: list, list of components to be trained | |
iter: int, number of iterations to train | |
batch_size: int, batch size to be used for training | |
entities: list of entities to be trained on for NER | |
eval_data: list, containing evaluation data | |
Returns: | |
nlp : spacy model | |
losses: list of the losses at every iteration | |
""" | |
    # get model and optimizer
    if model is None:
        raise ValueError("A spaCy model name (or path) is required.")
    nlp, optimizer = load_model(model)  # load an existing or blank spaCy model
    # convert the raw tuples into spaCy training examples
    train_data_doc = list(make_training_doc(nlp, train_data))
    # create the built-in pipeline components and add them to the pipeline;
    # nlp.add_pipe works for built-ins that are registered with spaCy
    for component in components:
        if component not in nlp.pipe_names:
            pipe = nlp.add_pipe(component, last=True)
        else:
            pipe = nlp.get_pipe(component)
        # add labels if the component is NER
        if (component == 'ner') and (entities is not None):
            for ent in entities:
                pipe.add_label(ent)
            print(f'Entities in the model are: {nlp.get_pipe("ner").labels}')
    # get names of the other pipes, to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in components]
    all_losses = []
    with nlp.select_pipes(disable=other_pipes):  # only train the selected components
        for itn in tqdm(range(1, n_iter + 1)):
            print("Starting iteration " + str(itn))
            # shuffle the converted examples, not the raw tuples
            random.shuffle(train_data_doc)
            losses = {}
            batches = minibatch(train_data_doc, size=batch_size)
            for batch in batches:
                nlp.update(list(batch),
                           losses=losses,
                           drop=0.1,
                           sgd=optimizer)
            scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
            print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".format(
                itn, losses, scores['ents_r'], scores['ents_p'], scores['ents_f']))
            all_losses.append([losses[component] for component in components])
    return nlp, all_losses
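
# A minimal usage sketch for train_spacy (illustrative; the model name,
# iteration count, and output path are assumptions, and load_model must
# accept a packaged model name such as this one):
#
# nlp, losses = train_spacy("en_core_web_sm", EXAMPLE_TRAIN_DATA,
#                           components=["ner"], n_iter=10,
#                           entities=["ORG", "GPE"])
# nlp.to_disk("models/finetuned")  # hypothetical output path
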
def eval_spacy(model, data) -> dict:
    """
    Perform evaluation and scoring.

    Parameters:
        model: either a spaCy model or a spaCy transformer pipeline
        data: evaluation data to be scored
    Returns:
        scores: dict with the scores of the model
    """
    scorer = Scorer()
    examples = []
    try:
        # accept spaCy-format data: (text, {"entities": [...]}) tuples
        for input_, annot in data:
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, annot)
            example.predicted = model(input_)
            examples.append(example)
        return scorer.score(examples)
    except TypeError:
        # accept the alternative format: rows of dicts; assumes each row's
        # values are ordered as (text, entities)
        examples = []
        for row in data:
            input_, annot = row.values()
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, {'entities': annot})
            example.predicted = model(input_)
            examples.append(example)
        return scorer.score(examples)
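
# A minimal usage sketch for eval_spacy, showing both accepted data shapes
# (texts and offsets are illustrative placeholders):
#
# spacy_format = [("Uber blew through $1 million a week",
#                  {"entities": [(0, 4, "ORG")]})]
# alt_format = [{"text": "Uber blew through $1 million a week",
#                "entities": [(0, 4, "ORG")]}]
# scores = eval_spacy(nlp, spacy_format)
# print(scores["ents_p"], scores["ents_r"], scores["ents_f"])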