#!/usr/bin/env python3 # Copyright 2017-present, Facebook, Inc. # All rights reserved. # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. """Tokenizer that is backed by spaCy (spacy.io). Requires spaCy package and the spaCy english model. """ import spacy import copy from .tokenizer import Tokens, Tokenizer class SpacyTokenizer(Tokenizer): def __init__(self, **kwargs): """ Args: annotators: set that can include pos, lemma, and ner. model: spaCy model to use (either path, or keyword like 'en'). """ model = kwargs.get('model', 'en') self.annotators = copy.deepcopy(kwargs.get('annotators', set())) nlp_kwargs = {'parser': False} if not any([p in self.annotators for p in ['lemma', 'pos', 'ner']]): nlp_kwargs['tagger'] = False if 'ner' not in self.annotators: nlp_kwargs['entity'] = False self.nlp = spacy.load(model, **nlp_kwargs) def tokenize(self, text): # We don't treat new lines as tokens. clean_text = text.replace('\n', ' ') tokens = self.nlp.tokenizer(clean_text) if any([p in self.annotators for p in ['lemma', 'pos', 'ner']]): self.nlp.tagger(tokens) if 'ner' in self.annotators: self.nlp.entity(tokens) data = [] for i in range(len(tokens)): # Get whitespace start_ws = tokens[i].idx if i + 1 < len(tokens): end_ws = tokens[i + 1].idx else: end_ws = tokens[i].idx + len(tokens[i].text) data.append(( tokens[i].text, text[start_ws: end_ws], (tokens[i].idx, tokens[i].idx + len(tokens[i].text)), tokens[i].tag_, tokens[i].lemma_, tokens[i].ent_type_, )) # Set special option for non-entity tag: '' vs 'O' in spaCy return Tokens(data, self.annotators, opts={'non_ent': ''})