# Natural Language Toolkit: Interface to the Repp Tokenizer
#
# Copyright (C) 2001-2015 NLTK Project
# Authors: Rebecca Dridan and Stephan Oepen
# Contributors: Liling Tan
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import os
import re
import subprocess
import sys
import tempfile

from nltk.data import ZipFilePathPointer
from nltk.internals import find_dir
from nltk.tokenize.api import TokenizerI


class ReppTokenizer(TokenizerI):
"""
A class for word tokenization using the REPP parser described in
Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a
Long Solved Problem - A Survey, Contrastive Experiment, Recommendations,
and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406
>>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.' ,
... 'But rule-based tokenizers are hard to maintain and their rules language specific.' ,
... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.'
... ]
>>> tokenizer = ReppTokenizer('/home/alvas/repp/') # doctest: +SKIP
>>> for sent in sents: # doctest: +SKIP
... tokenizer.tokenize(sent) # doctest: +SKIP
...
(u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
(u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
(u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
>>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP
... print(sent) # doctest: +SKIP
...
(u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
(u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
(u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
>>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP
... print(sent) # doctest: +SKIP
...
[(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
[(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
[(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
"""

    def __init__(self, repp_dir, encoding="utf8"):
self.repp_dir = self.find_repptokenizer(repp_dir)
# Set a directory to store the temporary files.
self.working_dir = tempfile.gettempdir()
# Set an encoding for the input strings.
self.encoding = encoding

    def tokenize(self, sentence):
        """
        Use Repp to tokenize a single sentence.

        :param sentence: A single sentence string.
:type sentence: str
:return: A tuple of tokens.
:rtype: tuple(str)
"""
return next(self.tokenize_sents([sentence]))

    def tokenize_sents(self, sentences, keep_token_positions=False):
        """
        Tokenize multiple sentences using Repp.

        :param sentences: A list of sentence strings.
        :type sentences: list(str)
        :param keep_token_positions: if True, each sentence is yielded as a
            list of (token, start, end) tuples with character offsets.
        :type keep_token_positions: bool
        :return: An iterable of tuples of tokens.
        :rtype: iter(tuple(str))
        """
with tempfile.NamedTemporaryFile(
prefix="repp_input.", dir=self.working_dir, mode="w", delete=False
) as input_file:
# Write sentences to temporary input file.
for sent in sentences:
input_file.write(str(sent) + "\n")
input_file.close()
# Generate command to run REPP.
cmd = self.generate_repp_command(input_file.name)
            # Decode the stdout and strip the trailing newline.
repp_output = self._execute(cmd).decode(self.encoding).strip()
for tokenized_sent in self.parse_repp_outputs(repp_output):
if not keep_token_positions:
# Removes token position information.
tokenized_sent, starts, ends = zip(*tokenized_sent)
yield tokenized_sent

    def generate_repp_command(self, inputfilename):
        """
        Generate the command line used to call the REPP binary on the input file.

        :param inputfilename: path to the input file
        :type inputfilename: str
        :return: the REPP command as a list of argument strings
        :rtype: list(str)
        """
cmd = [self.repp_dir + "/src/repp"]
cmd += ["-c", self.repp_dir + "/erg/repp.set"]
cmd += ["--format", "triple"]
cmd += [inputfilename]
return cmd

    @staticmethod
    def _execute(cmd):
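        # Run the REPP binary as a subprocess; stderr is captured but not
        # returned, so only REPP's stdout is passed back for parsing.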
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
return stdout

    @staticmethod
    def parse_repp_outputs(repp_output):
        """
        Parse the tri-tuple format that REPP produces with the
        "--format triple" option and return a generator over the
        tokenized sentences.

        :param repp_output: the decoded output of the REPP binary
        :type repp_output: str
        :return: an iterable of tokenized sentences, each a list of
            (token, start, end) tuples
        :rtype: iter(list(tuple))
        """
line_regex = re.compile(r"^\((\d+), (\d+), (.+)\)$", re.MULTILINE)
for section in repp_output.split("\n\n"):
words_with_positions = [
(token, int(start), int(end))
for start, end, token in line_regex.findall(section)
]
            words = tuple(t[0] for t in words_with_positions)
yield words_with_positions

    def find_repptokenizer(self, repp_dirname):
        """
        Find the REPP tokenizer binary and its *erg/repp.set* config file.

        :param repp_dirname: path to the REPP directory; if the path does not
            exist, it is looked up via the REPP_TOKENIZER environment variable
        :type repp_dirname: str
        """
if os.path.exists(repp_dirname): # If a full path is given.
_repp_dir = repp_dirname
else: # Try to find path to REPP directory in environment variables.
_repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",))
# Checks for the REPP binary and erg/repp.set config file.
assert os.path.exists(_repp_dir + "/src/repp")
assert os.path.exists(_repp_dir + "/erg/repp.set")
return _repp_dir