#!/usr/bin/env python3
# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Regex based tokenizer that emulates the Stanford/NLTK PTB tokenizers. | |
However it is purely in Python, supports robust untokenization, unicode, | |
and requires minimal dependencies. | |
""" | |
import logging

import regex

from .tokenizer import Tokens, Tokenizer

logger = logging.getLogger(__name__)


class RegexpTokenizer(Tokenizer):
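    # Token-type patterns, tried in the priority order wired up in __init__.
    # The \p{...} classes are Unicode property classes from the third-party
    # `regex` module: Nd = decimal digit, L = letter, M = combining mark,
    # N = number, Z = separator/space, P = punctuation, C = control/other.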
    DIGIT = r'\p{Nd}+([:\.\,]\p{Nd}+)*'
    TITLE = (r'(dr|esq|hon|jr|mr|mrs|ms|prof|rev|sr|st|rt|messrs|mmes|msgr)'
             r'\.(?=\p{Z})')
    ABBRV = r'([\p{L}]\.){2,}(?=\p{Z}|$)'
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]++'
    HYPHEN = r'{A}([-\u058A\u2010\u2011]{A})+'.format(A=ALPHA_NUM)
    NEGATION = r"((?!n't)[\p{L}\p{N}\p{M}])++(?=n't)|n't"
    CONTRACTION1 = r"can(?=not\b)"
    CONTRACTION2 = r"'([tsdm]|re|ll|ve)\b"
    START_DQUOTE = r'(?<=[\p{Z}\(\[{<]|^)(``|["\u0093\u201C\u00AB])(?!\p{Z})'
    START_SQUOTE = r'(?<=[\p{Z}\(\[{<]|^)[\'\u0091\u2018\u201B\u2039](?!\p{Z})'
    END_DQUOTE = r'(?<!\p{Z})(\'\'|["\u0094\u201D\u00BB])'
    END_SQUOTE = r'(?<!\p{Z})[\'\u0092\u2019\u203A]'
    DASH = r'--|[\u0096\u0097\u2013\u2014\u2015]'
    ELLIPSES = r'\.\.\.|\u2026'
    PUNCT = r'\p{P}'
    NON_WS = r'[^\p{Z}\p{C}]'
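    # Illustrative behavior of the trickier patterns (a sketch; see the usage
    # example at the bottom of the file): NEGATION splits "can't" into
    # "ca" + "n't" (PTB style), CONTRACTION1/CONTRACTION2 split "cannot" into
    # "can" + "not" and "it's" into "it" + "'s", and TITLE keeps "Mr." intact
    # when a space follows the period.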

    def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
            substitutions: if true, normalizes some token types (e.g. quotes).
        """
        self._regexp = regex.compile(
            '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
            '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
            '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
            '(?P<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
            (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
             self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
             self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
             self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
             self.NON_WS),
            flags=regex.IGNORECASE | regex.UNICODE | regex.MULTILINE
        )
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning('%s only tokenizes! Skipping annotators: %s',
                           type(self).__name__, kwargs.get('annotators'))
        self.annotators = set()
        self.substitutions = kwargs.get('substitutions', True)

    def tokenize(self, text):
        data = []
        matches = list(self._regexp.finditer(text))
        for i in range(len(matches)):
            # Get the raw token text.
            token = matches[i].group()

            # Normalize special token types (quotes, dashes, ellipses).
            if self.substitutions:
                groups = matches[i].groupdict()
                if groups['sdquote']:
                    token = "``"
                elif groups['edquote']:
                    token = "''"
                elif groups['ssquote']:
                    token = "`"
                elif groups['esquote']:
                    token = "'"
                elif groups['dash']:
                    token = '--'
                elif groups['ellipses']:
                    token = '...'

            # Get the span of the token plus any trailing whitespace
            # (everything up to the start of the next token), which
            # preserves enough information to untokenize.
            span = matches[i].span()
            start_ws = span[0]
            if i + 1 < len(matches):
                end_ws = matches[i + 1].span()[0]
            else:
                end_ws = span[1]

            # Format data: (token, token + trailing whitespace, char span).
            data.append((
                token,
                text[start_ws: end_ws],
                span,
            ))
        return Tokens(data, self.annotators)
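

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). Because of
# the relative import above, import this through its package rather than
# running the file directly. Assuming the Tokens class from the sibling
# tokenizer module exposes a words() accessor:
#
#   tokenizer = RegexpTokenizer()
#   tokens = tokenizer.tokenize("Mr. Smith can't go -- it's 3.5 miles...")
#   tokens.words()
#   # ['Mr.', 'Smith', 'ca', "n't", 'go', '--', 'it', "'s", '3.5',
#   #  'miles', '...']
# ---------------------------------------------------------------------------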