Spaces:

AriNubar
/

hyw-en-demo-v2

Running

Ari Nubar Boyacıoğlu

add files

d093ea4 7 months ago

8.05 kB

	# -*- coding: utf-8 -*-
	import re
	from pysbd.utils import Text
	from pysbd.lists_item_replacer import ListItemReplacer
	from pysbd.exclamation_words import ExclamationWords
	from pysbd.between_punctuation import BetweenPunctuation
	from pysbd.abbreviation_replacer import AbbreviationReplacer

	class Processor(object):
	    """Pre- and post-process a text to split it into sentences.

	    Every rule and regular expression used here is read from the language
	    module passed as ``lang``; this class only sequences the steps.
	    """

	    def __init__(self, text, lang, char_span=False):
	        """Store the text and the language module that drives the rules.

	        Parameters
	        ----------
	        text : str
	            Original text
	        lang : object
	            Language module
	        char_span : bool, optional
	            Get start & end character offsets of each sentences
	            within original text, by default False. Only stored on the
	            instance; no method of this class reads it.
	        """
	        self.text = text
	        self.lang = lang
	        self.char_span = char_span

	    def process(self):
	        """Run the whole pipeline on ``self.text``.

	        Returns
	        -------
	        list or str or None
	            ``self.text`` unchanged when it is falsy (empty string or
	            None), otherwise the list built by ``split_into_segments``.
	        """
	        if not self.text:
	            return self.text
	        # Carriage return is the segment separator: split_into_segments
	        # later splits the text on '\r'.
	        self.text = self.text.replace('\n', '\r')
	        self.text = ListItemReplacer(self.text).add_line_break()
	        self.replace_abbreviations()
	        self.replace_numbers()
	        self.replace_continuous_punctuation()
	        self.replace_periods_before_numeric_references()
	        self.text = Text(self.text).apply(
	            self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
	            self.lang.GeoLocationRule,
	            self.lang.FileFormatRule)
	        return self.split_into_segments()

	    def rm_none_flatten(self, sents):
	        """Remove None values and unpack list of list sents

	        Parameters
	        ----------
	        sents : list
	            list of sentences; items may themselves be lists

	        Returns
	        -------
	        list
	            New list without falsy top-level items (None, '', []) and
	            with nested lists unpacked one level. Items inside a nested
	            list are kept as they are.
	        """
	        flattened = []
	        for sent in sents:
	            if not sent:
	                continue
	            if isinstance(sent, list):
	                flattened.extend(sent)
	            else:
	                flattened.append(sent)
	        return flattened

	    def split_into_segments(self):
	        """Split ``self.text`` on carriage returns and refine each piece.

	        Returns
	        -------
	        list
	            Post-processed sentences, each passed through
	            ``lang.SubSingleQuoteRule``.
	        """
	        self.check_for_parens_between_quotes()
	        # remove empty and none values
	        sents = self.rm_none_flatten(self.text.split('\r'))
	        sents = [
	            Text(s).apply(self.lang.SingleNewLineRule,
	                          *self.lang.EllipsisRules.All)
	            for s in sents
	        ]

	        # # THESE LINES ARE NOT PRESENT IN THE ORIGINAL CODE --> ONLY USE FOR HYW
	        # sents = [self.post_process_segments(s) for s in sents]
	        # sents = self.rm_none_flatten(sents)

	        # check_for_punctuation returns a list per input, so flatten again
	        sents = self.rm_none_flatten(
	            [self.check_for_punctuation(s) for s in sents])

	        postprocessed_sents = []
	        for sent in sents:
	            sent = Text(sent).apply(*self.lang.SubSymbolsRules.All)
	            processed = self.post_process_segments(sent)
	            if isinstance(processed, list):
	                # a segment that was split again at a closing quotation
	                postprocessed_sents.extend(processed)
	            elif processed and isinstance(processed, str):
	                # empty strings are dropped
	                postprocessed_sents.append(processed)
	        return [Text(ns).apply(self.lang.SubSingleQuoteRule)
	                for ns in postprocessed_sents]

	    def post_process_segments(self, txt):
	        """Finalize one segment after sentence splitting.

	        Parameters
	        ----------
	        txt : str
	            A single segment

	        Returns
	        -------
	        str or list
	            ``txt`` itself when it is longer than two characters and
	            consists only of ASCII letters; the list produced by
	            ``re.split`` when ``lang.QUOTATION_AT_END_OF_SENTENCE_REGEX``
	            matches; otherwise the text with newlines removed and
	            surrounding whitespace stripped.
	        """
	        if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt):
	            return txt

	        # below condition present in pragmatic segmenter
	        # dont know significance of it yet.
	        # if self.consecutive_underscore(txt) or len(txt) < 2:
	        #     return txt

	        # TODO:
	        # Decide on keeping or removing Standard.ExtraWhiteSpaceRule
	        # removed to retain original text spans
	        # txt = Text(txt).apply(*ReinsertEllipsisRules.All,
	        #                       Standard.ExtraWhiteSpaceRule)
	        txt = Text(txt).apply(*self.lang.ReinsertEllipsisRules.All)
	        if re.search(self.lang.QUOTATION_AT_END_OF_SENTENCE_REGEX, txt):
	            return re.split(
	                self.lang.SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX,
	                txt)
	        return txt.replace('\n', '').strip()

	    def check_for_parens_between_quotes(self):
	        """Put segment breaks around parentheses found inside quotes.

	        For every match of ``lang.PARENS_BETWEEN_DOUBLE_QUOTES_REGEX`` in
	        ``self.text``, the whitespace character directly before "(" and
	        directly after ")" is replaced by a carriage return.
	        """
	        def paren_replace(match):
	            quoted = re.sub(r'\s(?=\()', '\r', match.group())
	            return re.sub(r'(?<=\))\s', '\r', quoted)

	        self.text = re.sub(self.lang.PARENS_BETWEEN_DOUBLE_QUOTES_REGEX,
	                           paren_replace, self.text)

	    def replace_continuous_punctuation(self):
	        """Mask '!' and '?' inside runs of continuous punctuation.

	        Within each match of ``lang.CONTINUOUS_PUNCTUATION_REGEX`` every
	        '!' becomes '&ᓴ&' and every '?' becomes '&ᓷ&'.
	        """
	        def continuous_puncs_replace(match):
	            masked = match.group().replace('!', '&ᓴ&')
	            return masked.replace('?', '&ᓷ&')

	        self.text = re.sub(self.lang.CONTINUOUS_PUNCTUATION_REGEX,
	                           continuous_puncs_replace, self.text)

	    def replace_periods_before_numeric_references(self):
	        """Rewrite matches of ``lang.NUMBERED_REFERENCE_REGEX``.

	        Each match is replaced by '∯', capture group 2, a carriage return
	        and capture group 7, so the regex must define at least 7 groups.
	        """
	        # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703
	        self.text = re.sub(self.lang.NUMBERED_REFERENCE_REGEX,
	                           r"∯\2\r\7", self.text)

	    def consecutive_underscore(self, txt):
	        """Return True when nothing is left after deleting every run of
	        three or more underscores from ``txt`` (also True for '')."""
	        # Rubular: http://rubular.com/r/fTF2Ff3WBL
	        return len(re.sub(r'_{3,}', '', txt)) == 0

	    def check_for_punctuation(self, txt):
	        """Split ``txt`` further only if it contains a punctuation mark.

	        Returns
	        -------
	        list
	            Result of ``process_text`` when any item of
	            ``lang.Punctuations`` occurs in ``txt``, else ``[txt]``.
	        """
	        if any(p in txt for p in self.lang.Punctuations):
	            return self.process_text(txt)
	        # NOTE: next steps of check_for_punctuation will unpack this list
	        return [txt]

	    def process_text(self, txt):
	        """Apply the punctuation rules to ``txt`` and split it.

	        ``txt`` must be non-empty; ``check_for_punctuation`` only calls
	        this for text containing at least one punctuation mark.

	        Returns
	        -------
	        list
	            Output of ``sentence_boundary_punctuation``.
	        """
	        if txt[-1] not in self.lang.Punctuations:
	            # marker appended to text that does not end in punctuation
	            # (presumably consumed by the language's boundary regex)
	            txt += 'ȸ'
	        txt = ExclamationWords.apply_rules(txt)
	        txt = self.between_punctuation(txt)
	        # handle text having only doublepunctuations
	        if not re.match(self.lang.DoublePunctuationRules.DoublePunctuation,
	                        txt):
	            txt = Text(txt).apply(*self.lang.DoublePunctuationRules.All)
	        txt = Text(txt).apply(self.lang.QuestionMarkInQuotationRule,
	                              *self.lang.ExclamationPointRules.All)
	        txt = ListItemReplacer(txt).replace_parens()
	        return self.sentence_boundary_punctuation(txt)

	    def replace_numbers(self):
	        """Apply every rule in ``lang.Numbers.All`` to ``self.text``."""
	        self.text = Text(self.text).apply(*self.lang.Numbers.All)

	    def abbreviations_replacer(self):
	        """Build the abbreviation replacer for ``self.text``.

	        Uses ``lang.AbbreviationReplacer`` when the language module
	        defines one, otherwise the imported default class.
	        """
	        if hasattr(self.lang, "AbbreviationReplacer"):
	            return self.lang.AbbreviationReplacer(self.text, self.lang)
	        return AbbreviationReplacer(self.text, self.lang)

	    def replace_abbreviations(self):
	        """Replace ``self.text`` with the abbreviation replacer's output."""
	        self.text = self.abbreviations_replacer().replace()

	    def between_punctuation_processor(self, txt):
	        """Build the between-punctuation processor for ``txt``.

	        Uses ``lang.BetweenPunctuation`` when the language module defines
	        one, otherwise the imported default class.
	        """
	        if hasattr(self.lang, "BetweenPunctuation"):
	            return self.lang.BetweenPunctuation(txt)
	        return BetweenPunctuation(txt)

	    def between_punctuation(self, txt):
	        """Return ``txt`` after the between-punctuation replacement."""
	        return self.between_punctuation_processor(txt).replace()

	    def sentence_boundary_punctuation(self, txt):
	        """Split ``txt`` with ``lang.SENTENCE_BOUNDARY_REGEX``.

	        The optional language rules ``ReplaceColonBetweenNumbersRule`` and
	        ``ReplaceNonSentenceBoundaryCommaRule`` are applied first when the
	        language module defines them.

	        Returns
	        -------
	        list
	            The matched substrings, in order of appearance.
	        """
	        if hasattr(self.lang, 'ReplaceColonBetweenNumbersRule'):
	            txt = Text(txt).apply(
	                self.lang.ReplaceColonBetweenNumbersRule)
	        if hasattr(self.lang, 'ReplaceNonSentenceBoundaryCommaRule'):
	            txt = Text(txt).apply(
	                self.lang.ReplaceNonSentenceBoundaryCommaRule)
	        # retain exclamation mark if it is an ending character of a given text
	        txt = re.sub(r'&ᓴ&$', '!', txt)
	        return [
	            m.group()
	            for m in re.finditer(self.lang.SENTENCE_BOUNDARY_REGEX, txt)
	        ]