Spaces:

AriNubar
/

hyw-en-demo-v2

Running

Ari Nubar Boyacıoğlu

add files

d093ea4 6 months ago

2.39 kB

	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	import re
	import pysbd

	class Rule(object):
	    """A substitution rule: a regex pattern paired with its replacement.

	    Instances are plain value holders; ``Text.apply`` reads ``pattern`` and
	    ``replacement`` and hands them to ``re.sub``.

	    Parameters
	    ----------
	    pattern
	        Pattern to search for (anything ``re.sub`` accepts as a pattern).
	    replacement
	        Replacement used for each match of ``pattern``.
	    """

	    def __init__(self, pattern, replacement):
	        self.pattern = pattern
	        self.replacement = replacement

	    def __repr__(self):  # pragma: no cover
	        return '<{} pattern="{}" and replacement="{}">'.format(
	            self.__class__.__name__, self.pattern, self.replacement)


	class Text(str):
	    """``str`` subclass that can run a sequence of regex rules over itself.

	    https://stackoverflow.com/questions/4698493/can-i-add-custom-methods-attributes-to-built-in-python-types
	    """

	    def apply(self, *rules):
	        """Apply each rule in order and return the resulting text.

	        Parameters
	        ----------
	        *rules
	            Objects exposing ``pattern`` and ``replacement`` attributes.
	            Each rule operates on the output of the previous one.

	        Returns
	        -------
	        str
	            The input as it is when no rule pattern matches, otherwise the
	            text with every found pattern replaced. With no rules at all
	            the instance itself is returned.
	        """
	        # Accumulate in a local instead of rebinding ``self``.
	        result = self
	        for rule in rules:
	            result = re.sub(rule.pattern, rule.replacement, result)
	        return result


	class TextSpan(object):
	    """Sentence text and its start & end character offsets within original text."""

	    def __init__(self, sent, start, end):
	        """
	        Parameters
	        ----------
	        sent : str
	            Sentence text
	        start : int
	            start character offset of a sentence in original text
	        end : int
	            end character offset of a sentence in original text
	        """
	        self.sent = sent
	        self.start = start
	        self.end = end

	    def __repr__(self):  # pragma: no cover
	        return "{0}(sent={1}, start={2}, end={3})".format(
	            self.__class__.__name__, repr(self.sent), self.start, self.end)

	    def __eq__(self, other):
	        """Return True when ``other`` is a TextSpan with equal sent/start/end.

	        Returns ``NotImplemented`` for any other type so Python falls back
	        to its default comparison (which evaluates to ``False``) instead of
	        reading attributes the other object may not have.
	        """
	        if not isinstance(other, TextSpan):
	            return NotImplemented
	        return (self.sent == other.sent
	                and self.start == other.start
	                and self.end == other.end)


	class PySBDFactory(object):
	    """pysbd as a spacy component through entrypoints"""

	    def __init__(self, nlp, language='en'):
	        """Store the pipeline object and build the pysbd segmenter.

	        Parameters
	        ----------
	        nlp
	            Pipeline object this component is attached to; only stored.
	        language : str
	            Language code passed through to ``pysbd.Segmenter``.
	        """
	        self.nlp = nlp
	        # char_span=True: __call__ below reads ``.start`` from each
	        # segment the segmenter returns.
	        self.seg = pysbd.Segmenter(language=language, clean=False,
	                                   char_span=True)

	    def __call__(self, doc):
	        """Mark sentence starts on ``doc`` and return it.

	        Segments ``doc.text_with_ws``, then sets ``is_sent_start`` on every
	        token to whether its ``idx`` equals the ``start`` of some segment.

	        Parameters
	        ----------
	        doc
	            Iterable of tokens exposing ``text_with_ws``; presumably a
	            spaCy ``Doc`` (confirm against the pipeline registration).

	        Returns
	        -------
	        The same ``doc`` object, modified in place.
	        """
	        sents_char_spans = self.seg.segment(doc.text_with_ws)
	        # A set keeps the per-token membership test O(1).
	        sent_start_offsets = {sent.start for sent in sents_char_spans}
	        for token in doc:
	            token.is_sent_start = token.idx in sent_start_offsets
	        return doc