Spaces:
Running
Running
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import re | |
import pysbd | |
class Rule(object): | |
def __init__(self, pattern, replacement): | |
self.pattern = pattern | |
self.replacement = replacement | |
def __repr__(self): # pragma: no cover | |
return '<{} pattern="{}" and replacement="{}">'.format( | |
self.__class__.__name__, self.pattern, self.replacement) | |
class Text(str): | |
"""Extending str functionality to apply regex rules | |
https://stackoverflow.com/questions/4698493/can-i-add-custom-methods-attributes-to-built-in-python-types | |
Parameters | |
---------- | |
str : str | |
string content | |
Returns | |
------- | |
str | |
input as it is if rule pattern doesnt match | |
else replacing found pattern with replacement chars | |
""" | |
def apply(self, *rules): | |
for each_r in rules: | |
self = re.sub(each_r.pattern, each_r.replacement, self) | |
return self | |
class TextSpan(object): | |
def __init__(self, sent, start, end): | |
""" | |
Sentence text and its start & end character offsets within original text | |
Parameters | |
---------- | |
sent : str | |
Sentence text | |
start : int | |
start character offset of a sentence in original text | |
end : int | |
end character offset of a sentence in original text | |
""" | |
self.sent = sent | |
self.start = start | |
self.end = end | |
def __repr__(self): # pragma: no cover | |
return "{0}(sent={1}, start={2}, end={3})".format( | |
self.__class__.__name__, repr(self.sent), self.start, self.end) | |
def __eq__(self, other): | |
if isinstance(self, other.__class__): | |
return self.sent == other.sent and self.start == other.start and self.end == other.end | |
class PySBDFactory(object): | |
"""pysbd as a spacy component through entrypoints""" | |
def __init__(self, nlp, language='en'): | |
self.nlp = nlp | |
self.seg = pysbd.Segmenter(language=language, clean=False, | |
char_span=True) | |
def __call__(self, doc): | |
sents_char_spans = self.seg.segment(doc.text_with_ws) | |
start_token_ids = [sent.start for sent in sents_char_spans] | |
for token in doc: | |
token.is_sent_start = (True if token.idx | |
in start_token_ids else False) | |
return doc | |