Spaces:
Running
Running
File size: 2,390 Bytes
d093ea4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import pysbd
class Rule(object):
    """Pairing of a regex pattern with the text that replaces its matches.

    Parameters
    ----------
    pattern
        regular expression pattern to look for
    replacement
        substitution used wherever ``pattern`` matches
    """

    def __init__(self, pattern, replacement):
        self.pattern, self.replacement = pattern, replacement

    def __repr__(self):  # pragma: no cover
        template = '<{} pattern="{}" and replacement="{}">'
        return template.format(
            type(self).__name__, self.pattern, self.replacement)
class Text(str):
    """String subclass that can run regex substitution rules over itself.

    https://stackoverflow.com/questions/4698493/can-i-add-custom-methods-attributes-to-built-in-python-types

    Parameters
    ----------
    str : str
        string content

    Returns
    -------
    str
        the input unchanged when no rule pattern matches,
        otherwise the text with every match replaced
    """

    def apply(self, *rules):
        """Apply each rule in order, feeding every result into the next rule.

        Parameters
        ----------
        *rules
            objects exposing ``pattern`` and ``replacement`` attributes,
            passed to ``re.sub`` in the order given

        Returns
        -------
        str
            text after all substitutions; ``self`` when no rules are given
        """
        result = self
        for rule in rules:
            result = re.sub(rule.pattern, rule.replacement, result)
        return result
class TextSpan(object):
    """Sentence text together with its character offsets in the original text.

    Note: the class defines ``__eq__`` without ``__hash__``, so under
    Python 3 its instances are unhashable.
    """

    def __init__(self, sent, start, end):
        """
        Sentence text and its start & end character offsets within original text

        Parameters
        ----------
        sent : str
            Sentence text
        start : int
            start character offset of a sentence in original text
        end : int
            end character offset of a sentence in original text
        """
        self.sent = sent
        self.start = start
        self.end = end

    def __repr__(self):  # pragma: no cover
        return "{0}(sent={1}, start={2}, end={3})".format(
            self.__class__.__name__, repr(self.sent), self.start, self.end)

    def __eq__(self, other):
        """Compare two spans by sentence text, start offset and end offset.

        Returns
        -------
        bool or NotImplemented
            ``NotImplemented`` when ``other`` is not a ``TextSpan``, so that
            Python falls back to its default comparison (``==`` yields
            ``False``) rather than receiving ``None`` or hitting an
            ``AttributeError`` on a foreign object.
        """
        if not isinstance(other, TextSpan):
            return NotImplemented
        return (self.sent == other.sent
                and self.start == other.start
                and self.end == other.end)
class PySBDFactory(object):
    """pysbd as a spacy component through entrypoints

    Calling an instance on a doc segments the doc's text with pysbd and
    sets ``is_sent_start`` on every token accordingly.
    """

    def __init__(self, nlp, language='en'):
        """
        Parameters
        ----------
        nlp
            pipeline object; only stored on the instance
        language : str
            language code forwarded to ``pysbd.Segmenter`` (default ``'en'``)
        """
        self.nlp = nlp
        # clean=False / char_span=True: __call__ relies on each returned
        # segment exposing a ``start`` character offset.
        self.seg = pysbd.Segmenter(language=language, clean=False,
                                   char_span=True)

    def __call__(self, doc):
        """Mark sentence starts on the tokens of ``doc`` and return it.

        Parameters
        ----------
        doc
            object exposing ``text_with_ws`` and iterating over tokens that
            have an ``idx`` attribute (presumably a spaCy ``Doc`` -- confirm
            against the pipeline that registers this component)

        Returns
        -------
        the same ``doc`` object, with ``is_sent_start`` set on every token
        """
        sents_char_spans = self.seg.segment(doc.text_with_ws)
        # A set gives O(1) membership tests; a list would be rescanned
        # once per token.
        sent_start_offsets = {sent.start for sent in sents_char_spans}
        for token in doc:
            token.is_sent_start = token.idx in sent_start_offsets
        return doc
|