Spaces:
Running
Running
File size: 1,388 Bytes
d093ea4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# -*- coding: utf-8 -*-
import re
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.lang.common import Common, Standard
from pysbd.punctuation_replacer import replace_punctuation
class Chinese(Common, Standard):
    """Chinese (``zh``) language definition built on ``Common`` and ``Standard``.

    The nested classes deliberately reuse the names of the imported base
    classes they extend; inside each ``class`` statement the base name still
    resolves to the module-level import.
    NOTE(review): presumably the package looks these nested classes up by
    name on the language class -- confirm before renaming either of them.
    """

    iso_code = 'zh'

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer variant with no sentence starters configured."""

        SENTENCE_STARTERS = []

    class BetweenPunctuation(BetweenPunctuation):
        """Rewrites punctuation found inside ``《…》`` and ``「…」`` spans.

        Every matched span is handed to ``replace_punctuation`` and the result
        is stored back on ``self.text``.
        """

        def __init__(self, text):
            """Initialise the base class with ``text``."""
            super().__init__(text)

        def replace(self):
            """Run the bracket/quote substitutions and return the updated text."""
            self.sub_punctuation_between_quotes_and_parens()
            return self.text

        def sub_punctuation_between_double_angled_quotation_marks(self):
            """Apply ``replace_punctuation`` to every ``《…》`` span in the text."""
            # Atomic-group emulation: the lookahead captures the *whole* run of
            # items in ``tmp`` and the backreference then consumes it.  The
            # repetition must live inside the capture; with the ``*`` outside
            # the group only the last repetition is kept, so spans containing
            # a backslash escape (or nothing at all) could never match.
            BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX = (
                r"《(?=(?P<tmp>(?:[^》\\]+|\\{2}|\\.)*))(?P=tmp)》"
            )
            self.text = re.sub(
                BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX,
                replace_punctuation,
                self.text,
            )

        def sub_punctuation_between_l_bracket(self):
            """Apply ``replace_punctuation`` to every ``「…」`` span in the text."""
            # Same atomic-group emulation as for ``《…》``: the quantifier sits
            # inside the named capture so the backreference covers the full
            # bracket content rather than only its final fragment.
            BETWEEN_L_BRACKET_REGEX = (
                r"「(?=(?P<tmp>(?:[^」\\]+|\\{2}|\\.)*))(?P=tmp)」"
            )
            self.text = re.sub(
                BETWEEN_L_BRACKET_REGEX,
                replace_punctuation,
                self.text,
            )

        def sub_punctuation_between_quotes_and_parens(self):
            """Process ``《…》`` spans first, then ``「…」`` spans."""
            self.sub_punctuation_between_double_angled_quotation_marks()
            self.sub_punctuation_between_l_bracket()
|