Spaces:
Running
Running
File size: 1,540 Bytes
d093ea4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# -*- coding: utf-8 -*-
import re
from pysbd.utils import Rule, Text
class EscapeRegexReservedCharacters(object):
    """Rules that put a backslash in front of parentheses, brackets and dashes.

    The rules are handed to ``Text.apply`` by ``replace_punctuation`` before
    it swaps punctuation for placeholder markers.
    """

    # Rule(pattern, replacement): the pattern matches the bare character and
    # the replacement is that character preceded by one backslash.
    # NOTE(review): Rule is defined in pysbd.utils; presumably applied as a
    # regex substitution -- confirm there.
    LeftParen = Rule(r"\(", "\\(")
    RightParen = Rule(r"\)", "\\)")
    LeftBracket = Rule(r"\[", "\\[")
    RightBracket = Rule(r"\]", "\\]")
    Dash = Rule(r"\-", "\\-")

    # All escape rules, in declaration order.
    All = [
        LeftParen,
        RightParen,
        LeftBracket,
        RightBracket,
        Dash,
    ]
class SubEscapedRegexReservedCharacters(object):
    """Rules that strip the backslash from escaped parentheses, brackets, dashes.

    Each pattern matches a literal backslash followed by one of ``( ) [ ] -``
    and the replacement is the bare character, i.e. the inverse of the rules
    in ``EscapeRegexReservedCharacters``. ``replace_punctuation`` applies
    them as its final step.
    """

    # NOTE(review): Rule is defined in pysbd.utils; presumably applied as a
    # regex substitution -- confirm there.
    SubLeftParen = Rule(r"\\\(", "(")
    SubRightParen = Rule(r"\\\)", ")")
    SubLeftBracket = Rule(r"\\\[", "[")
    SubRightBracket = Rule(r"\\\]", "]")
    SubDash = Rule(r"\\\-", "-")

    # All un-escape rules, in declaration order.
    All = [
        SubLeftParen,
        SubRightParen,
        SubLeftBracket,
        SubRightBracket,
        SubDash,
    ]
def replace_punctuation(match, match_type=None):
    """Replace punctuation inside a regex match with placeholder markers.

    The matched text is first run through the escape rules in
    ``EscapeRegexReservedCharacters``, then each punctuation character is
    swapped for its marker, and finally the escapes are removed again with
    ``SubEscapedRegexReservedCharacters``.

    Args:
        match: Object whose ``group()`` returns the matched text (for
            example an ``re.Match``).
        match_type: When equal to ``'single'`` apostrophes are left as they
            are; for any other value (including the default ``None``) they
            are replaced by ``&⎋&``.

    Returns:
        Whatever ``Text.apply`` returns for the rewritten text (presumably a
        string -- confirm in ``pysbd.utils``).

    NOTE(review): the previous version listed the ASCII patterns for ``.``,
    ``!`` and ``?`` twice, so the second substitution of each pair could
    never match and the markers ``&ᓱ&``, ``&ᓴ&`` and ``&ᓸ&`` were never
    emitted. The duplicates are assumed to have been the fullwidth forms
    (U+FF0E, U+FF01, U+FF1F) folded to ASCII by Unicode normalization;
    they are restored below. As a consequence ASCII ``!`` now maps to
    ``&ᓴ&`` instead of ``&ᓳ&`` -- confirm against the rules that turn the
    markers back into punctuation.
    """
    # (pattern, marker) pairs, applied in order. The fullwidth characters are
    # written as escapes so they survive normalization of this source file.
    substitutions = (
        (r'\.', '∯'),
        (r'։', '⍟'),  # ADDED FOR ARMENIAN
        (r'\。', '&ᓰ&'),
        ('\uff0e', '&ᓱ&'),  # fullwidth full stop
        ('\uff01', '&ᓳ&'),  # fullwidth exclamation mark
        (r'\!', '&ᓴ&'),
        (r'\?', '&ᓷ&'),
        ('\uff1f', '&ᓸ&'),  # fullwidth question mark
    )

    text = Text(match.group()).apply(*EscapeRegexReservedCharacters.All)
    for pattern, marker in substitutions:
        text = re.sub(pattern, marker, text)

    if match_type != 'single':
        text = re.sub(r"'", '&⎋&', text)

    return Text(text).apply(*SubEscapedRegexReservedCharacters.All)
|