Spaces:
Running
Running
# -*- coding: utf-8 -*- | |
from pysbd.abbreviation_replacer import AbbreviationReplacer | |
from pysbd.lang.common import Common, Standard | |
from pysbd.between_punctuation import BetweenPunctuation | |
import re | |
from functools import partial | |
from pysbd.punctuation_replacer import replace_punctuation | |
class Armenian(Common, Standard):
    """Armenian ('hy') segmentation rules built on top of ``Common`` and ``Standard``.

    The class only declares patterns and two nested helper classes; the
    machinery that consumes them lives in the imported pysbd base classes.
    """

    iso_code = 'hy'

    # Alternation of sentence-shaped spans; the last alternative ends a
    # sentence at the Armenian full stop '։' unless a digit precedes it.
    # NOTE(review): as written, the first alternative is a plain capture group
    # over non-')' characters, and the punctuation classes list '.', '!' and
    # '?' twice. That looks like full-width characters were folded to ASCII
    # somewhere along the way -- confirm against the pattern this was derived
    # from before relying on it.
    SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[Ա-ՖA-Z])|「(?:[^」])*」(?=\s[Ա-ՖA-Z])|\((?:[^\)]){2,}\)(?=\s[Ա-ՖA-Z])|\'(?:[^\'])*[^,]\'(?=\s[Ա-ՖA-Z])|\"(?:[^\"])*[^,]\"(?=\s[Ա-ՖA-Z])|\“(?:[^\”])*[^,]\”(?=\s[Ա-ՖA-Z])|[。..!!?? ]{2,}|\S.*?[。..!!??ȸȹ☉☈☇☄]|[。..!!??]|.*?(?<!\d)[։]"
    # Alternative pattern, currently disabled:
    # SENTENCE_BOUNDARY_REGEX = r'((?:[^)])*)(?=\s?[Ա-ՖA-Z0-9])|.*?(?<!\d)[։]|.*?$'
    Punctuations = ['։']

    # A closing quote right after a terminator, one whitespace character, then
    # an upper-case Latin or Armenian letter.
    QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[.․։][\"\'“”»«]\s{1}[A-ZԱ-Ֆ]'
    # Same context expressed with lookarounds so that only the whitespace
    # character itself is matched.
    SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[.․։][\"\'“”»«])\s{1}(?=[A-ZԱ-Ֆ])'

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer for Armenian with an empty sentence-starter list."""

        SENTENCE_STARTERS = []

    class BetweenPunctuation(BetweenPunctuation):
        """Armenian-aware handling of punctuation inside quotes and brackets.

        Each ``*_REGEX`` / ``*_REGEX_2`` pair below describes the same
        delimited span twice: the first form uses atomic-group syntax
        ``(?>...)`` and is not referenced by the methods defined in this
        class; the ``_2`` form spells it with a lookahead plus a named
        backreference and is the one handed to ``re.sub``.
        """

        BETWEEN_SINGLE_QUOTES_ARMENIAN_REGEX = r"(?<=\s)'(?:[^']|'[ա-ֆԱ-Ֆ])*'"

        BETWEEN_SINGLE_QUOTE_SLANTED_ARMENIAN_REGEX = r"(?<=\s)‘(?:[^’]|’[ա-ֆԱ-Ֆ])*’"

        BETWEEN_DOUBLE_QUOTES_REGEX = r'"(?>[^"\\]+|\\{2}|\\.)*"'
        BETWEEN_DOUBLE_QUOTES_ARMENIAN_REGEX_2 = r'"(?=(?P<tmp>[^\"\\]+|\\{2}|\\.)*)(?P=tmp)"'

        # Rubular: http://rubular.com/r/x6s4PZK8jc
        BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX = r'«(?>[^»\\]+|\\{2}|\\.)*»'
        BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX_2 = r"\«(?=(?P<tmp>[^»\\]+|\\{2}|\\.)*)(?P=tmp)\»"

        # Rubular: http://rubular.com/r/JbAIpKdlSq
        BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX = r"“(?>[^”\\]+|\\{2}|\\.)*”"
        BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX_2 = r"\“(?=(?P<tmp>[^”\\]+|\\{2}|\\.)*)(?P=tmp)\”"

        # Rubular: http://rubular.com/r/WX4AvnZvlX
        BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX = r"\[(?>[^\]\\]+|\\{2}|\\.)*\]"
        BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX_2 = r'\[(?=(?P<tmp>[^\]\\]+|\\{2}|\\.)*)(?P=tmp)\]'

        # Rubular: http://rubular.com/r/6tTityPflI
        BETWEEN_PARENS_ARMENIAN_REGEX = r"\((?>[^\(\)\\]+|\\{2}|\\.)*\)"
        BETWEEN_PARENS_ARMENIAN_REGEX_2 = r"\((?=(?P<tmp>[^\(\)\\]+|\\{2}|\\.)*)(?P=tmp)\)"

        # Rubular: http://rubular.com/r/mXf8cW025o
        WORD_WITH_LEADING_APOSTROPHE_ARMENIAN = r"(?<=\s)'(?:[^']|'[ա-ֆԱ-Ֆ])*'\S"

        # Rubular: http://rubular.com/r/jTtDKfjxzr
        BETWEEN_EM_DASHES_REGEX_ARMENIAN = r"\-\-(?>[^\-\-])*\-\-"
        BETWEEN_EM_DASHES_REGEX_2_ARMENIAN = r"--(?=(?P<tmp>[^--]*))(?P=tmp)--"

        def __init__(self, text):
            """Delegate construction to the base class with the same ``text``."""
            super().__init__(text)

        def replace(self):
            """Run the inherited pass on ``self.text``, then the Armenian pass.

            Returns:
                The text after both passes.
            """
            # sub_punctuation_between_quotes_and_parens comes from the base
            # class and is not defined in this file.
            after_base_pass = self.sub_punctuation_between_quotes_and_parens(self.text)
            return self.sub_punctuation_between_quotes_and_parens_armenian(after_base_pass)

        def sub_punctuation_between_quotes_and_parens_armenian(self, txt):
            """Apply every Armenian delimiter pass to ``txt`` in a fixed order."""
            ordered_passes = (
                self.sub_punctuation_between_single_quotes_armenian,
                self.sub_punctuation_between_single_quote_slanted_armenian,
                self.sub_punctuation_between_double_quotes_armenian,
                self.sub_punctuation_between_square_brackets_armenian,
                self.sub_punctuation_between_parens_armenian,
                self.sub_punctuation_between_quotes_arrow_armenian,
                self.sub_punctuation_between_em_dashes_armenian,
                self.sub_punctuation_between_quotes_slanted_armenian,
            )
            for run_pass in ordered_passes:
                txt = run_pass(txt)
            return txt

        def sub_punctuation_between_single_quotes_armenian(self, txt):
            """Process spans wrapped in straight single quotes.

            ``txt`` is returned untouched when it contains a word with a
            leading apostrophe and no apostrophe followed by whitespace;
            otherwise each quoted span is passed to ``replace_punctuation``
            with ``match_type='single'``.
            """
            no_leading_apostrophe_word = (
                re.search(self.WORD_WITH_LEADING_APOSTROPHE_ARMENIAN, txt) is None
            )
            if no_leading_apostrophe_word or re.search(r"'\s", txt) is not None:
                single_quote_handler = partial(replace_punctuation, match_type='single')
                return re.sub(
                    self.BETWEEN_SINGLE_QUOTES_ARMENIAN_REGEX,
                    single_quote_handler,
                    txt,
                )
            return txt

        def sub_punctuation_between_single_quote_slanted_armenian(self, txt):
            """Pass every ‘…’ span in ``txt`` to ``replace_punctuation``."""
            pattern = self.BETWEEN_SINGLE_QUOTE_SLANTED_ARMENIAN_REGEX
            return re.sub(pattern, replace_punctuation, txt)

        def sub_punctuation_between_parens_armenian(self, txt):
            """Pass every (…) span in ``txt`` to ``replace_punctuation``."""
            pattern = self.BETWEEN_PARENS_ARMENIAN_REGEX_2
            return re.sub(pattern, replace_punctuation, txt)

        def sub_punctuation_between_square_brackets_armenian(self, txt):
            """Pass every […] span in ``txt`` to ``replace_punctuation``."""
            pattern = self.BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX_2
            return re.sub(pattern, replace_punctuation, txt)

        def sub_punctuation_between_double_quotes_armenian(self, txt):
            """Pass every "…" span in ``txt`` to ``replace_punctuation``."""
            pattern = self.BETWEEN_DOUBLE_QUOTES_ARMENIAN_REGEX_2
            return re.sub(pattern, replace_punctuation, txt)

        def sub_punctuation_between_quotes_arrow_armenian(self, txt):
            """Pass every «…» span in ``txt`` to ``replace_punctuation``."""
            pattern = self.BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX_2
            return re.sub(pattern, replace_punctuation, txt)

        def sub_punctuation_between_em_dashes_armenian(self, txt):
            """Pass every --…-- span in ``txt`` to ``replace_punctuation``."""
            pattern = self.BETWEEN_EM_DASHES_REGEX_2_ARMENIAN
            return re.sub(pattern, replace_punctuation, txt)

        def sub_punctuation_between_quotes_slanted_armenian(self, txt):
            """Pass every “…” span in ``txt`` to ``replace_punctuation``."""
            pattern = self.BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX_2
            return re.sub(pattern, replace_punctuation, txt)