File size: 4,042 Bytes
d093ea4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# -*- coding: utf-8 -*-
import re
from pysbd.utils import Text


def replace_pre_number_abbr(txt, abbr):
    # prepend a space to avoid needing another regex for start of string
    txt = " " + txt
    txt = re.sub(r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(abbr=abbr.strip()), "∯", txt)
    # remove the prepended space
    txt = txt[1:]
    return txt


def replace_prepositive_abbr(txt, abbr):
    # prepend a space to avoid needing another regex for start of string
    txt = " " + txt
    txt = re.sub(r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(abbr=abbr.strip()), "∯", txt)
    # remove the prepended space
    txt = txt[1:]
    return txt


class AbbreviationReplacer(object):
    def __init__(self, text, lang):
        self.text = text
        self.lang = lang

    def replace(self):
        self.text = Text(self.text).apply(
            self.lang.PossessiveAbbreviationRule,
            self.lang.KommanditgesellschaftRule,
            *self.lang.SingleLetterAbbreviationRules.All
        )
        abbr_handled_text = ""
        for line in self.text.splitlines(True):
            abbr_handled_text += self.search_for_abbreviations_in_string(line)
        self.text = abbr_handled_text
        self.replace_multi_period_abbreviations()
        self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
        self.text = self.replace_abbreviation_as_sentence_boundary()
        return self.text

    def replace_abbreviation_as_sentence_boundary(self):
        sent_starters = "|".join((r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS))
        regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(sent_starters)
        self.text = re.sub(regex, '\\1.', self.text)
        return self.text

    def replace_multi_period_abbreviations(self):
        def mpa_replace(match):
            match = match.group()
            match = re.sub(re.escape(r"."), "∯", match)
            return match

        self.text = re.sub(
            self.lang.MULTI_PERIOD_ABBREVIATION_REGEX,
            mpa_replace,
            self.text,
            flags=re.IGNORECASE
        )

    def replace_period_of_abbr(self, txt, abbr):
        # prepend a space to avoid needing another regex for start of string
        txt = " " + txt
        txt = re.sub(
            r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))".format(
                abbr=re.escape(abbr.strip())
            ),
            "∯",
            txt,
        )
        # remove the prepended space
        txt = txt[1:]
        return txt


    def search_for_abbreviations_in_string(self, text):
        lowered = text.lower()
        for abbr in self.lang.Abbreviation.ABBREVIATIONS:
            stripped = abbr.strip()
            if stripped not in lowered:
                continue
            abbrev_match = re.findall(
                r"(?:^|\s|\r|\n){}".format(stripped), text, flags=re.IGNORECASE
            )
            if not abbrev_match:
                continue
            next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}"
            char_array = re.findall(next_word_start, text)
            for ind, match in enumerate(abbrev_match):
                text = self.scan_for_replacements(
                    text, match, ind, char_array
                )
        return text

    def scan_for_replacements(self, txt, am, ind, char_array):
        try:
            char = char_array[ind]
        except IndexError:
            char = ""
        prepositive = self.lang.Abbreviation.PREPOSITIVE_ABBREVIATIONS
        number_abbr = self.lang.Abbreviation.NUMBER_ABBREVIATIONS
        upper = str(char).isupper()
        if not upper or am.strip().lower() in prepositive:
            if am.strip().lower() in prepositive:
                txt = replace_prepositive_abbr(txt, am)
            elif am.strip().lower() in number_abbr:
                txt = replace_pre_number_abbr(txt, am)
            else:
                txt = self.replace_period_of_abbr(txt, am)
        return txt