Upload pipeline.py

pipeline.py  CHANGED  (+241 -12)
@@ -4,12 +4,168 @@ from transformers.tokenization_utils_base import TruncationStrategy
 from torch import Tensor
 import html.parser
 import unicodedata
-import sys, os
-
+import sys, os
+import re
+from tqdm.auto import tqdm
+import operator
+
+
+def basic_tokenise(string):
+    # separate punctuation
+    for char in r',.;?!:)("…-':
+        string = re.sub('(?<! )' + re.escape(char) + '+', ' ' + char, string)
+    for char in '\'"’':
+        string = re.sub(char + '(?! )' , char + ' ', string)
+    return string.strip()
+
+def homogenise(sent):
+    sent = sent.lower()
+    # sent = sent.replace("oe", "œ").replace("OE", "Œ")
+    replace_from = "ǽǣáàâäąãăåćčçďéèêëęěğìíîĩĭıïĺľłńñňòóôõöøŕřśšşťţùúûũüǔỳýŷÿźẑżžÁÀÂÄĄÃĂÅĆČÇĎÉÈÊËĘĚĞÌÍÎĨĬİÏĹĽŁŃÑŇÒÓÔÕÖØŔŘŚŠŞŤŢÙÚÛŨÜǓỲÝŶŸŹẐŻŽſ"
+    replace_into = "ææaaaaaaaacccdeeeeeegiiiiiiilllnnnoooooorrsssttuuuuuuyyyyzzzzAAAAAAAACCCDEEEEEEGIIIIIIILLLNNNOOOOOORRSSSTTUUUUUUYYYYZZZZs"
+    table = sent.maketrans(replace_from, replace_into)
+    return sent.translate(table)
+
+######## Edit distance functions #######
+def _wedit_dist_init(len1, len2):
+    lev = []
+    for i in range(len1):
+        lev.append([0] * len2)  # initialize 2D array to zero
+    for i in range(len1):
+        lev[i][0] = i  # column 0: 0,1,2,3,4,...
+    for j in range(len2):
+        lev[0][j] = j  # row 0: 0,1,2,3,4,...
+    return lev
+
+
+def _wedit_dist_step(
+    lev, i, j, s1, s2, last_left, last_right, transpositions=False
+):
+    c1 = s1[i - 1]
+    c2 = s2[j - 1]
+
+    # skipping a character in s1
+    a = lev[i - 1][j] + _wedit_dist_deletion_cost(c1,c2)
+    # skipping a character in s2
+    b = lev[i][j - 1] + _wedit_dist_insertion_cost(c1,c2)
+    # substitution
+    c = lev[i - 1][j - 1] + (_wedit_dist_substitution_cost(c1, c2) if c1 != c2 else 0)
+
+    # pick the cheapest
+    lev[i][j] = min(a, b, c)#, d)
+
+def _wedit_dist_backtrace(lev):
+    i, j = len(lev) - 1, len(lev[0]) - 1
+    alignment = [(i, j, lev[i][j])]
+
+    while (i, j) != (0, 0):
+        directions = [
+            (i - 1, j),  # skip s1
+            (i, j - 1),  # skip s2
+            (i - 1, j - 1),  # substitution
+        ]
+
+        direction_costs = (
+            (lev[i][j] if (i >= 0 and j >= 0) else float("inf"), (i, j))
+            for i, j in directions
+        )
+        _, (i, j) = min(direction_costs, key=operator.itemgetter(0))
+
+        alignment.append((i, j, lev[i][j]))
+    return list(reversed(alignment))
+
+def _wedit_dist_substitution_cost(c1, c2):
+    if c1 == ' ' and c2 != ' ':
+        return 1000000
+    if c2 == ' ' and c1 != ' ':
+        return 30
+    for c in ",.;-!?'":
+        if c1 == c and c2 != c:
+            return 20
+        if c2 == c and c1 != c:
+            return 20
+    return 1
+
+def _wedit_dist_deletion_cost(c1, c2):
+    if c1 == ' ':
+        return 2
+    if c2 == ' ':
+        return 1000000
+    return 0.8
+
+def _wedit_dist_insertion_cost(c1, c2):
+    if c1 == ' ':
+        return 1000000
+    if c2 == ' ':
+        return 2
+    return 0.8
+
+def wedit_distance_align(s1, s2):
+    """
+    Calculate the minimum Levenshtein edit-distance based alignment
+    mapping between two strings. The alignment finds the mapping
+    from string s1 to s2 that minimizes the edit distance cost.
+    For example, mapping "rain" to "shine" would involve 2
+    substitutions, 2 matches and an insertion resulting in
+    the following mapping:
+    [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)]
+    NB: (0, 0) is the start state without any letters associated
+    See more: https://web.stanford.edu/class/cs124/lec/med.pdf
+    In case of multiple valid minimum-distance alignments, the
+    backtrace has the following operation precedence:
+    1. Skip s1 character
+    2. Skip s2 character
+    3. Substitute s1 and s2 characters
+    The backtrace is carried out in reverse string order.
+    This function does not support transposition.
+    :param s1, s2: The strings to be aligned
+    :type s1: str
+    :type s2: str
+    :rtype: List[Tuple(int, int)]
+    """
+    # set up a 2-D array
+    len1 = len(s1)
+    len2 = len(s2)
+    lev = _wedit_dist_init(len1 + 1, len2 + 1)
+
+    # iterate over the array
+    for i in range(len1):
+        for j in range(len2):
+            _wedit_dist_step(
+                lev,
+                i + 1,
+                j + 1,
+                s1,
+                s2,
+                0,
+                0,
+                transpositions=False,
+            )
+
+    # backtrace to find alignment
+    alignment = _wedit_dist_backtrace(lev)
+    return alignment
+
+def space_after(idx, sent):
+    if idx < len(sent) -1 and sent[idx + 1] == ' ':
+        return True
+    return False
+
+def space_before(idx, sent):
+    if idx > 0 and sent[idx - 1] == ' ':
+        return True
+    return False
+
+######## Normaliation pipeline #########
 class NormalisationPipeline(Pipeline):
 
-    def __init__(self, beam_size=5, batch_size=32, **kwargs):
+    def __init__(self, beam_size=5, batch_size=32, tokenise_func=None, **kwargs):
         self.beam_size = beam_size
+        # classic tokeniser function (used for alignments)
+        if tokenise_func is not None:
+            self.classic_tokenise = tokenise_func
+        else:
+            self.classic_tokenise = basic_tokenise
         super().__init__(**kwargs)
 
 
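The helper functions added in this hunk can be exercised in isolation. A minimal sketch follows, assuming this pipeline.py is importable as a module named pipeline; the example strings and the values in comments are illustrative only, not verified outputs:

    from pipeline import homogenise, wedit_distance_align

    old_form = "Ils eſtoient"   # hypothetical early-modern spelling
    new_form = "Ils étaient"    # hypothetical modernised spelling
    # homogenise() lowercases and maps accented letters and long s to plain forms
    print(homogenise(old_form))   # "ils estoient"
    print(homogenise(new_form))   # "ils etaient"
    # wedit_distance_align() returns backpointers (i, j, cumulative_cost),
    # running from (0, 0, 0) to (len(s1), len(s2), total_weighted_cost)
    backpointers = wedit_distance_align(homogenise(old_form), homogenise(new_form))
    print(backpointers[0], backpointers[-1])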
@@ -141,15 +297,81 @@
         """
 
         result = super().__call__(*args, **kwargs)
-        if (
-            isinstance(args[0], list)
+        if (isinstance(args[0], list)
             and all(isinstance(el, str) for el in args[0])
-            and all(len(res) == 1 for res in result)
-
-
-
+            and all(len(res) == 1 for res in result)):
+            output = []
+            for i in range(len(result)):
+                input_sent, pred_sent = args[0][i].strip(), result[i][0]['text'].strip()
+                alignment = self.align(input_sent, pred_sent)
+                char_spans = self.get_char_idx_align(input_sent, pred_sent, alignment)
+                output.append({'text': result[i][0]['text'], 'alignment': char_spans})
+            return output
+
+        else:
+            return [{'text': result, 'alignment': self.align(args, result[0]['text'].strip())}]
+
+    def align(self, sent_ref, sent_pred):
+        backpointers = wedit_distance_align(homogenise(self.classic_tokenise(re.sub('[ ]', ' ', sent_ref))),
+                                            homogenise(self.classic_tokenise(re.sub('[ ]', ' ', sent_pred))))
+        alignment, current_word, seen1, seen2, last_weight = [], ['', ''], [], [], 0
 
+        print(homogenise(sent_ref), homogenise(sent_pred))
+        for i_ref, i_pred, weight in backpointers:
+            if i_ref == 0 and i_pred == 0:
+                continue
+            # spaces in both, add straight away
+            if i_ref <= len(sent_ref) and sent_ref[i_ref-1] == ' ' and \
+               i_pred <= len(sent_pred) and sent_pred[i_pred-1] == ' ':
+                alignment.append((current_word[0].strip(), current_word[1].strip(), weight-last_weight))
+                last_weight = weight
+                current_word = ['', '']
+                seen1.append(i_ref)
+                seen2.append(i_pred)
+            else:
+                end_space = '░'
+                if i_ref <= len(sent_ref) and i_ref not in seen1:
+                    if i_ref > 0:
+                        current_word[0] += sent_ref[i_ref-1]
+                    seen1.append(i_ref)
+                if i_pred <= len(sent_pred) and i_pred not in seen2:
+                    if i_pred > 0:
+                        current_word[1] += sent_pred[i_pred-1] if sent_pred[i_pred-1] != ' ' else '▁'
+                        end_space = '' if space_after(i_pred, sent_pred) else '░'
+                    seen2.append(i_pred)
+                if i_ref <= len(sent_ref) and sent_ref[i_ref-1] == ' ' and current_word[0].strip() != '':
+                    alignment.append((current_word[0].strip(), current_word[1].strip() + end_space, weight-last_weight))
+                    last_weight = weight
+                    current_word = ['', '']
+        # final word
+        alignment.append((current_word[0].strip(), current_word[1].strip(), weight-last_weight))
+        # check that both strings are entirely covered
+        recovered1 = re.sub(' +', ' ', ' '.join([x[0] for x in alignment]))
+        recovered2 = re.sub(' +', ' ', ' '.join([x[1] for x in alignment]))
 
+        assert recovered1 == re.sub(' +', ' ', sent_ref), \
+            '\n1: ' + re.sub(' +', ' ', recovered1) + "\n1: " + re.sub(' +', ' ', sent_ref)
+        assert re.sub('[░▁ ]+', '', recovered2) == re.sub('[▁ ]+', '', sent_pred), \
+            '\n2: ' + re.sub(' +', ' ', recovered2) + "\n2: " + re.sub(' +', ' ', sent_pred)
+        return alignment
+
+
+    def get_char_idx_align(self, sent_ref, sent_pred, alignment):
+        covered_ref, covered_pred = 0, 0
+        ref_chars = [i for i, character in enumerate(sent_ref) if character not in [' ']]
+        pred_chars = [i for i, character in enumerate(sent_pred) if character not in [' ']]
+        align_idx = []
+        for a_ref, a_pred, _ in alignment:
+            if a_ref == '' and a_pred == '':
+                continue
+            a_pred = re.sub('[░▁ ]+', '', a_pred).strip()
+            span_ref = [ref_chars[covered_ref], ref_chars[covered_ref + len(a_ref) - 1]]
+            covered_ref += len(a_ref)
+            span_pred = [pred_chars[covered_pred], pred_chars[covered_pred + max(0, len(a_pred) - 1)]]
+            covered_pred += max(0, len(a_pred))
+            align_idx.append((span_ref, span_pred))
+        return align_idx
+
 def normalise_text(list_sents, batch_size=32, beam_size=5):
     tokeniser = AutoTokenizer.from_pretrained("rbawden/modern_french_normalisation", use_auth_token=True)
     model = AutoModelForSeq2SeqLM.from_pretrained("rbawden/modern_french_normalisation", use_auth_token=True)
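For list inputs, the __call__ method in this hunk returns one dictionary per sentence, with the raw prediction under 'text' and the character-span pairs computed by get_char_idx_align under 'alignment' (each span is an inclusive [start, end] index pair into the corresponding string, skipping spaces). A small sketch of consuming that structure, using a hypothetical output value for illustration:

    src = "Ils eſtoient"
    # hypothetical result of normalisation_pipeline([src])
    outputs = [{'text': 'Ils étaient',
                'alignment': [([0, 2], [0, 2]), ([4, 11], [4, 10])]}]
    for out in outputs:
        for (b_start, b_end), (a_start, a_end) in out['alignment']:
            print(src[b_start:b_end + 1], '->', out['text'][a_start:a_end + 1])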
@@ -169,10 +391,17 @@ def normalise_from_stdin(batch_size=32, beam_size=5):
                                                    beam_size=beam_size)
     list_sents = []
     for sent in sys.stdin:
-        list_sents.append(sent)
+        list_sents.append(sent.strip())
     normalised_outputs = normalisation_pipeline(list_sents)
-    for sent in normalised_outputs:
-
+    for s, sent in enumerate(normalised_outputs):
+        alignment=sent['alignment']
+        print(list_sents[s], len(list_sents[s]))
+        print(sent['text'], len(sent['text']))
+        print(sent['alignment'])
+        #for b, a in alignment:
+        #    print('input: [' + ''.join([list_sents[s][x] for x in range(b[0], b[1]+1)]) + ']')
+        #    print('pred: [' + ''.join([sent['text'][x] for x in range(a[0], a[1]+1)]) + ']')
+
     return normalised_outputs
 
 
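Taken together, normalise_text and normalise_from_stdin are the entry points around the pipeline class. A minimal usage sketch, assuming this file is importable as pipeline and that normalise_text returns the {'text', 'alignment'} dictionaries produced by the pipeline call (the model id and use_auth_token=True follow the calls shown above; the input sentences are illustrative):

    from pipeline import normalise_text

    sents = ["Ils eſtoient fort contens .", "Elle eſt arriuée hier ."]
    outputs = normalise_text(sents, batch_size=2, beam_size=5)
    for out in outputs:
        print(out['text'])
        print(out['alignment'])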