Remove unnecessary funcs and processing
sbert-punc-case-ru/sbertpunccase.py
@@ -8,7 +8,6 @@ import numpy as np
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 import re
 import string
-from typing import List, Optional
 
 
 TOKEN_RE = re.compile(r'-?\d*\.\d+|[a-zа-яё]+|-?[\d\+\(\)\-]+|\S', re.I)
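For intuition about the tokenizer regex: TOKEN_RE splits a sentence into words, numbers, and single punctuation characters, which is exactly what the simplified preprocessing below relies on. A quick REPL check (output shown as a comment):

import re

TOKEN_RE = re.compile(r'-?\d*\.\d+|[a-zа-яё]+|-?[\d\+\(\)\-]+|\S', re.I)

print(TOKEN_RE.findall('Привет, как дела?'))
# -> ['Привет', ',', 'как', 'дела', '?']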
@@ -84,49 +83,6 @@ def decode_label(label, classes='all'):
     return INVERSE_LABELS[label]
 
 
-def make_labeling(text: str):
-    # Split the sentence into words and punctuation marks
-    tokens = TOKEN_RE.findall(text)
-    # Preprocess the words: strip punctuation marks and assign labels
-
-    preprocessed_tokens = []
-    token_labels: List[List[str]] = []
-
-    # Strip any punctuation at the start of the sentence
-    while tokens[0] in string.punctuation:
-        tokens.pop(0)
-
-    for token in tokens:
-        if token in string.punctuation:
-            # If this punctuation mark is one we predict, update the previous word's label; otherwise ignore it
-            if token in PUNK_MAPPING:
-                token_labels[-1][1] = PUNK_MAPPING[token]
-        else:
-            # If this is a word, record its case label and add it to the preprocessed list in lower case
-            if sum(char.isupper() for char in token) > 1:
-                token_labels.append(['UPPER_TOTAL', 'O'])
-            elif token[0].isupper():
-                token_labels.append(['UPPER', 'O'])
-            else:
-                token_labels.append(['LOWER', 'O'])
-            preprocessed_tokens.append(token.lower())
-    token_labels_merged = ['_'.join(label) for label in token_labels]
-    token_labels_ids = [LABELS[label] for label in token_labels_merged]
-    return dict(words=preprocessed_tokens, labels=token_labels_merged, label_ids=token_labels_ids)
-
-
-def align_labels(label_ids: list[int], word_ids: list[Optional[int]]):
-    aligned_label_ids = []
-    previous_id = None
-    for word_id in word_ids:
-        if word_id is None or word_id == previous_id:
-            aligned_label_ids.append(LABELS['O'])
-        else:
-            aligned_label_ids.append(label_ids.pop(0))
-        previous_id = word_id
-    return aligned_label_ids
-
-
 MODEL_REPO = "kontur-ai/sbert-punc-case-ru"
 
 
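Context for the removal: both helpers served training-data preparation. make_labeling turned annotated text into lowercased words plus combined case-and-punctuation labels, and align_labels spread those word-level labels across subword tokens; inference needs neither. For reference, the alignment idea in isolation as a runnable sketch, with a toy LABELS stand-in (the real dict is defined earlier in this file):

from typing import List, Optional

LABELS = {'O': 0}  # toy stand-in for the real LABELS dict in sbertpunccase.py

def align_labels(label_ids: List[int], word_ids: List[Optional[int]]):
    # The first subword of each word keeps the word's label;
    # special tokens (None) and continuation subwords get 'O'.
    aligned_label_ids = []
    previous_id = None
    for word_id in word_ids:
        if word_id is None or word_id == previous_id:
            aligned_label_ids.append(LABELS['O'])
        else:
            aligned_label_ids.append(label_ids.pop(0))
        previous_id = word_id
    return aligned_label_ids

# Two words, the first split into two subwords, framed by [CLS]/[SEP]:
print(align_labels([5, 7], [None, 0, 0, 1, None]))  # -> [0, 5, 0, 7, 0]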
@@ -151,22 +107,18 @@ class SbertPuncCase(nn.Module):
     def punctuate(self, text):
         text = text.strip().lower()
 
-        # …
-        …
-        …
-        …
+        # Split the sentence into words and punctuation marks
+        tokens = TOKEN_RE.findall(text)
+        # Drop the punctuation marks
+        words = [token for token in tokens if token not in string.punctuation]
 
         tokenizer_output = self.tokenizer(words, is_split_into_words=True)
-        aligned_label_ids = [align_labels(label_ids, tokenizer_output.word_ids())]
-
-        result = dict(tokenizer_output)
-        result.update({'labels': aligned_label_ids})
 
-        if len(…
+        if len(tokenizer_output.input_ids) > 512:
             return ' '.join([self.punctuate(' '.join(text_part)) for text_part in np.array_split(words, 2)])
 
-        predictions = self(torch.tensor([…
-            torch.tensor([…
+        predictions = self(torch.tensor([tokenizer_output.input_ids], device=self.model.device),
+                           torch.tensor([tokenizer_output.attention_mask], device=self.model.device)).logits.cpu().data.numpy()
         predictions = np.argmax(predictions, axis=2)
 
         # decode punctuation and casing
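The length guard handles the encoder's 512-token limit: if the tokenized input is too long, the word list is halved with np.array_split and each half is punctuated recursively, so arbitrarily long inputs still work. A standalone illustration of the halving step (the sample words are throwaway):

import numpy as np

# Mirror the fallback: halve the word list, rejoin each half into a string.
words = ['раз', 'два', 'три', 'четыре', 'пять']
print([' '.join(part) for part in np.array_split(words, 2)])
# -> ['раз два три', 'четыре пять']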
@@ -183,7 +135,7 @@ class SbertPuncCase(nn.Module):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser("Punctuation and case restoration model sbert-punc-case-ru")
-    parser.add_argument("-i", "--input", type=str, help="text to restore", default='…
+    parser.add_argument("-i", "--input", type=str, help="text to restore", default='sbert punc case расставляет точки запятые и знаки вопроса вам нравится')
     parser.add_argument("-d", "--device", type=str, help="run model on cpu or gpu", choices=['cpu', 'cuda'], default='cpu')
     args = parser.parse_args()
     print(f"Source text: {args.input}\n")
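With the new default input, the script can be smoke-tested directly from the command line; a plausible invocation, assuming it is run from the sbert-punc-case-ru directory with the defaults:

python sbertpunccase.py -d cpu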