Spaces:

fruitpicker01
/

Test_Pipeline_v10

Sleeping

App Files Files Community

fruitpicker01 committed on Dec 25, 2024

Commit

3656ed8

verified ·

1 Parent(s): 41df0c6

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -0

app.py CHANGED Viewed

@@ -200,6 +200,91 @@ def call_model(model_prompt):
     return completion.choices[0].message.content.strip()
 def correct_dash_usage(text):
     # Old side of the diff: the function is a no-op that returns ``text`` unchanged.
     return text
 def clean_message(message):

     return completion.choices[0].message.content.strip()
 def correct_dash_usage(text):
+    morph = pymorphy3.MorphAnalyzer()
+    text = re.sub(r'\s[-–—]\s', ' — ', text)
+    text = re.sub(r'(?<=\d)[-–—](?=\d)', '–', text)
+    text = re.sub(r'(?<=[a-zA-Zа-яА-Я0-9])[-–—](?=[a-zA-Zа-яА-Я0-9])', '-', text)
+    text = re.sub(r'"([^\"]+)"', r'«\1»', text)
+    if text.count('"') == 1:
+        text = text.replace('"', '')
+    if (text.startswith('"') and text.endswith('"')) or (text.startswith('«') and text.endswith('»')):
+        text = text[1:-1].strip()
+    text = re.sub(r'(\d+)[kкКK]', r'\1 000', text, flags=re.IGNORECASE)
+    greeting_patterns = [
+        r"привет\b", r"здравствуй", r"добрый\s(день|вечер|утро)",
+        r"дорогой\b", r"уважаемый\b", r"дорогая\b", r"уважаемая\b",
+        r"господин\b", r"госпожа\b", r"друг\b", r"коллега\b",
+        r"товарищ\b", r"приятель\b", r"подруга\b"
+    ]
+    def is_greeting_sentence(sentence):
+        words = sentence.split()
+        if len(words) < 5:
+            for word in words:
+                parsed = morph.parse(word.lower())[0]
+                for pattern in greeting_patterns:
+                    if re.search(pattern, parsed.normal_form):
+                        return True
+        return False
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    if sentences and is_greeting_sentence(sentences[0]):
+        sentences = sentences[1:]
+    text = ' '.join(sentences)
+    def restore_yo(text):
+        morph = pymorphy3.MorphAnalyzer()
+        words = text.split()
+        restored_words = []
+        for word in words:
+            if word.isupper():
+                restored_words.append(word)
+                continue
+            if word.lower() == "все":
+                restored_words.append(word)
+                continue
+            parsed = morph.parse(word)[0]
+            restored_word = parsed.word
+            if word and word[0].isupper():
+                restored_word = restored_word.capitalize()
+            restored_words.append(restored_word)
+        return ' '.join(restored_words)
+    text = restore_yo(text)
+    text = re.sub(r'\bИп\b', 'ИП', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bОоо\b', 'ООО', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bРф\b', 'РФ', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bпользовуйтесь\b', 'пользуйтесь', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bею\b', 'ей', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bповышьте\b', 'повысьте', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bСбербизнес\b', 'СберБизнес', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bСбербизнеса\b', 'СберБизнес', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bСбербизнесе\b', 'СберБизнес', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bСбербанк\b', 'СберБанк', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bвашего ООО\b', 'вашей компании', text, flags=re.IGNORECASE)
+    text = re.sub(r'\b0₽\b', '0 р', text, flags=re.IGNORECASE)
+    text = re.sub(r'\b₽\b', 'р', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bруб\.(?=\W|$)', 'р', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bруб(?:ля|лей)\b', 'р', text, flags=re.IGNORECASE)
+    text = re.sub(r'(\d+)\s+тысяч(?:а|и)?(?:\s+рублей)?', r'\1 000 р', text, flags=re.IGNORECASE)
+    text = re.sub(r'(\d+)\s*тыс\.\s*руб\.', r'\1 000 р', text, flags=re.IGNORECASE)
+    text = re.sub(r'(\d+)\s*тыс\.\s*р\.', r'\1 000 р', text, flags=re.IGNORECASE)
+    text = re.sub(r'(\d+)\s*тыс\.\s*р', r'\1 000 р', text, flags=re.IGNORECASE)
+    text = re.sub(r'(\d+)\s+миллиона\b|\bмиллионов\b', r'\1 млн', text, flags=re.IGNORECASE)
+    text = re.sub(r'(\d+)\s*млн\s*руб\.', r'\1 млн р', text, flags=re.IGNORECASE)
+    text = re.sub(r'(\d+)\s*р\b', r'\1 р', text)
+    def remove_specific_sentences(text):
+        sentences = re.split(r'(?<=[.!?])\s+', text)
+        filtered_sentences = [
+            sentence for sentence in sentences
+            if not re.search(r'\bникаких\s+(посещений|визитов)\b', sentence, re.IGNORECASE)
+        ]
+        return ' '.join(filtered_sentences)
+    text = re.sub(r'\b(\d+)\s+000\s+000\s*р\b', r'\1 млн р', text, flags=re.IGNORECASE)
+    text = re.sub(r' р р ', r' р ', text, flags=re.IGNORECASE)
+    text = remove_specific_sentences(text)
     return text
 def clean_message(message):