Spaces:

fruitpicker01
/

Test_Pipeline_dev_2

Sleeping

App Files Files Community

fruitpicker01 committed on Oct 13, 2024

Commit

f8d1f68

verified ·

1 Parent(s): 8b77c1b

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -14

app.py CHANGED Viewed

@@ -515,21 +515,23 @@ def correct_dash_usage(text):
     # Join the sentences back
     text = ' '.join(sentences)
-    # Step 9: Replace common abbreviations and acronyms (Ип -> ИП, Ооо -> ООО, Рф -> РФ)
-    text = re.sub(r'\bИп\b', 'ИП', text, flags=re.IGNORECASE)
-    text = re.sub(r'\bОоо\b', 'ООО', text, flags=re.IGNORECASE)
-    text = re.sub(r'\bРф\b', 'РФ', text, flags=re.IGNORECASE)
-    # Step 10: Replace specific words (пользовуйтесь -> пользуйтесь, ею -> ей)
-    text = re.sub(r'\bпользовуйтесь\b', 'пользуйтесь', text, flags=re.IGNORECASE)
-    text = re.sub(r'\bею\b', 'ей', text, flags=re.IGNORECASE)
     def restore_yo(text):
         morph = pymorphy3.MorphAnalyzer()
         words = text.split()
         restored_words = []
         for word in words:
             parsed = morph.parse(word)[0]
             restored_word = parsed.word
@@ -543,6 +545,58 @@ def correct_dash_usage(text):
     text = restore_yo(text)
     return text
@@ -1568,11 +1622,11 @@ def personalize_and_save(
     # После завершения персонализации, сохраняем результаты
     if last_personalization_result:
-        checks_gigachat_pro = perform_checks(last_personalization_result[1])
-        checks_gigachat_lite = perform_checks(last_personalization_result[2])
-        checks_gigachat_plus = perform_checks(last_personalization_result[3])
-        checks_gpt4o = perform_checks(last_personalization_result[4])
-        checks_meta_llama_405b = perform_checks(last_personalization_result[5])
         # Форматирование результатов проверок
         formatted_checks = [

     # Join the sentences back
     text = ' '.join(sentences)
     def restore_yo(text):
         morph = pymorphy3.MorphAnalyzer()
         words = text.split()
         restored_words = []
         for word in words:
+            # Пропускать обработку, если слово полностью в верхнем регистре (аббревиатуры)
+            if word.isupper():
+                restored_words.append(word)
+                continue
+            # Пропускать обработку, если слово "все" (независимо от регистра)
+            if word.lower() == "все":
+                restored_words.append(word)
+                continue
+            # Обработка остальных слов
             parsed = morph.parse(word)[0]
             restored_word = parsed.word
     text = restore_yo(text)
+    # Step 9: Replace common abbreviations and acronyms (Ип -> ИП, Ооо -> ООО, Рф -> РФ)
+    text = re.sub(r'\bИп\b', 'ИП', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bОоо\b', 'ООО', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bРф\b', 'РФ', text, flags=re.IGNORECASE)
+    # Step 10: Replace specific words (пользовуйтесь -> пользуйтесь, ею -> ей)
+    text = re.sub(r'\bпользовуйтесь\b', 'пользуйтесь', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bею\b', 'ей', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bповышьте\b', 'повысьте', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bСбербизнес\b', 'СберБизнес', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bСбербизнеса\b', 'СберБизнес', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bСбербизнесе\b', 'СберБизнес', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bСбербанк\b', 'СберБанк', text, flags=re.IGNORECASE)
+    text = re.sub(r'\bвашего ООО\b', 'вашей компании', text, flags=re.IGNORECASE)
+    text = re.sub(r'\b0₽\b', '0 р', text, flags=re.IGNORECASE)
+    text = re.sub(r'\b₽\b', 'р', text, flags=re.IGNORECASE)
+    # Step 11: Replace all forms of "рублей", "рубля", "руб." with "р"
+    # Используем два отдельных регулярных выражения для точности
+    # 1. Заменяем "руб." на "р", учитывая, что "руб." может быть перед символом "/" или другим несловесным символом
+    text = re.sub(r'\bруб\.(?=\W|$)', 'р', text, flags=re.IGNORECASE)
+    # 2. Заменяем "рубля" и "рублей" на "р"
+    text = re.sub(r'\bруб(?:ля|лей)\b', 'р', text, flags=re.IGNORECASE)
+    # Step 12: Replace thousands and millions with appropriate abbreviations
+    text = re.sub(r'(\d+)\s+тысяч(?:а|и)?(?:\s+рублей)?', r'\1 000 р', text, flags=re.IGNORECASE)
+    text = re.sub(r'(\d+)\s*тыс\.\s*руб\.', r'\1 000 р', text, flags=re.IGNORECASE)
+    text = re.sub(r'(\d+)\s*тыс\.\s*р\.', r'\1 000 р', text, flags=re.IGNORECASE)
+    text = re.sub(r'(\d+)\s*тыс\.\s*р', r'\1 000 р', text, flags=re.IGNORECASE)
+    # Replace millions with "млн"
+    text = re.sub(r'(\d+)\s+миллиона\b|\bмиллионов\b', r'\1 млн', text, flags=re.IGNORECASE)
+    text = re.sub(r'(\d+)\s*млн\s*руб\.', r'\1 млн р', text, flags=re.IGNORECASE)
+    # Ensure space formatting around currency abbreviations
+    text = re.sub(r'(\d+)\s*р\b', r'\1 р', text)
+    # Step 13: Remove sentences containing "никаких посещений" or "никаких визитов"
+    def remove_specific_sentences(text):
+        sentences = re.split(r'(?<=[.!?])\s+', text)  # Разбиваем текст на предложения
+        filtered_sentences = [
+            sentence for sentence in sentences
+            if not re.search(r'\bникаких\s+(посещений|визитов)\b', sentence, flags=re.IGNORECASE)
+        ]
+        return ' '.join(filtered_sentences)
+    # Шаг 14: Замена чисел вида "5 000 000 р" на "5 млн р"
+    text = re.sub(r'\b(\d+)\s+000\s+000\s*р\b', r'\1 млн р', text, flags=re.IGNORECASE)
+    text = remove_specific_sentences(text)
     return text
     # После завершения персонализации, сохраняем результаты
     if last_personalization_result:
+        checks_gigachat_pro = perform_checks(last_personalization_result[1], description, key_message)
+        checks_gigachat_lite = perform_checks(last_personalization_result[2], description, key_message)
+        checks_gigachat_plus = perform_checks(last_personalization_result[3], description, key_message)
+        checks_gpt4o = perform_checks(last_personalization_result[4], description, key_message)
+        checks_meta_llama_405b = perform_checks(last_personalization_result[5], description, key_message)
         # Форматирование результатов проверок
         formatted_checks = [