Test_Pipeline_v7

Sleeping

App Files Files Community

fruitpicker01 committed on Sep 21, 2024

Commit

7f1566f

verified ·

1 Parent(s): ce89bcb

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -2

app.py CHANGED Viewed

@@ -457,6 +457,7 @@ def update_download_link():
 def correct_dash_usage(text):
     # Step 1: Replace any dash with long dash if surrounded by spaces
     text = re.sub(r'\s[-–—]\s', ' — ', text)
@@ -466,12 +467,44 @@ def correct_dash_usage(text):
     # Step 3: Replace any dash with hyphen if surrounded by letters or a combination of letters and digits
     text = re.sub(r'(?<=[a-zA-Zа-яА-Я0-9])[-–—](?=[a-zA-Zа-яА-Я0-9])', '-', text)
-    # Кавычки "лапки" (они же "птички") тут же до кучи заменим на кавычки-елочки («...»)
     text = re.sub(r'"([^\"]+)"', r'«\1»', text)
-    # И тут же за компанию поменяем 100к на 100 000
     text = re.sub(r'(\d+)[kкКK]', r'\1 000', text, flags=re.IGNORECASE)
     return text

 def correct_dash_usage(text):
+    morph = pymorphy2.MorphAnalyzer()
     # Step 1: Replace any dash with long dash if surrounded by spaces
     text = re.sub(r'\s[-–—]\s', ' — ', text)
     # Step 3: Replace any dash with hyphen if surrounded by letters or a combination of letters and digits
     text = re.sub(r'(?<=[a-zA-Zа-яА-Я0-9])[-–—](?=[a-zA-Zа-яА-Я0-9])', '-', text)
+    # Step 4: Replace quotation marks "..." with «...»
     text = re.sub(r'"([^\"]+)"', r'«\1»', text)
+    # Step 5: Remove all «» quotes when the opening and closing counts differ (an unpaired quote is present)
+    if text.count('«') != text.count('»'):
+        text = text.replace('«', '').replace('»', '')
+    # Step 6: Replace 100k with 100 000
     text = re.sub(r'(\d+)[kкКK]', r'\1 000', text, flags=re.IGNORECASE)
+    # Step 7: Remove first sentence if it contains greetings and is less than 5 words
+    greeting_patterns = [
+        r"привет\b", r"здравствуй", r"добрый\s(день|вечер|утро)",
+        r"дорогой\b", r"уважаемый\b", r"дорогая\b", r"уважаемая\b",
+        r"господин\b", r"госпожа\b", r"друг\b", r"коллега\b",
+        r"товарищ\b", r"приятель\b", r"подруга\b"
+    ]
+    def is_greeting_sentence(sentence):
+        words = sentence.split()
+        if len(words) < 5:  # Check if sentence is less than 5 words
+            for word in words:
+                parsed = morph.parse(word.lower())[0]  # Parse the word to get its base form
+                for pattern in greeting_patterns:
+                    if re.search(pattern, parsed.normal_form):
+                        return True
+        return False
+    # Split text into sentences
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    # Check the first sentence for greetings and remove it if necessary
+    if sentences and is_greeting_sentence(sentences[0]):
+        sentences = sentences[1:]
+    # Join the sentences back
+    text = ' '.join(sentences)
     return text