Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -457,6 +457,7 @@ def update_download_link():
|
|
457 |
|
458 |
|
459 |
def correct_dash_usage(text):
|
|
|
460 |
# Step 1: Replace any dash with long dash if surrounded by spaces
|
461 |
text = re.sub(r'\s[-–—]\s', ' — ', text)
|
462 |
|
@@ -466,12 +467,44 @@ def correct_dash_usage(text):
|
|
466 |
# Step 3: Replace any dash with hyphen if surrounded by letters or a combination of letters and digits
|
467 |
text = re.sub(r'(?<=[a-zA-Zа-яА-Я0-9])[-–—](?=[a-zA-Zа-яА-Я0-9])', '-', text)
|
468 |
|
469 |
-
#
|
470 |
text = re.sub(r'"([^\"]+)"', r'«\1»', text)
|
471 |
|
472 |
-
#
|
|
|
|
|
|
|
|
|
473 |
text = re.sub(r'(\d+)[kкКK]', r'\1 000', text, flags=re.IGNORECASE)
|
474 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
475 |
return text
|
476 |
|
477 |
|
|
|
457 |
|
458 |
|
459 |
def correct_dash_usage(text):
|
460 |
+
morph = pymorphy2.MorphAnalyzer()
|
461 |
# Step 1: Replace any dash with long dash if surrounded by spaces
|
462 |
text = re.sub(r'\s[-–—]\s', ' — ', text)
|
463 |
|
|
|
467 |
# Step 3: Replace any dash with hyphen if surrounded by letters or a combination of letters and digits
|
468 |
text = re.sub(r'(?<=[a-zA-Zа-яА-Я0-9])[-–—](?=[a-zA-Zа-яА-Я0-9])', '-', text)
|
469 |
|
470 |
+
# Step 4: Replace quotation marks "..." with «...»
|
471 |
text = re.sub(r'"([^\"]+)"', r'«\1»', text)
|
472 |
|
473 |
+
# Step 5: Remove all guillemets («, ») if their counts do not match (unbalanced quotes)
|
474 |
+
if text.count('«') != text.count('»'):
|
475 |
+
text = text.replace('«', '').replace('»', '')
|
476 |
+
|
477 |
+
# Step 6: Replace 100k with 100 000
|
478 |
text = re.sub(r'(\d+)[kкКK]', r'\1 000', text, flags=re.IGNORECASE)
|
479 |
|
480 |
+
# Step 7: Remove the first sentence if it contains a greeting and has fewer than 5 words
|
481 |
+
greeting_patterns = [
|
482 |
+
r"привет\b", r"здравствуй", r"добрый\s(день|вечер|утро)",
|
483 |
+
r"дорогой\b", r"уважаемый\b", r"дорогая\b", r"уважаемая\b",
|
484 |
+
r"господин\b", r"госпожа\b", r"друг\b", r"коллега\b",
|
485 |
+
r"товарищ\b", r"приятель\b", r"подруга\b"
|
486 |
+
]
|
487 |
+
|
488 |
+
def is_greeting_sentence(sentence):
|
489 |
+
words = sentence.split()
|
490 |
+
if len(words) < 5: # Check if sentence is less than 5 words
|
491 |
+
for word in words:
|
492 |
+
parsed = morph.parse(word.lower())[0] # Parse the word to get its base form
|
493 |
+
for pattern in greeting_patterns:
|
494 |
+
if re.search(pattern, parsed.normal_form):
|
495 |
+
return True
|
496 |
+
return False
|
497 |
+
|
498 |
+
# Split text into sentences
|
499 |
+
sentences = re.split(r'(?<=[.!?])\s+', text)
|
500 |
+
|
501 |
+
# Check the first sentence for greetings and remove it if necessary
|
502 |
+
if sentences and is_greeting_sentence(sentences[0]):
|
503 |
+
sentences = sentences[1:]
|
504 |
+
|
505 |
+
# Join the sentences back
|
506 |
+
text = ' '.join(sentences)
|
507 |
+
|
508 |
return text
|
509 |
|
510 |
|