fruitpicker01 committed on
Commit
7f1566f
·
verified ·
1 Parent(s): ce89bcb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -2
app.py CHANGED
@@ -457,6 +457,7 @@ def update_download_link():
457
 
458
 
459
  def correct_dash_usage(text):
 
460
  # Step 1: Replace any dash with long dash if surrounded by spaces
461
  text = re.sub(r'\s[-–—]\s', ' — ', text)
462
 
@@ -466,12 +467,44 @@ def correct_dash_usage(text):
466
  # Step 3: Replace any dash with hyphen if surrounded by letters or a combination of letters and digits
467
  text = re.sub(r'(?<=[a-zA-Zа-яА-Я0-9])[-–—](?=[a-zA-Zа-яА-Я0-9])', '-', text)
468
 
469
- # Кавычки "лапки" (они же "птички") тут же до кучи заменим на кавычки-елочки («...»)
470
  text = re.sub(r'"([^\"]+)"', r'«\1»', text)
471
 
472
- # И тут же за компанию поменяем 100к на 100 000
 
 
 
 
473
  text = re.sub(r'(\d+)[kкКK]', r'\1 000', text, flags=re.IGNORECASE)
474
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
475
  return text
476
 
477
 
 
457
 
458
 
459
  def correct_dash_usage(text):
460
+ morph = pymorphy2.MorphAnalyzer()
461
  # Step 1: Replace any dash with long dash if surrounded by spaces
462
  text = re.sub(r'\s[-–—]\s', ' — ', text)
463
 
 
467
  # Step 3: Replace any dash with hyphen if surrounded by letters or a combination of letters and digits
468
  text = re.sub(r'(?<=[a-zA-Zа-яА-Я0-9])[-–—](?=[a-zA-Zа-яА-Я0-9])', '-', text)
469
 
470
+ # Step 4: Replace quotation marks "..." with «...»
471
  text = re.sub(r'"([^\"]+)"', r'«\1»', text)
472
 
473
+ # Step 5: If the «» quotes are unbalanced, strip all of them
474
+ if text.count('«') != text.count('»'):
475
+ text = text.replace('«', '').replace('»', '')
476
+
477
+ # Step 6: Replace 100k with 100 000
478
  text = re.sub(r'(\d+)[kкКK]', r'\1 000', text, flags=re.IGNORECASE)
479
 
480
+ # Step 7: Remove first sentence if it contains greetings and is less than 5 words
481
+ greeting_patterns = [
482
+ r"привет\b", r"здравствуй", r"добрый\s(день|вечер|утро)",
483
+ r"дорогой\b", r"уважаемый\b", r"дорогая\b", r"уважаемая\b",
484
+ r"господин\b", r"госпожа\b", r"друг\b", r"коллега\b",
485
+ r"товарищ\b", r"приятель\b", r"подруга\b"
486
+ ]
487
+
488
+ def is_greeting_sentence(sentence):
489
+ words = sentence.split()
490
+ if len(words) < 5: # Check if sentence is less than 5 words
491
+ for word in words:
492
+ parsed = morph.parse(word.lower())[0] # Parse the word to get its base form
493
+ for pattern in greeting_patterns:
494
+ if re.search(pattern, parsed.normal_form):
495
+ return True
496
+ return False
497
+
498
+ # Split text into sentences
499
+ sentences = re.split(r'(?<=[.!?])\s+', text)
500
+
501
+ # Check the first sentence for greetings and remove it if necessary
502
+ if sentences and is_greeting_sentence(sentences[0]):
503
+ sentences = sentences[1:]
504
+
505
+ # Join the sentences back
506
+ text = ' '.join(sentences)
507
+
508
  return text
509
 
510