Eason Lu committed
Commit 1312451 · Parent: de46850

spell check: can now check phrases, with reworked replace logic

Files changed (2):
  1. SRT.py      +37 -70
  2. pipeline.py  +1 -30
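
In short, the reworked spell check slides one- and two-word windows over each segment, compares each window against the term dictionary with Levenshtein distance (phrases against phrases, single words against single words), and only substitutes when the distance falls below a length-scaled threshold. Below is a minimal standalone sketch of that idea, not the committed code; the terms list, the sentence, and the 0.3 factor are made-up examples for illustration.

# Minimal sketch of the phrase-aware fuzzy matching idea (illustrative only;
# the dictionary entries and the 0.3 factor are assumptions, not project data).

def levenshtein(a: str, b: str) -> int:
    # plain dynamic-programming edit distance
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                  # deletion
                           cur[j - 1] + 1,               # insertion
                           prev[j - 1] + (ca != cb)))    # substitution
        prev = cur
    return prev[-1]

def extract_chunks(sentence: str, n: int) -> list:
    # all 1..n word windows of the sentence, joined back into strings
    words = sentence.split()
    return [" ".join(words[i:i + j])
            for j in range(1, n + 1)
            for i in range(len(words) - j + 1)]

def best_match(chunk: str, terms: list, factor: float = 0.3):
    # closest term, accepted only if its distance is under factor * len(chunk);
    # phrases are compared with phrases and single words with single words,
    # mirroring the " " check added to fetchfunc
    candidates = [t for t in terms if (" " in t) == (" " in chunk)]
    if not candidates:
        return None
    best = min(candidates, key=lambda t: levenshtein(chunk, t))
    return best if levenshtein(chunk, best) < factor * len(chunk) else None

terms = ["gradient descent", "overfitting"]   # assumed dictionary entries
sentence = "the model avoids overfiting with gradiant descent"
for chunk in extract_chunks(sentence, 2):
    hit = best_match(chunk, terms)
    if hit and hit != chunk:
        print(f"{chunk!r} -> {hit!r}")
# 'overfiting' -> 'overfitting'
# 'gradiant descent' -> 'gradient descent'
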
SRT.py CHANGED

@@ -5,6 +5,7 @@ from csv import reader
 from datetime import timedelta
 import logging
 import openai
+from tqdm import tqdm
 
 
 class SRT_segment(object):
@@ -411,89 +412,55 @@ class SRT_script():
 
     def fetchfunc(self,word,threshold):
         import enchant
-        result = word;
+        result = word
+        distance = 0
         threshold = threshold*len(word)
         if len(self.comp_dict)==0:
-            with open("./finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
-                self.comp_dict = {rows[0]: rows[1] for rows in reader(f)}
+            with open("./finetune_data/dict_freq.txt", 'r', encoding='utf-8') as f:
+                self.comp_dict = {rows[0]: 1 for rows in reader(f)}
         temp = ""
         for matched in self.comp_dict:
-            if enchant.utils.levenshtein(word, matched)<enchant.utils.levenshtein(word, temp):
-                temp = matched
+            if (" " in matched and " " in word) or (" " not in matched and " " not in word):
+                if enchant.utils.levenshtein(word, matched)<enchant.utils.levenshtein(word, temp):
+                    temp = matched
         if enchant.utils.levenshtein(word, temp) < threshold:
+            distance = enchant.utils.levenshtein(word, temp)
             result = temp
-        return result
+        return distance, result
+
+    def extract_words(self, sentence, n):
+        # this function split the sentence to chunks by n of words
+        # e.g. sentence: "this is a sentence", n = 2
+        # result: ["this", "is", "a", "sentence", "this is", "is a", "a sentence"]
+        words = sentence.split()
+        res = []
+        for j in range(1, n+1):
+            res += [words[i:i+j] for i in range(len(words)-j+1)]
+        return res
 
     def spell_check_term(self):
-        ## known bug: I've will be replaced because i've is not in the dict
         logging.info("performing spell check")
         import enchant
         dict = enchant.Dict('en_US')
         term_spellDict = enchant.PyPWL('./finetune_data/dict_freq.txt')
 
-        for seg in self.segments:
-            ready_words = seg.source_text.split(" ")
+        for seg in tqdm(self.segments):
+            ready_words = self.extract_words(seg.source_text, 2)
             for i in range(len(ready_words)):
-                word = ready_words[i]
-                [real_word, pos] = self.get_real_word(word)
-                if not dict.check(word[:pos]) and not term_spellDict.check(real_word):
-                    new_word = word.replace(word[:pos],self.fetchfunc(word[:pos],0.5))
-
-                    logging.info(real_word + "\t" + self.fetchfunc(word[:pos],0.5) + "\t" + str(enchant.utils.levenshtein(real_word, self.fetchfunc(word[:pos],0.5)))+'\n')
-
-                    #suggest = term_spellDict.suggest(real_word)
-                    #if suggest and enchant.utils.levenshtein(real_word, suggest[0]) < (len(real_word)+len(suggest[0]))/4: # relax spell check
-
-                    # with open("dislog.log","a") as log:
-                    #     if not os.path.exists("dislog.log"):
-                    #         log.write("word \t suggest \t levenshtein \n")
-                    #     logging.info(word + "\t" + suggest[0] + "\t" + str(enchant.utils.levenshtein(word, suggest[0]))+'\n')
-                    #     #print(word + ":" + suggest[0] + ":---:levenshtein:" + str(enchant.utils.levenshtein(word, suggest[0])))
-                    #     new_word = word.replace(word[:pos],suggest[0])
-                    #else:
-                    #    new_word = word
-                else:
-                    new_word = word
-                ready_words[i] = new_word
-            seg.source_text = " ".join(ready_words)
-        pass
-
-    def spell_correction(self, word: str, arg: int):
-        try:
-            arg in [0, 1]
-        except ValueError:
-            print('only 0 or 1 for argument')
-
-        def uncover(word: str):
-            if word[-2:] == ".\n":
-                real_word = word[:-2].lower()
-                n = -2
-            elif word[-1:] in [".", "\n", ",", "!", "?"]:
-                real_word = word[:-1].lower()
-                n = -1
-            else:
-                real_word = word.lower()
-                n = 0
-            return real_word, len(word) + n
-
-        real_word = uncover(word)[0]
-        pos = uncover(word)[1]
-        new_word = word
-        if arg == 0: # term translate mode
-            with open("finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
-                term_enzh_dict = {rows[0]: rows[1] for rows in reader(f)}
-            if real_word in term_enzh_dict:
-                new_word = word.replace(word[:pos], term_enzh_dict.get(real_word))
-        elif arg == 1: # term spell check mode
-            import enchant
-            dict = enchant.Dict('en_US')
-            term_spellDict = enchant.PyPWL('./finetune_data/dict_freq.txt')
-            if not dict.check(real_word):
-                if term_spellDict.suggest(real_word): # relax spell check
-                    new_word = word.replace(word[:pos], term_spellDict.suggest(real_word)[0])
-        return new_word
-
-    def get_real_word(self, word: str):
+                word_list = ready_words[i]
+                word, real_word, pos = self.get_real_word(word_list)
+                if not dict.check(real_word) and not term_spellDict.check(real_word):
+                    distance, correct_term = self.fetchfunc(real_word, 0.3)
+                    if distance != 0:
+                        seg.source_text = re.sub(word[:pos], correct_term, seg.source_text, flags=re.IGNORECASE)
+                        logging.info("replace: " + word[:pos] + " to " + correct_term + "\t distance = " + str(distance))
+
+
+    def get_real_word(self, word_list:list):
+        word = ""
+        for w in word_list:
+            word += f"{w} "
+        word = word[:-1]
         if word[-2:] == ".\n":
             real_word = word[:-2].lower()
             n = -2
@@ -503,7 +470,7 @@ class SRT_script():
         else:
            real_word = word.lower()
            n = 0
-        return real_word, len(word) + n
+        return word, real_word, len(word) + n
 
     ## WRITE AND READ FUNCTIONS ##
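
A quick check of the new extract_words helper in isolation (copied verbatim from the hunk above) shows that it returns per-window word lists rather than joined strings, which is why get_real_word now accepts a word_list and joins it with spaces before stripping trailing punctuation:

# extract_words as committed above, run standalone to show its actual output
def extract_words(sentence, n):
    words = sentence.split()
    res = []
    for j in range(1, n + 1):
        res += [words[i:i + j] for i in range(len(words) - j + 1)]
    return res

print(extract_words("this is a sentence", 2))
# [['this'], ['is'], ['a'], ['sentence'], ['this', 'is'], ['is', 'a'], ['a', 'sentence']]
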
pipeline.py CHANGED

@@ -157,35 +157,6 @@ def script_split(script_in, chunk_size = 1000):
     assert len(script_arr) == len(range_arr)
     return script_arr, range_arr
 
-    # check whether previous translation is done
-    # zh_file = "{}/{}/{}_zh.srt".format(RESULT_PATH, VIDEO_NAME, VIDEO_NAME)
-    # segidx = 1
-    # if os.path.exists(zh_file):
-    #     temp_file = "{}/{}/temp.srt".format(RESULT_PATH, VIDEO_NAME)
-    #     if os.path.exists(temp_file):
-    #         os.remove(temp_file)
-    #     with open(zh_file, "r") as f0:
-    #         for count, _ in enumerate(f0):
-    #             pass
-    #     count += 1
-    #     segidx = int(count/4)+1
-    #     en_file = "{}/{}/{}_en.srt".format(RESULT_PATH, VIDEO_NAME, VIDEO_NAME)
-    #     if args.srt_file is not None:
-    #         en_file = args.srt_file
-    #     with open(en_file, "r") as f1, open(temp_file, "a") as f2:
-    #         x = f1.readlines()
-    #         #print(len(x))
-    #         if count >= len(x):
-    #             print('Work already done! Please delete {}_zh.srt files in result directory first in order to rework'.format(VIDEO_NAME))
-    #             exit()
-    #         for i, line in enumerate(x):
-    #             if i >= count:
-    #                 f2.write(line)
-
-    # srt = SRT_script.parse_from_srt_file(temp_file)
-    # print('temp_contents')
-    # print(srt.get_source_only())
-
 def check_translation(sentence, translation):
     """
     check merge sentence issue from openai translation
@@ -342,7 +313,7 @@ def main():
     logging.info("---------------------Start Preprocessing SRT class---------------------")
     srt.write_srt_file_src(srt_file_en)
     srt.form_whole_sentence()
-    # srt.spell_check_term()
+    srt.spell_check_term()
     srt.correct_with_force_term()
     processed_srt_file_en = srt_file_en.split('.srt')[0] + '_processed.srt'
     srt.write_srt_file_src(processed_srt_file_en)
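
For reference, the preprocessing stage of main() now runs in this order (an annotated excerpt of the diff context above, not standalone code):

# preprocessing order in main() after this commit (annotated excerpt)
srt.write_srt_file_src(srt_file_en)       # dump the raw English source SRT
srt.form_whole_sentence()                 # merge segments into complete sentences
srt.spell_check_term()                    # re-enabled here: phrase-aware spell check
srt.correct_with_force_term()             # apply forced terminology replacements
processed_srt_file_en = srt_file_en.split('.srt')[0] + '_processed.srt'
srt.write_srt_file_src(processed_srt_file_en)   # write the preprocessed SRT
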