Eason Lu committed · commit 1312451
Parent(s): de46850

spell check: now could check phrases and change replace logic

Files changed:
- SRT.py +37 -70
- pipeline.py +1 -30
SRT.py
CHANGED

@@ -5,6 +5,7 @@ from csv import reader
 from datetime import timedelta
 import logging
 import openai
+from tqdm import tqdm
 
 
 class SRT_segment(object):

@@ -411,89 +412,55 @@ class SRT_script():
 
     def fetchfunc(self,word,threshold):
         import enchant
-        result = word
+        result = word
+        distance = 0
         threshold = threshold*len(word)
         if len(self.comp_dict)==0:
-            with open("./finetune_data/…
-            self.comp_dict = {rows[0]:…
+            with open("./finetune_data/dict_freq.txt", 'r', encoding='utf-8') as f:
+                self.comp_dict = {rows[0]: 1 for rows in reader(f)}
         temp = ""
         for matched in self.comp_dict:
-            if …
-            …
+            if (" " in matched and " " in word) or (" " not in matched and " " not in word):
+                if enchant.utils.levenshtein(word, matched)<enchant.utils.levenshtein(word, temp):
+                    temp = matched
         if enchant.utils.levenshtein(word, temp) < threshold:
+            distance = enchant.utils.levenshtein(word, temp)
             result = temp
-        return result
+        return distance, result
+
+    def extract_words(self, sentence, n):
+        # this function split the sentence to chunks by n of words
+        # e.g. sentence: "this is a sentence", n = 2
+        # result: ["this", "is", "a", "sentence", "this is", "is a", "a sentence"]
+        words = sentence.split()
+        res = []
+        for j in range(1, n+1):
+            res += [words[i:i+j] for i in range(len(words)-j+1)]
+        return res
 
     def spell_check_term(self):
-        ## known bug: I've will be replaced because i've is not in the dict
         logging.info("performing spell check")
         import enchant
         dict = enchant.Dict('en_US')
         term_spellDict = enchant.PyPWL('./finetune_data/dict_freq.txt')
 
-        for seg in self.segments:
-            ready_words = seg.source_text…
+        for seg in tqdm(self.segments):
+            ready_words = self.extract_words(seg.source_text, 2)
             for i in range(len(ready_words)):
-                …
-                if not dict.check(…
-                …
-                # #print(word + ":" + suggest[0] + ":---:levenshtein:" + str(enchant.utils.levenshtein(word, suggest[0])))
-                # new_word = word.replace(word[:pos],suggest[0])
-                #else:
-                # new_word = word
-                else:
-                    new_word = word
-                ready_words[i] = new_word
-            seg.source_text = " ".join(ready_words)
-        pass
+                word_list = ready_words[i]
+                word, real_word, pos = self.get_real_word(word_list)
+                if not dict.check(real_word) and not term_spellDict.check(real_word):
+                    distance, correct_term = self.fetchfunc(real_word, 0.3)
+                    if distance != 0:
+                        seg.source_text = re.sub(word[:pos], correct_term, seg.source_text, flags=re.IGNORECASE)
+                        logging.info("replace: " + word[:pos] + " to " + correct_term + "\t distance = " + str(distance))
 
-    def spell_correction(self, word: str, arg: int):
-        try:
-            arg in [0, 1]
-        except ValueError:
-            print('only 0 or 1 for argument')
-
-        def uncover(word: str):
-            if word[-2:] == ".\n":
-                real_word = word[:-2].lower()
-                n = -2
-            elif word[-1:] in [".", "\n", ",", "!", "?"]:
-                real_word = word[:-1].lower()
-                n = -1
-            else:
-                real_word = word.lower()
-                n = 0
-            return real_word, len(word) + n
-
-        real_word = uncover(word)[0]
-        pos = uncover(word)[1]
-        new_word = word
-        if arg == 0: # term translate mode
-            with open("finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
-                term_enzh_dict = {rows[0]: rows[1] for rows in reader(f)}
-            if real_word in term_enzh_dict:
-                new_word = word.replace(word[:pos], term_enzh_dict.get(real_word))
-        elif arg == 1: # term spell check mode
-            import enchant
-            dict = enchant.Dict('en_US')
-            term_spellDict = enchant.PyPWL('./finetune_data/dict_freq.txt')
-            if not dict.check(real_word):
-                if term_spellDict.suggest(real_word): # relax spell check
-                    new_word = word.replace(word[:pos], term_spellDict.suggest(real_word)[0])
-        return new_word
-
-    def get_real_word(self, word: str):
+    def get_real_word(self, word_list:list):
+        word = ""
+        for w in word_list:
+            word += f"{w} "
+        word = word[:-1]
         if word[-2:] == ".\n":
             real_word = word[:-2].lower()
             n = -2

@@ -503,7 +470,7 @@ class SRT_script():
         else:
            real_word = word.lower()
            n = 0
-        return real_word, len(word) + n
+        return word, real_word, len(word) + n
 
     ## WRITE AND READ FUNCTIONS ##
 
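Review note: taken together, the new flow in SRT.py is: split each segment into chunks of one and two words (extract_words returns lists of words, which get_real_word joins back into a string and strips of trailing punctuation, despite the docstring example showing joined strings); skip chunks that either dictionary accepts; fuzzy-match the rest against the custom term list with a Levenshtein budget of 0.3 per character; and rewrite the segment text in place. Because fetchfunc only compares a chunk against terms of the same "phrase-ness" (both containing a space, or neither), two-word chunks can now be corrected to multi-word terms, which is what the commit message means by checking phrases. Below is a minimal standalone sketch of that flow, not the module itself: pyenchant is assumed installed, term_dict and the sample text are invented stand-ins for ./finetune_data/dict_freq.txt, and re.escape is added as a safety step the diff does not take (it passes the raw chunk to re.sub, so regex metacharacters could misfire).

import re
import enchant
from enchant.utils import levenshtein

# Hypothetical stand-in for ./finetune_data/dict_freq.txt
term_dict = ["Azure", "Kubernetes", "machine learning"]

def extract_words(sentence, n):
    # every run of 1..n consecutive words, as lists of words (mirrors the diff;
    # note the chunks are lists, not joined strings)
    words = sentence.split()
    res = []
    for j in range(1, n + 1):
        res += [words[i:i + j] for i in range(len(words) - j + 1)]
    return res

def fetch(word, threshold):
    # nearest term with the same "phrase-ness" (both multi-word or both
    # single-word); returns (distance, replacement), distance 0 = no match
    best = ""
    for term in term_dict:
        if (" " in term) == (" " in word):
            if not best or levenshtein(word, term) < levenshtein(word, best):
                best = term
    if best and levenshtein(word, best) < threshold * len(word):
        return levenshtein(word, best), best
    return 0, word

checker = enchant.Dict("en_US")
text = "We deploy on Kubernets with machne learning models."
for chunk in extract_words(text, 2):
    word = " ".join(chunk)
    real_word = word.rstrip(".,!?\n").lower()   # simplified get_real_word
    if not checker.check(real_word):
        distance, term = fetch(real_word, 0.3)
        if distance != 0:
            # the diff passes the raw chunk as the regex pattern;
            # re.escape is added here so metacharacters cannot misfire
            text = re.sub(re.escape(word.rstrip(".,!?\n")), term,
                          text, flags=re.IGNORECASE)
print(text)   # We deploy on Kubernetes with machine learning models.

Here "Kubernets" (distance 2 from "Kubernetes", budget 0.3 * 9 = 2.7) and the two-word chunk "machne learning" (distance 1 from "machine learning") are both corrected; "machne" alone is left untouched because no single-word term is close enough.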
pipeline.py
CHANGED

@@ -157,35 +157,6 @@ def script_split(script_in, chunk_size = 1000):
     assert len(script_arr) == len(range_arr)
     return script_arr, range_arr
 
-# check whether previous translation is done
-# zh_file = "{}/{}/{}_zh.srt".format(RESULT_PATH, VIDEO_NAME, VIDEO_NAME)
-# segidx = 1
-# if os.path.exists(zh_file):
-#     temp_file = "{}/{}/temp.srt".format(RESULT_PATH, VIDEO_NAME)
-#     if os.path.exists(temp_file):
-#         os.remove(temp_file)
-#     with open(zh_file, "r") as f0:
-#         for count, _ in enumerate(f0):
-#             pass
-#     count += 1
-#     segidx = int(count/4)+1
-#     en_file = "{}/{}/{}_en.srt".format(RESULT_PATH, VIDEO_NAME, VIDEO_NAME)
-#     if args.srt_file is not None:
-#         en_file = args.srt_file
-#     with open(en_file, "r") as f1, open(temp_file, "a") as f2:
-#         x = f1.readlines()
-#         #print(len(x))
-#         if count >= len(x):
-#             print('Work already done! Please delete {}_zh.srt files in result directory first in order to rework'.format(VIDEO_NAME))
-#             exit()
-#         for i, line in enumerate(x):
-#             if i >= count:
-#                 f2.write(line)
-
-# srt = SRT_script.parse_from_srt_file(temp_file)
-# print('temp_contents')
-# print(srt.get_source_only())
-
 def check_translation(sentence, translation):
     """
     check merge sentence issue from openai translation

@@ -342,7 +313,7 @@ def main():
     logging.info("---------------------Start Preprocessing SRT class---------------------")
     srt.write_srt_file_src(srt_file_en)
     srt.form_whole_sentence()
-
+    srt.spell_check_term()
     srt.correct_with_force_term()
     processed_srt_file_en = srt_file_en.split('.srt')[0] + '_processed.srt'
     srt.write_srt_file_src(processed_srt_file_en)
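Review note: on the pipeline side the substantive change is a single call: spell_check_term() now runs after form_whole_sentence() and before correct_with_force_term(), so fuzzy spelling fixes land before exact forced-term replacement; the rest of the hunk deletes dead commented-out resume logic. A hedged sketch of the new preprocessing order (the module name and file paths are assumptions; the method names come from the diff):

from SRT import SRT_script   # module name assumed from the file SRT.py

# The input and output paths are placeholders; parse_from_srt_file appears
# in the commented-out code removed above, so the constructor is real.
srt = SRT_script.parse_from_srt_file("results/demo/demo_en.srt")
srt.form_whole_sentence()        # merge subtitle fragments into sentences
srt.spell_check_term()           # new in this commit: fuzzy word/phrase fixes
srt.correct_with_force_term()    # then exact forced-term replacement
srt.write_srt_file_src("results/demo/demo_en_processed.srt")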