Eason Lu committed
Commit 1312451 · Parent: de46850

spell check: can now check phrases, with reworked replace logic

Files changed (2):
  1. SRT.py      +37 -70
  2. pipeline.py  +1 -30
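
In short, the reworked spell check slides one- and two-word windows over each segment, compares each window against the term dictionary with Levenshtein distance (phrases against phrases, single words against single words), and only substitutes when the distance falls below a length-scaled threshold. Below is a minimal standalone sketch of that idea, not the committed code; the terms list, the sentence, and the 0.3 factor are made-up examples for illustration.

# Minimal sketch of the phrase-aware fuzzy matching idea (illustrative only;
# the dictionary entries and the 0.3 factor are assumptions, not project data).

def levenshtein(a: str, b: str) -> int:
    # plain dynamic-programming edit distance
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                  # deletion
                           cur[j - 1] + 1,               # insertion
                           prev[j - 1] + (ca != cb)))    # substitution
        prev = cur
    return prev[-1]

def extract_chunks(sentence: str, n: int) -> list:
    # all 1..n word windows of the sentence, joined back into strings
    words = sentence.split()
    return [" ".join(words[i:i + j])
            for j in range(1, n + 1)
            for i in range(len(words) - j + 1)]

def best_match(chunk: str, terms: list, factor: float = 0.3):
    # closest term, accepted only if its distance is under factor * len(chunk);
    # phrases are compared with phrases and single words with single words,
    # mirroring the " " check added to fetchfunc
    candidates = [t for t in terms if (" " in t) == (" " in chunk)]
    if not candidates:
        return None
    best = min(candidates, key=lambda t: levenshtein(chunk, t))
    return best if levenshtein(chunk, best) < factor * len(chunk) else None

terms = ["gradient descent", "overfitting"]   # assumed dictionary entries
sentence = "the model avoids overfiting with gradiant descent"
for chunk in extract_chunks(sentence, 2):
    hit = best_match(chunk, terms)
    if hit and hit != chunk:
        print(f"{chunk!r} -> {hit!r}")
# 'overfiting' -> 'overfitting'
# 'gradiant descent' -> 'gradient descent'
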
SRT.py CHANGED

@@ -5,6 +5,7 @@ from csv import reader
 from datetime import timedelta
 import logging
 import openai
+from tqdm import tqdm
 
 
 class SRT_segment(object):
@@ -411,89 +412,55 @@ class SRT_script():
 
     def fetchfunc(self,word,threshold):
         import enchant
-        result = word;
+        result = word
+        distance = 0
         threshold = threshold*len(word)
         if len(self.comp_dict)==0:
-            with open("./finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
-                self.comp_dict = {rows[0]: rows[1] for rows in reader(f)}
+            with open("./finetune_data/dict_freq.txt", 'r', encoding='utf-8') as f:
+                self.comp_dict = {rows[0]: 1 for rows in reader(f)}
         temp = ""
         for matched in self.comp_dict:
-            if enchant.utils.levenshtein(word, matched)<enchant.utils.levenshtein(word, temp):
-                temp = matched
+            if (" " in matched and " " in word) or (" " not in matched and " " not in word):
+                if enchant.utils.levenshtein(word, matched)<enchant.utils.levenshtein(word, temp):
+                    temp = matched
         if enchant.utils.levenshtein(word, temp) < threshold:
+            distance = enchant.utils.levenshtein(word, temp)
             result = temp
-        return result
+        return distance, result
+
+    def extract_words(self, sentence, n):
+        # this function split the sentence to chunks by n of words
+        # e.g. sentence: "this is a sentence", n = 2
+        # result: ["this", "is", "a", "sentence", "this is", "is a", "a sentence"]
+        words = sentence.split()
+        res = []
+        for j in range(1, n+1):
+            res += [words[i:i+j] for i in range(len(words)-j+1)]
+        return res
 
     def spell_check_term(self):
-        ## known bug: I've will be replaced because i've is not in the dict
         logging.info("performing spell check")
         import enchant
         dict = enchant.Dict('en_US')
         term_spellDict = enchant.PyPWL('./finetune_data/dict_freq.txt')
 
-        for seg in self.segments:
-            ready_words = seg.source_text.split(" ")
+        for seg in tqdm(self.segments):
+            ready_words = self.extract_words(seg.source_text, 2)
             for i in range(len(ready_words)):
-                word = ready_words[i]
-                [real_word, pos] = self.get_real_word(word)
-                if not dict.check(word[:pos]) and not term_spellDict.check(real_word):
-                    new_word = word.replace(word[:pos],self.fetchfunc(word[:pos],0.5))
-
-                    logging.info(real_word + "\t" + self.fetchfunc(word[:pos],0.5) + "\t" + str(enchant.utils.levenshtein(real_word, self.fetchfunc(word[:pos],0.5)))+'\n')
-
-                    #suggest = term_spellDict.suggest(real_word)
-                    #if suggest and enchant.utils.levenshtein(real_word, suggest[0]) < (len(real_word)+len(suggest[0]))/4: # relax spell check
-
-                    # with open("dislog.log","a") as log:
-                    #     if not os.path.exists("dislog.log"):
-                    #         log.write("word \t suggest \t levenshtein \n")
-                    #     logging.info(word + "\t" + suggest[0] + "\t" + str(enchant.utils.levenshtein(word, suggest[0]))+'\n')
-                    #     #print(word + ":" + suggest[0] + ":---:levenshtein:" + str(enchant.utils.levenshtein(word, suggest[0])))
-                    #     new_word = word.replace(word[:pos],suggest[0])
-                    #else:
-                    #    new_word = word
-                else:
-                    new_word = word
-                ready_words[i] = new_word
-            seg.source_text = " ".join(ready_words)
-        pass
-
-    def spell_correction(self, word: str, arg: int):
-        try:
-            arg in [0, 1]
-        except ValueError:
-            print('only 0 or 1 for argument')
-
-        def uncover(word: str):
-            if word[-2:] == ".\n":
-                real_word = word[:-2].lower()
-                n = -2
-            elif word[-1:] in [".", "\n", ",", "!", "?"]:
-                real_word = word[:-1].lower()
-                n = -1
-            else:
-                real_word = word.lower()
-                n = 0
-            return real_word, len(word) + n
-
-        real_word = uncover(word)[0]
-        pos = uncover(word)[1]
-        new_word = word
-        if arg == 0: # term translate mode
-            with open("finetune_data/dict_enzh.csv", 'r', encoding='utf-8') as f:
-                term_enzh_dict = {rows[0]: rows[1] for rows in reader(f)}
-            if real_word in term_enzh_dict:
-                new_word = word.replace(word[:pos], term_enzh_dict.get(real_word))
-        elif arg == 1: # term spell check mode
-            import enchant
-            dict = enchant.Dict('en_US')
-            term_spellDict = enchant.PyPWL('./finetune_data/dict_freq.txt')
-            if not dict.check(real_word):
-                if term_spellDict.suggest(real_word): # relax spell check
-                    new_word = word.replace(word[:pos], term_spellDict.suggest(real_word)[0])
-        return new_word
-
-    def get_real_word(self, word: str):
+                word_list = ready_words[i]
+                word, real_word, pos = self.get_real_word(word_list)
+                if not dict.check(real_word) and not term_spellDict.check(real_word):
+                    distance, correct_term = self.fetchfunc(real_word, 0.3)
+                    if distance != 0:
+                        seg.source_text = re.sub(word[:pos], correct_term, seg.source_text, flags=re.IGNORECASE)
+                        logging.info("replace: " + word[:pos] + " to " + correct_term + "\t distance = " + str(distance))
+
+
+    def get_real_word(self, word_list:list):
+        word = ""
+        for w in word_list:
+            word += f"{w} "
+        word = word[:-1]
         if word[-2:] == ".\n":
             real_word = word[:-2].lower()
             n = -2
@@ -503,7 +470,7 @@ class SRT_script():
         else:
            real_word = word.lower()
            n = 0
-        return real_word, len(word) + n
+        return word, real_word, len(word) + n
 
     ## WRITE AND READ FUNCTIONS ##
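
A quick check of the new extract_words helper in isolation (copied verbatim from the hunk above) shows that it returns per-window word lists rather than joined strings, which is why get_real_word now accepts a word_list and joins it with spaces before stripping trailing punctuation:

# extract_words as committed above, run standalone to show its actual output
def extract_words(sentence, n):
    words = sentence.split()
    res = []
    for j in range(1, n + 1):
        res += [words[i:i + j] for i in range(len(words) - j + 1)]
    return res

print(extract_words("this is a sentence", 2))
# [['this'], ['is'], ['a'], ['sentence'], ['this', 'is'], ['is', 'a'], ['a', 'sentence']]
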
pipeline.py CHANGED

@@ -157,35 +157,6 @@ def script_split(script_in, chunk_size = 1000):
     assert len(script_arr) == len(range_arr)
     return script_arr, range_arr
 
-    # check whether previous translation is done
-    # zh_file = "{}/{}/{}_zh.srt".format(RESULT_PATH, VIDEO_NAME, VIDEO_NAME)
-    # segidx = 1
-    # if os.path.exists(zh_file):
-    #     temp_file = "{}/{}/temp.srt".format(RESULT_PATH, VIDEO_NAME)
-    #     if os.path.exists(temp_file):
-    #         os.remove(temp_file)
-    #     with open(zh_file, "r") as f0:
-    #         for count, _ in enumerate(f0):
-    #             pass
-    #     count += 1
-    #     segidx = int(count/4)+1
-    #     en_file = "{}/{}/{}_en.srt".format(RESULT_PATH, VIDEO_NAME, VIDEO_NAME)
-    #     if args.srt_file is not None:
-    #         en_file = args.srt_file
-    #     with open(en_file, "r") as f1, open(temp_file, "a") as f2:
-    #         x = f1.readlines()
-    #         #print(len(x))
-    #         if count >= len(x):
-    #             print('Work already done! Please delete {}_zh.srt files in result directory first in order to rework'.format(VIDEO_NAME))
-    #             exit()
-    #         for i, line in enumerate(x):
-    #             if i >= count:
-    #                 f2.write(line)
-
-    # srt = SRT_script.parse_from_srt_file(temp_file)
-    # print('temp_contents')
-    # print(srt.get_source_only())
-
 def check_translation(sentence, translation):
     """
     check merge sentence issue from openai translation
@@ -342,7 +313,7 @@ def main():
     logging.info("---------------------Start Preprocessing SRT class---------------------")
     srt.write_srt_file_src(srt_file_en)
     srt.form_whole_sentence()
-    # srt.spell_check_term()
+    srt.spell_check_term()
     srt.correct_with_force_term()
     processed_srt_file_en = srt_file_en.split('.srt')[0] + '_processed.srt'
     srt.write_srt_file_src(processed_srt_file_en)
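
For reference, the preprocessing stage of main() now runs in this order (an annotated excerpt of the diff context above, not standalone code):

# preprocessing order in main() after this commit (annotated excerpt)
srt.write_srt_file_src(srt_file_en)       # dump the raw English source SRT
srt.form_whole_sentence()                 # merge segments into complete sentences
srt.spell_check_term()                    # re-enabled here: phrase-aware spell check
srt.correct_with_force_term()             # apply forced terminology replacements
processed_srt_file_en = srt_file_en.split('.srt')[0] + '_processed.srt'
srt.write_srt_file_src(processed_srt_file_en)   # write the preprocessed SRT
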